In [None]:
!pip install pip install iterative-stratification
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
import pandas as pd
import pickle 
import numpy as np
import sys
import gc

from iterstrat.ml_stratifiers import MultilabelStratifiedKFold

import torch
import torch.nn.functional as F

from tensorflow.keras.callbacks import Callback, EarlyStopping, ModelCheckpoint,LearningRateScheduler,ReduceLROnPlateau

from transformers import AutoModel,AutoTokenizer, AutoConfig\

from tqdm import tqdm

In [None]:
def read_file(filename, train_size=3128):
  """Load a CSV and split it positionally into a train part and a test part.

  Args:
    filename: Path (or any source accepted by ``pd.read_csv``) of the CSV.
    train_size: Number of leading rows that go to the train part; the
      remaining rows form the test part. The default (3128) is the value the
      notebook originally hard-coded for its "80-20" split.

  Returns:
    Tuple ``(dftr_train, dftr_test)`` of independent DataFrames. Both keep the
    row labels of the loaded frame and carry an extra ``src`` column.
  """
  df = pd.read_csv(filename)

  # Every row loaded from this file is tagged with the same origin marker.
  df["src"] = "train"

  # Split by row position. The copies make each part own its data, so later
  # column assignments (make_k_folds adds a FOLD column to the train part) do
  # not write into a slice of `df`.
  dftr_train = df.iloc[:train_size].copy()
  dftr_test = df.iloc[train_size:].copy()

  return dftr_train, dftr_test

def get_target_cols():
  """Return a fresh list with the names of the six label columns.

  The notebook uses these columns as ``y`` when building folds and when
  saving ``y_train`` / ``y_test``.
  """
  return [
      'cohesion',
      'syntax',
      'vocabulary',
      'phraseology',
      'grammar',
      'conventions',
  ]

def read_pickle(filename):
  """Open `filename` in binary mode and return the object unpickled from it.

  NOTE: unpickling can execute arbitrary code, so only load files that this
  notebook (or another trusted source) produced.
  """
  with open(filename, 'rb') as handle:
    return pickle.load(handle)

def dump_pickle(obj, filename):
  """Pickle `obj` to `filename` using pickle protocol 4.

  Args:
    obj: Any picklable object.
    filename: Destination path; the file is created or overwritten.

  Returns:
    None. The original code rebound `obj` to the result of ``pickle.dump``
    (which is always None) and returned that, so callers never received the
    object back. The None return is kept on purpose: notebook cells end with
    a call to this function and would otherwise echo the whole object.
  """
  with open(filename, 'wb') as f:
    pickle.dump(obj, f, protocol=4)
  return None

def array_to_txt(array, filename):
  """Save `array` to `filename` as text using ``np.savetxt`` defaults.

  Returns None.
  """
  np.savetxt(fname=filename, X=array)

In [None]:
def make_k_folds(df, FOLDS = 5, label_cols = None):
  """Assign every row of `df` to one of `FOLDS` multilabel-stratified folds.

  Args:
    df: DataFrame containing the label columns. It is modified in place: a
      float ``FOLD`` column is added (or overwritten).
    FOLDS: Number of folds.
    label_cols: Columns used as the stratification target. Defaults to
      ``get_target_cols()`` so the function no longer depends on a
      module-level ``target_cols`` variable having been defined first.

  Returns:
    The same `df` object, now carrying the ``FOLD`` column.
  """
  if label_cols is None:
    label_cols = get_target_cols()

  skf = MultilabelStratifiedKFold(n_splits=FOLDS, shuffle=True, random_state=42)

  # split() yields *positional* row indices. The original wrote them through
  # label-based .loc, which is only correct when df has a 0..n-1 index. Filling
  # a positional array first works for any index. NaN-initialised float array
  # keeps the float dtype the original column had.
  fold_ids = np.full(len(df), np.nan)
  for fold, (_, val_positions) in enumerate(skf.split(df, df[label_cols])):
      fold_ids[val_positions] = fold
  df['FOLD'] = fold_ids

  print('Train samples per fold:')
  print(df.FOLD.value_counts())
  return df

In [None]:
def mean_pooling(model_output, attention_mask):
    """Mask-weighted mean of the last hidden state over dimension 1.

    Args:
        model_output: Object exposing a ``last_hidden_state`` tensor
            (presumably (batch, seq_len, hidden) - confirm against the model).
        attention_mask: Tensor that, after a trailing dimension is added, is
            expanded to the hidden-state shape. It must live on the CPU because
            the hidden state is moved there before the multiplication.

    Returns:
        Tuple ``(sentence_embedding, token_embeddings)``: the pooled tensor and
        the detached CPU hidden state converted to a NumPy array.
    """
    hidden = model_output.last_hidden_state.detach().cpu()

    # Broadcast the mask across the last dimension so masked positions drop out.
    mask = attention_mask.unsqueeze(-1).expand(hidden.size()).float()

    summed = (hidden * mask).sum(dim=1)
    # Clamp the denominator so a fully masked row divides by 1e-9 instead of 0.
    counts = mask.sum(dim=1).clamp(min=1e-9)

    return summed / counts, hidden.numpy()

class EmbedDataset(torch.utils.data.Dataset):
    """Dataset that tokenizes the ``full_text`` column of a DataFrame on access.

    Args:
        df: DataFrame with a ``full_text`` column. Its index is reset so items
            are addressed by position 0..len-1.
        tokenizer: Optional callable used to tokenize each text. When omitted,
            the module-level ``tokenizer`` is looked up at item-access time
            (get_sentence_token_embeddings assigns it), which is the original
            behaviour.
        max_len: Optional ``max_length`` for padding/truncation. When omitted,
            the module-level ``MAX_LEN`` is looked up at item-access time.
    """

    def __init__(self, df, tokenizer=None, max_len=None):
        self.df = df.reset_index(drop=True)
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        # Resolve lazily: the globals may be (re)assigned after the dataset and
        # its DataLoader have been constructed. The globals are only touched
        # when no explicit value was injected.
        active_tokenizer = self.tokenizer if self.tokenizer is not None else tokenizer
        active_max_len = self.max_len if self.max_len is not None else MAX_LEN
        if active_tokenizer is None:
            raise TypeError(
                "EmbedDataset has no tokenizer: pass tokenizer=... or set the "
                "module-level `tokenizer` before reading items"
            )

        text = self.df.loc[idx, "full_text"]
        tokens = active_tokenizer(
            text,
            None,
            add_special_tokens=True,
            padding='max_length',
            truncation=True,
            max_length=active_max_len,
            return_tensors="pt",
        )
        # return_tensors="pt" yields a leading dimension of size 1 per item;
        # drop it so the DataLoader can stack items into a batch.
        tokens = {k: v.squeeze(0) for k, v in tokens.items()}
        return tokens

In [None]:
def get_sentence_token_embeddings(embed_dataloader_tr, MODEL_NM='', MAX=640, BATCH_SIZE=4, verbose=True):
    """Run a pretrained encoder over a dataloader and collect its embeddings.

    Side effects: rebinds the module globals ``tokenizer`` (to the tokenizer
    loaded for `MODEL_NM`) and ``MAX_LEN`` (to `MAX`). EmbedDataset reads both
    when the dataloader is iterated, so they are set before the loop starts.

    Args:
        embed_dataloader_tr: DataLoader yielding dicts with ``input_ids`` and
            ``attention_mask`` tensors.
        MODEL_NM: Model identifier passed to the ``from_pretrained`` loaders.
        MAX: Value assigned to the global ``MAX_LEN``.
        BATCH_SIZE: Unused; kept for backward compatibility. The batch size is
            whatever the dataloader was built with.
        verbose: When true, additionally prints the sentence-embedding shape.

    Returns:
        Tuple ``(all_train_text_feats, all_train_token_embedding)`` of NumPy
        arrays: the L2-normalised mean-pooled embeddings and the per-token last
        hidden states, each concatenated over all batches along axis 0.
    """
    global tokenizer, MAX_LEN
    MAX_LEN = MAX

    if torch.cuda.is_available():
        DEVICE = torch.device("cuda")
        print("Using GPU.")
    else:
        print("No GPU available, using the CPU instead.")
        DEVICE = torch.device("cpu")

    cfg = AutoConfig.from_pretrained(MODEL_NM, output_hidden_states=True)
    # Zero the dropout probabilities in the config.
    cfg.hidden_dropout_prob = 0
    cfg.attention_probs_dropout_prob = 0

    model = AutoModel.from_pretrained(MODEL_NM, config=cfg)
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NM)

    model = model.to(DEVICE)
    model.eval()

    all_train_text_feats = []
    all_train_token_embedding = []

    # Pure feature extraction: without no_grad every forward pass would also
    # record an autograd graph that is never used.
    with torch.no_grad():
        for batch in tqdm(embed_dataloader_tr, total=len(embed_dataloader_tr)):

            input_ids = batch["input_ids"].to(DEVICE)
            attention_mask = batch["attention_mask"].to(DEVICE)

            model_output = model(input_ids=input_ids, attention_mask=attention_mask)
            sentence_embeddings, token_embeddings = mean_pooling(model_output, attention_mask.detach().cpu())

            # Normalize the embeddings. The batch dimension is kept as is: the
            # original squeeze(0) turned a batch of size 1 into a 1-D tensor,
            # which breaks (or silently flattens) the concatenation below.
            sentence_embeddings = F.normalize(sentence_embeddings, p=2, dim=1)

            all_train_text_feats.append(sentence_embeddings)
            # mean_pooling returns the token embeddings as a NumPy array;
            # from_numpy wraps it without another copy.
            all_train_token_embedding.append(torch.from_numpy(token_embeddings))

            # Drop per-batch references (including the device tensors) before
            # the next forward pass.
            del model_output, input_ids, attention_mask
            del sentence_embeddings, token_embeddings

            gc.collect()

    all_train_text_feats = torch.cat(all_train_text_feats, 0).numpy()
    all_train_token_embedding = torch.cat(all_train_token_embedding, 0).numpy()

    print(all_train_token_embedding.shape)
    print(all_train_text_feats.shape)

    if verbose:
        print('Embeddings shape',all_train_text_feats.shape)

    return all_train_text_feats, all_train_token_embedding

In [None]:
# Initialization: notebook-wide configuration.

# Placeholder; get_sentence_token_embeddings() rebinds this global to the loaded
# tokenizer, and EmbedDataset.__getitem__ reads it when tokenizing.
tokenizer = None
# max_length used by EmbedDataset.__getitem__ for padding/truncation.
# NOTE: get_sentence_token_embeddings() overwrites this global with its MAX argument.
MAX_LEN = 640
# Batch size of the DataLoaders built from EmbedDataset.
BATCH_SIZE = 4
# Model identifier passed to get_sentence_token_embeddings() and on to from_pretrained.
MODEL_NM = 'microsoft/deberta-base'

In [None]:
# Mount Google Drive so the /content/gdrive/... paths used by the following
# cells resolve. Requires the google.colab package (i.e. a Colab runtime).
from google.colab import drive
drive.mount("/content/gdrive")

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [None]:
# Load the labelled CSV, split it into train/test parts, and assign CV folds.
target_cols = get_target_cols()

train_csv_path = '/content/gdrive/MyDrive/Colab Notebooks/NLP/Project/feedback-prize-english-language-learning/train.csv'
df_train, df_test = read_file(train_csv_path)

# make_k_folds adds the FOLD column to df_train in place and returns the same
# object, so df_train_folds and df_train refer to one DataFrame.
df_train_folds = make_k_folds(df_train) # There's no need for test folds

Train samples per fold:
4.0    626
1.0    626
2.0    626
3.0    625
0.0    625
Name: FOLD, dtype: int64


In [None]:
# Wrap both splits in tokenizing datasets and matching dataloaders.
# shuffle=False keeps the batch order aligned with the DataFrame row order.
loader_kwargs = dict(batch_size=BATCH_SIZE, shuffle=False)

ds_train = EmbedDataset(df_train_folds)
ds_test = EmbedDataset(df_test)

train_dataloader = torch.utils.data.DataLoader(ds_train, **loader_kwargs)
test_dataloader = torch.utils.data.DataLoader(ds_test, **loader_kwargs)

In [None]:
# Compute and persist the embeddings of the training split.
# MAX is passed explicitly: get_sentence_token_embeddings() overwrites the
# global MAX_LEN with its MAX argument, so relying on its default (640) would
# silently discard a different MAX_LEN chosen in the configuration cell.
train_sent_emb, train_tok_emb = get_sentence_token_embeddings(train_dataloader, MODEL_NM, MAX=MAX_LEN)
dump_pickle(train_tok_emb, '/content/gdrive/MyDrive/Colab Notebooks/NLP/Project/feedback-prize-english-language-learning/train_tok_emb.pkl')
dump_pickle(train_sent_emb, '/content/gdrive/MyDrive/Colab Notebooks/NLP/Project/feedback-prize-english-language-learning/train_sent_emb.pkl')

In [None]:
# Compute and persist the embeddings of the held-out split.
# MAX is passed explicitly: get_sentence_token_embeddings() overwrites the
# global MAX_LEN with its MAX argument, so relying on its default (640) would
# silently discard a different MAX_LEN chosen in the configuration cell.
test_sent_emb, test_tok_emb = get_sentence_token_embeddings(test_dataloader, MODEL_NM, MAX=MAX_LEN)

dump_pickle(test_sent_emb, '/content/gdrive/MyDrive/Colab Notebooks/NLP/Project/feedback-prize-english-language-learning/test_sent_emb.pkl')
dump_pickle(test_tok_emb, '/content/gdrive/MyDrive/Colab Notebooks/NLP/Project/feedback-prize-english-language-learning/test_tok_emb.pkl')

In [None]:
# Label frames for both splits: only the target columns, in target_cols order.
y_train, y_test = (frame[target_cols] for frame in (df_train, df_test))

In [None]:
# Persist the label frames next to the embeddings.
y_train_pkl_path = '/content/gdrive/MyDrive/Colab Notebooks/NLP/Project/feedback-prize-english-language-learning/y_train.pkl'
y_test_pkl_path = '/content/gdrive/MyDrive/Colab Notebooks/NLP/Project/feedback-prize-english-language-learning/y_test.pkl'

dump_pickle(y_train, y_train_pkl_path)
dump_pickle(y_test, y_test_pkl_path)

In [None]:
# Persist the training frame together with its FOLD assignment.
folds_pkl_path = '/content/gdrive/MyDrive/Colab Notebooks/NLP/Project/feedback-prize-english-language-learning/df_train_folds.pkl'
dump_pickle(df_train_folds, folds_pkl_path)