In [None]:
import pandas as pd
import numpy as np
import os
import gc

import random

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

# set a seed value
torch.manual_seed(555)

from sklearn.utils import shuffle
from sklearn.metrics import roc_auc_score, accuracy_score

!pip install sentencepiece
!pip install transformers
import transformers
from transformers import AdamW

import warnings
warnings.filterwarnings("ignore")

print(torch.__version__)

# Section 2

## 2.1. Load the Data

In [None]:
# Load the tab-separated corpus (the file is read as ISO-8859-1).
sampled_df = pd.read_csv(r'sampled_data.csv', sep='\t', encoding='ISO-8859-1')

# Fixed seed so the train/test split is identical on every "Restart & Run All".
# Without it, pandas draws from numpy's global RNG, which is only seeded much
# later in the notebook, so each run would train on different rows.
SPLIT_SEED = 1024

# Balanced training set: 40,000 rows per `sentiment` value, then shuffled so
# the classes are interleaved.
df_train = (
    sampled_df
    .groupby('sentiment')
    .sample(n=40000, random_state=SPLIT_SEED)
    .sample(frac=1, random_state=SPLIT_SEED)
)

# Held-out set: every row that was not drawn for training.
df_test = sampled_df.drop(index=df_train.index)
assert len(df_train) + len(df_test) == len(sampled_df), "train/test split lost or duplicated rows"

# Only the raw text is kept from here on; renumber rows 0..n-1.
df_train = df_train.reset_index(drop=True)[['text']]
df_test = df_test.reset_index(drop=True)[['text']]

# Preview only - avoid dumping the whole frame into the notebook output.
df_test.head()


## 2.2. Create Folds

In [None]:
from sklearn.model_selection import KFold, StratifiedKFold

# Shuffle the training rows (the original index labels travel with the rows).
df = shuffle(df_train)

# 10-fold splitter with a fixed seed so the fold boundaries are repeatable.
kf = KFold(n_splits=10, shuffle=True, random_state=1024)

# Each element is a tuple (train_positions, val_positions). KFold.split yields
# integer *positions* into `df`, not index labels.
fold_list = list(kf.split(df))

train_df_list = []
val_df_list = []

for train_positions, val_positions in fold_list:
    # Select by position (.iloc). Selecting with `df.index.isin(...)` compared
    # positions against the shuffled index labels, which only produced a valid
    # partition because the labels happened to be a permutation of 0..n-1.
    train_df_list.append(df.iloc[train_positions])
    val_df_list.append(df.iloc[val_positions])

# Later cells read `df_train` / `df_val`. The original loop left them bound to
# the last fold as a side effect of reusing the names as loop variables; that
# binding is now explicit.
df_train = train_df_list[-1]
df_val = val_df_list[-1]

print(len(train_df_list))
print(len(val_df_list))

# Section 3

## 3.1. Train Model

In [None]:
# Pretrained checkpoint name used for both the tokenizer and the model.
MODEL_TYPE = 'xlm-roberta-base'

# Cross-validation layout: total folds, and how many the training loop counts up to.
NUM_FOLDS, NUM_FOLDS_TO_TRAIN = 10, 5

# Optimisation hyper-parameters.
L_RATE = 2e-5
NUM_EPOCHS = 100
BATCH_SIZE = 16

# Every sentence is padded/truncated to this many tokens.
MAX_LEN = 256

# Number of DataLoader worker processes (one per CPU reported by the OS).
NUM_CORES = os.cpu_count()

NUM_CORES

## Instantiate the tokenizer

In [None]:
from transformers import XLMRobertaTokenizer

# Fetch the tokenizer that belongs to the MODEL_TYPE checkpoint.
# NOTE(review): `do_lower_case` is forwarded unchanged; confirm this tokenizer
# actually honours it.
print('Loading tokenizer...')
tokenizer = XLMRobertaTokenizer.from_pretrained(
    MODEL_TYPE,
    do_lower_case=True,
)

## Create the Dataloader

In [None]:
from language_tokens import get_lang_tokens

In [None]:
# Fraction of eligible (non-special) tokens that get replaced by the mask token.
probability = 0.15


def _mask_random_tokens(input_ids, special_ids, mask_id, mask_prob, max_positions):
    """Overwrite a random subset of ordinary tokens in ``input_ids`` with ``mask_id``.

    Args:
        input_ids: 1-D sequence of token ids supporting ``==`` comparison and
            list-based item assignment (numpy array or CPU torch tensor).
            It is modified IN PLACE.
        special_ids: token ids that must never be masked.
        mask_id: id written into the selected positions.
        mask_prob: fraction of eligible positions to mask (rounded down).
        max_positions: only positions ``[0, max_positions)`` are eligible;
            ``None`` means the whole sequence. The window is clamped to the
            sequence length so short inputs cannot be indexed out of range.

    Returns:
        The same ``input_ids`` object, after masking.
    """
    seq_len = len(input_ids)
    window = seq_len if max_positions is None else min(max_positions, seq_len)

    # Positions inside the window that do not hold a special token.
    is_special = np.isin(np.asarray(input_ids), list(special_ids))
    eligible = np.flatnonzero(~is_special[:window])

    n_to_mask = int(len(eligible) * mask_prob)
    if n_to_mask == 0:
        # Nothing to do (empty/very short input, or mask_prob too small).
        return input_ids

    # Draw distinct positions from numpy's global RNG.
    chosen = np.random.choice(eligible, size=n_to_mask, replace=False).tolist()
    input_ids[chosen] = mask_id
    return input_ids


def to_masked_bert(input_ids, mask_prob=probability, max_positions=100):
    """Randomly mask tokens in place using BERT-style ids.

    Ids 101, 102 and 0 are never masked and 103 is written as the mask id
    (presumably [CLS], [SEP], [PAD] and [MASK] of a BERT vocabulary - verify
    against the tokenizer in use).

    Args:
        input_ids: 1-D token-id sequence; modified in place.
        mask_prob: fraction of eligible tokens to mask (default ``probability``).
        max_positions: size of the leading window eligible for masking. The
            default of 100 keeps the historical behaviour; pass ``None`` to
            consider the full sequence.

    Returns:
        The same ``input_ids`` object with masked positions set to 103.
    """
    return _mask_random_tokens(input_ids, (101, 102, 0), 103, mask_prob, max_positions)


def to_masked_xlmr(input_ids, mask_prob=probability, max_positions=100):
    """Randomly mask tokens in place using XLM-RoBERTa-style ids.

    Ids 0, 1, 2 and 3 are never masked and 250001 is written as the mask id
    (presumably <s>, <pad>, </s>, <unk> and <mask> of the xlm-roberta-base
    vocabulary - verify against ``tokenizer.mask_token_id``).

    Args:
        input_ids: 1-D token-id sequence; modified in place.
        mask_prob: fraction of eligible tokens to mask (default ``probability``).
        max_positions: size of the leading window eligible for masking. The
            default of 100 keeps the historical behaviour, so with longer
            sequences only the first 100 positions can be masked; pass
            ``None`` to consider the full sequence.

    Returns:
        The same ``input_ids`` object with masked positions set to 250001.
    """
    return _mask_random_tokens(input_ids, (0, 1, 2, 3), 250001, mask_prob, max_positions)

In [None]:
class CompDataset(Dataset):
    """Masked-language-model training samples built from a DataFrame's ``text`` column.

    Each item is the tuple ``(input_ids, att_mask, language_ids, target)``:
    ``target`` is a copy of the token ids before masking, and ``input_ids`` is
    the same sequence after ``to_masked_xlmr`` has replaced some tokens.

    Relies on the notebook-level ``tokenizer``, ``MAX_LEN``, ``to_masked_xlmr``
    and ``get_lang_tokens``.

    NOTE(review): ``target`` holds every token id, including padding, so the
    loss covers all positions unless the model ignores some - confirm that
    this is intended.
    """

    def __init__(self, df):
        # DataFrame with a 'text' column; rows are accessed by position.
        self.df_data = df

    def __getitem__(self, index):
        # Positional lookup: samplers yield 0..len-1, so this does not depend
        # on the frame having a freshly reset index.
        features = self.df_data['text'].iloc[index]

        # Tokenize, then pad/truncate to exactly MAX_LEN tokens.
        encoded_dict = tokenizer.encode_plus(
            features,                       # Sentence to encode.
            add_special_tokens=True,        # Add the model's special tokens.
            truncation=True,
            max_length=MAX_LEN,
            padding='max_length',           # Replaces deprecated pad_to_max_length=True.
            return_attention_mask=True,     # Construct attention masks.
            return_tensors='pt',            # Return pytorch tensors.
        )

        # return_tensors='pt' adds a leading batch axis; take the single row.
        input_ids = encoded_dict['input_ids'][0]
        att_mask = encoded_dict['attention_mask'][0]

        # Keep the unmasked ids as the target BEFORE masking, because
        # to_masked_xlmr modifies its argument in place.
        target = input_ids.detach().clone()
        input_ids = to_masked_xlmr(input_ids)

        # Decode each (masked) id on its own, strip spaces, and map the pieces
        # to language ids with the project helper.
        language_ids = torch.tensor(get_lang_tokens(
            [x.replace(' ', '') for x in tokenizer.batch_decode(input_ids.tolist())]
        ))

        return (input_ids, att_mask, language_ids, target)

    def __len__(self):
        return len(self.df_data)

class TestDataset(Dataset):
    """Inference samples built from a DataFrame's ``text`` column.

    Each item is the tuple ``(input_ids, att_mask, language_ids)``, where
    ``input_ids`` has been masked by ``to_masked_xlmr``. No target is returned.

    Relies on the notebook-level ``tokenizer``, ``MAX_LEN``, ``to_masked_xlmr``
    and ``get_lang_tokens``.
    """

    def __init__(self, df):
        # DataFrame with a 'text' column; rows are accessed by position.
        self.df_data = df

    def __getitem__(self, index):
        # Positional lookup: samplers yield 0..len-1, so this does not depend
        # on the frame having a freshly reset index.
        features = self.df_data['text'].iloc[index]

        # Tokenize, then pad/truncate to exactly MAX_LEN tokens.
        encoded_dict = tokenizer.encode_plus(
            features,                       # Sentence to encode.
            add_special_tokens=True,        # Add the model's special tokens.
            truncation=True,
            max_length=MAX_LEN,
            padding='max_length',           # Replaces deprecated pad_to_max_length=True.
            return_attention_mask=True,     # Construct attention masks.
            return_tensors='pt',            # Return pytorch tensors.
        )

        # return_tensors='pt' adds a leading batch axis; take the single row.
        # 'token_type_ids' is deliberately not read: the value was never used,
        # and the lookup raises KeyError when the tokenizer does not emit it.
        input_ids = encoded_dict['input_ids'][0]
        att_mask = encoded_dict['attention_mask'][0]

        # Mask tokens in place (same masking as the training dataset).
        input_ids = to_masked_xlmr(input_ids)

        # Decode each (masked) id on its own, strip spaces, and map the pieces
        # to language ids with the project helper.
        language_ids = torch.tensor(get_lang_tokens(
            [x.replace(' ', '') for x in tokenizer.batch_decode(input_ids.tolist())]
        ))

        return (input_ids, att_mask, language_ids)

    def __len__(self):
        return len(self.df_data)

## Test the dataloader

In [None]:
# Renumber the rows of all three frames 0..n-1, discarding the old labels.
df_train, df_val, df_test = (
    frame.reset_index(drop=True) for frame in (df_train, df_val, df_test)
)

df_val

In [None]:
# Wrap each frame in its dataset class.
train_data = CompDataset(df_train)
val_data = CompDataset(df_val)
test_data = TestDataset(df_test)

# Settings shared by every loader.
loader_kwargs = dict(batch_size=BATCH_SIZE, num_workers=NUM_CORES)

# Train and validation batches are shuffled; test batches keep frame order.
train_dataloader = DataLoader(train_data, shuffle=True, **loader_kwargs)
val_dataloader = DataLoader(val_data, shuffle=True, **loader_kwargs)
test_dataloader = DataLoader(test_data, shuffle=False, **loader_kwargs)

# Number of batches in each loader.
for loader in (train_dataloader, val_dataloader, test_dataloader):
    print(len(loader))

# Pull one training batch as a smoke test; the four tensors stay in the
# notebook namespace.
input_ids, att_mask, language_ids, target = next(iter(train_dataloader))

## Define the Model

In [None]:
import torch

# Prefer the GPU when CUDA is usable; otherwise fall back to the CPU.
has_cuda = torch.cuda.is_available()
device = torch.device("cuda" if has_cuda else "cpu")

if has_cuda:
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('GPU is:', torch.cuda.get_device_name(0))
else:
    print('No GPU available, using the CPU instead.')

In [None]:
from transformers import XLMRobertaConfig
from modeling_xlm_roberta import XLMRobertaForMaskedLM

# Start from the pretrained checkpoint's configuration and override the depth,
# head count and dropout rates.
# `num_labels` is intentionally not set: it was derived by calling `.item()` on
# whole rows of the sample batch, which raises for multi-element tensors, and a
# masked-LM head predicts over the vocabulary, not over a label set.
config = XLMRobertaConfig.from_pretrained(
    MODEL_TYPE,
    output_attentions=False,
    output_hidden_states=False,
    num_hidden_layers=5,
    num_attention_heads=8,
    hidden_dropout_prob=0.2,
    attention_probs_dropout_prob=0.2,
)

# Load the project-local masked-LM model with the customised config.
# `ignore_mismatched_sizes` is an option of the model loader; it previously sat
# in the config call, where it had no effect.
# NOTE(review): assumes the local XLMRobertaForMaskedLM inherits the standard
# Hugging Face `from_pretrained` - confirm in modeling_xlm_roberta.py.
model = XLMRobertaForMaskedLM.from_pretrained(
    MODEL_TYPE,
    config=config,
    ignore_mismatched_sizes=True,
)

# Send the model to the device.
model.to(device)

In [None]:
# AdamW over every model parameter, using the notebook-wide learning rate.
optimizer = AdamW(model.parameters(), lr=L_RATE, eps=1e-8)

## Train the Model

In [None]:
# Mount Google Drive at /content/drive (Colab only). The training cell writes
# its checkpoint to /content/drive/MyDrive/model.bin, so this must run first.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# with open ('/content/drive/MyDrive/model.bin', 'rb') as fp:
#   states = torch.load(fp)

#   curr_epoch = states['epoch']
#   model.load_state_dict(states['model'])
#   optimizer.load_state_dict(states['optimizer'])

#   del states

# curr_epoch

In [None]:
# initial settings

# Epoch index at which the training loop starts; 0 trains from scratch.
# The commented-out resume cell above would instead set it from a checkpoint.
curr_epoch = 0

In [None]:
%%time

import pickle
from tqdm.auto import tqdm

# Set a seed value.
seed_val = 1024

random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)
    

# For each epoch...
for epoch in range(curr_epoch, NUM_EPOCHS):
    
    print("\nNum folds used for training:", NUM_FOLDS_TO_TRAIN)
    print('======== Epoch {:} / {:} ========'.format(epoch + 1, NUM_EPOCHS))
    
    # Get the number of folds
    num_folds = len(train_df_list)

    # For this epoch, store the val acc scores for each fold in this list.
    # We will use this list to calculate the cv at the end of the epoch.
    epoch_acc_scores_list = []
    
    # For each fold...
    for fold_index in range(1, NUM_FOLDS_TO_TRAIN):
        
        print('\n== Fold Model', fold_index)

        # ========================================
        #               Training
        # ========================================
        
        stacked_val_labels = []
        targets_list = []

        print('Training...')

        progress_bar_train =  tqdm(range(len(train_dataloader)))

        # put the model into train mode
        model.train()

        # This turns gradient calculations on and off.
        torch.set_grad_enabled(True)


        # Reset the total loss for this epoch.
        total_train_loss = 0

        for i, batch in enumerate(train_dataloader):

            train_status = 'Batch ' + str(i+1) + ' of ' + str(len(train_dataloader))

            print(train_status, end='\r')


            b_input_ids = batch[0].to(device)
            b_input_mask = batch[1].to(device)
            # b_token_type_ids = batch[2].to(device)
            b_language_ids = batch[2].to(device)
            b_labels = batch[3].to(device)

            model.zero_grad()        


            outputs = model(b_input_ids, 
                        # token_type_ids=b_token_type_ids, 
                        attention_mask=b_input_mask,
                        language_ids=b_language_ids,
                        labels=b_labels)
            
            progress_bar_train.update(1)

            # Get the loss from the outputs tuple: (loss, logits)
            loss = outputs[0]

            # Convert the loss from a torch tensor to a number.
            # Calculate the total loss.
            total_train_loss = total_train_loss + loss.item()

            # Zero the gradients
            optimizer.zero_grad()

            # Perform a backward pass to calculate the gradients.
            loss.backward()
            
            # Clip the norm of the gradients to 1.0.
            # This is to help prevent the "exploding gradients" problem.
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

            # Use the optimizer to update Weights
            
            # Optimizer for GPU
            optimizer.step() 
            
        print('Train loss:' ,total_train_loss)

        # Save the Model
        torch.save({
            'epoch': epoch,
            'model': model.state_dict(),
            'optimizer': optimizer.state_dict()
        }, '/content/drive/MyDrive/model.bin')
        print('Saved model.')              
        
        # Use the garbage collector to save memory.
        gc.collect()

In [None]:
# Persist the trained weights and config as a directory that `from_pretrained`
# can reload. Neither torch.nn.Module nor the Hugging Face model base class
# provides a `.save()` method, so the original call would raise AttributeError
# only after the whole training run had finished.
# NOTE(review): assumes the local XLMRobertaForMaskedLM inherits
# `save_pretrained` from the Hugging Face base class - confirm.
model.save_pretrained('/content/drive/MyDrive/final_model')