In [None]:
import pandas as pd
import numpy as np
import os
import gc

import random

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

# set a seed value
torch.manual_seed(555)

from sklearn.utils import shuffle
from sklearn.metrics import roc_auc_score, accuracy_score

!pip install sentencepiece
!pip install transformers
import transformers
from transformers import AdamW

import warnings
warnings.filterwarnings("ignore")


print(torch.__version__)

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
1.13.0+cu116


# Section 2

## 2.1. Load the Data

In [None]:
# Integer class id for every sentiment label:
# Negative -> 0, Neutral -> 1, Positive -> 2.
sentiment_to_num = {
    label: class_id
    for class_id, label in enumerate(('Negative', 'Neutral', 'Positive'))
}

In [None]:
# Load the tab-separated corpus; the `text` and `sentiment` columns are used below.
sampled_df = pd.read_csv(r'sampled_data.csv', sep='\t', encoding='ISO-8859-1')

# Fixed seed so the train/test split is identical on every run.
SPLIT_SEED = 555

# Balanced training set: 40,000 rows per sentiment class, then shuffled.
df_train = (
    sampled_df
    .groupby('sentiment')
    .sample(n=40000, random_state=SPLIT_SEED)
    .sample(frac=1, random_state=SPLIT_SEED)
)

# Every row that was not drawn for training goes to the test set.
df_test = sampled_df.drop(index=df_train.index)

# Keep only the model inputs and replace the original row labels with 0..n-1.
df_train = df_train.reset_index(drop=True)[['text', 'sentiment']]
df_test = df_test.reset_index(drop=True)[['text', 'sentiment']]

# Encode the string labels as integers; an unknown label raises KeyError.
df_train['sentiment'] = df_train['sentiment'].apply(lambda x: sentiment_to_num[x])
df_test['sentiment'] = df_test['sentiment'].apply(lambda x: sentiment_to_num[x])
df_test

Unnamed: 0,text,sentiment
0,kenyataan media yb menteri kerajaan tempatan d...,1
1,berakhir sebelum memulai rasanya bagaimana sik,1
2,wonder jika works untuk compressor also,0
3,cut awak losses short since it might jadi the ...,2
4,samurai jin messy sia jangan want,0
...,...,...
29995,after the successfully formed a planet,2
29996,tidak hal abang naik moto itu jadi polis bantuan,0
29997,kami di official facebook page pejabat pembang...,1
29998,- not banyak welfare and no satu dare kepada t...,0


## 2.2. Create Folds

In [None]:
from sklearn.model_selection import KFold, StratifiedKFold

# Shuffle once, with a fixed seed so the folds are reproducible.
df = shuffle(df_train, random_state=1024)

# initialize kfold
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=1024)

# for stratification
y = df['sentiment']

# Note:
# Each fold is a tuple ([train_positions], [val_positions])
# fold_0, fold_1, fold_2, fold_3, fold_4 = kf.split(df, y)

# Put the folds into a list. This is a list of tuples.
fold_list = list(kf.split(df, y))

train_df_list = []
val_df_list = []

for i, fold in enumerate(fold_list):

    # kf.split yields *positional* indices into df. After the shuffle the
    # index labels no longer match the row positions, so the rows must be
    # selected with .iloc; a label test (df.index.isin) would pick a
    # different, non-stratified set of rows.
    df_train = df.iloc[fold[0]]
    df_val = df.iloc[fold[1]]

    train_df_list.append(df_train)
    val_df_list.append(df_val)

# After the loop df_train / df_val refer to the LAST fold; the cells below
# that build the datasets and dataloaders read these two names.

print(len(train_df_list))
print(len(val_df_list))

5
5


# Section 3

## 3.1. Train a Model

In [None]:
# Pretrained checkpoint name passed to `from_pretrained` for the config and the model.
MODEL_TYPE = 'gpt2'

# Number of per-fold accuracy histories allocated before training.
NUM_FOLDS = 5

# Saving 5 TPU models will exceed the 4.9GB disk space.
# NOTE(review): this comment used to say that only 3 folds would therefore be
# trained, but the value below is 5 — confirm which one is intended.
# Used as the upper bound of the fold loop in the training cell.
NUM_FOLDS_TO_TRAIN = 5 

# Optimizer learning rate.
L_RATE = 2e-5
# Sequence length every tokenized text is padded / truncated to.
MAX_LEN = 256
# Upper bound of the epoch loop in the training cell.
NUM_EPOCHS = 100
# Batch size of the train / val / test dataloaders.
BATCH_SIZE = 64
# Passed to the dataloaders as `num_workers`.
NUM_CORES = os.cpu_count()

NUM_CORES

2

## Instantiate the tokenizer

In [None]:
# Special tokens that are registered on the tokenizer after it is loaded.
token_dict = dict(
    bos_token='<|beginoftext|>',
    pad_token='<|pad|>',
    sep_token='<|sep|>',
    mask_token='<|mask|>',
)

In [None]:
from transformers import GPT2Tokenizer

print('Loading tokenizer...')

# The tokenizer is restored from the local "tokenizer-gpt2/" directory
# (not from the stock MODEL_TYPE checkpoint), after which the special
# tokens from token_dict are registered on it.
tokenizer = GPT2Tokenizer.from_pretrained("tokenizer-gpt2/", do_lower_case=True)
tokenizer.add_special_tokens(token_dict)

# Number of distinct entries in the resulting vocabulary.
print(len(set(tokenizer.get_vocab())))

Loading tokenizer...
397909


In [None]:
# add vocabulary to tokenizer
# ===========================

# from tqdm.auto import tqdm
# import nltk
# nltk.download('words')

# with open('combined-malay-dict.txt', encoding="utf8") as fp:
#   malay_words = set([x.strip() for x in fp.readlines()])

# english_words = set(nltk.corpus.words.words())

# new_tokens = (malay_words | english_words) - set(tokenizer.get_vocab().keys())

# new_tokens = list(new_tokens)
# batchsize = 10000
# progress_bar =  tqdm(range(len(new_tokens)))

# for i in range(0, len(new_tokens), batchsize):
#   batch = new_tokens[i:i+batchsize]
#   tokenizer.add_tokens(batch)
#   progress_bar.update(batchsize)

# tokenizer.add_special_tokens(token_dict)
# tokenizer.save_pretrained("tokenizer/")

## Create the Dataloader

In [None]:
from language_tokens import get_lang_tokens

[nltk_data] Downloading package words to /root/nltk_data...
[nltk_data]   Package words is already up-to-date!


In [None]:
class CompDataset(Dataset):
    """Labelled dataset that tokenizes one dataframe row per item.

    Relies on the notebook-level names ``tokenizer``, ``MAX_LEN`` and
    ``get_lang_tokens``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``text`` column and a ``sentiment`` column.
    """

    def __init__(self, df):
        self.df_data = df

    def __getitem__(self, index):
        """Return ``(input_ids, att_mask, language_ids, target)`` for the row at position ``index``."""

        # Look the row up by POSITION. A DataLoader sampler yields positions
        # 0..len-1, which only coincide with the index labels when the frame
        # has a freshly reset index; a label lookup (.loc) fails or returns
        # the wrong row otherwise.
        features = self.df_data['text'].iloc[index]

        # Process the sentence
        # ---------------------

        encoded_dict = tokenizer.encode_plus(
                    features,                       # Sentence to encode.
                    add_special_tokens = True,      # Let the tokenizer add its special tokens.
                    truncation = True,
                    max_length = MAX_LEN,           # Pad or truncate all sentences.
                    padding = 'max_length',         # Replaces the deprecated pad_to_max_length=True.
                    return_attention_mask = True,   # Construct attn. masks.
                    return_tensors = 'pt',          # Return pytorch tensors.
               )

        # The encoder returns tensors with a leading batch dimension of 1; drop it.
        input_ids = encoded_dict['input_ids'][0]
        att_mask = encoded_dict['attention_mask'][0]

        # Each token id is decoded on its own, stripped of spaces and passed to
        # get_lang_tokens (presumably one language id per token — confirm
        # against language_tokens.py).
        language_ids = torch.tensor(get_lang_tokens(
            [x.replace(' ', '') for x in tokenizer.batch_decode(input_ids.tolist())]
            ))

        # Convert the target to a torch tensor
        target = torch.tensor(self.df_data['sentiment'].iloc[index])

        return (input_ids, att_mask, language_ids, target)

    def __len__(self):
        """Number of rows in the wrapped dataframe."""
        return len(self.df_data)
    
    
    
    
    

class TestDataset(Dataset):
    """Unlabelled dataset that tokenizes one dataframe row per item.

    Relies on the notebook-level names ``tokenizer``, ``MAX_LEN`` and
    ``get_lang_tokens``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``text`` column.
    """

    def __init__(self, df):
        self.df_data = df

    def __getitem__(self, index):
        """Return ``(input_ids, att_mask, language_ids)`` for the row at position ``index``."""

        # Look the row up by POSITION. A DataLoader sampler yields positions
        # 0..len-1, which only coincide with the index labels when the frame
        # has a freshly reset index; a label lookup (.loc) fails or returns
        # the wrong row otherwise.
        features = self.df_data['text'].iloc[index]

        # Process the sentence
        # ---------------------

        encoded_dict = tokenizer.encode_plus(
                    features,                       # Sentence to encode.
                    add_special_tokens = True,      # Let the tokenizer add its special tokens.
                    truncation = True,
                    max_length = MAX_LEN,           # Pad or truncate all sentences.
                    padding = 'max_length',         # Replaces the deprecated pad_to_max_length=True.
                    return_attention_mask = True,   # Construct attn. masks.
                    return_tensors = 'pt',          # Return pytorch tensors.
               )

        # The encoder returns tensors with a leading batch dimension of 1; drop it.
        input_ids = encoded_dict['input_ids'][0]
        att_mask = encoded_dict['attention_mask'][0]

        # Each token id is decoded on its own, stripped of spaces and passed to
        # get_lang_tokens (presumably one language id per token — confirm
        # against language_tokens.py).
        language_ids = torch.tensor(get_lang_tokens(
            [x.replace(' ', '') for x in tokenizer.batch_decode(input_ids.tolist())]
            ))

        return (input_ids, att_mask, language_ids)

    def __len__(self):
        """Number of rows in the wrapped dataframe."""
        return len(self.df_data)



## Test the dataloader

In [None]:
# Give each split a fresh 0..n-1 index, because the dataset classes
# address rows by an integer `index`.
df_train, df_val, df_test = (
    frame.reset_index(drop=True) for frame in (df_train, df_val, df_test)
)

df_val

Unnamed: 0,text,sentiment
0,lim kim san toh chin chye,1
1,virgin - top of the list,2
2,bukan kerajaan salah akak sebab akak lone rang...,0
3,hospital cyberjaya pun ada conveyor belt yang ...,1
4,lima lokaliti di sabah dikenakan pkpd mulai,1
...,...,...
23995,yang memburukkan islam sendiri adalah pas tung...,0
23996,looks like im doing this next,2
23997,kamu semua boleh ada kepada cart kalau tak nak...,1
23998,jika awak know why lao goh purposely sayang st...,2


In [None]:
train_data = CompDataset(df_train)
val_data = CompDataset(df_val)
test_data = TestDataset(df_test)

# os.cpu_count() can return None, which DataLoader does not accept;
# fall back to loading in the main process.
num_workers = NUM_CORES or 0

# Only the training loader is shuffled, so every epoch sees the batches in
# a different order. Validation and test order does not affect the
# metrics and is kept fixed.
train_dataloader = DataLoader(train_data,
                              batch_size=BATCH_SIZE,
                              shuffle=True,
                              num_workers=num_workers)

val_dataloader = DataLoader(val_data,
                            batch_size=BATCH_SIZE,
                            shuffle=False,
                            num_workers=num_workers)

test_dataloader = DataLoader(test_data,
                             batch_size=BATCH_SIZE,
                             shuffle=False,
                             num_workers=num_workers)


# Number of batches per loader.
print(len(train_dataloader))
print(len(val_dataloader))
print(len(test_dataloader))

# Pull one training batch; these four names are inspected by later cells.
input_ids, att_mask, language_ids, target = next(iter(train_dataloader))

1500
375
469


In [None]:
# Sanity-check the batch structure on the first few batches only: every
# batch is padded to the same shape, so walking the whole loader (and
# tokenizing the entire training set) just to print it adds nothing.
NUM_BATCHES_TO_INSPECT = 5

for i, b in enumerate(train_dataloader):
  if i >= NUM_BATCHES_TO_INSPECT:
    break
  print(i, len(b), b[0].shape)

0 4 torch.Size([64, 256])
1 4 torch.Size([64, 256])
2 4 torch.Size([64, 256])
3 4 torch.Size([64, 256])
4 4 torch.Size([64, 256])
5 4 torch.Size([64, 256])
6 4 torch.Size([64, 256])
7 4 torch.Size([64, 256])
8 4 torch.Size([64, 256])
9 4 torch.Size([64, 256])
10 4 torch.Size([64, 256])
11 4 torch.Size([64, 256])
12 4 torch.Size([64, 256])
13 4 torch.Size([64, 256])
14 4 torch.Size([64, 256])
15 4 torch.Size([64, 256])
16 4 torch.Size([64, 256])
17 4 torch.Size([64, 256])
18 4 torch.Size([64, 256])
19 4 torch.Size([64, 256])
20 4 torch.Size([64, 256])
21 4 torch.Size([64, 256])
22 4 torch.Size([64, 256])
23 4 torch.Size([64, 256])
24 4 torch.Size([64, 256])
25 4 torch.Size([64, 256])
26 4 torch.Size([64, 256])
27 4 torch.Size([64, 256])
28 4 torch.Size([64, 256])
29 4 torch.Size([64, 256])
30 4 torch.Size([64, 256])
31 4 torch.Size([64, 256])
32 4 torch.Size([64, 256])
33 4 torch.Size([64, 256])
34 4 torch.Size([64, 256])
35 4 torch.Size([64, 256])
36 4 torch.Size([64, 256])
37 4 torch.

Exception ignored in: <function _MultiProcessingDataLoaderIter.__del__ at 0x7fab1d08f160>
Traceback (most recent call last):
  File "/usr/local/lib/python3.8/dist-packages/torch/utils/data/dataloader.py", line 1466, in __del__
    self._shutdown_workers()
Exception ignored in:   File "/usr/local/lib/python3.8/dist-packages/torch/utils/data/dataloader.py", line 1449, in _shutdown_workers
    if w.is_alive():<function _MultiProcessingDataLoaderIter.__del__ at 0x7fab1d08f160>

Traceback (most recent call last):
  File "/usr/local/lib/python3.8/dist-packages/torch/utils/data/dataloader.py", line 1466, in __del__
  File "/usr/lib/python3.8/multiprocessing/process.py", line 160, in is_alive
        self._shutdown_workers()
  File "/usr/local/lib/python3.8/dist-packages/torch/utils/data/dataloader.py", line 1449, in _shutdown_workers
    if w.is_alive():assert self._parent_pid == os.getpid(), 'can only test a child process'
  File "/usr/lib/python3.8/multiprocessing/process.py", line 160, in 

172 4 torch.Size([64, 256])
173 4 torch.Size([64, 256])
174 4 torch.Size([64, 256])
175 4 torch.Size([64, 256])
176 4 torch.Size([64, 256])
177 4 torch.Size([64, 256])
178 4 torch.Size([64, 256])
179 4 torch.Size([64, 256])
180 4 torch.Size([64, 256])
181 4 torch.Size([64, 256])
182 4 torch.Size([64, 256])
183 4 torch.Size([64, 256])
184 4 torch.Size([64, 256])
185 4 torch.Size([64, 256])
186 4 torch.Size([64, 256])
187 4 torch.Size([64, 256])
188 4 torch.Size([64, 256])
189 4 torch.Size([64, 256])
190 4 torch.Size([64, 256])
191 4 torch.Size([64, 256])
192 4 torch.Size([64, 256])
193 4 torch.Size([64, 256])
194 4 torch.Size([64, 256])
195 4 torch.Size([64, 256])
196 4 torch.Size([64, 256])
197 4 torch.Size([64, 256])
198 4 torch.Size([64, 256])
199 4 torch.Size([64, 256])
200 4 torch.Size([64, 256])
201 4 torch.Size([64, 256])
202 4 torch.Size([64, 256])
203 4 torch.Size([64, 256])
204 4 torch.Size([64, 256])
205 4 torch.Size([64, 256])
206 4 torch.Size([64, 256])
207 4 torch.Size([64

RuntimeError: ignored

In [None]:
# Shape of every tensor in a single training sample (position 311).
sample_311 = train_data[311]
[component.shape for component in sample_311]

[torch.Size([256]), torch.Size([256]), torch.Size([256]), torch.Size([])]

## Define the Model

In [None]:
import torch

# Prefer the GPU whenever CUDA is usable, otherwise fall back to the CPU.
has_cuda = torch.cuda.is_available()
device = torch.device("cuda" if has_cuda else "cpu")

if not has_cuda:
    print('No GPU available, using the CPU instead.')
else:
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('GPU is:', torch.cuda.get_device_name(0))

There are 1 GPU(s) available.
GPU is: Tesla T4


In [None]:
from transformers import GPT2Config
from modeling_gpt2 import GPT2ForSequenceClassification

# Configure a GPT-2 sequence classifier on top of the pretrained MODEL_TYPE
# checkpoint (the model class comes from the local modeling_gpt2 module).
config = GPT2Config.from_pretrained(
    MODEL_TYPE,
    # One output per sentiment class. Taken from the label map rather than
    # from the labels that happened to appear in one sampled batch, which
    # would under-count if a class were missing from that batch.
    num_labels = len(sentiment_to_num),
    output_attentions = False,
    output_hidden_states = False,
    # Only the first 5 transformer blocks are kept; the checkpoint weights
    # of blocks h.5 .. h.11 are reported as unused when loading.
    num_hidden_layers = 5,
    # NOTE(review): the stock gpt2 checkpoint was trained with 12 heads —
    # confirm that re-splitting its attention weights into 8 is intended.
    num_attention_heads = 8,
    # GPT2Config names its dropout rates resid_pdrop / embd_pdrop /
    # attn_pdrop. The BERT-style names hidden_dropout_prob and
    # attention_probs_dropout_prob are not GPT-2 config fields, so with
    # them the model kept its default p=0.1 instead of the intended 0.2.
    # (Assumes the local modeling_gpt2 reads the standard config fields.)
    resid_pdrop = 0.2,
    embd_pdrop = 0.2,
    attn_pdrop = 0.2,
    # NOTE(review): ignore_mismatched_sizes is normally an argument of the
    # model's from_pretrained — confirm it has any effect on the config.
    ignore_mismatched_sizes=True,
    bos_token_id = tokenizer.bos_token_id,
    pad_token_id = tokenizer.pad_token_id,
    eos_token_id = tokenizer.eos_token_id,
    sep_token_id = tokenizer.sep_token_id
    )

model = GPT2ForSequenceClassification.from_pretrained(
    MODEL_TYPE,
    config=config
)

# Pad on the left from here on, and grow the embedding matrix to cover the
# tokens that were added to the tokenizer.
tokenizer.padding_side = "left"
model.resize_token_embeddings(len(tokenizer))

# Send the model to the device.
model.to(device)

Some weights of the model checkpoint at gpt2 were not used when initializing GPT2ForSequenceClassification: ['h.5.mlp.c_fc.weight', 'h.7.mlp.c_proj.weight', 'h.7.mlp.c_proj.bias', 'h.7.attn.c_attn.bias', 'h.5.ln_1.bias', 'h.5.attn.c_attn.weight', 'h.7.mlp.c_fc.weight', 'h.8.mlp.c_proj.bias', 'h.10.mlp.c_proj.weight', 'h.11.ln_2.weight', 'h.7.ln_2.weight', 'h.8.mlp.c_fc.weight', 'h.5.ln_2.weight', 'h.11.mlp.c_proj.bias', 'h.11.ln_1.weight', 'h.7.ln_1.bias', 'h.11.attn.c_attn.weight', 'h.6.ln_1.bias', 'h.10.mlp.c_fc.bias', 'h.6.attn.c_proj.weight', 'h.5.mlp.c_proj.weight', 'h.6.attn.c_attn.bias', 'h.10.ln_2.weight', 'h.5.ln_1.weight', 'h.7.mlp.c_fc.bias', 'h.9.mlp.c_proj.bias', 'h.7.attn.c_proj.weight', 'h.10.ln_1.bias', 'h.10.attn.c_attn.weight', 'h.6.mlp.c_fc.weight', 'h.11.mlp.c_fc.weight', 'h.11.attn.c_attn.bias', 'h.10.attn.c_proj.weight', 'h.5.mlp.c_proj.bias', 'h.10.attn.bias', 'h.11.attn.c_proj.bias', 'h.5.ln_2.bias', 'h.11.ln_2.bias', 'h.9.mlp.c_fc.bias', 'h.8.attn.c_proj.weight

GPT2ForSequenceClassification(
  (transformer): GPT2Model(
    (wte): Embedding(50261, 768)
    (wpe): Embedding(1024, 768)
    (wle): Embedding(50257, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0): GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
      (1): GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1

In [None]:
# torch.optim.AdamW replaces the deprecated transformers.AdamW.
# weight_decay=0.0 keeps the no-decay default of the transformers
# implementation (the torch default would be 0.01).
optimizer = optim.AdamW(
  model.parameters(),
  lr = L_RATE,
  eps = 1e-8,
  weight_decay = 0.0
)

## Train the Model

In [None]:
# Mount Google Drive at /content/drive; the training cell writes its
# checkpoint to /content/drive/MyDrive/model.bin.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# with open ('/content/drive/MyDrive/model.bin', 'rb') as fp:
#   states = torch.load(fp)

#   curr_epoch = states['epoch']
#   model.load_state_dict(states['model'])
#   optimizer.load_state_dict(states['optimizer'])
#   fold_val_acc_list = states['losses']

#   del states

# curr_epoch

In [None]:
# initial settings

# Epoch index at which the training loop starts.
curr_epoch = 0

# Validation-accuracy history, one inner list per fold model:
# [[model_0 scores], [model_1 scores], [model_2 scores], [model_3 scores], [model_4 scores]]
# where every inner list is [epoch 1, epoch 2, ...].
# The number of inner lists equals NUM_FOLDS; they all start out empty.
fold_val_acc_list = [[] for _ in range(0, NUM_FOLDS)]

In [None]:
%%time

import pickle
from tqdm.auto import tqdm

# Set a seed value.
seed_val = 1024

random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)
    

# For each epoch...
for epoch in range(curr_epoch, NUM_EPOCHS):
    
    print("\nNum folds used for training:", NUM_FOLDS_TO_TRAIN)
    print('======== Epoch {:} / {:} ========'.format(epoch + 1, NUM_EPOCHS))
    
    # Get the number of folds
    num_folds = len(train_df_list)

    # For this epoch, store the val acc scores for each fold in this list.
    # We will use this list to calculate the cv at the end of the epoch.
    epoch_acc_scores_list = []
    
    # For each fold...
    for fold_index in range(1, NUM_FOLDS_TO_TRAIN):
        
        print('\n== Fold Model', fold_index)

        # ========================================
        #               Training
        # ========================================
        
        stacked_val_labels = []
        targets_list = []

        print('Training...')

        progress_bar_train =  tqdm(range(len(train_dataloader)))

        # put the model into train mode
        model.train()

        # This turns gradient calculations on and off.
        torch.set_grad_enabled(True)


        # Reset the total loss for this epoch.
        total_train_loss = 0

        for i, batch in enumerate(train_dataloader):

            train_status = 'Batch ' + str(i+1) + ' of ' + str(len(train_dataloader))

            print(train_status, end='\r')


            b_input_ids = batch[0].to(device)
            b_input_mask = batch[1].to(device)
            b_language_ids = batch[2].to(device)
            b_labels = batch[3].to(device)

            model.zero_grad()        


            outputs = model(b_input_ids, 
                        attention_mask=b_input_mask,
                        language_ids=b_language_ids,
                        labels=b_labels)
            
            progress_bar_train.update(1)

            # Get the loss from the outputs tuple: (loss, logits)
            loss = outputs[0]

            # Convert the loss from a torch tensor to a number.
            # Calculate the total loss.
            total_train_loss = total_train_loss + loss.item()

            # Zero the gradients
            optimizer.zero_grad()

            # Perform a backward pass to calculate the gradients.
            loss.backward()
            
            # Clip the norm of the gradients to 1.0.
            # This is to help prevent the "exploding gradients" problem.
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

            # Use the optimizer to update Weights
            
            # Optimizer for GPU
            optimizer.step() 
            

        print('Train loss:' ,total_train_loss)


        # ========================================
        #               Validation
        # ========================================

        print('\nValidation...')

        progress_bar_eval =  tqdm(range(len(val_dataloader)))

        # Put the model in evaluation mode.
        model.eval()

        # Turn off the gradient calculations.
        # This tells the model not to compute or store gradients.
        # This step saves memory and speeds up validation.
        torch.set_grad_enabled(False)


        # Reset the total loss for this epoch.
        total_val_loss = 0


        for j, val_batch in enumerate(val_dataloader):

            val_status = 'Batch ' + str(j+1) + ' of ' + str(len(val_dataloader))

            print(val_status, end='\r')

            b_input_ids = val_batch[0].to(device)
            b_input_mask = val_batch[1].to(device)
            b_language_ids = batch[2].to(device)
            b_labels = val_batch[3].to(device)


            outputs = model(b_input_ids, 
                    attention_mask=b_input_mask, 
                    language_ids=b_language_ids,
                    labels=b_labels)
            
            progress_bar_eval.update(1)

            # Get the loss from the outputs tuple: (loss, logits)
            loss = outputs[0]

            # Convert the loss from a torch tensor to a number.
            # Calculate the total loss.
            total_val_loss = total_val_loss + loss.item()

            # Get the preds
            preds = outputs[1]


            # Move preds to the CPU
            val_preds = preds.detach().cpu().numpy()

            # Move the labels to the cpu
            targets_np = b_labels.to('cpu').numpy()

            # Append the labels to a numpy list
            targets_list.extend(targets_np)

            if j == 0:  # first batch
                stacked_val_preds = val_preds

            else:
                stacked_val_preds = np.vstack((stacked_val_preds, val_preds))
                
                
        # .........................................
        # Calculate the val accuracy for this fold
        # .........................................      


        # Calculate the validation accuracy
        y_true = targets_list
        y_pred = np.argmax(stacked_val_preds, axis=1)

        val_acc = accuracy_score(y_true, y_pred)
        
        
        epoch_acc_scores_list.append(val_acc)


        print('Val loss:' ,total_val_loss)
        print('Val acc: ', val_acc)
        
        
        # .........................
        # Save the best model
        # .........................
        
        if epoch == 0:
            
            # Save the Model
            torch.save({
                'epoch': epoch,
                'model': model.state_dict(),
                'optimizer': optimizer.state_dict(),
                'losses': fold_val_acc_list
            }, '/content/drive/MyDrive/model.bin')
            print('Saved model.')
            
        if epoch != 0:
        
            val_acc_list = fold_val_acc_list[fold_index]
            best_val_acc = max(val_acc_list)
            
            if val_acc > best_val_acc:
                # save the model
                torch.save({
                    'epoch': epoch,
                    'model': model.state_dict(),
                    'optimizer': optimizer.state_dict(),
                    'losses': fold_val_acc_list
                }, '/content/drive/MyDrive/model.bin')
                print('Val acc improved. Saved model')
                
                
        # .....................................
        # Save the val_acc for this fold model
        # .....................................
        
        # Note: Don't do this before the above 'Save Model' code or 
        # the save model code won't work. This is because the best_val_acc will
        # become current val accuracy.
                
        # fold_val_acc_list is a list of lists.
        # Each fold model has it's own list corresponding to the fold index.
        # Here we choose a list corresponding to the fold number and append the acc score to that list.
        fold_val_acc_list[fold_index].append(val_acc)
        
            

        # Use the garbage collector to save memory.
        gc.collect()
        
    # .............................................................
    # Calculate the CV accuracy score over all folds in this epoch
    # .............................................................   
        
        
    # Print the average val accuracy for all 5 folds
    cv_acc = sum(epoch_acc_scores_list)/NUM_FOLDS_TO_TRAIN
    print("\nCV Acc:", cv_acc)


Num folds used for training: 5

== Fold Model 1
Training...


  0%|          | 0/1500 [00:00<?, ?it/s]



RuntimeError: ignored

In [None]:
# Attention mask of the first sequence in the batch that the training cell
# most recently moved to the device.
b_input_mask[0]

tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], device='cuda:0')