# About the Notebook

This notebook is about creating a classification model to evaluate the performance of text preprocess technique. 

In [None]:
!pip install transformers

In [2]:
import tensorflow as tf
import torch

# Get the GPU device name.
device_name = tf.test.gpu_device_name()

# The device name should look like the following:
if device_name == '/device:GPU:0':
    print('Found GPU at: {}'.format(device_name))
else:
    raise SystemError('GPU device not found')

#To confirm that we are using GPU for the training later

# If there's a GPU available...
if torch.cuda.is_available():    

    # Tell PyTorch to use the GPU.    
    device = torch.device("cuda")

    print('There are %d GPU(s) available.' % torch.cuda.device_count())

    print('We will use the GPU:', torch.cuda.get_device_name(0))

# If not...
else:
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")

Found GPU at: /device:GPU:0
There are 1 GPU(s) available.
We will use the GPU: Tesla V100-SXM2-16GB


In [3]:
import numpy as np
import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModel
import re
import nltk
#from sentence_transformers import SentenceTransformer
import datetime
from scipy.spatial.distance import cosine
from sklearn.model_selection import KFold
import random
import time
from torch import nn
from torch.utils.data import TensorDataset, random_split,DataLoader, RandomSampler, SequentialSampler
from transformers import BertForTokenClassification, AdamW, BertConfig, BertForSequenceClassification, get_linear_schedule_with_warmup
import os
from transformers import AutoTokenizer, AutoModel
import torch
from transformers import RobertaTokenizer

tokenizer = AutoTokenizer.from_pretrained("allenai/longformer-base-4096")
model = AutoModel.from_pretrained("allenai/longformer-base-4096")
tokenizer.model_max_length = model.config.max_position_embeddings
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [4]:
text_file_path = '/content/text.txt'
source_file_path = '/content/now-samples-sources.txt'

## Import data

In [5]:
sources = pd.read_csv(source_file_path, sep="\t{1,2}", encoding="ISO-8859-1", skiprows=2, engine='python')
# rename columns
sources.columns = ["id", "n_words", "date", "country", "website", "url", "title"]

# date column -> pandas.DateTime"
sources["date"] = pd.to_datetime(sources["date"], format="%y-%m-%d")

pd.set_option('display.max_colwidth', 40 )

with open(text_file_path, "r") as f:
    text = pd.DataFrame(
        [re.search("(\d+)\s(.*)", l[2:]).groups() for l in f.readlines() if l.startswith("@@")],
        columns=["id", "text"]     )
    f.close()

# id should be an integer
text["id"] = text["id"].astype(int)
#text['text'] = text.text.str.lower()

reports = sources.merge(text, on="id", how="outer")
reports["year"] = reports["date"].dt.strftime("%Y")
reports = reports[reports['text'].notna()]

## Basic Preprocessing

The below function will clean the html codes and some special characters.

In [6]:
import html
def preprocess(doc):
  doc = html.unescape(doc)
  doc = doc.replace('{', '')
  doc = doc.replace('}', '')
  doc = doc.replace("\n", '')
  doc = doc.rstrip("\n") #remove empty lines
  doc = doc.replace("@ @ @ @ @ @ @ @ @ @ ", '')
  doc = doc.replace(" @", '')
  doc = doc.replace(" '", "'")
  doc = doc.replace("\"", "")
  doc = doc.replace(",", "")
  doc = doc.replace("(", "")
  doc = doc.replace(")", "")
  doc = doc.replace(" <p>", ".")
  doc = doc.replace(" <h>", ".")
  doc = doc.replace("<p>", "")
  doc = doc.replace("<h>", "")
  doc = doc.replace('<', '')
  doc = doc.replace('>', '')
  doc = doc.replace(":", "")
  #doc = doc.replace("?", ".")
  #doc = doc.replace("!", ".")
  doc = doc.replace(" ?", "?")
  doc = doc.replace(" !", "!")
  doc = doc.replace(r"\.\s[\.\s]+", ". ") #converting . . to .
  doc = doc.replace(r"\.+", ".") #converting ... to .
  doc = doc.replace("--", "") 
  doc = doc.replace("-", " ")
  doc = doc.replace(" +", " ")
  doc = doc.replace(" n't", "n't")
  doc = doc.replace(" ..", ".")
  doc = doc.replace("..", ".")
  doc = doc.replace("  ", " ")
  doc = doc.replace(" .", ".")
  return doc

## Create Random sampled data for manual cleaning

In [7]:
#Randomly Sample 100 reports for the cleaning
reports['text'] = reports['text'].apply(preprocess)
reports = reports.dropna()
reports['text'] = reports['text'].apply(nltk.sent_tokenize)
rand_index = np.random.randint(0, high = len(reports), size = 100)
report_to_clean = reports.iloc[rand_index, :]
report_to_clean.to_csv('report_to_clean.csv')

In [8]:
#Import the cleaned data (cleaning done by human)
report_cleaned = pd.read_csv('/content/clean_report.csv', encoding="ISO-8859-1" )
report_cleaned = report_cleaned.iloc[:,3:]
noise_data_base = report_cleaned['text']
clean_report = report_cleaned.copy()

In [9]:
#EDA Part
#from easy_data_augmentation import *#Reference: https://github.com/jasonwei20/eda_nlp
'''
With the EDA, we will create total of 500 data with label 'Clean' (400 generation from EDA)
'''
from easy_data_augmentation import *

df_dict = {"id": [], 'n_words':[], 'date':[], "website": [], "url":[], "title": [],"text":[], "year":[],}
for i in range(len(report_cleaned)):
  text = report_cleaned['text'][i]
  for j in range(4):
    tok_sent = nltk.sent_tokenize(text)
    for k in range(len(tok_sent)):
      alpha = 0.1 + j * 0.1
      try:
        tok_sent[k] = eda_for_one_sentence(tok_sent[k], alpha_sr = alpha, alpha_ri = alpha, alpha_rs = alpha, p_rd = alpha)
      except:
        pass
    df_dict['id'].append(report_cleaned['id'][i])
    df_dict['url'].append(report_cleaned['url'][i])
    df_dict['title'].append(report_cleaned['title'][i])
    df_dict['website'].append(report_cleaned['website'][i])
    df_dict['text'].append(' '.join(tok_sent))
    df_dict['date'].append(report_cleaned['date'][i])
    df_dict['n_words'].append(report_cleaned['n_words'][i])
    df_dict['year'].append(report_cleaned['year'][i])
    ret_text = ' '.join(tok_sent)


df_append = pd.DataFrame.from_dict(df_dict)
df_append

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Unnamed: 0,id,n_words,date,website,url,title,text,year
0,3071244,761,2015-02-27,The Guardian,http://www.theguardian.com/sport/201...,UCI moves to revoke licence of Vince...,in an unprecedented move inward the ...,2015
1,3071244,761,2015-02-27,The Guardian,http://www.theguardian.com/sport/201...,UCI moves to revoke licence of Vince...,in licence an act human beings gover...,2015
2,3071244,761,2015-02-27,The Guardian,http://www.theguardian.com/sport/201...,UCI moves to revoke licence of Vince...,put forward hold unprecedented move ...,2015
3,3071244,761,2015-02-27,The Guardian,http://www.theguardian.com/sport/201...,UCI moves to revoke licence of Vince...,in jacques incite of duty for bespea...,2015
4,4621241,263,2014-02-24,The Star Online,http://www.thestar.com.my/news/natio...,Cyber-bullying reports up 55.6% in 2013,cyber intimidation reports 6 up 201...,2014
...,...,...,...,...,...,...,...,...
395,1721243,495,2010-11-20,Telegraph.co.uk,http://www.telegraph.co.uk/finance/f...,IMF's Dominique Strauss-Kahn wants f...,fund likely make do study torso on i...,2010
396,1681243,836,2010-10-21,Toronto Star,http://www.thestar.com/news/gta/2010...,James: Mississauga needs watchdog 'g...,mccallion hasnt campaigned hazel thi...,2010
397,1681243,836,2010-10-21,Toronto Star,http://www.thestar.com/news/gta/2010...,James: Mississauga needs watchdog 'g...,pomaderris this mccallion hasnt camp...,2010
398,1681243,836,2010-10-21,Toronto Star,http://www.thestar.com/news/gta/2010...,James: Mississauga needs watchdog 'g...,filbert if always crusade this decad...,2010


In [10]:
report_cleaned = report_cleaned.append(df_append)
report_cleaned.index = np.arange(0, len(report_cleaned))
report_cleaned

Unnamed: 0,id,n_words,date,country,website,url,title,text,year
0,3071244,761,2015-02-27,GB,The Guardian,http://www.theguardian.com/sport/201...,UCI moves to revoke licence of Vince...,In an unprecedented move the world ...,2015
1,4621241,263,2014-02-24,MY,The Star Online,http://www.thestar.com.my/news/natio...,Cyber-bullying reports up 55.6% in 2013,Cyber bullying reports up 55.6% in 2...,2014
2,4041244,200,2011-03-08,GB,Express.co.uk,http://www.express.co.uk/celebrity-n...,Klum's underwear gift was part of pl...,Klum's underwear gift was part of p...,2011
3,8901244,412,2016-05-18,ZA,Citizen,http://www.citizen.co.za/1121347/inj...,Injured Jobodwana retains focus on O...,Injured Jobodwana retains focus on ...,2016
4,2781242,382,2014-09-15,ZA,SuperSport,http://www.supersport.com/cricket/do...,Morkel out of action for at least te...,Cricket Domestic Cricket. Morkel ou...,2014
...,...,...,...,...,...,...,...,...,...
495,1721243,495,2010-11-20,,Telegraph.co.uk,http://www.telegraph.co.uk/finance/f...,IMF's Dominique Strauss-Kahn wants f...,fund likely make do study torso on i...,2010
496,1681243,836,2010-10-21,,Toronto Star,http://www.thestar.com/news/gta/2010...,James: Mississauga needs watchdog 'g...,mccallion hasnt campaigned hazel thi...,2010
497,1681243,836,2010-10-21,,Toronto Star,http://www.thestar.com/news/gta/2010...,James: Mississauga needs watchdog 'g...,pomaderris this mccallion hasnt camp...,2010
498,1681243,836,2010-10-21,,Toronto Star,http://www.thestar.com/news/gta/2010...,James: Mississauga needs watchdog 'g...,filbert if always crusade this decad...,2010


In [11]:
from transformers import *

#Tokenize the text (sent_tok), and create text that will be less than 512 tokens
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)


In [12]:
test = report_cleaned.copy()
text_for_preprocess = []
for i in range(len(test)):
  text = test['text'][i]
  sent_tok = nltk.sent_tokenize(text)
  text_divided = []
  tok_length = 0
  text_temp = ''
  for sent in sent_tok:
    length_tokens = len(tokenizer.tokenize(sent))
    tok_length += length_tokens
    if tok_length < 400:
      text_temp += ' ' + sent
    else:
      text_divided.append(text_temp)
      text_temp = sent
      tok_length = length_tokens
  if text_temp != '':
    text_divided.append(text_temp)
  text_for_preprocess.append(text_divided)

In [13]:
flat_list_clean = [item for sublist in text_for_preprocess for item in sublist]

In [14]:
length = []
for i in range(len(flat_list_clean)):
  length.append(len(tokenizer.tokenize(flat_list_clean[i])))

length = pd.DataFrame(length)
length.describe()

Unnamed: 0,0
count,938.0
mean,292.801706
std,110.532277
min,13.0
25%,215.0
50%,352.0
75%,385.75
max,399.0


## Add Noise to the text data

The Noise data is the sentences that we have deleted. We will going to create the noise data with changing the some part of the sentences as the same method as EDA, and just put them into clean data, then label them as noise data.

In [15]:
'''
With the Noise, we will create total of 500 data with label 'Noise'
'''
Noise_example = [
                'footballisfun saysMay 30 2012 837 PM.', 
                 'Baha all comments on this thread about the jags moving are now proven to be moronic.', 
                 'Fact!', 
                 'read latest article ; Hahahaha GO jAGS!',
                 'Get daily news by email.',
                 'Invalid e mailThanks for subscribing!',
                 'Could not subscribe try again later.',
                 'COMMENT DISCLAIMER Reader comments posted on this Web site are not in any way endorsed by The Standard.', 
                 'Comments are views by thestandard.ph readers who exercise their right to free position or viewpoint of thestandard.ph.', 
                 "While reserving this publication's right to delete comments that are deemed offensive indecent or inconsistent with The Standard editorial standards The Standard may not be held liable for any false information posted by readers in this comments section.",
                 'Add New Comment.', 
                 '3 Comments.', 
                 'We believe that many of the attributes you wisely advocate can be learned and mastered.',
                 'Perhaps you and your readers may find our related research organizing principles and tools to be of relevance to your important content see the latter two links the last being a related white paper.',
                 'Well this is related http **50;0;TOOLONG from the panel discussion Empowered Communities The Art and Science of Building Networks from Nov 20th 2009 at 1000 AM PT at the Monterey Institute of International Studies in Monterey CA.',
                 'What can we do in the context of relationships?',
                 'Thank you.',
                 ' Share this.',
                 'From Around the Web.', 
                 'More From The Times of India.', 
                 'Recommended By Colombia.', 
                 'Comments.', 
                 'Characters Remaining 3000.',
                 'OR PROCEED.', 
                 'FacebookGoogleEmail.', 
                 'Refrain from posting comments that are obscene defamatory or inflammatory and do not indulge in personal attacks name calling or inciting hatred against any community.', 
                 'Help us delete comments that do not follow these guidelines by marking them offensive.', 
                 "Let's work together to keep the conversation civil.",
                 "Subscribe us on Youtube.",
                 "Follow us on Instagram.",
                 "Follow us on Twitter.",
                 'TNN Jun 2 2015 09.01 PM IST.',
                 'Aditya Singh/AFP/Getty Images.',
                 'Getty Images.',
                 'Sponsored.',
                 'Advert.',
                 "Only Buchanan's commentaries in your email.", 
                 "BONUS By signing up for Pat Buchanan's weekly alerts you will also be signed up for news and special offers from WND via email.", 
                 'Name*.', 
                 'FirstLast.', 
                 'Email*.', 
                 'Where we will email your daily updates.', 
                 'Postal code*.', 
                 'A valid zip code or postal code is required.', 
                 "Click the button below to sign up for Pat Buchanan's commentaries by email and keep up to date with special offers from WND.", 
                 'You may change your email preferences at any time.',
                 'We encourage but we ask you to follow our guidelines for respecting community standards.', 
                 'Personal attacks inappropriate language and off topic comments may be removed and comment privileges revoked per our Terms of Use.', 
                 'Please see our FAQ if you have questions or concerns about using Facebook to comment.',
                 'Read more.',
                 'Disclaimer.', 
                 'You understand and agree that no content published on the Site constitutes a recommendation that any particular security portfolio of securities transaction or investment strategy is suitable or advisable for any specific person.', 
                 'You further understand that none of the information providers or their affiliates will advise you personally concerning the nature potential advisability value or suitability of any particular stock share security portfolio of securities transaction investment strategy or other matter.', 
                 'We openly disclose that we and our contributors may have interests in investments and/or providers of services referred to within the website and that we receive remuneration from certain of the companies referred to on this website.'
  ]
eda_noise_sen = []

for sen in Noise_example:
  for i in range(6):
    alpha = 0.1 + i * 0.1
    aug_sent = eda_for_one_sentence(sen, alpha_sr = alpha, alpha_ri = alpha, alpha_rs = alpha, p_rd = alpha)
    eda_noise_sen.append(aug_sent)
eda_noise_sen = pd.DataFrame(eda_noise_sen)

In [18]:
#Example of EDA sentence
eda_noise_sen

Unnamed: 0,0
0,footballisfun 30 saysmay 2012 837 p...
1,footballisfun saysmay pm 2012.
2,footballisfun xxx 837 pm.
3,footballisfun rector saysmay ground ...
4,postmortem xxx 2012 prove testing sa...
...,...
319,we discover of we and our company su...
320,we openly that along inward contribu...
321,matter to openly make for web inside...
322,genus crataegus this upward we indis...


In [20]:
#Create noise data by inserting the noise text withint the context
noise_data = []
noise_sentences = eda_noise_sen.iloc[:,0].values
for i in range(len(clean_report)):
  text = clean_report['text'][i]
  for j in range(5):
    alpha =  j * 0.1
    sent_tok = nltk.sent_tokenize(text)
    tok_length = 0
    text_temp = ''
    text_divided = []
    for k in range(len(sent_tok)):
      if alpha > 0:
        sent_tok[k] = eda_for_one_sentence(sent_tok[k], alpha_sr = alpha, alpha_ri = alpha, alpha_rs = alpha, p_rd = alpha)
      length_tokens = len(tokenizer.tokenize(sent_tok[k]))
      tok_length += length_tokens
      if tok_length < 370:
        text_temp += ' ' + sent_tok[k]
      else:
        subsen_tok = nltk.sent_tokenize(text_temp)
        num_sent_to_add = np.random.randint(low = 1, high = 3)
        sent_index = np.random.randint(low = 0, high = len(eda_noise_sen), size = num_sent_to_add)
        place_to_insert = list(np.random.randint(low = 0, high = len(subsen_tok), size = num_sent_to_add))
        place_to_insert.sort()
        place_to_insert.reverse()
        for index in range(len(sent_index)):
          subsen_tok.insert(place_to_insert[index], noise_sentences[index])
        text_temp = ' '.join(subsen_tok)
        text_divided.append(text_temp)
        text_temp = sent_tok[k]
        tok_length = length_tokens
    if text_temp != '':
      text_divided.append(text_temp)
    noise_data.append(text_divided)

In [21]:
flat_list_noise = [item for sublist in noise_data for item in sublist]
length = []
for i in range(len(flat_list_noise)):
  length.append(len(tokenizer.tokenize(flat_list_noise[i])))

length = pd.DataFrame(length)
length.describe()

Unnamed: 0,0
count,988.0
mean,287.40081
std,106.667809
min,9.0
25%,219.0
50%,344.0
75%,375.0
max,392.0


In [22]:
print('We have ', len(flat_list_clean), ' number of clean data')
print('We have ', len(flat_list_noise), ' number of noise data')

We have  938  number of clean data
We have  988  number of noise data


In [23]:
label = [0] * len(flat_list_clean) + [1] * len(flat_list_noise)
data = flat_list_clean + flat_list_noise
dataset = pd.DataFrame(data, columns = ['data'],)
dataset['label'] = label
dataset

Unnamed: 0,data,label
0,In an unprecedented move the world...,0
1,The Italian media alleged in Decembe...,0
2,Speaking to reporters at the world t...,0
3,Cyber bullying reports up 55.6% in ...,0
4,Klum's underwear gift was part of ...,0
...,...,...
1921,its a mccallion wallpaper wild fake ...,1
1922,that is gamy nation was equal alread...,1
1923,effort difficult tree hasnt crusade...,1
1924,what not real number locution ace ad...,1


## Model for classification

In [26]:
#===============================================================================
# The preprocess code for the BERT
# The following code is from https://mccormickml.com/2019/07/22/BERT-fine-tuning/
#===============================================================================

MAX_LEN = 512


def preprocessing_for_bert(data):
    """Perform required preprocessing steps for pretrained BERT.
    @param    data (np.array): Array of texts to be processed.
    @return   input_ids (torch.Tensor): Tensor of token ids to be fed to a model.
    @return   attention_masks (torch.Tensor): Tensor of indices specifying which
                  tokens should be attended to by the model.
    """
    # Create empty lists to store outputs
    input_ids = []
    attention_masks = []

    # For every sentence...
    for sent in data:
        # `encode_plus` will:
        #    (1) Tokenize the sentence
        #    (2) Add the `[CLS]` and `[SEP]` token to the start and end
        #    (3) Truncate/Pad sentence to max length
        #    (4) Map tokens to their IDs
        #    (5) Create attention mask
        #    (6) Return a dictionary of outputs
        encoded_sent = tokenizer.encode_plus(
            sent,  # Preprocess sentence
            add_special_tokens=True,        # Add `[CLS]` and `[SEP]`
            max_length=MAX_LEN,                  # Max length to truncate/pad
            pad_to_max_length=True,         # Pad sentence to max length
            #return_tensors='pt',           # Return PyTorch tensor
            return_attention_mask=True,      # Return attention mask
            truncation = True
            )
        
        # Add the outputs to the lists
        input_ids.append(encoded_sent.get('input_ids'))
        attention_masks.append(encoded_sent.get('attention_mask'))

    # Convert lists to tensors
    input_ids = torch.tensor(input_ids)
    attention_masks = torch.tensor(attention_masks)

    return input_ids, attention_masks

In [27]:
#===============================================================================
# Declare the BERT Classifier for later train and test purpose
# if one wants to change the model, one can modify the below section
#===============================================================================

%%time
import torch
import torch.nn as nn
from transformers import BertModel

# Create the BertClassfier class
class BertClassifier(nn.Module):
    """Bert Model for Classification Tasks.
    """
    def __init__(self, freeze_bert=False, layers_to_freeze = []):
        """
        @param    bert: a BertModel object
        @param    classifier: a torch.nn.Module classifier
        @param    freeze_bert (bool): Set `False` to fine-tune the BERT model
        """
        super(BertClassifier, self).__init__()
        # Specify hidden size of BERT, hidden size of our classifier, and number of labels
        D_in, H, D_out = 768, 100, 3

        # Instantiate BERT model
        self.bert = BertModel.from_pretrained('bert-base-uncased')

        # Instantiate an one-layer feed-forward classifier
        self.classifier = nn.Sequential(
            nn.Linear(D_in, H),
            nn.ReLU(),
            nn.Dropout(0.1),
            nn.Linear(H, D_out)
        )

        # Freeze the BERT model
        if freeze_bert:
            for param in self.bert.parameters():
                param.requires_grad = False
        else:
          if layers_to_freeze != []:
            for i in layers_to_freeze:
              for param in self.bert.encoder.layer[i].parameters():
                param.requires_grad = False
    def forward(self, input_ids, attention_mask):
        """
        Feed input to BERT and the classifier to compute logits.
        @param    input_ids (torch.Tensor): an input tensor with shape (batch_size,
                      max_length)
        @param    attention_mask (torch.Tensor): a tensor that hold attention mask
                      information with shape (batch_size, max_length)
        @return   logits (torch.Tensor): an output tensor with shape (batch_size,
                      num_labels)
        """
        # Feed input to BERT
        outputs = self.bert(input_ids=input_ids,
                            attention_mask=attention_mask)
        
        # Extract the last hidden state of the token `[CLS]` for classification task
        last_hidden_state_cls = outputs[0][:, 0, :]

        # Feed input to classifier to compute logits
        logits = self.classifier(last_hidden_state_cls)

        return logits

CPU times: user 45 µs, sys: 0 ns, total: 45 µs
Wall time: 56.3 µs


In [29]:
#===============================================================================
# Initialize model for the later train purpose
#===============================================================================

def initialize_model(epochs=4, layers_to_freeze = []):
    """Initialize the Bert Classifier, the optimizer and the learning rate scheduler.
    """
    os.environ['CUDA_LAUNCH_BLOCKING'] = "1"
    # Instantiate Bert Classifier
    bert_classifier = BertClassifier(freeze_bert=False, layers_to_freeze = layers_to_freeze)

    # Tell PyTorch to run the model on GPU
    bert_classifier.to(device)

    # Create the optimizer
    optimizer = AdamW(bert_classifier.parameters(),
                      lr=2e-5,    # Default learning rate
                      eps=1e-8    # Default epsilon value
                      )

    # Total number of training steps
    total_steps = len(train_dataloader) * epochs

    # Set up the learning rate scheduler
    scheduler = get_linear_schedule_with_warmup(optimizer,
                                                num_warmup_steps=0, # Default value
                                                num_training_steps=total_steps)
    return bert_classifier, optimizer, scheduler

In [30]:
#===============================================================================
# Train & Evaluate function 
#===============================================================================

import random
import time

# Specify loss function
loss_fn = nn.CrossEntropyLoss()

def set_seed(seed_value=42):
    """Set seed for reproducibility.
    """
    random.seed(seed_value)
    np.random.seed(seed_value)
    torch.manual_seed(seed_value)
    torch.cuda.manual_seed_all(seed_value)

def train(model, train_dataloader, val_dataloader=None, epochs=4, evaluation=False):
    """Train the BertClassifier model.
    """
    # Start training loop
    print("Start training...\n")
    for epoch_i in range(epochs):
        # =======================================
        #               Training
        # =======================================
        # Print the header of the result table
        print(f"{'Epoch':^7} | {'Batch':^7} | {'Train Loss':^12} | {'Val Loss':^10} | {'Val Acc':^9} | {'Elapsed':^9}")
        print("-"*70)

        # Measure the elapsed time of each epoch
        t0_epoch, t0_batch = time.time(), time.time()

        # Reset tracking variables at the beginning of each epoch
        total_loss, batch_loss, batch_counts = 0, 0, 0

        # Put the model into the training mode
        model.train()

        # For each batch of training data...
        for step, batch in enumerate(train_dataloader):
            batch_counts +=1
            # Load batch to GPU
            b_input_ids, b_attn_mask, b_labels = tuple(t.to(device) for t in batch)

            # Zero out any previously calculated gradients
            model.zero_grad()

            # Perform a forward pass. This will return logits.
            logits = model(b_input_ids, b_attn_mask)

            # Compute loss and accumulate the loss values
            loss = loss_fn(logits, b_labels)
            batch_loss += loss.item()
            total_loss += loss.item()

            # Perform a backward pass to calculate gradients
            loss.backward()

            # Clip the norm of the gradients to 1.0 to prevent "exploding gradients"
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

            # Update parameters and the learning rate
            optimizer.step()
            scheduler.step()

            # Print the loss values and time elapsed for every 20 batches
            if (step % 20 == 0 and step != 0) or (step == len(train_dataloader) - 1):
                # Calculate time elapsed for 20 batches
                time_elapsed = time.time() - t0_batch

                # Print training results
                print(f"{epoch_i + 1:^7} | {step:^7} | {batch_loss / batch_counts:^12.6f} | {'-':^10} | {'-':^9} | {time_elapsed:^9.2f}")

                # Reset batch tracking variables
                batch_loss, batch_counts = 0, 0
                t0_batch = time.time()

        # Calculate the average loss over the entire training data
        avg_train_loss = total_loss / len(train_dataloader)

        print("-"*70)
        # =======================================
        #               Evaluation
        # =======================================
        if evaluation == True:
            # After the completion of each training epoch, measure the model's performance
            # on our validation set.
            val_loss, val_accuracy = evaluate(model, val_dataloader)

            # Print performance over the entire training data
            time_elapsed = time.time() - t0_epoch
            
            print(f"{epoch_i + 1:^7} | {'-':^7} | {avg_train_loss:^12.6f} | {val_loss:^10.6f} | {val_accuracy:^9.2f} | {time_elapsed:^9.2f}")
            print("-"*70)
        print("\n")

    return float(val_loss), float(val_accuracy)
    print("Training complete!")


def evaluate(model, val_dataloader):
    """After the completion of each training epoch, measure the model's performance
    on our validation set.
    """
    # Put the model into the evaluation mode. The dropout layers are disabled during
    # the test time.
    model.eval()

    # Tracking variables
    val_accuracy = []
    val_loss = []

    # For each batch in our validation set...
    for batch in val_dataloader:
        # Load batch to GPU
        b_input_ids, b_attn_mask, b_labels = tuple(t.to(device) for t in batch)

        # Compute logits
        with torch.no_grad():
            logits = model(b_input_ids, b_attn_mask)

        # Compute loss
        loss = loss_fn(logits, b_labels)
        val_loss.append(loss.item())

        # Get the predictions
        preds = torch.argmax(logits, dim=1).flatten()

        # Calculate the accuracy rate
        accuracy = (preds == b_labels).cpu().numpy().mean() * 100
        val_accuracy.append(accuracy)

    # Compute the average accuracy and loss over the validation set.
    val_loss = np.mean(val_loss)
    val_accuracy = np.mean(val_accuracy)

    return val_loss, val_accuracy

In [31]:
#===============================================================================
# Part where we train using the 5-fold cv
#===============================================================================


from sklearn.model_selection import KFold
#train and val are indices
kf = KFold(n_splits=5, shuffle = True, random_state = 42)

batch_size = 16
train_loss = []
val_loss = []
val_accuracy = []
X = dataset['data']
y1 = dataset['label']
y1 = y1.astype(int)

for train_index, val_index in kf.split(X):
  #Data Preparation
  X_train = X[train_index]
  X_val = X[val_index]
  y1_train = y1[train_index]
  y1_val = y1[val_index]
  
  train_inputs, train_masks = preprocessing_for_bert(X_train)
  val_inputs, val_masks = preprocessing_for_bert(X_val)
  train_labels = torch.tensor(y1_train.values)
  val_labels = torch.tensor(y1_val.values)
  
  #Data Loader Class
  train_data = TensorDataset(train_inputs, train_masks, train_labels)
  train_sampler = RandomSampler(train_data)
  train_dataloader = DataLoader(train_data, sampler = train_sampler, batch_size = batch_size)

  val_data = TensorDataset(val_inputs, val_masks, val_labels)
  val_sampler = RandomSampler(val_data)
  val_dataloader = DataLoader(val_data, sampler = val_sampler, batch_size = batch_size)

  #Fine Tune and Evaluation
  set_seed(42)    # Set seed for reproducibility
  bert_classifier, optimizer, scheduler = initialize_model(epochs=2)
  val_loss1, val_accuracy1 = train(bert_classifier, train_dataloader, val_dataloader, epochs=2, evaluation=True)
  
  val_loss.append(val_loss1)
  val_accuracy.append(val_accuracy1)



HBox(children=(FloatProgress(value=0.0, description='Downloading', max=433.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=440473133.0, style=ProgressStyle(descri…


Start training...

 Epoch  |  Batch  |  Train Loss  |  Val Loss  |  Val Acc  |  Elapsed 
----------------------------------------------------------------------
   1    |   20    |   0.791206   |     -      |     -     |   10.51  
   1    |   40    |   0.709401   |     -      |     -     |   9.75   
   1    |   60    |   0.590000   |     -      |     -     |   9.83   
   1    |   80    |   0.586745   |     -      |     -     |   9.80   
   1    |   96    |   0.474932   |     -      |     -     |   7.50   
----------------------------------------------------------------------
   1    |    -    |   0.638527   |  0.479735  |   68.75   |   51.18  
----------------------------------------------------------------------


 Epoch  |  Batch  |  Train Loss  |  Val Loss  |  Val Acc  |  Elapsed 
----------------------------------------------------------------------
   2    |   20    |   0.453002   |     -      |     -     |   10.29  
   2    |   40    |   0.443626   |     -      |     -     |   9.