# MODEL TESTING

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import time
import pandas as pd
import pickle
import matplotlib.pyplot as plt
import nltk
import numpy as np
import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
import sklearn.model_selection
import sklearn.preprocessing as preproc
from sklearn.feature_extraction import text
import warnings
warnings.filterwarnings("ignore")
nltk.download('stopwords')
nltk.download('wordnet')
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
import random

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [None]:
# define the path
train_file_path = '/content/drive/MyDrive/Colab Notebooks/archive/train.csv'
test_file_path = '/content/drive/MyDrive/Colab Notebooks/archive/test.csv'

In [None]:
#load the data
data = pd.read_csv(train_file_path)
test = pd.read_csv(test_file_path)

In [None]:
len(data)

120000

In [None]:
len(test)

7600

In [None]:
data.head()

Unnamed: 0,Class Index,Title,Description
0,3,Wall St. Bears Claw Back Into the Black (Reuters),"Reuters - Short-sellers, Wall Street's dwindli..."
1,3,Carlyle Looks Toward Commercial Aerospace (Reu...,Reuters - Private investment firm Carlyle Grou...
2,3,Oil and Economy Cloud Stocks' Outlook (Reuters),Reuters - Soaring crude prices plus worries\ab...
3,3,Iraq Halts Oil Exports from Main Southern Pipe...,Reuters - Authorities have halted oil export\f...
4,3,"Oil prices soar to all-time record, posing new...","AFP - Tearaway world oil prices, toppling reco..."


## Multinomial Logistic Regression

In [None]:
# combine news titles with their content
x_train = data['Title']+" "+data['Description']
x_test = test['Title']+" "+test['Description']

In [None]:
# Shift the "Class Index" labels down by one so they become 0, 1, 2, 3.
# Vectorised subtraction yields the same array as a per-element lambda
# without running a Python-level function for every row.
y_train = (data['Class Index'] - 1).values
y_test = (test['Class Index'] - 1).values

In [None]:
contractions = { 
"ain't": "am not / are not / is not / has not / have not",
"aren't": "are not / am not",
"can't": "cannot",
"can't've": "cannot have",
"'cause": "because",
"could've": "could have",
"couldn't": "could not",
"couldn't've": "could not have",
"didn't": "did not",
"doesn't": "does not",
"don't": "do not",
"hadn't": "had not",
"hadn't've": "had not have",
"hasn't": "has not",
"haven't": "have not",
"he'd": "he had / he would",
"he'd've": "he would have",
"he'll": "he shall / he will",
"he'll've": "he shall have / he will have",
"he's": "he has / he is",
"how'd": "how did",
"how'd'y": "how do you",
"how'll": "how will",
"how's": "how has / how is / how does",
"I'd": "I had / I would",
"I'd've": "I would have",
"I'll": "I shall / I will",
"I'll've": "I shall have / I will have",
"I'm": "I am",
"I've": "I have",
"isn't": "is not",
"it'd": "it had / it would",
"it'd've": "it would have",
"it'll": "it shall / it will",
"it'll've": "it shall have / it will have",
"it's": "it has / it is",
"let's": "let us",
"ma'am": "madam",
"mayn't": "may not",
"might've": "might have",
"mightn't": "might not",
"mightn't've": "might not have",
"must've": "must have",
"mustn't": "must not",
"mustn't've": "must not have",
"needn't": "need not",
"needn't've": "need not have",
"o'clock": "of the clock",
"oughtn't": "ought not",
"oughtn't've": "ought not have",
"shan't": "shall not",
"sha'n't": "shall not",
"shan't've": "shall not have",
"she'd": "she had / she would",
"she'd've": "she would have",
"she'll": "she shall / she will",
"she'll've": "she shall have / she will have",
"she's": "she has / she is",
"should've": "should have",
"shouldn't": "should not",
"shouldn't've": "should not have",
"so've": "so have",
"so's": "so as / so is",
"that'd": "that would / that had",
"that'd've": "that would have",
"that's": "that has / that is",
"there'd": "there had / there would",
"there'd've": "there would have",
"there's": "there has / there is",
"they'd": "they had / they would",
"they'd've": "they would have",
"they'll": "they shall / they will",
"they'll've": "they shall have / they will have",
"they're": "they are",
"they've": "they have",
"to've": "to have",
"wasn't": "was not",
"we'd": "we had / we would",
"we'd've": "we would have",
"we'll": "we will",
"we'll've": "we will have",
"we're": "we are",
"we've": "we have",
"weren't": "were not",
"what'll": "what shall / what will",
"what'll've": "what shall have / what will have",
"what're": "what are",
"what's": "what has / what is",
"what've": "what have",
"when's": "when has / when is",
"when've": "when have",
"where'd": "where did",
"where's": "where has / where is",
"where've": "where have",
"who'll": "who shall / who will",
"who'll've": "who shall have / who will have",
"who's": "who has / who is",
"who've": "who have",
"why's": "why has / why is",
"why've": "why have",
"will've": "will have",
"won't": "will not",
"won't've": "will not have",
"would've": "would have",
"wouldn't": "would not",
"wouldn't've": "would not have",
"y'all": "you all",
"y'all'd": "you all would",
"y'all'd've": "you all would have",
"y'all're": "you all are",
"y'all've": "you all have",
"you'd": "you had / you would",
"you'd've": "you would have",
"you'll": "you shall / you will",
"you'll've": "you shall have / you will have",
"you're": "you are",
"you've": "you have"
}

In [None]:
def _english_stopwords():
    """Return the NLTK English stop-word set, loading it from the corpus only once."""
    cached = getattr(_english_stopwords, "_cache", None)
    if cached is None:
        cached = set(stopwords.words("english"))
        _english_stopwords._cache = cached
    return cached


def _expand_contraction(word):
    """Return the lower-cased expansion of `word`, or `word` unchanged if it is not in `contractions`."""
    # The caller lower-cases the text first, but some table keys are capitalised
    # ("I'm", "I've", ...), so the capitalised spelling is tried as a fallback.
    expansion = contractions.get(word)
    if expansion is None:
        expansion = contractions.get(word.capitalize())
    return word if expansion is None else expansion.lower()


def clean_text(text, remove_stopwords = True):
    """Normalise one raw document and return it as a list of tokens.

    Steps: lower-case, expand contractions found in the module-level
    `contractions` table, strip URLs / HTML remnants / punctuation, optionally
    drop English stop words, then tokenise with NLTK's WordPunctTokenizer.

    Args:
        text: raw document string.
        remove_stopwords: when True (default) English stop words are removed.

    Returns:
        List of token strings.
    """
    # Convert words to lower case
    text = text.lower()

    # Replace contractions with their longer forms.
    # split()/join also collapses every whitespace run (newlines included) to one space.
    text = " ".join(_expand_contraction(word) for word in text.split())

    # Format words and remove unwanted characters.
    # The URL pattern stops at the next whitespace: newlines were collapsed above,
    # so a greedy ".*" would delete everything that follows the first URL.
    text = re.sub(r'https?://\S+', '', text)
    text = re.sub(r'\<a href', ' ', text)
    text = re.sub(r'&amp;', '', text)
    text = re.sub(r'[_"\-;%()|+&=*%.,!?:#$@\[\]/]', ' ', text)
    text = re.sub(r'<br />', ' ', text)
    text = re.sub(r'\'', ' ', text)

    # remove stop words (the set is cached, not rebuilt for every document)
    if remove_stopwords:
        stops = _english_stopwords()
        text = " ".join(w for w in text.split() if w not in stops)

    # Tokenize each word
    return nltk.WordPunctTokenizer().tokenize(text)

In [None]:
# clean the texts
x_train = x_train.apply(clean_text)
x_test = x_test.apply(clean_text)

In [None]:
# Bag-of-words vectoriser over the documents already tokenised by clean_text.
# - the identity tokenizer makes CountVectorizer use each token list as-is
# - ngram_range is a tuple: recent scikit-learn releases reject a list here
# - token_pattern=None because the pattern is unused once a tokenizer is supplied
bow_transform = CountVectorizer(tokenizer=lambda doc: doc, ngram_range=(1, 3), lowercase=False, token_pattern=None)

# Fit the vocabulary on the training texts only, then reuse it for the test texts.
X_tr_bow = bow_transform.fit_transform(x_train)
X_te_bow = bow_transform.transform(x_test)

In [None]:
len(bow_transform.vocabulary_)

3718562

In [None]:
# Tf-idf re-weighting of the BoW counts. `text` here is the
# sklearn.feature_extraction.text module imported at the top of the notebook.
# norm=None leaves the re-weighted rows unnormalised.
tfidf_transform = text.TfidfTransformer(norm=None)

# Fit on the training BoW matrix, then apply the fitted transformer to the test matrix.
X_tr_tfidf = tfidf_transform.fit_transform(X_tr_bow)
X_te_tfidf = tfidf_transform.transform(X_te_bow)

In [None]:
# Multinomial logistic-regression helper: fit on the training split, report the test score.
def simple_logistic_classify(X_tr, y_tr, X_test, y_test, description, _C=1.0):
    """Fit a multinomial LogisticRegression and print its score on the test split.

    Args:
        X_tr, y_tr: training features and labels.
        X_test, y_test: held-out features and labels used for the printed score.
        description: feature-set name inserted into the printed message.
        _C: value passed to LogisticRegression as `C`.

    Returns:
        The fitted LogisticRegression instance.
    """
    clf = LogisticRegression(C=_C, multi_class="multinomial")
    clf = clf.fit(X_tr, y_tr)
    test_score = clf.score(X_test, y_test)
    print('Test Score with', description, 'features', test_score)
    return clf

In [None]:
# fit the model to BoW and Tf-idf features
model_bow = simple_logistic_classify(X_tr_bow, y_train, X_te_bow, y_test, 'bow')
model_tfidf = simple_logistic_classify(X_tr_tfidf, y_train, X_te_tfidf, y_test, 'tf-idf')

Test Score with bow features 0.9236842105263158
Test Score with tf-idf features 0.9244736842105263


In [None]:
# get predictions
bow_pred = model_bow.predict(X_te_bow)
tfidf_pred = model_tfidf.predict(X_te_tfidf)

In [None]:
# BoW classification report
print(classification_report(y_test, bow_pred))

              precision    recall  f1-score   support

           0       0.93      0.92      0.93      1900
           1       0.96      0.98      0.97      1900
           2       0.90      0.89      0.90      1900
           3       0.90      0.90      0.90      1900

    accuracy                           0.92      7600
   macro avg       0.92      0.92      0.92      7600
weighted avg       0.92      0.92      0.92      7600



In [None]:
# Tf-idf classification report
print(classification_report(y_test, tfidf_pred))

              precision    recall  f1-score   support

           0       0.93      0.92      0.93      1900
           1       0.96      0.98      0.97      1900
           2       0.91      0.89      0.90      1900
           3       0.90      0.90      0.90      1900

    accuracy                           0.92      7600
   macro avg       0.92      0.92      0.92      7600
weighted avg       0.92      0.92      0.92      7600



## Binary Logistic Regression

In [None]:
data = pd.read_csv(train_file_path)
test = pd.read_csv(test_file_path)

In [None]:
# Keep only the "World" and "Business" news for the binary task
# by dropping every row whose class index is 2 or 4.
data = data[~data['Class Index'].isin([2, 4])]
test = test[~test['Class Index'].isin([2, 4])]

In [None]:
# combine news titles with their content
x_train = data['Title']+" "+data['Description']
x_test = test['Title']+" "+test['Description']

In [None]:
# clean the text
x_train = x_train.apply(clean_text)
x_test = x_test.apply(clean_text)

In [None]:
#set labels
y_train = data['Class Index']
y_test = test['Class Index']

In [None]:
# Bag-of-words vectoriser over the documents already tokenised by clean_text.
# - the identity tokenizer makes CountVectorizer use each token list as-is
# - ngram_range is a tuple: recent scikit-learn releases reject a list here
# - token_pattern=None because the pattern is unused once a tokenizer is supplied
bow_transform = CountVectorizer(tokenizer=lambda doc: doc, ngram_range=(1, 3), lowercase=False, token_pattern=None)

# Fit the vocabulary on the training texts only, then reuse it for the test texts.
X_tr_bow = bow_transform.fit_transform(x_train)
X_te_bow = bow_transform.transform(x_test)

In [None]:
# Tf-idf re-weighting of the BoW counts. `text` here is the
# sklearn.feature_extraction.text module imported at the top of the notebook.
# norm=None leaves the re-weighted rows unnormalised.
tfidf_transform = text.TfidfTransformer(norm=None)

# Fit on the training BoW matrix, then apply the fitted transformer to the test matrix.
X_tr_tfidf = tfidf_transform.fit_transform(X_tr_bow)
X_te_tfidf = tfidf_transform.transform(X_te_bow)

In [None]:
# Logistic-regression helper for the two-class task; LogisticRegression is left
# on its default multi-class handling here.
def simple_logistic_classify(X_tr, y_tr, X_test, y_test, description, _C=1.0):
    """Fit a LogisticRegression and print its score on the test split.

    Args:
        X_tr, y_tr: training features and labels.
        X_test, y_test: held-out features and labels used for the printed score.
        description: feature-set name inserted into the printed message.
        _C: value passed to LogisticRegression as `C`.

    Returns:
        The fitted LogisticRegression instance.
    """
    clf = LogisticRegression(C=_C)
    clf = clf.fit(X_tr, y_tr)
    test_score = clf.score(X_test, y_test)
    print('Test Score with', description, 'features', test_score)
    return clf

In [None]:
#fit the model to BoW and Tf-idf features
model_bow = simple_logistic_classify(X_tr_bow, y_train, X_te_bow, y_test, 'bow')
model_tfidf = simple_logistic_classify(X_tr_tfidf, y_train, X_te_tfidf, y_test, 'tf-idf')

Test Score with bow features 0.9623684210526315
Test Score with tf-idf features 0.963421052631579


In [None]:
# get predictions
bow_pred = model_bow.predict(X_te_bow)
tfidf_pred = model_tfidf.predict(X_te_tfidf)

In [None]:
# BoW classification report
print(classification_report(y_test, bow_pred))

              precision    recall  f1-score   support

           1       0.97      0.96      0.96      1900
           3       0.96      0.97      0.96      1900

    accuracy                           0.96      3800
   macro avg       0.96      0.96      0.96      3800
weighted avg       0.96      0.96      0.96      3800



In [None]:
# Tf-idf classification report
print(classification_report(y_test, tfidf_pred))

              precision    recall  f1-score   support

           1       0.97      0.96      0.96      1900
           3       0.96      0.97      0.96      1900

    accuracy                           0.96      3800
   macro avg       0.96      0.96      0.96      3800
weighted avg       0.96      0.96      0.96      3800



## GloVe + LSTM Model

In [None]:
import torch
import torchtext
from torchtext import data
from torchtext.legacy.data import Field, LabelField, TabularDataset, BucketIterator
import torch.nn as nn

In [None]:
data = pd.read_csv(train_file_path)
test = pd.read_csv(test_file_path)

In [None]:
# Stack the train and test frames into one frame with a fresh 0..n-1 index.
# DataFrame.append was removed in pandas 2.0; pd.concat is the supported equivalent.
df = pd.concat([data, test], ignore_index=True)

In [None]:
# Drop the rows whose class index is 2 or 4, leaving the two classes used for the binary task.
df = df[~df['Class Index'].isin([2, 4])]

In [None]:
# Merge title and description into a single text column and keep only text + label.
# .assign builds a new frame instead of writing a column into the filtered slice
# created above, which avoids pandas' SettingWithCopy ambiguity.
df = df.assign(Title=df["Title"] + " " + df["Description"])[["Title", "Class Index"]]

In [None]:
# save the dataframe
df.to_csv("/content/drive/MyDrive/Colab Notebooks/test_lstm.csv")

In [None]:
# Field definitions for the CSV file written above.
# TEXT: spaCy-tokenised, lower-cased sequences; include_lengths=True makes
# batch.text a (tokens, lengths) pair, which the training code unpacks.
TEXT = Field(tokenize="spacy",lower = True, sequential=True, batch_first=True,include_lengths=True)
# LABEL: class label exposed as a float tensor (the loss used later is nn.BCELoss).
LABEL = LabelField(dtype = torch.float,batch_first=True)
# One entry per CSV column, in order. The leading (None, None) presumably skips the
# unnamed index column that DataFrame.to_csv writes by default — TODO confirm.
fields = [(None, None),('text',TEXT), ('label', LABEL)]

In [None]:
#call data from directory with Tabular Dataset
data=TabularDataset(path = "/content/drive/MyDrive/Colab Notebooks/test_lstm.csv",format = 'csv',fields = fields,skip_header = True)

In [None]:
# a sequence with its label
vars(data[0])

{'label': '3',
 'text': ['wall',
  'st.',
  'bears',
  'claw',
  'back',
  'into',
  'the',
  'black',
  '(',
  'reuters',
  ')',
  'reuters',
  '-',
  'short',
  '-',
  'sellers',
  ',',
  'wall',
  'street',
  "'s",
  'dwindling\\band',
  'of',
  'ultra',
  '-',
  'cynics',
  ',',
  'are',
  'seeing',
  'green',
  'again',
  '.']}

In [None]:
# split data into training and validation sets (split_ratio=0.8)
# NOTE(review): no random_state is passed, so the split presumably differs
# between runs — confirm, and seed it if the reported numbers must be reproducible.
train_data, valid_data = data.split(split_ratio=0.8)

In [None]:
# build vocabulary from training set using 100 dimensional GloVe embeddings
TEXT.build_vocab(train_data,min_freq=3,vectors = "glove.6B.100d")  
LABEL.build_vocab(train_data)

.vector_cache/glove.6B.zip: 862MB [02:40, 5.37MB/s]                          
 99%|█████████▉| 397983/400000 [00:14<00:00, 27953.60it/s]

In [None]:
print(len(TEXT.vocab))
print("---------------", "\n")
print(len(LABEL.vocab))
print("---------------", "\n")
print(TEXT.vocab.freqs.most_common(10)) 
print("---------------", "\n")
print(TEXT.vocab.stoi) 
print("---------------", "\n")
print(LABEL.vocab.stoi)

In [None]:
# set device to "cuda" when a GPU is available, otherwise fall back to "cpu"
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')  

# number of examples per batch
BATCH_SIZE = 64

# Build the training / validation batch iterators.
# sort_key + sort_within_batch order each batch by text length; the model's forward
# pass feeds the lengths to pack_padded_sequence, which by default expects them sorted
# longest-first — NOTE(review): confirm the iterator sorts in that direction.
train_iterator, valid_iterator = BucketIterator.splits(
    (train_data, valid_data), 
    batch_size = BATCH_SIZE,
    sort_key = lambda x: len(x.text),
    sort_within_batch=True,
    device = device)

In [None]:
# define classifier

class classifier(nn.Module):
    """Embedding -> (bi)LSTM -> dense -> sigmoid text classifier.

    Args:
        vocab_size: number of rows of the embedding table.
        embedding_dim: size of each embedding vector (LSTM input size).
        hidden_dim: LSTM hidden size per direction.
        output_dim: number of output units of the dense layer.
        n_layers: number of stacked LSTM layers.
        bidirectional: whether the LSTM runs in both directions. The dense layer
            width and the hidden-state selection in forward() follow this flag.
        dropout: dropout between stacked LSTM layers.
    """

    #define all the layers used in model
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, n_layers, 
                 bidirectional, dropout):

        #Constructor
        super().__init__()

        # forward() needs to know how many directions the LSTM produces
        self.bidirectional = bidirectional
        num_directions = 2 if bidirectional else 1

        #embedding layer
        self.embedding = nn.Embedding(vocab_size, embedding_dim)

        #lstm layer
        # inter-layer dropout has no effect on a single-layer LSTM (PyTorch only warns),
        # so it is switched off in that case
        self.lstm = nn.LSTM(embedding_dim,
                            hidden_dim,
                            num_layers=n_layers,
                            bidirectional=bidirectional,
                            dropout=dropout if n_layers > 1 else 0.0,
                            batch_first=True)

        #dense layer: one hidden_dim-sized final state per direction
        self.fc = nn.Linear(hidden_dim * num_directions, output_dim)

        #activation function
        self.act = nn.Sigmoid()

    def forward(self, text, text_lengths):
        """Return sigmoid outputs of shape (batch, output_dim).

        Args:
            text: padded batch of token indices (batch first).
            text_lengths: true length of every sequence in `text`; passed to
                pack_padded_sequence with its default settings, so the batch is
                expected to be ordered longest-first.
        """
        # pass sequences through embedding layer
        embedded = self.embedding(text)

        # pack so the LSTM skips the padding positions (lengths must live on the CPU)
        packed_embedded = nn.utils.rnn.pack_padded_sequence(embedded, text_lengths.cpu(), batch_first=True)

        #pass sequences through LSTM
        packed_output, (hidden, cell) = self.lstm(packed_embedded)

        if self.bidirectional:
            #concat the final forward and backward hidden state of the last layer
            hidden = torch.cat((hidden[-2,:,:], hidden[-1,:,:]), dim = 1)
        else:
            #only one direction: the last entry is the final state of the last layer
            hidden = hidden[-1,:,:]

        #pass hidden state through fully connected layer
        dense_outputs = self.fc(hidden)

        #Final activation function
        return self.act(dense_outputs)

In [None]:
#define hyperparameters
size_of_vocab = len(TEXT.vocab)
embedding_dim = 100      # must match the 100-d GloVe vectors copied into the embedding layer
num_hidden_nodes = 32    # LSTM hidden size per direction
num_output_nodes = 1     # single sigmoid output for the binary task
num_layers = 2
bidirection = True
dropout = 0.2

#instantiate the model
# pass the `bidirection` setting defined above instead of a second hard-coded True,
# so that changing the hyperparameter actually changes the model
model = classifier(size_of_vocab, embedding_dim, num_hidden_nodes,num_output_nodes, num_layers, 
                   bidirectional = bidirection, dropout = dropout)

In [None]:
print(model)

# Number of trainable parameters
def count_parameters(model):
    """Return the total element count of all parameters of `model` that require gradients."""
    total = 0
    for param in model.parameters():
        if param.requires_grad:
            total += param.numel()
    return total
    
print(f'The model has {count_parameters(model):,} trainable parameters')

#Initialize the pretrained embedding
pretrained_embeddings = TEXT.vocab.vectors
model.embedding.weight.data.copy_(pretrained_embeddings)

print(pretrained_embeddings.shape)

classifier(
  (embedding): Embedding(22393, 100)
  (lstm): LSTM(100, 32, num_layers=2, batch_first=True, dropout=0.2, bidirectional=True)
  (fc): Linear(in_features=64, out_features=1, bias=True)
  (act): Sigmoid()
)
The model has 2,298,757 trainable parameters
torch.Size([22393, 100])


In [None]:
!pip install transformers

In [None]:
# AdamW from PyTorch itself: the transformers.AdamW class used here before is
# deprecated upstream in favour of torch.optim.AdamW.
# eps=1e-6 and weight_decay=0.0 reproduce the defaults of the transformers implementation.
optimizer = torch.optim.AdamW(model.parameters(), lr = 1e-4, eps = 1e-6, weight_decay = 0.0)

# binary cross-entropy on the sigmoid outputs of the model
criterion = nn.BCELoss()

# accuracy metric for sigmoid outputs
def binary_accuracy(preds, y):
    """Return the fraction of predictions that equal `y` after rounding.

    Args:
        preds: tensor of model outputs; rounded with torch.round before comparison.
        y: tensor of target labels.

    Returns:
        Tensor holding the number of matches divided by len() of the match tensor.
    """
    hits = torch.eq(torch.round(preds), y).float()
    return hits.sum() / len(hits)
    
#push to cuda if available
model = model.to(device)
criterion = criterion.to(device)

In [None]:
def train(model, iterator, optimizer, criterion):
    """Run one training epoch.

    Args:
        model: module called as model(text, text_lengths).
        iterator: sized iterable of batches exposing `.text` (a (tokens, lengths)
            pair) and `.label`.
        optimizer: optimizer stepping the model's parameters.
        criterion: loss called as criterion(predictions, labels).

    Returns:
        (loss, accuracy): per-batch loss and accuracy, each averaged over the
        number of batches (not weighted by batch size).

    Raises:
        ZeroDivisionError: if `iterator` is empty.
    """
    #initialize every epoch 
    epoch_loss = 0
    epoch_acc = 0

    #set the model in training phase
    model.train()

    for batch in iterator:

        #resets the gradients after every batch
        optimizer.zero_grad()

        #retrieve text and no. of words
        text, text_lengths = batch.text

        #convert to 1D tensor
        # squeeze only the output dimension: a bare .squeeze() would also drop the
        # batch dimension when a batch holds a single example, and the loss would
        # then see mismatched prediction / label shapes
        predictions = model(text, text_lengths).squeeze(1)

        #compute the loss
        loss = criterion(predictions, batch.label)

        #compute the binary accuracy
        acc = binary_accuracy(predictions, batch.label)

        #backpropagate the loss and compute the gradients
        loss.backward()

        #update the weights
        optimizer.step()

        #loss and accuracy
        epoch_loss += loss.item()
        epoch_acc += acc.item()

    return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [None]:
def evaluate(model, iterator, criterion):
    """Evaluate the model on every batch of `iterator` without updating it.

    Args:
        model: module called as model(text, text_lengths).
        iterator: sized iterable of batches exposing `.text` (a (tokens, lengths)
            pair) and `.label`.
        criterion: loss called as criterion(predictions, labels).

    Returns:
        (loss, accuracy): per-batch loss and accuracy, each averaged over the
        number of batches (not weighted by batch size).

    Raises:
        ZeroDivisionError: if `iterator` is empty.
    """
    #initialize every epoch
    epoch_loss = 0
    epoch_acc = 0

    #deactivating dropout layers
    model.eval()

    #deactivates autograd
    with torch.no_grad():

        for batch in iterator:

            #retrieve text and no. of words
            text, text_lengths = batch.text

            #convert to 1d tensor
            # squeeze only the output dimension: a bare .squeeze() would also drop
            # the batch dimension when a batch holds a single example
            predictions = model(text, text_lengths).squeeze(1)

            #compute loss and accuracy
            loss = criterion(predictions, batch.label)
            acc = binary_accuracy(predictions, batch.label)

            #keep track of loss and accuracy
            epoch_loss += loss.item()
            epoch_acc += acc.item()

    return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [None]:
# Train for a fixed number of epochs, checkpointing the weights whenever the
# validation loss improves.
N_EPOCHS = 15
best_valid_loss = float('inf')
best_valid_accuracy = 0

for epoch in range(N_EPOCHS):
     
    #train the model
    train_loss, train_acc = train(model, train_iterator, optimizer, criterion)
    
    #evaluate the model
    valid_loss, valid_acc = evaluate(model, valid_iterator, criterion)
    
    #save the best model (selected by lowest validation loss)
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'saved_weights.pt')
    
    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc*100:.2f}%')

    # best accuracy is tracked independently of the checkpoint above, so it may
    # come from a different epoch than the saved weights
    if valid_acc > best_valid_accuracy:
      best_valid_accuracy = valid_acc 

In [None]:
print(best_valid_accuracy)

0.9614322918653488


## BERT

In [None]:
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
import transformers
from transformers import AutoModel, BertTokenizerFast

pd.set_option('display.max_colwidth', 200)
# use the GPU when one is available, otherwise fall back to the CPU
# (same selection rule as the device set up for the LSTM model)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
# encode labels
le = preprocessing.LabelEncoder()
"""df is the same dataframe as in LSTM, which is 2-class data.
Model can be adapted by changing the dimension of output layer)"""
le.fit(df['Class Index'])

LabelEncoder()

In [None]:
y = le.transform(df['Class Index'])

In [None]:
# split data
train_text, val_text, train_labels, val_labels = train_test_split(df['Title'], y, 
                                                                    random_state= 42, 
                                                                    test_size=0.2, 
                                                                    stratify=y)

In [None]:
# download tokenizer
tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=231508.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=466062.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=28.0, style=ProgressStyle(description_w…




In [None]:
# get maximum length for max_len parameter
# NOTE(review): this counts whitespace-separated words, not tokenizer tokens.
# The tokenised sequences passed to max_length below may be longer, in which
# case truncation=True will cut them — confirm this is intended.
seq_len = [len(i.split()) for i in train_text]
maxlen = max(seq_len)
print(maxlen)

145


In [None]:
# tokenize and encode sequences in the training set
tokens_train = tokenizer.batch_encode_plus(
    train_text.tolist(),
    max_length = maxlen,
    padding='max_length',
    truncation=True
)

# tokenize and encode sequences in the validation set
tokens_val = tokenizer.batch_encode_plus(
    val_text.tolist(),
    max_length = maxlen,
    padding='max_length',
    truncation=True
)

In [None]:
# convert lists to tensors

train_seq = torch.tensor(tokens_train['input_ids'])
train_mask = torch.tensor(tokens_train['attention_mask'])
train_y = torch.tensor(train_labels.tolist())

val_seq = torch.tensor(tokens_val['input_ids'])
val_mask = torch.tensor(tokens_val['attention_mask'])
val_y = torch.tensor(val_labels.tolist())


In [None]:
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler

#define a batch size
batch_size = 32

# wrap tensors
train_data = TensorDataset(train_seq, train_mask, train_y)

# sampler for sampling the data during training
train_sampler = RandomSampler(train_data)

# dataLoader for train set
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

# wrap tensors
val_data = TensorDataset(val_seq, val_mask, val_y)

# sampler for sampling the data during training
val_sampler = SequentialSampler(val_data)

# dataLoader for validation set
val_dataloader = DataLoader(val_data, sampler = val_sampler, batch_size=batch_size)

In [None]:
# download the pre-trained model
bert = AutoModel.from_pretrained('bert-base-uncased')

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=570.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=440473133.0, style=ProgressStyle(descri…




Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [None]:
# freeze every parameter of the pre-trained encoder so no gradients are computed
# for it; the dense layers added on top of it below stay trainable
for param in bert.parameters():
    param.requires_grad = False

In [None]:
class BERT_Classification(nn.Module):
    """Small classification head on top of a BERT encoder.

    Args:
        bert: encoder module. It is called as
            bert(sent_id, attention_mask=mask, return_dict=False) and must return
            a 2-tuple whose second item is fed to the dense layers.
        n_classes: number of output classes (default 2, the previous fixed value).
    """

    def __init__(self, bert, n_classes=2):

        super(BERT_Classification, self).__init__()

        self.bert = bert

        # width of the encoder output: taken from the encoder's config when it has
        # one, otherwise 768 (the value previously hard-coded here)
        encoder_dim = getattr(getattr(bert, "config", None), "hidden_size", 768)

        # dropout layer
        self.dropout = nn.Dropout(0.1)

        # relu activation function
        self.relu = nn.ReLU()

        # dense layer 1
        self.fc1 = nn.Linear(encoder_dim, 512)

        # dense layer 2 (Output layer)
        self.fc2 = nn.Linear(512, n_classes)

        # log-softmax over the class dimension (pairs with nn.NLLLoss)
        self.softmax = nn.LogSoftmax(dim=1)

    #define the forward pass
    def forward(self, sent_id, mask):
        """Return log-probabilities of shape (batch, n_classes).

        Args:
            sent_id: batch of input token ids.
            mask: attention mask matching `sent_id`.
        """
        #pass the inputs to the model; only the second returned item is used
        _, cls_hs = self.bert(sent_id, attention_mask=mask, return_dict=False)

        x = self.fc1(cls_hs)

        x = self.relu(x)

        x = self.dropout(x)

        # output layer
        x = self.fc2(x)

        # apply log-softmax activation
        x = self.softmax(x)

        return x

In [None]:
# pass the pre-trained BERT to our define architecture
model = BERT_Classification(bert)

# push the model to GPU
model = model.to(device)

In [None]:
# AdamW from PyTorch itself: the transformers.AdamW class used here before is
# deprecated upstream in favour of torch.optim.AdamW.
# eps=1e-6 and weight_decay=0.0 reproduce the defaults of the transformers implementation.
optimizer = torch.optim.AdamW(model.parameters(), lr = 1e-4, eps = 1e-6, weight_decay = 0.0)

# negative log-likelihood loss; the model's final layer is a LogSoftmax
cross_entropy  = nn.NLLLoss() 

# number of training epochs
epochs = 5

In [None]:
# function to train the model
def train():
    """Train the notebook-level `model` for one pass over `train_dataloader`.

    Relies on the globals `model`, `train_dataloader`, `device`,
    `cross_entropy` and `optimizer`.

    Returns:
        (avg_loss, total_preds): the mean of the per-batch losses, and a NumPy
        array with the model outputs of every batch concatenated along axis 0.
    """
    model.train()

    running_loss = 0
    batch_outputs = []

    for step, batch in enumerate(train_dataloader):

        # progress update after every 300 batches (but not for the very first one)
        if step != 0 and step % 300 == 0:
            print('  Batch {:>5,}  of  {:>5,}.'.format(step, len(train_dataloader)))

        # move the three batch tensors to the target device
        sent_id, mask, labels = (tensor.to(device) for tensor in batch)

        # start from clean gradients
        model.zero_grad()

        # forward pass and loss for this batch
        preds = model(sent_id, mask)
        loss = cross_entropy(preds, labels)
        running_loss += loss.item()

        # backward pass, with the gradient norm clipped to 1.0 before the update
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()

        # keep a detached CPU copy of the outputs
        batch_outputs.append(preds.detach().cpu().numpy())

    # mean loss over the batches of this epoch
    avg_loss = running_loss / len(train_dataloader)

    # stack the per-batch outputs into one (samples, classes) array
    total_preds = np.concatenate(batch_outputs, axis=0)

    return avg_loss, total_preds

In [None]:
# function for evaluating the model
def evaluate():
    """Score the notebook-level `model` on `val_dataloader` without updating it.

    Relies on the globals `model`, `val_dataloader`, `device` and `cross_entropy`.

    Returns:
        (avg_loss, total_preds): the mean of the per-batch losses, and a NumPy
        array with the model outputs of every batch concatenated along axis 0.
    """
    print("\nEvaluating...")

    # switch off dropout
    model.eval()

    loss_sum = 0
    collected = []

    # no gradients are needed anywhere in the evaluation pass
    with torch.no_grad():
        for step, batch in enumerate(val_dataloader):

            # progress update every 300 batches (but not for the very first one)
            if step != 0 and step % 300 == 0:
                print('  Batch {:>5,}  of  {:>5,}.'.format(step, len(val_dataloader)))

            # move the three batch tensors to the target device
            sent_id, mask, labels = (tensor.to(device) for tensor in batch)

            # forward pass and loss for this batch
            preds = model(sent_id, mask)
            loss_sum += cross_entropy(preds, labels).item()

            collected.append(preds.detach().cpu().numpy())

    # mean loss over the validation batches
    avg_loss = loss_sum / len(val_dataloader)

    # stack the per-batch outputs into one (samples, classes) array
    total_preds = np.concatenate(collected, axis=0)

    return avg_loss, total_preds

In [None]:
# start from an infinite best loss so the first epoch always checkpoints
best_valid_loss = float('inf')

# per-epoch training and validation losses, appended in epoch order
train_losses=[]
valid_losses=[]

#for each epoch
for epoch in range(epochs):
     
    print('\n Epoch {:} / {:}'.format(epoch + 1, epochs))
    
    #train model (the returned predictions are not needed here)
    train_loss, _ = train()
    
    #evaluate model (the returned predictions are not needed here)
    valid_loss, _ = evaluate()
    
    #save the best model (selected by lowest validation loss)
    # NOTE: 'saved_weights.pt' is the same file name the LSTM training loop
    # writes to, so this overwrites that checkpoint
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'saved_weights.pt')
    
    # append training and validation loss
    train_losses.append(train_loss)
    valid_losses.append(valid_loss)
    
    print(f'\nTraining Loss: {train_loss:.3f}')
    print(f'Validation Loss: {valid_loss:.3f}')


 Epoch 1 / 5
  Batch   300  of  1,595.
  Batch   600  of  1,595.
  Batch   900  of  1,595.
  Batch 1,200  of  1,595.
  Batch 1,500  of  1,595.

Evaluating...
  Batch   300  of    399.

Training Loss: 0.318
Validation Loss: 0.224

 Epoch 2 / 5
  Batch   300  of  1,595.
  Batch   600  of  1,595.
  Batch   900  of  1,595.
  Batch 1,200  of  1,595.
  Batch 1,500  of  1,595.

Evaluating...
  Batch   300  of    399.

Training Loss: 0.226
Validation Loss: 0.196

 Epoch 3 / 5
  Batch   300  of  1,595.
  Batch   600  of  1,595.
  Batch   900  of  1,595.
  Batch 1,200  of  1,595.
  Batch 1,500  of  1,595.

Evaluating...
  Batch   300  of    399.

Training Loss: 0.215
Validation Loss: 0.187

 Epoch 4 / 5
  Batch   300  of  1,595.
  Batch   600  of  1,595.
  Batch   900  of  1,595.
  Batch 1,200  of  1,595.
  Batch 1,500  of  1,595.

Evaluating...
  Batch   300  of    399.

Training Loss: 0.209
Validation Loss: 0.199

 Epoch 5 / 5
  Batch   300  of  1,595.
  Batch   600  of  1,595.
  Batch   900 

In [None]:
# load weights of best saved model
path = 'saved_weights.pt'
model.load_state_dict(torch.load(path))

<All keys matched successfully>

In [None]:
# get validation predictions
valid_loss, preds = evaluate()
preds = np.argmax(preds, axis = 1)


Evaluating...
  Batch   300  of    399.


In [None]:
# classification report
print(classification_report(val_y, preds))

              precision    recall  f1-score   support

           0       0.95      0.92      0.93      6380
           1       0.92      0.95      0.94      6380

    accuracy                           0.94     12760
   macro avg       0.94      0.94      0.94     12760
weighted avg       0.94      0.94      0.94     12760

