Nicole Joseph

Deep Learning

HW 5

In [None]:
# Import necessary libraries
import csv
!pip install datasets
import datasets
from datasets.tasks import TextClassification

!pip install transformers
import transformers 
from transformers import BertModel, BertTokenizer, AdamW, get_linear_schedule_with_warmup

import torch 
from torch import nn, optim 
from torch.utils.data import Dataset, DataLoader
import torch.nn.functional as F

import numpy as np 
import pandas as pd 
import seaborn as sns 
from pylab import rcParams 
import matplotlib.pyplot as plt 
from matplotlib import rc

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report 
from collections import defaultdict
from textwrap import wrap 

# Run on the first CUDA GPU when PyTorch can see one; otherwise fall back to the CPU.
# citation:  https://stackoverflow.com/questions/50954479/using-cuda-with-pytorch
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")

In [2]:
# dataset: https://huggingface.co/datasets/ag_news/blob/main/ag_news.py
#from datasets import load_dataset
#train, test = load_dataset('ag_news', split=['train', 'test'])
#train[0]['text']
#train[0]['label']

In [3]:
# Mount Google Drive at /content/drive (with force_remount=True) so that the CSV files
# read from 'drive/MyDrive/...' further down are reachable from this runtime.
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [4]:
# Load the AG News splits from the mounted Drive folder (the paths are relative to the
# current working directory). Besides the class index and the article title, each row
# also carries a description that goes with the title.
test = pd.read_csv('drive/MyDrive/deep-learning/hw5/test.csv')
train = pd.read_csv('drive/MyDrive/deep-learning/hw5/train.csv')

# Preview the first rows of the training split (use test.head() to preview the test split).
train.head()

Unnamed: 0,Class Index,Title,Description
0,3,Wall St. Bears Claw Back Into the Black (Reuters),"Reuters - Short-sellers, Wall Street's dwindli..."
1,3,Carlyle Looks Toward Commercial Aerospace (Reu...,Reuters - Private investment firm Carlyle Grou...
2,3,Oil and Economy Cloud Stocks' Outlook (Reuters),Reuters - Soaring crude prices plus worries\ab...
3,3,Iraq Halts Oil Exports from Main Southern Pipe...,Reuters - Authorities have halted oil export\f...
4,3,"Oil prices soar to all-time record, posing new...","AFP - Tearaway world oil prices, toppling reco..."


In [5]:
# Sanity check: print the (rows, columns) shape of the train and test frames.
# expected output: (120000, 3) (7600, 3)
print(*(frame.shape for frame in (train, test)))

(120000, 3) (7600, 3)


In [6]:
#train['Class Index'] = train['Class Index'].replace(['1','2', '3', '4'], ['0','1', '2', '3'])
#test['Class Index'] = test['Class Index'].replace(['1','2', '3', '4'], ['0','1', '2', '3'])
# train.to_csv("train.csv", index=False)

In [7]:
# Shift the 1-based class indices in train.csv / test.csv down by one so the labels become
# World (0), Sports (1), Business (2), Sci/Tech (3).
# For multiclass classification the labels should be integers starting from 0.

# citation: https://www.geeksforgeeks.org/applying-lambda-functions-to-pandas-dataframe/
labeling = {original: original - 1 for original in range(1, 5)}

In [8]:
# Replace every 1-based class index with its 0-based label from `labeling`.
# A value that is not a key of `labeling` raises KeyError (this includes re-running the
# cell after the labels have already been shifted).
train['Class Index'] = train['Class Index'].apply(labeling.__getitem__)
test['Class Index'] = test['Class Index'].apply(labeling.__getitem__)

In [9]:
# Preview the first five rows to confirm the class indices are now zero-based.
train.head(n=5)

Unnamed: 0,Class Index,Title,Description
0,2,Wall St. Bears Claw Back Into the Black (Reuters),"Reuters - Short-sellers, Wall Street's dwindli..."
1,2,Carlyle Looks Toward Commercial Aerospace (Reu...,Reuters - Private investment firm Carlyle Grou...
2,2,Oil and Economy Cloud Stocks' Outlook (Reuters),Reuters - Soaring crude prices plus worries\ab...
3,2,Iraq Halts Oil Exports from Main Southern Pipe...,Reuters - Authorities have halted oil export\f...
4,2,"Oil prices soar to all-time record, posing new...","AFP - Tearaway world oil prices, toppling reco..."


In [10]:
# Implement BERT (for tokenizing, and as a pre-trained transformer model) with PyTorch.
# The checkpoint name chosen here is used for the tokenizer below and again when the
# classifier loads its BertModel.

# Sentiment Analysis with BERT using huggingface, PyTorch and Python Tutorial: https://www.youtube.com/watch?v=Osj0Z6rwJB4
# citation: https://curiousily.com/posts/sentiment-analysis-with-bert-and-hugging-face-using-pytorch-and-python/
# citation: https://towardsdatascience.com/multi-class-text-classification-with-deep-learning-using-bert-b59ca2f5c613 

pre_trained_model_name = 'bert-base-cased' # cased version empirically works better than uncased according to source above
tokenizer = BertTokenizer.from_pretrained(pre_trained_model_name)

Downloading:   0%|          | 0.00/213k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [11]:
# 'content' is the text that gets tokenized and encoded for training: the article title
# followed by a single space and the article description.
# pop() hands back each source column and removes it from the frame in the same step,
# so only 'Class Index' and 'content' remain afterwards.
train['content'] = train.pop('Title') + ' ' + train.pop('Description')

In [12]:
# Tokenize the training text to see how long each article is in BERT tokens.
# TOKENIZATION : BERT Tokenizer is based on WordPiece
# sentence --> tokens --> token IDs

# Sequence length passed to the data loaders as the padding / truncation length.
MAX_LEN = 100

# citation https://www.analyticsvidhya.com/blog/2021/09/an-explanatory-guide-to-bert-tokenizer/
# tokenizer.encode returns the indices of the input tokens; texts longer than 512 tokens
# are truncated. token_lens[i] is the encoded length of the i-th training article.
# The comprehension keeps the per-article temporaries out of the notebook namespace.
# note: 1 min 40 sec to execute this block
token_lens = [
    len(tokenizer.encode(txt, max_length=512, truncation=True))
    for txt in train.content
]

In [13]:
# citation: https://towardsdatascience.com/multi-class-text-classification-with-deep-learning-using-bert-b59ca2f5c613
# Sentiment Analysis with BERT using huggingface, PyTorch and Python Tutorial: https://www.youtube.com/watch?v=Osj0Z6rwJB4 

class AgNewsData(Dataset):
    """Map-style dataset that tokenizes one article (title + description) per item.

    Args:
        content: Indexable sequence of article texts; each item is converted with ``str``.
        targets: Indexable sequence of integer class labels, aligned with ``content``.
        tokenizer: Callable tokenizer (e.g. a Hugging Face ``BertTokenizer``) that accepts
            the keyword arguments used in ``__getitem__`` and returns a mapping with
            ``'input_ids'`` and ``'attention_mask'`` tensors.
        max_len: Length every encoded sequence is padded / truncated to.

    Raises:
        ValueError: If ``content`` and ``targets`` have different lengths.
    """

    def __init__(self, content, targets, tokenizer, max_len):
        # Fail early instead of hitting an IndexError in the middle of an epoch.
        if len(content) != len(targets):
            raise ValueError(
                f"content and targets must have the same length, "
                f"got {len(content)} and {len(targets)}"
            )
        self.content = content
        self.targets = targets  # this is the target class
        self.tokenizer = tokenizer
        self.max_len = max_len  # max_len for padding the data

    def __len__(self):
        """Return the number of articles."""
        return len(self.content)

    def __getitem__(self, item):
        """Encode the article at position ``item``.

        Returns:
            dict with the raw text (``'content_text'``), the flattened ``'input_ids'`` and
            ``'attention_mask'`` tensors, and the label as a ``torch.long`` tensor
            (``'targets'``).
        """
        content = str(self.content[item])  # ensures it's a string (Title + Description)
        target = self.targets[item]

        # Call the tokenizer directly: ``encode_plus`` is documented as deprecated in
        # favor of ``__call__``, which accepts the same keyword arguments.
        encoding = self.tokenizer(
            content,
            max_length=self.max_len,
            # Special tokens include: SEP, CLS, PAD, UNK
            # Adding CLS and SEP tokens distinguishes the beginning and the end of a sentence
            add_special_tokens=True,
            return_token_type_ids=False,
            padding="max_length",
            truncation=True,
            return_attention_mask=True,  # also return the attention mask for the padded input
            return_tensors='pt',
        )

        # flatten() turns the tokenizer's tensors into 1-D tensors so the DataLoader can
        # stack them into a batch.
        return {
            'content_text': content,
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'targets': torch.tensor(target, dtype=torch.long),
        }

In [14]:
# Seed PyTorch and NumPy with one shared value, and reuse it for the split below.
# 42 is a popular choice
RANDOM_SEED = 42
torch.manual_seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)

# Hold out 10% of the training rows as the validation set.
df_train, df_val = train_test_split(
    train,
    random_state=RANDOM_SEED,
    test_size=0.1,
)
#print(df_train.shape, test.shape, df_val.shape)

In [15]:
# Prepare the test data exactly like the training data: 'content' is the title, a single
# space, then the description. pop() returns each source column and drops it from the frame.
test['content'] = test.pop('Title') + ' ' + test.pop('Description')

In [16]:
# Rebind the prepared test frame to df_test (matching df_train / df_val) and drop the old name.
# NOTE(review): re-running this cell raises NameError because `test` no longer exists.
df_test = test
del test

In [None]:
#df_test.head()

In [18]:
# citation: https://towardsdatascience.com/multi-class-text-classification-with-deep-learning-using-bert-b59ca2f5c613
# citation: https://pytorch.org/tutorials/beginner/basics/data_tutorial.html  

def create_data_loader(df, tokenizer, max_len, batch_size, shuffle=False, num_workers=2):
    """Wrap a DataFrame in an ``AgNewsData`` dataset and return a ``DataLoader`` over it.

    Args:
        df: DataFrame with a ``content`` column (article text) and a ``'Class Index'``
            column (integer label).
        tokenizer: Tokenizer handed to ``AgNewsData`` to encode each article.
        max_len: Padding / truncation length handed to ``AgNewsData``.
        batch_size: Number of examples per batch.
        shuffle: Whether the loader reshuffles the examples every epoch. Defaults to
            ``False``, which keeps the rows in DataFrame order; pass ``True`` for a
            training loader that runs for more than one epoch.
        num_workers: Number of worker processes used for loading. Defaults to 2.

    Returns:
        A ``DataLoader`` whose batches are dicts with the keys produced by
        ``AgNewsData.__getitem__``.
    """
    # to_numpy() drops the pandas index so the dataset is indexed by position 0..len-1,
    # which matters for frames returned by train_test_split (their index is not contiguous).
    ds = AgNewsData(
        content=df.content.to_numpy(),
        targets=df['Class Index'].to_numpy(),
        tokenizer=tokenizer,
        max_len=max_len,
    )

    # DataLoader wraps an iterable around the dataset
    return DataLoader(
        ds,
        batch_size=batch_size,
        shuffle=shuffle,
        num_workers=num_workers,
    )

In [19]:
# Examples per batch, shared by all three loaders.
BATCH_SIZE = 16

# Build the train / validation / test loaders (in that order) with identical settings.
train_data_loader, val_data_loader, test_data_loader = (
    create_data_loader(frame, tokenizer, MAX_LEN, BATCH_SIZE)
    for frame in (df_train, df_val, df_test)
)

In [20]:
# Implement a classifier class with a super constructor and a forward pass method
# Sentiment Analysis with BERT using huggingface, PyTorch and Python Tutorial: https://www.youtube.com/watch?v=Osj0Z6rwJB4
# citation: https://curiousily.com/posts/sentiment-analysis-with-bert-and-hugging-face-using-pytorch-and-python/

# One entry per class; only the number of entries is used to size the output layer.
class_names = ['1', '2', '3', '4']


class SentimentClassifier(nn.Module):
    """Pre-trained BERT encoder followed by dropout and a linear layer over the classes."""

    def __init__(self, n_classes):
        """Load the pre-trained BERT weights and create a head with ``n_classes`` outputs."""
        super().__init__()
        self.bert = BertModel.from_pretrained(pre_trained_model_name)
        self.drop = nn.Dropout(p=0.45)
        self.out = nn.Linear(self.bert.config.hidden_size, n_classes)

    def forward(self, input_ids, attention_mask):
        """Return the raw (un-normalized) class scores for a batch of encoded texts."""
        bert_outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)

        # Apply dropout to BERT's pooled output, then project it onto the classes.
        # The scores stay raw because they feed the cross-entropy loss function.
        pooled = self.drop(bert_outputs.pooler_output)
        return self.out(pooled)

In [21]:
# Build the classifier with one output per class name and move it to the selected device.
model = SentimentClassifier(n_classes=len(class_names)).to(device)

Downloading:   0%|          | 0.00/436M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [22]:
# TRAINING
# citation: https://towardsdatascience.com/multi-class-text-classification-with-deep-learning-using-bert-b59ca2f5c613

# Recommended learning rate = 2e-5 and number of epochs = 2,3,4 (I chose 1 for the sake of time)
# citation: https://curiousily.com/posts/sentiment-analysis-with-bert-and-hugging-face-using-pytorch-and-python/
EPOCHS = 1

# Cross-entropy over the raw class scores returned by the model.
loss_fn = nn.CrossEntropyLoss().to(device)

optimizer = AdamW(model.parameters(), lr=2e-5, correct_bias=False)

# The scheduler is stepped once per batch, so it needs the total number of batches.
total_steps = EPOCHS * len(train_data_loader)

# Creates a schedule with a learning rate that decreases linearly from the initial learning rate set in the
# optimizer to 0, after a warmup period during which it increases linearly from 0 to the initial learning rate
# set in the optimizer (no warmup steps are used here).
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_training_steps=total_steps,
    num_warmup_steps=0,
)



In [23]:
# Helper function for training the model for one epoch
# Sentiment Analysis with BERT using huggingface, PyTorch and Python Tutorial: https://www.youtube.com/watch?v=Osj0Z6rwJB4
# citation: https://curiousily.com/posts/sentiment-analysis-with-bert-and-hugging-face-using-pytorch-and-python/

def train_epoch(
    model,
    data_loader,
    loss_fn,
    optimizer,
    device,
    scheduler,
    n_examples
):
    """Train ``model`` for one pass over ``data_loader``.

    Args:
        model: Module called as ``model(input_ids=..., attention_mask=...)`` that returns
            one row of class scores per example.
        data_loader: Iterable of batches; each batch is a mapping with the keys
            ``'input_ids'``, ``'attention_mask'`` and ``'targets'``.
        loss_fn: Callable ``loss_fn(outputs, targets)`` returning a scalar loss tensor.
        optimizer: Optimizer stepped once per batch.
        device: Device the batch tensors are moved to before the forward pass.
        scheduler: Learning-rate scheduler stepped once per batch, right after the optimizer.
        n_examples: Denominator used to turn the number of correct predictions into an
            accuracy.

    Returns:
        Tuple ``(accuracy, mean_loss)``. ``accuracy`` is a 0-dim float64 tensor
        (correct predictions / ``n_examples``); ``mean_loss`` is the mean of the per-batch
        losses, or NaN when ``data_loader`` yields no batches.
    """
    model = model.train()
    losses = []
    # Start the running count as a tensor so that an empty data_loader still returns a
    # tensor instead of failing on ``int.double()``.
    correct_predictions = torch.zeros((), dtype=torch.long, device=device)

    for d in data_loader:
        # Move input_ids, attention_mask and targets of the current batch to the device
        input_ids = d['input_ids'].to(device)
        attention_mask = d['attention_mask'].to(device)
        targets = d['targets'].to(device)
        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask
        )

        # index of the highest score in each row is the predicted class
        _, preds = torch.max(outputs, dim=1)
        # compare the raw scores with the true labels
        loss = loss_fn(outputs, targets)
        correct_predictions += torch.sum(preds == targets)
        losses.append(loss.item())

        # back propagation steps
        loss.backward()
        # avoid exploding gradients with clip_grad_norm
        nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()
        scheduler.step()
        # clear the gradients so they do not accumulate into the next batch
        optimizer.zero_grad()

    mean_loss = np.mean(losses) if losses else float('nan')
    return correct_predictions.double() / n_examples, mean_loss

In [24]:
# Sentiment Analysis with BERT using huggingface, PyTorch and Python Tutorial: https://www.youtube.com/watch?v=Osj0Z6rwJB4
# citation: https://curiousily.com/posts/sentiment-analysis-with-bert-and-hugging-face-using-pytorch-and-python/

def eval_model(model, data_loader, loss_fn, device, n_examples):
    """Evaluate ``model`` on ``data_loader`` without updating any weights.

    Args:
        model: Module called as ``model(input_ids=..., attention_mask=...)`` that returns
            one row of class scores per example.
        data_loader: Iterable of batches; each batch is a mapping with the keys
            ``'input_ids'``, ``'attention_mask'`` and ``'targets'``.
        loss_fn: Callable ``loss_fn(outputs, targets)`` returning a scalar loss tensor.
        device: Device the batch tensors are moved to before the forward pass.
        n_examples: Denominator used to turn the number of correct predictions into an
            accuracy.

    Returns:
        Tuple ``(accuracy, mean_loss)``. ``accuracy`` is a 0-dim float64 tensor
        (correct predictions / ``n_examples``); ``mean_loss`` is the mean of the per-batch
        losses, or NaN when ``data_loader`` yields no batches.
    """
    model = model.eval()
    losses = []
    # Start the running count as a tensor so that an empty data_loader still returns a
    # tensor instead of failing on ``int.double()``.
    correct_predictions = torch.zeros((), dtype=torch.long, device=device)

    # no gradients are needed for evaluation; disabling them makes torch faster
    with torch.no_grad():
        for d in data_loader:
            input_ids = d['input_ids'].to(device)
            attention_mask = d['attention_mask'].to(device)
            targets = d['targets'].to(device)
            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask
            )

            # index of the highest score in each row is the predicted class
            _, preds = torch.max(outputs, dim=1)
            loss = loss_fn(outputs, targets)
            correct_predictions += torch.sum(preds == targets)
            losses.append(loss.item())

    mean_loss = np.mean(losses) if losses else float('nan')
    return correct_predictions.double() / n_examples, mean_loss
        

In [25]:
# Sentiment Analysis with BERT using huggingface, PyTorch and Python Tutorial: https://www.youtube.com/watch?v=Osj0Z6rwJB4
# citation: https://curiousily.com/posts/sentiment-analysis-with-bert-and-hugging-face-using-pytorch-and-python/

# Per-epoch training / validation accuracy and loss, keyed by metric name.
history = defaultdict(list)
# Best validation accuracy seen so far; the matching weights are saved to disk.
best_accuracy = 0

for epoch in range(EPOCHS):
    print(f'Epoch {epoch+1}/{EPOCHS}')
    print('-'*10)

    # one pass over the training data: loss and accuracy
    train_acc, train_loss = train_epoch(
        model=model,
        data_loader=train_data_loader,
        loss_fn=loss_fn,
        optimizer=optimizer,
        device=device,
        scheduler=scheduler,
        n_examples=len(df_train),
    )
    print(f'Train loss {train_loss} accuracy {train_acc}')

    # loss and accuracy on the validation split
    val_acc, val_loss = eval_model(
        model=model,
        data_loader=val_data_loader,
        loss_fn=loss_fn,
        device=device,
        n_examples=len(df_val),
    )
    print(f'Val loss {val_loss} val accuracy {val_acc}')
    print()

    history['train_acc'].append(train_acc)
    history['train_loss'].append(train_loss)
    history['val_acc'].append(val_acc)
    history['val_loss'].append(val_loss)

    # keep the weights of the epoch with the best validation accuracy
    if val_acc > best_accuracy:
        torch.save(model.state_dict(), 'best_model_state.bin')
        best_accuracy = val_acc

    # Note: 68 minutes to execute one epoch

Epoch 1/1
----------
Train loss 0.23362746742271162 accuracy 0.9259537037037037
Val loss 0.18004265934787692 val accuracy 0.9438333333333333



In [28]:
# Score the trained model on the test split; the averaged loss is not needed here.
test_acc, _ = eval_model(
    model=model,
    data_loader=test_data_loader,
    loss_fn=loss_fn,
    device=device,
    n_examples=len(df_test),
)

# Show the accuracy as a plain Python number.
test_acc.item()

0.9415789473684211