In [1]:
import sys
sys.path.append("../")  # go to parent directory

from src.model_manager.sentiment_dataset import SentimentDataset
from src.utils import compute_metrics
import pandas as pd
import transformers
from sklearn.model_selection import train_test_split
from transformers import BertTokenizerFast, BertForSequenceClassification, get_linear_schedule_with_warmup, create_optimizer, TrainingArguments, Trainer

from src.utils import read_csv
import numpy as np
import torch
from torch.optim import AdamW
from torch.optim.lr_scheduler import LambdaLR
import torch.nn as nn

from torch.utils.data import Dataset, DataLoader
from sklearn.preprocessing import OrdinalEncoder
from torch.utils.data import Dataset, DataLoader
from tqdm import trange

In [2]:
# constants
MODEL_NAME = 'bert-base-uncased'
BATCH_SIZE = 20
NUM_EPOCHS = 5
MAX_LENGTH = 100

RANDOM_SEED = 42  # for reproducibilty
np.random.seed(RANDOM_SEED)
torch.manual_seed(RANDOM_SEED)

PATH_TO_DATA = "../data/training_cleanded.csv"

In [3]:
df = read_csv(path=PATH_TO_DATA)
df.head(5)

Unnamed: 0,Id,Entity,Sentiment,Text,Word_count
0,2401,Borderlands,Positive,im getting on borderlands and i will murder yo...,11
1,2401,Borderlands,Positive,I am coming to the borders and I will kill you...,13
2,2401,Borderlands,Positive,im getting on borderlands and i will kill you ...,11
3,2401,Borderlands,Positive,im coming on borderlands and i will murder you...,11
4,2401,Borderlands,Positive,im getting on borderlands 2 and i will murder ...,13


In [4]:
# # tmp
# df = df.iloc[:10000]
# df

In [5]:
df_columns = df.columns.str.strip()
df_columns

Index(['Id', 'Entity', 'Sentiment', 'Text', 'Word_count'], dtype='object')

#### Some preprocessing

In [6]:
# Encode sentiments as integers
sentiment_mapping = {'Negative': 0, 'Positive': 1, 'Neutral': 2, 'Irrelevant': 3}
df['Sentiment'] = df['Sentiment'].map(sentiment_mapping)
df.head(4)

Unnamed: 0,Id,Entity,Sentiment,Text,Word_count
0,2401,Borderlands,1,im getting on borderlands and i will murder yo...,11
1,2401,Borderlands,1,I am coming to the borders and I will kill you...,13
2,2401,Borderlands,1,im getting on borderlands and i will kill you ...,11
3,2401,Borderlands,1,im coming on borderlands and i will murder you...,11


In [7]:
# Split the data into training and validation sets
train_df, test_df = train_test_split(df, test_size=0.2, random_state=RANDOM_SEED)
val_df, test_df = train_test_split(test_df, test_size=0.5, random_state=RANDOM_SEED)
print(f"Shapes:\ntrain_df: {train_df.shape}\nval_df: {val_df.shape}\ntest_df: {test_df.shape}")

Shapes:
train_df: (57324, 5)
val_df: (7166, 5)
test_df: (7166, 5)


In [8]:
# Reset index
train_df.reset_index(inplace=True)
val_df.reset_index(inplace=True)
test_df.reset_index(inplace=True)

#### Initialize a BERT tokenizer

In [9]:
tokenizer = BertTokenizerFast.from_pretrained(MODEL_NAME)

In [10]:
# Tokenize the data
train_tokenizer = tokenizer(list(train_df['Text']), truncation=True, padding=True, max_length=MAX_LENGTH)
val_tokenizer = tokenizer(list(val_df['Text']), truncation=True, padding=True, max_length=MAX_LENGTH)
test_tokenizer = tokenizer(list(test_df['Text']), truncation=True, padding=True, max_length=MAX_LENGTH)

In [11]:
# # cross check
# tokenizer.decode(train_tokenizer['input_ids'][10])

#### Create PyTorch Dataset and DataLoader

In [12]:
# Create datasets
train_dataset = SentimentDataset(train_tokenizer, list(train_df['Sentiment']))
val_dataset = SentimentDataset(val_tokenizer, list(val_df['Sentiment']))
test_dataset = SentimentDataset(test_tokenizer, list(test_df['Sentiment']))
# val_dataset.__getitem__(10)

#### Set up BERT model

In [13]:
# Set up device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

In [14]:
# Model initialization
model = BertForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=len(sentiment_mapping))
model.to(device)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

In [15]:
# Set up optimizer and scheduler
optimizer = AdamW(model.parameters(), lr=2e-5)
total_steps =len(train_dataset) // BATCH_SIZE * NUM_EPOCHS 
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

####  Train the model

In [16]:
%%time

# Define training arguments
training_args = TrainingArguments(
    output_dir='../results',
    logging_dir='../logs',
    num_train_epochs=NUM_EPOCHS,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    warmup_steps=500,
    weight_decay=0.01,
    learning_rate=2e-5,
    logging_steps=10,
    eval_strategy="epoch",
    gradient_accumulation_steps=2  # Number of updates steps to accumulate the gradients 
)

# Create Trainer instance
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    optimizers=(optimizer, scheduler),
    compute_metrics=compute_metrics  # Define metrics
)

# Train the model
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.6222,0.535157,0.80561,0.805666,0.80561,0.803895
2,0.2206,0.330864,0.888362,0.890437,0.888362,0.888129
3,0.1718,0.336969,0.910131,0.910177,0.910131,0.91015
4,0.1568,0.362582,0.917527,0.91775,0.917527,0.917556
5,0.0601,0.363734,0.920876,0.921175,0.920876,0.920919


CPU times: user 19min 10s, sys: 50min 27s, total: 1h 9min 38s
Wall time: 1h 9min 33s


TrainOutput(global_step=14335, training_loss=0.30745384560921984, metrics={'train_runtime': 4173.5719, 'train_samples_per_second': 68.675, 'train_steps_per_second': 3.435, 'total_flos': 1.4729344705296e+16, 'train_loss': 0.30745384560921984, 'epoch': 5.0})

In [17]:
# Evaluate the model
eval_result = trainer.evaluate(eval_dataset=test_dataset)
print(eval_result)
print(f"Validation Accuracy: {eval_result['eval_accuracy']:.4f}")
print(f"Validation Precision: {eval_result['eval_precision']:.4f}")
print(f"Validation Recall: {eval_result['eval_recall']:.4f}")
print(f"Validation F1 Score: {eval_result['eval_f1']:.4f}")

{'eval_loss': 0.3688706159591675, 'eval_accuracy': 0.9182249511582473, 'eval_precision': 0.9185249101338531, 'eval_recall': 0.9182249511582473, 'eval_f1': 0.9182225767646979, 'eval_runtime': 30.187, 'eval_samples_per_second': 237.387, 'eval_steps_per_second': 11.893, 'epoch': 5.0}
Validation Accuracy: 0.9182
Validation Precision: 0.9185
Validation Recall: 0.9182
Validation F1 Score: 0.9182


In [18]:
# Save the model
model.save_pretrained('../bert_model')
tokenizer.save_pretrained('../bert_model')

('../bert_model/tokenizer_config.json',
 '../bert_model/special_tokens_map.json',
 '../bert_model/vocab.txt',
 '../bert_model/added_tokens.json',
 '../bert_model/tokenizer.json')