In [1]:
import sys
sys.path.append("../")  # go to parent directory

from src.model_manager.sentiment_dataset import SentimentDataset
from src.utils import compute_metrics
import pandas as pd
import transformers
from sklearn.model_selection import train_test_split
from transformers import BertTokenizerFast, BertForSequenceClassification, get_linear_schedule_with_warmup, create_optimizer, TrainingArguments, Trainer

from src.utils import read_csv
import numpy as np
import torch
from torch.optim import AdamW
from torch.optim.lr_scheduler import LambdaLR
import torch.nn as nn

from torch.utils.data import Dataset, DataLoader
from sklearn.preprocessing import OrdinalEncoder
from torch.utils.data import Dataset, DataLoader
from tqdm import trange

In [2]:
# Notebook-wide configuration
# NOTE(review): "cleanded" looks like a typo, but the value must match the file name on disk.
PATH_TO_DATA = "../data/training_cleanded.csv"
MODEL_NAME = 'bert-base-uncased'
NUM_EPOCHS = 3
BATCH_SIZE = 20

# Seed the PyTorch and NumPy random number generators for reproducibility
RANDOM_SEED = 42
torch.manual_seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)

In [3]:
# Load the training data from PATH_TO_DATA and preview the first five rows
df = read_csv(path=PATH_TO_DATA)
df.head()

Unnamed: 0,Id,Entity,Sentiment,Text,Word_count
0,2401,Borderlands,Positive,im getting on borderlands and i will murder yo...,11
1,2401,Borderlands,Positive,I am coming to the borders and I will kill you...,13
2,2401,Borderlands,Positive,im getting on borderlands and i will kill you ...,11
3,2401,Borderlands,Positive,im coming on borderlands and i will murder you...,11
4,2401,Borderlands,Positive,im getting on borderlands 2 and i will murder ...,13


In [4]:
# # tmp
# df = df.iloc[:10000]
# df

In [5]:
# Strip stray whitespace from the column names and write the result back to the
# frame: computing the stripped names without assigning them leaves `df` untouched,
# so later lookups such as df['Sentiment'] would still fail on padded headers.
df.columns = df.columns.str.strip()
df_columns = df.columns
df_columns

Index(['Id', 'Entity', 'Sentiment', 'Text', 'Word_count'], dtype='object')

#### Preprocessing: encode labels and split the data

In [6]:
# Encode sentiments as integers
sentiment_mapping = {'Negative': 0, 'Positive': 1, 'Neutral': 2, 'Irrelevant': 3}

# Re-running this cell must not corrupt the labels: mapping an already-encoded
# column would turn every value into NaN, so skip the mapping in that case.
already_encoded = df['Sentiment'].isin(list(sentiment_mapping.values())).all()
if not already_encoded:
    encoded_sentiment = df['Sentiment'].map(sentiment_mapping)
    # Series.map() silently yields NaN for labels missing from the mapping;
    # fail loudly here instead of passing NaN labels on to training.
    unknown_labels = df.loc[encoded_sentiment.isna(), 'Sentiment'].unique()
    if len(unknown_labels) > 0:
        raise ValueError(f"Unmapped Sentiment values: {list(unknown_labels)}")
    df['Sentiment'] = encoded_sentiment.astype('int64')
df.head(4)

Unnamed: 0,Id,Entity,Sentiment,Text,Word_count
0,2401,Borderlands,1,im getting on borderlands and i will murder yo...,11
1,2401,Borderlands,1,I am coming to the borders and I will kill you...,13
2,2401,Borderlands,1,im getting on borderlands and i will kill you ...,11
3,2401,Borderlands,1,im coming on borderlands and i will murder you...,11


In [7]:
# Hold out a fixed fraction of the rows for validation; the split is seeded so it is repeatable.
# NOTE(review): the preview of `df` shows several near-identical texts sharing one `Id`;
# a purely random split may place such variants on both sides — consider splitting by `Id`.
val_fraction = 0.2
train_data_df, val_data_df = train_test_split(
    df,
    test_size=val_fraction,
    random_state=RANDOM_SEED,
)

In [8]:
# Reset index
# drop=True discards the old (shuffled) row labels instead of inserting them as an
# extra 'index' column, and rebinding the name (rather than inplace=True) keeps the
# cell safe to re-run: the in-place form adds another column on every execution.
train_data_df = train_data_df.reset_index(drop=True)
val_data_df = val_data_df.reset_index(drop=True)

#### Initialize a BERT tokenizer

In [9]:
# Load the fast BERT tokenizer for the checkpoint named by MODEL_NAME
tokenizer = BertTokenizerFast.from_pretrained(MODEL_NAME)

In [10]:
# Tokenize both splits with identical settings: truncate to `max_length` tokens
# and pad the sequences within each call.
max_length = 100
tokenizer_kwargs = dict(truncation=True, padding=True, max_length=max_length)
train_tokenizer = tokenizer(train_data_df['Text'].tolist(), **tokenizer_kwargs)
val_tokenizer = tokenizer(val_data_df['Text'].tolist(), **tokenizer_kwargs)

In [11]:
# # cross check
# tokenizer.decode(train_tokenizer['input_ids'][10])

#### Create PyTorch Datasets

In [12]:
# Pair each split's tokenizer output with its integer sentiment labels
train_labels = list(train_data_df['Sentiment'])
val_labels = list(val_data_df['Sentiment'])

train_dataset = SentimentDataset(train_tokenizer, train_labels)
val_dataset = SentimentDataset(val_tokenizer, val_labels)
# val_dataset.__getitem__(10)

#### Set up BERT model

In [13]:
# Prefer the GPU when CUDA is available, otherwise fall back to the CPU
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
device

device(type='cuda')

In [14]:
# Model initialization
# Record the label names in the model config so that the checkpoint written by
# save_pretrained() reports 'Negative' / 'Positive' / ... instead of generic
# LABEL_<n> placeholders when it is reloaded for inference.
id2label = {label_id: label_name for label_name, label_id in sentiment_mapping.items()}
model = BertForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=len(sentiment_mapping),
    id2label=id2label,
    label2id=dict(sentiment_mapping),
)
model.to(device)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

In [15]:
# Set up optimizer and scheduler
optimizer = AdamW(model.parameters(), lr=2e-5)

# Ceiling division: the last, partially filled batch of each epoch is still an
# optimizer step. Floor division under-counts the steps, which makes the linear
# schedule hit a learning rate of 0 before training has finished.
# NOTE(review): assumes one device and no gradient accumulation — confirm.
steps_per_epoch = (len(train_dataset) + BATCH_SIZE - 1) // BATCH_SIZE
total_steps = steps_per_epoch * NUM_EPOCHS
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

#### Train the model

In [None]:
%%time

# Define training arguments
training_args = TrainingArguments(
    output_dir='../results',
    logging_dir='../logs',
    num_train_epochs=NUM_EPOCHS,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    warmup_steps=500,
    weight_decay=0.01,
    logging_steps=10,
    eval_strategy="epoch",
    # use_cpu=True
)

# Create Trainer instance
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    optimizers=(optimizer, scheduler),
    compute_metrics=compute_metrics  # Define metrics
)

# Train the model
trainer.train()

Epoch,Training Loss,Validation Loss


In [None]:
# Run evaluation on the validation set and print the raw metrics dictionary
eval_result = trainer.evaluate()
print(eval_result)

# The scalar metrics share one output format; the confusion matrix is printed separately
scalar_metrics = {
    "Accuracy": "eval_accuracy",
    "Precision": "eval_precision",
    "Recall": "eval_recall",
    "F1 Score": "eval_f1",
}
for metric_label, metric_key in scalar_metrics.items():
    print(f"Validation {metric_label}: {eval_result[metric_key]:.4f}")
print(f"Validation Confusion Matrix: \n{eval_result['eval_confusion_matrix']}")

In [None]:
# Display the metrics dictionary returned by trainer.evaluate().
# (The dangling `eval_result.` was a SyntaxError that stopped "Run All".)
eval_result

In [None]:
# Write the fine-tuned model and its tokenizer to the same directory
save_dir = '../bert_model'
model.save_pretrained(save_dir)
tokenizer.save_pretrained(save_dir)