In [1]:
import sys
sys.path.append("../")  # go to parent directory

# from src.model_manager.sentiment_dataset import SentimentDataset

from src.utils import compute_metrics
import pandas as pd
import transformers
from sklearn.model_selection import train_test_split
from transformers import AutoModelForSequenceClassification, AutoTokenizer, get_linear_schedule_with_warmup, create_optimizer, TrainingArguments, Trainer

from src.utils import read_csv, get_sentiment_mapping
import numpy as np
import torch

import torch.nn as nn

#from torch.utils.data import Dataset, DataLoader
from datasets import Dataset
from sklearn.preprocessing import OrdinalEncoder
# from torch.utils.data import Dataset, DataLoader

In [2]:
# constants
MODEL_NAME = 'bert-base-cased'
BATCH_SIZE = 20
NUM_EPOCHS = 3
MAX_LENGTH = 100

RANDOM_SEED = 42  # for reproducibilty
np.random.seed(RANDOM_SEED)
torch.manual_seed(RANDOM_SEED)

PATH_TO_DATA = "../data/training_cleanded.csv"

In [3]:
df = read_csv(path=PATH_TO_DATA)
df.head(5)

Unnamed: 0,id,entity,labels,text,word_count
0,2401,Borderlands,Positive,im getting on borderlands and i will murder yo...,11
1,2401,Borderlands,Positive,I am coming to the borders and I will kill you...,13
2,2401,Borderlands,Positive,im getting on borderlands and i will kill you ...,11
3,2401,Borderlands,Positive,im coming on borderlands and i will murder you...,11
4,2401,Borderlands,Positive,im getting on borderlands 2 and i will murder ...,13


In [4]:
df_columns = df.columns.str.strip()
df_columns

Index(['id', 'entity', 'labels', 'text', 'word_count'], dtype='object')

#### Some preprocessing

In [6]:
SENT_LABELS = ['Irrelevant', 'Neutral', 'Negative', 'Positive']

In [7]:
# Encode sentiments as integers
sentiment_mapping = get_sentiment_mapping(sent_list=SENT_LABELS, is_label_to_idx=True)
df['labels'] = df['labels'].map(sentiment_mapping)
df.head(4)

Unnamed: 0,id,entity,labels,text,word_count
0,2401,Borderlands,3,im getting on borderlands and i will murder yo...,11
1,2401,Borderlands,3,I am coming to the borders and I will kill you...,13
2,2401,Borderlands,3,im getting on borderlands and i will kill you ...,11
3,2401,Borderlands,3,im coming on borderlands and i will murder you...,11


In [8]:
# Split the data into training and validation sets
train_df, test_df = train_test_split(df, test_size=0.2, random_state=RANDOM_SEED)
print(f"Shapes:\ntrain_df: {train_df.shape}\ntest_df: {test_df.shape}")

Shapes:
train_df: (57324, 5)
test_df: (14332, 5)


In [9]:
print(train_df.columns)
print(test_df.columns)

Index(['id', 'entity', 'labels', 'text', 'word_count'], dtype='object')
Index(['id', 'entity', 'labels', 'text', 'word_count'], dtype='object')


In [10]:
# Reset index
train_df.reset_index(inplace=True)
test_df.reset_index(inplace=True)

In [12]:
# test and train set label counts must be same
len(train_df.labels.value_counts()), len(test_df.labels.value_counts()) 

(4, 4)

#### Initialize a BERT model and tokenizer

In [13]:
# Set your number of unique labels for your target
num_labels = len(SENT_LABELS)  # 4

In [14]:
# Set up device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

In [15]:
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=len(SENT_LABELS))
model.to(device)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(28996, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

In [16]:
# tokenizer = BertTokenizerFast.from_pretrained(MODEL_NAME)
# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=True)

In [17]:
# Tokenize the data

def tokenize(batch):
    return tokenizer(batch['text'], padding='max_length', truncation=True, max_length=MAX_LENGTH)
    
train_dataset = Dataset.from_pandas(train_df)
test_dataset = Dataset.from_pandas(test_df)

In [19]:
# Tokenize by batches
train_dataset = test_dataset.map(tokenize, batched=True, batch_size=len(train_dataset))
train_dataset.set_format('torch', columns=['input_ids', 'attention_mask', 'labels'])


Map:   0%|          | 0/14332 [00:00<?, ? examples/s]

In [20]:
test_dataset = test_dataset.map(tokenize, batched=True, batch_size=len(test_dataset))
test_dataset.set_format('torch', columns=['input_ids', 'attention_mask', 'labels'])

Map:   0%|          | 0/14332 [00:00<?, ? examples/s]

#### Create PyTorch Dataset and DataLoader

In [21]:
# # Create datasets
# train_dataset = SentimentDataset(train_tokenizer, list(train_df['Sentiment']))
# test_dataset = SentimentDataset(test_tokenizer, list(test_df['Sentiment']))
# val_dataset.__getitem__(10)

#### Set up BERT model

In [22]:
# Set training args
NUM_EPOCH = 3
WARMUP_STEPS = 500
LEARNING_RATE = 2e-5
BATCH_SIZE = 16

In [23]:
# # Set up optimizer and scheduler
# optimizer = AdamW(model.parameters(), lr=2e-5)
# total_steps =len(train_dataset) // BATCH_SIZE * NUM_EPOCHS 
# scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

In [24]:
# Define training arguments
training_args = TrainingArguments(
    output_dir='../results',
    logging_dir='../logs',
    save_strategy = "epoch",
    eval_strategy="epoch",
    num_train_epochs=NUM_EPOCHS,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=128, # batch size for evaluation
    warmup_steps=WARMUP_STEPS,
    weight_decay=0.01,
    learning_rate=LEARNING_RATE,
    load_best_model_at_end=True,       # load best model at the end according to chosen metric
    metric_for_best_model='f1'         # chosen metric to use for picking best model
)

####  Train the model

In [25]:
%%time
# Create Trainer instance
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics  # Define metrics
)

# Train the model
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.231,0.816278,0.683924,0.682791,0.683924,0.666931
2,0.8054,0.491585,0.836659,0.835724,0.836659,0.835104
3,0.5078,0.332447,0.891641,0.891237,0.891641,0.890929


CPU times: user 3min 41s, sys: 9min 42s, total: 13min 23s
Wall time: 13min 22s


TrainOutput(global_step=2688, training_loss=0.8134904276757013, metrics={'train_runtime': 802.124, 'train_samples_per_second': 53.603, 'train_steps_per_second': 3.351, 'total_flos': 2209555875196800.0, 'train_loss': 0.8134904276757013, 'epoch': 3.0})

In [26]:
# Evaluate the model
eval_result = trainer.evaluate()
print(eval_result)
print(f"Validation Accuracy: {eval_result['eval_accuracy']:.4f}")
print(f"Validation Precision: {eval_result['eval_precision']:.4f}")
print(f"Validation Recall: {eval_result['eval_recall']:.4f}")
print(f"Validation F1 Score: {eval_result['eval_f1']:.4f}")

{'eval_loss': 0.33244746923446655, 'eval_accuracy': 0.8916410828914317, 'eval_precision': 0.8912371548599353, 'eval_recall': 0.8916410828914317, 'eval_f1': 0.8909287387768433, 'eval_runtime': 62.7804, 'eval_samples_per_second': 228.288, 'eval_steps_per_second': 1.784, 'epoch': 3.0}
Validation Accuracy: 0.8916
Validation Precision: 0.8912
Validation Recall: 0.8916
Validation F1 Score: 0.8909


In [27]:
# Save the model
model.save_pretrained('../bert_model')
tokenizer.save_pretrained('../bert_model')

('../bert_model/tokenizer_config.json',
 '../bert_model/special_tokens_map.json',
 '../bert_model/vocab.txt',
 '../bert_model/added_tokens.json',
 '../bert_model/tokenizer.json')