In [1]:
import numpy as np
import pandas as pd
import transformers
from transformers import BertModel
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

import torch
from torch import nn, optim
from torch.optim.lr_scheduler import ReduceLROnPlateau
from torch.utils.data import Dataset, DataLoader

In [2]:
# Reading Data
df = pd.read_csv("../input/financial-sentiment-analysis/data.csv")
encoder = LabelEncoder()
labels = encoder.fit_transform(df.Sentiment)

In [3]:
# Init tokenizer
# Bert has its own tokenizer which also include some special tokens
tokenizer = transformers.BertTokenizer.from_pretrained('bert-base-uncased')

# Spliting data
X_train, X_test, y_train, y_test = train_test_split(df.Sentence.values, labels, test_size=.2, stratify=df.Sentiment.values)

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [4]:
# Data generator
# Inherit Dataset from torch which need to implement __len__ and __getitem__ function

class Datagen(Dataset):
    def __init__(self, sentence, target, tokenizer, max_len):
        self.sentence = sentence
        self.target = target
        self.tokenizer = tokenizer
        self.max_len = max_len
    def __len__(self):
        return len(self.sentence)
    def __getitem__(self, idx):
        sentence = self.sentence[idx]
        bert_encoding = tokenizer.encode_plus(
            sentence, 
            max_length = self.max_len, 
            add_special_tokens = True, # include special tokens
            padding = 'max_length', 
            truncation = True, 
            return_attention_mask = True, # return attention mask which is required during training 
            return_token_type_ids = False, 
            return_tensors = 'pt' # pt is pytorch format tensor
        )
        return {
            "input_ids": torch.squeeze(bert_encoding["input_ids"]),
            "attention_mask": torch.squeeze(bert_encoding["attention_mask"]),
            "target": torch.tensor(self.target[idx], dtype=torch.long)
        }
train_data = Datagen(X_train, y_train, tokenizer, 50)
test_data = Datagen(X_test, y_test, tokenizer, 50)
# DataLoader created batch generator and have prefatch ability
train_dataloader = DataLoader(train_data, batch_size=16, num_workers=2)
test_dataloader = DataLoader(test_data, batch_size=15, num_workers=2)

In [5]:
# Model
class SentimentModel(nn.Module):
    def __init__(self, n_class):
        # Here we are initializing some nn.Module attributes and functions
        super(SentimentModel, self).__init__()
        # Initializing bert model
        self.bert = BertModel.from_pretrained("bert-base-uncased")
        self.drop = nn.Dropout(0.3)
        self.out = nn.Linear(self.bert.config.hidden_size, n_class)
        self.softmax = nn.Softmax(dim=1)
    def forward(self, input_ids, attention_mask):
        # bert model gives two outputs, sequenced output and pooled output
        # Sequence output is last layer output for each token in sentence usualy used for similarity task
        # Pooled output is cls token(starting token for each sentence) output from model used for classification task
        _, pooled_output = self.bert(input_ids, attention_mask, return_dict=False)
        output = self.drop(pooled_output)
        output = self.out(output)
        return self.softmax(output)
model = SentimentModel(n_class=3)
# Moving model to GPU
model = model.to(torch.device('cuda'))
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)
criterion = nn.CrossEntropyLoss()
scheduler = ReduceLROnPlateau(optimizer, 'min')

Downloading:   0%|          | 0.00/420M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [6]:
def check_acc(preds, labels):
    correct = 0
    for pred, label in zip(preds, labels):
        if pred == label:
            correct += 1
    return correct/len(preds)

In [7]:
def accuracy(dataloader, model):
    model.eval()
    batch_acc = 0
    for inputs in dataloader:
        to_cuda = lambda x: x.to(torch.device('cuda'), non_blocking=True).long()
        input_ids, attention_mask, target = list(map(to_cuda, inputs.values()))

        outputs = model(input_ids, attention_mask)
        prediction = outputs.argmax(dim=1)
        acc = check_acc(prediction, target)
        batch_acc += acc
    return batch_acc/len(dataloader)

In [8]:
#  Training
for epoch in range(5):
    # Telling model to start training mode for dropout and BN
    model.train()
    batch_acc = 0
    for steps, inputs in enumerate(train_dataloader):
        to_cuda = lambda x: x.to(torch.device('cuda'), non_blocking=True).long()
        # Moving variables to GPU
        input_ids, attention_mask, target = list(map(to_cuda, inputs.values()))
        outputs = model(input_ids, attention_mask)
        loss = criterion(outputs, target)
        # Initializing gradient to 0 so that it dont add up previous gradient        
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    train_acc = accuracy(train_dataloader, model)
    test_acc = accuracy(test_dataloader, model)
    
    scheduler.step(test_acc)
    print(f"Epoch: {epoch}\t Train accuracy: {train_acc}\t Test accuracy: {test_acc}")

Epoch: 0	 Train accuracy: 0.7542662116040956	 Test accuracy: 0.7296092796092795
Epoch: 1	 Train accuracy: 0.814419795221843	 Test accuracy: 0.7810134310134307
Epoch: 2	 Train accuracy: 0.8483361774744027	 Test accuracy: 0.7997557997557997
Epoch: 3	 Train accuracy: 0.8575085324232082	 Test accuracy: 0.7989010989010986
Epoch: 4	 Train accuracy: 0.8630546075085325	 Test accuracy: 0.7989621489621487
