<a href="https://colab.research.google.com/github/mlEngAnu/sentiment_analysis/blob/main/emotion_classifier.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Installing transformers library
!pip install transformers

In [17]:
# import necessary libraries
import numpy as np
import pandas as pd
import transformers as ppb
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import classification_report,confusion_matrix
import torch
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

In [18]:
# Load the SST-2 training split: a tab-separated file without a header row,
# so pandas names the two columns 0 (sentence) and 1 (label)
SST2_TRAIN_URL = 'https://github.com/clairett/pytorch-sentiment-classification/raw/master/data/SST2/train.tsv'
emotion_dataset = pd.read_csv(SST2_TRAIN_URL, sep='\t', header=None)

In [19]:
# Dimensions of the loaded table as (rows, columns)
emotion_dataset.shape

(6920, 2)

In [20]:
# Preview the first five rows: column 0 is the sentence, column 1 the label
emotion_dataset.head(5)

Unnamed: 0,0,1
0,"a stirring , funny and finally transporting re...",1
1,apparently reassembled from the cutting room f...,0
2,they presume their audience wo n't sit still f...,0
3,this is a visually stunning rumination on love...,1
4,jonathan parker 's bartleby should have been t...,1


In [21]:
# Length of the longest sentence, counted in characters (not tokens)
longest_sentence_chars = emotion_dataset[0].str.len().max()
print(longest_sentence_chars)
# Number of distinct label values found in column 1
distinct_label_count = emotion_dataset[1].nunique()
print(distinct_label_count)

271
2


In [22]:
# Pretrained BERT: pick the model class, the tokenizer class and the checkpoint name
pretrained_weights = 'bert-base-uncased'
model_class = ppb.BertModel
tokenizer_class = ppb.BertTokenizer
# Instantiate the tokenizer and the model from the same pretrained checkpoint
tokenizer = tokenizer_class.from_pretrained(pretrained_weights)
model = model_class.from_pretrained(pretrained_weights)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [None]:
# Plain Python list of the raw sentences (column 0 of the dataset)
text = emotion_dataset[0].tolist()
# Preview only the first few sentences; printing the whole list would flood
# the cell output with every sentence in the dataset
print(text[:5])

In [25]:
# Encode every sentence into token ids, including the special tokens
# requested via add_special_tokens=True
tokenized = emotion_dataset[0].apply(
    lambda sentence: tokenizer.encode(sentence, add_special_tokens=True)
)
# Length of the longest tokenized sentence (0 when there are no sentences)
max_len = max((len(token_ids) for token_ids in tokenized.values), default=0)

print(max_len)

67


In [None]:
# Build one fixed-size embedding per sentence: the final-layer hidden state at
# sequence position 0 (the [CLS] slot) of the pretrained model.

# Switch the model to evaluation mode before running inference
model.eval()

batch_size = 32
# Embedding width taken from the model config instead of a hard-coded 768
hidden_size = model.config.hidden_size

# Collect the per-batch arrays and join them once after the loop; calling
# np.append inside the loop re-copies the whole accumulated matrix every batch.
batch_features = []

for idx in range(0, len(text), batch_size):
    # Slicing already clamps at the end of the list, so no min() is needed
    batch = text[idx : idx + batch_size]

    # Tokenize, then pad/truncate every sentence to max_len, the longest
    # tokenized sentence measured on this dataset (previously hard-coded as 67)
    encoded = tokenizer.batch_encode_plus(batch, max_length=max_len, padding='max_length', truncation=True)
    encoded = {key: torch.LongTensor(value) for key, value in encoded.items()}

    # No gradients are needed for feature extraction
    with torch.no_grad():
        outputs = model(**encoded)
        # outputs[0] holds the last hidden states; keep position 0 of each sequence
        cls_hidden_states = outputs[0][:, 0, :]
        batch_features.append(cls_hidden_states.numpy())

if batch_features:
    # Cast to float64 so the result has the same dtype as the previous
    # np.empty/np.append accumulation produced
    features = np.concatenate(batch_features, axis=0).astype(np.float64)
else:
    # No sentences: keep an empty (0, hidden_size) matrix, as before
    features = np.empty((0, hidden_size))

In [27]:
# Print the extracted feature matrix (NumPy abbreviates large arrays with "...")
print(features)

[[-0.55664974 -0.33129364 -0.22280592 ... -0.22786188  0.63191968
   0.24306628]
 [-0.28789204 -0.14285493 -0.06857792 ... -0.31690586  0.18455292
   0.31989801]
 [-0.18645325  0.30229419 -0.18511124 ... -0.33493006  0.98487359
   0.52977449]
 ...
 [-0.10313777  0.27795506 -0.45855811 ...  0.06911698  0.78224915
   0.57749891]
 [-0.30236343 -0.15446037 -0.18571363 ... -0.1518053   0.72149092
   0.09804416]
 [ 0.07386218 -0.17063418 -0.03027087 ... -0.12601498  0.46106932
   0.20001717]]


In [28]:
# Shape of the feature matrix: one row per sentence, one column per embedding dimension
features.shape

(6920, 768)

In [29]:
# Column 1 of the dataset holds the class label of each sentence
labels = emotion_dataset[1]

In [30]:
# Split features and labels into train and test sets. The test fraction is left
# at scikit-learn's default; random_state is fixed so the same rows land in each
# split on every run, which keeps the reported metrics reproducible.
train_features, test_features, train_labels, test_labels = train_test_split(
    features, labels, random_state=42
)

In [31]:
# Show the training labels; the Series index is the original row number in the dataset
print(train_labels)

3875    0
4377    1
1735    0
3713    0
756     1
       ..
4139    1
4266    1
4633    0
4283    0
3572    0
Name: 1, Length: 5190, dtype: int64


In [41]:
# Train a logistic regression classifier on the training-set embeddings
from sklearn.linear_model import LogisticRegression
# max_iter is raised above scikit-learn's default of 100: warnings are silenced
# globally in this notebook, so a solver that stopped before converging would
# otherwise go unnoticed.
lr_clf = LogisticRegression(max_iter=1000)
lr_clf.fit(train_features, train_labels)

LogisticRegression()

In [34]:
# Print the held-out test feature matrix (NumPy abbreviates large arrays with "...")
print(test_features)

[[-0.00827334 -0.1539838  -0.13339958 ... -0.35909629  0.15695181
   0.44999924]
 [ 0.2137866   0.60637081 -0.56289107 ... -0.60069811  0.36876115
   1.02119017]
 [ 0.12204102  0.34742224  0.12262959 ... -0.54890859  0.26159024
   0.36416006]
 ...
 [-0.45724216 -0.08256365 -0.21932998 ... -0.15496208  0.37513605
   0.20498526]
 [-0.01540317  0.01535242  0.27054039 ... -0.01284905  0.38260213
   0.39556494]
 [-0.27253616 -0.06650821 -0.39774039 ... -0.32066688  0.31617862
   0.46787518]]


In [35]:
# Shape of the test feature matrix as (rows, columns)
test_features.shape

(1730, 768)

In [36]:
# Show the test labels; the Series index is the original row number in the dataset
print(test_labels)

235     1
3142    0
6679    0
1061    0
5337    1
       ..
2070    0
38      1
4794    1
6608    0
445     0
Name: 1, Length: 1730, dtype: int64


In [45]:
# Predict class labels for the held-out test features with the fitted classifier
lr_pred = lr_clf.predict(test_features)

In [46]:
# Score the test-set predictions: accuracy, and F1 computed with average='weighted'
lr_test_accuracy = accuracy_score(test_labels, lr_pred)
lr_test_f1 = f1_score(test_labels, lr_pred, average='weighted')

# Report both metrics as percentages with two decimal places
for metric_label, metric_value in (
    ('Logistic Regression Model Accuracy: ', lr_test_accuracy),
    ('Logistic Regression Model F1 Score: ', lr_test_f1),
):
    print(metric_label, "%.2f" % (metric_value*100))

Logistic Regression Model Accuracy:  86.18
Logistic Regression Model F1 Score:  86.19
