<a href="https://colab.research.google.com/github/lov435/SOEmotions/blob/main/hugging_bert_goemotions.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Install PyTorch and BERT transformers from HuggingFace

In [1]:
!pip install transformers
!pip install torch

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


### Import necessary packages

In [2]:
from sklearn.decomposition import PCA
from transformers import BertTokenizer, BertModel
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

import pandas as pd
import numpy as np
import torch

### Read the emotions prediction spreadsheet

In [3]:
url='https://drive.google.com/file/d/1OW1PZ-MvXFGd4KbqE8zjSakPNVKXXVLy/view?usp=sharing'
file_id=url.split('/')[-2]
dwn_url='https://drive.google.com/uc?id=' + file_id
df = pd.read_csv(dwn_url)

### Extract the Comment text and the corresponding Haoxiang's group from the spreadsheet. Also, remove the rows with empty comments

In [4]:
df = df.dropna(axis=0, subset=['CommentTextProc'])
#GoEmotions features columns
goEmoCols = ['admiration', 'amusement', 'anger', 'annoyance', 'approval', 'caring', 'confusion', 'curiosity', 'desire', 'disappointment', 'disapproval', 'disgust', 'embarrassment', 'excitement', 'fear', 'gratitude', 'grief', 'joy', 'love', 'nervousness', 'optimism', 'pride', 'realization', 'relief', 'remorse', 'sadness', 'surprise', 'neutral'] 
X_cols = ['CommentTextProc']
#Append the comment text to the goEmotions features to form a complete feature set
X_cols.extend(goEmoCols)
X = df[X_cols]
#Group column is the class
y = df['Group']

### Split the data into training and test

In [5]:
X_train, X_test, y_train, y_test = train_test_split(X, y,
                                   random_state=104, 
                                   test_size=0.40, 
                                   shuffle=True)

### Initialize the BERT tokenizer and model

In [6]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained("bert-base-uncased").to(device)


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


### Produce the features (BERT vectors) from text

In [7]:
#X_train, X_test, y_train, y_test
encoded_x_train = tokenizer(list(X_train['CommentTextProc']), padding = True, truncation = True, return_tensors='pt')
encoded_x_test = tokenizer(list(X_test['CommentTextProc']), padding = True, truncation = True, return_tensors='pt')

#move on device (GPU)
encoded_x_train = {k:torch.tensor(v).to(device) for k,v in encoded_x_train.items()}
encoded_x_test = {k:torch.tensor(v).to(device) for k,v in encoded_x_test.items()}

with torch.no_grad():
  output_train = model(**encoded_x_train)
  output_test = model(**encoded_x_test)

#We need the [CLS] output for our classification task  
cls_output_train = output_train.last_hidden_state[:,0,:]
cls_output_test = output_test.last_hidden_state[:,0,:]

print("Shape is")
print(cls_output_train.shape)
print(cls_output_test.shape)

  
  import sys


Shape is
torch.Size([2190, 768])
torch.Size([1460, 768])


#### Perform PCA

In [8]:
# Use PCA to reduce dimensions from 768 to 10
pca = PCA(n_components = 20, random_state = 7)
#Temporarily concatenate the training and test features for PCA
nump_features = np.concatenate((cls_output_train.detach().cpu().numpy(), cls_output_test.detach().cpu().numpy()))
#print(nump_features.shape)
#print(nump_features)
X1 = pca.fit_transform(nump_features)
print(X1.shape)
print(X1)
#Now slice the PCA feature set to training and test
X_pca_train = X1[:len(y_train),]
X_pca_test = X1[len(y_train):,]
print(X_pca_train.shape)



(3650, 20)
[[ 3.4247313  -1.8072591   1.629929   ...  0.5630206  -0.5559147
  -0.02455406]
 [ 0.24995746  4.0505266  -0.76881146 ... -0.04837902 -0.71838933
   0.62327296]
 [ 1.352505    0.71638167 -0.01074351 ... -0.07519045  0.76614106
   0.08123524]
 ...
 [ 1.590624   -2.0225754  -0.972284   ...  0.12262599  0.17904675
  -0.21681839]
 [-1.9809672   1.7551185   0.5498128  ...  0.104182   -0.6239363
   0.10105332]
 [-0.35026196  0.31541175 -0.04346808 ... -0.6486864   0.99373025
   1.1642106 ]]
(2190, 20)


### Combine the PCA BERT features with the GoEmotions features

In [9]:
X_goEmo_train = X_train[goEmoCols].to_numpy()
X_goEmo_test = X_test[goEmoCols].to_numpy()
X_train_final = np.concatenate((X_goEmo_train, X_pca_train), axis=1)
X_test_final = np.concatenate((X_goEmo_test, X_pca_test), axis=1)



### Perform a classification task

In [10]:
rf = RandomForestClassifier()
rf.fit(X_train_final, y_train)
rf.score(X_test_final, y_test)


0.5910958904109589