# Goal: Predict the Sentiment of Tweets

# A. Import Files & Modules

In [2]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

from sentence_transformers import SentenceTransformer
from sentence_transformers.losses import CosineSimilarityLoss

from xgboost import XGBClassifier
from sklearn.metrics import classification_report

from sklearn.pipeline import make_pipeline
from lime.lime_text import LimeTextExplainer

  from .autonotebook import tqdm as notebook_tqdm


# B. Process Data
Concatenate Data for EDA

In [3]:
# Column names for the header-less CSV files (position -> name).
TWEET_COLUMNS = {0: 'tweet_id', 1: 'entity', 2: 'sentiment', 3: 'tweet'}


def _load_tweets(path):
    """Read one header-less tweet CSV into a clean DataFrame.

    Columns are named per TWEET_COLUMNS, rows containing any missing value
    are dropped, and the index is reset to 0..n-1.
    """
    return (
        pd.read_csv(path, header=None)
        .rename(columns=TWEET_COLUMNS)
        .dropna()
        .reset_index(drop=True)
    )


# Train
train = _load_tweets("data/twitter_training.csv")
train_id = train['tweet_id']

# Val
val = _load_tweets("data/twitter_validation.csv")
val_id = val['tweet_id']

# Encode BOTH splits against the training categories. Encoding each split
# independently only yields matching integer codes when the two files happen
# to contain exactly the same set of sentiment values.
sentiment_dtype = train['sentiment'].astype('category').dtype
train['sentiment_label'] = train['sentiment'].astype(sentiment_dtype).cat.codes
val['sentiment_label'] = val['sentiment'].astype(sentiment_dtype).cat.codes

# A sentiment absent from the training categories is encoded as -1; fail
# loudly rather than evaluating against a meaningless label.
assert (val['sentiment_label'] >= 0).all(), (
    "validation data contains sentiment values not present in the training data"
)

df = pd.concat([train, val], axis=0, ignore_index=True)

### EDA on unique tweets
# Keep only the first row seen for each tweet_id.
df = df.groupby("tweet_id").head(1).reset_index(drop=True)
df

Unnamed: 0,tweet_id,entity,sentiment,tweet,sentiment_label
0,2401,Borderlands,Positive,im getting on borderlands and i will murder yo...,3
1,2402,Borderlands,Positive,So I spent a few hours making something for fu...,3
2,2403,Borderlands,Neutral,"Rock-Hard La Varlope, RARE & POWERFUL, HANDSOM...",2
3,2404,Borderlands,Positive,that was the first borderlands session in a lo...,3
4,2405,Borderlands,Negative,the biggest dissappoinment in my life came out...,1
...,...,...,...,...,...
12442,9196,Nvidia,Negative,Cheap doesn't mean better btw! . . techsall.co...,1
12443,9197,Nvidia,Neutral,Nvidia doesn’t want to give up its 2017 ‘crypt...,2
12444,9198,Nvidia,Negative,Nvidia really delayed the 3070 2 weeks .,1
12445,9199,Nvidia,Positive,Let no elim go unnoticed. . . . NVIDIA Highlig...,3


# D. Modeling - Using a Sentence Transformer 

In [16]:
# Load the pretrained 'all-mpnet-base-v2' sentence-embedding model.
model = SentenceTransformer(model_name_or_path='all-mpnet-base-v2')
# Maximum input length used when encoding.
# NOTE(review): the model card appears to list a shorter default limit;
# confirm that extending it to 512 is intended.
model.max_seq_length = 512

## 1. Train embeddings

In [17]:
# Embed every training tweet. encode() is documented to take a string or a
# list of strings, so hand it a plain list instead of a pandas Series; that
# way the call does not depend on the Series having a clean 0..n-1 index.
train_embeddings = model.encode(train['tweet'].tolist(), show_progress_bar=True)

Batches: 100%|██████████| 2313/2313 [09:49<00:00,  3.92it/s]


In [18]:
# Wrap the embedding array in a DataFrame and append the tweet_id column
# (matched to the rows through the shared default integer index).
train_embeddings_dataframe = pd.DataFrame(train_embeddings).assign(tweet_id=train_id)
train_embeddings_dataframe

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,759,760,761,762,763,764,765,766,767,tweet_id
0,-0.004617,0.061283,-0.026159,-0.007170,-0.035278,0.034527,-0.047308,0.006524,-0.002472,-0.018194,...,0.026060,0.001863,0.014621,0.015788,-0.066787,0.019828,0.007364,-0.061734,0.012765,2401
1,-0.006876,0.105905,-0.020807,-0.002566,-0.011347,-0.043241,0.024239,-0.001461,0.039824,-0.012489,...,-0.001388,0.007216,0.062698,0.008089,-0.081022,0.033359,0.019799,-0.055573,0.018820,2401
2,-0.003253,0.044494,-0.024069,0.004845,-0.036237,0.025705,-0.044950,-0.000436,-0.000442,-0.022665,...,0.015292,0.012439,0.031093,0.015085,-0.058663,0.025133,0.007039,-0.048116,0.012398,2401
3,0.027048,0.080196,-0.023949,0.010769,-0.036519,0.001435,-0.029009,-0.003588,-0.003324,-0.025846,...,0.014377,-0.005896,0.042844,0.016844,-0.085140,0.022476,0.020015,-0.060308,0.021683,2401
4,0.003457,0.075470,-0.022148,-0.009670,-0.039551,0.039448,-0.042425,0.013165,-0.009157,-0.016789,...,0.044914,-0.011441,0.013813,0.021353,-0.067120,0.027415,0.016213,-0.065374,0.020063,2401
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
73991,-0.050199,0.039013,-0.035952,0.044471,-0.037699,0.023799,0.007762,0.009322,0.049473,-0.002142,...,0.057798,0.034082,0.022493,0.013459,0.014401,-0.021274,-0.037121,0.020137,0.010220,9200
73992,-0.021803,0.068168,-0.029014,0.017124,-0.045833,0.006114,0.015141,0.020924,0.046358,-0.001873,...,0.035875,0.018525,0.022787,0.007367,0.019934,-0.028675,-0.035400,0.012645,0.014458,9200
73993,-0.028264,0.072747,-0.040564,0.033166,-0.050929,0.014612,0.033860,0.033071,0.047482,-0.021461,...,0.043838,0.029096,0.017731,-0.003120,0.017059,-0.023954,-0.022154,0.009286,-0.013900,9200
73994,-0.042405,0.037337,-0.028063,0.041067,-0.029423,0.028646,0.005083,0.006783,0.029464,0.003568,...,0.044512,0.039287,0.010695,0.012766,-0.009456,-0.040325,-0.018848,0.014728,0.014870,9200


In [19]:
# Persist the training embeddings to disk (without the index column) so they
# can be reloaded later instead of being re-encoded.
train_embeddings_dataframe.to_csv(
    path_or_buf="data/train_embeddings.csv",
    index=False,
)

## 2. Validation embeddings

In [20]:
# Embed every validation tweet. encode() is documented to take a string or a
# list of strings, so hand it a plain list instead of a pandas Series; that
# way the call does not depend on the Series having a clean 0..n-1 index.
val_embeddings = model.encode(val['tweet'].tolist())

In [21]:
# Wrap the embedding array in a DataFrame and append the tweet_id column
# (matched to the rows through the shared default integer index).
val_embeddings_dataframe = pd.DataFrame(val_embeddings).assign(tweet_id=val_id)
val_embeddings_dataframe

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,759,760,761,762,763,764,765,766,767,tweet_id
0,0.010028,0.041176,0.001556,-0.007906,0.032132,0.043689,-0.028800,0.017471,0.013177,-0.026209,...,0.068223,0.018860,0.001690,-0.005224,0.037791,-0.034591,0.031234,-0.019440,-0.015769,3364
1,0.022006,0.004681,-0.001522,0.035478,-0.005673,-0.007469,0.026337,0.000375,0.040817,-0.031456,...,0.046734,-0.012422,0.028537,-0.024301,-0.016314,-0.033017,-0.047450,-0.028373,-0.005811,352
2,0.001982,0.008807,0.006777,-0.026844,-0.005257,0.041049,0.063114,0.039603,0.019801,0.005603,...,0.063490,0.035720,0.011735,0.022934,0.032918,0.063919,0.043698,-0.068932,-0.022622,8312
3,-0.018225,0.011594,-0.025488,0.036877,-0.069658,-0.014771,0.017983,0.008925,0.078132,0.012246,...,0.027324,-0.068154,-0.025240,-0.009497,0.049963,0.037710,-0.003114,0.018665,-0.010487,4371
4,-0.022458,0.065612,0.010126,-0.048099,-0.000116,-0.005188,-0.011193,0.029495,0.040480,-0.013162,...,0.057348,0.026728,-0.008797,-0.014817,-0.035626,-0.094311,0.066463,-0.074944,0.018681,4433
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,-0.035246,0.059236,-0.014567,0.018904,0.054500,-0.012023,-0.003557,-0.031856,-0.029554,0.040292,...,-0.027925,0.069191,0.013431,0.007736,0.006485,-0.009365,0.024520,0.001368,0.017153,4891
996,-0.048472,0.041755,-0.044990,0.000054,-0.012035,-0.008340,0.024311,-0.004403,0.033418,0.034670,...,0.033121,-0.083698,-0.052909,-0.018549,-0.020644,-0.021184,0.059052,-0.028826,-0.030303,4359
997,-0.025803,0.087023,-0.031333,-0.041780,-0.041503,0.044861,-0.009172,0.011742,0.069554,0.031934,...,-0.023242,0.041217,-0.019295,0.055158,-0.044003,0.029334,-0.018008,-0.010207,0.005391,2652
998,-0.026586,0.043627,-0.022420,-0.014337,-0.001109,-0.027577,-0.016090,0.043594,-0.044097,0.020931,...,-0.027766,0.066325,-0.003922,0.010533,0.017148,0.008838,-0.002437,0.012043,-0.070995,8069


In [22]:
# Persist the validation embeddings to disk (without the index column) so
# they can be reloaded later instead of being re-encoded.
val_embeddings_dataframe.to_csv(
    path_or_buf="data/val_embeddings.csv",
    index=False,
)

## 3. Classification Head
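* Run from here to reuse the embeddings cached in `data/` instead of re-encoding. The cells below still need `train` and `val` from section B for the labels.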

In [23]:
# Reload both cached embedding tables from disk (train first, then val).
train_embeddings_dataframe, val_embeddings_dataframe = map(
    pd.read_csv,
    ("data/train_embeddings.csv", "data/val_embeddings.csv"),
)

In [24]:
def _check_alignment(embeddings, tweets, split):
    """Verify cached embeddings line up row-for-row with the in-memory tweets.

    Features come from a CSV on disk while labels come from the DataFrame
    built earlier in the notebook; the two are paired purely by row position,
    so a stale or differently-filtered cache would silently mislabel rows.
    """
    assert len(embeddings) == len(tweets), (
        f"{split}: {len(embeddings)} cached embeddings vs {len(tweets)} tweets"
    )
    same_ids = embeddings['tweet_id'].to_numpy() == tweets['tweet_id'].to_numpy()
    assert same_ids.all(), f"{split}: cached tweet_id order differs from the loaded data"


_check_alignment(train_embeddings_dataframe, train, "train")
_check_alignment(val_embeddings_dataframe, val, "val")

# Features: every embedding column (tweet_id is an identifier, not a feature).
# Labels: the integer sentiment codes computed when the data was loaded.
X_train = train_embeddings_dataframe.drop(columns=['tweet_id'])
y_train = train['sentiment_label']

X_val = val_embeddings_dataframe.drop(columns=['tweet_id'])
y_val = val['sentiment_label']

In [26]:
# XGBoost classifier with default hyperparameters, fitted on the training
# embeddings against the integer sentiment codes.
classifier = XGBClassifier()
classifier.fit(X=X_train, y=y_train)

In [27]:
# Score the fitted classifier on the validation split and print the
# per-class precision / recall / F1 table.
val_predictions = classifier.predict(X_val)
val_report = classification_report(y_val, val_predictions)
print(val_report)

              precision    recall  f1-score   support

           0       0.91      0.85      0.88       172
           1       0.92      0.94      0.93       266
           2       0.92      0.84      0.88       285
           3       0.83      0.92      0.87       277

    accuracy                           0.89      1000
   macro avg       0.89      0.89      0.89      1000
weighted avg       0.89      0.89      0.89      1000

