# Goal : Predict the Sentiments of Tweets



<img src="https://www.al.al.leg.br/imagens/Twitterlogo.png/image" width="400">

# The Workflow

- Import Files & Modules
- Process Data
- Exploratory Data Analysis
- Modeling (Sentence Transformers)
- Explainability (Using LIME)


# A. Import Files & Modules

In [None]:
pip install pandas

In [None]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

from sentence_transformers import SentenceTransformer
from sentence_transformers.losses import CosineSimilarityLoss

from xgboost import XGBClassifier
from sklearn.metrics import classification_report

from sklearn.pipeline import make_pipeline
from lime.lime_text import LimeTextExplainer

# B. Process Data
Concatenate Data for EDA

In [None]:
def load_split(path):
    """Read one headerless tweet CSV and return its cleaned rows.

    The first four columns are named ``tweet_id``, ``entity``, ``sentiment``
    and ``tweet``. Rows containing any missing value are dropped and the
    index is renumbered from zero.
    """
    frame = pd.read_csv(path, header=None)
    frame = frame.rename(columns={0: 'tweet_id', 1: 'entity', 2: 'sentiment', 3: 'tweet'})
    return frame.dropna().reset_index(drop=True)


# Train
train = load_split("data/twitter_training.csv")
train_id = train['tweet_id']

# Val
val = load_split("data/twitter_validation.csv")
val_id = val['tweet_id']

# Encode both splits against ONE shared, sorted class list. Deriving the codes
# separately per split would silently shift the integers if a sentiment were
# missing from one of the files.
sentiment_classes = sorted(set(train['sentiment']).union(val['sentiment']))
for split in (train, val):
    split['sentiment_label'] = pd.Categorical(
        split['sentiment'], categories=sentiment_classes
    ).codes

df = pd.concat([train, val], axis=0).reset_index(drop=True)

### EDA on unique tweets
# Keep only the first row encountered for each tweet_id.
df = df.groupby("tweet_id").head(1).reset_index(drop=True)
df

# C. EDA

## 1. Sentiment Distribution among all tweets
* What is the most common sentiment among all tweets?

In [None]:
# Bar chart of how many rows of `df` carry each sentiment.
fig, ax = plt.subplots(figsize=(8, 5))
sns.countplot(data=df, x='sentiment', palette='rainbow', ax=ax)
ax.set_title("Sentiment Distribution ")

# Print the count on top of every bar.
for bars in ax.containers:
    ax.bar_label(bars)
    

## 2. Entity Distribution among all tweets


In [None]:
# Count plot of rows per entity, drawn wide (aspect=2.5) to fit the categories.
ax = sns.catplot(x="entity", kind="count", data=df, aspect=2.5)

# Slant the entity names so neighbouring labels do not collide. The result is
# bound to a name so the cell does not echo it.
tick_style = {
    'rotation': 45,
    'horizontalalignment': 'right',
    'fontsize': 'large',
}
a = plt.xticks(**tick_style)

## 3. Top 10 Topics

In [None]:
# The ten most frequent entities and how many rows of `df` mention each.
topics = df['entity'].value_counts().head(10).to_frame(name='# of Tweets')
topics

**Games** and **Social Media** are the most-discussed topics in this dataset.

## 4. Most common Tweet Sentiment (Topic Wise)

In [None]:
# For every entity in `topics`, pick the sentiment that occurs most often
# among that entity's rows in `df`.
most_common_sentiment = [
    df.loc[df['entity'] == entity, 'sentiment'].value_counts().idxmax()
    for entity in topics.index
]

In [None]:
# Top-10 entity counts next to each entity's most common sentiment, then turn
# the entity index into a regular column (renaming it from 'index' when
# reset_index produces that label).
topics = (
    pd.DataFrame(
        {
            '# of Tweets': df['entity'].value_counts().head(10),
            'Most Common Sentiment': most_common_sentiment,
        }
    )
    .head(10)
    .reset_index()
    .rename(columns={'index': 'entity'})
)
topics

# D. Modeling - Using a Sentence Transformer 

In [None]:
# Load the 'paraphrase-mpnet-base-v2' sentence-embedding model; it is used
# below to encode the training and validation tweets.
model = SentenceTransformer('paraphrase-mpnet-base-v2')

## 1. Train embeddings

In [None]:
# Encode every training tweet into an embedding vector. The tweets are passed
# as a plain list: encode() documents str / list-of-str input, and a pandas
# Series only behaves like one while its index happens to be 0..n-1.
train_embeddings = model.encode(train['tweet'].tolist())

In [None]:
# One row per training tweet: the embedding dimensions as columns, followed by
# the tweet id.
train_embeddings_dataframe = pd.DataFrame(train_embeddings).assign(tweet_id=train_id)
train_embeddings_dataframe

In [None]:
# Persist the training embeddings (without the row index) to train_embeddings.csv.
train_embeddings_dataframe.to_csv("train_embeddings.csv", index=False)

## 2. Validation embeddings

In [None]:
# Encode every validation tweet into an embedding vector. The tweets are passed
# as a plain list: encode() documents str / list-of-str input, and a pandas
# Series only behaves like one while its index happens to be 0..n-1.
val_embeddings = model.encode(val['tweet'].tolist())

In [None]:
# One row per validation tweet: the embedding dimensions as columns, followed
# by the tweet id.
val_embeddings_dataframe = pd.DataFrame(val_embeddings).assign(tweet_id=val_id)
val_embeddings_dataframe

In [None]:
# Persist the validation embeddings (without the row index) to val_embeddings.csv.
val_embeddings_dataframe.to_csv("val_embeddings.csv", index=False)

## 3. Classification Head
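* Run from here to reuse the saved embeddings instead of re-encoding the tweets (the import and data-processing cells in sections A and B must still be run first, because the labels come from `train` and `val`).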

In [None]:
# Reload the embeddings from the CSV files written by the cells above, so the
# encoding cells do not have to be re-run.
train_embeddings_dataframe = pd.read_csv("train_embeddings.csv")
val_embeddings_dataframe = pd.read_csv("val_embeddings.csv")

In [None]:
# Features: every embedding column (the tweet id is dropped).
X_train = train_embeddings_dataframe.drop(columns='tweet_id')
X_val = val_embeddings_dataframe.drop(columns='tweet_id')

# Targets: the integer sentiment codes of the processed splits.
y_train = train['sentiment_label']
y_val = val['sentiment_label']

In [None]:
# Classification head: an XGBoost classifier trained on the precomputed
# embeddings, using the 'gpu_hist' tree method.
classifier = XGBClassifier(tree_method='gpu_hist')
classifier.fit(X=X_train, y=y_train)

In [None]:
# Score the classifier on the validation split and print the per-class report.
val_predictions = classifier.predict(X_val)
print(classification_report(y_val, val_predictions))

# E. Explainability

## 1. Pipeline

In [None]:
class CustomSentenceTransformer():
    """Pipeline step that turns raw texts into sentence embeddings.

    Provides ``fit`` / ``transform`` / ``fit_transform`` so it can be used as
    a step in ``make_pipeline``. Fitting does nothing: the wrapped model is
    used exactly as loaded.

    Args:
        model_name: Model name handed to ``SentenceTransformer``. Defaults to
            ``'paraphrase-mpnet-base-v2'``.
    """

    def __init__(self, model_name='paraphrase-mpnet-base-v2'):
        self.model = SentenceTransformer(model_name)

    def fit(self, X, y=None):
        """Do nothing and return ``self``; ``X`` and ``y`` are ignored."""
        return self

    def transform(self, X, y=None):
        """Encode the texts in ``X`` with the wrapped model.

        Args:
            X: A single string, or any iterable of strings (list, tuple,
                pandas Series, ...).
            y: Ignored.

        Returns:
            Whatever ``self.model.encode`` returns for the texts.
        """
        # Hand encode() a real list so the result does not depend on how the
        # incoming container (e.g. a Series with a custom index) is indexed.
        sentences = X if isinstance(X, str) else list(X)
        return self.model.encode(sentences)

    def fit_transform(self, X, y=None):
        """Same as ``transform``, since there is nothing to fit."""
        return self.transform(X)

In [None]:
# Text-in pipeline: embed the tweets, then classify the embeddings.
# `classifier` is rebound here to a new, unfitted XGBoost model.
classifier = XGBClassifier(tree_method='gpu_hist')
text_encoder = CustomSentenceTransformer()
pipe = make_pipeline(text_encoder, classifier)
display(pipe)

In [None]:
# Fit the whole pipeline on the raw training tweets and their integer
# sentiment codes: the tweets pass through the encoder step and the classifier
# is trained on its output.
pipe.fit(train['tweet'], train['sentiment_label'])

In [None]:
# Display names handed to the LIME explainer below.
# NOTE(review): presumably listed in the same order as the integer codes in
# `sentiment_label` (category codes follow sorted category order) — verify.
class_names = ['Irrelevant', 'Negative', 'Neutral', 'Positive']

In [None]:
# LIME text explainer configured with the sentiment names in `class_names`.
Explainer = LimeTextExplainer(class_names=class_names)

## 2. Checking Different Classes

### 1. Negative

In [None]:
# Explain the pipeline's prediction for the second 'Negative' training tweet,
# using 10 features for the top predicted label.
negative_tweets = train.loc[train['sentiment'] == 'Negative', 'tweet']
Experiment = Explainer.explain_instance(
    negative_tweets.iloc[1],
    pipe.predict_proba,
    num_features=10,
    top_labels=1,
)
Experiment.show_in_notebook(text=True)

### 2. Positive

In [None]:
# Explain the pipeline's prediction for the second 'Positive' training tweet,
# using 10 features for the top predicted label.
positive_tweets = train.loc[train['sentiment'] == 'Positive', 'tweet']
Experiment = Explainer.explain_instance(
    positive_tweets.iloc[1],
    pipe.predict_proba,
    num_features=10,
    top_labels=1,
)
Experiment.show_in_notebook(text=True)

### 3. Neutral

In [None]:
# Explain the pipeline's prediction for the second 'Neutral' training tweet.
# The explanation is rebuilt here so the output below really belongs to a
# Neutral tweet rather than to whatever `Experiment` held from an earlier cell.
Experiment = Explainer.explain_instance(train[train.sentiment=='Neutral']['tweet'].values[1], pipe.predict_proba, num_features=10, top_labels=1)
Experiment.show_in_notebook(text=True)
