In [3]:
# Mount Google Drive (Colab only) and switch the working directory to the
# project folder; the local modules imported later (preprocess_data, evaluate)
# are presumably resolved from there — TODO confirm.
from google.colab import drive
drive.mount('/content/gdrive')
%cd /content/gdrive/MyDrive/bert

In [None]:
%%capture
!pip install shap
!pip install interpret
!pip install nltk
!pip install wordcloud

In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
import interpret.glassbox
import shap
import random
from collections import Counter
import matplotlib.pyplot as plt
from wordcloud import WordCloud
from nltk.corpus import stopwords
import nltk
# Fetch the NLTK stop-word corpus (reported as up-to-date when already present).
nltk.download('stopwords')
# NOTE(review): this rebinds `stopwords` from the imported corpus reader to the
# English stop-word collection it returns; later cells pass that value to
# preprocess_data, so the name must stay as is.
stopwords = stopwords.words('english')

from preprocess_data import read_dublin_data, read_sentiment_data, preprocess_data
from evaluate import evaluate_model


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\lucas\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# Parameters

In [None]:
# Notebook-wide knobs.
MAX_FEATURES = 5_000  # passed as max_features to the TF-IDF vectorizer
N_TWEET_PRINT = 10    # how many misclassified tweets the last section prints

# Read Data

In [None]:
%%capture
# Load the Sentiment140 training file through the project helper; any output
# the helper prints is hidden by %%capture.
filename = "/content/gdrive/MyDrive/bert/Sentiment140-train.csv"
df_train = read_sentiment_data(filename)

In [None]:
%%capture
# Load the Sentiment140 test file with the same helper used for the train file.
filename = "/content/gdrive/MyDrive/bert/Sentiment140-test.csv"
df_test = read_sentiment_data(filename)

In [None]:
%%capture
# Load the Dublin City Council (citypulse) tweets, which have their own reader.
filename = "/content/gdrive/MyDrive/bert/citypulse.dublin_city_council.test.csv"
df_dublin = read_dublin_data(filename)

In [None]:
# Tag each Sentiment140 frame with the split it came from, then stack them so
# the combined frame can later be separated again through the `split` column.
df_train['split'], df_test['split'] = "train", "test"
df_all = pd.concat(objs=[df_train, df_test])

# Target distribution in each dataset
* Positive and negative sentiment are equally split in all datasets

In [None]:
# Class balance of the training labels.
df_train['sentiment'].value_counts()

In [None]:
# Class balance of the test labels.
df_test['sentiment'].value_counts()

In [None]:
# Class balance of the Dublin labels.
df_dublin['sentiment'].value_counts()

In [None]:
# Run the shared cleaning helper over every frame, in the same order as before
# (dublin, train, test, combined), and rebind each name to the helper's result.
# Later cells read the `text_clean` column from these frames.
df_dublin, df_train, df_test, df_all = [
    preprocess_data(frame, stopwords)
    for frame in (df_dublin, df_train, df_test, df_all)
]

In [None]:
# Word-level TF-IDF over unigrams and bigrams, limited to MAX_FEATURES terms,
# with sublinear term-frequency scaling and accent stripping.
word_vectorizer = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}',
                                  sublinear_tf=True, strip_accents='unicode',
                                  stop_words='english', ngram_range=(1, 2),
                                  max_features=MAX_FEATURES)
# Fit on the training split only. Fitting on `df_all` (train + test) let the
# test tweets influence the vocabulary and IDF weights, i.e. leaked test
# information into the features the model is later evaluated on.
word_vectorizer.fit(df_all.loc[df_all['split'] == "train", 'text_clean'])

# Split train/test features and target

In [None]:
# Row selectors for the two splits stored in the combined frame.
is_train = df_all['split'] == "train"
is_test = df_all['split'] == "test"

# Cleaned text per split (and for the whole frame).
train_text = df_all.loc[is_train, "text_clean"]
test_text = df_all.loc[is_test, "text_clean"]
all_text = df_all["text_clean"]

# Matching targets. Fixed: `y_test` used to select the "train" rows, so it was
# a copy of `y_train` and did not line up with `test_text` / `test_features`.
y_train = df_all.loc[is_train, "sentiment"]
y_test = df_all.loc[is_test, "sentiment"]
y_all_text = df_all["sentiment"]

In [None]:
# Cleaned text and target of the Dublin tweets.
x_dublin, y_dublin = df_dublin["text_clean"], df_dublin["sentiment"]

In [None]:
# Project every text collection onto the fitted TF-IDF vocabulary
# (same order as before: train, test, combined, dublin).
train_features, test_features, all_features, dublin_features = (
    word_vectorizer.transform(texts)
    for texts in (train_text, test_text, all_text, x_dublin)
)

# Fit Logistic Regression model

In [None]:
# L2-regularised logistic regression (C=1) trained on the TF-IDF features of
# the training split; the fitted estimator is the cell's displayed value.
lr = LogisticRegression(
    penalty='l2',
    C=1,
    max_iter=1000,
    solver='lbfgs',
)
lr.fit(train_features, y_train)

# Make predictions

In [None]:
# Store the model's class prediction next to every row of the Dublin frame and
# of the combined Sentiment140 frame.
for frame, features in ((df_dublin, dublin_features), (df_all, all_features)):
    frame['sentiment_pred'] = lr.predict(features)

<a id='Metrics-Train'></a>
### Metrics Train

In [None]:
# Score the model on the rows it was trained on.
# Note: here `y_hat` holds the true labels and `y_pred` the predictions.
train_rows = df_all['split'] == "train"
y_hat = df_all.loc[train_rows, "sentiment"]
y_pred = df_all.loc[train_rows, "sentiment_pred"]
evaluate_model(y_hat, y_pred)

<a id='Metrics-Test'></a>
### Metrics Test

In [None]:
# Score the model on the test split, leaving out rows whose sentiment is -1.
# Note: here `y_hat` holds the true labels and `y_pred` the predictions.
test_rows = (df_all['split'] == "test") & (df_all['sentiment'] != -1)
y_hat = df_all.loc[test_rows, "sentiment"]
y_pred = df_all.loc[test_rows, "sentiment_pred"]
evaluate_model(y_hat, y_pred)

<a id='Metrics-Dublin'></a>
### Metrics Dublin dataset

In [None]:
# Score the model on the Dublin tweets, leaving out rows whose sentiment is -1.
# Note: here `y_hat` holds the true labels and `y_pred` the predictions.
dublin_rows = df_dublin['sentiment'] != -1
y_hat = df_dublin.loc[dublin_rows, "sentiment"]
y_pred = df_dublin.loc[dublin_rows, "sentiment_pred"]
evaluate_model(y_hat, y_pred)

<a id='Metrics-Category'></a>
### Metrics by category

In [None]:
# Evaluate the Dublin tweets one category at a time (rows with sentiment -1
# are left out, as in the overall Dublin metrics).
labelled = df_dublin['sentiment'] != -1
for category in df_dublin['category'].unique():
    print(category)
    # Fixed: the filter used `!= category`, so the metrics printed under each
    # category were computed on every *other* category's tweets.
    in_category = labelled & (df_dublin['category'] == category)
    if not in_category.any():
        # e.g. a missing (NaN) category or one without usable labels:
        # there is nothing to score, so skip instead of evaluating empty input.
        print("no labelled tweets in this category")
        continue
    y_hat = df_dublin.loc[in_category, "sentiment"]
    y_pred = df_dublin.loc[in_category, "sentiment_pred"]
    evaluate_model(y_hat, y_pred)

In [None]:
# Integer copy of the predicted label; the SHAP sections compare it with `sentiment`.
df_dublin['sentiment_pred_class'] = df_dublin['sentiment_pred'].astype(int)

### Words with the highest TF-IDF values in the test dataset

In [None]:
# Vocabulary terms, aligned with the columns of the TF-IDF matrices.
# `get_feature_names()` was removed in scikit-learn 1.2;
# `get_feature_names_out()` is its replacement.
feature_array = np.asarray(word_vectorizer.get_feature_names_out())

# Rank terms by their mean TF-IDF weight over all test documents.
# Fixed: `np.argsort(matrix).flatten()[::-1]` sorted each document's row
# separately and then read the flattened result backwards, so the "top" terms
# were just the ordering of the last document, not of the dataset.
mean_tfidf = np.asarray(test_features.mean(axis=0)).ravel()
tfidf_sorting = np.argsort(mean_tfidf)[::-1]

n = 50
top_n = feature_array[tfidf_sorting][:n]
top_n

### Words with the highest TF-IDF values in the Dublin dataset

In [None]:
# Rank terms by their mean TF-IDF weight over all Dublin documents
# (`feature_array` and `n` come from the previous cell).
# Fixed: `np.argsort(matrix).flatten()[::-1]` sorted each document's row
# separately, so only the last document decided which terms were shown.
mean_tfidf = np.asarray(dublin_features.mean(axis=0)).ravel()
tfidf_sorting = np.argsort(mean_tfidf)[::-1]
top_n = feature_array[tfidf_sorting][:n]
top_n

# Word clouds of the most frequent words: train vs test vs Dublin dataset

In [None]:
# Word cloud (at most 50 words) built from the cleaned text of the training
# tweets labelled positive (sentiment == 1).
positive_train_text = ' '.join(df_train.loc[df_train['sentiment'] == 1, 'text_clean'])
wc = WordCloud(background_color='white', max_words=50)
wc.generate(positive_train_text)

fig, ax = plt.subplots(figsize=(18, 10))
ax.set_title('Top words for Positive sentiment train',
             fontdict={'size': 22, 'verticalalignment': 'bottom'})
ax.imshow(wc)
ax.axis("off")
plt.show()

In [None]:
# Word cloud (at most 50 words) built from the cleaned text of the test tweets
# labelled positive (sentiment == 1).
positive_test_text = ' '.join(df_test.loc[df_test['sentiment'] == 1, 'text_clean'])
wc = WordCloud(background_color='white', max_words=50)
wc.generate(positive_test_text)

fig, ax = plt.subplots(figsize=(18, 10))
ax.set_title('Top words for Positive sentiment test',
             fontdict={'size': 22, 'verticalalignment': 'bottom'})
ax.imshow(wc)
ax.axis("off")
plt.show()

In [None]:
# Word cloud (at most 50 words) built from the cleaned text of the Dublin
# tweets labelled positive (sentiment == 1).
positive_dublin_text = ' '.join(df_dublin.loc[df_dublin['sentiment'] == 1, 'text_clean'])
wc = WordCloud(background_color='white', max_words=50)
wc.generate(positive_dublin_text)

fig, ax = plt.subplots(figsize=(18, 10))
ax.set_title('Top words for Positive sentiment dublin dataset',
             fontdict={'size': 22, 'verticalalignment': 'bottom'})
ax.imshow(wc)
ax.axis("off")
plt.show()

In [None]:
# Word cloud (at most 50 words) built from the cleaned text of the training
# tweets labelled negative (sentiment == 0).
negative_train_text = ' '.join(df_train.loc[df_train['sentiment'] == 0, 'text_clean'])
wc = WordCloud(background_color='white', max_words=50)
wc.generate(negative_train_text)

fig, ax = plt.subplots(figsize=(18, 10))
ax.set_title('Top words for Negative sentiment train',
             fontdict={'size': 22, 'verticalalignment': 'bottom'})
ax.imshow(wc)
ax.axis("off")
plt.show()

In [None]:
# Word cloud (at most 50 words) built from the cleaned text of the test tweets
# labelled negative (sentiment == 0).
negative_test_text = ' '.join(df_test.loc[df_test['sentiment'] == 0, 'text_clean'])
wc = WordCloud(background_color='white', max_words=50)
wc.generate(negative_test_text)

fig, ax = plt.subplots(figsize=(18, 10))
ax.set_title('Top words for Negative sentiment test',
             fontdict={'size': 22, 'verticalalignment': 'bottom'})
ax.imshow(wc)
ax.axis("off")
plt.show()

In [None]:
# Word cloud (at most 50 words) built from the cleaned text of the Dublin
# tweets labelled negative (sentiment == 0).
negative_dublin_text = ' '.join(df_dublin.loc[df_dublin['sentiment'] == 0, 'text_clean'])
wc = WordCloud(background_color='white', max_words=50)
wc.generate(negative_dublin_text)

fig, ax = plt.subplots(figsize=(18, 10))
ax.set_title('Top words for Negative sentiment dublin dataset',
             fontdict={'size': 22, 'verticalalignment': 'bottom'})
ax.imshow(wc)
ax.axis("off")
plt.show()

# Baseline model coefficients

In [None]:
# One row per vocabulary term with the coefficient the logistic regression
# learned for it (`lr.coef_[0]` is the single coefficient row that is read).
# `get_feature_names()` was removed in scikit-learn 1.2;
# `get_feature_names_out()` is its replacement.
data = {'word': word_vectorizer.get_feature_names_out(), 'coef': lr.coef_[0]}

df_coef = pd.DataFrame(data=data)

In [None]:
# Order terms from the largest to the smallest coefficient.
df_coef = df_coef.sort_values(by='coef', ascending=False)

### Words with higher coefficients (correlated with positive sentiment prediction)

In [None]:
# First 10 rows; after the descending sort these are the largest coefficients.
df_coef.head(10)

### Words with lower coefficients (correlated with negative sentiment prediction)

In [None]:
# Last 10 rows; after the descending sort these are the smallest coefficients.
df_coef.tail(10)

# SHAP values of baseline model with test dataset 

In [2]:
# SHAP summary for the linear model on the test split; the test features serve
# both as the explainer's background data and as the rows being explained.
explainer = shap.LinearExplainer(lr,
                                 test_features)
shap_values = explainer.shap_values(test_features)
X_test_array = test_features.toarray()
# `get_feature_names()` was removed in scikit-learn 1.2; use
# `get_feature_names_out()` and hand SHAP a plain list of names.
shap.summary_plot(shap_values,
                  X_test_array,
                  feature_names=list(word_vectorizer.get_feature_names_out()))

NameError: name 'shap' is not defined

# SHAP values of baseline model with dublin dataset 

In [None]:
# SHAP summary for the linear model on the Dublin tweets; the Dublin features
# serve both as the explainer's background data and as the rows being explained.
explainer = shap.LinearExplainer(lr,
                                 dublin_features)
shap_values = explainer.shap_values(dublin_features)
X_test_array = dublin_features.toarray()
# `get_feature_names()` was removed in scikit-learn 1.2; use
# `get_feature_names_out()` and hand SHAP a plain list of names.
shap.summary_plot(shap_values,
                  X_test_array,
                  feature_names=list(word_vectorizer.get_feature_names_out()))

# SHAP values of baseline model with dublin dataset, only correct predictions

In [None]:
# Dublin rows where the predicted class equals the label.
correct = (df_dublin['sentiment'] == df_dublin['sentiment_pred_class']).to_numpy()
# `mask` keeps its original meaning: the index labels of the selected rows.
mask = df_dublin.index[correct]
# Fixed: the sparse feature matrix is indexed by row *position*, not by index
# label; the labels were only valid positions when the frame's index happened
# to be 0..n-1. Use explicit positions instead.
rows = np.flatnonzero(correct)
correct_features = dublin_features[rows]

explainer = shap.LinearExplainer(lr,
                                 correct_features)
shap_values = explainer.shap_values(correct_features)
X_test_array = correct_features.toarray()
# `get_feature_names()` was removed in scikit-learn 1.2.
shap.summary_plot(shap_values,
                  X_test_array,
                  feature_names=list(word_vectorizer.get_feature_names_out()))

# SHAP values of baseline model with dublin dataset, only wrong predictions

In [None]:
# Dublin rows where the predicted class differs from the label.
# NOTE(review): rows with sentiment == -1 always land here, while the metric
# cells exclude them — confirm whether they should be filtered out as well.
wrong = (df_dublin['sentiment'] != df_dublin['sentiment_pred_class']).to_numpy()
# `mask` keeps its original meaning (index labels of the selected rows); the
# per-tweet cell below looks rows up with `.loc[mask[i], ...]`.
mask = df_dublin.index[wrong]
# Fixed: the sparse feature matrix is indexed by row *position*, not by index
# label; the labels were only valid positions when the frame's index happened
# to be 0..n-1. Use explicit positions instead.
rows = np.flatnonzero(wrong)
wrong_features = dublin_features[rows]

explainer = shap.LinearExplainer(lr,
                                 wrong_features)
shap_values = explainer.shap_values(wrong_features)
X_test_array = wrong_features.toarray()
# `get_feature_names()` was removed in scikit-learn 1.2.
shap.summary_plot(shap_values,
                  X_test_array,
                  feature_names=list(word_vectorizer.get_feature_names_out()))

# SHAP values for individual cases of wrong predictions

In [None]:
# Show the first misclassified Dublin tweets with a SHAP waterfall plot each.
# Relies on `mask`, `explainer` and `shap_values` from the previous cell
# (wrong predictions): `shap_values[i]` explains the row labelled `mask[i]`.

# Loop invariants, computed once instead of on every iteration.
wrong_rows = df_dublin[df_dublin['sentiment'] != df_dublin['sentiment_pred_class']]
# `get_feature_names()` was removed in scikit-learn 1.2.
feature_names = list(word_vectorizer.get_feature_names_out())
# Never index past the available wrong predictions.
n_show = min(N_TWEET_PRINT, len(mask))

for i in range(n_show):
    row_label = mask[i]
    y_true_n = wrong_rows.loc[row_label, 'sentiment']
    y_pred_n = wrong_rows.loc[row_label, 'sentiment_pred_class']
    print(f"Tweet {i+1}/{n_show}")
    print(f"Real sentiment: {y_true_n}")
    print(f"Predicted sentiment: {y_pred_n}")
    print(wrong_rows.loc[row_label, 'text'])
    # NOTE(review): `shap.plots._waterfall` is a private SHAP module and may
    # change between releases.
    shap.plots._waterfall.waterfall_legacy(explainer.expected_value, shap_values[i], feature_names=feature_names)