In [1]:
# Mount Google Drive and make the project folder the working directory.
# NOTE: `google.colab` is only importable inside a Colab runtime; elsewhere this import
# raises ModuleNotFoundError (see the recorded output of this cell).
from google.colab import drive
drive.mount('/content/gdrive')
# The CSV paths used by the data-loading cells live under this same Drive folder.
%cd /content/gdrive/MyDrive/bert

ModuleNotFoundError: No module named 'google.colab'

In [None]:
%%capture
!pip install shap
!pip install interpret
!pip install nltk
!pip install wordcloud
!pip install lightgbm

In [None]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
import interpret.glassbox
import lightgbm as lgb
import shap
import random
from collections import Counter
import matplotlib.pyplot as plt
from wordcloud import WordCloud
from nltk.corpus import stopwords
import nltk
from modules.utils import *
# Download the NLTK stop-word corpus so the lookup on the next line can succeed.
nltk.download('stopwords')
# NOTE: this rebinds `stopwords` from the NLTK corpus reader imported above to a plain
# list of English stop words; that list is what preprocess_data() receives later on.
stopwords = stopwords.words('english')

from preprocess_data import read_dublin_data, read_sentiment_data, preprocess_data
from evaluate import evaluate_model


# Parameters

In [None]:
# Tunable knobs for the whole notebook.
N_SAMPLE = 50_000       # rows drawn at random from the Sentiment140 training file
MAX_FEATURES = 10_000   # vocabulary cap handed to TfidfVectorizer(max_features=...)
N_TWEET_PRINT = 10      # number of mispredicted tweets that get their own SHAP plot

# Read Data

In [None]:
%%capture
filename = "/content/gdrive/MyDrive/bert/Sentiment140-train.csv"
df_train = read_sentiment_data(filename)
df_train = df_train.sample(N_SAMPLE).reset_index(drop=True)

In [None]:
%%capture
# Load the Sentiment140 test file; unlike the training file it is not sub-sampled.
filename = "/content/gdrive/MyDrive/bert/Sentiment140-test.csv"
df_test = read_sentiment_data(filename)

In [None]:
%%capture
# Load the Dublin evaluation file through its dedicated reader.
# NOTE(review): presumably CityPulse tweets about Dublin City Council, judging by the
# file name -- confirm against read_dublin_data().
filename = "/content/gdrive/MyDrive/bert/citypulse.dublin_city_council.test.csv"
df_dublin = read_dublin_data(filename)

In [None]:
# Tag every row with the split it came from, then stack both frames so they can be
# preprocessed and scored together. concat keeps each frame's own row labels, so index
# values repeat in df_all.
df_train = df_train.assign(split="train")
df_test = df_test.assign(split="test")
df_all = pd.concat((df_train, df_test))

In [None]:
# Run the shared text-cleaning step over every frame, in the same order as before:
# Dublin, train, test, then the combined frame.
df_dublin, df_train, df_test, df_all = (
    preprocess_data(frame, stopwords)
    for frame in (df_dublin, df_train, df_test, df_all)
)

In [None]:
# TF-IDF over word uni-grams and bi-grams, capped at MAX_FEATURES terms.
word_vectorizer = TfidfVectorizer(
    analyzer='word',
    token_pattern=r'\w{1,}',
    sublinear_tf=True,
    strip_accents='unicode',
    stop_words='english',
    ngram_range=(1, 2),
    max_features=MAX_FEATURES,
)
# Learn the vocabulary and IDF weights from the training rows only. Fitting on the
# combined frame would let test-set text shape the features (data leakage) and would
# treat the test split differently from the Dublin set, which is never seen at fit time.
word_vectorizer.fit(df_all.loc[df_all['split'] == "train", 'text_clean'])

# Split train/test features and target

In [None]:
# Boolean row selectors, computed once and reused for both text and target.
is_train = df_all['split'] == "train"
is_test = df_all['split'] == "test"

# Cleaned text per split (and for the whole frame).
all_text = df_all["text_clean"]
train_text = df_all.loc[is_train, "text_clean"]
test_text = df_all.loc[is_test, "text_clean"]

# Matching sentiment labels.
y_all_text = df_all["sentiment"]
y_train = df_all.loc[is_train, "sentiment"]
y_test = df_all.loc[is_test, "sentiment"]

In [None]:
# Cleaned text and sentiment label of the Dublin frame.
x_dublin, y_dublin = df_dublin["text_clean"], df_dublin["sentiment"]

In [None]:
# Project every text collection onto the fitted TF-IDF vocabulary (sparse matrices).
train_features, test_features, all_features, dublin_features = (
    word_vectorizer.transform(texts)
    for texts in (train_text, test_text, all_text, x_dublin)
)

In [None]:
# Resolve the vocabulary once. `get_feature_names()` was deprecated in scikit-learn 1.0
# and removed in 1.2 in favour of `get_feature_names_out()`; prefer the new accessor and
# fall back to the old one only on releases that predate it.
_get_names = (
    getattr(word_vectorizer, "get_feature_names_out", None)
    or word_vectorizer.get_feature_names
)
feature_names = list(_get_names())

# Dense, column-labelled copies of the sparse TF-IDF matrices (one column per term).
df_train_features = pd.DataFrame(train_features.toarray(), columns=feature_names)
df_test_features = pd.DataFrame(test_features.toarray(), columns=feature_names)
df_dublin_features = pd.DataFrame(dublin_features.toarray(), columns=feature_names)

In [None]:
# LightGBM containers for the three feature frames.
train_lgb = lgb.Dataset(df_train_features, label=y_train)

# The metric cells of this notebook exclude rows labelled -1 from the test and Dublin
# scores, so those rows are dropped here as well: a binary objective has no use for a
# third label value. When no -1 labels are present the filter is a no-op.
# Boolean numpy masks select by position, which is how features and labels line up.
test_keep = (y_test != -1).to_numpy()
test_lgb = lgb.Dataset(df_test_features.loc[test_keep],
                       label=y_test.to_numpy()[test_keep])

dublin_keep = (y_dublin != -1).to_numpy()
dublin_lgb = lgb.Dataset(df_dublin_features.loc[dublin_keep],
                         label=y_dublin.to_numpy()[dublin_keep])

# Fit LightGBM model

In [None]:
# Booster hyper-parameters. 'application' was only an alias of 'objective'; keeping
# both just triggers a LightGBM warning, so a single key is used.
parameters = {
    'objective': 'binary',
    'metric': 'auc',
    'boosting': 'gbdt',
    'num_leaves': 31,
    'feature_fraction': 0.9,
    'bagging_fraction': 0.9,
    'bagging_freq': 20,
    'learning_rate': 0.05,
    'verbose': 0
}

# Early stopping goes through the callback API: the `early_stopping_rounds` keyword of
# lgb.train() was deprecated and later removed (LightGBM 4.0), while the callback is
# accepted by old and new releases alike.
# NOTE(review): the stopping criterion is evaluated on the test split, so the test
# metrics reported further down are optimistic; a separate validation split would be
# cleaner.
model = lgb.train(
    parameters,
    train_lgb,
    num_boost_round=5000,
    valid_sets=[test_lgb],
    callbacks=[lgb.early_stopping(stopping_rounds=100)],
)

In [None]:
# Score every row of the combined frame, then derive a hard 0/1 class by rounding the
# score to the nearest integer (ties go to the even value, as with built-in round()).
predictions = model.predict(all_features)
df_all['sentiment_pred'] = predictions
df_all['sentiment_pred_class'] = df_all['sentiment_pred'].round().astype('int64')

In [None]:
# Same scoring and rounding for the Dublin frame.
dublin_scores = model.predict(dublin_features)
df_dublin['sentiment_pred'] = dublin_scores
df_dublin['sentiment_pred_class'] = df_dublin['sentiment_pred'].round().astype('int64')

### Metrics Train

In [None]:
# Training-split report. Naming caveat: `y_hat` holds the TRUE labels here,
# `y_pred` the predicted classes.
train_rows = df_all['split'] == "train"
y_hat = df_all.loc[train_rows, "sentiment"]
y_pred = df_all.loc[train_rows, "sentiment_pred_class"]

for report in (metrics.confusion_matrix, metrics.classification_report):
    print(report(y_hat, y_pred))
train_accuracy = metrics.accuracy_score(y_hat, y_pred)
print("Accuracy Score: %.3f" % train_accuracy)

### Metrics Test

In [None]:
# Test-split report, restricted to rows whose label is not -1.
# Naming caveat: `y_hat` holds the TRUE labels, `y_pred` the predicted classes.
scored_test = (df_all['split'] == "test") & (df_all['sentiment'] != -1)
y_hat = df_all.loc[scored_test, "sentiment"]
y_pred = df_all.loc[scored_test, "sentiment_pred_class"]
y_score = df_all.loc[scored_test, "sentiment_pred"]

print(metrics.confusion_matrix(y_hat, y_pred))
print(metrics.classification_report(y_hat, y_pred))
print("Accuracy Score: %.3f" % metrics.accuracy_score(y_hat, y_pred))

# The ROC curve must be built from the continuous scores. Feeding it the already
# thresholded 0/1 classes collapses the curve to a single operating point and
# understates the AUC.
fpr, tpr, thresholds = metrics.roc_curve(y_hat, y_score, pos_label=1)
print("AUC: %.3f" % metrics.auc(fpr, tpr))

### Metrics Dublin dataset

In [None]:
# Dublin report, restricted to rows whose label is not -1.
# Naming caveat: `y_hat` holds the TRUE labels, `y_pred` the predicted classes.
scored_dublin = df_dublin['sentiment'] != -1
y_hat = df_dublin.loc[scored_dublin, "sentiment"]
y_pred = df_dublin.loc[scored_dublin, "sentiment_pred_class"]

for report in (metrics.confusion_matrix, metrics.classification_report):
    print(report(y_hat, y_pred))
dublin_accuracy = metrics.accuracy_score(y_hat, y_pred)
print("Accuracy Score: %.3f" % dublin_accuracy)

### Metrics by category

In [None]:
# One report per Dublin category, restricted to rows whose label is not -1.
has_label = df_dublin['sentiment'] != -1
for category in df_dublin['category'].unique():
    print(category)
    # Select the rows that BELONG to this category. The previous `!=` comparison
    # reported on every other category instead of the one named in the heading.
    in_category = has_label & (df_dublin['category'] == category)
    y_hat = df_dublin.loc[in_category, "sentiment"]
    y_pred = df_dublin.loc[in_category, "sentiment_pred_class"]
    if y_hat.empty:
        # Nothing to score, e.g. a missing (NaN) category value or a category whose
        # rows are all labelled -1.
        print("No labelled rows for this category")
        continue
    print(metrics.confusion_matrix(y_hat, y_pred))
    print(metrics.classification_report(y_hat, y_pred))
    print("Accuracy Score: %.3f" % metrics.accuracy_score(y_hat, y_pred))

In [None]:
# Turn the predicted probability into a 0/1 class. The score has to be rounded before
# the integer cast: a bare astype(int) truncates toward zero, which maps every
# probability below 1.0 to class 0 and silently overwrites the correctly rounded
# classes computed when the predictions were first made.
df_dublin['sentiment_pred_class'] = df_dublin['sentiment_pred'].round().astype(int)

# SHAP values of baseline model with test dataset 

In [None]:
# SHAP explanation of the model on the test split, using the test features themselves
# as the explainer's background data.
explainer = shap.TreeExplainer(model, df_test_features)
shap_values = explainer.shap_values(df_test_features)

# `df_test_features` is a DataFrame and has no .toarray(); the dense array comes from
# the sparse TF-IDF matrix, exactly as in the Dublin cells.
X_test_array = test_features.toarray()

# The DataFrame columns already carry the vocabulary terms, so they are reused instead
# of calling `get_feature_names()`, which newer scikit-learn releases no longer provide.
shap.summary_plot(shap_values,
                  X_test_array,
                  feature_names=list(df_test_features.columns))

# SHAP values of baseline model with dublin dataset 

In [None]:
# SHAP explanation of the model on the full Dublin set, using the Dublin features
# themselves as the explainer's background data.
explainer = shap.TreeExplainer(model, df_dublin_features)
shap_values = explainer.shap_values(df_dublin_features)
X_test_array = dublin_features.toarray()

# The DataFrame columns already carry the vocabulary terms, so they are reused instead
# of calling `get_feature_names()`, which newer scikit-learn releases no longer provide.
shap.summary_plot(shap_values,
                  X_test_array,
                  feature_names=list(df_dublin_features.columns))

# SHAP values of baseline model with Dublin dataset, correct predictions only

In [None]:
# Dublin rows whose predicted class equals the label.
is_right = (df_dublin['sentiment'] == df_dublin['sentiment_pred_class']).to_numpy()
mask = df_dublin.index[is_right]        # row labels inside df_dublin (same values as before)
right_pos = np.flatnonzero(is_right)    # row positions, valid for the feature matrices

# The feature matrices are aligned with df_dublin by position, not by index label, so
# they are sliced positionally. This stays correct even if df_dublin's index is not a
# plain 0..n-1 range.
right_features = df_dublin_features.iloc[right_pos]
explainer = shap.TreeExplainer(model, right_features)
shap_values = explainer.shap_values(right_features)
X_test_array = dublin_features[right_pos].toarray()

# Column labels double as feature names; this avoids `get_feature_names()`, which newer
# scikit-learn releases no longer provide.
shap.summary_plot(shap_values,
                  X_test_array,
                  feature_names=list(df_dublin_features.columns))

# SHAP values of baseline model with Dublin dataset, wrong predictions only

In [None]:
# Dublin rows whose predicted class differs from the label.
# NOTE(review): rows labelled -1 always land here because the model only predicts 0/1;
# confirm whether they should be excluded as in the metric cells.
is_wrong = (df_dublin['sentiment'] != df_dublin['sentiment_pred_class']).to_numpy()
mask = df_dublin.index[is_wrong]        # row labels inside df_dublin (same values as before)
wrong_pos = np.flatnonzero(is_wrong)    # row positions, valid for the feature matrices

# The feature matrices are aligned with df_dublin by position, not by index label, so
# they are sliced positionally. This stays correct even if df_dublin's index is not a
# plain 0..n-1 range.
wrong_features = df_dublin_features.iloc[wrong_pos]
explainer = shap.TreeExplainer(model, wrong_features)
shap_values = explainer.shap_values(wrong_features)
X_test_array = dublin_features[wrong_pos].toarray()

# Column labels double as feature names; this avoids `get_feature_names()`, which newer
# scikit-learn releases no longer provide.
shap.summary_plot(shap_values,
                  X_test_array,
                  feature_names=list(df_dublin_features.columns))

# SHAP values for individual cases of wrong predictions

In [None]:
# Walk through the first few mispredicted Dublin tweets and draw one SHAP waterfall per
# tweet. Relies on `mask`, `explainer` and `shap_values` from the wrong-predictions cell:
# entry i of `shap_values` belongs to the row labelled `mask[i]`.

# Loop-invariant work, done once: the mispredicted rows and the feature names.
wrong_rows = df_dublin[df_dublin['sentiment'] != df_dublin['sentiment_pred_class']]
waterfall_names = list(df_dublin_features.columns)  # replaces get_feature_names(), gone in newer scikit-learn

# Never index past the number of available mispredictions.
n_show = min(N_TWEET_PRINT, len(mask))

for i, row_label in enumerate(mask[:n_show]):
    y_true_n = wrong_rows.loc[row_label, 'sentiment']
    y_pred_n = wrong_rows.loc[row_label, 'sentiment_pred_class']
    print(f"Tweet {i+1}/{n_show}")
    print(f"Real sentiment: {y_true_n}")
    print(f"Predicted sentiment: {y_pred_n}")
    print(wrong_rows.loc[row_label, 'text'])
    shap.plots._waterfall.waterfall_legacy(explainer.expected_value, shap_values[i], feature_names=waterfall_names)