In [1]:
import numpy as np 
import pandas as pd 

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split 
from sklearn.model_selection import KFold, train_test_split
from sklearn.metrics import classification_report, roc_auc_score

import matplotlib.pyplot as plt 
import matplotlib.cm as cm 
import seaborn as sns 

# Plotly 
import plotly.express as px 
import plotly.graph_objects as go 

from nltk.stem import WordNetLemmatizer
from nltk.tokenize import TweetTokenizer, word_tokenize

from collections import defaultdict

from lazypredict.Supervised import LazyClassifier

In [2]:
df = pd.read_csv('../data/processed/tanishq_data_clean_labelled.csv')
df.drop(['Unnamed: 0', 'Unnamed: 0.1'], inplace=True, axis=1)

In [3]:
lemmatizer = WordNetLemmatizer()
tokenizer = TweetTokenizer()
df['clean_tweet_token'] = df['clean_tweet_segmented'].apply(lemmatizer.lemmatize).apply(tokenizer.tokenize)
df['clean_tweet_token'] = df['clean_tweet_token'].str.join(' ')

In [4]:
df = df[['clean_tweet_token', 'sentiment']]
df.dropna(inplace=True)
df = df[df['sentiment'].isin([0.0, 4.0])]
df['sentiment'] = df['sentiment'].replace({4.0: 1})

In [5]:
count_vec = CountVectorizer()
X_count_vec = count_vec.fit_transform(df['clean_tweet_token'])
#print(count_vec.get_feature_names())

In [6]:
tfidf_vec = TfidfVectorizer(ngram_range=(1, 3))
X_tfidf_vec = tfidf_vec.fit_transform(df['clean_tweet_token'])

In [7]:
'''
kf = KFold(n_splits=3)
y = df['sentiment'].values
count_vec_models
for train_id, test_id in kf.split(y):
    clf = LazyClassifier(verbose=0,ignore_warnings=True, custom_metric=None)
    models, predictions = clf.fit(X_, X_test, y_train, y_test)
'''




In [8]:
y = df['sentiment'].values
X_train_cv, X_test_cv, y_train_cv, y_test_cv = train_test_split(X_count_vec.toarray(), y, test_size=0.25, random_state=27, stratify=y)
X_train_tv, X_test_tv, y_train_tv, y_test_tv = train_test_split(X_tfidf_vec.toarray(), y, test_size=0.25, random_state=27, stratify=y)

In [9]:
def custom_metric(A, B):
    d = pd.DataFrame(classification_report(A, B, digits=2,
                                        output_dict=True)).T
    d = d.to_dict()
    d['roc auc'] = roc_auc_score(A, B)
    return d

In [10]:
clf_cv = LazyClassifier(verbose=0,ignore_warnings=True, custom_metric=custom_metric)
models_cv, predictions_cv = clf_cv.fit(X_train_cv, X_test_cv, y_train_cv, y_test_cv)

100%|██████████| 30/30 [00:06<00:00,  4.72it/s]


In [11]:
models_cv['Precision_0'] = models_cv['custom_metric'].apply(lambda x: x['precision']['0.0'])
models_cv['Precision_1'] = models_cv['custom_metric'].apply(lambda x: x['precision']['1.0'])
models_cv['Recall_0'] = models_cv['custom_metric'].apply(lambda x: x['recall']['0.0'])
models_cv['Recall_1'] = models_cv['custom_metric'].apply(lambda x: x['recall']['1.0'])
models_cv['F1_0'] = models_cv['custom_metric'].apply(lambda x: x['f1-score']['0.0'])
models_cv['F1_1'] = models_cv['custom_metric'].apply(lambda x: x['f1-score']['1.0'])
models_cv['ROC_AUC'] = models_cv['custom_metric'].apply(lambda x: x['roc auc'])
models_cv.drop('custom_metric', axis=1, inplace=True)
models_cv

Unnamed: 0_level_0,Accuracy,Balanced Accuracy,ROC AUC,F1 Score,Time Taken,Precision_0,Precision_1,Recall_0,Recall_1,F1_0,F1_1,ROC_AUC
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
LinearDiscriminantAnalysis,0.68,0.74,0.74,0.77,0.13,0.98,0.12,0.68,0.8,0.8,0.22,0.74
AdaBoostClassifier,0.96,0.69,0.69,0.95,0.28,0.97,0.67,0.99,0.4,0.98,0.5,0.69
Perceptron,0.92,0.68,0.68,0.93,0.05,0.97,0.33,0.95,0.4,0.96,0.36,0.68
XGBClassifier,0.96,0.6,0.6,0.94,0.26,0.96,1.0,1.0,0.2,0.98,0.33,0.6
LGBMClassifier,0.95,0.59,0.59,0.93,0.16,0.96,0.5,0.99,0.2,0.97,0.29,0.59
DecisionTreeClassifier,0.95,0.59,0.59,0.93,0.05,0.96,0.5,0.99,0.2,0.97,0.29,0.59
LinearSVC,0.92,0.58,0.58,0.92,0.8,0.95,0.25,0.97,0.2,0.96,0.22,0.58
PassiveAggressiveClassifier,0.9,0.57,0.57,0.91,0.07,0.95,0.17,0.94,0.2,0.95,0.18,0.57
GaussianNB,0.9,0.57,0.57,0.91,0.05,0.95,0.17,0.94,0.2,0.95,0.18,0.57
QuadraticDiscriminantAnalysis,0.15,0.55,0.55,0.18,0.1,1.0,0.06,0.1,1.0,0.19,0.11,0.55


In [12]:
clf_tv = LazyClassifier(verbose=0,ignore_warnings=True, custom_metric=custom_metric)
models_tv, predictions_tv = clf_tv.fit(X_train_tv, X_test_tv, y_train_tv, y_test_tv)

100%|██████████| 30/30 [01:00<00:00,  2.00s/it]


In [13]:
models_tv['Precision_0'] = models_tv['custom_metric'].apply(lambda x: x['precision']['0.0'])
models_tv['Precision_1'] = models_tv['custom_metric'].apply(lambda x: x['precision']['1.0'])
models_tv['Recall_0'] = models_tv['custom_metric'].apply(lambda x: x['recall']['0.0'])
models_tv['Recall_1'] = models_tv['custom_metric'].apply(lambda x: x['recall']['1.0'])
models_tv['F1_0'] = models_tv['custom_metric'].apply(lambda x: x['f1-score']['0.0'])
models_tv['F1_1'] = models_tv['custom_metric'].apply(lambda x: x['f1-score']['1.0'])
models_tv['ROC_AUC'] = models_tv['custom_metric'].apply(lambda x: x['roc auc'])
models_tv.drop('custom_metric', axis=1, inplace=True)
models_tv

Unnamed: 0_level_0,Accuracy,Balanced Accuracy,ROC AUC,F1 Score,Time Taken,Precision_0,Precision_1,Recall_0,Recall_1,F1_0,F1_1,ROC_AUC
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
AdaBoostClassifier,0.98,0.89,0.89,0.98,3.31,0.99,0.8,0.99,0.8,0.99,0.8,0.89
PassiveAggressiveClassifier,0.92,0.77,0.77,0.93,0.85,0.98,0.38,0.94,0.6,0.96,0.46,0.77
Perceptron,0.89,0.75,0.75,0.91,0.43,0.98,0.27,0.91,0.6,0.94,0.37,0.75
LinearDiscriminantAnalysis,0.49,0.73,0.73,0.61,1.32,1.0,0.1,0.46,1.0,0.63,0.18,0.73
DecisionTreeClassifier,0.96,0.69,0.69,0.95,0.53,0.97,0.67,0.99,0.4,0.98,0.5,0.69
SGDClassifier,0.93,0.68,0.68,0.93,0.45,0.97,0.4,0.97,0.4,0.97,0.4,0.68
LinearSVC,0.92,0.68,0.68,0.93,7.7,0.97,0.33,0.95,0.4,0.96,0.36,0.68
LGBMClassifier,0.95,0.59,0.59,0.93,0.67,0.96,0.5,0.99,0.2,0.97,0.29,0.59
XGBClassifier,0.95,0.59,0.59,0.93,2.05,0.96,0.5,0.99,0.2,0.97,0.29,0.59
GaussianNB,0.92,0.58,0.58,0.92,0.45,0.95,0.25,0.97,0.2,0.96,0.22,0.58


In [14]:
models_cv.to_html('model_cv.html')

In [15]:
models_tv.to_html('model_tv.html')

In [16]:
models_cv[['Precision_0', 'Precision_1', 'Recall_0', 'Recall_1', 'F1_0', 'F1_1', 'ROC AUC']].head(5).style.background_gradient(cmap='PuBu')

Unnamed: 0_level_0,Precision_0,Precision_1,Recall_0,Recall_1,F1_0,F1_1,ROC AUC
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
LinearDiscriminantAnalysis,0.98,0.12,0.68,0.8,0.8,0.22,0.74
AdaBoostClassifier,0.97,0.67,0.99,0.4,0.98,0.5,0.69
Perceptron,0.97,0.33,0.95,0.4,0.96,0.36,0.68
XGBClassifier,0.96,1.0,1.0,0.2,0.98,0.33,0.6
LGBMClassifier,0.96,0.5,0.99,0.2,0.97,0.29,0.59


In [17]:
models_tv[['Precision_0', 'Precision_1', 'Recall_0', 'Recall_1', 'F1_0', 'F1_1', 'ROC AUC']].head(5).style.background_gradient(cmap='PuBu')

Unnamed: 0_level_0,Precision_0,Precision_1,Recall_0,Recall_1,F1_0,F1_1,ROC AUC
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
AdaBoostClassifier,0.99,0.8,0.99,0.8,0.99,0.8,0.89
PassiveAggressiveClassifier,0.98,0.38,0.94,0.6,0.96,0.46,0.77
Perceptron,0.98,0.27,0.91,0.6,0.94,0.37,0.75
LinearDiscriminantAnalysis,1.0,0.1,0.46,1.0,0.63,0.18,0.73
DecisionTreeClassifier,0.97,0.67,0.99,0.4,0.98,0.5,0.69
