In [1]:
import numpy as np 
import pandas as pd 

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split 
from sklearn.model_selection import KFold, train_test_split

import matplotlib.pyplot as plt 
import matplotlib.cm as cm 
import seaborn as sns 

# Plotly 
import plotly.express as px 
import plotly.graph_objects as go 

from nltk.stem import WordNetLemmatizer
from nltk.tokenize import TweetTokenizer, word_tokenize

from collections import defaultdict

from lazypredict.Supervised import LazyClassifier

In [2]:
df = pd.read_csv('../data/processed/tanishq_data_clean_labelled.csv')
df.drop(['Unnamed: 0', 'Unnamed: 0.1'], inplace=True, axis=1)

In [3]:
lemmatizer = WordNetLemmatizer()
tokenizer = TweetTokenizer()
df['clean_tweet_token'] = df['clean_tweet_segmented'].apply(lemmatizer.lemmatize).apply(tokenizer.tokenize)
df['clean_tweet_token'] = df['clean_tweet_token'].str.join(' ')

In [4]:
df = df[['clean_tweet_token', 'sentiment']]
df.dropna(inplace=True)
df = df[df['sentiment'].isin([0.0, 4.0])]
df['sentiment'] = df['sentiment'].replace({4.0: 1})

In [5]:
count_vec = CountVectorizer()
X_count_vec = count_vec.fit_transform(df['clean_tweet_token'])
#print(count_vec.get_feature_names())

In [6]:
tfidf_vec = TfidfVectorizer()
X_tfidf_vec = tfidf_vec.fit_transform(df['clean_tweet_token'])

In [7]:
'''
kf = KFold(n_splits=3)
y = df['sentiment'].values
count_vec_models
for train_id, test_id in kf.split(y):
    clf = LazyClassifier(verbose=0,ignore_warnings=True, custom_metric=None)
    models, predictions = clf.fit(X_, X_test, y_train, y_test)
'''




In [8]:
y = df['sentiment'].values
X_train_cv, X_test_cv, y_train_cv, y_test_cv = train_test_split(X_count_vec.toarray(), y, test_size=0.25, random_state=42, stratify=y)
X_train_tv, X_test_tv, y_train_tv, y_test_tv = train_test_split(X_tfidf_vec.toarray(), y, test_size=0.25, random_state=42, stratify=y)

In [9]:
clf_cv = LazyClassifier(verbose=0,ignore_warnings=True, custom_metric=None)
models_cv, predictions_cv = clf_cv.fit(X_train_cv, X_test_cv, y_train_cv, y_test_cv)

100%|██████████| 30/30 [00:05<00:00,  5.27it/s]


In [10]:
print(models_cv)

                               Accuracy  Balanced Accuracy  ROC AUC  F1 Score  \
Model                                                                           
LinearDiscriminantAnalysis         0.88               0.75     0.75      0.90   
XGBClassifier                      0.95               0.69     0.69      0.94   
AdaBoostClassifier                 0.96               0.60     0.60      0.94   
DecisionTreeClassifier             0.95               0.59     0.59      0.93   
ExtraTreeClassifier                0.95               0.59     0.59      0.93   
DummyClassifier                    0.92               0.58     0.58      0.92   
QuadraticDiscriminantAnalysis      0.12               0.53     0.53      0.13   
CalibratedClassifierCV             0.95               0.50     0.50      0.92   
NearestCentroid                    0.95               0.50     0.50      0.92   
BernoulliNB                        0.95               0.50     0.50      0.92   
SVC                         

In [11]:
clf_tv = LazyClassifier(verbose=0,ignore_warnings=True, custom_metric=None)
models_tv, predictions_tv = clf_tv.fit(X_train_tv, X_test_tv, y_train_tv, y_test_tv)

100%|██████████| 30/30 [00:05<00:00,  5.25it/s]


In [12]:
print(models_tv)

                               Accuracy  Balanced Accuracy  ROC AUC  F1 Score  \
Model                                                                           
LinearDiscriminantAnalysis         0.65               0.63     0.63      0.75   
AdaBoostClassifier                 0.96               0.60     0.60      0.94   
LGBMClassifier                     0.96               0.60     0.60      0.94   
XGBClassifier                      0.95               0.59     0.59      0.93   
DummyClassifier                    0.92               0.58     0.58      0.92   
QuadraticDiscriminantAnalysis      0.10               0.52     0.52      0.09   
CalibratedClassifierCV             0.95               0.50     0.50      0.92   
SVC                                0.95               0.50     0.50      0.92   
SGDClassifier                      0.95               0.50     0.50      0.92   
RidgeClassifierCV                  0.95               0.50     0.50      0.92   
RidgeClassifier             