In [1]:
# imports
##
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
sns.set_style("darkgrid")

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier

from scipy.sparse import coo_matrix, hstack

from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

import warnings
warnings.filterwarnings("ignore")

import tqdm
from tqdm import trange

In [12]:
# Read in the data
##
tweets = pd.read_csv("csv files/ALL_DATA_V2.csv")
tweets.head()

Unnamed: 0,tweet_id,text,created_at,likes,text length,polarity,sentiment,target,flag,user
0,1419307341560745985,face bound border two The covid said hidden,2021-07-25 17:43:34 EAT,0.0,43.0,-0.1666,negative,-1,,
1,1419307340680114179,I seeing looking like second global going,2021-07-25 17:43:34 EAT,0.0,41.0,0.0,neutral,0,,
2,1419307335982346240,sentence people intensive care,2021-07-25 17:43:33 EAT,0.0,30.0,0.0,neutral,0,,
3,1419307331599339521,contagious delta variant surging across nation...,2021-07-25 17:43:31 EAT,0.0,98.0,0.0,neutral,0,,
4,1419307315220475908,sentence people intensive care,2021-07-25 17:43:28 EAT,0.0,30.0,0.0,neutral,0,,


In [17]:
# Dropping the missing data points row-wise
##
tweets.dropna(axis=0, subset=['text'], inplace=True)
tweets.reset_index(drop=True, inplace=True)

## Model Building

In [7]:
from nltk.corpus import stopwords
stop = stopwords.words('english')

In [19]:
# BOW (bag of words)
##
cv = CountVectorizer(analyzer='word', stop_words=stop)

cv.fit(tweets['text'])

CountVectorizer(stop_words=['i', 'me', 'my', 'myself', 'we', 'our', 'ours',
                            'ourselves', 'you', "you're", "you've", "you'll",
                            "you'd", 'your', 'yours', 'yourself', 'yourselves',
                            'he', 'him', 'his', 'himself', 'she', "she's",
                            'her', 'hers', 'herself', 'it', "it's", 'its',
                            'itself', ...])

In [20]:
# Preview vocabulary and the number of vocab words
##
print(len(cv.vocabulary_))

688877


In [21]:
list(cv.vocabulary_)[:10]

['face',
 'bound',
 'border',
 'two',
 'covid',
 'said',
 'hidden',
 'seeing',
 'looking',
 'like']

In [22]:
# Example transforming a single text
##
print(cv.transform([tweets['text'][0]]))

  (0, 112935)	1
  (0, 113837)	1
  (0, 161356)	1
  (0, 223630)	1
  (0, 276001)	1
  (0, 531134)	1
  (0, 629523)	1


In [37]:
# Example getting the feature name by index
##
cv.get_feature_names()[759]

'0c6abp'

In [None]:
# Transforming the whole BOW to a sparse matrix
##
bow_text = cv.transform(tweets['text'])

In [30]:
# Non-zero occurrences
##
bow_text.nnz

935370

In [13]:
# tfidf weighting
##
tfidf_transformer = TfidfTransformer()

tfidf_transformer.fit(bow_text)

TfidfTransformer()

In [32]:
# Example transforming a single bow
##
print(tfidf_transformer.transform(cv.transform([tweets['text'][0]])))

  (0, 17355)	0.36382217854753895
  (0, 14324)	0.33536544967797327
  (0, 7589)	0.4312292397319281
  (0, 5910)	0.3748466201388867
  (0, 3673)	0.2626363014015341
  (0, 1926)	0.4334212820075949
  (0, 1889)	0.4145635743995186


In [33]:
# Example getting idf weight of a word
##
tfidf_transformer.idf_[cv.vocabulary_['good']]

5.147106787035735

In [14]:
# Transforming the whole sparse matrix
##
tfidf_text = tfidf_transformer.transform(bow_text)

Adding 'text length' and 'word count' as features to the model  
We'll stack the features to the sparse matrix horizontally

In [31]:
"""# Transforming the two columns into sparse matrices
##
txt_length = coo_matrix(tweets['text length']).reshape(1182,1)
wrd_cnt = coo_matrix(tweets['word count']).reshape(1182,1)

features = hstack([tfidf_text, txt_length, wrd_cnt])

# Preview difference in shapes
print("Shape of text column sparse matrix: ", tfidf_text.shape)
print("Shape of concatenated features sparse matrix: ", features.shape)"""

'# Transforming the two columns into sparse matrices\n##\ntxt_length = coo_matrix(tweets[\'text length\']).reshape(1182,1)\nwrd_cnt = coo_matrix(tweets[\'word count\']).reshape(1182,1)\n\nfeatures = hstack([tfidf_text, txt_length, wrd_cnt])\n\n# Preview difference in shapes\nprint("Shape of text column sparse matrix: ", tfidf_text.shape)\nprint("Shape of concatenated features sparse matrix: ", features.shape)'

In [15]:
features = tfidf_text

## Comparing different classification models:  
- Logistic Regression
- Linear SVC (svm)
- SGD Classifier
- Random Forest Classifier
- Xgboost Classifier
- LGBM Classifier

In [16]:
# Splitting data into train and test splits
##
X_train, X_test, y_train, y_test = train_test_split(features, tweets['target'], test_size=0.2)

In [17]:
# Initialize models
##
lr = LogisticRegression(C=2.0, class_weight=None, dual=False, max_iter=100)

svc = LinearSVC(C=2.0, class_weight=None, dual=False, max_iter=100)

sgd = SGDClassifier()

rfc = RandomForestClassifier(bootstrap=False, class_weight=None, 
                             criterion='entropy', min_samples_split=6, n_estimators=160, warm_start=False)

xgb = XGBClassifier(objective='multi:softmax', num_class=3)

lgbm = LGBMClassifier(objective='multiclass')

In [None]:
# Train models
##
lr.fit(X_train, y_train)

svc.fit(X_train, y_train)

sgd.fit(X_train, y_train)

rfc.fit(X_train, y_train)

xgb.fit(X_train,y_train)

lgbm.fit(X_train, y_train)

In [27]:
# Make predictions
##
lr_pred = lr.predict(X_test)

svc_pred = svc.predict(X_test)

sgd_pred = sgd.predict(X_test)

rfc_pred = rfc.predict(X_test)

xgb_pred = xgb.predict(X_test)

lgbm_pred = lgbm.predict(X_test)

In [28]:
# Evaluate models
##
print("Logistic Regression \n----------------------------------")
print(confusion_matrix(y_test, lr_pred))
print(classification_report(y_test, lr_pred))

print("Linear SVC \n---------------------------------")
print(confusion_matrix(y_test, svc_pred))
print(classification_report(y_test, svc_pred))

print("SGD \n---------------------------------")
print(confusion_matrix(y_test, sgd_pred))
print(classification_report(y_test, sgd_pred))

print("Random Forest Classifier \n----------------------------------")
print(confusion_matrix(y_test, rfc_pred))
print(classification_report(y_test, rfc_pred))

print("Xgboost Classifier \n----------------------------------")
print(confusion_matrix(y_test, xgb_pred))
print(classification_report(y_test, xgb_pred))

print("LGBM \n---------------------------------")
print(confusion_matrix(y_test, lgbm_pred))
print(classification_report(y_test, lgbm_pred))

Logistic Regression 
----------------------------------
[[ 4211   307   297]
 [   40 11296    75]
 [  183   318  9004]]
              precision    recall  f1-score   support

          -1       0.95      0.87      0.91      4815
           0       0.95      0.99      0.97     11411
           1       0.96      0.95      0.95      9505

    accuracy                           0.95     25731
   macro avg       0.95      0.94      0.94     25731
weighted avg       0.95      0.95      0.95     25731

Linear SVC 
---------------------------------
[[ 4489   109   217]
 [   53 11289    69]
 [  154   109  9242]]
              precision    recall  f1-score   support

          -1       0.96      0.93      0.94      4815
           0       0.98      0.99      0.99     11411
           1       0.97      0.97      0.97      9505

    accuracy                           0.97     25731
   macro avg       0.97      0.96      0.97     25731
weighted avg       0.97      0.97      0.97     25731

SGD 
---

In [38]:
cross_val_score(svc, features, tweets['target'], cv=5)

array([0.97100773, 0.96972406, 0.97372717, 0.96976292, 0.96902449])

In [None]:
# Making sure that the models are not overfitting using cross val score metric with 5 folds
##
print("Logistic Regression \n------------------------- \nscore= ", cross_val_score(lr, features, tweets['target'], cv=5).mean())
print("\nLinear SVC \n------------------------- \nscore=", cross_val_score(svc, features, tweets['target'], cv=5).mean())
print("\nRandom Forest Classifier \n------------------------- \nscore=", cross_val_score(rfc, features, tweets['target'], cv=5).mean())
print("\nXgboost Classifier \n------------------------- \nscore=", cross_val_score(xgb, features, tweets['target'], cv=5).mean())

Logistic Regression 
------------------------- 
score=  0.9492269631769622

Linear SVC 
------------------------- 
score= 0.9706492731618044


In [38]:
# linear SVC performed best
# That's a good score having in mind that the model is attempting to predict between three category classes

In [29]:
# saving model
from sklearn.pipeline import Pipeline
import pickle

# retrain
pipe = Pipeline([('vectorizer', TfidfVectorizer()), ('classifier', LinearSVC(C=2.0, class_weight=None, dual=False, max_iter=100))])
pipe.fit(tweets.text, tweets.target)
pickle.dump(pipe, open('saved_model.pkl', 'wb'))

In [30]:
# load model
model = pickle.load(open('saved_model.pkl', 'rb'))

In [31]:
model.predict(["I don't know"])

array([0])

In [32]:
np.int(pipe.predict(["I love people."]))

1