<p alighn="center>
    [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/lyraxvincent/sentiment-analysis/]

</p>

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
# imports
##
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
sns.set_style("darkgrid")

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier, StackingClassifier

from scipy.sparse import coo_matrix, hstack

from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

import warnings
warnings.filterwarnings("ignore")

import tqdm
from tqdm import trange

In [3]:
# Load the data
##
tweets = pd.read_csv("drive/MyDrive/Colab Notebooks/sentiment analysis/ALL_DATA_V2.csv")
tweets.head()

Unnamed: 0,tweet_id,text,created_at,likes,text length,polarity,sentiment,target,UserName,ScreenName,Location,TweetAt,Sentiment,ID
0,1.419307e+18,face bound border two The covid said hidden,2021-07-25 17:43:34 EAT,0.0,43.0,-0.1666,negative,-1,,,,,,
1,1.419307e+18,I seeing looking like second global going,2021-07-25 17:43:34 EAT,0.0,41.0,0.0,neutral,0,,,,,,
2,1.419307e+18,sentence people intensive care,2021-07-25 17:43:33 EAT,0.0,30.0,0.0,neutral,0,,,,,,
3,1.419307e+18,contagious delta variant surging across nation...,2021-07-25 17:43:31 EAT,0.0,98.0,0.0,neutral,0,,,,,,
4,1.419307e+18,sentence people intensive care,2021-07-25 17:43:28 EAT,0.0,30.0,0.0,neutral,0,,,,,,


In [4]:
# Dropping the missing data points row-wise
##
tweets.dropna(axis=0, subset=['text'], inplace=True)
tweets.reset_index(drop=True, inplace=True)

## Model Building

In [5]:
import nltk
nltk.download('stopwords')

from nltk.corpus import stopwords
stop = stopwords.words('english')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [6]:
# BOW (bag of words)
##
cv = CountVectorizer(analyzer='word', stop_words=stop)

cv.fit(tweets['text'])

CountVectorizer(stop_words=['i', 'me', 'my', 'myself', 'we', 'our', 'ours',
                            'ourselves', 'you', "you're", "you've", "you'll",
                            "you'd", 'your', 'yours', 'yourself', 'yourselves',
                            'he', 'him', 'his', 'himself', 'she', "she's",
                            'her', 'hers', 'herself', 'it', "it's", 'its',
                            'itself', ...])

In [7]:
# Preview vocabulary and the number of vocab words
##
print(len(cv.vocabulary_))

20417


In [8]:
list(cv.vocabulary_)[:10]

['face',
 'bound',
 'border',
 'two',
 'covid',
 'said',
 'hidden',
 'seeing',
 'looking',
 'like']

In [9]:
# Example transforming a single text
##
print(cv.transform([tweets['text'][0]]))

  (0, 2061)	1
  (0, 2101)	1
  (0, 3986)	1
  (0, 6362)	1
  (0, 8204)	1
  (0, 15444)	1
  (0, 18732)	1


In [10]:
# Example getting the feature name by index
##
cv.get_feature_names()[7591]

'gorgeous'

In [11]:
# Transforming the whole BOW to a sparse matrix
##
bow_text = cv.transform(tweets['text'])

In [12]:
# Non-zero occurrences
##
bow_text.nnz

1413194

In [13]:
# tfidf weighting
##
tfidf_transformer = TfidfTransformer()

tfidf_transformer.fit(bow_text)

TfidfTransformer()

In [14]:
# Example transforming a single bow
##
print(tfidf_transformer.transform(cv.transform([tweets['text'][0]])))

  (0, 18732)	0.3540890140004211
  (0, 15444)	0.33005940555536084
  (0, 8204)	0.43847752566461773
  (0, 6362)	0.3557708794716271
  (0, 3986)	0.27601539418774257
  (0, 2101)	0.4403711131831057
  (0, 2061)	0.4203987210029645


In [15]:
# Example getting idf weight of a word
##
tfidf_transformer.idf_[cv.vocabulary_['good']]

4.902696566057591

In [16]:
# Transforming the whole sparse matrix
##
tfidf_text = tfidf_transformer.transform(bow_text)

Adding 'text length' and 'word count' as features to the model  
We'll stack the features to the sparse matrix horizontally

In [17]:
"""# Transforming the two columns into sparse matrices
##
txt_length = coo_matrix(tweets['text length']).reshape(1182,1)
wrd_cnt = coo_matrix(tweets['word count']).reshape(1182,1)

features = hstack([tfidf_text, txt_length, wrd_cnt])

# Preview difference in shapes
print("Shape of text column sparse matrix: ", tfidf_text.shape)
print("Shape of concatenated features sparse matrix: ", features.shape)"""

'# Transforming the two columns into sparse matrices\n##\ntxt_length = coo_matrix(tweets[\'text length\']).reshape(1182,1)\nwrd_cnt = coo_matrix(tweets[\'word count\']).reshape(1182,1)\n\nfeatures = hstack([tfidf_text, txt_length, wrd_cnt])\n\n# Preview difference in shapes\nprint("Shape of text column sparse matrix: ", tfidf_text.shape)\nprint("Shape of concatenated features sparse matrix: ", features.shape)'

## Comparing different classification models:  
- Logistic Regression
- Linear SVC (svm)
- SGD Classifier
- Random Forest Classifier
- Xgboost Classifier
- LGBM Classifier

In [18]:
# Splitting data into train and test splits
##
X_train, X_test, y_train, y_test = train_test_split(tfidf_text, tweets['target'], stratify=tweets.target, test_size=0.3)

In [19]:
# Initialize models
##
lr = LogisticRegression(C=2.0, class_weight=None, dual=False, max_iter=100)

svc = LinearSVC(C=2.0, class_weight=None, dual=False, max_iter=100)

sgd = SGDClassifier()

rfc = RandomForestClassifier()

xgb = XGBClassifier(objective='multi:softmax', num_class=3)

lgbm = LGBMClassifier(objective='multiclass')

In [None]:
# Train models
models = [lr, svc, sgd, rfc, xgb, lgbm]

for model in models:
    print(f"\nFitting [{model.__class__.__name__}]")
    model.fit(X_train, y_train)


Fitting [LogisticRegression]

Fitting [LinearSVC]

Fitting [SGDClassifier]

Fitting [RandomForestClassifier]

Fitting [XGBClassifier]

Fitting [LGBMClassifier]


In [None]:
# Make predictions
predictions = []
for model in models:
    print(f"\nPredicting for [{model.__class__.__name__}]")
    predictions.append(model.predict(X_test))


Predicting for [LogisticRegression]

Predicting for [LinearSVC]

Predicting for [SGDClassifier]

Predicting for [RandomForestClassifier]

Predicting for [XGBClassifier]

Predicting for [LGBMClassifier]


In [None]:
# Evaluate models
for i, model in enumerate(models):
    print(f"{model.__class__.__name__} \n----------------------------------")
    print(confusion_matrix(y_test, predictions[i]))
    print(classification_report(y_test, predictions[i]))

LogisticRegression 
----------------------------------
[[ 8581  1542  1851]
 [  644 18331  1176]
 [ 1322  1983 17014]]
              precision    recall  f1-score   support

          -1       0.81      0.72      0.76     11974
           0       0.84      0.91      0.87     20151
           1       0.85      0.84      0.84     20319

    accuracy                           0.84     52444
   macro avg       0.83      0.82      0.83     52444
weighted avg       0.84      0.84      0.84     52444

LinearSVC 
----------------------------------
[[ 8779  1376  1819]
 [  706 18101  1344]
 [ 1438  1977 16904]]
              precision    recall  f1-score   support

          -1       0.80      0.73      0.77     11974
           0       0.84      0.90      0.87     20151
           1       0.84      0.83      0.84     20319

    accuracy                           0.83     52444
   macro avg       0.83      0.82      0.82     52444
weighted avg       0.83      0.83      0.83     52444

SGDClassi

In [None]:
cross_val_score(lr, tfidf_text, tweets['target'], cv=5)

array([0.89122787, 0.88956581, 0.87397746, 0.80350094, 0.55274298])

In [None]:
# Random Forest performed best
# That's a good score having in mind that the model is attempting to predict between three category classes

## **Stacking**

In [20]:
stack = StackingClassifier(estimators=[('rfc', rfc), ('svc', svc)], final_estimator=lr)
stack.fit(X_train, y_train)
pred = stack.predict(X_test)
print(f"{confusion_matrix(y_test, pred)}\n{classification_report(y_test, pred)}")

[[ 9467   785  1722]
 [  693 18144  1314]
 [ 1396  1068 17855]]
              precision    recall  f1-score   support

          -1       0.82      0.79      0.80     11974
           0       0.91      0.90      0.90     20151
           1       0.85      0.88      0.87     20319

    accuracy                           0.87     52444
   macro avg       0.86      0.86      0.86     52444
weighted avg       0.87      0.87      0.87     52444



Accuracy improved by 2% !

In [21]:
# saving model
from sklearn.pipeline import Pipeline
import pickle

# retrain
pipe = Pipeline([('vectorizer', TfidfVectorizer(analyzer='word', stop_words=stop)), ('classifier', stack)])
pipe.fit(tweets.text, tweets.target)
pickle.dump(pipe, open('saved_model.pkl', 'wb'))

In [22]:
# load model
model = pickle.load(open('saved_model.pkl', 'rb'))

In [30]:
model.predict(["hate"])

array([-1])

In [29]:
np.int(model.predict(["I love people."]))

1