# Analysing sentiment polarity
#### Analysing sentiment polarity of reddit using different models and comparing their accuracy, precision, recall and f1 score to determine the best model

In [2]:
import pandas as pd
import matplotlib.pyplot as plt

import string
import nltk
nltk.download('wordnet')
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer

from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import ComplementNB
from sklearn.ensemble import StackingClassifier

from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, confusion_matrix

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\mathe\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


## Importing Data

In [3]:
train_data = pd.read_json("https://raw.githubusercontent.com/rpsoft/tad_course/main/reddit_sentiment_train.json")
validation_data = pd.read_json("https://raw.githubusercontent.com/rpsoft/tad_course/main/reddit_sentiment_validation.json")
test_data = pd.read_json("https://raw.githubusercontent.com/rpsoft/tad_course/main/reddit_sentiment_test.json")

## Exploring Data Contents

In [4]:
train_data.head(5)

Unnamed: 0,subreddit,title,id,url,author,body,majority_type,is_first_post,post_depth,in_reply_to,sentiment.polarity,sentiment.subjectivity
0,relationships,My Friend/Crush [22/F] acting weird after I [2...,t1_cy7f317,https://www.reddit.com/r/relationships/comment...,Melodrama_,"It's a sad realization, isn't it?",,False,2,t1_cy7erc5,negative,1.0
1,relationships,My Friend/Crush [22/F] acting weird after I [2...,t1_cy7hlyf,https://www.reddit.com/r/relationships/comment...,Melodrama_,I told her a couple of minutes ago that I didn...,elaboration,False,2,t1_cy7erc5,neutral,0.483631
2,relationships,My Friend/Crush [22/F] acting weird after I [2...,t1_cy7etrr,https://www.reddit.com/r/relationships/comment...,TreatYoSelves,Leeches don't make good friends.,answer,False,1,t3_3xshx9,positive,0.6
3,relationships,My Friend/Crush [22/F] acting weird after I [2...,t1_cy7hhpq,https://www.reddit.com/r/relationships/comment...,Melodrama_,I just ended it. Apparently she wasn't a good ...,elaboration,False,2,t1_cy7etrr,positive,0.475
4,relationships,My Friend/Crush [22/F] acting weird after I [2...,t1_cy7q0qg,https://www.reddit.com/r/relationships/comment...,TreatYoSelves,Good for you! Make sure you stick with it.,appreciation,False,3,t1_cy7hhpq,positive,0.744444


In [5]:
train_data['majority_type'].value_counts()

answer              5207
question            2016
elaboration         1638
                    1100
appreciation         674
agreement            406
disagreement         356
humor                275
other                187
negativereaction     157
announcement         122
Name: majority_type, dtype: int64

In [6]:
train_data['title'].value_counts()

As a support main the current state of the summoner spells is awesome!                                                                                                    41
there are 32,966 people here on starcraft...that's 12,966 more than IdrA needs tomorrow night. Reddit, you know what to do!                                               41
iot, please fix the /r chat bug.                                                                                                                                          41
movie night: i'm looking for a horror/psychological thriller that will completely surprise me. suggestions?                                                               40
Reddit, what are the best HL2 mods?                                                                                                                                       40
                                                                                                                                       

In [7]:
train_data[['title','body', 'sentiment.polarity']].describe()

Unnamed: 0,title,body,sentiment.polarity
count,12138,12138.0,12138
unique,1164,11678.0,5
top,As a support main the current state of the sum...,,neutral
freq,41,348.0,7679


In [8]:
train_data['body'].value_counts()

                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                 348
[deleted]                                                                                                                                                                                                                                                                                                                                                          

## Analysing Sentiment polarity

In [9]:
print (train_data['sentiment.polarity'].value_counts())

neutral          7679
positive         3231
negative          878
very positive     253
very negative      97
Name: sentiment.polarity, dtype: int64


In [10]:
print (test_data['sentiment.polarity'].value_counts())

neutral          2514
positive         1102
negative          282
very positive      86
very negative      32
Name: sentiment.polarity, dtype: int64


In [11]:
print (validation_data['sentiment.polarity'].value_counts())

neutral          1961
positive          845
negative          215
very positive      73
very negative      15
Name: sentiment.polarity, dtype: int64


## Preprocessing of Data

In [12]:
stopwords = ['i','me', 'my','myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours','yourself', 'yourselves', 'he',
 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's",'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom',
 'this', 'that', "that'll",'these', 'those', 'am', 'is', 'are', 'was', 'were', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for',
 'with', 'about', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again',
 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'only', 'own', 'same',
 'so', 'than', 'too', 'very', 's', 't', 'just', 'now', 'd', 'll', 'm', 'o', 're', 've', 'y', 'ma'] 

In [13]:
def text_process(text):
    porter = PorterStemmer()
    text = ''.join([char for char in text if char not in string.punctuation]) #Removed punc
    text = ' '.join([word.lower() for word in text.split() if word.lower() not in stopwords]) #removed stopwords
    text = [porter.stem(word) for word in text.split()] #Stemmin
    return text

## Tokenizing

The CountVectorizer from Scikit-learn is used to turn a set of text documents into a vector of term/token counts. It also allows text data to be pre-processed before being converted into a vector format. Because of this, it's a text feature representation module with a lot of flexibility.

In [14]:
vec_transformer = CountVectorizer(analyzer=text_process)
vec_transformer.fit(train_data['body'])
onehot_train = vec_transformer.transform(train_data['body'])
onehot_test = vec_transformer.transform(test_data['body'])

### TF-IDF
Term Frequency — Inverse Document Frequency (TF-IDF) is a statistic that attempts to better identify the importance of a word in a document while also considering its relationship to other documents in the same corpus.

This is done by counting the number of times a term appears in a document as well as the number of times the same word appears in other documents in the corpus.

In [15]:
tfidf_trans = TfidfTransformer()
tfidf_trans.fit(onehot_train)
tfidf_train = tfidf_trans.transform(onehot_train)
tfidf_test = tfidf_trans.transform(onehot_test)

### Training the model with sklearns Dummy Classifier

In [16]:
dummy_model = DummyClassifier(strategy='most_frequent')
dummy_model.fit(train_data['body'], train_data['sentiment.polarity'])
preds = dummy_model.predict(test_data['body'])
print(classification_report(test_data['sentiment.polarity'], preds))

               precision    recall  f1-score   support

     negative       0.00      0.00      0.00       282
      neutral       0.63      1.00      0.77      2514
     positive       0.00      0.00      0.00      1102
very negative       0.00      0.00      0.00        32
very positive       0.00      0.00      0.00        86

     accuracy                           0.63      4016
    macro avg       0.13      0.20      0.15      4016
 weighted avg       0.39      0.63      0.48      4016



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [17]:
dummy_model2 = DummyClassifier(strategy='stratified')
dummy_model2.fit(train_data['body'], train_data['sentiment.polarity'])
preds = dummy_model2.predict(test_data['body'])
print(classification_report(test_data['sentiment.polarity'], preds))

               precision    recall  f1-score   support

     negative       0.08      0.08      0.08       282
      neutral       0.63      0.65      0.63      2514
     positive       0.28      0.26      0.27      1102
very negative       0.00      0.00      0.00        32
very positive       0.00      0.00      0.00        86

     accuracy                           0.48      4016
    macro avg       0.20      0.20      0.20      4016
 weighted avg       0.47      0.48      0.48      4016



### Training One hot vectorised data with Logistic Regression

In [46]:
lr_onehot_model = LogisticRegression(max_iter = 1000)
lr_onehot_model.fit(onehot_train, train_data['sentiment.polarity'])
pred = lr_onehot_model.predict(onehot_test)
print(classification_report(test_data['sentiment.polarity'], pred))

               precision    recall  f1-score   support

     negative       0.35      0.20      0.25       282
      neutral       0.76      0.86      0.81      2514
     positive       0.68      0.58      0.63      1102
very negative       0.33      0.06      0.11        32
very positive       0.46      0.24      0.32        86

     accuracy                           0.72      4016
    macro avg       0.52      0.39      0.42      4016
 weighted avg       0.70      0.72      0.70      4016



### Training TF-IDF vector with Logistic Regression

In [47]:
lr_tfidf_model = LogisticRegression(max_iter = 1000)
lr_tfidf_model.fit(tfidf_train, train_data['sentiment.polarity'])
pred = lr_tfidf_model.predict(tfidf_test)
print(classification_report(test_data['sentiment.polarity'], pred))

               precision    recall  f1-score   support

     negative       0.66      0.11      0.19       282
      neutral       0.74      0.93      0.82      2514
     positive       0.73      0.51      0.60      1102
very negative       0.50      0.03      0.06        32
very positive       0.57      0.05      0.09        86

     accuracy                           0.73      4016
    macro avg       0.64      0.33      0.35      4016
 weighted avg       0.72      0.73      0.70      4016



### Training one hot vector with SVC model

In [48]:
svc_tfidf_model = SVC(kernel='rbf')
svc_tfidf_model.fit(onehot_train, train_data['sentiment.polarity'])
pred = svc_tfidf_model.predict(onehot_test)
print(classification_report(test_data['sentiment.polarity'], pred))

               precision    recall  f1-score   support

     negative       0.58      0.02      0.05       282
      neutral       0.71      0.95      0.82      2514
     positive       0.75      0.44      0.55      1102
very negative       0.00      0.00      0.00        32
very positive       0.00      0.00      0.00        86

     accuracy                           0.72      4016
    macro avg       0.41      0.28      0.28      4016
 weighted avg       0.69      0.72      0.67      4016



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [49]:
my_tfidf_model = StackingClassifier([('lr', LogisticRegression(max_iter = 1000)), ('cnb', ComplementNB())])
my_tfidf_model.fit(tfidf_train, train_data['sentiment.polarity'])
pred = my_tfidf_model.predict(tfidf_test)
print(classification_report(test_data['sentiment.polarity'], pred))

               precision    recall  f1-score   support

     negative       0.52      0.23      0.32       282
      neutral       0.76      0.90      0.83      2514
     positive       0.73      0.58      0.65      1102
very negative       0.00      0.00      0.00        32
very positive       0.59      0.19      0.28        86

     accuracy                           0.75      4016
    macro avg       0.52      0.38      0.42      4016
 weighted avg       0.73      0.75      0.73      4016



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


### Finding Best parameters



In [50]:
pipe = Pipeline(steps=[('tfidf', TfidfVectorizer()), ('lr', LogisticRegression(max_iter = 1000))])

param_grid = {
    'tfidf__sublinear_tf': [True, False],
    'tfidf__max_features': [None, 500, 5000, 50000],
    'lr__C': [0.01,0.1,1,10],
    'lr__class_weight': [None, 'balanced'],
}

search = GridSearchCV(pipe, param_grid, cv=2, scoring='accuracy', n_jobs=-1, verbose=3)
search.fit(train_data['body'], train_data['sentiment.polarity'])
print("Best parameter (CV score=%0.3f):" % search.best_score_)
print(search.best_params_)

Fitting 2 folds for each of 64 candidates, totalling 128 fits
Best parameter (CV score=0.733):
{'lr__C': 10, 'lr__class_weight': None, 'tfidf__max_features': 5000, 'tfidf__sublinear_tf': True}


In [51]:
pred = search.predict(test_data['body'])
print(classification_report(test_data['sentiment.polarity'], pred))

               precision    recall  f1-score   support

     negative       0.58      0.32      0.41       282
      neutral       0.80      0.88      0.84      2514
     positive       0.72      0.67      0.69      1102
very negative       0.67      0.25      0.36        32
very positive       0.53      0.21      0.30        86

     accuracy                           0.76      4016
    macro avg       0.66      0.47      0.52      4016
 weighted avg       0.75      0.76      0.75      4016



### Training the best model with more features

In [58]:
sublinear_tf = True
max_features = 5000
C = 10
class_weight = None

In [56]:
tfidf_trans = TfidfVectorizer(analyzer=text_process, sublinear_tf=sublinear_tf, max_features=max_features)
tfidf_trans.fit(train_data['body'] + train_data['title'] + train_data['majority_type'])
tfidf_train = tfidf_trans.transform(train_data['body'] + train_data['title'] + train_data['majority_type'])
tfidf_test = tfidf_trans.transform(test_data['body'] + test_data['title'] + test_data['majority_type'])
tfidf_vec_validation = tfidf_trans.transform(validation_data['body'] + validation_data['title'] + validation_data['majority_type'])

In [57]:
lr_onehot_model = LogisticRegression(C=C, class_weight=class_weight, max_iter = 1000)
lr_onehot_model.fit(tfidf_train, train_data['sentiment.polarity'])
preds = lr_onehot_model.predict(tfidf_test)
print(classification_report(test_data['sentiment.polarity'], preds))

               precision    recall  f1-score   support

     negative       0.28      0.20      0.24       282
      neutral       0.73      0.77      0.75      2514
     positive       0.52      0.52      0.52      1102
very negative       0.67      0.06      0.11        32
very positive       0.17      0.05      0.07        86

     accuracy                           0.64      4016
    macro avg       0.47      0.32      0.34      4016
 weighted avg       0.63      0.64      0.63      4016

