In [1]:
import re
import os
import io
import praw
import nltk
import time
import math
import stanza
import string
from string import punctuation

import numpy as np
import pandas as pd
import datetime as dt

from time import time

from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer, PorterStemmer
from nltk.tokenize import sent_tokenize, word_tokenize

from xgboost import XGBClassifier
from sklearn import svm
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import f1_score,  classification_report, confusion_matrix, roc_auc_score, roc_curve
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

from skopt import BayesSearchCV
from skopt.space import Real, Categorical, Integer

import warnings
warnings.filterwarnings("ignore")

<h3>2.1. Data Cleaning and Splitting</h3>

In [2]:
#load data
df = pd.read_csv('label1.csv')
df_2 = pd.read_csv('label2.csv')
df_3 = pd.read_csv('label3.csv')
df_4 = pd.read_csv('label4.csv')
df_5 = pd.read_csv('label5.csv')

#merge all labelled dataframes
df = pd.concat([df, df_2, df_3, df_4, df_5], axis = 0).reset_index(drop = True)

In [3]:
#data cleaning
pattern = r'^https?:\/\/.*[\r\n]*'
#remove comments that are only hyperlinks
df = df[~df['content'].str.contains(pattern)]
#remove deleted comments
df = df.loc[df['content'] != '[deleted]']

In [4]:
#define stop words using the standard english set
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

# Function to preprocess text
def preprocess_text(text):
    #Remove urls
    url_pattern = re.compile(r'https?://\S+|www\.\S+')
    text = url_pattern.sub('', text)
    # Remove punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))
    # Convert to lowercase
    text = text.lower()
    # Tokenize
    tokens = nltk.word_tokenize(text)
    # lemmatize tokens
    tokens = [lemmatizer.lemmatize(token) for token in tokens]
    # Remove stop words
    tokens = [token for token in tokens if token not in stop_words]
    # Remove None
    tokens = list(filter(lambda x: x is not None, tokens))
    return ' '.join(tokens)

In [5]:
#Create new col for preprocessed text
df['cleaned_content'] = df['content'].apply(preprocess_text)

In [6]:
# Split comments into sentences and do POS tagging
sentences = [nltk.sent_tokenize(content) for content in df['cleaned_content']]
pos_tagged_sentences = [[nltk.pos_tag(nltk.word_tokenize(sentence)) for sentence in comment] for comment in sentences]

# For each comment, append a dictionary with the key being the tag and the values being the number of tokens belonging to the tag
pos_features = []
for content in pos_tagged_sentences:
    pos_dict = {}
    for sentence in content:
        for word, tag in sentence:
            pos_dict[tag] = pos_dict.get(tag, 0) + 1
    pos_features.append(pos_dict)
    
# Convert POS features to a sparse matrix
pos_vectorizer = DictVectorizer(sparse=False)
pos_features = pos_vectorizer.fit_transform(pos_features)

# Define a function for stemming
stemmer = nltk.stem.PorterStemmer()
def tokenize_and_stem(text):
    tokens = nltk.word_tokenize(text)
    stems = [stemmer.stem(token) for token in tokens]
    return stems

#Initialize TFIDF vectorizer
tfidf = TfidfVectorizer(
    stop_words = 'english',
    ngram_range = (1, 2), #Create ngrams of size 1 and 2
    tokenizer = tokenize_and_stem,
    min_df = 2, #ignore if ngram appears in less than 2 documents
    max_df = 0.9 #ignore if ngram appears in more than 89% of the documents
)

#Vectorize the cleaned content using TFIDF
tfidf_features = tfidf.fit_transform(df['cleaned_content'])

#Vectorize the cleaned content using Bag of Words
bow = CountVectorizer()
bow_features = bow.fit_transform(df['cleaned_content'])

#Combine all 3 features
combined = np.concatenate((tfidf_features.toarray(), bow_features.toarray(), pos_features), axis = 1)

#Add score
combined = np.c_[combined, df['score']]

#specify encoding sequence for label encoder
sequence = ['neutral', 'negative', 'positive']

#load label encoder
le = LabelEncoder()

#encode labels
labels = le.fit(sequence).transform(df['sentiment'].str.lower())

In [7]:
#split into train and test set
X_train, X_test, y_train, y_test = train_test_split(
    combined,
    labels, 
    test_size=0.2, 
    random_state=42
)

<h3> 2.2. Random Forest Tuning </h3>

In [42]:
#Initialize randomforest classifier
rf = RandomForestClassifier(random_state=42)

#Params dict for rf
params_rf = {
    'n_estimators': list(range(30, 100, 10)),
    'max_features': ['sqrt', 'log2']
}

#Bayes search to find optimal param values using 5 fold cross validation
bs_rf = BayesSearchCV(
    n_jobs = -1,
    estimator = rf,
    search_spaces = params_rf,
    cv = 5,
    scoring = 'f1_weighted',
    random_state= 42
)


#fit
bs_rf.fit(X_train, y_train)

BayesSearchCV(cv=5, estimator=RandomForestClassifier(random_state=42),
              n_jobs=-1, random_state=42, scoring='f1_weighted',
              search_spaces={'max_features': ['sqrt', 'log2'],
                             'n_estimators': [30, 40, 50, 60, 70, 80, 90]})

In [43]:
# Print the best parameters and test F1 score
print(f"Best parameters: {bs_rf.best_params_}")

#score
rf_pred = bs_rf.predict(X_test)

#confusion Matrix
cf_rf = pd.DataFrame(
        confusion_matrix(y_test, rf_pred),
        index = ['true neutral', 'true negative', 'true positive'],
        columns = ['predicted neutral', 'predicted negative', 'predicted positive']
    )
print(cf_rf)

print("\nClassification report:")
print(classification_report(y_test, rf_pred))

Best parameters: OrderedDict([('max_features', 'sqrt'), ('n_estimators', 90)])
               predicted neutral  predicted negative  predicted positive
true neutral                  41                  66                  11
true negative                  5                 206                  17
true positive                  2                  40                  86

Classification report:
              precision    recall  f1-score   support

           0       0.85      0.35      0.49       118
           1       0.66      0.90      0.76       228
           2       0.75      0.67      0.71       128

    accuracy                           0.70       474
   macro avg       0.76      0.64      0.66       474
weighted avg       0.73      0.70      0.68       474



<h3> 2.3. Logistic Regression Tuning </h3>

In [44]:
penalty_pairs = {
    'none': 'none',
    'l1': 'liblinear',
    'l2': 'lbfgs'
}

#define parameters
params_lr = {
    'C': [0.001, 0.01, 0.1, 1, 10, 100],
}
best_model_lr = ""
best_f1 = 0

#iterate through each penalty and solver pair
for p, s in penalty_pairs.items():
    if p == 'none':
        lr = LogisticRegression(penalty = 'none')
    else:
        lr = LogisticRegression(penalty = p, solver = s)
        
    #Use bayes search to find optimal param values
    bs_lr = BayesSearchCV(
        n_jobs = -1,
        estimator = lr,
        search_spaces = params_lr,
        cv = 5,
        scoring = 'f1_weighted',
        random_state= 42
    )

    bs_lr.fit(X_train, y_train)

    # Print the best parameters and test F1 score
    print(f"Best parameters: {bs_lr.best_params_}")


    #score
    lr_pred = bs_lr.predict(X_test)
    lr_pred_train = bs_lr.predict(X_train)
    
    f1_test = f1_score(y_test, lr_pred, average = 'weighted')
    f1_train = f1_score(y_train, lr_pred_train, average = 'weighted')

    print(f"[penalty type - {p}] f1 train: {f1_train} | f1 test: {f1_test}")
    
    if f1_train > best_f1:
        best_model_lr = bs_lr
        best_f1 = f1_train

Best parameters: OrderedDict([('C', 0.1)])
[penalty type - none] f1 train: 0.6285335763611025 | f1 test: 0.5148194539186893
Best parameters: OrderedDict([('C', 10.0)])
[penalty type - l1] f1 train: 0.9968357708158168 | f1 test: 0.6724005499914476
Best parameters: OrderedDict([('C', 100.0)])
[penalty type - l2] f1 train: 0.6889529777686051 | f1 test: 0.5492880375397172


In [45]:
#Clearly, L1 penalty gives the best results
print(f"Best parameters: {best_model_lr.best_params_}")

#score
lr_pred = best_model_lr.predict(X_test)

#confusion Matrix
cf_lr = pd.DataFrame(
        confusion_matrix(y_test, lr_pred),
        index = ['true neutral', 'true negative', 'true positive'],
        columns = ['predicted neutral', 'predicted negative', 'predicted positive']
    )
print(cf_lr)

print("\nClassification report:")
print(classification_report(y_test, lr_pred))

Best parameters: OrderedDict([('C', 10.0)])
               predicted neutral  predicted negative  predicted positive
true neutral                  55                  46                  17
true negative                 20                 183                  25
true positive                 14                  30                  84

Classification report:
              precision    recall  f1-score   support

           0       0.62      0.47      0.53       118
           1       0.71      0.80      0.75       228
           2       0.67      0.66      0.66       128

    accuracy                           0.68       474
   macro avg       0.66      0.64      0.65       474
weighted avg       0.67      0.68      0.67       474



<h3> 2.4. XGBoost Tuning </h3>

In [12]:
#Initialize XGB classifier with 100 estimators
xgb = XGBClassifier(n_estimators = 100, learning_rate = 0.1)

#Params for bayes search
params_xgb = {
    'max_depth': Integer(3, 20),
    'reg_alpha': Real(1e-9, 1.0, prior='log-uniform'),
}

#Bayes Search
bs_xgb = BayesSearchCV(
    n_jobs = -1,
    estimator = xgb,
    search_spaces = params_xgb,
    cv = 5,
    scoring = 'f1_weighted',
    random_state= 42
).fit(X_train, y_train)

In [13]:
# Print the best parameters and test F1 score
print(f"Best parameters: {bs_xgb.best_params_}")

#score
xgb_pred = bs_xgb.predict(X_test)

#confusion Matrix
cf_xgb = pd.DataFrame(
        confusion_matrix(y_test, xgb_pred),
        index = ['true neutral', 'true negative', 'true positive'],
        columns = ['predicted neutral', 'predicted negative', 'predicted positive']
    )
print(cf_xgb)

print("\nClassification report:")
print(classification_report(y_test, xgb_pred))

Best parameters: OrderedDict([('max_depth', 19), ('reg_alpha', 0.0020211241974354348)])
               predicted neutral  predicted negative  predicted positive
true neutral                  53                  53                  12
true negative                 20                 195                  13
true positive                  9                  30                  89

Classification report:
              precision    recall  f1-score   support

           0       0.65      0.45      0.53       118
           1       0.70      0.86      0.77       228
           2       0.78      0.70      0.74       128

    accuracy                           0.71       474
   macro avg       0.71      0.67      0.68       474
weighted avg       0.71      0.71      0.70       474



<h3> 2.5. MLP Tuning </h3>

In [48]:
#Initialize MLP classifier
mlp = MLPClassifier(random_state=42, learning_rate = 'adaptive')

#Params for bayes search
params_mlp = {
    'hidden_layer_sizes': [10, 20, 30, 60, 120],
    'activation': ['relu', 'tanh', 'logistic']
}

#Bayes Search
bs_mlp = BayesSearchCV(
    estimator = mlp, 
    search_spaces = params_mlp, 
    cv=5, 
    n_jobs = -1, 
    random_state= 42
)

bs_mlp.fit(X_train, y_train)

BayesSearchCV(cv=5,
              estimator=MLPClassifier(learning_rate='adaptive',
                                      random_state=42),
              n_jobs=-1, random_state=42,
              search_spaces={'activation': ['relu', 'tanh', 'logistic'],
                             'hidden_layer_sizes': [10, 20, 30, 60, 120]})

In [49]:
# Print the best parameters and test F1 score
print(f"Best parameters: {bs_mlp.best_params_}")

#score
mlp_pred = bs_mlp.predict(X_test)

#confusion Matrix
cf_mlp = pd.DataFrame(
        confusion_matrix(y_test, mlp_pred),
        index = ['true neutral', 'true negative', 'true positive'],
        columns = ['predicted neutral', 'predicted negative', 'predicted positive']
    )
print(cf_mlp)

print("\nClassification report:")
print(classification_report(y_test, mlp_pred))

Best parameters: OrderedDict([('activation', 'logistic'), ('hidden_layer_sizes', 20)])
               predicted neutral  predicted negative  predicted positive
true neutral                  61                  37                  20
true negative                 36                 163                  29
true positive                 14                  28                  86

Classification report:
              precision    recall  f1-score   support

           0       0.55      0.52      0.53       118
           1       0.71      0.71      0.71       228
           2       0.64      0.67      0.65       128

    accuracy                           0.65       474
   macro avg       0.63      0.63      0.63       474
weighted avg       0.65      0.65      0.65       474



<h3> 2.6. Compare against Pre-tuned Models</h3>

In [30]:
#Compare with VADER
from nltk.sentiment.vader import SentimentIntensityAnalyzer

#Train test split using the same random state to ensure consistency
X_train, X_test, y_train, y_test = train_test_split(
    df['content'], 
    labels, 
    test_size = 0.2, 
    random_state= 42
)

In [31]:
#List to store VADER output
vader_labels = []

#Initialize analyzer
analyzer = SentimentIntensityAnalyzer()

#Iterate through X_test and label each comment/post
for i, comment in enumerate(X_test):
    sent = analyzer.polarity_scores(comment)
    compound_score = sent['compound']
    
    #'Neutral' comments are from -0.05 to 0.05 non-inclusive; 
    #-1 is very negative, 0 is very neutral, 1 is very positive
    if compound_score > 0.05:
        sent = 2
    elif compound_score < -0.05:
        sent = 1
    else:
        sent = 0
    
    #Store output
    vader_labels.append(sent)
    
#View weighted f1 score
print(f"Test F1 score: {f1_score(y_test, vader_labels, average = 'weighted')}")

Test F1 score: 0.29096832991748856


In [32]:
#confusion Matrix
cf_vader = pd.DataFrame(
        confusion_matrix(y_test, vader_labels),
        index = ['true neutral', 'true negative', 'true positive'],
        columns = ['predicted neutral', 'predicted negative', 'predicted positive']
    )

print(cf_vader)

#classification report
print("\nClassification report:")
print(classification_report(y_test, vader_labels))

               predicted neutral  predicted negative  predicted positive
true neutral                   7                  26                  15
true negative                 93                  52                 159
true positive                 33                   7                  82

Classification report:
              precision    recall  f1-score   support

           0       0.05      0.15      0.08        48
           1       0.61      0.17      0.27       304
           2       0.32      0.67      0.43       122

    accuracy                           0.30       474
   macro avg       0.33      0.33      0.26       474
weighted avg       0.48      0.30      0.29       474



In [33]:
#Compare with Stanza's sentiment analyzer
#Train test split using the same random state once again
X_train, X_test, y_train, y_test = train_test_split(
    df['content'], 
    labels, 
    test_size = 0.2, 
    random_state= 42
)

In [34]:
st = stanza.Pipeline('en', processors='tokenize, sentiment')

#Function to return sentiment using stanza
def classify_sentiment(comment):
    doc = st(comment)
    for i, sentence in enumerate(doc.sentences):
        if sentence.sentiment == 0:
            return 1
        elif sentence.sentiment == 1:
            return 0
        else:
            return 2

#Classify sentiment using stanza
stanza_pred = [classify_sentiment(comment) for comment in X_test]

2023-04-10 13:01:17 INFO: Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.4.1.json:   0%|   …

2023-04-10 13:01:18 INFO: Loading these models for language: en (English):
| Processor | Package  |
------------------------
| tokenize  | combined |
| sentiment | sstplus  |

2023-04-10 13:01:18 INFO: Use device: gpu
2023-04-10 13:01:18 INFO: Loading: tokenize
2023-04-10 13:01:21 INFO: Loading: sentiment
2023-04-10 13:01:21 INFO: Done loading processors!


In [35]:
#confusion Matrix
cf_stanza = pd.DataFrame(
        confusion_matrix(y_test, stanza_pred),
        index = ['true neutral', 'true negative', 'true positive'],
        columns = ['predicted neutral', 'predicted negative', 'predicted positive']
    )

print(cf_stanza)

#classification report
print("\nClassification report:")
print(classification_report(y_test, stanza_pred))

               predicted neutral  predicted negative  predicted positive
true neutral                  16                  31                   1
true negative                194                  74                  36
true positive                 51                  15                  56

Classification report:
              precision    recall  f1-score   support

           0       0.06      0.33      0.10        48
           1       0.62      0.24      0.35       304
           2       0.60      0.46      0.52       122

    accuracy                           0.31       474
   macro avg       0.43      0.35      0.32       474
weighted avg       0.56      0.31      0.37       474



<h3> 3. Labelling the Main Dataset with the Best Model</h3>

In [10]:
#Read all the posts collected
main_df = pd.read_csv('top_posts_year.csv')

#Repeat data cleaning process
pattern = r'^https?:\/\/.*[\r\n]*'
main_df = main_df[~main_df['content'].str.contains(pattern)]
main_df = main_df.loc[main_df['content'] != '[deleted]']
main_df['cleaned_content'] = main_df['content'].apply(preprocess_text)

#Repeat feature engineering process
sentences = [nltk.sent_tokenize(content) for content in main_df['cleaned_content']]
pos_tagged_sentences = [[nltk.pos_tag(nltk.word_tokenize(sentence)) for sentence in comment] for comment in sentences]
pos_main = []
for content in pos_tagged_sentences:
    pos_dict = {}
    for sentence in content:
        for word, tag in sentence:
            pos_dict[tag] = pos_dict.get(tag, 0) + 1
    pos_main.append(pos_dict)
pos_main = pos_vectorizer.transform(pos_main)
tfidf_main = tfidf.transform(main_df['cleaned_content'])
bow_main = bow.transform(main_df['cleaned_content'])
combined_main = np.concatenate((tfidf_main.toarray(), bow_main.toarray(), pos_main), axis = 1)
combined_main = np.c_[combined_main, main_df['score']]

#Predict sentiment
sents = []
for i in bs_xgb.predict(combined_main):
    sent = ''
    if i == 0:
        sent = 'neutral'
    elif i == 1:
        sent = 'negative'
    else:
        sent = 'positive'
    sents.append(sent)

#Create pred_sentiment column in main df
main_df['pred_sentiment'] = sents

#Save to csv
#main_df.to_csv('output.csv', index = False)