In [None]:
from sklearn.ensemble import RandomForestClassifier
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import hstack, csr_matrix
from sklearn.model_selection import cross_val_predict, cross_validate
from sklearn.metrics import confusion_matrix, precision_recall_curve,precision_score, accuracy_score, recall_score, f1_score
from sklearn.model_selection import KFold

from sklearn.preprocessing import StandardScaler


Mounted at /content/drive


In [None]:
# Read the sentiment dataset and discard every row that has a missing value.
sentiment = pd.read_csv('../Datasets/sentiment.csv').dropna()

# Target is the `label` column; the features are every column except
# `label` and `parent_comment`.
# NOTE(review): every remaining non-text column is later scaled and used as a
# model feature -- confirm the CSV has no identifier column among them.
y_train = sentiment['label']
X_train = sentiment.drop(columns=['label', 'parent_comment'])

# Base model
No PCA

No lower bound on TF-IDF

In [None]:
# Shuffled 5-fold split with a fixed seed.
kf = KFold(n_splits=5, shuffle=True, random_state=123)

# Per-fold scores, plus the labels and predictions pooled over all folds.
acc, prec, rec, f1 = [], [], [], []
y_pred, y_true = [], []

for train_idx, val_idx in kf.split(X_train):
    # Positional row selection: features stay DataFrames, labels stay Series.
    X_train_fold = X_train.iloc[train_idx]
    X_val_fold = X_train.iloc[val_idx]
    y_train_fold = y_train.iloc[train_idx]
    y_val_fold = y_train.iloc[val_idx]

    # Fit the TF-IDF vocabulary on the training fold only, then apply it
    # unchanged to the validation fold.
    tfidf = TfidfVectorizer()
    train_tfidf = tfidf.fit_transform(X_train_fold["comment"])
    val_tfidf = tfidf.transform(X_val_fold["comment"])

    # Standardise every column other than the comment text, using statistics
    # learned from the training fold.
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train_fold.drop(columns=["comment"]))
    X_val_scaled = scaler.transform(X_val_fold.drop(columns=["comment"]))

    # One sparse matrix per split: scaled columns first, TF-IDF columns after.
    X_train_tfidf = hstack([csr_matrix(X_train_scaled), train_tfidf])
    X_val_tfidf = hstack([csr_matrix(X_val_scaled), val_tfidf])

    # A new forest is trained for each fold.
    model = RandomForestClassifier(
        n_estimators=100,
        criterion='gini',
        random_state=42,
    )
    model.fit(X_train_tfidf, y_train_fold)
    preds = model.predict(X_val_tfidf)

    # Keep the out-of-fold predictions for the pooled confusion matrix and
    # record this fold's scores.
    y_pred.extend(preds)
    y_true.extend(y_val_fold)
    for fold_scores, metric in (
        (acc, accuracy_score),
        (prec, precision_score),
        (rec, recall_score),
        (f1, f1_score),
    ):
        fold_scores.append(metric(y_val_fold, preds))

In [None]:
# Average each per-fold score list filled by the cross-validation loop.
print(f'Mean accuracy: {np.mean(acc)}')
print(f'Mean precision: {np.mean(prec)}')
print(f'Mean recall: {np.mean(rec)}')
print(f'Mean f1: {np.mean(f1)}')

# y_true / y_pred were extended on every fold, so this matrix covers the
# validation rows of all folds together.
print("Confusion matrix:")
confusion_matrix(y_true, y_pred)

Mean accuracy: 0.6716509133877366
Mean precision: 0.6817371654672431
Mean recall: 0.8818470179846708
Mean f1: 0.7689827997762576
Confusion matrix:


array([[10204, 20799],
       [ 5969, 44551]])

# Bigrams

In [None]:
# Read the bigram version of the dataset and discard rows with missing values.
sentiment_bigram = pd.read_csv('../Datasets/sentiment_bigram_final.csv').dropna()

# Target is `label`; the features are every column except `label` and
# `parent_comment`.
# NOTE(review): every remaining non-text column is later scaled and used as a
# model feature -- confirm the CSV has no identifier column among them.
y_train_2 = sentiment_bigram['label']
X_train_2 = sentiment_bigram.drop(columns=['label', 'parent_comment'])

In [None]:
# Shuffled 5-fold split with a fixed seed.
kf = KFold(n_splits=5, shuffle=True, random_state=123)

# Per-fold scores, plus the labels and predictions pooled over all folds.
acc, prec, rec, f1 = [], [], [], []
y_pred, y_true = [], []

for train_idx, val_idx in kf.split(X_train_2):
    # Positional row selection: features stay DataFrames, labels stay Series.
    X_train_fold = X_train_2.iloc[train_idx]
    X_val_fold = X_train_2.iloc[val_idx]
    y_train_fold = y_train_2.iloc[train_idx]
    y_val_fold = y_train_2.iloc[val_idx]

    # Fit the TF-IDF vocabulary on the training fold only, then apply it
    # unchanged to the validation fold.
    tfidf = TfidfVectorizer()
    train_tfidf = tfidf.fit_transform(X_train_fold["comment"])
    val_tfidf = tfidf.transform(X_val_fold["comment"])

    # Standardise every column other than the comment text, using statistics
    # learned from the training fold.
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train_fold.drop(columns=["comment"]))
    X_val_scaled = scaler.transform(X_val_fold.drop(columns=["comment"]))

    # One sparse matrix per split: scaled columns first, TF-IDF columns after.
    X_train_tfidf = hstack([csr_matrix(X_train_scaled), train_tfidf])
    X_val_tfidf = hstack([csr_matrix(X_val_scaled), val_tfidf])

    # A new forest is trained for each fold.
    model = RandomForestClassifier(
        n_estimators=100,
        criterion='gini',
        random_state=42,
    )
    model.fit(X_train_tfidf, y_train_fold)
    preds = model.predict(X_val_tfidf)

    # Keep the out-of-fold predictions for the pooled confusion matrix and
    # record this fold's scores.
    y_pred.extend(preds)
    y_true.extend(y_val_fold)
    for fold_scores, metric in (
        (acc, accuracy_score),
        (prec, precision_score),
        (rec, recall_score),
        (f1, f1_score),
    ):
        fold_scores.append(metric(y_val_fold, preds))

In [None]:
# Average each per-fold score list filled by the cross-validation loop.
print(f'Mean accuracy: {np.mean(acc)}')
print(f'Mean precision: {np.mean(prec)}')
print(f'Mean recall: {np.mean(rec)}')
print(f'Mean f1: {np.mean(f1)}')

# y_true / y_pred were extended on every fold, so this matrix covers the
# validation rows of all folds together.
print("Confusion matrix:")
confusion_matrix(y_true, y_pred)

Mean accuracy: 0.6907041739225572
Mean precision: 0.7147939980991488
Mean recall: 0.836279091306945
Mean f1: 0.7707417469573964
Confusion matrix:


array([[13848, 16832],
       [ 8259, 42184]])

# Hyperparameter tuning

In [None]:
def tune_rf(model, X_train, y_train, *, n_splits=5, random_state=123, text_col="comment"):
    """Cross-validate `model` on TF-IDF text features plus scaled other columns.

    For every fold of a shuffled K-fold split, a fresh TfidfVectorizer is
    fitted on the training rows of ``text_col`` and a fresh StandardScaler on
    all remaining columns; both are then applied to the validation rows. The
    scaled columns and the TF-IDF matrix are stacked side by side (scaled
    columns first), `model` is fitted on the training part and its
    predictions on the validation part are scored.

    Parameters
    ----------
    model : estimator exposing ``fit`` and ``predict``
        Refitted in place on every fold, so after the call it holds the fit
        from the last fold.
    X_train : pandas.DataFrame
        Must contain ``text_col``. Every other column is passed to
        StandardScaler (presumably all numeric -- confirm for new datasets).
    y_train : pandas.Series
        Labels, aligned positionally with the rows of ``X_train``.
    n_splits : int, default 5
        Number of cross-validation folds.
    random_state : int, default 123
        Seed used to shuffle the rows before splitting.
    text_col : str, default "comment"
        Name of the column that holds the text to vectorise.

    Returns
    -------
    None
        The mean accuracy, precision, recall and F1 over the folds, and the
        confusion matrix of the predictions pooled over all folds, are
        printed rather than returned.
    """
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=random_state)

    # Per-fold scores, plus labels and predictions pooled over all folds.
    acc, prec, rec, f1 = [], [], [], []
    y_pred, y_true = [], []

    for train_idx, val_idx in kf.split(X_train):
        # Positional row selection; the features stay a DataFrame.
        X_train_fold, X_val_fold = X_train.iloc[train_idx], X_train.iloc[val_idx]
        y_train_fold, y_val_fold = y_train.iloc[train_idx], y_train.iloc[val_idx]

        # Fit the vocabulary on the training fold only so that no validation
        # text influences the features.
        tfidf = TfidfVectorizer()
        train_tfidf = tfidf.fit_transform(X_train_fold[text_col])
        val_tfidf = tfidf.transform(X_val_fold[text_col])

        # Scale the non-text columns with training-fold statistics.
        scaler = StandardScaler()
        X_train_scaled = scaler.fit_transform(X_train_fold.drop(columns=[text_col]))
        X_val_scaled = scaler.transform(X_val_fold.drop(columns=[text_col]))

        X_train_tfidf = hstack([csr_matrix(X_train_scaled), train_tfidf])
        X_val_tfidf = hstack([csr_matrix(X_val_scaled), val_tfidf])

        model.fit(X_train_tfidf, y_train_fold)
        preds = model.predict(X_val_tfidf)

        y_pred.extend(preds)
        y_true.extend(y_val_fold)
        acc.append(accuracy_score(y_val_fold, preds))
        prec.append(precision_score(y_val_fold, preds))
        rec.append(recall_score(y_val_fold, preds))
        f1.append(f1_score(y_val_fold, preds))

    print(f'Mean accuracy: {np.mean(acc)}')
    print(f'Mean precision: {np.mean(prec)}')
    print(f'Mean recall: {np.mean(rec)}')
    print(f'Mean f1: {np.mean(f1)}')

    print("Confusion matrix:")
    print(confusion_matrix(y_true, y_pred))

## 100 trees, entropy, bigrams

In [None]:
# 100 trees with the entropy split criterion, evaluated on the bigram data.
model_2 = RandomForestClassifier(
    n_estimators=100,
    criterion='entropy',
    random_state=42,
)
tune_rf(model_2, X_train_2, y_train_2)

Mean accuracy: 0.6921833894050322
Mean precision: 0.7209171516607025
Mean recall: 0.8238989206977259
Mean f1: 0.7689458543388612
Confusion matrix:
[[14592 16088]
 [ 8883 41560]]


## 150 trees, entropy, bigrams

In [None]:
# 150 trees with the entropy split criterion, evaluated on the bigram data.
model_3 = RandomForestClassifier(
    n_estimators=150,
    criterion='entropy',
    random_state=42,
)
tune_rf(model_3, X_train_2, y_train_2)

Mean accuracy: 0.6931079471376081
Mean precision: 0.720936244354892
Mean recall: 0.8263348412471956
Mean f1: 0.7700043176861998
Confusion matrix:
[[14545 16135]
 [ 8761 41682]]


## 200 trees, entropy, bigrams

In [None]:
# 200 trees with the entropy split criterion, evaluated on the bigram data.
model_4 = RandomForestClassifier(
    n_estimators=200,
    criterion='entropy',
    random_state=42,
)
tune_rf(model_4, X_train_2, y_train_2)

Mean accuracy: 0.6931942299334737
Mean precision: 0.7208112592968121
Mean recall: 0.8268899420101207
Mean f1: 0.7701755851853282
Confusion matrix:
[[14524 16156]
 [ 8733 41710]]


## 150 trees, entropy, balanced class weight, bigrams

In [None]:
# 150 entropy trees with balanced class weights, evaluated on the bigram data.
model_5 = RandomForestClassifier(
    n_estimators=150,
    criterion='entropy',
    class_weight='balanced',
    random_state=42,
)
tune_rf(model_5, X_train_2, y_train_2)

Mean accuracy: 0.6957336016873175
Mean precision: 0.7402466631408499
Mean recall: 0.7866899243611659
Mean f1: 0.7627423059332128
Confusion matrix:
[[16757 13923]
 [10760 39683]]


# BERT

In [None]:
# Load the pickled DataFrame of BERT embeddings from Google Drive.
# NOTE(review): the absolute path needs Drive mounted at /content/drive, but no
# mount call is visible in this notebook -- confirm how it gets mounted.
# NOTE: read_pickle can execute code stored in the file; only load pickles
# that come from a trusted source.
train_df = pd.read_pickle('/content/drive/MyDrive/CS3244 Grp Project/Model training/BERT Embeddings/bert_embeddings_no_pooling_train.pkl')

In [None]:
# Preview the first rows of the loaded embeddings frame.
train_df.head()

Unnamed: 0,label,comment,author,subreddit,score,ups,downs,date,created_utc,parent_comment,bert_comment,seq_len
0,0,nc and nh.,Trumpbart,politics,2,-1,-1,2016-10,2016-10-16 23:55:23,"yeah, i get that argument. at this point, i'd ...","[[[-0.26591256, -0.29882812, -0.040224716, -0....",3
10,0,i think a significant amount would be against ...,ThisIsNotKimJongUn,politics,92,92,0,2016-09,2016-09-20 17:53:52,i bet if that money was poured into college de...,"[[[0.075774364, 0.03500098, -0.04244519, -0.05...",15
17,0,because it's what really bothers him... and it...,kozmo1313,politics,15,-1,-1,2016-12,2016-12-26 20:10:45,he actually acts like a moody emo girl on twit...,"[[[0.03299582, 0.04939469, -0.08288911, -0.163...",12
22,0,conservatism as an ideology is for sure a reac...,MayorMcCheese59,politics,1,-1,-1,2016-12,2016-12-24 00:04:06,"i still doubt that ""all conservatives stand fo...","[[[-0.1579521, -0.00796949, -0.33657235, -0.09...",29
23,0,"maybe not control, but certainly that is evide...",SunTzu-,politics,1,-1,-1,2016-10,2016-10-13 20:48:14,today russian media tweeted out that wikileaks...,"[[[-0.093258426, -0.08815382, -0.058376268, -0...",10


In [None]:
# Summarise the columns of train_df (dtypes and non-null counts).
train_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 82764 entries, 0 to 1010825
Data columns (total 12 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   label           82764 non-null  int64 
 1   comment         82764 non-null  object
 2   author          82764 non-null  object
 3   subreddit       82764 non-null  object
 4   score           82764 non-null  int64 
 5   ups             82764 non-null  int64 
 6   downs           82764 non-null  int64 
 7   date            82764 non-null  object
 8   created_utc     82764 non-null  object
 9   parent_comment  82764 non-null  object
 10  bert_comment    82764 non-null  object
 11  seq_len         82764 non-null  int64 
dtypes: int64(5), object(7)
memory usage: 10.2+ MB


In [None]:
# Number the rows 1..len(train_df) in their current order; this column is the
# key used later to join the feature CSV.
# NOTE(review): the id is purely positional, so it only lines up with the
# CSV's comment_id if both tables list the comments in the same order with
# the same numbering -- verify, since the recorded row counts differ
# (82764 here vs 81490 in the CSV).
train_df['comment_id'] = range(1, len(train_df) + 1)
train_df.head()

Unnamed: 0,label,comment,author,subreddit,score,ups,downs,date,created_utc,parent_comment,bert_comment,seq_len,comment_id
0,0,nc and nh.,Trumpbart,politics,2,-1,-1,2016-10,2016-10-16 23:55:23,"yeah, i get that argument. at this point, i'd ...","[[[-0.26591256, -0.29882812, -0.040224716, -0....",3,1
10,0,i think a significant amount would be against ...,ThisIsNotKimJongUn,politics,92,92,0,2016-09,2016-09-20 17:53:52,i bet if that money was poured into college de...,"[[[0.075774364, 0.03500098, -0.04244519, -0.05...",15,2
17,0,because it's what really bothers him... and it...,kozmo1313,politics,15,-1,-1,2016-12,2016-12-26 20:10:45,he actually acts like a moody emo girl on twit...,"[[[0.03299582, 0.04939469, -0.08288911, -0.163...",12,3
22,0,conservatism as an ideology is for sure a reac...,MayorMcCheese59,politics,1,-1,-1,2016-12,2016-12-24 00:04:06,"i still doubt that ""all conservatives stand fo...","[[[-0.1579521, -0.00796949, -0.33657235, -0.09...",29,4
23,0,"maybe not control, but certainly that is evide...",SunTzu-,politics,1,-1,-1,2016-10,2016-10-13 20:48:14,today russian media tweeted out that wikileaks...,"[[[-0.093258426, -0.08815382, -0.058376268, -0...",10,5


In [None]:
# Load the bigram sentiment feature table from CSV
# NOTE(review): this reads sentiment_bigram.csv, whereas the random-forest
# section reads sentiment_bigram_final.csv -- confirm this is intended.
file_path = '../Datasets/sentiment_bigram.csv'
data = pd.read_csv(file_path)

# Print a column summary (dtypes and non-null counts) to understand the frame's structure
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 81490 entries, 0 to 81489
Data columns (total 11 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   comment_id            81490 non-null  int64  
 1   label                 81490 non-null  int64  
 2   comment               81490 non-null  object 
 3   parent_comment        81490 non-null  object 
 4   word_count            81490 non-null  int64  
 5   capital_count         81490 non-null  int64  
 6   punc_count            81490 non-null  int64  
 7   comment_polarity      81490 non-null  float64
 8   comment_subjectivity  81490 non-null  float64
 9   parent_polarity       81490 non-null  float64
 10  parent_subjectivity   81490 non-null  float64
dtypes: float64(4), int64(5), object(2)
memory usage: 6.8+ MB


In [None]:
# Preview the first rows of the feature table.
data.head()

Unnamed: 0,comment_id,label,comment,parent_comment,word_count,capital_count,punc_count,comment_polarity,comment_subjectivity,parent_polarity,parent_subjectivity
0,1,0,nc nh,yeah argument point prefer live nc,3,4,1,0.0,0.0,0.136364,0.5
1,2,0,think significant spend tax dollar people,bet money pour college debt health debt relief...,15,1,1,0.375,0.875,0.0,0.0
2,3,0,bother sign weakness,actually act moody emo girl twitter lash incre...,12,0,6,0.0,0.0,0.126667,0.373333
3,4,0,conservatism ideology sure reaction liberalism...,doubt conservative stand defeat liberal use el...,29,1,2,0.5,0.888889,0.176667,0.52
4,5,0,maybe control certainly evidence collusion,today russian medium tweet wikileak release po...,10,1,2,0.214286,0.571429,0.0,0.0


In [None]:
# Left-join the hand-crafted features onto the BERT frame by comment_id, so
# every row of train_df is kept; rows without a matching id get NaN in the
# added columns.
# NOTE(review): comment_polarity exists in `data` but is not selected here --
# confirm the omission is intentional.
merged_df = train_df.merge(
    data[[
        'comment_id',
        'word_count',
        'capital_count',
        'punc_count',
        'comment_subjectivity',
        'parent_polarity',
        'parent_subjectivity',
    ]],
    how='left',
    on='comment_id',
)

In [None]:
# Preview the merged frame (embedding columns plus the joined features).
merged_df.head()

Unnamed: 0,label,comment,author,subreddit,score,ups,downs,date,created_utc,parent_comment,bert_comment,seq_len,comment_id,word_count,capital_count,punc_count,comment_subjectivity,parent_polarity,parent_subjectivity
0,0,nc and nh.,Trumpbart,politics,2,-1,-1,2016-10,2016-10-16 23:55:23,"yeah, i get that argument. at this point, i'd ...","[[[-0.26591256, -0.29882812, -0.040224716, -0....",3,1,3.0,4.0,1.0,0.0,0.136364,0.5
1,0,i think a significant amount would be against ...,ThisIsNotKimJongUn,politics,92,92,0,2016-09,2016-09-20 17:53:52,i bet if that money was poured into college de...,"[[[0.075774364, 0.03500098, -0.04244519, -0.05...",15,2,15.0,1.0,1.0,0.875,0.0,0.0
2,0,because it's what really bothers him... and it...,kozmo1313,politics,15,-1,-1,2016-12,2016-12-26 20:10:45,he actually acts like a moody emo girl on twit...,"[[[0.03299582, 0.04939469, -0.08288911, -0.163...",12,3,12.0,0.0,6.0,0.0,0.126667,0.373333
3,0,conservatism as an ideology is for sure a reac...,MayorMcCheese59,politics,1,-1,-1,2016-12,2016-12-24 00:04:06,"i still doubt that ""all conservatives stand fo...","[[[-0.1579521, -0.00796949, -0.33657235, -0.09...",29,4,29.0,1.0,2.0,0.888889,0.176667,0.52
4,0,"maybe not control, but certainly that is evide...",SunTzu-,politics,1,-1,-1,2016-10,2016-10-13 20:48:14,today russian media tweeted out that wikileaks...,"[[[-0.093258426, -0.08815382, -0.058376268, -0...",10,5,10.0,1.0,2.0,0.571429,0.0,0.0


In [None]:
# Remove the join key, the raw text and the Reddit metadata columns in place.
# NOTE: because the frame is modified in place, running this cell a second
# time raises KeyError (the columns are already gone).
merged_df.drop(
    columns=[
        'comment_id',
        'comment',
        'author',
        'subreddit',
        'score',
        'ups',
        'downs',
        'date',
        'created_utc',
        'parent_comment',
    ],
    inplace=True,
)

In [None]:
# Preview merged_df after the metadata columns were dropped.
merged_df.head()

Unnamed: 0,label,bert_comment,seq_len,word_count,capital_count,punc_count,comment_subjectivity,parent_polarity,parent_subjectivity
0,0,"[[[-0.26591256, -0.29882812, -0.040224716, -0....",3,3.0,4.0,1.0,0.0,0.136364,0.5
1,0,"[[[0.075774364, 0.03500098, -0.04244519, -0.05...",15,15.0,1.0,1.0,0.875,0.0,0.0
2,0,"[[[0.03299582, 0.04939469, -0.08288911, -0.163...",12,12.0,0.0,6.0,0.0,0.126667,0.373333
3,0,"[[[-0.1579521, -0.00796949, -0.33657235, -0.09...",29,29.0,1.0,2.0,0.888889,0.176667,0.52
4,0,"[[[-0.093258426, -0.08815382, -0.058376268, -0...",10,10.0,1.0,2.0,0.571429,0.0,0.0


In [None]:
# Series.info() prints its summary itself and returns None, so it is called
# directly; wrapping it in print() added a stray "None" line to the output.
merged_df['bert_comment'].info()

# Look at the first embedding by position (not by index label) and print the
# shape of the whole array and of its first element.
first_embedding = merged_df['bert_comment'].iloc[0]
print(first_embedding.shape)
print(first_embedding[0].shape)

<class 'pandas.core.series.Series'>
RangeIndex: 82764 entries, 0 to 82763
Series name: bert_comment
Non-Null Count  Dtype 
--------------  ----- 
82764 non-null  object
dtypes: object(1)
memory usage: 646.7+ KB
None
(1, 6, 768)
(6, 768)


In [None]:
# Flatten each comment's BERT embedding array into one 1-D vector: one
# DataFrame row per comment, columns named bert_comment_<position>.
# NOTE(review): this builds a dense frame holding every embedding value of
# every comment, which is very memory-hungry; the embeddings presumably vary
# in length (seq_len differs per row), in which case shorter rows are padded
# with NaN up to the longest one -- confirm.
bert_comment_expanded = pd.DataFrame(
    [embedding.flatten() for embedding in merged_df['bert_comment']],
    index=merged_df.index,
).add_prefix('bert_comment_')

# Replace the array-valued column with the expanded numeric columns.
merged_df_expanded = pd.concat(
    [merged_df.drop(columns='bert_comment'), bert_comment_expanded],
    axis=1,
)


The bert_comment column has to be flattened before it can be fed into the random forest, but doing so took too much RAM and the session crashed.

I am keeping this section here as evidence that I did try to use the BERT embeddings with the random forest.

In [None]:
# NOTE: merged_df_expanded is only created by the flattening cell; the
# recorded run of this cell raised NameError because that name was not defined.
merged_df_expanded.head()

NameError: name 'merged_df_expanded' is not defined

In [None]:
# NOTE(review): depends on merged_df_expanded, which is only created by the
# flattening cell -- this cell cannot run unless that cell completed.
merged_df_expanded.info()