## Set up: import libraries and read data

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix, hstack
from sklearn.decomposition import PCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    accuracy_score,
    confusion_matrix,
    f1_score,
    precision_recall_curve,
    precision_score,
    recall_score,
)
from sklearn.model_selection import KFold, cross_val_predict, cross_validate
from sklearn.preprocessing import StandardScaler

Mounted at /content/drive


In [None]:
# Load the sentiment data and discard every row that contains a missing value.
sentiment = pd.read_csv('../Datasets/sentiment.csv').dropna()

# Target is the 'label' column; the features are all remaining columns
# except 'parent_comment', which is dropped alongside the target.
y_train = sentiment['label']
X_train = sentiment.drop(columns=['label', 'parent_comment'])

## Modelling

In [None]:
# Shuffled 5-fold split with a fixed seed so the folds are reproducible.
kf = KFold(n_splits = 5, shuffle = True, random_state = 123)

# One score per fold for each metric.
acc, prec, rec, f1 = [], [], [], []
# Predictions and true labels pooled over all validation folds
# (consumed by the confusion matrix in the results cell).
y_pred, y_true = [], []

for train_i, val_i in kf.split(X_train):
  # Positional row selection for this fold; the X_* objects are DataFrame
  # slices and the y_* objects are slices of the label column.
  X_train_fold = X_train.iloc[train_i]
  X_val_fold = X_train.iloc[val_i]
  y_train_fold = y_train.iloc[train_i]
  y_val_fold = y_train.iloc[val_i]

  # TF-IDF on the comment text: the vocabulary is fitted on the training
  # fold only and then re-used, unchanged, on the validation fold.
  tfidf = TfidfVectorizer(min_df = 15)
  train_tfidf = tfidf.fit_transform(X_train_fold["comment"])
  val_tfidf = tfidf.transform(X_val_fold["comment"])

  # Place the non-text columns in front of the TF-IDF columns.
  X_train_tfidf = hstack([
    csr_matrix(X_train_fold.drop(columns = ['comment']).values),
    train_tfidf,
  ])
  X_val_tfidf = hstack([
    csr_matrix(X_val_fold.drop(columns = ['comment']).values),
    val_tfidf,
  ])

  # Standardise before PCA. The sparse matrices are converted to dense
  # arrays here; the scaler statistics come from the training fold only.
  scaler = StandardScaler()
  X_train_tfidf_scaled = scaler.fit_transform(X_train_tfidf.toarray())
  X_val_tfidf_scaled = scaler.transform(X_val_tfidf.toarray())

  # PCA with a fractional n_components (0.95), fitted on the training fold
  # and applied to the validation fold.
  pca = PCA(n_components = 0.95)
  X_train_pca = pca.fit_transform(X_train_tfidf_scaled)
  X_val_pca = pca.transform(X_val_tfidf_scaled)

  # Fit the classifier on the reduced training fold, predict the held-out fold.
  log_reg = LogisticRegression(random_state = 123, max_iter = 1000).fit(X_train_pca, y_train_fold)
  preds = log_reg.predict(X_val_pca)

  # Record this fold's scores, one metric per list.
  for fold_scores, score_fn in (
    (acc, accuracy_score),
    (prec, precision_score),
    (rec, recall_score),
    (f1, f1_score),
  ):
    fold_scores.append(score_fn(y_val_fold, preds))

  # Pool predictions and labels across folds.
  y_pred.extend(preds)
  y_true.extend(y_val_fold)



In [None]:
# Average each metric over the cross-validation folds.
fold_means = {
    'Mean accuracy': np.mean(acc),
    'Mean precision': np.mean(prec),
    'Mean recall': np.mean(rec),
    'Mean f1': np.mean(f1),
}
for label, value in fold_means.items():
    print(f'{label}: {value}')

print("Confusion matrix:")
# Built from the labels/predictions pooled over all validation folds;
# left as the last expression so the notebook displays the array.
confusion_matrix(y_true, y_pred)

Mean accuracy: 0.6664622366691855
Mean precision: 0.6990595535005332
Mean recall: 0.8108624238913645
Mean f1: 0.7508128981464242
Confusion matrix:


array([[13367, 17636],
       [ 9555, 40965]])