In [8]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import hstack, csr_matrix
from sklearn.model_selection import cross_val_predict, cross_validate
from sklearn.metrics import auc, roc_auc_score, classification_report, confusion_matrix, precision_recall_curve
from sklearn.metrics import precision_score, accuracy_score, recall_score, f1_score
from sklearn.feature_selection import mutual_info_classif
from sklearn.neighbors import KNeighborsClassifier
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import KFold
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Load the two input tables used by this notebook.
# Later cells read the columns 'label', 'parent_comment' and 'comment' from both.
sentiment_csv = '../Datasets/sentiment.csv'
sentiment = pd.read_csv(sentiment_csv)

bigram_csv = '../Datasets/sentiment_bigram_final.csv'
bigram = pd.read_csv(bigram_csv)

In [3]:
# Remove every row that has a missing value in any column (pandas' default
# dropna behaviour, spelled out explicitly). The frames are rebound to the
# cleaned copies, so the unfiltered data is no longer reachable after this cell.
sentiment = sentiment.dropna(axis=0, how='any')
bigram = bigram.dropna(axis=0, how='any')

**Using just TF-IDF**

In [4]:
# 5-fold cross-validation of a TF-IDF -> StandardScaler -> PCA -> KNN pipeline
# on the `sentiment` table. Every fitted step (vectoriser, scaler, PCA, KNN) is
# fitted on the training fold only and then applied to the validation fold.

# Target and feature table. 'parent_comment' is dropped entirely; the raw
# 'comment' text is kept here and vectorised inside each fold.
y_train = sentiment['label']
X_train = sentiment.drop(columns=['label', 'parent_comment'])

# Per-fold validation scores; one value is appended to each list per fold.
acc, prec, rec, f1 = [], [], [], []

kf = KFold(n_splits=5, shuffle=True, random_state=123)
for train_i, val_i in kf.split(X_train):
    # Positional row selection: the X_* folds are DataFrames, the y_* folds are Series.
    X_train_fold = X_train.iloc[train_i]
    X_val_fold = X_train.iloc[val_i]
    y_train_fold = y_train.iloc[train_i]
    y_val_fold = y_train.iloc[val_i]

    # TF-IDF on the comment text; terms appearing in fewer than 15 training
    # documents are ignored (min_df=15).
    tfidf = TfidfVectorizer(min_df=15)
    train_tfidf = tfidf.fit_transform(X_train_fold["comment"])
    val_tfidf = tfidf.transform(X_val_fold["comment"])

    # Combine the remaining (non-text) columns with the TF-IDF columns:
    # non-text columns first, TF-IDF columns after.
    X_train_tfidf = hstack([
        csr_matrix(X_train_fold.drop(columns='comment').values),
        train_tfidf,
    ])
    X_val_tfidf = hstack([
        csr_matrix(X_val_fold.drop(columns='comment').values),
        val_tfidf,
    ])

    # Standardise before PCA. The matrices are densified first because
    # StandardScaler centres the data by default, which it cannot do on sparse input.
    scaler = StandardScaler()
    X_train_tfidf_scaled = scaler.fit_transform(X_train_tfidf.toarray())
    X_val_tfidf_scaled = scaler.transform(X_val_tfidf.toarray())

    # PCA keeping as many components as needed to explain 95% of the variance.
    pca = PCA(n_components=0.95)
    X_train_pca = pca.fit_transform(X_train_tfidf_scaled)
    X_val_pca = pca.transform(X_val_tfidf_scaled)

    # 10-nearest-neighbour classifier on the PCA-reduced features.
    knn = KNeighborsClassifier(n_neighbors=10)
    knn.fit(X_train_pca, y_train_fold)
    preds = knn.predict(X_val_pca)

    # Score this fold's validation predictions.
    for fold_scores, score_fn in (
        (acc, accuracy_score),
        (prec, precision_score),
        (rec, recall_score),
        (f1, f1_score),
    ):
        fold_scores.append(score_fn(y_val_fold, preds))


In [5]:
# Report each metric averaged over the cross-validation folds of the TF-IDF-only run.
print(
    f'Mean accuracy: {np.mean(acc)}',
    f'Mean precision: {np.mean(prec)}',
    f'Mean recall: {np.mean(rec)}',
    f'Mean f1: {np.mean(f1)}',
    sep='\n',
)

Mean accuracy: 0.6066017418511633
Mean precision: 0.6722147829080328
Mean recall: 0.7127333202233335
Mean f1: 0.6918703932912272


**Using TF-IDF with bigrams**

In [6]:
# 10-fold cross-validation of a TF-IDF -> StandardScaler -> PCA -> KNN pipeline
# on the `bigram` table. Every fitted step (vectoriser, scaler, PCA, KNN) is
# fitted on the training fold only and then applied to the validation fold.
#
# NOTE(review): this run uses n_splits=10 while the TF-IDF-only run uses
# n_splits=5, so the two sets of mean scores are not computed under the same
# protocol — confirm whether that difference is intentional.

# Target and feature table. 'parent_comment' is dropped entirely; the raw
# 'comment' text is kept here and vectorised inside each fold.
y_train_b = bigram['label']
X_train_b = bigram.drop(columns=['label', 'parent_comment'])

# Per-fold validation scores; one value is appended to each list per fold.
acc_b, prec_b, rec_b, f1_b = [], [], [], []

kf = KFold(n_splits=10, shuffle=True, random_state=123)
for train_i, val_i in kf.split(X_train_b):
    # Positional row selection: the X_* folds are DataFrames, the y_* folds are Series.
    X_train_fold = X_train_b.iloc[train_i]
    X_val_fold = X_train_b.iloc[val_i]
    y_train_fold = y_train_b.iloc[train_i]
    y_val_fold = y_train_b.iloc[val_i]

    # TF-IDF on the comment text; terms appearing in fewer than 15 training
    # documents are ignored (min_df=15).
    # NOTE(review): no ngram_range is passed, so the vectoriser itself is configured
    # exactly as in the TF-IDF-only run; presumably the bigram information is already
    # encoded in sentiment_bigram_final.csv — confirm.
    tfidf = TfidfVectorizer(min_df=15)
    train_tfidf = tfidf.fit_transform(X_train_fold["comment"])
    val_tfidf = tfidf.transform(X_val_fold["comment"])

    # Combine the remaining (non-text) columns with the TF-IDF columns:
    # non-text columns first, TF-IDF columns after.
    X_train_tfidf = hstack([
        csr_matrix(X_train_fold.drop(columns='comment').values),
        train_tfidf,
    ])
    X_val_tfidf = hstack([
        csr_matrix(X_val_fold.drop(columns='comment').values),
        val_tfidf,
    ])

    # Standardise before PCA. The matrices are densified first because
    # StandardScaler centres the data by default, which it cannot do on sparse input.
    scaler = StandardScaler()
    X_train_tfidf_scaled = scaler.fit_transform(X_train_tfidf.toarray())
    X_val_tfidf_scaled = scaler.transform(X_val_tfidf.toarray())

    # PCA keeping as many components as needed to explain 95% of the variance.
    pca = PCA(n_components=0.95)
    X_train_pca = pca.fit_transform(X_train_tfidf_scaled)
    X_val_pca = pca.transform(X_val_tfidf_scaled)

    # 10-nearest-neighbour classifier on the PCA-reduced features.
    knn = KNeighborsClassifier(n_neighbors=10)
    knn.fit(X_train_pca, y_train_fold)
    preds = knn.predict(X_val_pca)

    # Score this fold's validation predictions.
    for fold_scores, score_fn in (
        (acc_b, accuracy_score),
        (prec_b, precision_score),
        (rec_b, recall_score),
        (f1_b, f1_score),
    ):
        fold_scores.append(score_fn(y_val_fold, preds))

In [7]:
# Report each metric averaged over the cross-validation folds of the bigram run.
print(
    f'Mean accuracy: {np.mean(acc_b)}',
    f'Mean precision: {np.mean(prec_b)}',
    f'Mean recall: {np.mean(rec_b)}',
    f'Mean f1: {np.mean(f1_b)}',
    sep='\n',
)

Mean accuracy: 0.6114047577718182
Mean precision: 0.6779640774078878
Mean recall: 0.7144736313447267
Mean f1: 0.6957021210923701
