# STUDENT: LORD CHARITE IGIRIMBABAZI

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import Perceptron, LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.calibration import CalibratedClassifierCV
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, recall_score, precision_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, KFold
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC

# Part 1

In [2]:
train_data = pd.read_csv('hw4-train-new.csv')
test_data = pd.read_csv('hw4-test-new.csv')


def combine_text_columns(frame):
    """Join the review fields of ``frame`` into one space-separated string per row.

    The same helper is applied to both the train and the test frame so the two
    documents are always built from the same columns in the same order.

    Parameters
    ----------
    frame : pandas.DataFrame
        Must contain the columns ``reviewText``, ``summary``, ``verified``,
        ``vote`` and ``category``.

    Returns
    -------
    pandas.Series
        One combined string per row of ``frame``.
    """
    return (
        frame['reviewText'].fillna('') + ' ' +
        frame['summary'].fillna('') + ' ' +
        frame['verified'].astype(str) + ' ' +
        frame['vote'].astype(str) + ' ' +
        frame['category'].fillna('')
    )


# The test text used to be assembled by a second hand-written expression that
# listed `vote` before `verified`. With the default unigram vectorizer below the
# token order does not change the features, but it would once n-grams are
# enabled, so both frames now go through the same helper.
train_combined_text = combine_text_columns(train_data)
test_combined_text = combine_text_columns(test_data)

# TF-IDF vectorization: the vocabulary and IDF weights are learned on the
# training text only and then re-used to transform the test text.
tfidf_vectorizer = TfidfVectorizer()
x_train_final = tfidf_vectorizer.fit_transform(train_combined_text)
x_test_final = tfidf_vectorizer.transform(test_combined_text)

# Target variable 'label'
y_train = train_data['label']

In [3]:
def evaluate(y_true, y_pred, y_proba):
    """Print accuracy, macro F1, macro recall and ROC AUC for a set of predictions.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels.
    y_pred : array-like
        Predicted labels, compared against ``y_true``.
    y_proba : array-like
        Scores used for ROC AUC. Either a 2-D array with one column per class
        (for example the output of ``predict_proba``) or a 1-D array of scores.
        A 2-D array with exactly two columns is reduced to its second column.

    Returns
    -------
    None
        The metrics are printed, not returned.
    """
    accuracy = accuracy_score(y_true, y_pred)
    f1 = f1_score(y_true, y_pred, average='macro')
    recall = recall_score(y_true, y_pred, average='macro')

    # Normalise to an ndarray and check the rank before touching shape[1], so
    # plain lists and 1-D score vectors are accepted instead of raising.
    y_proba = np.asarray(y_proba)
    if y_proba.ndim == 2 and y_proba.shape[1] == 2:
        # Two-column case: keep only the second column as the score vector.
        y_proba = y_proba[:, 1]

    roc_auc = roc_auc_score(y_true, y_proba, multi_class='ovr')

    print("Evaluation Metrics: \n")
    print(f"- Accuracy Score: {accuracy * 100:.2f} %")
    print(f"- F1 Macro Score: {f1 * 100:.2f} %")
    print(f"- Recall Macro Score: {recall * 100:.2f} %")
    print(f"- ROC AUC Score: {roc_auc * 100:.2f} %")

In [4]:
def logistic_regression_classifier(x_train, y_train, x_test):
    """Fit a calibrated logistic regression and predict on a hold-out split and on ``x_test``.

    ``x_train``/``y_train`` are split 80/20 (``random_state=42``); the model is
    fitted on the 80% part only.

    Parameters
    ----------
    x_train : feature matrix for the labelled data.
    y_train : labels matching ``x_train``.
    x_test : feature matrix to predict labels for.

    Returns
    -------
    tuple
        ``(y_val, val_predictions, val_probabilities, test_predictions)`` where
        the first three refer to the 20% hold-out part.
    """
    fit_x, holdout_x, fit_y, holdout_y = train_test_split(
        x_train, y_train, test_size=0.2, random_state=42
    )

    # Balanced-class logistic regression wrapped in CalibratedClassifierCV (cv=5).
    model = CalibratedClassifierCV(
        LogisticRegression(max_iter=10000, class_weight='balanced'),
        cv=5,
    )
    model.fit(fit_x, fit_y)

    test_predictions = model.predict(x_test)
    holdout_predictions = model.predict(holdout_x)
    holdout_probabilities = model.predict_proba(holdout_x)

    return holdout_y, holdout_predictions, holdout_probabilities, test_predictions

# Run the logistic-regression experiment; the test-set predictions are kept
# in `logistic_test_predictions` for later use.
(
    y_true_logistic,
    logistic_predictions,
    logistic_probabilities,
    logistic_test_predictions,
) = logistic_regression_classifier(x_train_final, y_train, x_test_final)

print("1) Logistic Regression")
evaluate(y_true_logistic, logistic_predictions, logistic_probabilities)

1) Logistic Regression
Evaluation Metrics: 

- Accuracy Score: 85.78 %
- F1 Macro Score: 84.96 %
- Recall Macro Score: 84.60 %
- ROC AUC Score: 92.79 %


In [5]:
def perceptron_classifier(x_train, y_train, x_test):
    """Fit a calibrated perceptron and predict on a hold-out split.

    ``x_train``/``y_train`` are split 80/20 (``random_state=42``); the model is
    fitted on the 80% part only.

    Parameters
    ----------
    x_train : feature matrix for the labelled data.
    y_train : labels matching ``x_train``.
    x_test : accepted for a uniform signature but not used by this function.

    Returns
    -------
    tuple
        ``(y_val, val_predictions, val_probabilities)`` for the 20% hold-out part.
    """
    fit_x, holdout_x, fit_y, holdout_y = train_test_split(
        x_train, y_train, test_size=0.2, random_state=42
    )

    # Perceptron wrapped in CalibratedClassifierCV (cv=5) so that
    # predict_proba is available on the fitted model.
    model = CalibratedClassifierCV(Perceptron(max_iter=1000), cv=5)
    model.fit(fit_x, fit_y)

    holdout_predictions = model.predict(holdout_x)
    holdout_probabilities = model.predict_proba(holdout_x)

    return holdout_y, holdout_predictions, holdout_probabilities

# Run the perceptron experiment and report its hold-out metrics.
(
    y_true_perceptron,
    perceptron_predictions,
    perceptron_probabilities,
) = perceptron_classifier(x_train_final, y_train, x_test_final)

print("2) Perceptron")
evaluate(y_true_perceptron, perceptron_predictions, perceptron_probabilities)

2) Perceptron
Evaluation Metrics: 

- Accuracy Score: 81.70 %
- F1 Macro Score: 80.42 %
- Recall Macro Score: 79.82 %
- ROC AUC Score: 89.95 %


In [6]:
def rf_classifier(x_train, y_train, x_test):
    """Fit a calibrated random forest and predict on a hold-out split.

    ``x_train``/``y_train`` are split 80/20 (``random_state=42``); the model is
    fitted on the 80% part only.

    Parameters
    ----------
    x_train : feature matrix for the labelled data.
    y_train : labels matching ``x_train``.
    x_test : accepted for a uniform signature but not used by this function.

    Returns
    -------
    tuple
        ``(y_val, val_predictions, val_probabilities)`` for the 20% hold-out part.
    """
    fit_x, holdout_x, fit_y, holdout_y = train_test_split(
        x_train, y_train, test_size=0.2, random_state=42
    )

    # Seeded, class-balanced forest; both the forest and the calibration
    # wrapper are given n_jobs=-1.
    forest = RandomForestClassifier(random_state=42, class_weight='balanced', n_jobs=-1)
    model = CalibratedClassifierCV(forest, cv=5, n_jobs=-1)
    model.fit(fit_x, fit_y)

    holdout_predictions = model.predict(holdout_x)
    holdout_probabilities = model.predict_proba(holdout_x)

    return holdout_y, holdout_predictions, holdout_probabilities


# Run the random-forest experiment and report its hold-out metrics.
(
    y_true_rf,
    rf_predictions,
    rf_probabilities,
) = rf_classifier(x_train_final, y_train, x_test_final)

print("3) Random Forest")
evaluate(y_true_rf, rf_predictions, rf_probabilities)

3) Random Forest
Evaluation Metrics: 

- Accuracy Score: 84.62 %
- F1 Macro Score: 83.39 %
- Recall Macro Score: 82.52 %
- ROC AUC Score: 92.20 %


In [7]:

# Pair each test-set id with the logistic-regression prediction and write
# the table to CSV without the index column.
submission_columns = {
    'id': test_data['id'],
    'predictions': logistic_test_predictions,
}
df = pd.DataFrame(submission_columns)

df.to_csv('test_predictions.csv', index=False)