# Ensemble method

In [1]:
# imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, classification_report
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler

from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

import sys
sys.path.append("../scripts")
from data_cleaning import clean_data

In [2]:
def _clean_and_save(src_path, dst_path):
    """Read the CSV at src_path, run clean_data on it, and write the result to dst_path.

    Returns a (raw, cleaned) pair so both frames stay available to later cells.
    What the cleaning consists of is defined in scripts/data_cleaning.py.
    """
    raw = pd.read_csv(src_path)
    cleaned = clean_data(raw)
    cleaned.to_csv(dst_path, index=False)  # index=False: do not write the row index as a column
    return raw, cleaned


# Training split first, then the test split; each is read, cleaned and written in turn.
train_raw, train_clean = _clean_and_save("../data/train.csv", "../data/train_clean.csv")
test_raw, test_clean = _clean_and_save("../data/test.csv", "../data/test_clean.csv")

print("Saved train_clean.csv and test_clean.csv")

Saved train_clean.csv and test_clean.csv


In [3]:
# Load the cleaned training data (the train_clean.csv written earlier in this notebook).
df = pd.read_csv('../data/train_clean.csv')

# Missing comments become empty strings so the text vectorizer always receives str values.
df["comment_text"] = df["comment_text"].fillna("").astype(str)

# Build the binary classification target from rating_rejected.
# A missing target would otherwise be silently mapped to class 0 by the comparison
# below, so fail loudly instead.
if df["rating_rejected"].isna().any():
    raise ValueError("rating_rejected contains missing values; cannot derive a binary label")

# Threshold at 0.5 so a continuous score in [0.0, 1.0] becomes a 0/1 integer label.
# If the column is already 0/1 (or boolean), this leaves the classes unchanged.
df["label"] = (df["rating_rejected"] >= 0.5).astype(int)

In [4]:
# Name of the free-text column; reused by the preprocessing step further down.
text_col = "comment_text"

# Replace missing comments with "" and force every value to str.
df[text_col] = df[text_col].fillna("").astype(str)

# Full target series (before any train/validation split).
y = df["label"]

# Every remaining column is treated as an extra (non-text) feature.
# The text column, the derived label and the raw target column are left out.
non_feature_cols = (text_col, "label", "rating_rejected")
cols = [name for name in df.columns if name not in non_feature_cols]

In [5]:
# Split the row index (not the data itself) 80/20 into train and validation.
# stratify keeps the label proportions the same in both parts; the fixed seed
# makes the split reproducible.
train_idx, val_idx = train_test_split(
    df.index, test_size=0.2, random_state=42, stratify=df["label"]
)

# Targets for each part, selected by index label.
y_train = df.loc[train_idx, "label"]
y_val = df.loc[val_idx, "label"]

In [6]:
# "both" = the comment text column plus all extra feature columns in `cols`.
feature_cols = ["comment_text", *cols]

X_train_both = df.loc[train_idx, feature_cols]
X_val_both = df.loc[val_idx, feature_cols]

## Random Forest baseline (comment text + numeric features)

In [7]:
# TF-IDF over unigrams and bigrams, English stop words removed.
# min_df=5 drops terms seen in fewer than 5 documents; the vocabulary is capped at 5000 terms.
tfidf_vectorizer = TfidfVectorizer(
    stop_words="english", ngram_range=(1, 2), min_df=5, max_features=5000
)

# The text column goes through TF-IDF; the columns in `cols` are passed through untouched.
# text_col is given as a plain string (not a list) so the vectorizer receives a
# 1-D column of documents rather than a 2-D frame.
preprocess_rf = ColumnTransformer(
    transformers=[
        ("text", tfidf_vectorizer, text_col),
        ("num", "passthrough", cols),
    ]
)

In [None]:
# Random forest with a fixed seed; class weights are recomputed per bootstrap
# sample ("balanced_subsample").
rf_classifier = RandomForestClassifier(
    random_state=42, n_jobs=-1, class_weight="balanced_subsample"
)

# Preprocessing and model live in one pipeline, so the TF-IDF vocabulary is
# refit on each CV training fold.
rf_pipe = Pipeline(steps=[("preprocess", preprocess_rf), ("rf", rf_classifier)])

# 2 * 3 * 2 * 2 = 24 hyper-parameter combinations; the "rf__" prefix routes each
# parameter to the pipeline step named "rf".
param_grid_rf = {
    "rf__n_estimators": [100, 200],
    "rf__max_depth": [None, 20, 50],
    "rf__min_samples_split": [2, 10],
    "rf__min_samples_leaf": [1, 5],
}

# 3-fold CV over the grid (72 fits in total), selecting on F1.
# NOTE(review): both the forest and the search request n_jobs=-1; confirm this
# does not oversubscribe the machine.
grid_rf = GridSearchCV(
    rf_pipe,
    param_grid_rf,
    scoring="f1",
    cv=3,
    n_jobs=-1,
    verbose=2,
)
grid_rf.fit(X_train_both, y_train)

print("Best params (RF):", grid_rf.best_params_)
print("Best CV F1 (RF):", grid_rf.best_score_)

Fitting 3 folds for each of 24 candidates, totalling 72 fits


In [None]:
# Pipeline with the best hyper-parameters found by the grid search.
best_rf = grid_rf.best_estimator_

# Predict on the held-out validation rows.
y_val_pred_rf = best_rf.predict(X_val_both)

val_accuracy_rf = accuracy_score(y_val, y_val_pred_rf)
print("Validation accuracy (RF tuned):", val_accuracy_rf)

# Per-class precision / recall / F1, printed to 4 decimal places.
print("\nValidation classification report (RF tuned):\n")
val_report_rf = classification_report(y_val, y_val_pred_rf, digits=4)
print(val_report_rf)

In [None]:
# Validation confusion matrix for the tuned random forest.
cm = confusion_matrix(y_val, y_val_pred_rf)

# Tick labels are set explicitly to 0 and 1.
disp = ConfusionMatrixDisplay(cm, display_labels=[0, 1])
disp.plot()

plt.title("Tuned Random Forest - Validation Confusion Matrix")
plt.show()