# Bynary Classification

## Library & Dataset Import

In [2]:
import pandas as pd
from sklearn.utils import resample
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from gensim.models import Word2Vec
import numpy as np
import os
import json
import joblib
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from utils.w2v_feature_extraction import compute_w2v_features

df_train = pd.read_csv("../dataset/training_set.csv")
df_train.head()
X_text = df_train["text"]
y_binary = df_train["binary_label"]

## Dataset balancing and Feature Extraction

In [8]:
# --- 1. Undersampling ---
df_balanced = pd.concat([X_text, y_binary], axis=1)
minority_class = df_balanced['binary_label'].value_counts().idxmin()
majority_class = df_balanced['binary_label'].value_counts().idxmax()

minority_df = df_balanced[df_balanced['binary_label'] == minority_class]
majority_df = df_balanced[df_balanced['binary_label'] == majority_class]

majority_downsampled = resample(majority_df,
                                replace=False,
                                n_samples=len(minority_df),
                                random_state=42)

df_undersampled = pd.concat([minority_df, majority_downsampled]).sample(frac=1, random_state=42)  # shuffle

X_text_bal = df_undersampled["text"]
y_bal = df_undersampled["binary_label"]
y_bal = y_bal.map({"cyberbullying": 1, "not_cyberbullying": 0})

print("Distribuzione dopo undersampling:")
print(y_bal.value_counts())

# --- 2. Feature extraction ---

# BoW
bow_vectorizer = CountVectorizer(max_features=350)
X_bow = bow_vectorizer.fit_transform(X_text_bal)

# TF-IDF
tfidf_vectorizer = TfidfVectorizer(max_features=350)
X_tfidf = tfidf_vectorizer.fit_transform(X_text_bal)

# Load Word2Vec models
model1 = Word2Vec.load("../model/word2vec_model1.model")
model2 = Word2Vec.load("../model/word2vec_model2.model")

X_w2v1 = compute_w2v_features(X_text_bal, model1, model1.vector_size)
X_w2v2 = compute_w2v_features(X_text_bal, model2, model2.vector_size)


Distribuzione dopo undersampling:
binary_label
0    6243
1    6243
Name: count, dtype: int64


## GRID search

In [None]:
os.makedirs("../model", exist_ok=True)
results_list = []

# Hyperparameter Grid
param_grid = {
    "LogisticRegression": {
        "model__C": [0.01, 0.1, 1, 10],
        "model__penalty": ["l2"],
        "model__solver": ["lbfgs"]
    },
    "SVM": {
        "model__C": [0.01, 0.1, 1, 10],
        "model__kernel": ["linear", "rbf"],
        "model__gamma": ["scale", "auto"]
    },
    "RandomForest": {
        "model__n_estimators": [100, 200, 400, 500, 1000],
        "model__max_depth": [None, 10, 20],
        "model__min_samples_split": [2, 5],
        "model__min_samples_leaf": [1, 2]
    }
}

# Selected model
models = {
    "LogisticRegression": LogisticRegression(max_iter=1000),
    "SVM": SVC(),
    "RandomForest": RandomForestClassifier()
}

# Vectorization method
datasets = {
    "BoW": X_bow,
    "TF-IDF": X_tfidf,
    "W2V-1": X_w2v1,
    "W2V-2": X_w2v2
}

# Evaluation Metrics
cv_strategy = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
scoring = {
    "accuracy": "accuracy",
    "precision": "precision",
    "recall": "recall",
    "f1": "f1"
}

for vectorizer_name, X in datasets.items():
    for model_name, model in models.items():

        # Pipeline: Word2Vec only
        steps = []
        if "W2V" in vectorizer_name:
            steps.append(("scaler", StandardScaler()))
        steps.append(("model", model))
        pipeline = Pipeline(steps)

        # Grid Search
        grid = GridSearchCV(
            estimator=pipeline,
            param_grid=param_grid[model_name],
            cv=cv_strategy,
            scoring=scoring,
            refit="recall", 
            n_jobs=-1,
            verbose=1
        )

        grid.fit(X, y_bal)

        
        model_path = f"../model/grid_search_binary_recall/{model_name}_{vectorizer_name}.pkl"
        joblib.dump(grid.best_estimator_, model_path)

        best_idx = grid.best_index_
        results_list.append({
            "model": model_name,
            "vectorizer": vectorizer_name,
            "accuracy": grid.cv_results_["mean_test_accuracy"][best_idx],
            "precision": grid.cv_results_["mean_test_precision"][best_idx],
            "recall": grid.cv_results_["mean_test_recall"][best_idx],
            "f1": grid.cv_results_["mean_test_f1"][best_idx]
        })

with open("../model/grid_search_binary_recall/results_grid_search_model.json", "w") as f:
    json.dump(results_list, f, indent=2)

Fitting 10 folds for each of 4 candidates, totalling 40 fits
Fitting 10 folds for each of 16 candidates, totalling 160 fits


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Fitting 10 folds for each of 60 candidates, totalling 600 fits




Fitting 10 folds for each of 4 candidates, totalling 40 fits
Fitting 10 folds for each of 16 candidates, totalling 160 fits


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Fitting 10 folds for each of 60 candidates, totalling 600 fits




Fitting 10 folds for each of 4 candidates, totalling 40 fits
Fitting 10 folds for each of 16 candidates, totalling 160 fits




Fitting 10 folds for each of 60 candidates, totalling 600 fits
Fitting 10 folds for each of 4 candidates, totalling 40 fits
Fitting 10 folds for each of 16 candidates, totalling 160 fits
Fitting 10 folds for each of 60 candidates, totalling 600 fits


## K-fold Cross Validation and Model Evaluation

In [9]:
# Dataset
datasets = {
    "BoW": X_bow,
    "TF-IDF": X_tfidf,
    "W2V-1": X_w2v1,
    "W2V-2": X_w2v2
}
y = y_bal

# Cross-validation setup
cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

# Scoring con gestione zero_division
scoring = {
    "accuracy": "accuracy",
    "precision": make_scorer(precision_score, zero_division=0),
    "recall": make_scorer(recall_score, zero_division=0),
    "f1": make_scorer(f1_score, zero_division=0)
}

# Path ai modelli
model_dir = "../model/grid_search_binary_f1"
eval_results = []

for fname in os.listdir(model_dir):
    if fname.endswith(".pkl") and "_" in fname:
        model_name, vectorizer_name = fname.replace(".pkl", "").split("_", 1)
        model_path = os.path.join(model_dir, fname)
        model = joblib.load(model_path)

        if vectorizer_name not in datasets:
            print(f"Dataset '{vectorizer_name}' not found.")
            continue

        X = datasets[vectorizer_name]

        try:
            scores = cross_validate(
                model,
                X,
                y,
                cv=cv,
                scoring=scoring,
                n_jobs=-1
            )
        except Exception as e:
            print(f"Error evaluating {model_name} with {vectorizer_name}: {e}")
            scores = {}

        # Calcolo delle metriche con nan-safe mean
        result = {
            "model": model_name,
            "vectorizer": vectorizer_name
        }

        for key in ["accuracy", "precision", "recall", "f1"]:
            score_values = scores.get(f"test_{key}", [np.nan])
            mean_score = np.nanmean(score_values)
            result[key] = mean_score

            if np.isnan(mean_score):
                print(f"{model_name} + {vectorizer_name}: '{key}' is NaN")

        eval_results.append(result)

df_eval = pd.DataFrame(eval_results)
df_eval_sorted = df_eval.sort_values(by="f1", ascending=False)

display(df_eval_sorted)



Unnamed: 0,model,vectorizer,accuracy,precision,recall,f1
5,RandomForest,BoW,0.840699,0.898869,0.768059,0.82807
6,SVM,BoW,0.84238,0.92518,0.745154,0.825203
4,RandomForest,TF-IDF,0.843822,0.937677,0.736665,0.824866
2,SVM,TF-IDF,0.838697,0.915397,0.746594,0.822129
0,SVM,W2V-1,0.832049,0.883369,0.765339,0.819896
1,SVM,W2V-2,0.831008,0.881476,0.76534,0.819011
3,LogisticRegression,BoW,0.830287,0.895767,0.747719,0.814866
9,LogisticRegression,TF-IDF,0.821638,0.862745,0.765176,0.81083
11,RandomForest,W2V-1,0.808342,0.848252,0.751244,0.796653
10,RandomForest,W2V-2,0.805459,0.834676,0.761976,0.796519
