# Multiclass Classification

## Library and Dataset import

In [2]:
import json
import os
import pickle

import joblib
import numpy as np
import pandas as pd
from gensim.models import Word2Vec
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, make_scorer, precision_score, recall_score
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.model_selection import cross_validate
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.utils import resample

from utils.w2v_feature_extraction import compute_w2v_features

# Read the training split; the text column and the label column are both
# taken from this frame.
df_train = pd.read_csv("../dataset/training_set.csv")
df_train.head()
# NOTE: despite its name, y_binary holds the "multiclass_label" column.
X_text, y_binary = df_train["text"], df_train["multiclass_label"]

## Not_cyberbullying pruning & Feature Extraction

In [4]:
# --- 1. not_cyberbullying pruning ---

# Keep only the rows whose label is not "not_cyberbullying".
is_bullying = df_train["multiclass_label"] != "not_cyberbullying"
df_filtered = df_train.loc[is_bullying].copy()

# Class names in sorted order are mapped to consecutive integer ids
# (and back again through id2label).
_sorted_labels = sorted(df_filtered["multiclass_label"].unique())
label2id = dict(zip(_sorted_labels, range(len(_sorted_labels))))
id2label = dict(enumerate(_sorted_labels))

df_filtered["label_id"] = df_filtered["multiclass_label"].map(label2id)

X_text_pruned, y_multiclass = df_filtered["text"], df_filtered["label_id"]

print("Final Classes:", label2id)
print("Classe Distribution:")
print(df_filtered["multiclass_label"].value_counts())

# --- 2. Feature extraction ---


def _fit_and_store_vocabulary(vectorizer, texts, vocabulary_path):
    """Fit *vectorizer* on *texts*, pickle its ``vocabulary_`` and return the features.

    Only the vocabulary mapping is written to *vocabulary_path*, not the
    fitted vectorizer object itself.
    """
    features = vectorizer.fit_transform(texts)
    with open(vocabulary_path, "wb") as handle:
        pickle.dump(vectorizer.vocabulary_, handle)
    return features


# BoW (vocabulary capped at 350 terms)
bow_vectorizer = CountVectorizer(max_features=350)
X_bow = _fit_and_store_vocabulary(
    bow_vectorizer, X_text_pruned, "../model/bow_vocabulary.pkl"
)

# TF-IDF (same vocabulary cap)
tfidf_vectorizer = TfidfVectorizer(max_features=350)
X_tfidf = _fit_and_store_vocabulary(
    tfidf_vectorizer, X_text_pruned, "../model/tfidf_vocabulary.pkl"
)

# Word2Vec: load each saved model and build its feature matrix through the
# project helper compute_w2v_features (defined in utils; its exact output
# format is not visible from this cell).
model1 = Word2Vec.load("../model/word2vec_model1.model")
X_w2v1 = compute_w2v_features(X_text_pruned, model1, model1.vector_size)

model2 = Word2Vec.load("../model/word2vec_model2.model")
X_w2v2 = compute_w2v_features(X_text_pruned, model2, model2.vector_size)

Final Classes: {'age': 0, 'ethnicity': 1, 'gender': 2, 'other_cyberbullying': 3, 'religion': 4}
Classe Distribution:
multiclass_label
religion               6398
age                    6393
ethnicity              6368
gender                 6354
other_cyberbullying    6081
Name: count, dtype: int64


## Grid Search

In [None]:
# Exhaustive grid search over every (feature set, classifier) pair.
# For each pair the best pipeline (by weighted F1) is saved with joblib and
# its cross-validated metrics are collected into results_multiclass.json.

# Make sure the output folder exists. The models are written to the
# grid_search_multiclass sub-folder, so that is the directory that must be
# created (makedirs also creates the parent "../model").
output_dir = "../model/grid_search_multiclass"
os.makedirs(output_dir, exist_ok=True)
results_list = []

# Hyper-parameter grids for the multiclass task.
_C_VALUES = [0.01, 0.1, 1, 10]
param_grid = {
    # "multi_class" is intentionally not set: with the lbfgs solver and more
    # than two classes the default already fits a multinomial model, and the
    # explicit parameter is deprecated in recent scikit-learn releases.
    "LogisticRegression": {
        "model__C": _C_VALUES,
        "model__penalty": ["l2"],
        "model__solver": ["lbfgs"],
    },
    # gamma is only used by the rbf kernel, so it is searched only there;
    # crossing it with the linear kernel would just repeat identical fits.
    "SVM": [
        {
            "model__kernel": ["linear"],
            "model__C": _C_VALUES,
        },
        {
            "model__kernel": ["rbf"],
            "model__C": _C_VALUES,
            "model__gamma": ["scale", "auto"],
        },
    ],
    "RandomForest": {
        "model__n_estimators": [100, 200, 400, 500],
        "model__max_depth": [None, 10, 20],
        "model__min_samples_split": [2, 5],
        "model__min_samples_leaf": [1, 2],
    },
}

models = {
    "LogisticRegression": LogisticRegression(max_iter=1000),
    "SVM": SVC(),
    # Seeded so that repeated runs select the same forest, in line with the
    # seeded CV splitter below.
    "RandomForest": RandomForestClassifier(random_state=42),
}

datasets = {
    "BoW": X_bow,
    "TF-IDF": X_tfidf,
    "W2V-1": X_w2v1,
    "W2V-2": X_w2v2,
}

# Multiclass labels
y = y_multiclass

# Cross-validation setup
cv_strategy = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

# Multiclass scoring (weighted averages: useful when the classes are not
# perfectly balanced)
scoring = {
    "accuracy": "accuracy",
    "precision": make_scorer(precision_score, average="weighted", zero_division=0),
    "recall": make_scorer(recall_score, average="weighted", zero_division=0),
    "f1": make_scorer(f1_score, average="weighted", zero_division=0),
}

for vectorizer_name, X in datasets.items():
    for model_name, model in models.items():

        # Only the Word2Vec feature sets are standardised before the model.
        steps = []
        if "W2V" in vectorizer_name:
            steps.append(("scaler", StandardScaler()))
        steps.append(("model", model))
        pipeline = Pipeline(steps)

        grid = GridSearchCV(
            estimator=pipeline,
            param_grid=param_grid[model_name],
            cv=cv_strategy,
            scoring=scoring,
            refit="f1",  # the pipeline with the best weighted F1 is refitted
            n_jobs=-1,
            verbose=0,
        )

        grid.fit(X, y)

        model_path = f"{output_dir}/{model_name}_{vectorizer_name}_multiclass.pkl"
        joblib.dump(grid.best_estimator_, model_path)

        # Report the mean CV metrics of the selected configuration as plain
        # Python floats so they serialise cleanly to JSON.
        best_idx = grid.best_index_
        results_list.append({
            "model": model_name,
            "vectorizer": vectorizer_name,
            "accuracy": float(grid.cv_results_["mean_test_accuracy"][best_idx]),
            "precision": float(grid.cv_results_["mean_test_precision"][best_idx]),
            "recall": float(grid.cv_results_["mean_test_recall"][best_idx]),
            "f1": float(grid.cv_results_["mean_test_f1"][best_idx]),
        })

with open(f"{output_dir}/results_multiclass.json", "w") as f:
    json.dump(results_list, f, indent=2)

## K-fold Cross Validation and Model Evaluation

In [None]:
# Re-evaluate every saved grid-search pipeline with 10-fold stratified
# cross-validation and show the results ranked by weighted F1.

# Dataset
datasets = {
    "BoW": X_bow,
    "TF-IDF": X_tfidf,
    "W2V-1": X_w2v1,
    "W2V-2": X_w2v2,
}
y = y_multiclass

cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

scoring = {
    "accuracy": "accuracy",
    "precision": make_scorer(precision_score, average="weighted", zero_division=0),
    "recall": make_scorer(recall_score, average="weighted", zero_division=0),
    "f1": make_scorer(f1_score, average="weighted", zero_division=0),
}

model_dir = "../model/grid_search_multiclass"
metric_names = ["accuracy", "precision", "recall", "f1"]
eval_results = []

# Sorted so that the evaluation order does not depend on the file system.
for fname in sorted(os.listdir(model_dir)):
    if not (fname.endswith(".pkl") and "_" in fname):
        continue

    # Saved files are named "<model>_<vectorizer>_multiclass.pkl". Drop the
    # extension and the trailing "_multiclass" tag before splitting, otherwise
    # the vectorizer would be read as e.g. "BoW_multiclass" and never match a
    # key of `datasets`.
    stem = fname[: -len(".pkl")]
    if stem.endswith("_multiclass"):
        stem = stem[: -len("_multiclass")]
    model_name, _, vectorizer_name = stem.partition("_")

    if vectorizer_name not in datasets:
        print(f"Dataset '{vectorizer_name}' not found.")
        continue

    # Load the pipeline only once we know there is a dataset to evaluate it on.
    model = joblib.load(os.path.join(model_dir, fname))
    X = datasets[vectorizer_name]

    # Best effort: a failing model is reported and recorded as NaN instead of
    # aborting the evaluation of the remaining models.
    try:
        scores = cross_validate(
            model,
            X,
            y,
            cv=cv,
            scoring=scoring,
            n_jobs=-1,
        )
    except Exception as e:
        print(f"Error evaluating {model_name} with {vectorizer_name}: {e}")
        scores = {}

    # NaN-safe mean of the per-fold scores for each metric
    result = {
        "model": model_name,
        "vectorizer": vectorizer_name,
    }

    for key in metric_names:
        fold_scores = scores.get(f"test_{key}")
        # No fold scores (failed evaluation): record NaN directly rather than
        # averaging an all-NaN list, which would emit a RuntimeWarning.
        mean_score = np.nan if fold_scores is None else np.nanmean(fold_scores)
        result[key] = mean_score

        if np.isnan(mean_score):
            print(f" {model_name} + {vectorizer_name}: '{key}' is NaN")

    eval_results.append(result)

# Show the results. Explicit columns keep the sort below valid even when no
# model file was evaluated.
df_eval = pd.DataFrame(eval_results, columns=["model", "vectorizer"] + metric_names)
df_eval_sorted = df_eval.sort_values(by="f1", ascending=False)

display(df_eval_sorted)
