# Import

In [32]:
import os
import pickle
import sys
import joblib
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

In [33]:
# Resolve the project root (two directories above the notebook's working directory)
# and put it on sys.path so the project's `src.*` modules can be imported.
project_root = os.path.abspath(os.path.join(os.getcwd(), "..", ".."))
# Guarded so that re-running this cell does not pile up duplicate sys.path entries.
if project_root not in sys.path:
    sys.path.append(project_root)
print(project_root)

e:\2_LEARNING_BKU\2_File_2\K22_HK242\CO3117_Machine_Learning\Main


In [34]:
# Project-local helper modules: `src/features/build_features_utils` and `src/models/models_utils`.
# NOTE(review): FeatureBuilder, used further down in this notebook, is presumably provided by
# the first star import - confirm. Star imports hide where names come from; consider
# replacing them with explicit imports once the set of names actually used is known.
from src.features.build_features_utils import *
from src.models.models_utils import *

# Load

In [52]:
# Single seed shared by the sampling step and the train/test split.
# It is meant to match the value used during training (if applicable).
random_state = 42

# Load the cleaned dataset from <project_root>/data/final/
dataset_path = os.path.join(project_root, "data", "final", "final_clean_no_neutral_no_duplicates.csv")
df = pd.read_csv(dataset_path)

# Remap label 4 to 1 so that the target column uses 1 instead of 4
df["target"] = df["target"].replace(4, 1)

# Draw a fixed-size subsample. The seed comes from `random_state` instead of a second
# hard-coded 42, so the sample and the split cannot drift apart if the seed is edited.
df_sampled = df.sample(n=2000, random_state=random_state)

# Hold out 20% of the sample for evaluation, as done during training
X_train, X_test, y_train, y_test = train_test_split(
    df_sampled["text_clean"], df_sampled["target"], test_size=0.2, random_state=random_state
)


In [53]:
# Sanity check: report how many samples and labels ended up in each split.
print(
    f"Training set size: {len(X_train)}",
    f"Test set size: {len(X_test)}",
    f"Training labels size: {len(y_train)}",
    f"Test labels size: {len(y_test)}",
    sep="\n",
)

Training set size: 1600
Test set size: 400
Training labels size: 1600
Test labels size: 400


# Predict 

In [54]:
# Text representations to evaluate, in the order they are processed.
feature_methods = [
    "count",
    "word2vec",
    "glove",
    "bert",
]

# Filled in by a later cell: feature method name -> transformed test-set features.
X_test_features_dict = dict()

In [55]:
# Predict for each model
for method in feature_methods:
    # Initialize FeatureBuilder
    feature_builder = FeatureBuilder(
        method=method,
        save_dir=os.path.join(project_root, "data", "processed"),
        reduce_dim="pca",  # Assuming you want to reduce dimensions using PCA
        n_components=50
    )

    # Transform test data
    feature_builder.fit(X_train.tolist())
    X_test_features_dict[method] = feature_builder.transform(X_test.tolist())
    print(f"✅ {method} - X_test_features shape: {X_test_features_dict[method].shape}")


✅ count - X_test_features shape: (400, 50)


Processing Word2Vec: 100%|██████████| 400/400 [00:00<00:00, 2877.13document/s]


✅ word2vec - X_test_features shape: (400, 50)


Processing GloVe: 100%|██████████| 400/400 [00:00<00:00, 2378.17document/s]


✅ glove - X_test_features shape: (400, 50)


Processing BERT: 100%|██████████| 400/400 [00:11<00:00, 34.40document/s]


✅ bert - X_test_features shape: (400, 50)


In [56]:
# Model identifiers; each one is combined with a feature method to locate the
# saved estimator file "best_<model>_<method>.pkl".
model_names = [
    "decision_tree",
    "logistic_regression",
    "random_forest",
    "xgboost",
    "perceptron",
    "bayesian_enhanced",
    "svm",
]

In [57]:
# Predict for each model
for model_name in model_names:
    for method in feature_methods:
        # Load the saved model
        model_filename = os.path.join(project_root, "src", "models", f"best_{model_name}_{method}.pkl")
        with open(model_filename, 'rb') as model_file:
            model = joblib.load(model_file)

        # Make predictions
        y_pred = model.predict(X_test_features_dict[method])

        # Compute metrics
        accuracy = accuracy_score(y_test, y_pred)
        precision = precision_score(y_test, y_pred, average='binary')
        recall = recall_score(y_test, y_pred, average='binary')
        f1 = f1_score(y_test, y_pred, average='binary')
        
        # ROC AUC can be computed if the model outputs probabilities
        # Handle models that do not support `predict_proba`
        if hasattr(model, "predict_proba"):
            y_prob = model.predict_proba(X_test_features_dict[method])[:, 1]  # Take the positive class probabilities
            roc_auc = roc_auc_score(y_test, y_prob)
        else:
            roc_auc = "N/A"  # Not applicable for models like Perceptron

        # Print metrics
        print(f"Model: {model_name}")
        print(f"Method: {method}")
        print("-" * 50)
        print(f"Accuracy: {accuracy:.4f}")
        print(f"Precision: {precision:.4f}")
        print(f"Recall: {recall:.4f}")
        print(f"F1 Score: {f1:.4f}")
        if hasattr(model, "predict_proba"):
            print(f"ROC AUC: {roc_auc:.4f}")
        else:
            print("ROC AUC: N/A")
        print("-" * 50)
    print("%" * 50)

Model: decision_tree
Method: count
--------------------------------------------------
Accuracy: 0.4775
Precision: 0.4719
Recall: 0.4221
F1 Score: 0.4456
ROC AUC: 0.4572
--------------------------------------------------
Model: decision_tree
Method: word2vec
--------------------------------------------------
Accuracy: 0.4700
Precision: 0.4714
Recall: 0.5377
F1 Score: 0.5023
ROC AUC: 0.4715
--------------------------------------------------
Model: decision_tree
Method: glove
--------------------------------------------------
Accuracy: 0.4950
Precision: 0.4919
Recall: 0.4573
F1 Score: 0.4740
ROC AUC: 0.4966
--------------------------------------------------
Model: decision_tree
Method: bert
--------------------------------------------------
Accuracy: 0.4975
Precision: 0.4951
Recall: 0.5075
F1 Score: 0.5012
ROC AUC: 0.4857
--------------------------------------------------
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
Model: logistic_regression
Method: count
--------------------------