In [None]:
# make sure we are on correct path 
import sys
import os

# The project root is taken to be the parent of the current working directory.
parent_of_cwd = os.path.join(os.getcwd(), os.pardir)
project_root = os.path.abspath(parent_of_cwd)
# Register the root for imports; the membership check keeps re-runs of this
# cell from appending the same entry again.
if project_root not in sys.path:
    sys.path.append(project_root)

In [2]:
# imports and helpers
import joblib
import numpy as np
import pandas as pd
from sklearn.base import clone
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.svm import SVC

from helpers.visualization_helpers import plot_confusion_matrix, display_classification_report

In [None]:
# Paths of the joblib artifacts used for each vectorization method.
# Every method has its own train features/labels, test features and fitted
# vectorizer; the test labels come from one shared file.
vectorization_methods = ['tfidf', 'bow', 'spacy']
data_paths = {
    'tfidf': {
        'X_train': '../outputs/models/X_train_tfidf_balanced.joblib',
        'y_train': '../outputs/models/y_train_tfidf_balanced.joblib',
        'X_test': '../outputs/models/X_test_tfidf.joblib',
        'y_test': '../outputs/models/y_test.joblib',
        'vectorizer': '../outputs/models/tfidf_vectorizer.joblib'
    },
    'bow': {
        'X_train': '../outputs/models/X_train_bow_balanced.joblib',
        'y_train': '../outputs/models/y_train_bow_balanced.joblib',
        # Test features must be produced by the same vectorizer as the training
        # features; scoring a BoW-trained model on the TF-IDF matrix gives
        # meaningless metrics.
        # NOTE(review): confirm the preprocessing step writes X_test_bow.joblib.
        'X_test': '../outputs/models/X_test_bow.joblib',
        'y_test': '../outputs/models/y_test.joblib',
        'vectorizer': '../outputs/models/bow_vectorizer.joblib'
    },
    'spacy': {
        'X_train': '../outputs/models/X_train_spacy_balanced.joblib',
        'y_train': '../outputs/models/y_train_spacy_balanced.joblib',
        # Same reasoning as for 'bow': use the spaCy-derived test features.
        # NOTE(review): confirm the preprocessing step writes X_test_spacy.joblib.
        'X_test': '../outputs/models/X_test_spacy.joblib',
        'y_test': '../outputs/models/y_test.joblib',
        'vectorizer': '../outputs/models/spacy_model.joblib'
    }
}

In [4]:
# Load every artifact into a nested dictionary: loaded_data[method][artifact],
# where artifact is one of the keys listed under data_paths[method].
loaded_data = {
    method: {
        artifact: joblib.load(artifact_path)
        for artifact, artifact_path in data_paths[method].items()
    }
    for method in vectorization_methods
}

print("Preprocessed data loaded successfully!")

Preprocessed data loaded successfully!


In [5]:
# Cell 4: Define Models to Train

# One seed shared by all estimators so repeated runs are comparable.
RANDOM_STATE = 21

# Display name -> unfitted estimator.
models = {
    'Logistic Regression': LogisticRegression(max_iter=1000, random_state=RANDOM_STATE),
    'Random Forest': RandomForestClassifier(random_state=RANDOM_STATE),
    'Support Vector Machine': SVC(probability=True, random_state=RANDOM_STATE),
}


In [None]:
# Cell 5: Train and Evaluate Models
#
# For every vectorization method and every estimator in `models`: fit on that
# method's training data, score accuracy on its test data, save the fitted
# estimator to disk, and record the score. Produces `results` (DataFrame) and
# `trained_models` (dict keyed by (method, model_name)).

# One record per finished (vectorization, model) run; the DataFrame is built
# once from these records so 'Accuracy' ends up with a numeric dtype.
result_records = []

# Dictionary to store trained models for potential future use
trained_models = {}

try:
    for method in vectorization_methods:
        split = loaded_data[method]
        for model_name, model_template in models.items():
            print(f"\nTraining {model_name} with {method.upper()} Vectorization...")

            # Fit a fresh copy for each run. Re-fitting the shared instance from
            # `models` would make every trained_models entry for this model
            # name point at one object fitted on the last vectorization only.
            model = clone(model_template)
            model.fit(split['X_train'], split['y_train'])

            # Predict on test set
            y_pred = model.predict(split['X_test'])

            # Calculate accuracy
            accuracy = accuracy_score(split['y_test'], y_pred)
            print(f"Accuracy: {accuracy:.4f}")

            result_records.append({
                'Vectorization': method.upper(),
                'Model': model_name,
                'Accuracy': accuracy
            })

            # Save the model with a descriptive name
            model_save_path = f"../outputs/models/{model_name.replace(' ', '_')}_{method.upper()}.joblib"
            joblib.dump(model, model_save_path)
            print(f"Model saved to {model_save_path}")

            # Store trained model
            trained_models[(method, model_name)] = model
finally:
    # Built in `finally` so that runs completed before an interruption
    # (e.g. KeyboardInterrupt during a slow fit) are still available.
    results = pd.DataFrame(result_records, columns=['Vectorization', 'Model', 'Accuracy'])

print("\nAll models trained and evaluated.")


Training Logistic Regression with TFIDF Vectorization...
Accuracy: 0.5872
Model saved to ../outputs/models/Logistic_Regression_TFIDF.joblib

Training Random Forest with TFIDF Vectorization...
Accuracy: 0.5387
Model saved to ../outputs/models/Random_Forest_TFIDF.joblib

Training Support Vector Machine with TFIDF Vectorization...


In [None]:
# Cell 6: Display Results

# Show the runs ranked from highest to lowest accuracy with a fresh 0..n-1 index.
print("\nModel Performance Comparison:")
display(results.sort_values('Accuracy', ascending=False, ignore_index=True))

In [None]:
# Cell 7: Identify and Save the Best Model

# Nothing can be ranked if no model finished training; fail with a clear message.
if results.empty:
    raise ValueError("No results available: train at least one model before selecting the best one.")

# Identify the best-performing model.
# Cast to float first: a results frame filled row by row can hold 'Accuracy'
# as object dtype, on which pandas may refuse to compute idxmax.
best_row = results.loc[results['Accuracy'].astype(float).idxmax()]
best_vectorization = best_row['Vectorization']
best_model_name = best_row['Model']
best_accuracy = best_row['Accuracy']

print(f"\nBest Model: {best_model_name} with {best_vectorization} Vectorization")
print(f"Accuracy: {best_accuracy:.4f}")

In [None]:
# Fetch the fitted estimator of the winning run. `trained_models` is keyed by
# the lower-case method name, while `best_vectorization` is upper-case.
best_key = (best_vectorization.lower(), best_model_name)
best_model = trained_models[best_key]

# Write an additional copy of the winner under a "Best_" file name.
best_model_save_path = f"../outputs/models/Best_{best_model_name.replace(' ', '_')}_{best_vectorization}.joblib"
joblib.dump(best_model, best_model_save_path)
print(f"Best model saved to {best_model_save_path}")


In [None]:
# Cell 8: Evaluate the Best Model

# Loaded data belonging to the winning vectorization (keys are lower-case).
best_split = loaded_data[best_vectorization.lower()]

# Predictions of the best model on that method's test features
y_pred_best = best_model.predict(best_split['X_test'])

# Report the classification metrics against the test labels
display_classification_report(best_split['y_test'], y_pred_best)


Preprocessed data loaded successfully!

Training Logistic Regression with TFIDF Vectorization...
Accuracy: 0.5872
Model saved to ../outputs/models/Logistic_Regression_TFIDF.joblib

Training Random Forest with TFIDF Vectorization...


KeyboardInterrupt: 

In [None]:
# Plot confusion matrix
classes = sorted(loaded_data[best_vectorization.lower()]['y_test'].unique())
plot_confusion_matrix(
    y_true=loaded_data[best_vectorization.lower()]['y_test'], 
    y_pred=y_pred_best, 
    classes=classes, 
    title=f"{best_model_name} with {best_vectorization} Vectorization Confusion Matrix",
    save_path=f"../outputs/figures/{best_model_name.replace(' ', '_')}_{best_vectorization}_confusion_matrix.png"