In [30]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import time
import os

In [31]:
def load_and_preprocess_data(file_path=None):
    """Load the cinnamon quality CSV, clean it, and return scaled train/test splits.

    Pipeline: read the CSV, median-impute numeric feature columns, drop rows
    without a ``Quality_Label``, drop rows where any numeric feature lies 3 or
    more standard deviations from its column mean, label-encode the target,
    make a stratified 80/20 split, and standardise the features using
    statistics fitted on the training split only.

    Args:
        file_path: Path of the CSV file to load. When omitted, the original
            hard-coded dataset location is used.

    Returns:
        Tuple ``(X_train_scaled, X_test_scaled, y_train, y_test, le)`` where
        ``le`` is the ``LabelEncoder`` fitted on ``Quality_Label``.
    """
    if file_path is None:
        file_path = 'G:/My Drive/github/msc-ai-cw/dataset/processed/cinnamon_quality_dataset.csv'

    # Load the dataset
    print("Loading dataset...")
    data = pd.read_csv(file_path)
    print(f"Initial dataset shape: {data.shape}")

    # Handle null values
    print("Handling missing values...")
    missing_before = data.isnull().sum().sum()
    print(f"Missing values before cleaning: {missing_before}")

    # Numeric *feature* columns only. The identifier and the target are left
    # out so that a numeric target is never median-imputed (which would defeat
    # the dropna on the target below) or z-scored.
    num_cols = [
        col for col in data.select_dtypes(include=[np.number]).columns
        if col not in ('Sample_ID', 'Quality_Label')
    ]

    # NOTE(review): imputation and outlier statistics are computed on the full
    # dataset, i.e. before the train/test split below.
    data[num_cols] = data[num_cols].fillna(data[num_cols].median())

    # Drop rows with missing target
    data = data.dropna(subset=['Quality_Label'])

    missing_after = data.isnull().sum().sum()
    print(f"Missing values after cleaning: {missing_after}")
    print(f"Dataset shape after handling missing values: {data.shape}")

    # Remove noise/outliers using Z-score
    print("Removing outliers using Z-score method...")

    # A column with zero (or undefined) standard deviation produces NaN
    # z-scores; NaN < 3 is False, which would flag every row as an outlier.
    # Such columns cannot contain outliers, so they are skipped here.
    col_std = data[num_cols].std()
    varying_cols = col_std[col_std > 0].index.tolist()
    z_scores = np.abs(
        (data[varying_cols] - data[varying_cols].mean()) / col_std[varying_cols]
    )

    # Keep only rows where every checked feature has |z| < 3
    outlier_mask = (z_scores < 3).all(axis=1)
    outliers_removed = len(data) - outlier_mask.sum()

    data = data[outlier_mask]

    print(f"Outliers removed: {outliers_removed}")
    print(f"Final dataset shape: {data.shape}")

    # Separate features and target
    X = data.drop(columns=["Sample_ID", "Quality_Label"])
    y = data["Quality_Label"]

    # Check class distribution
    print("\nClass distribution:")
    print(y.value_counts().sort_index())

    # Encode target labels
    le = LabelEncoder()
    y_encoded = le.fit_transform(y)

    # Train-test split (stratified so both splits keep the class proportions)
    X_train, X_test, y_train, y_test = train_test_split(
        X, y_encoded, test_size=0.2, random_state=42, stratify=y_encoded
    )

    # Feature scaling: fit on the training split only, then apply to both
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    return X_train_scaled, X_test_scaled, y_train, y_test, le

In [32]:
def train_logistic_regression(X_train, y_train):
    """Fit a logistic regression classifier and measure how long the fit takes.

    Args:
        X_train: Training feature matrix.
        y_train: Training target labels.

    Returns:
        Tuple ``(model, training_time)`` with the fitted ``LogisticRegression``
        and the elapsed time in seconds.
    """
    print("Training Logistic Regression...")
    # perf_counter is the high-resolution clock intended for interval timing.
    start_time = time.perf_counter()

    # `multi_class` is intentionally not passed: the parameter is deprecated
    # since scikit-learn 1.5, and the lbfgs solver already fits a multinomial
    # model when the target has three or more classes.
    log_reg = LogisticRegression(
        solver='lbfgs',
        max_iter=10000,
        random_state=42
    )

    log_reg.fit(X_train, y_train)

    training_time = time.perf_counter() - start_time
    return log_reg, training_time

In [33]:
def train_random_forest(X_train, y_train):
    """Fit a random forest classifier and measure how long the fit takes.

    Args:
        X_train: Training feature matrix.
        y_train: Training target labels.

    Returns:
        Tuple ``(model, training_time)`` with the fitted
        ``RandomForestClassifier`` and the elapsed time in seconds.
    """
    print("Training Random Forest...")
    # perf_counter is the high-resolution clock intended for interval timing.
    start_time = time.perf_counter()

    rf = RandomForestClassifier(
        n_estimators=300,
        max_depth=15,
        min_samples_split=5,
        min_samples_leaf=2,
        random_state=42,  # fixed seed so repeated runs build the same forest
        n_jobs=-1         # use all available cores
    )

    rf.fit(X_train, y_train)

    training_time = time.perf_counter() - start_time
    return rf, training_time

In [34]:
def train_svm(X_train, y_train):
    """Tune an RBF-kernel SVM with a 5-fold grid search and time the search.

    Args:
        X_train: Training feature matrix.
        y_train: Training target labels.

    Returns:
        Tuple ``(best_svm, training_time, best_params)``: the refitted best
        estimator, the elapsed time of the whole search in seconds, and the
        winning parameter dict.
    """
    print("Training SVM with Grid Search...")
    # perf_counter is the high-resolution clock intended for interval timing.
    start_time = time.perf_counter()

    param_grid = {
        'C': [1, 10],
        'gamma': ['scale', 0.1],
        'kernel': ['rbf']
    }

    # NOTE(review): probability=True is kept so the returned model still
    # offers predict_proba; per the scikit-learn docs it slows down fitting.
    # Drop it only if no caller needs probability estimates.
    grid_search = GridSearchCV(
        SVC(probability=True, random_state=42),
        param_grid,
        cv=5,
        scoring='accuracy',
        verbose=0,
        n_jobs=-1
    )

    grid_search.fit(X_train, y_train)
    best_svm = grid_search.best_estimator_

    training_time = time.perf_counter() - start_time
    return best_svm, training_time, grid_search.best_params_

In [35]:
def evaluate_model(model, X_test, y_test, model_name, training_time, le):
    """Predict on the test set, print the evaluation metrics, and return a summary.

    Args:
        model: Fitted estimator exposing ``predict``.
        X_test: Test feature matrix.
        y_test: True test labels; assumed to be encoded with ``le`` (integers
            ``0 .. len(le.classes_) - 1``) — confirm against the caller.
        model_name: Display name used in the printed report and the result.
        training_time: Training duration in seconds, reported as-is.
        le: Fitted label encoder; ``le.classes_`` supplies the class names.

    Returns:
        Dict with keys ``model_name``, ``accuracy``, ``training_time``,
        ``prediction_time`` and ``y_pred``.
    """
    print(f"\n{'='*50}")
    print(f"{model_name} Results")
    print(f"{'='*50}")

    # Predictions, timed with the high-resolution interval clock so that very
    # fast models do not report a duration of zero.
    start_time = time.perf_counter()
    y_pred = model.predict(X_test)
    prediction_time = time.perf_counter() - start_time

    # Calculate metrics
    accuracy = accuracy_score(y_test, y_pred)

    print(f"Training Time: {training_time:.2f} seconds")
    print(f"Prediction Time: {prediction_time:.4f} seconds")
    print(f"Accuracy: {accuracy*100:.2f}%")

    # Pin the label set to every encoded class so the report rows and matrix
    # axes stay aligned with le.classes_ even if a class is absent from
    # y_test / y_pred.
    class_ids = list(range(len(le.classes_)))

    print("\nClassification Report:")
    print(classification_report(y_test, y_pred, labels=class_ids, target_names=le.classes_))

    print("\nConfusion Matrix:")
    print(confusion_matrix(y_test, y_pred, labels=class_ids))

    return {
        'model_name': model_name,
        'accuracy': accuracy,
        'training_time': training_time,
        'prediction_time': prediction_time,
        'y_pred': y_pred
    }


In [36]:
def compare_models_summary(results):
    """Print a comparison table of model metrics and name the most accurate model.

    Args:
        results: Iterable of dicts, each providing ``model_name``,
            ``accuracy`` (fraction in [0, 1] as used by the percentage
            formatting), ``training_time`` and ``prediction_time``.

    Returns:
        pandas.DataFrame with one row per result and the string-formatted
        columns ``Model``, ``Accuracy (%)``, ``Training Time (s)`` and
        ``Prediction Time (s)``. When ``results`` is empty the frame has the
        same columns and no rows.
    """
    print(f"\n{'='*70}")
    print("MODELS COMPARISON SUMMARY")
    print(f"{'='*70}")

    # Materialise once: the rows are built from it and max() reads it again,
    # which would fail on a one-shot iterator.
    results = list(results)

    columns = ['Model', 'Accuracy (%)', 'Training Time (s)', 'Prediction Time (s)']

    # Create comparison DataFrame (explicit columns keep the schema stable
    # even when there are no rows)
    comparison_df = pd.DataFrame(
        [
            {
                'Model': result['model_name'],
                'Accuracy (%)': f"{result['accuracy']*100:.2f}",
                'Training Time (s)': f"{result['training_time']:.2f}",
                'Prediction Time (s)': f"{result['prediction_time']:.4f}"
            }
            for result in results
        ],
        columns=columns,
    )

    # max() raises ValueError on an empty sequence, so bail out early.
    if not results:
        print("No model results to compare.")
        return comparison_df

    print(comparison_df.to_string(index=False))

    # Find best model
    best_model = max(results, key=lambda x: x['accuracy'])
    print(f"\nBest Model: {best_model['model_name']} with {best_model['accuracy']*100:.2f}% accuracy")

    return comparison_df

In [37]:
def main():
    """Run the whole experiment: prepare data, fit three classifiers, compare them.

    Logistic regression, random forest and a grid-searched SVM are trained in
    that order; each is evaluated on the held-out test split right after it is
    trained, and a summary table is printed at the end.

    Returns:
        Tuple ``(results, comparison_summary)``: the list of per-model result
        dicts and the comparison DataFrame.
    """
    print("Starting Classical ML Algorithms Comparison for Cinnamon Quality Classification")
    print("=" * 80)

    # --- Data ---------------------------------------------------------------
    print("Loading and preprocessing data...")
    X_train, X_test, y_train, y_test, le = load_and_preprocess_data()

    print("Dataset Info:")
    print(f"- Training samples: {X_train.shape[0]}")
    print(f"- Test samples: {X_test.shape[0]}")
    print(f"- Features: {X_train.shape[1]}")
    print(f"- Classes: {le.classes_}")

    def assess(fitted_model, label, fit_seconds):
        # Every model is scored against the same test split and encoder.
        return evaluate_model(fitted_model, X_test, y_test, label, fit_seconds, le)

    results = []

    # --- Logistic Regression --------------------------------------------------
    lr_model, lr_seconds = train_logistic_regression(X_train, y_train)
    results.append(assess(lr_model, "Logistic Regression", lr_seconds))

    # --- Random Forest ----------------------------------------------------------
    forest_model, forest_seconds = train_random_forest(X_train, y_train)
    results.append(assess(forest_model, "Random Forest", forest_seconds))

    # --- SVM (the chosen hyper-parameters are reported before the evaluation) ---
    svm_model, svm_seconds, best_params = train_svm(X_train, y_train)
    print(f"Best SVM Parameters: {best_params}")
    results.append(assess(svm_model, "Support Vector Machine", svm_seconds))

    # --- Side-by-side summary -----------------------------------------------
    comparison_summary = compare_models_summary(results)
    return results, comparison_summary

# Entry point: run the full comparison and keep its two return values (the
# per-model result dicts and the summary table) in module-level names.
if __name__ == "__main__":
    results, summary = main()

Starting Classical ML Algorithms Comparison for Cinnamon Quality Classification
Loading and preprocessing data...
Loading dataset...
Initial dataset shape: (8692, 14)
Handling missing values...
Missing values before cleaning: 70
Missing values after cleaning: 0
Dataset shape after handling missing values: (8692, 14)
Removing outliers using Z-score method...
Outliers removed: 0
Final dataset shape: (8692, 14)

Class distribution:
Quality_Label
High      2944
Low       2859
Medium    2889
Name: count, dtype: int64
Dataset Info:
- Training samples: 6953
- Test samples: 1739
- Features: 12
- Classes: ['High' 'Low' 'Medium']
Training Logistic Regression...

Logistic Regression Results
Training Time: 0.02 seconds
Prediction Time: 0.0000 seconds
Accuracy: 82.35%

Classification Report:
              precision    recall  f1-score   support

        High       0.90      0.85      0.87       589
         Low       0.85      0.88      0.86       572
      Medium       0.73      0.74      0.74    




Random Forest Results
Training Time: 1.13 seconds
Prediction Time: 0.0724 seconds
Accuracy: 89.88%

Classification Report:
              precision    recall  f1-score   support

        High       0.94      0.91      0.92       589
         Low       0.92      0.93      0.93       572
      Medium       0.84      0.86      0.85       578

    accuracy                           0.90      1739
   macro avg       0.90      0.90      0.90      1739
weighted avg       0.90      0.90      0.90      1739


Confusion Matrix:
[[534   0  55]
 [  0 534  38]
 [ 37  46 495]]
Training SVM with Grid Search...
Best SVM Parameters: {'C': 10, 'gamma': 'scale', 'kernel': 'rbf'}

Support Vector Machine Results
Training Time: 18.40 seconds
Prediction Time: 0.3267 seconds
Accuracy: 91.20%

Classification Report:
              precision    recall  f1-score   support

        High       0.95      0.93      0.94       589
         Low       0.92      0.94      0.93       572
      Medium       0.87      0.87 