In [1]:
# Install necessary packages
!pip install optuna xgboost



In [2]:
import pandas as pd
import numpy as np
import re
import joblib
import optuna
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.multioutput import MultiOutputClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import classification_report, multilabel_confusion_matrix, accuracy_score
from sklearn.metrics import hamming_loss, jaccard_score
import logging
import warnings
import xgboost as xgb
warnings.filterwarnings('ignore')

In [3]:
class ReviewPreprocessor:
    def __init__(self):
        # Indonesian slang dictionary for normalization
        self.slang_dict = {
            # Quality terms
            'bgus': 'bagus',
            'bgs': 'bagus',
            'mantap': 'bagus',
            'mantul': 'bagus',
            'oke': 'bagus',
            'ok': 'bagus',
            'jelek': 'buruk',
            'ancur': 'buruk',
            'rusak': 'buruk',
            'parah': 'buruk',
            
            # Shipping terms
            'cpt': 'cepat',
            'cpet': 'cepat',
            'kilat': 'cepat',
            'lama': 'lambat',
            'lelet': 'lambat',
            
            # Price terms
            'murmer': 'murah',
            'mumer': 'murah',
            'worth': 'sebanding',
            'worthit': 'sebanding',
            'mahil': 'mahal',
            
            # Service terms
            'rekom': 'rekomendasi',
            'recommended': 'rekomendasi',
            'respon': 'responsif',
            'fast': 'cepat',
            'slow': 'lambat',
            
            # General terms
            'bgt': 'banget',
            'bgt': 'banget',
            'bener': 'benar',
            'gak': 'tidak',
            'ga': 'tidak',
            'tp': 'tapi',
            'jd': 'jadi',
            'krn': 'karena',
            'udh': 'sudah',
            'udah': 'sudah',
            'blm': 'belum',
            'belom': 'belum'
        }
        
        # Aspect keywords for multi-label classification
        self.aspect_keywords = {
            'kualitas_produk': {
                'positive': ['bagus', 'berkualitas', 'original', 'asli', 'premium', 'excellent', 'kualitas', 'mantap'],
                'negative': ['jelek', 'buruk', 'rusak', 'kw', 'palsu', 'fake', 'cacat', 'ancur', 'parah']
            },
            'harga': {
                'positive': ['murah', 'worth', 'sebanding', 'value', 'affordable', 'terjangkau'],
                'negative': ['mahal', 'overprice', 'kemahalan', 'expensive', 'pricey']
            },
            'pengiriman': {
                'positive': ['cepat', 'kilat', 'express', 'fast', 'tepat', 'ontime'],
                'negative': ['lama', 'lambat', 'telat', 'slow', 'delay', 'lelet']
            },
            'pelayanan': {
                'positive': ['ramah', 'baik', 'responsif', 'helpful', 'fast', 'respon', 'sopan'],
                'negative': ['buruk', 'jelek', 'tidak', 'slow', 'lambat', 'cuek', 'galak']
            },
            'performa': {
                'positive': ['battery', 'speed', 'cepat', 'lancar', 'smooth', 'camera', 'bagus', 'performance'],
                'negative': ['lemot', 'lag', 'hang', 'error', 'lambat', 'boros', 'panas', 'overheat']
            },
            'packaging': {
                'positive': ['rapi', 'aman', 'bubble', 'wrap', 'packaging', 'bungkus', 'kemasan'],
                'negative': ['rusak', 'jelek', 'buruk', 'hancur', 'penyok', 'lecek']
            }
        }
    
    def clean_text(self, text):
        """Clean and normalize Indonesian text"""
        if pd.isna(text):
            return ""
        
        # Convert to lowercase
        text = text.lower()
        
        # Remove URLs
        text = re.sub(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', '', text)
        
        # Remove email addresses
        text = re.sub(r'\S+@\S+', '', text)
        
        # Remove numbers (but keep price-related ones)
        text = re.sub(r'\d+', '', text)
        
        # Remove special characters but keep indonesian characters
        text = re.sub(r'[^a-zA-Z\s]', ' ', text)
        
        # Remove extra whitespace
        text = ' '.join(text.split())
        
        # Normalize slang
        words = text.split()
        normalized_words = []
        for word in words:
            if word in self.slang_dict:
                normalized_words.append(self.slang_dict[word])
            else:
                normalized_words.append(word)
        
        return ' '.join(normalized_words)
    
    def create_aspect_labels(self, df):
        """Create multi-label aspect classification labels"""
        print("Creating aspect labels...")
        
        for aspect, keywords in self.aspect_keywords.items():
            # Initialize with zeros
            df[f'has_{aspect}'] = 0
            
            # Check for aspect mentions
            for _, row in df.iterrows():
                text = row['clean_text'].lower()
                
                # Check if any keyword is present
                has_positive = any(keyword in text for keyword in keywords['positive'])
                has_negative = any(keyword in text for keyword in keywords['negative'])
                
                if has_positive or has_negative:
                    df.at[row.name, f'has_{aspect}'] = 1
        
        return df
    
    def create_sentiment_labels(self, df):
        """Create sentiment labels for each aspect"""
        print("Creating sentiment labels...")
        
        for aspect, keywords in self.aspect_keywords.items():
            # Initialize with neutral (0)
            df[f'sentiment_{aspect}'] = 0
            
            for _, row in df.iterrows():
                text = row['clean_text'].lower()
                
                # Count positive and negative keywords
                positive_count = sum(1 for keyword in keywords['positive'] if keyword in text)
                negative_count = sum(1 for keyword in keywords['negative'] if keyword in text)
                
                # Determine sentiment
                if positive_count > negative_count:
                    df.at[row.name, f'sentiment_{aspect}'] = 1  # Positive
                elif negative_count > positive_count:
                    df.at[row.name, f'sentiment_{aspect}'] = -1  # Negative
                # else remains 0 (neutral)
        
        return df
    
    def preprocess_data(self, input_file='D:/projects/ecommerce-review-classifier/data/raw/tokopedia_reviews.csv', 
                    output_file='D:/projects/ecommerce-review-classifier/data/processed/processed_reviews1.csv'):
        """Main preprocessing pipeline"""
        print(f"Loading data from {input_file}")
        
        # Load data
        df = pd.read_csv(input_file)

        # All data
        print(f"Number of the data: {df.shape}")

        # --- HAPUS DATA YANG KOSONG DI 'ulasan' DAN 'nama_barang' ---
        df.dropna(subset=['ulasan', 'nama_barang'], inplace=True)
        df.reset_index(drop=True, inplace=True)

        # Rename to match expected column name
        df.rename(columns={'ulasan': 'review_text'}, inplace=True)
        
        # Clean text
        print("Cleaning text...")
        df['clean_text'] = df['review_text'].apply(self.clean_text)
        
        # Remove empty reviews (after cleaning)
        df = df[df['clean_text'].str.strip().str.len() > 0]
        
        # Create aspect labels
        df = self.create_aspect_labels(df)
        
        # Create sentiment labels
        df = self.create_sentiment_labels(df)
        
        # Add text length feature
        df['text_length'] = df['clean_text'].str.len()
        
        # Add word count feature
        df['word_count'] = df['clean_text'].str.split().str.len()
        
        # Save processed data
        #df.to_csv(output_file, index=False)
        #print(f"Processed data saved to {output_file}")
        
        return df

In [4]:
preprocessor = ReviewPreprocessor()
df = preprocessor.preprocess_data()

Loading data from D:/projects/ecommerce-review-classifier/data/raw/tokopedia_reviews.csv
Number of the data: (19920, 4)
Cleaning text...
Creating aspect labels...
Creating sentiment labels...


In [5]:
df.head()

Unnamed: 0,toko,nama_barang,review_text,rating,clean_text,has_kualitas_produk,has_harga,has_pengiriman,has_pelayanan,has_performa,has_packaging,sentiment_kualitas_produk,sentiment_harga,sentiment_pengiriman,sentiment_pelayanan,sentiment_performa,sentiment_packaging,text_length,word_count
0,ismile-indonesia,Apple iPhone 13 128GB Garansi Resmi Indonesia,"Barang original, berfungsi semua, gada kendala...",5,barang original berfungsi semua gada kendala p...,1,0,0,0,0,0,1,0,0,0,0,0,66,9
1,ismile-indonesia,Vention Kabel Charger USB 2.0 Type C to Lightn...,"Braided cable, kokoh, ada tutupnya, apalagi ?",5,braided cable kokoh ada tutupnya apalagi,0,0,0,0,1,0,0,0,0,0,-1,0,40,6
2,ismile-indonesia,Apple iPhone 16 Garansi Resmi - 128GB 256GB 512GB,"respon cepat, barang langsung dikirim. pembeli...",5,responsif cepat barang langsung dikirim pembel...,0,0,1,1,1,0,0,0,1,1,1,0,55,7
3,ismile-indonesia,Apple iPhone 13 128GB Garansi Resmi Indonesia,puassss banget. barang sampai dengan selamat d...,5,puassss banget barang sampai dengan selamat da...,1,0,1,0,0,0,1,0,-1,0,0,0,75,10
4,ismile-indonesia,Apple iPhone 16 Pro Max 128GB 256GB 512GB 1TB ...,Respon penjual baik\nPengemasan aman\nPengirim...,5,responsif penjual baik pengemasan aman pengiri...,0,0,1,1,1,1,0,0,1,1,1,1,158,23


In [6]:
df.describe()

Unnamed: 0,rating,has_kualitas_produk,has_harga,has_pengiriman,has_pelayanan,has_performa,has_packaging,sentiment_kualitas_produk,sentiment_harga,sentiment_pengiriman,sentiment_pelayanan,sentiment_performa,sentiment_packaging,text_length,word_count
count,16673.0,16673.0,16673.0,16673.0,16673.0,16673.0,16673.0,16673.0,16673.0,16673.0,16673.0,16673.0,16673.0,16673.0,16673.0
mean,4.863012,0.478078,0.022072,0.245787,0.277694,0.520722,0.224675,0.44635,0.018293,0.060877,0.071853,0.415882,0.205302,58.92023,9.277634
std,0.602355,0.499534,0.146921,0.430566,0.447875,0.499585,0.41738,0.521736,0.147236,0.48274,0.506432,0.56681,0.423648,45.686754,7.386894
min,1.0,0.0,0.0,0.0,0.0,0.0,0.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,1.0,1.0
25%,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,23.0,4.0
50%,5.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,48.0,7.0
75%,5.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,82.0,13.0
max,5.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,184.0,40.0


In [7]:
aspect_columns = [col for col in df.columns if col.startswith('has_')]
for col in aspect_columns:
    aspect_name = col.replace('has_', '')
    count = df[col].sum()
    percentage = (count / len(df)) * 100
    print(f"{aspect_name}: {count} reviews ({percentage:.1f}%)")

kualitas_produk: 7971 reviews (47.8%)
harga: 368 reviews (2.2%)
pengiriman: 4098 reviews (24.6%)
pelayanan: 4630 reviews (27.8%)
performa: 8682 reviews (52.1%)
packaging: 3746 reviews (22.5%)


In [8]:
print(f"Rating distribution:\n", df['rating'].value_counts().sort_index())

Rating distribution:
 rating
1      283
2       60
3      161
4      650
5    15519
Name: count, dtype: int64


In [9]:
sample_cols = ['clean_text', 'rating'] + aspect_columns[:3]
df[sample_cols].tail()

Unnamed: 0,clean_text,rating,has_kualitas_produk,has_harga,has_pengiriman
16836,terimakasih barng yg kupesan sudah datng,5,0,0,0
16837,hp nya sehari pesan lgsung smpe dtng dlm k ada...,5,0,0,0
16838,bagus bang sesuai dengan kebutuhan,5,1,0,0
16839,hssbzbzbxb,4,0,0,0
16840,tzxrzdzff,4,0,0,0


In [10]:
X = df['clean_text']
y = df[[col for col in df.columns if col.startswith('has_')]]

In [11]:
test_size=0.2
val_size=0.1

X_temp, X_test, y_temp, y_test = train_test_split(
    X, y, test_size=test_size, random_state=42, stratify=y.iloc[:, 0]
)
val_size_adjusted = val_size / (1 - test_size)
X_train, X_val, y_train, y_val = train_test_split(
    X_temp, y_temp, test_size=val_size_adjusted, random_state=42
)


In [12]:
train_df = pd.concat([X_train, y_train], axis=1)
val_df = pd.concat([X_val, y_val], axis=1)
test_df = pd.concat([X_test, y_test], axis=1)

In [13]:
print(f"Data split completed:")
print(f"Train: {len(train_df)} samples")
print(f"Validation: {len(val_df)} samples")
print(f"Test: {len(test_df)} samples")

Data split completed:
Train: 11670 samples
Validation: 1668 samples
Test: 3335 samples


In [14]:
# Model Training with Optuna Hyperparameter Tuning
import time
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.multioutput import MultiOutputClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
import xgboost as xgb
from sklearn.metrics import accuracy_score, hamming_loss, jaccard_score, f1_score
import optuna
from optuna.samplers import TPESampler

# Suppress optuna logs
optuna.logging.set_verbosity(optuna.logging.WARNING)

In [15]:
class MultiLabelModelTrainer:
    def __init__(self, train_df, val_df, test_df):
        self.train_df = train_df
        self.val_df = val_df
        self.test_df = test_df
        
        # Prepare data
        self.X_train = train_df['clean_text']
        self.y_train = train_df[[col for col in train_df.columns if col.startswith('has_')]]
        
        self.X_val = val_df['clean_text']
        self.y_val = val_df[[col for col in val_df.columns if col.startswith('has_')]]
        
        self.X_test = test_df['clean_text']
        self.y_test = test_df[[col for col in test_df.columns if col.startswith('has_')]]
        
        # Results storage
        self.results = {}
    
    def objective_rf(self, trial):
        """Objective function for Random Forest hyperparameter tuning"""
        params = {
            'n_estimators': trial.suggest_int('n_estimators', 50, 300),
            'max_depth': trial.suggest_int('max_depth', 3, 20),
            'min_samples_split': trial.suggest_int('min_samples_split', 2, 20),
            'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 10),
            'max_features': trial.suggest_categorical('max_features', ['sqrt', 'log2', None]),
            'random_state': 42,
            'n_jobs': -1
        }
        
        # Create pipeline
        pipeline = Pipeline([
            ('tfidf', TfidfVectorizer(max_features=5000, stop_words='english')),
            ('classifier', MultiOutputClassifier(RandomForestClassifier(**params)))
        ])
        
        # Train model
        pipeline.fit(self.X_train, self.y_train)
        
        # Predict on validation set
        y_pred = pipeline.predict(self.X_val)
        
        # Calculate accuracy (subset accuracy)
        accuracy = accuracy_score(self.y_val, y_pred)
        
        return accuracy
    
    def objective_xgb(self, trial):
        """Objective function for XGBoost hyperparameter tuning"""
        params = {
            'n_estimators': trial.suggest_int('n_estimators', 50, 300),
            'max_depth': trial.suggest_int('max_depth', 3, 10),
            'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3),
            'subsample': trial.suggest_float('subsample', 0.6, 1.0),
            'colsample_bytree': trial.suggest_float('colsample_bytree', 0.6, 1.0),
            'reg_alpha': trial.suggest_float('reg_alpha', 0, 10),
            'reg_lambda': trial.suggest_float('reg_lambda', 0, 10),
            'random_state': 42,
            'n_jobs': -1
        }
        
        # Create pipeline
        pipeline = Pipeline([
            ('tfidf', TfidfVectorizer(max_features=5000, stop_words='english')),
            ('classifier', MultiOutputClassifier(xgb.XGBClassifier(**params)))
        ])
        
        # Train model
        pipeline.fit(self.X_train, self.y_train)
        
        # Predict on validation set
        y_pred = pipeline.predict(self.X_val)
        
        # Calculate accuracy
        accuracy = accuracy_score(self.y_val, y_pred)
        
        return accuracy
    
    def objective_svm(self, trial):
        """Objective function for SVM hyperparameter tuning"""
        params = {
            'C': trial.suggest_float('C', 0.1, 10.0),
            'kernel': trial.suggest_categorical('kernel', ['linear', 'rbf']),
            'gamma': trial.suggest_categorical('gamma', ['scale', 'auto']),
            'random_state': 42
        }
        
        # Create pipeline
        pipeline = Pipeline([
            ('tfidf', TfidfVectorizer(max_features=3000, stop_words='english')),  # Reduced features for SVM
            ('classifier', MultiOutputClassifier(SVC(**params)))
        ])
        
        # Train model
        pipeline.fit(self.X_train, self.y_train)
        
        # Predict on validation set
        y_pred = pipeline.predict(self.X_val)
        
        # Calculate accuracy
        accuracy = accuracy_score(self.y_val, y_pred)
        
        return accuracy
    
    def train_random_forest(self, n_trials=20):
        """Train Random Forest with hyperparameter tuning"""
        print("\nTraining Random Forest with Optuna...")
        start_time = time.time()
        
        study = optuna.create_study(direction='maximize', sampler=TPESampler())
        study.optimize(self.objective_rf, n_trials=n_trials)
        
        print(f"Best RF parameters: {study.best_params}")
        print(f"Best RF validation accuracy: {study.best_value:.4f}")
        
        # Train final model with best parameters
        best_params = study.best_params
        pipeline = Pipeline([
            ('tfidf', TfidfVectorizer(max_features=5000, stop_words='english')),
            ('classifier', MultiOutputClassifier(RandomForestClassifier(**best_params)))
        ])
        
        pipeline.fit(self.X_train, self.y_train)
        
        # Evaluate on test set
        y_pred = pipeline.predict(self.X_test)
        test_accuracy = accuracy_score(self.y_test, y_pred)
        hamming = hamming_loss(self.y_test, y_pred)
        jaccard = jaccard_score(self.y_test, y_pred, average='macro')
        f1 = f1_score(self.y_test, y_pred, average='macro')
        
        training_time = time.time() - start_time
        
        self.results['Random Forest'] = {
            'best_params': best_params,
            'validation_accuracy': study.best_value,
            'test_accuracy': test_accuracy,
            'hamming_loss': hamming,
            'jaccard_score': jaccard,
            'f1_score': f1,
            'training_time': training_time,
            'model': pipeline
        }
        
        print(f"RF Training completed in {training_time:.2f} seconds")
        print(f"RF Test Accuracy: {test_accuracy:.4f}")
        print("-" * 50)
        
        return pipeline
    
    def train_xgboost(self, n_trials=20):
        """Train XGBoost with hyperparameter tuning"""
        print("\nTraining XGBoost with Optuna...")
        start_time = time.time()
        
        study = optuna.create_study(direction='maximize', sampler=TPESampler())
        study.optimize(self.objective_xgb, n_trials=n_trials)
        
        print(f"Best XGB parameters: {study.best_params}")
        print(f"Best XGB validation accuracy: {study.best_value:.4f}")
        
        # Train final model with best parameters
        best_params = study.best_params
        pipeline = Pipeline([
            ('tfidf', TfidfVectorizer(max_features=5000, stop_words='english')),
            ('classifier', MultiOutputClassifier(xgb.XGBClassifier(**best_params)))
        ])
        
        pipeline.fit(self.X_train, self.y_train)
        
        # Evaluate on test set
        y_pred = pipeline.predict(self.X_test)
        test_accuracy = accuracy_score(self.y_test, y_pred)
        hamming = hamming_loss(self.y_test, y_pred)
        jaccard = jaccard_score(self.y_test, y_pred, average='macro')
        f1 = f1_score(self.y_test, y_pred, average='macro')
        
        training_time = time.time() - start_time
        
        self.results['XGBoost'] = {
            'best_params': best_params,
            'validation_accuracy': study.best_value,
            'test_accuracy': test_accuracy,
            'hamming_loss': hamming,
            'jaccard_score': jaccard,
            'f1_score': f1,
            'training_time': training_time,
            'model': pipeline
        }
        
        print(f"XGB Training completed in {training_time:.2f} seconds")
        print(f"XGB Test Accuracy: {test_accuracy:.4f}")
        print("-" * 50)
        
        return pipeline
    
    def train_svm(self, n_trials=20):
        """Train SVM with hyperparameter tuning (reduced trials due to computational cost)"""
        print("\nTraining SVM with Optuna...")
        start_time = time.time()
        
        study = optuna.create_study(direction='maximize', sampler=TPESampler())
        study.optimize(self.objective_svm, n_trials=n_trials)
        
        print(f"Best SVM parameters: {study.best_params}")
        print(f"Best SVM validation accuracy: {study.best_value:.4f}")
        
        # Train final model with best parameters
        best_params = study.best_params
        pipeline = Pipeline([
            ('tfidf', TfidfVectorizer(max_features=3000, stop_words='english')),
            ('classifier', MultiOutputClassifier(SVC(**best_params)))
        ])
        
        pipeline.fit(self.X_train, self.y_train)
        
        # Evaluate on test set
        y_pred = pipeline.predict(self.X_test)
        test_accuracy = accuracy_score(self.y_test, y_pred)
        hamming = hamming_loss(self.y_test, y_pred)
        jaccard = jaccard_score(self.y_test, y_pred, average='macro')
        f1 = f1_score(self.y_test, y_pred, average='macro')
        
        training_time = time.time() - start_time
        
        self.results['SVM'] = {
            'best_params': best_params,
            'validation_accuracy': study.best_value,
            'test_accuracy': test_accuracy,
            'hamming_loss': hamming,
            'jaccard_score': jaccard,
            'f1_score': f1,
            'training_time': training_time,
            'model': pipeline
        }
        
        print(f"SVM Training completed in {training_time:.2f} seconds")
        print(f"SVM Test Accuracy: {test_accuracy:.4f}")
        print("-" * 50)
        
        return pipeline
    
    def train_all_models(self):
        """Train all models"""
        print("\nStarting Multi-Label Classification Training...")
        print("=" * 60)
        
        # Train Random Forest
        self.train_random_forest(n_trials=50)
        
        # Train XGBoost
        self.train_xgboost(n_trials=50)
        
        # Train SVM (fewer trials due to computational cost)
        self.train_svm(n_trials=30)
        
        print("\nAll models training completed!")
        return self.results
    
    def display_results(self):
        """Display comparison of all models"""
        if not self.results:
            print("No results to display. Please train models first.")
            return
        
        print("\n" + "=" * 80)
        print("\nMODEL COMPARISON RESULTS")
        print("=" * 80)
        
        # Create results DataFrame
        results_data = []
        for model_name, metrics in self.results.items():
            results_data.append({
                'Model': model_name,
                'Test Accuracy': f"{metrics['test_accuracy']:.4f}",
                'Validation Accuracy': f"{metrics['validation_accuracy']:.4f}",
                'Hamming Loss': f"{metrics['hamming_loss']:.4f}",
                'Jaccard Score': f"{metrics['jaccard_score']:.4f}",
                'F1 Score': f"{metrics['f1_score']:.4f}",
                'Training Time (s)': f"{metrics['training_time']:.2f}"
            })
        
        results_df = pd.DataFrame(results_data)
        print(results_df.to_string(index=False))
        
        # Find best model
        best_model = max(self.results.keys(), 
                        key=lambda x: self.results[x]['test_accuracy'])
        
        print(f"\n   BEST MODEL: {best_model}")
        print(f"   Test Accuracy: {self.results[best_model]['test_accuracy']:.4f}")
        print(f"   Training Time: {self.results[best_model]['training_time']:.2f}s")
        
        print("\n" + "=" * 80)
        
        return results_df

In [16]:
# Initialize and train models
trainer = MultiLabelModelTrainer(train_df, val_df, test_df)

# Train all models
results = trainer.train_all_models()

# Display results
results_df = trainer.display_results()


Starting Multi-Label Classification Training...

Training Random Forest with Optuna...


[W 2025-07-28 21:43:13,127] Trial 36 failed with parameters: {'n_estimators': 189, 'max_depth': 20, 'min_samples_split': 6, 'min_samples_leaf': 1, 'max_features': None} because of the following error: KeyboardInterrupt().
Traceback (most recent call last):
  File "c:\Users\VICTUS\AppData\Local\Programs\Python\Python311\Lib\site-packages\optuna\study\_optimize.py", line 196, in _run_trial
    value_or_values = func(trial)
                      ^^^^^^^^^^^
  File "C:\Users\VICTUS\AppData\Local\Temp\ipykernel_6284\3331212398.py", line 39, in objective_rf
    pipeline.fit(self.X_train, self.y_train)
  File "c:\Users\VICTUS\AppData\Local\Programs\Python\Python311\Lib\site-packages\sklearn\pipeline.py", line 405, in fit
    self._final_estimator.fit(Xt, y, **fit_params_last_step)
  File "c:\Users\VICTUS\AppData\Local\Programs\Python\Python311\Lib\site-packages\sklearn\multioutput.py", line 450, in fit
    super().fit(X, Y, sample_weight, **fit_params)
  File "c:\Users\VICTUS\AppData\Local\Pr

KeyboardInterrupt: 

In [None]:
import joblib
import os

# Make folder to save models
os.makedirs('saved_models', exist_ok=True)

# Model from result
rf_model = trainer.results['Random Forest']['model']
xgb_model = trainer.results['XGBoost']['model']
svm_model = trainer.results['SVM']['model']

# Save model
joblib.dump(rf_model, 'saved_models/random_forest_model.pkl')
joblib.dump(xgb_model, 'saved_models/xgboost_model.pkl')
joblib.dump(svm_model, 'saved_models/svm_model.pkl')

print("\nAll models saved!")
