In [1]:
# Install necessary packages
!pip install optuna xgboost

Collecting optuna
  Downloading optuna-4.4.0-py3-none-any.whl.metadata (17 kB)
Collecting alembic>=1.5.0 (from optuna)
  Downloading alembic-1.16.4-py3-none-any.whl.metadata (7.3 kB)
Collecting colorlog (from optuna)
  Downloading colorlog-6.9.0-py3-none-any.whl.metadata (10 kB)
Downloading optuna-4.4.0-py3-none-any.whl (395 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m395.9/395.9 kB[0m [31m6.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading alembic-1.16.4-py3-none-any.whl (247 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m247.0/247.0 kB[0m [31m14.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading colorlog-6.9.0-py3-none-any.whl (11 kB)
Installing collected packages: colorlog, alembic, optuna
Successfully installed alembic-1.16.4 colorlog-6.9.0 optuna-4.4.0


In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
import pandas as pd
import numpy as np
import re
import os
import joblib
import time
import matplotlib.pyplot as plt
import seaborn as sns
import optuna
from optuna.samplers import TPESampler
import nltk
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.multioutput import MultiOutputClassifier
from sklearn.ensemble import RandomForestClassifier
import lightgbm as lgb
import xgboost as xgb
from sklearn.metrics import classification_report, multilabel_confusion_matrix, accuracy_score
from sklearn.metrics import hamming_loss, jaccard_score, f1_score
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer

import logging
import warnings

# Suppress optuna logs
optuna.logging.set_verbosity(optuna.logging.WARNING)
warnings.filterwarnings('ignore')

In [5]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [6]:
class ReviewPreprocessor:
    def __init__(self):
        # Indonesian slang dictionary for normalization
        self.slang_dict = {
            # Quality terms
            'bgus': 'bagus',
            'bgs': 'bagus',
            'mantap': 'bagus',
            'mantul': 'bagus',
            'oke': 'bagus',
            'ok': 'bagus',
            'jelek': 'buruk',
            'ancur': 'buruk',
            'rusak': 'buruk',
            'parah': 'buruk',

            # Shipping terms
            'cpt': 'cepat',
            'cpet': 'cepat',
            'kilat': 'cepat',
            'lama': 'lambat',
            'lelet': 'lambat',

            # Price terms
            'murmer': 'murah',
            'mumer': 'murah',
            'worth': 'sebanding',
            'worthit': 'sebanding',
            'mahil': 'mahal',

            # Service terms
            'rekom': 'rekomendasi',
            'recommended': 'rekomendasi',
            'respon': 'responsif',
            'fast': 'cepat',
            'slow': 'lambat',

            # General terms
            'bgt': 'banget',
            'bgt': 'banget',
            'bener': 'benar',
            'gak': 'tidak',
            'ga': 'tidak',
            'tp': 'tapi',
            'jd': 'jadi',
            'krn': 'karena',
            'udh': 'sudah',
            'udah': 'sudah',
            'blm': 'belum',
            'belom': 'belum'
        }

        # Aspect keywords for multi-label classification
        self.aspect_keywords = {
            'kualitas_produk': {
                'positive': ['bagus', 'berkualitas', 'original', 'asli', 'premium', 'excellent', 'kualitas', 'mantap'],
                'negative': ['jelek', 'buruk', 'rusak', 'kw', 'palsu', 'fake', 'cacat', 'ancur', 'parah']
            },
            'harga': {
                'positive': ['murah', 'worth', 'sebanding', 'value', 'affordable', 'terjangkau'],
                'negative': ['mahal', 'overprice', 'kemahalan', 'expensive', 'pricey']
            },
            'pengiriman': {
                'positive': ['cepat', 'kilat', 'express', 'fast', 'tepat', 'ontime'],
                'negative': ['lama', 'lambat', 'telat', 'slow', 'delay', 'lelet']
            },
            'pelayanan': {
                'positive': ['ramah', 'baik', 'responsif', 'helpful', 'fast', 'respon', 'sopan'],
                'negative': ['buruk', 'jelek', 'tidak', 'slow', 'lambat', 'cuek', 'galak']
            },
            'performa': {
                'positive': ['battery', 'speed', 'cepat', 'lancar', 'smooth', 'camera', 'bagus', 'performance'],
                'negative': ['lemot', 'lag', 'hang', 'error', 'lambat', 'boros', 'panas', 'overheat']
            },
            'packaging': {
                'positive': ['rapi', 'aman', 'bubble', 'wrap', 'packaging', 'bungkus', 'kemasan'],
                'negative': ['rusak', 'jelek', 'buruk', 'hancur', 'penyok', 'lecek']
            }
        }

    def clean_text(self, text):
        """Clean and normalize Indonesian text"""
        if pd.isna(text):
            return ""

        # Convert to lowercase
        text = text.lower()

        # Remove URLs
        text = re.sub(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', '', text)

        # Remove email addresses
        text = re.sub(r'\S+@\S+', '', text)

        # Remove numbers (but keep price-related ones)
        text = re.sub(r'\d+', '', text)

        # Remove special characters but keep indonesian characters
        text = re.sub(r'[^a-zA-Z\s]', ' ', text)

        # Remove extra whitespace
        text = ' '.join(text.split())

        # Normalize slang
        words = text.split()
        normalized_words = []
        for word in words:
            if word in self.slang_dict:
                normalized_words.append(self.slang_dict[word])
            else:
                normalized_words.append(word)

        return ' '.join(normalized_words)

    def create_aspect_labels(self, df):
        """Create multi-label aspect classification labels"""
        print("Creating aspect labels...")

        for aspect, keywords in self.aspect_keywords.items():
            # Initialize with zeros
            df[f'has_{aspect}'] = 0

            # Check for aspect mentions
            for _, row in df.iterrows():
                text = row['clean_text'].lower()

                # Check if any keyword is present
                has_positive = any(keyword in text for keyword in keywords['positive'])
                has_negative = any(keyword in text for keyword in keywords['negative'])

                if has_positive or has_negative:
                    df.at[row.name, f'has_{aspect}'] = 1

        return df

    def create_sentiment_labels(self, df):
        """Create sentiment labels for each aspect"""
        print("Creating sentiment labels...")

        for aspect, keywords in self.aspect_keywords.items():
            # Initialize with neutral (0)
            df[f'sentiment_{aspect}'] = 0

            for _, row in df.iterrows():
                text = row['clean_text'].lower()

                # Count positive and negative keywords
                positive_count = sum(1 for keyword in keywords['positive'] if keyword in text)
                negative_count = sum(1 for keyword in keywords['negative'] if keyword in text)

                # Determine sentiment
                if positive_count > negative_count:
                    df.at[row.name, f'sentiment_{aspect}'] = 1  # Positive
                elif negative_count > positive_count:
                    df.at[row.name, f'sentiment_{aspect}'] = -1  # Negative
                # else remains 0 (neutral)

        return df

    def preprocess_data(self, input_file='/content/drive/MyDrive/Project/ecommerce-review-classifier/tokopedia_reviews.csv',
                    output_file='/content/drive/MyDrive/Project/ecommerce-review-classifier/processed_reviews.csv'):
        """Main preprocessing pipeline"""
        print(f"Loading data from {input_file}")

        # Load data
        df = pd.read_csv(input_file)

        # All data
        print(f"Number of the data: {df.shape}")

        # --- HAPUS DATA YANG KOSONG DI 'ulasan' DAN 'nama_barang' ---
        df.dropna(subset=['ulasan', 'nama_barang'], inplace=True)
        df.reset_index(drop=True, inplace=True)

        # Rename to match expected column name
        df.rename(columns={'ulasan': 'review_text'}, inplace=True)

        # Clean text
        print("Cleaning text...")
        df['clean_text'] = df['review_text'].apply(self.clean_text)

        # Remove empty reviews (after cleaning)
        df = df[df['clean_text'].str.strip().str.len() > 0]

        # Create aspect labels
        df = self.create_aspect_labels(df)

        # Create sentiment labels
        df = self.create_sentiment_labels(df)

        # Add text length feature
        df['text_length'] = df['clean_text'].str.len()

        # Add word count feature
        df['word_count'] = df['clean_text'].str.split().str.len()

        # Save processed data
        #df.to_csv(output_file, index=False)
        #print(f"Processed data saved to {output_file}")

        return df

In [7]:
preprocessor = ReviewPreprocessor()
df = preprocessor.preprocess_data()

Loading data from /content/drive/MyDrive/Project/ecommerce-review-classifier/tokopedia_reviews.csv
Number of the data: (19920, 4)
Cleaning text...
Creating aspect labels...
Creating sentiment labels...


In [8]:
df.head()

Unnamed: 0,toko,nama_barang,review_text,rating,clean_text,has_kualitas_produk,has_harga,has_pengiriman,has_pelayanan,has_performa,has_packaging,sentiment_kualitas_produk,sentiment_harga,sentiment_pengiriman,sentiment_pelayanan,sentiment_performa,sentiment_packaging,text_length,word_count
0,ismile-indonesia,Apple iPhone 13 128GB Garansi Resmi Indonesia,"Barang original, berfungsi semua, gada kendala...",5,barang original berfungsi semua gada kendala p...,1,0,0,0,0,0,1,0,0,0,0,0,66,9
1,ismile-indonesia,Vention Kabel Charger USB 2.0 Type C to Lightn...,"Braided cable, kokoh, ada tutupnya, apalagi ?",5,braided cable kokoh ada tutupnya apalagi,0,0,0,0,1,0,0,0,0,0,-1,0,40,6
2,ismile-indonesia,Apple iPhone 16 Garansi Resmi - 128GB 256GB 512GB,"respon cepat, barang langsung dikirim. pembeli...",5,responsif cepat barang langsung dikirim pembel...,0,0,1,1,1,0,0,0,1,1,1,0,55,7
3,ismile-indonesia,Apple iPhone 13 128GB Garansi Resmi Indonesia,puassss banget. barang sampai dengan selamat d...,5,puassss banget barang sampai dengan selamat da...,1,0,1,0,0,0,1,0,-1,0,0,0,75,10
4,ismile-indonesia,Apple iPhone 16 Pro Max 128GB 256GB 512GB 1TB ...,Respon penjual baik\nPengemasan aman\nPengirim...,5,responsif penjual baik pengemasan aman pengiri...,0,0,1,1,1,1,0,0,1,1,1,1,158,23


In [9]:
df.describe()

Unnamed: 0,rating,has_kualitas_produk,has_harga,has_pengiriman,has_pelayanan,has_performa,has_packaging,sentiment_kualitas_produk,sentiment_harga,sentiment_pengiriman,sentiment_pelayanan,sentiment_performa,sentiment_packaging,text_length,word_count
count,16673.0,16673.0,16673.0,16673.0,16673.0,16673.0,16673.0,16673.0,16673.0,16673.0,16673.0,16673.0,16673.0,16673.0,16673.0
mean,4.863012,0.478078,0.022072,0.245787,0.277694,0.520722,0.224675,0.44635,0.018293,0.060877,0.071853,0.415882,0.205302,58.92023,9.277634
std,0.602355,0.499534,0.146921,0.430566,0.447875,0.499585,0.41738,0.521736,0.147236,0.48274,0.506432,0.56681,0.423648,45.686754,7.386894
min,1.0,0.0,0.0,0.0,0.0,0.0,0.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,1.0,1.0
25%,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,23.0,4.0
50%,5.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,48.0,7.0
75%,5.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,82.0,13.0
max,5.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,184.0,40.0


In [10]:
aspect_columns = [col for col in df.columns if col.startswith('has_')]
for col in aspect_columns:
    aspect_name = col.replace('has_', '')
    count = df[col].sum()
    percentage = (count / len(df)) * 100
    print(f"{aspect_name}: {count} reviews ({percentage:.1f}%)")

kualitas_produk: 7971 reviews (47.8%)
harga: 368 reviews (2.2%)
pengiriman: 4098 reviews (24.6%)
pelayanan: 4630 reviews (27.8%)
performa: 8682 reviews (52.1%)
packaging: 3746 reviews (22.5%)


In [11]:
print(f"Rating distribution:\n", df['rating'].value_counts().sort_index())

Rating distribution:
 rating
1      283
2       60
3      161
4      650
5    15519
Name: count, dtype: int64


In [12]:
sample_cols = ['clean_text', 'rating'] + aspect_columns[:3]
df[sample_cols].tail()

Unnamed: 0,clean_text,rating,has_kualitas_produk,has_harga,has_pengiriman
16836,terimakasih barng yg kupesan sudah datng,5,0,0,0
16837,hp nya sehari pesan lgsung smpe dtng dlm k ada...,5,0,0,0
16838,bagus bang sesuai dengan kebutuhan,5,1,0,0
16839,hssbzbzbxb,4,0,0,0
16840,tzxrzdzff,4,0,0,0


In [13]:
X = df['clean_text']
y = df[[col for col in df.columns if col.startswith('has_')]]

In [14]:
test_size=0.2
val_size=0.1

X_temp, X_test, y_temp, y_test = train_test_split(
    X, y, test_size=test_size, random_state=42, stratify=y.iloc[:, 0]
)
val_size_adjusted = val_size / (1 - test_size)
X_train, X_val, y_train, y_val = train_test_split(
    X_temp, y_temp, test_size=val_size_adjusted, random_state=42
)


In [15]:
train_df = pd.concat([X_train, y_train], axis=1)
val_df = pd.concat([X_val, y_val], axis=1)
test_df = pd.concat([X_test, y_test], axis=1)

In [16]:
print(f"Data split completed:")
print(f"Train: {len(train_df)} samples")
print(f"Validation: {len(val_df)} samples")
print(f"Test: {len(test_df)} samples")

Data split completed:
Train: 11670 samples
Validation: 1668 samples
Test: 3335 samples


In [17]:
class MultiLabelModelTrainer:
    def __init__(self, train_df, val_df, test_df):
        self.train_df = train_df
        self.val_df = val_df
        self.test_df = test_df

        self.label_cols = [col for col in train_df.columns if col.startswith('has_')]
        self.X_train = train_df['clean_text']
        self.y_train = train_df[self.label_cols]
        self.X_val = val_df['clean_text']
        self.y_val = val_df[self.label_cols]
        self.X_test = test_df['clean_text']
        self.y_test = test_df[self.label_cols]

        self.stopwords_indo = stopwords.words('indonesian')
        self.results = {}

    def tune_and_train(self, model_name, objective_func, base_model_class, tfidf_max_features=5000, n_trials=20):
        print(f"\nTuning hyperparameters for {model_name} with Optuna...")
        start_time = time.time()

        study = optuna.create_study(direction='maximize', sampler=TPESampler())
        study.optimize(lambda trial: objective_func(trial), n_trials=n_trials)

        print(f"\nBest {model_name} params: {study.best_params}")
        print(f"\nBest {model_name} validation accuracy: {study.best_value:.4f}")

        # Build final pipeline with best params
        if model_name == 'LightGBM':
            classifier = MultiOutputClassifier(lgb.LGBMClassifier(**study.best_params))
        elif model_name == 'XGBoost':
            classifier = MultiOutputClassifier(xgb.XGBClassifier(**study.best_params))
        else:  # Random Forest
            classifier = MultiOutputClassifier(RandomForestClassifier(**study.best_params))

        pipeline = Pipeline([
            ('tfidf', TfidfVectorizer(max_features=tfidf_max_features, stop_words=self.stopwords_indo)),
            ('classifier', classifier)
        ])

        pipeline.fit(self.X_train, self.y_train)
        y_pred = pipeline.predict(self.X_test)

        # Evaluation
        test_accuracy = accuracy_score(self.y_test, y_pred)
        hamming = hamming_loss(self.y_test, y_pred)
        jaccard = jaccard_score(self.y_test, y_pred, average='macro')
        f1 = f1_score(self.y_test, y_pred, average='macro')
        training_time = time.time() - start_time

        self.results[model_name] = {
            'best_params': study.best_params,
            'validation_accuracy': study.best_value,
            'test_accuracy': test_accuracy,
            'hamming_loss': hamming,
            'jaccard_score': jaccard,
            'f1_score': f1,
            'training_time': training_time,
            'model': pipeline
        }

        print(f"\n{model_name} training completed in {training_time:.2f} seconds")
        print(f"\nF1-score: {f1:.4f}")
        print("-" * 60)

        return pipeline

    def objective_rf(self, trial):
        params = {
            'n_estimators': trial.suggest_int('n_estimators', 100, 500),
            'max_depth': trial.suggest_int('max_depth', 3, 20),
            'min_samples_split': trial.suggest_int('min_samples_split', 2, 20),
            'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 10),
            'max_features': trial.suggest_categorical('max_features', ['sqrt', 'log2']),
            'random_state': 42,
            'n_jobs': -1
        }

        pipeline = Pipeline([
            ('tfidf', TfidfVectorizer(max_features=5000,  stop_words=self.stopwords_indo)),
            ('classifier', MultiOutputClassifier(RandomForestClassifier(**params)))
        ])
        pipeline.fit(self.X_train, self.y_train)
        y_pred = pipeline.predict(self.X_val)
        return f1_score(self.y_val, y_pred, average='macro')

    def objective_xgb(self, trial):
        params = {
            'n_estimators': trial.suggest_int('n_estimators', 50, 300),
            'max_depth': trial.suggest_int('max_depth', 3, 10),
            'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3),
            'subsample': trial.suggest_float('subsample', 0.6, 1.0),
            'colsample_bytree': trial.suggest_float('colsample_bytree', 0.6, 1.0),
            'reg_alpha': trial.suggest_float('reg_alpha', 0, 10),
            'reg_lambda': trial.suggest_float('reg_lambda', 0, 10),
            'random_state': 42,
            'n_jobs': -1
        }

        pipeline = Pipeline([
            ('tfidf', TfidfVectorizer(max_features=5000,  stop_words=self.stopwords_indo)),
            ('classifier', MultiOutputClassifier(xgb.XGBClassifier(**params)))
        ])
        pipeline.fit(self.X_train, self.y_train)
        y_pred = pipeline.predict(self.X_val)
        return f1_score(self.y_val, y_pred, average='macro')

    def objective_lgbm(self, trial):
        params = {
            'n_estimators': trial.suggest_int('n_estimators', 50, 300),
            'max_depth': trial.suggest_int('max_depth', 3, 15),
            'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3),
            'num_leaves': trial.suggest_int('num_leaves', 20, 150),
            'subsample': trial.suggest_float('subsample', 0.6, 1.0),
            'colsample_bytree': trial.suggest_float('colsample_bytree', 0.6, 1.0),
            'reg_alpha': trial.suggest_float('reg_alpha', 0, 5),
            'reg_lambda': trial.suggest_float('reg_lambda', 0, 5),
            'random_state': 42,
            'n_jobs': -1
        }

        pipeline = Pipeline([
            ('tfidf', TfidfVectorizer(max_features=5000,  stop_words=self.stopwords_indo)),
            ('classifier', MultiOutputClassifier(lgb.LGBMClassifier(**params)))
        ])
        pipeline.fit(self.X_train, self.y_train)
        y_pred = pipeline.predict(self.X_val)
        return f1_score(self.y_val, y_pred, average='macro')

    def train_all_models(self):
        print("\nStarting Multi-Label Training for All Models...")
        model_trials = {
            "Random Forest": 30,
            "XGBoost": 30,
            "LightGBM": 30
        }
        self.tune_and_train(
            "Random Forest",
            self.objective_rf,
            RandomForestClassifier,
            n_trials=model_trials["Random Forest"])
        self.tune_and_train(
            "XGBoost",
            self.objective_xgb,
            xgb.XGBClassifier,
            n_trials=model_trials["XGBoost"])
        self.tune_and_train(
            "LightGBM",
            self.objective_lgbm,
            lgb.LGBMClassifier,
            n_trials=model_trials["LightGBM"])
        print("\nAll models have been trained.")
        return self.results

    def display_results(self):
        print("\nMODEL COMPARISON RESULTS")
        print("=" * 60)
        rows = []
        for model_name, metrics in self.results.items():
            rows.append({
                'Model': model_name,
                'Validation Accuracy': f"{metrics['validation_accuracy']:.4f}",
                'Test Accuracy': f"{metrics['test_accuracy']:.4f}",
                'Hamming Loss': f"{metrics['hamming_loss']:.4f}",
                'Jaccard Score': f"{metrics['jaccard_score']:.4f}",
                'F1 Score': f"{metrics['f1_score']:.4f}",
                'Training Time (s)': f"{metrics['training_time']:.2f}"
            })

        df = pd.DataFrame(rows)
        print(df.to_string(index=False))

        best_model = max(self.results, key=lambda x: self.results[x]['f1_score'])
        print(f"\nBEST MODEL: {best_model}")
        print("=" * 60)
        return df


In [18]:
# Initialization
trainer = MultiLabelModelTrainer(train_df, val_df, test_df)

In [19]:
trainer.tune_and_train(
    model_name="Random Forest",
    objective_func=trainer.objective_rf,
    base_model_class=RandomForestClassifier,
    n_trials=30
)


Tuning hyperparameters for Random Forest with Optuna...

Best Random Forest params: {'n_estimators': 487, 'max_depth': 20, 'min_samples_split': 15, 'min_samples_leaf': 1, 'max_features': 'sqrt'}

Best Random Forest validation accuracy: 0.5504

Random Forest training completed in 431.01 seconds

F1-score: 0.5610
------------------------------------------------------------


In [20]:
trainer.tune_and_train(
    model_name="XGBoost",
    objective_func=trainer.objective_xgb,
    base_model_class=xgb.XGBClassifier,
    n_trials=30
)


Tuning hyperparameters for XGBoost with Optuna...

Best XGBoost params: {'n_estimators': 91, 'max_depth': 9, 'learning_rate': 0.2284208325275402, 'subsample': 0.8666809992987552, 'colsample_bytree': 0.864848007636768, 'reg_alpha': 1.241951439186747, 'reg_lambda': 1.2570215419454296}

Best XGBoost validation accuracy: 0.9399

XGBoost training completed in 668.29 seconds

F1-score: 0.9428
------------------------------------------------------------


In [24]:
trainer.tune_and_train(
    model_name="LightGBM",
    objective_func=trainer.objective_lgbm,
    base_model_class=lgb.LGBMClassifier,
    n_trials=40
)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
[LightGBM] [Info] Number of positive: 6057, number of negative: 5613
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.045210 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 16052
[LightGBM] [Info] Number of data points in the train set: 11670, number of used features: 423
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.519023 -> initscore=0.076129
[LightGBM] [Info] Start training from score 0.076129
[LightGBM] [Info] Number of positive: 2608, number of negative: 9062
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.037727 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 16052
[LightGBM] [Info] Number of data points in the tr

In [25]:
trainer.display_results()


MODEL COMPARISON RESULTS
        Model Validation Accuracy Test Accuracy Hamming Loss Jaccard Score F1 Score Training Time (s)
Random Forest              0.5504        0.5412       0.0997        0.4711   0.5610            431.01
      XGBoost              0.9397        0.8567       0.0259        0.9014   0.9436           1127.16
     LightGBM              0.9386        0.8519       0.0272        0.8946   0.9395            243.34

BEST MODEL: XGBoost


Unnamed: 0,Model,Validation Accuracy,Test Accuracy,Hamming Loss,Jaccard Score,F1 Score,Training Time (s)
0,Random Forest,0.5504,0.5412,0.0997,0.4711,0.561,431.01
1,XGBoost,0.9397,0.8567,0.0259,0.9014,0.9436,1127.16
2,LightGBM,0.9386,0.8519,0.0272,0.8946,0.9395,243.34


In [26]:
# Make folder to save models
os.makedirs('/content/drive/MyDrive/Project/ecommerce-review-classifier/saved_models', exist_ok=True)

# Model from result
rf_model = trainer.results['Random Forest']['model']
xgb_model = trainer.results['XGBoost']['model']
lgb_model = trainer.results['LightGBM']['model']

# Save model
joblib.dump(rf_model, '/content/drive/MyDrive/Project/ecommerce-review-classifier/saved_models/random_forest_model.pkl')
joblib.dump(xgb_model, '/content/drive/MyDrive/Project/ecommerce-review-classifier/saved_models/xgboost_model.pkl')
joblib.dump(lgb_model, '/content/drive/MyDrive/Project/ecommerce-review-classifier/saved_models/lgb_model.pkl')

print("\nAll models saved!")


All models saved!
