In [None]:
!pip install autofeat

In [None]:
!pip install optuna

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score, roc_auc_score, confusion_matrix, classification_report, balanced_accuracy_score
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from imblearn.under_sampling import TomekLinks
from imblearn.combine import SMOTETomek
from autofeat import AutoFeatRegressor, AutoFeatClassifier
import optuna
from sklearn.model_selection import KFold, StratifiedKFold

In [None]:
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# All competition tables are read from the same Drive folder, so the folder is named once.
# The stray `from re import template` auto-import was removed: `template` is never used in this
# notebook, and the import fails on Python versions where `re.template` no longer exists.
DATA_DIR = '/content/drive/MyDrive/Intellectra 2025/dataset'

train_label = pd.read_csv(f'{DATA_DIR}/train_label_data.csv')
member = pd.read_csv(f'{DATA_DIR}/member_data.csv')
train_transaction = pd.read_csv(f'{DATA_DIR}/train_transaction_data.csv')
product = pd.read_csv(f'{DATA_DIR}/product_data.csv')
program = pd.read_csv(f'{DATA_DIR}/prodgram_data.csv')
test_transaction = pd.read_csv(f'{DATA_DIR}/test_transaction_data.csv')
template_submission = pd.read_csv(f'{DATA_DIR}/sample_submission.csv')

### Test

In [None]:
# Left-join the member, product and program tables onto the test transactions in one chain.
test = (
    test_transaction
    .merge(member, how='left', on='MemberID')
    .merge(product, how='left', left_on='FK_PRODUCT_ID', right_on='productID')
    .merge(program, how='left', left_on='FK_PROD_GRAM_ID', right_on='prodgramID')
)

test

In [None]:
test['Price'].value_counts()

In [None]:
missing_values = test.isnull().sum()

missing_values = missing_values[missing_values > 0]

print("Missing values per kolom:")
print(missing_values)

In [None]:
cols_to_drop = ['FK_PRODUCT_ID', 'productID', 'FK_PROD_GRAM_ID', 'prodgramID', 'TransactionID']
test = test.drop(columns=[col for col in cols_to_drop if col in test.columns])

test.head()

In [None]:
test.info()

In [None]:
# DateOfBirth is removed; missing PricePerUnit values are replaced by the column median.
test = test.drop(columns='DateOfBirth')

median_price = test['PricePerUnit'].median()
test = test.assign(PricePerUnit=test['PricePerUnit'].fillna(median_price))

In [None]:
# Convert the date columns to pandas datetimes so the .dt accessors can be used on them.
test['JoinDate'] = pd.to_datetime(test['JoinDate'])
# Parsed as UTC, then the timezone information is stripped so the column is tz-naive.
test['TransactionDatetime'] = pd.to_datetime(test['TransactionDatetime'], utc=True).dt.tz_localize(None)

# errors='coerce' turns values that cannot be parsed into NaT instead of raising.
test['EldestKidDOB'] = pd.to_datetime(test['EldestKidDOB'], errors='coerce')
test['YoungestKidDOB'] = pd.to_datetime(test['YoungestKidDOB'], errors='coerce')

In [None]:
test.head()

In [None]:
test.info()

In [None]:
# Calendar parts of the transaction timestamp.
test['Trans_Year']     = test['TransactionDatetime'].dt.year
test['Trans_Month']    = test['TransactionDatetime'].dt.month
test['Trans_Day']      = test['TransactionDatetime'].dt.day
test['Trans_Weekday']  = test['TransactionDatetime'].dt.weekday
test['Trans_Hour']     = test['TransactionDatetime'].dt.hour
# .dt.weekday counts Monday as 0, so 5 and 6 are Saturday and Sunday.
test['IsWeekend'] = test['Trans_Weekday'].isin([5, 6]).astype(int)

# Whole days between the join date and the transaction.
test['Days_Since_Join'] = (test['TransactionDatetime'] - test['JoinDate']).dt.days

# Calendar parts of the join date.
test['Join_Year']     = test['JoinDate'].dt.year
test['Join_Month']    = test['JoinDate'].dt.month
test['Join_Weekday']  = test['JoinDate'].dt.weekday
test['Join_Day']      = test['JoinDate'].dt.day

# Eldest kid: birth year/month and age at transaction time (days, and days / 365 as years).
test['EldestKid_Year']    = test['EldestKidDOB'].dt.year
test['EldestKid_Month']   = test['EldestKidDOB'].dt.month
test['EldestKid_AgeDays'] = (test['TransactionDatetime'] - test['EldestKidDOB']).dt.days
test['EldestKid_AgeYears'] = test['EldestKid_AgeDays'] / 365

# Youngest kid: same set of features.
test['YoungestKid_Year']    = test['YoungestKidDOB'].dt.year
test['YoungestKid_Month']   = test['YoungestKidDOB'].dt.month
test['YoungestKid_AgeDays'] = (test['TransactionDatetime'] - test['YoungestKidDOB']).dt.days
test['YoungestKid_AgeYears'] = test['YoungestKid_AgeDays'] / 365

In [None]:
test = test.drop(columns=['JoinDate', 'EldestKidDOB', 'YoungestKidDOB', 'TransactionDatetime'])

In [None]:
# IsWeekend is cast to int (the cell that creates it already applies .astype(int)).
test['IsWeekend'] = test['IsWeekend'].astype(int)

# NOTE(review): the kid DOB columns were parsed with errors='coerce', so they may contain NaT;
# a NaT would make the age in days NaN and this plain int cast would raise — confirm none exist.
test['EldestKid_AgeDays'] = test['EldestKid_AgeDays'].astype(int)
test['YoungestKid_AgeDays'] = test['YoungestKid_AgeDays'].astype(int)

In [None]:
test.info()

In [None]:
data_test = test.drop(columns=['MemberID'])

In [None]:
# Columns that are replaced by integer codes.
categorical_cols = ['City', 'Source', 'ProductName', 'ProductCategory', 'ProductLevel', 'GrammageName']

# NOTE(review): the encoder is re-fit on each column of data_test only, so the codes depend on
# the categories present in this frame; the train section fits its own encoder separately —
# confirm that train and test codes refer to the same categories.
le = LabelEncoder()
for col in categorical_cols:
    data_test[col] = le.fit_transform(data_test[col])

In [None]:
# Widen every int32 column to int64.
int32_cols = data_test.select_dtypes('int32').columns
data_test[int32_cols] = data_test[int32_cols].astype('int64')

# NOTE(review): this scaler is fit on data_test itself, so the test columns are standardised
# with test statistics rather than those of the training data — confirm this is intended.
scaler = StandardScaler()
cols_to_scale = ['PricePerUnit', 'Price', 'EldestKid_AgeYears', 'YoungestKid_AgeYears']
data_test[cols_to_scale] = scaler.fit_transform(data_test[cols_to_scale])

In [None]:
data_test.info()

### Train

In [None]:
# Start from the labels and left-join member, transaction, product and program tables in one chain.
train = (
    train_label
    .merge(member, how='left', on='MemberID')
    .merge(train_transaction, how='left', on='MemberID')
    .merge(product, how='left', left_on='FK_PRODUCT_ID', right_on='productID')
    .merge(program, how='left', left_on='FK_PROD_GRAM_ID', right_on='prodgramID')
)

train

In [None]:
columns_to_drop = ['FK_PRODUCT_ID', 'productID', 'FK_PROD_GRAM_ID', 'prodgramID', 'TransactionID']
train = train.drop(columns=columns_to_drop)

train.head()

In [None]:
train['Price'].value_counts()

In [None]:
missing_values = train.isnull().sum()

missing_values = missing_values[missing_values > 0]

print("Missing values per kolom:")
print(missing_values)

In [None]:
# DateOfBirth is removed; missing PricePerUnit values are replaced by the column median.
train = train.drop(columns='DateOfBirth')

median_price = train['PricePerUnit'].median()
train = train.assign(PricePerUnit=train['PricePerUnit'].fillna(median_price))

In [None]:
# Convert the date columns to pandas datetimes so the .dt accessors can be used on them.
train['JoinDate'] = pd.to_datetime(train['JoinDate'])
# Parsed as UTC, then the timezone information is stripped so the column is tz-naive.
train['TransactionDatetime'] = pd.to_datetime(train['TransactionDatetime'], utc=True).dt.tz_localize(None)

# errors='coerce' turns values that cannot be parsed into NaT instead of raising.
train['EldestKidDOB'] = pd.to_datetime(train['EldestKidDOB'], errors='coerce')
train['YoungestKidDOB'] = pd.to_datetime(train['YoungestKidDOB'], errors='coerce')

In [None]:
train.head()

In [None]:
train.info()

In [None]:
# Calendar parts of the transaction timestamp.
train['Trans_Year']     = train['TransactionDatetime'].dt.year
train['Trans_Month']    = train['TransactionDatetime'].dt.month
train['Trans_Day']      = train['TransactionDatetime'].dt.day
train['Trans_Weekday']  = train['TransactionDatetime'].dt.weekday
train['Trans_Hour']     = train['TransactionDatetime'].dt.hour
# .dt.weekday counts Monday as 0, so 5 and 6 are Saturday and Sunday.
train['IsWeekend'] = train['Trans_Weekday'].isin([5, 6]).astype(int)

# Whole days between the join date and the transaction.
train['Days_Since_Join'] = (train['TransactionDatetime'] - train['JoinDate']).dt.days

# Calendar parts of the join date.
train['Join_Year']     = train['JoinDate'].dt.year
train['Join_Month']    = train['JoinDate'].dt.month
train['Join_Weekday']  = train['JoinDate'].dt.weekday
train['Join_Day']      = train['JoinDate'].dt.day

# Eldest kid: birth year/month and age at transaction time (days, and days / 365 as years).
train['EldestKid_Year']    = train['EldestKidDOB'].dt.year
train['EldestKid_Month']   = train['EldestKidDOB'].dt.month
train['EldestKid_AgeDays'] = (train['TransactionDatetime'] - train['EldestKidDOB']).dt.days
train['EldestKid_AgeYears'] = train['EldestKid_AgeDays'] / 365

# Youngest kid: same set of features.
train['YoungestKid_Year']    = train['YoungestKidDOB'].dt.year
train['YoungestKid_Month']   = train['YoungestKidDOB'].dt.month
train['YoungestKid_AgeDays'] = (train['TransactionDatetime'] - train['YoungestKidDOB']).dt.days
train['YoungestKid_AgeYears'] = train['YoungestKid_AgeDays'] / 365

In [None]:
train = train.drop(columns=['JoinDate', 'EldestKidDOB', 'YoungestKidDOB', 'TransactionDatetime'])

In [None]:
# IsWeekend is cast to int (the cell that creates it already applies .astype(int)).
train['IsWeekend'] = train['IsWeekend'].astype(int)

# NOTE(review): the kid DOB columns were parsed with errors='coerce', so they may contain NaT;
# a NaT would make the age in days NaN and this plain int cast would raise — confirm none exist.
train['EldestKid_AgeDays'] = train['EldestKid_AgeDays'].astype(int)
train['YoungestKid_AgeDays'] = train['YoungestKid_AgeDays'].astype(int)

In [None]:
train.info()

In [None]:
data_train = train.drop(columns=['MemberID'])

In [None]:
categorical_cols = ['City', 'Source', 'ProductName', 'ProductCategory', 'ProductLevel', 'GrammageName']

# One encoder per column, fit on the categories of train AND test together, then applied to
# both frames. Fitting a separate encoder on each frame assigns codes from that frame's own
# sorted categories, so the same integer could stand for different categories in train and
# test. The raw values are read from `train` / `test` (which still hold the un-encoded
# columns), so re-running this cell gives the same result.
label_encoders = {}
for col in categorical_cols:
    le = LabelEncoder()
    le.fit(pd.concat([train[col], test[col]], ignore_index=True))
    data_train[col] = le.transform(train[col])
    data_test[col] = le.transform(test[col])
    label_encoders[col] = le  # kept so a column's mapping can be inspected or inverted later

In [None]:
# Widen every int32 column to int64.
int32_cols = data_train.select_dtypes('int32').columns
data_train[int32_cols] = data_train[int32_cols].astype('int64')

cols_to_scale = ['PricePerUnit', 'Price', 'EldestKid_AgeYears', 'YoungestKid_AgeYears']

# Fit the scaler on the raw training values only and reuse the same mean/std for the test
# frame; standardising test with its own statistics puts the two frames on different scales.
# Reading the raw values from `train` / `test` keeps the cell idempotent on re-run.
scaler = StandardScaler()
data_train[cols_to_scale] = scaler.fit_transform(train[cols_to_scale])
data_test[cols_to_scale] = scaler.transform(test[cols_to_scale])

In [None]:
data_train[cols_to_scale].head()

In [None]:
data_train['Price'].value_counts()

In [None]:
print(data_train.dtypes.value_counts())
print(data_train.describe())

In [None]:
data_train.info()

In [None]:
data_train.to_csv('data_train.csv', index=False)

from google.colab import files
files.download('data_train.csv')

In [None]:
data_test.to_csv('data_test.csv', index=False)

from google.colab import files
files.download('data_test.csv')

### By-pass data

In [None]:
# Shortcut: read previously exported frames from Drive instead of re-running the preprocessing.
# NOTE(review): on a top-to-bottom run this overwrites the data_train / data_test built by the
# cells above with whatever copies are stored on Drive — confirm those files are current.
data_train = pd.read_csv('/content/drive/MyDrive/Intellectra 2025/dataset/data_train.csv')
data_test = pd.read_csv('/content/drive/MyDrive/Intellectra 2025/dataset/data_test.csv')

In [None]:
# Shortcut: read previously exported AutoFeat frames from Drive.
# NOTE(review): the Auto Feature Engineering cells further down assign data_train_feat and
# data_test_feat again, so these loaded copies only matter if those cells are skipped.
data_train_feat = pd.read_csv('/content/drive/MyDrive/Intellectra 2025/dataset/data_train_feat.csv')
data_test_feat = pd.read_csv('/content/drive/MyDrive/Intellectra 2025/dataset/data_test_feat.csv')

### Ope

### **Auto Feature Engineering**

In [None]:
# Separate the features from the next_buy label.
X = data_train.drop(columns=['next_buy'])
y = data_train['next_buy']

# NOTE(review): AutoFeat is fit on the complete training frame here, before the train/validation
# split made later in the notebook — confirm that the validation scores are not meant to be
# independent of the feature-generation step.
autofeat = AutoFeatClassifier(verbose=1, feateng_steps=1)
X_feat = autofeat.fit_transform(X, y)

# Put the label back next to the generated features.
data_train_feat = X_feat.copy()
data_train_feat['next_buy'] = y.values

In [None]:
original_columns = X.columns.tolist()

X_test = data_test[original_columns]

In [None]:
X_feat_test = autofeat.transform(X_test)

data_test_feat = X_feat_test.copy()

In [None]:
data_train_feat.to_csv('data_train_feat.csv', index=False)

from google.colab import files
files.download('data_train_feat.csv')

In [None]:
data_test_feat.to_csv('data_test_feat.csv', index=False)

from google.colab import files
files.download('data_test_feat.csv')

In [None]:
# Train features (label column excluded) must match the test features in both name and order.
train_feature_cols = data_train_feat.drop(columns=['next_buy']).columns.tolist()
columns_match = train_feature_cols == data_test_feat.columns.tolist()

print(
    "Kolom train dan test sudah sama persis (nama & urutan)."
    if columns_match
    else "Kolom tidak sama atau urutannya berbeda."
)

In [None]:
data_train_feat.info()

In [None]:
data_test_feat.info()

### **Correlation check**

**Imbalance Handling and Feature Engineering**

In [None]:
print(data_train_feat['next_buy'].value_counts())
print(data_train_feat['next_buy'].value_counts(normalize=True))

counts = data_train_feat['next_buy'].value_counts()
imbalance_ratio = counts.min() / counts.max()
print("Rasio kelas minoritas terhadap mayoritas:", imbalance_ratio)

sns.countplot(x='next_buy', data=data_train_feat)
plt.title('Distribusi Kelas next_buy')
plt.show()

**Model Selection and Training**

In [None]:
X = data_train_feat.drop(columns=['next_buy'])
y = data_train_feat['next_buy']

X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

In [None]:
from collections import Counter

def show_distribution(y, name=""):
    """Print one line per distinct label in `y` with its count and its share of all items.

    Parameters
    ----------
    y : iterable of hashable labels
    name : str
        Prefix printed at the start of every line.
    """
    tally = Counter(y)
    n_items = sum(tally.values())
    report = [
        f"{name} - Class {label}: {freq} ({freq/n_items:.2%})"
        for label, freq in tally.items()
    ]
    for line in report:
        print(line)

show_distribution(y_train, "Train")
show_distribution(y_valid, "Valid")

In [None]:
from collections import Counter
print("Sebelum balancing:", Counter(y_train))

In [None]:
# Resample the training split only; X_valid / y_valid are left untouched.
smote_tomek = SMOTETomek(random_state=42)

X_train_resampled, y_train_resampled = smote_tomek.fit_resample(X_train, y_train)

# NOTE(review): the Optuna objectives below cross-validate inside this already-resampled set —
# confirm that folds containing resampled rows are acceptable for the tuning scores.
from collections import Counter
print("Setelah SMOTETomek:", Counter(y_train_resampled))

In [None]:
sns.countplot(x=y_train_resampled)
plt.title("Distribusi Label Setelah SMOTETomek")
plt.show()

### **Param Search Optuna**

**Tuning LGBM**

In [None]:
def balanced_accuracy_eval(y_pred, dataset):
    """Custom LightGBM evaluation metric: balanced accuracy at a 0.5 decision threshold.

    Supports both ways LightGBM can call a custom metric:

    * native API ``feval(preds, eval_data)``: the second argument is a Dataset that exposes
      ``get_label()``;
    * scikit-learn API ``fit(eval_metric=func)``: a two-argument callable is invoked as
      ``func(y_true, y_pred)``, i.e. labels first and predictions second, both as arrays.
      The previous implementation called ``get_label()`` on the prediction array in this case.

    Parameters
    ----------
    y_pred : array-like
        Predictions (native API) or true labels (scikit-learn API).
    dataset : lightgbm.Dataset or array-like
        Evaluation Dataset (native API) or predictions (scikit-learn API).

    Returns
    -------
    tuple
        ``('balanced_accuracy', score, True)``; the last element marks the metric as
        higher-is-better.
    """
    if hasattr(dataset, "get_label"):
        # Native convention: (predictions, Dataset).
        y_true, y_score = dataset.get_label(), y_pred
    else:
        # scikit-learn convention: (labels, predictions).
        y_true, y_score = y_pred, dataset

    y_pred_binary = (np.asarray(y_score) > 0.5).astype(int)
    score = balanced_accuracy_score(y_true, y_pred_binary)
    return 'balanced_accuracy', score, True

In [None]:
def objective(trial):
    """Optuna objective for LightGBM: mean 5-fold balanced accuracy on the resampled train set.

    Reads ``X_train_resampled``, ``y_train_resampled`` and ``scale_pos_weight`` from module
    scope, so those must be defined before the study is optimised.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample the hyperparameters.

    Returns
    -------
    float
        Mean balanced accuracy over the five stratified folds.
    """
    max_depth = trial.suggest_int("max_depth", 3, 8)
    # Bound num_leaves by what a tree of this depth can hold (2 ** max_depth), capped at 200.
    max_leaves = min(200, 2 ** max_depth)
    min_leaves = min(max_leaves, 2 ** (max_depth - 1))

    # `colsample_bytree` and `subsample_freq` are no longer suggested: they are aliases of
    # `feature_fraction` and `bagging_freq`, so sampling both only added search dimensions
    # for values that conflict with each other.
    param = {
        "objective": "binary",
        "verbosity": -1,
        "boosting_type": "gbdt",
        "random_state": 42,
        "n_jobs": -1,
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.2, log=True),
        "num_leaves": trial.suggest_int("num_leaves", min_leaves, max_leaves),
        "max_depth": max_depth,
        "min_child_samples": trial.suggest_int("min_child_samples", 10, 100),
        "feature_fraction": trial.suggest_float("feature_fraction", 0.6, 1.0),
        "bagging_fraction": trial.suggest_float("bagging_fraction", 0.6, 1.0),
        "bagging_freq": trial.suggest_int("bagging_freq", 1, 7),
        "lambda_l1": trial.suggest_float("lambda_l1", 1e-8, 10.0, log=True),
        "lambda_l2": trial.suggest_float("lambda_l2", 1e-8, 10.0, log=True),
        "scale_pos_weight": scale_pos_weight,
    }

    kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    balanced_scores = []

    # StratifiedKFold.split needs the labels to stratify on; calling it with X alone raises.
    for fold, (train_idx, val_idx) in enumerate(kf.split(X_train_resampled, y_train_resampled)):
        X_tr, X_val_ = X_train_resampled.iloc[train_idx], X_train_resampled.iloc[val_idx]
        y_tr, y_val_ = y_train_resampled.iloc[train_idx], y_train_resampled.iloc[val_idx]

        model = lgb.LGBMClassifier(
            n_estimators=1000,
            **param
        )

        # Early stopping is evaluated on the held-out fold.
        model.fit(
            X_tr, y_tr,
            eval_set=[(X_val_, y_val_)],
            eval_metric=balanced_accuracy_eval,
            callbacks=[lgb.early_stopping(50), lgb.log_evaluation(0)]
        )

        y_pred = model.predict(X_val_)
        bal_acc = balanced_accuracy_score(y_val_, y_pred)
        print(f"Fold {fold+1} Balanced Accuracy: {bal_acc:.4f}")
        balanced_scores.append(bal_acc)

    return np.mean(balanced_scores)

In [None]:
# SQLite file on Drive in which the Optuna study is stored.
db_path = "/content/drive/MyDrive/Intellectra 2025/dataset/optuna/optuna_study.db"

# np.bincount returns the count of each label value; objective() reads scale_pos_weight from
# module scope. NOTE(review): the two-value unpacking assumes the labels are exactly 0 and 1.
neg, pos = np.bincount(y_train_resampled)
scale_pos_weight = neg / pos

# load_if_exists=True reuses a study already stored under this name instead of raising.
study = optuna.create_study(
    study_name="lgbm_optuna",
    direction='maximize',
    storage=f"sqlite:///{db_path}",
    load_if_exists=True
)

study.optimize(objective, n_trials=100, show_progress_bar=True)

In [None]:
# objective() trains on the resampled training set, so the class weight is derived from the
# resampled labels as well (same definition as in the cell that created the study). Using the
# un-resampled y_train here would weight the positive class by the original class ratio on
# top of the resampling, and would make these trials inconsistent with the earlier ones.
neg, pos = np.bincount(y_train_resampled)
scale_pos_weight = neg / pos

# Continue the stored study with 30 more trials.
study = optuna.load_study(
    study_name="lgbm_optuna",
    storage="sqlite:////content/drive/MyDrive/Intellectra 2025/dataset/optuna/optuna_study.db"
)
study.optimize(objective, n_trials=30, show_progress_bar=True)

In [None]:
import json

with open("/content/drive/MyDrive/Intellectra 2025/backup_model and param/best_lgbm_params_2.json", "w") as f:
    json.dump(study.best_trial.params, f)

In [None]:
import joblib

joblib.dump(study, "/content/drive/MyDrive/Intellectra 2025/backup_model and param/study_lgbm.pkl")

In [None]:
print("Best trial:")
print("  Balanced Accuracy:", study.best_value)
print("  Best Params:")
for key, value in study.best_params.items():
    print(f"    {key}: {value}")

**Tuning LogReg**

In [None]:
from sklearn.pipeline import make_pipeline

def objective_logreg(trial):
    """Optuna objective for a scaled logistic regression.

    Samples ``C`` and ``penalty``, picks the solver from the penalty, and returns the mean
    balanced accuracy over a shuffled 5-fold split of the module-level
    ``X_train_resampled`` / ``y_train_resampled``.
    """
    reg_strength = trial.suggest_float("C", 1e-3, 10.0, log=True)
    penalty_kind = trial.suggest_categorical("penalty", ["l1", "l2"])
    solver_name = "lbfgs" if penalty_kind != "l1" else "liblinear"

    splitter = KFold(n_splits=5, shuffle=True, random_state=42)
    fold_scores = []

    for fold_no, (fit_rows, holdout_rows) in enumerate(splitter.split(X_train_resampled), start=1):
        X_fit = X_train_resampled.iloc[fit_rows]
        X_holdout = X_train_resampled.iloc[holdout_rows]
        y_fit = y_train_resampled.iloc[fit_rows]
        y_holdout = y_train_resampled.iloc[holdout_rows]

        classifier = LogisticRegression(
            C=reg_strength,
            penalty=penalty_kind,
            solver=solver_name,
            class_weight='balanced',
            max_iter=1000,
            random_state=42
        )
        pipeline = make_pipeline(StandardScaler(), classifier)
        pipeline.fit(X_fit, y_fit)

        fold_score = balanced_accuracy_score(y_holdout, pipeline.predict(X_holdout))
        print(f"Fold {fold_no} Balanced Accuracy: {fold_score:.4f}")
        fold_scores.append(fold_score)

    return np.mean(fold_scores)

In [None]:
study_logreg = optuna.create_study(direction='maximize')
study_logreg.optimize(objective_logreg, n_trials=100, show_progress_bar=True)

In [None]:
import json

with open("/content/drive/MyDrive/Intellectra 2025/backup_model and param/best_logreg_params.json", "w") as f:
    json.dump(study_logreg.best_trial.params, f)

In [None]:
import joblib

joblib.dump(study_logreg, "/content/drive/MyDrive/Intellectra 2025/backup_model and param/study_logreg.pkl")

In [None]:
print("Best trial:")
print("  Balanced Accuracy:", study_logreg.best_value)
print("  Best Params:")
for key, value in study_logreg.best_params.items():
    print(f"    {key}: {value}")

**Tuning RF**

In [None]:
def objective_rf(trial):
    """Optuna objective for a random forest.

    Samples ``n_estimators``, ``max_depth`` and ``min_samples_leaf`` and returns the mean
    balanced accuracy over a shuffled 5-fold split of the module-level
    ``X_train_resampled`` / ``y_train_resampled``.
    """
    n_trees = trial.suggest_int("n_estimators", 50, 300)
    depth = trial.suggest_int("max_depth", 3, 15)
    leaf_size = trial.suggest_int("min_samples_leaf", 1, 20)

    splitter = KFold(n_splits=5, shuffle=True, random_state=42)
    fold_scores = []

    for fold_no, (fit_rows, holdout_rows) in enumerate(splitter.split(X_train_resampled), start=1):
        forest = RandomForestClassifier(
            n_estimators=n_trees,
            max_depth=depth,
            min_samples_leaf=leaf_size,
            class_weight="balanced",
            random_state=42,
            n_jobs=-1,
        )
        forest.fit(X_train_resampled.iloc[fit_rows], y_train_resampled.iloc[fit_rows])

        holdout_pred = forest.predict(X_train_resampled.iloc[holdout_rows])
        fold_score = balanced_accuracy_score(y_train_resampled.iloc[holdout_rows], holdout_pred)
        print(f"Fold {fold_no} Balanced Accuracy: {fold_score:.4f}")
        fold_scores.append(fold_score)

    return np.mean(fold_scores)

In [None]:
study_rf = optuna.create_study(direction='maximize')
study_rf.optimize(objective_rf, n_trials=100, show_progress_bar=True)

In [None]:
with open("/content/drive/MyDrive/Intellectra 2025/backup_model and param/best_rf_params.json", "w") as f:
    json.dump(study_rf.best_trial.params, f)

In [None]:
import joblib

joblib.dump(study_rf, "/content/drive/MyDrive/Intellectra 2025/backup_model and param/study_rf.pkl")

### Evaluate and Model Train

In [None]:
from sklearn.metrics import f1_score, balanced_accuracy_score, roc_auc_score, confusion_matrix

def evaluate_model(name, model, X_train, y_train, X_val, y_val):
    """Print F1, balanced accuracy, AUC and the confusion matrix for a fitted model.

    The report is printed first for the training data and then for the validation data.
    AUC is shown as "N/A" when the model has no ``predict_proba``; a DataFrame returned by
    ``predict_proba`` is handled as well as an array.

    Parameters
    ----------
    name : str
        Title printed above the report.
    model : fitted estimator with ``predict`` (and optionally ``predict_proba``)
    X_train, y_train : training features and labels
    X_val, y_val : validation features and labels
    """
    print(f"\n=== {name} ===")

    def _report(split_name, features, labels):
        predicted = model.predict(features)

        positive_scores = None
        if hasattr(model, "predict_proba"):
            proba = model.predict_proba(features)
            # Second column holds the score of the positive class.
            positive_scores = proba.iloc[:, 1] if isinstance(proba, pd.DataFrame) else proba[:, 1]

        f1 = f1_score(labels, predicted)
        bal_acc = balanced_accuracy_score(labels, predicted)
        auc = "N/A" if positive_scores is None else roc_auc_score(labels, positive_scores)

        print(f"\n--- {split_name} ---")
        print("F1 Score          :", round(f1, 4))
        print("Balanced Accuracy :", round(bal_acc, 4))
        print("AUC Score         :", "N/A" if auc == "N/A" else round(auc, 4))
        print("Confusion Matrix:\n", confusion_matrix(labels, predicted))

    for split_name, features, labels in (
        ("Train", X_train, y_train),
        ("Validation/Test", X_val, y_val),
    ):
        _report(split_name, features, labels)

In [None]:
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression

# Copy the tuned parameters so the fixed settings below are added to a local dict.
best_logreg_params = dict(study_logreg.best_params)
best_logreg_params.update({
    # The study only stores C and penalty. The solver has to be derived the same way as in
    # objective_logreg: the default 'lbfgs' solver does not support an l1 penalty and would raise.
    "solver": "liblinear" if best_logreg_params.get("penalty") == "l1" else "lbfgs",
    "class_weight": "balanced",
    "random_state": 42,
    "max_iter": 1000
})

# Same pipeline shape as during tuning: standardise, then logistic regression.
best_logreg_model = make_pipeline(
    StandardScaler(),
    LogisticRegression(**best_logreg_params)
)

best_logreg_model.fit(X_train_resampled, y_train_resampled)
evaluate_model("Logistic Regression (Optuna Tuned)", best_logreg_model, X_train_resampled, y_train_resampled, X_valid, y_valid)

In [None]:
import json

with open("/content/drive/MyDrive/Intellectra 2025/backup_model and param/best_rf_params.json", "r") as f:
    best_rf_params = json.load(f)

In [None]:
from sklearn.ensemble import RandomForestClassifier

# best_rf_params = study_rf.best_params
best_rf_params.update({
    "class_weight": "balanced",
    "random_state": 42,
    "n_jobs": -1
})

best_rf_model = RandomForestClassifier(
    **best_rf_params
)

best_rf_model.fit(X_train_resampled, y_train_resampled)
evaluate_model("Random Forest (Optuna Tuned)", best_rf_model, X_train_resampled, y_train_resampled, X_valid, y_valid)

In [None]:
import json

with open("/content/drive/MyDrive/Intellectra 2025/backup_model and param/best_lgbm_params_1.json", "r") as f:
    best_params = json.load(f)

In [None]:
# Ratio of the two label counts in the resampled training labels.
neg, pos = np.bincount(y_train_resampled)
scale_pos_weight = neg / pos

# Fixed settings are layered on top of the tuned parameters loaded from JSON.
best_params.update({
    "objective": "binary",
    "random_state": 42,
    "scale_pos_weight": scale_pos_weight,
    "n_jobs": -1
})

best_lgb_model = lgb.LGBMClassifier(
    n_estimators=1000,
    **best_params
)

# NOTE(review): this fit passes no eval_set / early-stopping callback, and the following cell
# builds and fits `best_lgb_model` again — confirm this cell is still needed.
best_lgb_model.fit(X_train_resampled, y_train_resampled)

In [None]:
# Ratio of the two label counts in the resampled training labels.
neg, pos = np.bincount(y_train_resampled)
scale_pos_weight = neg / pos

# best_params = study.best_params
# Fixed settings are layered on top of the tuned parameters.
best_params.update({
    "objective": "binary",
    "random_state": 42,
    "scale_pos_weight": scale_pos_weight,
    "n_jobs": -1
})

best_lgb_model = lgb.LGBMClassifier(
    n_estimators=1000,
    **best_params
)

# NOTE(review): early stopping monitors (X_valid, y_valid), and the evaluation right below
# reports on that same validation set — confirm the validation scores are read with that in mind.
best_lgb_model.fit(X_train_resampled, y_train_resampled,
                   eval_set=[(X_valid, y_valid)],
                   eval_metric="binary_logloss",
                   callbacks=[lgb.early_stopping(50), lgb.log_evaluation(10)])
evaluate_model("LightGBM (Optuna Tuned)", best_lgb_model, X_train_resampled, y_train_resampled, X_valid, y_valid)

In [None]:
import joblib

joblib.dump(best_lgb_model, "best_lgb_model_1.pkl")

from google.colab import files
files.download("best_lgb_model_1.pkl")

In [None]:
neg, pos = np.bincount(y_train_resampled)
scale_pos_weight = neg / pos

logreg = LogisticRegression(max_iter=1000, class_weight='balanced')
logreg.fit(X_train_resampled, y_train_resampled)
evaluate_model("Logistic Regression", logreg, X_train_resampled, y_train_resampled, X_valid, y_valid)

rf = RandomForestClassifier(n_estimators=50, random_state=42, class_weight='balanced')
rf.fit(X_train_resampled, y_train_resampled)
evaluate_model("Random Forest", rf, X_train_resampled, y_train_resampled, X_valid, y_valid)

lgb_model = lgb.LGBMClassifier(n_estimators=100, random_state=42,
                               scale_pos_weight=scale_pos_weight)
lgb_model.fit(X_train_resampled, y_train_resampled)
evaluate_model("LightGBM", lgb_model, X_train_resampled, y_train_resampled, X_valid, y_valid)

**Ensemble Stacking**

In [None]:
from sklearn.metrics import f1_score, balanced_accuracy_score, roc_auc_score, confusion_matrix

def evaluate_model_balacc(name, model, X_train, y_train, X_val, y_val):
    """Report F1, balanced accuracy, AUC and the confusion matrix on train and validation data.

    Parameters
    ----------
    name : str
        Model name printed as the report title.
    model : fitted estimator
        Must provide ``predict``; ``predict_proba`` is used for the AUC when available,
        otherwise the AUC is printed as "N/A".
    X_train, y_train : training data
    X_val, y_val : validation/test data
    """
    print(f"\n=== {name} ===")

    def _report(split_name, features, labels):
        predicted = model.predict(features)
        if hasattr(model, "predict_proba"):
            positive_scores = model.predict_proba(features)[:, 1]
        else:
            positive_scores = None

        f1 = f1_score(labels, predicted)
        bal_acc = balanced_accuracy_score(labels, predicted)
        auc = "N/A" if positive_scores is None else roc_auc_score(labels, positive_scores)

        print(f"\n--- {split_name} ---")
        print(f"F1 Score          : {round(f1, 4)}")
        print(f"Balanced Accuracy : {round(bal_acc, 4)}")
        print(f"AUC Score         : {'N/A' if auc == 'N/A' else round(auc, 4)}")
        print("Confusion Matrix  :\n", confusion_matrix(labels, predicted))

    for split_name, features, labels in (
        ("Train", X_train, y_train),
        ("Validation/Test", X_val, y_val),
    ):
        _report(split_name, features, labels)

In [None]:
from sklearn.ensemble import StackingClassifier
from xgboost import XGBClassifier
from sklearn.metrics import balanced_accuracy_score

# Level-0 models: the tuned random forest and the tuned LightGBM classifier.
base_estimators = [
    ('rf', best_rf_model),
    ('lgbm', best_lgb_model)
]

# Level-1 model that combines the base models' outputs.
# NOTE(review): `use_label_encoder` may be ignored or rejected by the installed XGBoost
# version — confirm it is still needed.
meta_learner = XGBClassifier(
    n_estimators=100,
    learning_rate=0.1,
    use_label_encoder=False,
    eval_metric='logloss',
    random_state=42
)

# passthrough=False: the meta learner sees only the base models' predictions, not the raw features.
stacking_model = StackingClassifier(
    estimators=base_estimators,
    final_estimator=meta_learner,
    passthrough=False,
    cv=5,
    n_jobs=-1
)

stacking_model.fit(X_train_resampled, y_train_resampled)

evaluate_model_balacc(
    name="Stacking RF + LGBM + Meta XGB",
    model=stacking_model,
    X_train=X_train_resampled,
    y_train=y_train_resampled,
    X_val=X_valid,
    y_val=y_valid
)

In [None]:
import shap

X_sample = X_valid.sample(n=200, random_state=42)

explainer = shap.TreeExplainer(best_lgb_model)
shap_values = explainer.shap_values(X_sample)

shap.summary_plot(shap_values, X_sample, max_display=15)

In [None]:
shap_inter = shap.TreeExplainer(best_lgb_model).shap_interaction_values(X_sample)
shap.summary_plot(shap_inter, X_sample)

In [None]:
data_test = data_test[(data_train.drop(columns=['next_buy'])).columns]

In [None]:
# NOTE(review): `rf` is the baseline forest fitted with n_estimators=50, not the tuned
# `best_rf_model` — confirm this is the model intended for the submission.
predictions = rf.predict(data_test_feat)

# One row per test transaction.
# NOTE(review): assumes the rows of `test` are in the same order as data_test_feat — TODO confirm.
submission = pd.DataFrame({
    'MemberID': test['MemberID'],
    'next_buy': predictions.astype(int)
})

# Keep only the first row encountered for each MemberID.
submission = submission.drop_duplicates(subset='MemberID', keep='first')

submission.to_csv('submission_rf.csv', index=False)

from google.colab import files
files.download('submission_rf.csv')

In [None]:
predictions = best_lgb_model.predict(data_test_feat)

submission = pd.DataFrame({
    'MemberID': test['MemberID'],
    'next_buy': predictions.astype(int)
})

submission = submission.drop_duplicates(subset='MemberID', keep='first')

submission.to_csv('submission_lgb_optuna_fulltrain.csv', index=False)

from google.colab import files
files.download('submission_lgb_optuna_fulltrain.csv')

In [None]:
# NOTE(review): `predictor` is not defined anywhere in the visible notebook, so this cell raises
# NameError on a fresh top-to-bottom run — confirm where it is supposed to come from.
predictions = predictor.predict(data_test_feat)

# One row per test transaction.
# NOTE(review): assumes the rows of `test` are in the same order as data_test_feat — TODO confirm.
submission = pd.DataFrame({
    'MemberID': test['MemberID'],
    'next_buy': predictions.astype(int)
})

# Keep only the first row encountered for each MemberID.
submission = submission.drop_duplicates(subset='MemberID', keep='first')

submission.to_csv('submission_auto_best.csv', index=False)

from google.colab import files
files.download('submission_auto_best.csv')

In [None]:
submission['next_buy'].value_counts()

In [None]:
# MemberID is taken from `test`: data_test had that column dropped before encoding and was
# later restricted to the training feature columns, so data_test['MemberID'] raises KeyError.
# `predictions` is whatever the most recently executed prediction cell produced.
submission = pd.DataFrame({
    'MemberID': test['MemberID'],
    'next_buy': predictions.astype(int)
})

# One row per member, as in the other submission cells (first row per MemberID is kept).
submission = submission.drop_duplicates(subset='MemberID', keep='first')

submission.to_csv('submission.csv', index=False)