> LOAD & BASIC CLEANING

In [1]:
import pandas as pd
import numpy as np
import joblib

df = pd.read_csv('data/skill_builder_data.csv', encoding='latin1')

df = df.copy()
df['user_id'] = df['user_id'].astype(str)
df['problem_id'] = df['problem_id'].astype(str)
df['skill_id'] = df['skill_id'].fillna('unknown').astype(str)
df['correct'] = pd.to_numeric(df['correct'], errors='coerce').fillna(0).astype(int)

df['ms_first_response'] = pd.to_numeric(df.get('ms_first_response', 0), errors='coerce').fillna(0)
df['ms_first_response'] = df['ms_first_response'].clip(upper=600000)
df['log_response_time'] = np.log1p(df['ms_first_response'])

for c in ['hint_count','hint_total','attempt_count','opportunity']:
    df[c] = pd.to_numeric(df.get(c,0), errors='coerce').fillna(0).astype(int)

df = df.sort_values(['user_id']).reset_index(drop=True)

  df = pd.read_csv('data/skill_builder_data.csv', encoding='latin1')
  result = getattr(ufunc, method)(*inputs, **kwargs)


> FEATURE ENGINEERING

In [2]:
# past correct
df['past_correct_skill'] = (
    df.groupby(['user_id','skill_id'])['correct']
    .transform(lambda s: s.shift().fillna(0).cumsum())
)

# attempts
df['attempts_skill'] = df.groupby(['user_id','skill_id']).cumcount()

# success rate
df['success_rate_skill'] = df['past_correct_skill'] / df['attempts_skill'].replace(0, np.nan)
df['success_rate_skill'] = df['success_rate_skill'].fillna(0)

# streak
def prev_streak(s):
    prev = s.shift().fillna(0).astype(int).to_numpy()
    out = np.zeros(len(prev), dtype=int)
    cur = 0
    for i, v in enumerate(prev):
        cur = cur + 1 if v == 1 else 0
        out[i] = cur
    return out

df['prev_streak_skill'] = df.groupby(['user_id','skill_id'])['correct'].transform(prev_streak)


# difficulty estimation
prob_stats = df.groupby('problem_id')['correct'].mean()
df['difficulty_est'] = 1 - df['problem_id'].map(prob_stats).fillna(0.5)

> LABEL ENCODING

In [None]:
from sklearn.preprocessing import LabelEncoder

cat_cols = ['skill_id', 'problem_id']
encoders = {}

for c in cat_cols:
    le = LabelEncoder()
    df[c] = le.fit_transform(df[c].astype(str))
    encoders[c] = le

joblib.dump(encoders, 'label_encoders.joblib')

> FEATURE SELECTION (Mutual Information)

In [None]:
from sklearn.feature_selection import mutual_info_classif

candidate_feats = [
    'skill_id','problem_id',
    'past_correct_skill','attempts_skill','success_rate_skill','prev_streak_skill',
    'hint_count','hint_total','attempt_count',
    'log_response_time','difficulty_est','opportunity'
]

X = df[candidate_feats].fillna(0)
y = df['correct']

mi = mutual_info_classif(X, y, random_state=42)
mi_series = pd.Series(mi, index=candidate_feats).sort_values(ascending=False)

top_features = mi_series.index[:10].tolist()
joblib.dump(top_features, 'selected_features.joblib')

print("Selected features:", top_features)

> TEMPORAL SPLIT

In [None]:
def temporal_split(df, ratio=0.1):
    train_idx, test_idx = [], []
    for uid, g in df.groupby('user_id'):
        n = len(g)
        n_test = max(1, int(np.ceil(n * ratio)))
        train_idx += list(g.index[:-n_test])
        test_idx  += list(g.index[-n_test:])
    return df.loc[train_idx], df.loc[test_idx]

train_df, test_df = temporal_split(df)

X_train = train_df[top_features]
y_train = train_df['correct']
X_test  = test_df[top_features]
y_test  = test_df['correct']

> HYPERPARAMETER TUNING (RandomizedSearchCV)

In [None]:
from sklearn.model_selection import RandomizedSearchCV
from lightgbm import LGBMClassifier

param_grid = {
    "num_leaves": [31, 50, 70],
    "learning_rate": [0.01, 0.05, 0.1],
    "n_estimators": [200, 500, 800],
    "max_depth": [-1, 5, 10]
}

base_model = LGBMClassifier(random_state=42)

tuner = RandomizedSearchCV(
    estimator=base_model,
    param_distributions=param_grid,
    n_iter=10,
    scoring="f1",
    cv=3,
    n_jobs=-1,
    verbose=1
)

tuner.fit(X_train, y_train)

best_model = tuner.best_estimator_
print("Best params →", tuner.best_params_)

joblib.dump(best_model, "model/lgb_best.joblib")

> FINAL EVALUATION (TEST SET)

In [None]:
from sklearn.metrics import (
    confusion_matrix,
    ConfusionMatrixDisplay,
    roc_auc_score,
    accuracy_score,
    precision_score
)
import matplotlib.pyplot as plt

# --- Prediksi ---
proba = best_model.predict_proba(X_test)[:, 1]
y_pred = best_model.predict(X_test)

# --- Metrics ---
auc_score = roc_auc_score(y_test, proba)
acc = accuracy_score(y_test, y_pred)
prec = precision_score(y_test, y_pred)

print("AUC:", auc_score)
print("Accuracy:", acc)
print("Precision:", prec)

# --- Confusion Matrix ---
cm = confusion_matrix(y_test, y_pred)

plt.figure(figsize=(6,5))

disp = ConfusionMatrixDisplay(confusion_matrix=cm)
disp.plot(cmap="viridis")   # <-- 4 warna (gradasi hijau-biru)
plt.title("Confusion Matrix (4-color heatmap)")
plt.grid(False)
plt.show()

> SAVE PIPELINE

In [None]:
pipeline = {
    "encoders": encoders,
    "selected_features": top_features,
    "model": best_model
}

joblib.dump(pipeline, "assist_model_pipeline.joblib")
print("Saved → assist_model_pipeline.joblib")