# 1. Import Libraries

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

import mlflow
import mlflow.sklearn

from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import confusion_matrix, classification_report, roc_curve, auc, accuracy_score, precision_score, recall_score, f1_score
# knnimputer
from sklearn.impute import KNNImputer

import joblib

import subprocess
import time



# 2. Configure MLflow

In [9]:
print("--- Proses Dimulai ---")

# Address of the local MLflow tracking server; shared by the launch command
# and the tracking URI so the two cannot drift apart.
MLFLOW_HOST = "127.0.0.1"
MLFLOW_PORT = 8085


def start_mlflow_server(host=MLFLOW_HOST, port=MLFLOW_PORT, startup_timeout=30.0,
                        log_path="mlflow_server.log"):
    """Launch a local MLflow server and wait until its port accepts connections.

    Args:
        host: Interface passed to ``mlflow server --host``.
        port: TCP port passed to ``mlflow server --port``.
        startup_timeout: Maximum number of seconds to wait for the port to
            accept a TCP connection.
        log_path: File that receives the server's stdout and stderr (appended).

    Returns:
        The ``subprocess.Popen`` handle of the server process. The handle is
        returned even when the server could not be reached, in which case a
        warning is printed.
    """
    import socket  # only needed for the readiness probe below

    # Write server output to a file rather than to pipes nobody reads: an
    # undrained pipe can fill up and block the child process.
    with open(log_path, "ab") as log_file:
        process = subprocess.Popen(
            ["mlflow", "server", "--host", host, "--port", str(port), "--serve-artifacts"],
            stdout=log_file,
            stderr=subprocess.STDOUT,
        )

    # Poll the port instead of sleeping for a fixed time: returns as soon as
    # the server is up and tolerates slower start-ups.
    deadline = time.monotonic() + startup_timeout
    while time.monotonic() < deadline:
        try:
            with socket.create_connection((host, port), timeout=1):
                return process
        except OSError:
            if process.poll() is not None:
                # The server process already exited; waiting longer is pointless.
                break
            time.sleep(0.5)

    print(f"Warning: MLflow server at {host}:{port} is not reachable; see '{log_path}'.")
    return process


print("Starting MLFlow server...")
mlflow_process = start_mlflow_server()

mlflow.set_tracking_uri(f"http://{MLFLOW_HOST}:{MLFLOW_PORT}")
mlflow.set_experiment("Airline Passenger Satisfaction")

--- Proses Dimulai ---
Starting MLFlow server...


<Experiment: artifact_location='mlflow-artifacts:/989256094349529342', creation_time=1759855909385, experiment_id='989256094349529342', last_update_time=1759855909385, lifecycle_stage='active', name='Airline Passenger Satisfaction', tags={}>

# 3. EDA

In [None]:
def preprocess_data(df):
    """Clean a raw passenger DataFrame and one-hot encode its object columns.

    Steps:
      1. Drop the 'Unnamed: 0' and 'id' columns when present.
      2. Fill missing 'Arrival Delay in Minutes' values with the column mean.
      3. One-hot encode every object-dtype column (first level dropped).

    Args:
        df: Input DataFrame. It is never modified; a new frame is returned.

    Returns:
        A new, cleaned and encoded DataFrame.
    """
    # `drop` returns a new frame, so nothing below can touch the caller's data.
    df = df.drop(columns=['Unnamed: 0', 'id'], errors='ignore')

    delay_col = 'Arrival Delay in Minutes'
    if delay_col in df.columns:
        # Frame-level fillna replaces the chained `df[col].fillna(inplace=True)`,
        # which pandas warns will stop working in pandas 3.0.
        df = df.fillna({delay_col: df[delay_col].mean()})

    categorical_cols = df.select_dtypes(include=['object']).columns
    return pd.get_dummies(df, columns=categorical_cols, drop_first=True)

with mlflow.start_run(run_name="EDA"):
    mlflow.set_tag("developer", "Lutfi Alvaro Pratama")

    # --- 1. Load the data & initial EDA ---
    print("\n[Tahap 1] Memuat data dan logging info awal...")
    train_data_path = './data/train.csv'
    test_data_path = './data/test.csv'
    try:
        df_train = pd.read_csv(train_data_path)
        df_test = pd.read_csv(test_data_path)
    except FileNotFoundError:
        # read_csv raises FileNotFoundError (not IndexError) for a missing file.
        # Re-raise instead of calling exit(): this still stops the cell but
        # keeps the traceback and does not shut the kernel down.
        print("Error: Pastikan file 'train.csv' dan 'test.csv' ada di dalam folder './data/'.")
        raise

    # Log the file paths, not the DataFrame objects themselves.
    mlflow.log_param("train_data_path", train_data_path)
    mlflow.log_param("test_data_path", test_data_path)
    mlflow.log_metric("initial_train_rows", df_train.shape[0])
    mlflow.log_metric("initial_train_cols", df_train.shape[1])
    initial_nulls = df_train['Arrival Delay in Minutes'].isnull().sum()
    mlflow.log_metric("initial_arrival_delay_nulls", int(initial_nulls))

    # Correlation is only defined for numeric columns; restricting the frame
    # first avoids the conversion error corr() raises on string columns.
    numeric_corr = df_train.select_dtypes(include='number').corr()

    fig, ax = plt.subplots(figsize=(12, 10))
    sns.heatmap(numeric_corr, annot=True, fmt=".2f", cmap='coolwarm', ax=ax)
    ax.set_title('Correlation Matrix')
    fig.tight_layout()
    fig.savefig('correlation_matrix.png')
    mlflow.log_artifact('correlation_matrix.png')
    plt.close(fig)


[Tahap 1] Memuat data dan logging info awal...
🏃 View run EDA at: http://127.0.0.1:8085/#/experiments/989256094349529342/runs/2c76ec7a737c49308550d34e107272a2
🧪 View experiment at: http://127.0.0.1:8085/#/experiments/989256094349529342


# 4. Pre-processing

In [None]:
with mlflow.start_run(run_name="Data Preparation with KNNImputer"):
    print("\n[Tahap 2] Memulai persiapan data...")
    knn_neighbors = 5  # single source of truth for the imputer and the logged param
    mlflow.log_param("imputation_strategy", f"KNNImputer (k={knn_neighbors})")

    # --- A. Initial cleanup and feature/target separation ---
    # Rebind to new frames instead of dropping in place, so the objects created
    # by the loading cell are not mutated and this cell stays idempotent.
    id_cols = ['Unnamed: 0', 'id']
    df_train = df_train.drop(columns=id_cols, errors='ignore')
    df_test = df_test.drop(columns=id_cols, errors='ignore')

    y_train_full_labels = df_train['satisfaction']
    X_train_full_raw = df_train.drop(columns=['satisfaction'])

    y_test_labels = df_test['satisfaction']
    X_test_raw = df_test.drop(columns=['satisfaction'])

    # --- B. Identify column types ---
    numeric_cols = X_train_full_raw.select_dtypes(include=['number']).columns.tolist()
    categorical_cols = X_train_full_raw.select_dtypes(include=['object']).columns.tolist()

    # --- C. Numeric features (scaling & imputation) ---
    scaler = StandardScaler()
    imputer = KNNImputer(n_neighbors=knn_neighbors)

    # Fit the scaler and imputer on the training data ONLY to prevent leakage.
    print("Fitting scaler dan imputer pada data training...")
    X_train_numeric_scaled = scaler.fit_transform(X_train_full_raw[numeric_cols])
    X_train_numeric_imputed = imputer.fit_transform(X_train_numeric_scaled)

    # Transform the test data with the already-fitted scaler and imputer.
    print("Transforming data test...")
    X_test_numeric_scaled = scaler.transform(X_test_raw[numeric_cols])
    X_test_numeric_imputed = imputer.transform(X_test_numeric_scaled)

    # Back to DataFrames with the original column names and indexes.
    X_train_numeric_processed = pd.DataFrame(X_train_numeric_imputed, columns=numeric_cols, index=X_train_full_raw.index)
    X_test_numeric_processed = pd.DataFrame(X_test_numeric_imputed, columns=numeric_cols, index=X_test_raw.index)

    # --- D. Categorical features (one-hot encoding) ---
    print("Melakukan one-hot encoding pada fitur kategorikal...")
    # Pin every column's levels to those seen in training. Encoding train and
    # test independently lets drop_first discard a different level when a
    # category is missing from test, silently mis-encoding those rows.
    category_dtypes = {
        col: pd.CategoricalDtype(categories=sorted(X_train_full_raw[col].dropna().unique()))
        for col in categorical_cols
    }
    X_train_categorical_processed = pd.get_dummies(
        X_train_full_raw[categorical_cols].astype(category_dtypes), drop_first=True
    )
    X_test_categorical_processed = pd.get_dummies(
        X_test_raw[categorical_cols].astype(category_dtypes), drop_first=True
    )

    # --- E. Combine & align columns ---
    X_train_full_processed = pd.concat([X_train_numeric_processed, X_train_categorical_processed], axis=1)
    X_test_processed = pd.concat([X_test_numeric_processed, X_test_categorical_processed], axis=1)

    # Align test columns with train as a final consistency guard.
    train_cols = X_train_full_processed.columns
    X_test_processed = X_test_processed.reindex(columns=train_cols, fill_value=0)

    # --- F. Encode the target & persist the fitted preprocessors ---
    label_encoder = LabelEncoder()
    y_train_full = label_encoder.fit_transform(y_train_full_labels)
    y_test = label_encoder.transform(y_test_labels)
    joblib.dump(label_encoder, 'label_encoder.joblib')
    mlflow.log_artifact('label_encoder.joblib')

    # The model is trained on scaled and imputed features, so the fitted scaler
    # and imputer are needed to prepare new data for it.
    joblib.dump(scaler, 'scaler.joblib', compress=3)
    mlflow.log_artifact('scaler.joblib')
    joblib.dump(imputer, 'knn_imputer.joblib', compress=3)
    mlflow.log_artifact('knn_imputer.joblib')


[Tahap 2] Pre-processing data...


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Arrival Delay in Minutes'].fillna(mean_delay, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Arrival Delay in Minutes'].fillna(mean_delay, inplace=True)


🏃 View run Pre-processing at: http://127.0.0.1:8085/#/experiments/989256094349529342/runs/a1f8addb85fb40948b0aaf7c9b29758a
🧪 View experiment at: http://127.0.0.1:8085/#/experiments/989256094349529342


# 5. Data Split

In [None]:
with mlflow.start_run(run_name="Data Split"):
    print("\n[Tahap 3] Membagi data training untuk validasi...")

    # Hold-out configuration: validation fraction and the seed of the split.
    test_split_ratio, random_state_split = 0.2, 42

    # Record the split configuration on the run, one parameter at a time.
    for param_name, param_value in (
        ("validation_size", test_split_ratio),
        ("stratify_random_state", random_state_split),
    ):
        mlflow.log_param(param_name, param_value)

    # Stratify on the encoded target so both parts keep the class proportions.
    split_options = dict(
        test_size=test_split_ratio,
        random_state=random_state_split,
        stratify=y_train_full,
    )
    X_train, X_val, y_train, y_val = train_test_split(
        X_train_full_processed, y_train_full, **split_options
    )


[Tahap 3] Membagi data training untuk validasi...
🏃 View run Data Split at: http://127.0.0.1:8085/#/experiments/989256094349529342/runs/abde8b13f07e44bcaa9101a22fc4a8a8
🧪 View experiment at: http://127.0.0.1:8085/#/experiments/989256094349529342


# 6. Modeling, Hyperparameter Tuning, & Evaluation

In [None]:
# --- 4. Modeling & Hyperparameter Tuning ---
def evaluate_model_and_log(model, X_data, y_true, data_name, label_encoder):
    """Evaluate a fitted binary classifier and log the results to the active MLflow run.

    Prints a classification report, logs accuracy / precision / recall / F1 /
    AUC as ``{data_name}_<metric>``, and logs a confusion-matrix plot and a ROC
    curve as artifacts under ``evaluation_plots``.

    Args:
        model: Fitted estimator exposing ``predict`` and ``predict_proba``.
        X_data: Feature matrix to evaluate on.
        y_true: Encoded true labels for ``X_data``.
        data_name: Label for the data split; used in metric names, plot titles
            and file names.
        label_encoder: Fitted encoder whose ``classes_`` provide the class names.
    """
    y_pred = model.predict(X_data)
    # Column 1 holds the probability of the class encoded as 1.
    y_pred_proba = model.predict_proba(X_data)[:, 1]
    class_names = label_encoder.classes_

    print(f"\n===== Mengevaluasi {data_name} =====")
    print(classification_report(y_true, y_pred, target_names=class_names))

    mlflow.log_metric(f"{data_name}_accuracy", accuracy_score(y_true, y_pred))
    mlflow.log_metric(f"{data_name}_precision", precision_score(y_true, y_pred))
    mlflow.log_metric(f"{data_name}_recall", recall_score(y_true, y_pred))
    mlflow.log_metric(f"{data_name}_f1_score", f1_score(y_true, y_pred))

    # Confusion matrix plot.
    cm = confusion_matrix(y_true, y_pred)
    fig, ax = plt.subplots(figsize=(8, 6))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Greens',
                xticklabels=class_names, yticklabels=class_names, ax=ax)
    ax.set_title(f'Confusion Matrix - {data_name}')
    ax.set_xlabel('Predicted Label')
    ax.set_ylabel('True Label')
    cm_path = f"confusion_matrix_{data_name}.png"
    fig.savefig(cm_path)
    plt.close(fig)
    mlflow.log_artifact(cm_path, "evaluation_plots")

    # ROC curve and AUC.
    fpr, tpr, _ = roc_curve(y_true, y_pred_proba)
    roc_auc = auc(fpr, tpr)
    mlflow.log_metric(f"{data_name}_auc", roc_auc)
    fig, ax = plt.subplots(figsize=(8, 6))
    ax.plot(fpr, tpr, color='blue', lw=2, label=f'ROC curve (AUC = {roc_auc:.3f})')
    ax.plot([0, 1], [0, 1], color='red', lw=2, linestyle='--')  # chance diagonal
    ax.set_title(f'Receiver Operating Characteristic (ROC) - {data_name}')
    ax.set_xlabel('False Positive Rate')
    ax.set_ylabel('True Positive Rate')
    ax.legend(loc="lower right")
    ax.grid(alpha=0.3)
    roc_path = f"roc_curve_{data_name}.png"
    fig.savefig(roc_path)
    plt.close(fig)
    mlflow.log_artifact(roc_path, "evaluation_plots")


with mlflow.start_run(run_name="Modeling & Hyperparameter Tuning"):
    # Stage labels continue after "[Tahap 3]" of the Data Split cell.
    print("\n[Tahap 4] Memulai Hyperparameter Tuning...")
    param_grid = {
        'n_estimators': [200, 300],
        'max_depth': [8, 10, 12, 15],
        'min_samples_split': [20, 40],
        'min_samples_leaf': [10, 20],
        'max_features': ['sqrt', 'log2']
    }
    mlflow.log_param("param_grid", str(param_grid))

    # class_weight='balanced' reweights classes inversely to their frequency.
    rf = RandomForestClassifier(random_state=42, n_jobs=-1, class_weight='balanced')
    grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=5, verbose=2, scoring='accuracy')
    grid_search.fit(X_train, y_train)
    best_model = grid_search.best_estimator_
    mlflow.log_params(grid_search.best_params_)
    # Record the cross-validated score that selected the best parameters.
    mlflow.log_metric("best_cv_accuracy", grid_search.best_score_)

    # --- 5. Evaluate the model ---
    print("\n[Tahap 5] Mengevaluasi model terbaik...")
    evaluate_model_and_log(best_model, X_train, y_train, "Training", label_encoder)
    evaluate_model_and_log(best_model, X_val, y_val, "Validation", label_encoder)
    evaluate_model_and_log(best_model, X_test_processed, y_test, "Test", label_encoder)

    # --- 6. Save the model to MLflow ---
    print("\n[Tahap 6] Menyimpan model ke MLflow Server...")
    mlflow.sklearn.log_model(best_model, "random_forest_model")

print("\n--- MLflow Run Selesai ---")


[Tahap 4] Memulai Hyperparameter Tuning...
Fitting 3 folds for each of 32 candidates, totalling 96 fits

[Tahap 5] Mengevaluasi model terbaik...

===== Mengevaluasi Training =====
                         precision    recall  f1-score   support

neutral or dissatisfied       0.97      0.99      0.98     47103
              satisfied       0.99      0.95      0.97     36020

               accuracy                           0.97     83123
              macro avg       0.98      0.97      0.97     83123
           weighted avg       0.97      0.97      0.97     83123


===== Mengevaluasi Validation =====
                         precision    recall  f1-score   support

neutral or dissatisfied       0.96      0.97      0.96     11776
              satisfied       0.96      0.94      0.95      9005

               accuracy                           0.96     20781
              macro avg       0.96      0.96      0.96     20781
           weighted avg       0.96      0.96      0.96     207




[Tahap 6] Menyimpan model ke MLflow Server...




🏃 View run Modeling & Hyperparameter Tuning at: http://127.0.0.1:8085/#/experiments/989256094349529342/runs/ff8dd2d67d664dc993847728955e4385
🧪 View experiment at: http://127.0.0.1:8085/#/experiments/989256094349529342

--- MLflow Run Selesai ---


# 7. Export Model

In [14]:
# --- 7. Export the final model ---
# Write the tuned estimator to a standalone, compressed joblib file in
# addition to the copy logged to MLflow.
model_filename = "best_airline_satisfaction_model.joblib"
joblib.dump(value=best_model, filename=model_filename, compress=3)
print(
    f"\nModel final juga diekspor secara terpisah ke: '{model_filename}'",
    "\n--- Proses Selesai ---",
    sep="\n",
)


Model final juga diekspor secara terpisah ke: 'best_airline_satisfaction_model.joblib'

--- Proses Selesai ---


# 8. Stop MLflow Server

In [None]:
print("Menghentikan server MLflow...")
# Ask the server to shut down, then reap it. communicate() is used instead of
# wait() because it drains any stdout/stderr pipes while waiting; wait() can
# deadlock on a child started with PIPE. The timeout plus kill() fallback
# keeps this cell from hanging if the server does not exit in time.
mlflow_process.terminate()
try:
    mlflow_process.communicate(timeout=15)
except subprocess.TimeoutExpired:
    mlflow_process.kill()
    mlflow_process.communicate()