# 1. Import Libraries

In [1]:
import atexit
import socket
import subprocess
import time

import joblib
import matplotlib.pyplot as plt
import mlflow
import mlflow.sklearn
import pandas as pd
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, classification_report, roc_curve, auc, accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder

# 2. Configure MLflow

In [2]:
print("--- Proses Dimulai ---")

# Single source of truth for the port used by both the server and the client.
MLFLOW_PORT = 8085


def start_mlflow_server(host="0.0.0.0", port=MLFLOW_PORT, startup_timeout=30.0,
                        log_path="mlflow_server.log"):
    """Launch ``mlflow server`` as a child process and wait until its port answers.

    Args:
        host: Address passed to ``--host``.
            NOTE(review): "0.0.0.0" binds every network interface; pass
            "127.0.0.1" if only local access is needed.
        port: TCP port passed to ``--port``.
        startup_timeout: Maximum number of seconds to wait for the port to
            accept a TCP connection.
        log_path: File that receives the server's combined stdout/stderr
            (opened in append mode).

    Returns:
        The ``subprocess.Popen`` handle of the spawned server. If another
        process is already serving the port, the function returns as soon as
        the port answers; the freshly spawned process is then expected to exit
        on its own, with its error written to ``log_path``.

    Raises:
        RuntimeError: The process exited and nothing is listening on the port.
        TimeoutError: The port did not open within ``startup_timeout`` seconds
            (the child process is terminated before raising).
    """
    # The output goes to a file instead of subprocess.PIPE: nothing ever reads
    # the pipes, so a chatty server would eventually fill the OS pipe buffer
    # and block on write.
    with open(log_path, "a", encoding="utf-8") as log_file:
        process = subprocess.Popen(
            ["mlflow", "server", "--host", host, "--port", str(port), "--serve-artifacts"],
            stdout=log_file,
            stderr=subprocess.STDOUT,
        )

    # A wildcard bind address cannot be dialled, so probe through loopback.
    probe_host = "127.0.0.1" if host in ("0.0.0.0", "") else host

    def port_is_open():
        """Return True when a TCP connection to the server port succeeds."""
        try:
            with socket.create_connection((probe_host, port), timeout=1.0):
                return True
        except OSError:
            return False

    # Poll for readiness instead of sleeping a fixed 5 seconds.
    deadline = time.monotonic() + startup_timeout
    while time.monotonic() < deadline:
        if port_is_open():
            return process
        if process.poll() is not None:
            raise RuntimeError(
                f"MLflow server exited with code {process.returncode} before "
                f"accepting connections; see '{log_path}' for details."
            )
        time.sleep(0.5)

    process.terminate()
    raise TimeoutError(
        f"MLflow server did not open port {port} within {startup_timeout} seconds; "
        f"see '{log_path}' for details."
    )


print("Starting MLFlow server...")
mlflow_process = start_mlflow_server()
# Stop the server when the kernel shuts down so the port is not left occupied
# by an orphaned process on the next run.
atexit.register(mlflow_process.terminate)

mlflow.set_tracking_uri(f"http://127.0.0.1:{MLFLOW_PORT}")
mlflow.set_experiment("Airline Passenger Satisfaction")

--- Proses Dimulai ---
Starting MLFlow server...


2025/10/07 14:28:04 INFO mlflow.tracking.fluent: Experiment with name 'Airline Passenger Satisfaction' does not exist. Creating a new experiment.


<Experiment: artifact_location='mlflow-artifacts:/1', creation_time=1759822084224, experiment_id='1', last_update_time=1759822084224, lifecycle_stage='active', name='Airline Passenger Satisfaction', tags={}>

# 3. EDA

In [3]:
def preprocess_data(df, arrival_delay_fill=None):
    """Clean a raw feature frame and one-hot encode its categorical columns.

    Steps:
      1. Drop the 'Unnamed: 0' and 'id' columns when present.
      2. Fill missing 'Arrival Delay in Minutes' values, when that column exists.
      3. One-hot encode every object-dtype column with ``drop_first=True``.

    Args:
        df: Input DataFrame. It is never modified; a new frame is returned.
        arrival_delay_fill: Value used for missing arrival delays. When None
            (the default), the mean of this frame's own column is used. Pass
            the training mean here to preprocess other splits with the
            training statistic.

    Returns:
        A new DataFrame with the cleaned and encoded features.
    """
    delay_col = 'Arrival Delay in Minutes'

    # drop() returns a new frame, so the caller's data is left untouched.
    cleaned = df.drop(columns=['Unnamed: 0', 'id'], errors='ignore')

    if delay_col in cleaned.columns:
        if arrival_delay_fill is None:
            fill_value = cleaned[delay_col].mean()
        else:
            fill_value = arrival_delay_fill
        # Assign the filled column back explicitly. The previous
        # `df[col].fillna(..., inplace=True)` was a chained in-place update,
        # which pandas deprecates and which has no effect under Copy-on-Write.
        cleaned = cleaned.assign(**{delay_col: cleaned[delay_col].fillna(fill_value)})

    categorical_cols = cleaned.select_dtypes(include=['object']).columns
    return pd.get_dummies(cleaned, columns=categorical_cols, drop_first=True)

# Dataset locations, defined once so the loader and the logged params agree.
TRAIN_DATA_PATH = './data/train.csv'
TEST_DATA_PATH = './data/test.csv'

with mlflow.start_run(run_name="EDA"):
    mlflow.set_tag("developer", "Lutfi Alvaro Pratama")

    # --- 1. Load the data & log initial EDA figures ---
    print("\n[Tahap 1] Memuat data dan logging info awal...")
    try:
        df_train = pd.read_csv(TRAIN_DATA_PATH)
        df_test = pd.read_csv(TEST_DATA_PATH)
    except FileNotFoundError:
        # pd.read_csv raises FileNotFoundError for a missing path, so that is
        # the exception to catch. Re-raise rather than calling exit(), so the
        # cell fails visibly and the kernel stays alive.
        print("Error: Pastikan file 'train.csv' dan 'test.csv' ada di dalam folder './data/'.")
        raise

    mlflow.log_param("train_data_path", TRAIN_DATA_PATH)
    mlflow.log_param("test_data_path", TEST_DATA_PATH)

    # Shape of the raw training frame before any cleaning.
    mlflow.log_metric("initial_train_rows", df_train.shape[0])
    mlflow.log_metric("initial_train_cols", df_train.shape[1])

    # Count of missing arrival-delay values in the raw training frame.
    initial_nulls = int(df_train['Arrival Delay in Minutes'].isnull().sum())
    mlflow.log_metric("initial_arrival_delay_nulls", initial_nulls)



[Tahap 1] Memuat data dan logging info awal...


# 4. Pre-processing

In [4]:
with mlflow.start_run(run_name="Pre-processing"):
    # --- 2. Pre-processing & Feature Engineering ---
    print("\n[Tahap 2] Pre-processing data...")

    # Object-dtype feature columns (the target is excluded), logged for reference.
    categorical_cols_to_encode = df_train.select_dtypes(include=['object']).columns.drop('satisfaction').tolist()

    mlflow.log_param("categorical_features_encoded", str(categorical_cols_to_encode))
    mlflow.log_param("imputation_strategy_arrival_delay", "mean")

    # Separate the target from the features for both datasets.
    y_train_full_labels = df_train['satisfaction']
    X_train_full_raw = df_train.drop(columns=['satisfaction'])

    y_test_labels = df_test['satisfaction']
    X_test_raw = df_test.drop(columns=['satisfaction'])

    # Fill the test set's missing arrival delays with the TRAINING mean before
    # calling preprocess_data. Otherwise preprocess_data would fill them with
    # the test set's own mean, so train and test would be imputed with
    # different statistics.
    delay_col = 'Arrival Delay in Minutes'
    if delay_col in X_train_full_raw.columns and delay_col in X_test_raw.columns:
        train_delay_mean = X_train_full_raw[delay_col].mean()
        X_test_raw = X_test_raw.assign(
            **{delay_col: X_test_raw[delay_col].fillna(train_delay_mean)}
        )

    X_train_full_processed = preprocess_data(X_train_full_raw)
    X_test_processed = preprocess_data(X_test_raw)

    # Fit the label encoder on the training labels only, then reuse it for test.
    label_encoder = LabelEncoder()
    y_train_full = label_encoder.fit_transform(y_train_full_labels)
    y_test = label_encoder.transform(y_test_labels)

    mlflow.log_metric("processed_train_cols", X_train_full_processed.shape[1])



[Tahap 2] Pre-processing data...


# 5. Data Split

In [5]:
with mlflow.start_run(run_name="Data Split"):
    print("\n[Tahap 3] Membagi data training untuk validasi...")

    # Give the test matrix exactly the training columns; columns missing from
    # the test set are created and filled with 0.
    train_cols = X_train_full_processed.columns
    X_test_processed = X_test_processed.reindex(columns=train_cols, fill_value=0)

    # Split settings, recorded on the run.
    test_split_ratio, random_state_split = 0.2, 42
    for param_name, param_value in (
        ("validation_size", test_split_ratio),
        ("stratify_random_state", random_state_split),
    ):
        mlflow.log_param(param_name, param_value)

    # Stratified hold-out split of the training data into train/validation.
    split_parts = train_test_split(
        X_train_full_processed,
        y_train_full,
        test_size=test_split_ratio,
        random_state=random_state_split,
        stratify=y_train_full,
    )
    X_train, X_val, y_train, y_val = split_parts


[Tahap 3] Membagi data training untuk validasi...


# 6. Modeling, Hyperparameter Tuning, & Evaluation

In [6]:
# --- 4. Modeling & Hyperparameter Tuning ---
with mlflow.start_run(run_name="Modeling & Hyperparameter Tuning"):
    print("\n[Tahap 4] Memulai Hyperparameter Tuning...")

    # Search space: two values for each of four hyperparameters.
    param_grid = dict(
        n_estimators=[100, 200],
        max_depth=[10, 30],
        min_samples_leaf=[2, 4],
        min_samples_split=[5, 10],
    )
    mlflow.log_param("param_grid", str(param_grid))

    # Exhaustive grid search with 3-fold cross-validation, scored by accuracy.
    rf = RandomForestClassifier(n_jobs=-1, random_state=42)
    grid_search = GridSearchCV(
        rf,
        param_grid,
        scoring='accuracy',
        cv=3,
        verbose=1,
    )
    grid_search.fit(X_train, y_train)

    # Keep the winning estimator and record its hyperparameters on the run.
    best_model = grid_search.best_estimator_
    mlflow.log_params(grid_search.best_params_)
    
    # --- 5. Evaluasi Model ---
    def evaluate_model_and_log(model, X_data, y_true, data_name, label_encoder):
        """Evaluate ``model`` on one dataset and log the results to MLflow.

        Prints a classification report, logs accuracy, precision, recall, F1
        and AUC as metrics prefixed with ``data_name``, and logs a confusion
        matrix plot and a ROC curve plot as artifacts under "evaluation_plots".
        Both plots are first saved as PNG files in the working directory.

        Args:
            model: Fitted estimator exposing ``predict`` and ``predict_proba``.
            X_data: Feature matrix to evaluate on.
            y_true: True encoded labels for ``X_data``.
            data_name: Label used in metric names, plot titles and file names.
            label_encoder: Fitted encoder whose ``classes_`` supply the display names.
        """
        predictions = model.predict(X_data)
        # Scores for the ROC curve: the probability column at index 1.
        scores = model.predict_proba(X_data)[:, 1]
        labels = label_encoder.classes_

        print(f"\n===== Mengevaluasi {data_name} =====")
        print(classification_report(y_true, predictions, target_names=labels))

        # Scalar metrics, logged in a fixed order.
        scorers = (
            ("accuracy", accuracy_score),
            ("precision", precision_score),
            ("recall", recall_score),
            ("f1_score", f1_score),
        )
        for suffix, scorer in scorers:
            mlflow.log_metric(f"{data_name}_{suffix}", scorer(y_true, predictions))

        # Confusion matrix heatmap.
        cm_path = f"confusion_matrix_{data_name}.png"
        cm_fig, cm_ax = plt.subplots(figsize=(8, 6))
        sns.heatmap(
            confusion_matrix(y_true, predictions),
            annot=True,
            fmt='d',
            cmap='Greens',
            xticklabels=labels,
            yticklabels=labels,
            ax=cm_ax,
        )
        cm_ax.set_title(f'Confusion Matrix - {data_name}')
        cm_ax.set_xlabel('Predicted Label')
        cm_ax.set_ylabel('True Label')
        cm_fig.savefig(cm_path)
        plt.close(cm_fig)
        mlflow.log_artifact(cm_path, "evaluation_plots")

        # ROC curve and the area under it.
        fpr, tpr, _ = roc_curve(y_true, scores)
        roc_auc = auc(fpr, tpr)
        mlflow.log_metric(f"{data_name}_auc", roc_auc)

        roc_path = f"roc_curve_{data_name}.png"
        roc_fig, roc_ax = plt.subplots(figsize=(8, 6))
        roc_ax.plot(fpr, tpr, color='blue', lw=2, label=f'ROC curve (AUC = {roc_auc:.3f})')
        # Diagonal reference line.
        roc_ax.plot([0, 1], [0, 1], color='red', lw=2, linestyle='--')
        roc_ax.set_title(f'Receiver Operating Characteristic (ROC) - {data_name}')
        roc_ax.set_xlabel('False Positive Rate')
        roc_ax.set_ylabel('True Positive Rate')
        roc_ax.legend(loc="lower right")
        roc_ax.grid(alpha=0.3)
        roc_fig.savefig(roc_path)
        plt.close(roc_fig)
        mlflow.log_artifact(roc_path, "evaluation_plots")
    
    print("\n[Tahap 5] Mengevaluasi model terbaik...")
    evaluate_model_and_log(best_model, X_train, y_train, "Training", label_encoder)
    evaluate_model_and_log(best_model, X_val, y_val, "Validation", label_encoder)
    evaluate_model_and_log(best_model, X_test_processed, y_test, "Test", label_encoder)

    # --- 6. Menyimpan Model ke MLflow ---
    print("\n[Tahap 6] Menyimpan model ke MLflow Server...")
    mlflow.sklearn.log_model(best_model, "random_forest_model")

print("\n--- MLflow Run Selesai ---")


[Tahap 4] Memulai Hyperparameter Tuning...
Fitting 3 folds for each of 16 candidates, totalling 48 fits

[Tahap 5] Mengevaluasi model terbaik...

===== Mengevaluasi Training =====
                         precision    recall  f1-score   support

neutral or dissatisfied       0.99      1.00      0.99     47103
              satisfied       1.00      0.98      0.99     36020

               accuracy                           0.99     83123
              macro avg       0.99      0.99      0.99     83123
           weighted avg       0.99      0.99      0.99     83123


===== Mengevaluasi Validation =====
                         precision    recall  f1-score   support

neutral or dissatisfied       0.96      0.97      0.97     11776
              satisfied       0.97      0.94      0.95      9005

               accuracy                           0.96     20781
              macro avg       0.96      0.96      0.96     20781
           weighted avg       0.96      0.96      0.96     20781

# 7. Export Model

In [7]:
# --- 7. Final model export ---
# Also write the tuned model to a standalone joblib file, separate from the
# copy logged to MLflow.
model_filename = "best_airline_satisfaction_model.joblib"
joblib.dump(best_model, filename=model_filename)

for message in (
    f"\nModel final juga diekspor secara terpisah ke: '{model_filename}'",
    "\n--- Proses Selesai ---",
):
    print(message)


Model final juga diekspor secara terpisah ke: 'best_airline_satisfaction_model.joblib'

--- Proses Selesai ---
