# Step 2 - Experiment tracking with MLflow

This notebook trains a random forest classifier on the prepared data sample and records its hyperparameters, accuracy and fitted model with MLflow.


### 1. Load libraries

Import MLflow, pandas and the scikit-learn components used below (random forest, train/test split, accuracy metric and imputer).


In [None]:
# Import libraries
import mlflow
import mlflow.sklearn
import pandas as pd

from mlflow.models.signature import infer_signature
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score 
from sklearn.impute import SimpleImputer

### 2. Load prepared data

Read the cleaned training sample from `../data/output/train_clean_sample.csv` into a DataFrame.


In [None]:
# 2. Load the prepared training sample.
# low_memory=False lets pandas infer each column's dtype from the whole file
# instead of chunk by chunk.
df = pd.read_csv(
    filepath_or_buffer='../data/output/train_clean_sample.csv',
    low_memory=False,
)

### 3. One-hot encoding

Convert every `object`-typed column into indicator (dummy) columns, dropping the first level of each encoded column.


In [None]:
# Columns held with the generic `object` dtype are the ones encoded below.
object_cols = df.select_dtypes(include=['object']).columns
print(f"Categorical columns to encode : {list(object_cols)}")

# Swap each of those columns for indicator columns; drop_first=True leaves out
# the first level of every encoded column. Nothing to do when none were found.
if not object_cols.empty:
    df = pd.get_dummies(data=df, columns=object_cols, drop_first=True)

### 4. Split features and target

Separate the `TARGET` column from the features, then fill missing feature values with the mean of their column.


In [None]:
# Target vector first, then every remaining column as the feature matrix.
y = df['TARGET']
X = df.drop(columns=['TARGET'])

# Replace missing values with the per-column mean. fit_transform returns a
# plain array, so the frame is rebuilt with the original column names (the row
# index becomes a fresh 0..n-1 range).
# NOTE(review): the means are computed over every row of X, so any later
# train/test split of X shares imputation statistics between the two folds.
imputer = SimpleImputer(strategy='mean')
X = pd.DataFrame(data=imputer.fit_transform(X), columns=X.columns)

### 5. Train/test split

Hold out 20% of the rows as a test set, stratified on the target and using a fixed random seed so the split is reproducible.


In [None]:
# Split the *raw* feature columns of df rather than an already-imputed
# matrix: the imputation means must be learned from the training rows only,
# otherwise statistics of the held-out rows leak into the training features.
# Stratifying on y keeps the class proportions equal in both folds.
X_train, X_test, y_train, y_test = train_test_split(
    df.drop(columns=['TARGET']), y, stratify=y, test_size=0.2, random_state=42
)

# Fit the mean imputer on the training fold, then apply those same means to
# the test fold. The frames are rebuilt with their column names and row labels
# so they stay aligned with y_train / y_test.
imputer = SimpleImputer(strategy='mean')
X_train = pd.DataFrame(
    imputer.fit_transform(X_train),
    columns=X_train.columns,
    index=X_train.index,
)
X_test = pd.DataFrame(
    imputer.transform(X_test),
    columns=X_test.columns,
    index=X_test.index,
)

### 6. Initialize MLflow

Point MLflow at the local `../mlruns` store, select the `random_forest_test` experiment and set descriptive tags (project, stage, author).


In [None]:
# Tracking with MLflow: keep runs in the local ../mlruns directory and select
# the experiment (created if it does not exist yet) that runs are recorded in.
mlflow.set_tracking_uri("file:../mlruns")
mlflow.set_experiment("random_forest_test")

# Attach the descriptive tags to the experiment itself. mlflow.set_tags()
# targets the *current run* and implicitly starts one when none is active;
# called here, outside any run, it would leave the tags on a stray, otherwise
# empty run rather than on anything that holds results.
mlflow.set_experiment_tags({
    "project": "Credit Scoring Project 6",
    "stage": "experiment_1",
    "author": "David Worsley-Tonks"
})

### 7. Define hyperparameters

Hyperparameters for the random forest: 100 trees with a maximum depth of 5.


In [None]:
# Hyperparameters for the random forest: number of trees and depth limit.
params = dict(n_estimators=100, max_depth=5)

### 8. Train and track the model with MLflow

Fit the model inside an MLflow run and log its hyperparameters, its test accuracy and the model itself together with a signature and an input example.


In [None]:
# Close any run that is still open (e.g. from an interrupted execution);
# start_run() below would otherwise refuse to open a new top-level run.
if mlflow.active_run() is not None:
    mlflow.end_run()

# Binding the run object avoids repeated active_run() lookups further down.
with mlflow.start_run() as run:
    # Fixed seed so refitting with the same data gives the same forest.
    model = RandomForestClassifier(**params, random_state=42)
    model.fit(X_train, y_train)

    # Evaluate on the held-out split.
    y_pred = model.predict(X_test)
    acc = accuracy_score(y_test, y_pred)

    # Log all hyperparameters in one batch call
    mlflow.log_params(params)

    # Log metric
    mlflow.log_metric("accuracy", acc)

    # Signature and input example for the model. The signature is inferred
    # from the test inputs together with the predictions made on them, so the
    # input/output pair is consistent (X_test has the same columns as X_train).
    signature = infer_signature(X_test, y_pred)
    input_example = X_train.iloc[:5]

    mlflow.sklearn.log_model(
        model,
        name="random_forest_model",
        signature=signature,
        input_example=input_example,
    )

    print("Experiment ID:", run.info.experiment_id)
    print("Run ID:", run.info.run_id)

### 9. Explore results in the MLflow UI

The logged run can now be browsed in the MLflow UI. The cell below additionally saves the train/test splits as Parquet files for modeling.


In [None]:
# Save datasets for modeling: each split goes to its own Parquet file.
# The targets are Series, so they are wrapped in one-column frames first.
for split_frame, out_path in (
    (X_train, "../data/output/X_train.parquet"),
    (X_test, "../data/output/X_test.parquet"),
    (y_train.to_frame(), "../data/output/y_train.parquet"),
    (y_test.to_frame(), "../data/output/y_test.parquet"),
):
    split_frame.to_parquet(out_path)

### Conclusion

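A random forest was trained and logged to MLflow together with its hyperparameters, test accuracy and model signature, and the train/test splits were saved as Parquet files for modeling.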
