In [None]:
import argparse
import pandas as pd
import json
import os
import joblib
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
from features import prepare_X_y
from preprocess import prepare_dataset

import mlflow
import mlflow.sklearn


def run_train(users_csv, flights_csv, hotels_csv=None):
    """Train the price-regression pipeline and record the run in MLflow.

    The function builds a dataset from the given CSV paths, fits a
    ``Pipeline`` made of the preprocessor returned by ``prepare_X_y`` and a
    ``RandomForestRegressor``, and then:

    * writes the feature metadata to ``src/columns.json``,
    * dumps the fitted pipeline to ``model/voyage_model/1/model.pkl``,
    * logs parameters, the model (registered as ``VoyagePricePredictor``)
      and an R² metric to the "Voyage Analytics Model" MLflow experiment.

    Args:
        users_csv: Path to the users CSV, forwarded to ``prepare_dataset``.
        flights_csv: Path to the flights CSV, forwarded to ``prepare_dataset``.
        hotels_csv: Optional path to the hotels CSV; ``None`` is forwarded
            unchanged to ``prepare_dataset``.

    Returns:
        None. All results are written to disk and to the MLflow server.
    """
    print("INFO: [MLflow] Starting model training...")

    # --- Connect to your MLflow tracking server ---
    # Honour the standard MLFLOW_TRACKING_URI variable so the script can be
    # pointed at another server without a code change; an unset or empty
    # value falls back to the local development server.
    tracking_uri = os.environ.get("MLFLOW_TRACKING_URI") or "http://localhost:5000"
    mlflow.set_tracking_uri(tracking_uri)
    mlflow.set_experiment("Voyage Analytics Model")

    # --- Start MLflow run ---
    with mlflow.start_run(run_name="RandomForest_Training"):

        print("INFO: Merging datasets...")
        df = prepare_dataset(users_csv, flights_csv, hotels_csv)

        print("INFO: Preparing features and target...")
        X, y, preprocessor, num_cols, cat_cols = prepare_X_y(df, target="price")

        print(f"INFO: Using {len(num_cols)} numeric and {len(cat_cols)} categorical columns.")
        print(f"INFO: Training pipeline on {X.shape[0]} rows, {X.shape[1]} features")

        # --- Model setup ---
        params = {
            "n_estimators": 200,
            "max_depth": 10,
            "random_state": 42
        }

        # Preprocessing and regressor are bundled so the saved artifact
        # applies the same transformations at inference time.
        pipeline = Pipeline([
            ("preprocessor", preprocessor),
            ("regressor", RandomForestRegressor(**params))
        ])

        # --- Log model parameters ---
        mlflow.log_params(params)

        # --- Train model ---
        pipeline.fit(X, y)

        # --- Save columns.json ---
        os.makedirs("src", exist_ok=True)
        # list() keeps json.dump working if prepare_X_y hands back a pandas
        # Index or NumPy array instead of a plain list.
        columns_info = {
            "num_cols": list(num_cols),
            "cat_cols": list(cat_cols),
            "target": "price",
        }
        with open("src/columns.json", "w", encoding="utf-8") as f:
            json.dump(columns_info, f, indent=2)
        print("INFO: columns.json saved in src/")

        # --- Save model locally ---
        model_dir = "model/voyage_model/1"
        os.makedirs(model_dir, exist_ok=True)
        model_path = os.path.join(model_dir, "model.pkl")
        joblib.dump(pipeline, model_path)
        print(f"INFO: Model saved at {model_path}")

        # --- Log model to MLflow ---
        mlflow.sklearn.log_model(
            sk_model=pipeline,
            artifact_path="model",
            registered_model_name="VoyagePricePredictor"
        )

        print("INFO: Model logged to MLflow successfully.")

        # --- Example metric (just for demo) ---
        # NOTE: scored on the same X, y the pipeline was fitted on, so this
        # is a training-set R² and not an estimate of generalisation.
        r2 = pipeline.score(X, y)
        mlflow.log_metric("r2_score", r2)
        print(f"INFO: Logged metric r2_score = {r2:.4f}")

    print("✅ Training complete and tracked with MLflow!")


if __name__ == "__main__":
    # Command-line entry point: two mandatory CSV paths and one optional
    # one, handed straight to run_train.
    cli = argparse.ArgumentParser()
    for flag, mandatory in (
        ("--users", True),
        ("--flights", True),
        ("--hotels", False),
    ):
        cli.add_argument(flag, required=mandatory)
    opts = cli.parse_args()
    run_train(
        users_csv=opts.users,
        flights_csv=opts.flights,
        hotels_csv=opts.hotels,
    )


Training Script (train_regression.py)

Argument Parsing
Takes file paths for the users and flights datasets (required) and the hotels dataset (optional) from the command line, making the script reusable and automation-friendly.

MLflow Setup
Connects to an MLflow tracking server, sets an experiment, and starts a run to track parameters, metrics, and models.

Dataset Preparation
Merges users, flights, and hotels data into a single dataset and prepares features (X) and target (y) using preprocessing utilities.

Feature Engineering
Separates numerical and categorical columns and applies appropriate preprocessing through a Scikit-learn preprocessor.

Model Pipeline
Uses a Scikit-learn Pipeline combining preprocessing and a RandomForestRegressor, ensuring consistent transformations during training and inference.

Model Training & Logging
Trains the model, logs hyperparameters and the R² score (computed on the training data) to MLflow, and registers the model in the MLflow Model Registry.

Model Persistence
Saves the trained pipeline using Joblib and stores feature metadata in columns.json to maintain schema consistency during predictions.

1. Environment Setup

A Python virtual environment is activated to ensure dependency isolation and reproducibility. All required libraries are installed using a requirements.txt file, which includes Flask for API development, Pandas for data handling, Scikit-learn for machine learning, MLflow for experiment tracking, and Joblib for model serialization.

This setup ensures that the project can be consistently executed across different systems without dependency conflicts.

2. Model Training Pipeline

The regression model is trained using a modular training script. The training process begins by merging the users, flights, and hotels datasets into a unified dataset. This consolidated view enables richer feature representation and improved prediction accuracy.

Feature preparation is handled through a dedicated preprocessing pipeline that separates numerical and categorical variables. Categorical features are encoded, numerical features are scaled as required, and the target variable is defined as flight price.

A Random Forest Regressor is selected for its ability to handle non-linear relationships and mixed feature types. The model is trained using a Scikit-learn pipeline that combines preprocessing and model training into a single, reproducible workflow.

3. Experiment Tracking with MLflow

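MLflow is integrated into the training pipeline to track model parameters, metrics, and artifacts. During training, hyperparameters such as the number of trees and maximum depth are logged, along with performance metrics like the R² score. Note that this R² is currently measured on the training data itself, so it indicates goodness of fit rather than performance on unseen data.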

The trained model is registered in the MLflow Model Registry, enabling version control, comparison across experiments, and smooth promotion to production stages.

4. Model Serialization and Metadata Storage

After training, the complete pipeline is saved locally using Joblib. Along with the model, a columns.json file is generated to store metadata about the features used during training. This ensures that incoming API requests align exactly with the model’s expected input structure.

This design prevents schema mismatches during inference and improves model reliability in production.