# Flight Price Prediction
#### Tuning with CV and logging with MLFlow 

In [5]:
import os
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

from sklearn.model_selection import train_test_split, KFold, cross_val_score, GridSearchCV, RandomizedSearchCV

from sklearn.metrics import mean_absolute_error, r2_score, mean_squared_error



from sklearn.tree import DecisionTreeRegressor, plot_tree
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.ensemble import GradientBoostingRegressor
from xgboost import XGBRegressor
from sklearn.neural_network import MLPRegressor

import mlflow
import mlflow.sklearn
import mlflow.xgboost


# Data Import

In [None]:


from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline

class Dropper(BaseEstimator, TransformerMixin):
    def __init__(self, feature_names):
        self.feature_names = feature_names

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        return X.drop(columns=self.feature_names)

class Normalizer(BaseEstimator, TransformerMixin):
    def __init__(self):
        self.scaler = MinMaxScaler()

    def fit(self, X, y=None):
        self.numericalFeatures = X.select_dtypes(exclude='object').columns
        self.otherFeatures = X.columns.difference(self.numericalFeatures)
        self.scaler.fit(X[self.numericalFeatures])
        return self

    def transform(self, X):
        df_normalized = pd.DataFrame(self.scaler.transform(X[self.numericalFeatures]), columns=self.numericalFeatures)
        return pd.concat([df_normalized, X[self.otherFeatures]], axis=1)

class CategoricalEncoder(BaseEstimator, TransformerMixin):
    def __init__(self):
        self.ohEncoder = OneHotEncoder()

    def fit(self, X, y=None):
        self.categoricalFeatures = X.select_dtypes(include='object').columns
        self.otherFeatures = X.columns.difference(self.categoricalFeatures)
        self.ohEncoder.fit(X[self.categoricalFeatures])
        return self

    def transform(self, X):
        df_encoded = pd.DataFrame(self.ohEncoder.transform(X[self.categoricalFeatures]).toarray(), columns=self.ohEncoder.get_feature_names_out())
        return pd.concat([df_encoded, X[self.otherFeatures]], axis=1)


pipe = Pipeline([
    ('flightDrop', Dropper(['flight'])),
    # ('targetDrop', Dropper(['target'])),
    ('encoder', CategoricalEncoder()),
    ('scaler', Normalizer())
])


X = pd.read_csv(os.path.join('../archive', 'Clean_Dataset.csv'), index_col=0)
X = pipe.fit_transform(X)
y = X['price'].to_numpy().reshape(-1, 1)
X.drop(columns=['price'], axis=1, inplace=True)


X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=10, test_size=0.3)



In [None]:
models = [

    ( 
        "Decision Tree",
        DecisionTreeRegressor(),
        {
            'max_depth': [3, 4, 5, 8, 12, 25, 40],
            'min_samples_split': [2, 4, 6, 8],
            'min_samples_leaf': [1, 2, 4]
        },
    ),
    (
        "Random Forest",
        RandomForestRegressor(),
        {
            'n_estimators': [100, 200, 300, 500],
            'max_depth': [5, 10, 20],
            'min_samples_split': [2, 5, 10],
            'min_samples_leaf': [1, 2, 4],
            'bootstrap': [True, False]
        }
    ),
    (
        "Gradient Boosting",
        GradientBoostingRegressor(),
        {
            'n_estimators': [100, 200, 300, 500],
            'learning_rate': [0.1, 0.05, 0.01],
            'max_depth': [3, 4, 5, 6],
            'min_samples_split': [2, 4, 6],
            'min_samples_leaf': [1, 2, 4]
            #"loss": "squared_error",
        }
    ),
    (
        "XBGradient Boost",
        XGBRegressor(),
        {
            'n_estimators': [100, 200, 300, 400, 500],
            'learning_rate': [0.1, 0.05, 0.01, 0.001],

            # Overfitting 
            'min_child_weight' : [3, 10, 50, 100, 200],
            'max_depth': [6, 8, 16],
            'gamma': [0, 0.1, 0.5],

            'subsample': [0.5, 0.8, 1],
            'colsample_bytree': [0.5, 0.8, 1],
            # 'eta' : [0.01, 0.05, 0.1 , 0.2]
            # 'num_round' : [ ] 

            # 'reg_alpha': [0, 0.1, 0.5], 
            # 'reg_lambda': [0, 0.1, 0.5]
        }
    ),
    (
        "Neural Network (MLP Regressor)",
        MLPRegressor(max_iter=1000),
        {
            'hidden_layer_sizes': [(50,50), (100,), (100, 50)],
            'activation': ['relu', 'tanh'],
            'solver': ['adam', 'sgd'],
            'learning_rate': ['constant', 'adaptive']
        }
    )
]

Using RMSE because it penalizes large errors and is  easy to interpret (same scale as prices).

In pricing predictions, RMSE is commonly used as it directly measures prediction accuracy.

In [30]:
def hyperparameter_tuning(model, params, X, y, tuning_type='grid', cv=5, n_iter=10, scoring='neg_root_mean_squared_error', random_state=42):
    search = None
    
    if tuning_type == 'grid':
        search = GridSearchCV(model, param_grid=params, cv=cv, scoring=scoring, n_jobs=-1)
    elif tuning_type == 'random':
        search = RandomizedSearchCV(model, param_distributions=params, cv=cv, n_iter=n_iter, 
                                    scoring=scoring, random_state=random_state, n_jobs=-1)
    else:
        raise ValueError("Invalid tuning_type. Choose either 'grid' or 'random'.")

    search.fit(X, y) 
    
    best_params = search.best_params_
    best_score = search.best_score_
    best_model = search.best_estimator_

    print(f"Scoring is {scoring}")
    print("Best Cross-Validation Score:", best_score)
    # print("Best Parameters:", best_params)
    
    return best_params, best_score, best_model


# Tuning

Find the best parameters using random or grid search w/ CV.

Then saves model, parameters, and metrics.

In [37]:

model_logs = []

for model_name, model, params in models:
    
    best_params, best_score, best_model = hyperparameter_tuning(model, params, X, y, tuning_type='random')

    # best_model.fit(X_train, y_train)
    y_pred = best_model.predict(X_test)
    
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    mae = mean_absolute_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    metrics = {
        "MAE" : mae,
        "RMSE" : rmse,
        "r2" : r2,
        "best CV RMSE" : best_score
    }

    
    
    print('-' * 30)
    print(f"Model: {model_name}")
    print(f"  Best CV Score: {best_score:.4f}")
    print(f"  RMSE: {rmse:.4f}")
    print(f"  MAE: {mae:.4f}")
    print(f"  R²: {r2:.4f}")
    print('-' * 30)
    print()

    model_logs.append({
        "model_name": model_name,
        "model": best_model,
        "params": best_params,
        # "score": best_score,
        "metrics": metrics,
    })


Scoring is neg_root_mean_squared_error
Best Cross-Validation Score: -0.04178971600569685
------------------------------
Model: Decision Tree
  Best CV Score: -0.0418
  RMSE: 0.0476
  MAE: 0.0287
  R²: 0.9348
------------------------------



  self.best_estimator_.fit(X, y, **fit_params)


Scoring is neg_root_mean_squared_error
Best Cross-Validation Score: -0.04333864404919406
------------------------------
Model: Random Forest
  Best CV Score: -0.0433
  RMSE: 0.0354
  MAE: 0.0196
  R²: 0.9638
------------------------------



  y = column_or_1d(y, warn=True)


Scoring is neg_root_mean_squared_error
Best Cross-Validation Score: -0.03960334651663975
------------------------------
Model: Gradient Boosting
  Best CV Score: -0.0396
  RMSE: 0.0401
  MAE: 0.0236
  R²: 0.9537
------------------------------

Scoring is neg_root_mean_squared_error
Best Cross-Validation Score: -0.040003799863844024
------------------------------
Model: XBGradient Boost
  Best CV Score: -0.0400
  RMSE: 0.0375
  MAE: 0.0223
  R²: 0.9595
------------------------------



  y = column_or_1d(y, warn=True)


Scoring is neg_root_mean_squared_error
Best Cross-Validation Score: -0.047364349192800304
------------------------------
Model: Neural Network (MLP Regressor)
  Best CV Score: -0.0474
  RMSE: 0.0334
  MAE: 0.0198
  R²: 0.9679
------------------------------



In [None]:
import mlflow
import mlflow.sklearn



# run_name = "tuning"
# artifact_path = "artifact"

mlflow.set_tracking_uri("http://localhost:5000")
mlflow.set_experiment("Model Tuning")

for i, element in enumerate(model_logs):
    model_name = element["model_name"]
    model = element["model"]
    params = element["params"]
    metrics = element["metrics"]

    # model_name = element[0]
    # model = element[1]
    # params = element[2]
    # metrics = model_logs[i]
    
    # with mlflow.start_run(run_name=run_name) as run:
    with mlflow.start_run():
        #   model.get_depth()  model.get_n_leaves())
        mlflow.log_params(params)
        mlflow.log_metrics(metrics)  
        # mlflow.set_tag("Training Info", model_name)
        mlflow.set_tag("mlflow.runName", model_name)

        # signature = infer_signature(X_train, model.predict(X_train))
        
        if "XGB" in model_name:
            mlflow.xgboost.log_model(model, model_name)
            # mlflow.xgboost.log_model(
            #     sk_model=model, input_example=X_test, artifact_path=artifact_path
            # )
        else:
            mlflow.sklearn.log_model(model, model_name)  
            # mlflow.sklearn.log_model(
            #     sk_model=model, input_example=X_test, artifact_path=artifact_path
            # )





🏃 View run Decision Tree at: http://localhost:5000/#/experiments/675764874958078904/runs/6abb9a20680549e69c726913e3f806c1
🧪 View experiment at: http://localhost:5000/#/experiments/675764874958078904




🏃 View run Random Forest at: http://localhost:5000/#/experiments/675764874958078904/runs/8801d978318e4cfaa32c0421f2801910
🧪 View experiment at: http://localhost:5000/#/experiments/675764874958078904




🏃 View run Gradient Boosting at: http://localhost:5000/#/experiments/675764874958078904/runs/762de682d3e14f44aeb5775a65939e0f
🧪 View experiment at: http://localhost:5000/#/experiments/675764874958078904




🏃 View run XBGradient Boost at: http://localhost:5000/#/experiments/675764874958078904/runs/d21596d3207f4d41b1c1ee42aa4c8259
🧪 View experiment at: http://localhost:5000/#/experiments/675764874958078904




🏃 View run Neural Network (MLP Regressor) at: http://localhost:5000/#/experiments/675764874958078904/runs/7e39ef9589e4420f994aab2477090888
🧪 View experiment at: http://localhost:5000/#/experiments/675764874958078904


In [9]:

run_id = "7e39ef9589e4420f994aab2477090888"
model_name = "NN"
model_uri = f"runs:{run_id}/{model_name}"

result = mlflow.register_model(
    model_uri, model_name
)

Registered model 'NN' already exists. Creating a new version of this model...


MlflowException: Not a proper runs:/ URI: runs:7e39ef9589e4420f994aab2477090888/NN. Runs URIs must be of the form 'runs:/<run_id>/run-relative/path/to/artifact'