# Hyper-parameter optimization:

This notebook focuses on hyper-parameter optimization for the `MLPRegressor` model. 

In [1]:
import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix, csc_matrix
import logging
import time
from functools import partial
from hyperopt import hp,fmin,tpe,Trials
from hyperopt.pyll.base import scope
import price_alchemy.data_preprocessing as dp
import price_alchemy.config as cfg
import cred
import price_alchemy.logging_setup as ls
import price_alchemy.data_loading as dl
import price_alchemy.model_dispatcher as md
import price_alchemy.hpo as hpo
import price_alchemy.train as tr
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_log_error
import mlflow
from mlflow.models import infer_signature

In [2]:
cd ..

/Users/mehuljain/Documents/course_related/ML_Ops/project/Price_Alchemy


  self.shell.db['dhist'] = compress_dhist(dhist)[-100:]


In [4]:
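# Alternative source: load the data from the MySQL database (disabled here; the local CSV below is used instead).
# df= dl.load_data_sql(cred.MYSQL_PASSWORD)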

## Load the data:

In [7]:
# Offline fallback (no internet needed): read the training data from the local CSV copy.
# The path is relative, so the working directory must already contain `data/`.
df = pd.read_csv("data/train.csv")

In [8]:
# Subsample the raw frame (sample_size=20,000) for the steps below.
# NOTE(review): no seed is passed here - confirm dp.sample_df is deterministic
# if the recorded results need to be reproducible.
df_sample = dp.sample_df(
    df,
    sample_size=20_000,
)

## Data preprocessing:

In [24]:
# Key of the column-transformer configuration; also logged as an MLflow tag later on.
col_trans = 'tfidf_full'
# Text preprocessing option selected from the project config.
text_prep = cfg.TEXT_PREP_OPTS['spacy']

# Run the project preprocessing pipeline on the subsample to obtain
# the feature matrix X and the target y.
X, y = dp.preprocessing_pipe(
    df_sample,
    text_prep,
    cfg.COL_TRANS_OPTS[col_trans],
)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['category_name'].replace('', np.nan, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  m_df['price'] = pd.to_numeric(m_df['price'], errors='coerce')


In [25]:
# Sanity check: the feature matrix and the target must have the same number of rows.
X.shape, y.shape

((19915, 22725), (19915,))

In [26]:
# Persist the preprocessed features and target through the project helper.
# Only the file name is given here; where the file ends up is decided inside
# dp.dump_preprocessed_data (not visible from this notebook).
dp.dump_preprocessed_data(
    [X, y],
    "tfidf_fullvocab_data_sm.pickle",
)

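## Fine-tune parameters for the model:

> **Note:** the execution counts of the cells below restart at 8, so their recorded outputs come from a different kernel session than the preprocessing cells above. Run the notebook top to bottom (Restart & Run All) before relying on these results, so that `X` and `y` are the objects produced above.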

In [8]:
# Fetch the estimator registered under the 'mlp' key of the project's model dispatcher.
model_dispatch = md.models["mlp"]

In [9]:
# Objective for hyperopt: hpo.optimize with the data and the model bound up front,
# so that fmin only has to pass in the sampled parameter set.
optimization_function = partial(
    hpo.optimize,
    X=X,
    y=y.values,
    model=model_dispatch,
)

# Trials object that records every evaluation of the search.
trials = Trials()

# TPE search over the space declared in md.PARAMS, limited to 15 evaluations.
# NOTE(review): no `rstate` is passed, so the sampled configurations are not
# reproducible between runs - consider seeding if results must be repeatable.
result = fmin(
    fn=optimization_function,
    space=md.PARAMS,
    algo=tpe.suggest,
    max_evals=15,
    trials=trials,
)

100%|██████████| 15/15 [1:13:04<00:00, 292.32s/trial, best loss: 0.638574052984292] 


What are the optimized parameters?

In [10]:
# Best parameter assignment returned by fmin.
# 'learning_rate' is used as a list index (not a label) when the model is retrained.
result

{'batch_size': 392,
 'hidden_layers': 2.0,
 'hidden_neurons': 40.0,
 'learning_rate': 0,
 'learning_rate_init': 0.07178543610281606,
 'max_iter': 174.0}

Retrain the model with the optimized parameters:

In [11]:
# Labels that result["learning_rate"] indexes into.
# NOTE(review): assumes md.PARAMS declares the learning-rate choices in exactly
# this order - confirm against the search space definition.
lr_type = ["invscaling", "adaptive"]

# fmin reports numeric values as floats, so cast them back before use.
n_layers = int(result['hidden_layers'])
n_neurons = int(result['hidden_neurons'])

# Translate the raw search result into estimator keyword arguments.
ps = {
    # one entry per hidden layer, each with the same width
    'hidden_layer_sizes': (n_neurons,) * n_layers,
    "max_iter": int(result["max_iter"]),
    "learning_rate_init": float(result["learning_rate_init"]),
    "batch_size": int(result["batch_size"]),
    "learning_rate": lr_type[int(result["learning_rate"])],
}

print(ps)

# Apply the optimized parameters to the dispatcher's model instance (mutated in place).
model_dispatch.set_params(**ps)

# Fit with the tuned configuration; returns the fitted model and its evaluation metrics.
model, metrics = tr.train_model(X, y.values, model=model_dispatch)

{'hidden_layer_sizes': (40, 40), 'max_iter': 174, 'learning_rate_init': 0.07178543610281606, 'batch_size': 392, 'learning_rate': 'invscaling'}


Print the metrics from the cross-validation

In [12]:
# Metrics dictionary returned by tr.train_model for the retrained model.
metrics

{'mse': 1195.190586288016,
 'rmse': 34.26525658530161,
 'r_2': 0.20634697349776737,
 'rmsle': 0.638574052984292}

In [13]:
# Spot check: predictions for the first 20 rows, to compare by eye with the true targets.
model.predict(X[:20])

array([ 9.85326749, 15.7522718 , 23.77956996,  9.85326749, 22.33101569,
       51.88262699, 13.65574284,  9.85326749, 13.32648811,  9.85326749,
       30.14747206, 12.97507125, 23.79788408, 13.01321044, 23.39780098,
       49.76458554, 15.34987673, 19.69728455, 83.94379329, 17.0519751 ])

In [14]:
# True target values for the same first 20 rows.
y[:20].values

array([ 12.,  76.,  12.,   3.,  30., 225.,  16.,  12.,  15.,  17.,  28.,
         9.,  19.,   4.,  14.,  71.,  10.,  14.,  80.,  17.])

## Log the results in MLflow:

Log the optimal parameters, the evaluation metrics, and the fitted model in MLflow for later use.

In [16]:
# Point MLflow at the tracking server used for logging
# (a server must already be listening on this local address).
mlflow.set_tracking_uri(uri="http://127.0.0.1:9000")

# Select the experiment the run is filed under (MLflow creates it if it does not exist).
# NOTE(review): the experiment and registered-model names say "chargram" while this
# notebook sets col_trans = 'tfidf_full' - confirm the run is filed under the intended names.
mlflow.set_experiment("tfidf_chargram")

# Start an MLflow run; everything logged inside the block belongs to this run.
with mlflow.start_run():
    # Log the full parameter set of the fitted estimator
    mlflow.log_params(model.get_params())

    # Log the evaluation metrics returned by tr.train_model
    mlflow.log_metric("mean squared error", metrics['mse'])
    mlflow.log_metric("root mean squared error", metrics['rmse'])
    # metrics['rmsle'] is the *root* mean squared log error; name it accordingly so it
    # is not mistaken for the un-rooted MSLE (and matches the "root ..." naming above).
    mlflow.log_metric("root mean squared log error", metrics['rmsle'])
    mlflow.log_metric("r2", metrics['r_2'])

    # Tags that record what data and preprocessing this run was based on
    mlflow.set_tag("sample_size", f"{X.shape[0]}")
    mlflow.set_tag("num_cols", f"{X.shape[1]}")
    mlflow.set_tag("text_preprocessor", f"{text_prep}")
    mlflow.set_tag("column_transformer", f"{col_trans}")
    mlflow.set_tag("model_name",f"{type(model).__name__}")

    # Infer the model signature (input/output schema) from the data and its predictions
    signature = infer_signature(X, model.predict(X))

    # A handful of rows is enough to document the expected input format; passing the
    # whole training matrix would store the full dataset next to the model artifact.
    input_example = X[:5]

    # Log the model and register it as a new version in the model registry
    model_info = mlflow.sklearn.log_model(
        sk_model=model,
        artifact_path="mlp3_reg",
        signature=signature,
        input_example=input_example,
        registered_model_name="mlp-chargram-optimized",
    )

Registered model 'mlp-chargram-optimized' already exists. Creating a new version of this model...
2024/04/19 20:45:21 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: mlp-chargram-optimized, version 2
Created version '2' of model 'mlp-chargram-optimized'.
