# Experiment tracking with MLflow

This notebook is about running different experiments and tracking them with MLflow.

In [1]:
import numpy as np
import pandas as pd    
# price alchemy imports
import price_alchemy.config as cfg
from price_alchemy.config import WordVectorTransformer
from price_alchemy.data_loading import load_data_sql, load_data_gcp
from price_alchemy.data_preprocessing import sample_df, preprocessing_pipe
from price_alchemy import train
from price_alchemy import model_dispatcher as mdl
from cred import MYSQL_PASSWORD

import mlflow
from mlflow.models import infer_signature

In [2]:
cd ..

/Users/mehuljain/Documents/course_related/ML_Ops/project/Price_Alchemy


  self.shell.db['dhist'] = compress_dhist(dhist)[-100:]


In [3]:
ls

README.md             mlruns/               setup.cfg
config/               models/               setup.py
dags/                 notebooks/            tests/
data/                 plugins/              tox.ini
docker-compose.yaml   pyproject.toml        working_data/
logs/                 requirements.txt
mlartifacts/          requirements_dev.txt


## Load the data:

In [4]:
# Load the dataset through load_data_sql; the MySQL password is imported
# from the local `cred` module rather than written into the notebook.
df = load_data_sql(MYSQL_PASSWORD)

In [5]:
# Fallback when there is no internet connection: uncomment to load the data from a local CSV instead.
# df= pd.read_csv('data/train.csv')

Since the dataset is very large, we will select a subset of rows from the dataset.

In [6]:
# Number of rows requested from sample_df; the full dataset is too large to
# iterate on quickly.
SAMPLE_SIZE = 20000
df_sample = sample_df(df, sample_size=SAMPLE_SIZE)

## Data Preprocessing

In [7]:
# Pick the text-preprocessing option and the column-transformer key.
# Both names are reused later as MLflow tags, so keep them as variables.
text_prep = cfg.TEXT_PREP_OPTS['spacy']
col_trans = 'tfidf_chargram'

# Look up the transformer spec for the chosen key, then build the feature
# matrix X and the target y from the sampled frame.
col_trans_spec = cfg.COL_TRANS_OPTS[col_trans]
X, y = preprocessing_pipe(df_sample, text_prep, col_trans_spec)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['category_name'].replace('', np.nan, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  m_df['price'] = pd.to_numeric(m_df['price'], errors='coerce')


In [8]:
# Show the selected text-preprocessing option and column-transformer key.
text_prep, col_trans

('version_2', 'tfidf_chargram')

In [9]:
# Dimensions of the feature matrix and of the target after preprocessing.
X.shape, y.shape

((19899, 21597), (19899,))

## Model Training:

In [10]:
# Key of the model entry to fetch from the dispatcher's `models` mapping.
MODEL_KEY = 'mlp_three'
model_dispatch = mdl.models[MODEL_KEY]

In [11]:
# Train the selected model. train_model returns the fitted estimator together
# with a dictionary of evaluation metrics; the target is passed as a plain
# array via `.values`.
model, metrics = train.train_model(
    X,
    y.values,
    model=model_dispatch,
)

In [12]:
# Class name of the estimator returned by train_model.
type(model).__name__

'MLPRegressor'

Let's look at the metrics returned by the training step:

In [13]:
# Metrics dictionary returned by train.train_model.
metrics

{'mse': 884.0765325883979,
 'rmse': 29.679470443352887,
 'r_2': 0.270624432316357,
 'rmsle': 0.6238843318348232}

In [14]:
# Predicted values for the first 20 rows of X.
model.predict(X[:20])

array([21.94690821, 69.25834846, 40.55355851,  9.33425172, 35.92195709,
       72.68633802, 58.03286239, 48.7548652 , 15.86368067,  9.33425172,
       24.81447286, 10.98481595, 17.84595648, 16.71253165,  9.33425172,
        9.46586458, 17.01603458, 19.4312279 , 72.20292543, 35.29102883])

In [15]:
# Actual target values for the first 20 rows, to compare against predictions.
y[:20].values

array([15., 81., 10., 12., 24., 29., 64., 46., 10.,  4., 35.,  8., 22.,
       11., 12.,  4., 15., 16., 22., 30.])

## Logging experiment:

In [16]:
# Point the MLflow client at the local tracking server.
mlflow.set_tracking_uri(uri="http://127.0.0.1:8080")

# Select the experiment this run is filed under.
mlflow.set_experiment("tfidf_chargram")

# A few rows are enough to document the model's input format. Passing the
# whole feature matrix as the input example would store every row of X as a
# model artifact.
input_example = X[:5]

# Start an MLflow run; everything logged inside the block belongs to it.
with mlflow.start_run():
    # Log the hyperparameters of the fitted estimator.
    mlflow.log_params(model.get_params())

    # Log the evaluation metrics returned by train.train_model.
    mlflow.log_metric("mean squared error", metrics['mse'])
    mlflow.log_metric("root mean squared error", metrics['rmse'])
    # metrics['rmsle'] is a *root* mean squared log error, so the metric name
    # says so. Runs logged earlier under "mean squared log error" hold the
    # same quantity under the old name.
    mlflow.log_metric("root mean squared log error", metrics['rmsle'])
    mlflow.log_metric("r2", metrics['r_2'])

    # Tags describing the data and pipeline configuration behind this run.
    mlflow.set_tag("sample_size", str(X.shape[0]))
    mlflow.set_tag("num_cols", str(X.shape[1]))
    mlflow.set_tag("text_preprocessor", str(text_prep))
    mlflow.set_tag("column_transformer", str(col_trans))
    mlflow.set_tag("model_name", type(model).__name__)

    # Infer the model signature from the small example instead of predicting
    # on the full matrix again; the schema does not depend on the row count.
    signature = infer_signature(input_example, model.predict(input_example))

    # Log the model and register it as a new version of "mlp-chargram".
    model_info = mlflow.sklearn.log_model(
        sk_model=model,
        artifact_path="mlp3_reg",
        signature=signature,
        input_example=input_example,
        registered_model_name="mlp-chargram",
    )

Registered model 'mlp-chargram' already exists. Creating a new version of this model...
2024/04/15 18:52:45 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: mlp-chargram, version 2
Created version '2' of model 'mlp-chargram'.
