# Batch train models

# Purpose
Train a batch of models on the same dataset, same train/test and cross validation

# Methodology
* Run the [03.1_train_model.ipynb](03.1_train_model.ipynb) for each model.
* model, model score etc. is logged with [mlflow](https://mlflow.org/).

# Setup

In [30]:
# %load imports.py
from typing import no_type_check
%matplotlib inline
%load_ext autoreload
%autoreload 2

import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sympy as sp


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [31]:
import papermill as pm

import sys
from src.logger import logger
import src.mlflow_utils

## Batch run

In [34]:
def create_output_path(parameters, artifact_dir = 'artifacts'):
    
    if not os.path.exists(artifact_dir):
        os.mkdir(artifact_dir)
    
    rolling_str = 'rolling' if parameters['rolling_cv'] else 'random'
    output_name = create_run_name(parameters=parameters)
    output_path = os.path.join(artifact_dir,f'{output_name}.ipynb')
    return output_path

def create_run_name(parameters):
    
    rolling_str = 'rolling' if parameters['rolling_cv'] else 'random'
    name = f'{parameters["model_name"]}_{rolling_str}_{parameters["test_size"]}'
    return name
    
    

In [35]:
artifact_dir = 'artifacts'

parameters = {
    'experiment' : 'rolling_fit',
    'model_name' : 'polynomial',
    'data_path' : '../../data/processed/data_extended.csv',
    'n_splits' : 6,
    'n_repeats' :1,
    'rolling':True,
    'test_size':0.2,
    'input_path':r'pipeline/03.1_train_model.ipynb',
}

models = {
    
    'polynomial' : {
        'input_path':r'pipeline/03.1_train_model.ipynb',
    },

    'SVR' : {
        'input_path':r'pipeline/03.3_train_SVR.ipynb',
    },    
    
    'XGBoost' : {
        'input_path':r'pipeline/03.2_train_xgboost.ipynb',
    },
    
    'ridge' : {
        'input_path':r'pipeline/03.4_train_ridge.ipynb',
    },
    
    'lasso' : {
        'input_path':r'pipeline/03.5_train_lasso.ipynb',
    },

            
}


df_parameters = pd.DataFrame(columns=parameters.keys())

for model_name, model in models.items():
        
    for test_size in [0.1,0.2,0.3]:
    
        for rolling_cv in [True,False]:   
            
            s = pd.Series(parameters)
            
            if not rolling:
                s['experiment'] = 'random_fit'
            
            s['rolling_cv']=rolling_cv
            s['rolling_test']=True
            s['test_size'] = test_size
            
        
            s.update(model)
            s['model_name'] = model_name
            s['run_name'] = create_run_name(parameters=s)
            
            s['output_path'] = create_output_path(parameters=s, artifact_dir = artifact_dir)
            
            df_parameters = df_parameters.append(s, ignore_index=True)

df_parameters

Unnamed: 0,experiment,model_name,data_path,n_splits,n_repeats,rolling,test_size,input_path,output_path,rolling_cv,rolling_test,run_name
0,random_fit,polynomial,../../data/processed/data_extended.csv,6,1,True,0.1,pipeline/03.1_train_model.ipynb,artifacts\polynomial_rolling_0.1.ipynb,1.0,1.0,polynomial_rolling_0.1
1,random_fit,polynomial,../../data/processed/data_extended.csv,6,1,True,0.1,pipeline/03.1_train_model.ipynb,artifacts\polynomial_random_0.1.ipynb,0.0,1.0,polynomial_random_0.1
2,random_fit,polynomial,../../data/processed/data_extended.csv,6,1,True,0.2,pipeline/03.1_train_model.ipynb,artifacts\polynomial_rolling_0.2.ipynb,1.0,1.0,polynomial_rolling_0.2
3,random_fit,polynomial,../../data/processed/data_extended.csv,6,1,True,0.2,pipeline/03.1_train_model.ipynb,artifacts\polynomial_random_0.2.ipynb,0.0,1.0,polynomial_random_0.2
4,random_fit,polynomial,../../data/processed/data_extended.csv,6,1,True,0.3,pipeline/03.1_train_model.ipynb,artifacts\polynomial_rolling_0.3.ipynb,1.0,1.0,polynomial_rolling_0.3
5,random_fit,polynomial,../../data/processed/data_extended.csv,6,1,True,0.3,pipeline/03.1_train_model.ipynb,artifacts\polynomial_random_0.3.ipynb,0.0,1.0,polynomial_random_0.3
6,random_fit,SVR,../../data/processed/data_extended.csv,6,1,True,0.1,pipeline/03.3_train_SVR.ipynb,artifacts\SVR_rolling_0.1.ipynb,1.0,1.0,SVR_rolling_0.1
7,random_fit,SVR,../../data/processed/data_extended.csv,6,1,True,0.1,pipeline/03.3_train_SVR.ipynb,artifacts\SVR_random_0.1.ipynb,0.0,1.0,SVR_random_0.1
8,random_fit,SVR,../../data/processed/data_extended.csv,6,1,True,0.2,pipeline/03.3_train_SVR.ipynb,artifacts\SVR_rolling_0.2.ipynb,1.0,1.0,SVR_rolling_0.2
9,random_fit,SVR,../../data/processed/data_extended.csv,6,1,True,0.2,pipeline/03.3_train_SVR.ipynb,artifacts\SVR_random_0.2.ipynb,0.0,1.0,SVR_random_0.2


In [36]:
assert df_parameters['run_name'].is_unique

In [37]:
for run_name, parameters in df_parameters.iterrows():

    logger.info(f'Running notebook for: {parameters["run_name"]}')
    output_path = parameters['output_path']
    
    if os.path.exists(output_path):
        if os.path.getsize(output_path) > 1000:
            logger.info('Already run, skipping...')
            continue
    
    try:
        pm.execute_notebook(input_path=parameters['input_path'], 
                        output_path=output_path,                   
                        parameters={'run_params':dict(parameters)},
                        cwd=artifact_dir);
    except Exception as e:
        logger.error(e)
        continue

2021-06-14 19:40:25,468 - Running notebook for: polynomial_rolling_0.1


Executing:   0%|          | 0/26 [00:00<?, ?cell/s]

2021-06-14 19:40:40,693 - Running notebook for: polynomial_random_0.1


Executing:   0%|          | 0/26 [00:00<?, ?cell/s]

2021-06-14 19:40:56,935 - Running notebook for: polynomial_rolling_0.2


Executing:   0%|          | 0/26 [00:00<?, ?cell/s]

2021-06-14 19:41:12,113 - Running notebook for: polynomial_random_0.2


Executing:   0%|          | 0/26 [00:00<?, ?cell/s]

2021-06-14 19:41:30,679 - Running notebook for: polynomial_rolling_0.3


Executing:   0%|          | 0/26 [00:00<?, ?cell/s]

2021-06-14 19:41:50,327 - Running notebook for: polynomial_random_0.3


Executing:   0%|          | 0/26 [00:00<?, ?cell/s]

2021-06-14 19:42:05,533 - Running notebook for: SVR_rolling_0.1


Executing:   0%|          | 0/24 [00:00<?, ?cell/s]

2021-06-14 19:42:38,555 - Running notebook for: SVR_random_0.1


Executing:   0%|          | 0/24 [00:00<?, ?cell/s]

2021-06-14 19:43:23,536 - Running notebook for: SVR_rolling_0.2


Executing:   0%|          | 0/24 [00:00<?, ?cell/s]

2021-06-14 19:43:53,873 - Running notebook for: SVR_random_0.2


Executing:   0%|          | 0/24 [00:00<?, ?cell/s]

2021-06-14 19:44:33,196 - Running notebook for: SVR_rolling_0.3


Executing:   0%|          | 0/24 [00:00<?, ?cell/s]

2021-06-14 19:44:59,452 - Running notebook for: SVR_random_0.3


Executing:   0%|          | 0/24 [00:00<?, ?cell/s]

2021-06-14 19:45:31,709 - Running notebook for: XGBoost_rolling_0.1


Executing:   0%|          | 0/24 [00:00<?, ?cell/s]

2021-06-14 20:03:45,048 - Running notebook for: XGBoost_random_0.1


Executing:   0%|          | 0/24 [00:00<?, ?cell/s]

2021-06-14 20:32:13,530 - Running notebook for: XGBoost_rolling_0.2


Executing:   0%|          | 0/24 [00:00<?, ?cell/s]

2021-06-14 20:48:19,313 - Running notebook for: XGBoost_random_0.2


Executing:   0%|          | 0/24 [00:00<?, ?cell/s]

2021-06-14 21:13:47,959 - Running notebook for: XGBoost_rolling_0.3


Executing:   0%|          | 0/24 [00:00<?, ?cell/s]

2021-06-14 21:28:01,837 - Running notebook for: XGBoost_random_0.3


Executing:   0%|          | 0/24 [00:00<?, ?cell/s]

2021-06-14 21:50:35,369 - Running notebook for: ridge_rolling_0.1


Executing:   0%|          | 0/25 [00:00<?, ?cell/s]

2021-06-14 21:51:01,735 - Running notebook for: ridge_random_0.1


Executing:   0%|          | 0/25 [00:00<?, ?cell/s]

2021-06-14 21:51:31,132 - Running notebook for: ridge_rolling_0.2


Executing:   0%|          | 0/25 [00:00<?, ?cell/s]

2021-06-14 21:51:56,458 - Running notebook for: ridge_random_0.2


Executing:   0%|          | 0/25 [00:00<?, ?cell/s]

2021-06-14 21:52:25,732 - Running notebook for: ridge_rolling_0.3


Executing:   0%|          | 0/25 [00:00<?, ?cell/s]

2021-06-14 21:52:50,073 - Running notebook for: ridge_random_0.3


Executing:   0%|          | 0/25 [00:00<?, ?cell/s]

2021-06-14 21:53:17,432 - Running notebook for: lasso_rolling_0.1


Executing:   0%|          | 0/25 [00:00<?, ?cell/s]

2021-06-14 21:53:44,490 - Running notebook for: lasso_random_0.1


Executing:   0%|          | 0/25 [00:00<?, ?cell/s]

2021-06-14 21:54:16,316 - Running notebook for: lasso_rolling_0.2


Executing:   0%|          | 0/25 [00:00<?, ?cell/s]

2021-06-14 21:54:43,663 - Running notebook for: lasso_random_0.2


Executing:   0%|          | 0/25 [00:00<?, ?cell/s]

2021-06-14 21:55:13,662 - Running notebook for: lasso_rolling_0.3


Executing:   0%|          | 0/25 [00:00<?, ?cell/s]

2021-06-14 21:55:38,916 - Running notebook for: lasso_random_0.3


Executing:   0%|          | 0/25 [00:00<?, ?cell/s]

In [None]:
from mlflow.tracking import MlflowClient
import mlflow

In [None]:
print(mlflow.get_registry_uri())

In [None]:
import mlflow
run = mlflow.get_run(run_id='d44831137a6e40f9b1a938cf61676a92')

In [None]:
run.to_dictionary()

In [None]:
run.data.metrics