<h1><center>Main Notebook - Different for each Model</center></h1>

# Imports & Setup

In [18]:
%load_ext autoreload
%autoreload 2
from utils import *
import utils.transformers.transformers as tr
import utils.transformers.sk4pandas as s4p
import utils.transformers.knimp as knp

import os
import pickle
import urllib.request
import warnings

import git
from IPython.core.magic import register_cell_magic
import mlflow
import pandas as pd
from sklearn import set_config
from sklearn.base import clone
from sklearn.compose import make_column_selector
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import RandomizedSearchCV, train_test_split
from sklearn.pipeline import Pipeline
from scipy.stats import uniform, loguniform

warnings.filterwarnings('ignore')
set_config(display='text')

git_repo = git.Repo(os.getcwd())

@register_cell_magic
def run_and_save(line, cell):
    'Run and save python code block to a file'
    with open(line, 'wt') as fd:
        fd.write(cell)
    code = compile(cell, line, 'exec')
    exec(code, globals())

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# Init Datasets

In [None]:
raw_main_df = pd.read_csv('data/train.csv')
raw_sub_df = pd.read_csv('data/test.csv')
raw_train_df, raw_test_df = train_test_split(raw_main_df, test_size=100, random_state=42)

X_all, y_all = raw_main_df.drop('Survived', axis=1), raw_main_df['Survived']
X_train, y_train = raw_train_df.drop('Survived', axis=1), raw_train_df['Survived']
X_test, y_test = raw_test_df.drop('Survived', axis=1), raw_test_df['Survived']

raw_train_df.sample(5, random_state=42)

# Prepare Pipeline

In [22]:
%%run_and_save blueprints/pipeline.py


clip_outliers = [(
    'Float',
    tr.ClipOutliers(std_band=3),
    make_column_selector(dtype_include=['float64'])
)]

fillna_constant = [(
    'Cabin',
    s4p.SimpleImputer(strategy='constant', fill_value='UNKNOWN'),
    ['Cabin']
),(
    'Age',
    s4p.SimpleImputer(strategy='constant', fill_value=-9999.),
    ['Age']
)]


steps_main = [
    ('Set working columns', tr.SetupFeatures(cols_ignore=['PassengerId', 'Name', 'Ticket']) ),
    ('Format Cabins', tr.Cabin() ),
    ('Standardize', s4p.StandardScaler(cols_select=['Age', 'Fare']) ),
    ('Clip Outliers', tr.ClipOutliers(cols_select=['Age', 'Fare'], std_band=3)),
    ('Replace NaNs With Constants', s4p.ColumnTransformer(fillna_constant, remainder='passthrough') ),
    ('Prepare coltypes for imputation', tr.AsTypes() ),
    ('Impute NaNs left with KNN', knp.KNImp(min_frequency=20) ),
    ('OneHot Encoding', s4p.OneHotEncoder(cols_select=['Cabin', 'Pclass', 'Sex', 'SibSp', 'Embarked', 'Parch']) ),
    ('Gradient Boosting', GradientBoostingClassifier(n_estimators=150))
]

pipe = Pipeline(steps_main)

# Grid Params

In [39]:
%%run_and_save blueprints/grid_params.py

n_iter = 70

ml_params_distributions = {
    'Gradient Boosting__n_estimators': uniform(250, 400).rvs(n_iter).astype('int'),
#    'Gradient Boosting__max_depth': [6, 8, 10, 15],
    'Gradient Boosting__learning_rate': loguniform(0.001, 0.07),
    'Gradient Boosting__subsample': uniform(0.8, 1),
    'Gradient Boosting__min_samples_leaf': uniform(20, 70).rvs(n_iter).astype('int'),
#    'Gradient Boosting__max_features': ['sqrt']
}

rs = RandomizedSearchCV (
    pipe,
    param_distributions = ml_params_distributions,
    n_iter=n_iter,
    n_jobs=-1,
    refit=True,
    cv=CVSplitter(5, 90),
    return_train_score=True,
)

# ML Flow Run

In [None]:
mlflow.sklearn.autolog(max_tuning_runs=5)

with mlflow.start_run(run_name='Gradient Boost') as run:
    mlflow.set_tags(get_lastcommit_infos(git_repo))
    mlflow.log_artifact('blueprints/pipeline.py', 'blueprints')
    mlflow.log_artifact('blueprints/grid_params.py', 'blueprints')
    #pipe.fit(X_train, y_train)
    rs.fit(X_all, y_all)

# Refit best model

In [41]:
run_id = '0aa918632c43464ab973ec1f8666aceb'
url_local_model = f'file:///Users/i538262/Desktop/Developments/PERSO/KAGGLE%20Titanic/mlruns/0/{run_id}/artifacts/model/model.pkl'
with urllib.request.urlopen(url_local_model) as model_file:
    best_model = pickle.load(model_file)
    
best_model_allfit = clone(best_model)
best_model_allfit.fit(X_all, y_all)

2022/05/26 00:17:31 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID 'db3485b3ebe64ade9c7f84a7ebe6434a', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current sklearn workflow
                 SetupFeatures(cols_ignore=['PassengerId', 'Name', 'Ticket'])),
                ('Format Cabins', Cabin()),
                ('Standardize',
                 StandardScaler(cols_ignore=['Pclas...`
       631, 529, 643, 588, 355, 286, 435, 377, 552, 413, 290, 555, 621,
       478, 523, 595, 649, 294, 580, 452, 319, 354, 384, 304, 619, ...`


<KNImputer> Cols that will be OneHot encoded :
	- Cabin {'A', 'F', 'G', 'E', 'T', 'D', 'UNKNOWN', 'B', 'C'}
	- Pclass {1, 2, 3}
	- Sex {'female', 'male'}
	- SibSp {0, 1, 2, 3, 4, 5, 8}
	- Parch {0, 1, 2, 3, 4, 5, 6}
	- Embarked {nan, 'Q', 'C', 'S'}


2022/05/26 00:18:51 INFO mlflow.sklearn.utils: Logging the 5 best runs, 65 runs will be omitted.
                            'Embarked'],
  ...`
                  Age__fill_value=-9999.0, Age__strategy='constant',
                  Cabin__cols_ignore=[], Cabin__cols_select=[],
                  Cabin__fill_value='UNKNOWN', Cabin__...`
                            'Embarked'],
  ...`
                  Age__fill_value=-9999.0, Age__strategy='constant',
                  Cabin__cols_ignore=[], Cabin__cols_select=[],
                  Cabin__fill_value='UNKNOWN', Cabin__...`
                            'Embarked'],
  ...`
                  Age__fill_value=-9999.0, Age__strategy='constant',
                  Cabin__cols_ignore=[], Cabin__cols_select=[],
                  Cabin__fill_value='UNKNOWN', Cabin__...`
                            'Embarked'],
  ...`
                  Age__fill_value=-9999.0, Age__strategy='constant',
                  Cabin__cols_ignore=[], Cabin__cols_select=[],
  

RandomizedSearchCV(cv=<utils.CVSplitter object at 0x7fb5a4dd6d90>,
                   estimator=Pipeline(steps=[('Set working columns',
                                              SetupFeatures(cols_ignore=['PassengerId',
                                                                         'Name',
                                                                         'Ticket'])),
                                             ('Format Cabins', Cabin()),
                                             ('Standardize',
                                              StandardScaler(cols_ignore=['Pclass',
                                                                          'Sex',
                                                                          'SibSp',
                                                                          'Parch',
                                                                          'Cabin',
                                                             

# Submit models predictions

In [42]:
branch_name = get_lastcommit_infos(git_repo)['Branch']
submit(f'submissions/{branch_name}/{run_id}.csv', raw_sub_df.PassengerId, best_model_allfit.predict(raw_sub_df))

<KNImputer> Cols that will be OneHot encoded :
	- Cabin {'A', 'D', 'C', 'UNKNOWN', 'E', 'G', 'B', 'F'}
	- Pclass {1, 2, 3}
	- Sex {'male', 'female'}
	- SibSp {0, 1, 2, 3, 4, 5, 8}
	- Parch {0, 1, 2, 3, 4, 5, 6}
	- Embarked {nan, 'S', 'Q', 'C'}
<KNImputer> Cols that will be OneHot encoded :
	- Cabin {'T', 'A', 'D', 'C', 'UNKNOWN', 'E', 'G', 'B', 'F'}
	- Pclass {1, 2, 3}
	- Sex {'male', 'female'}
	- SibSp {0, 1, 2, 3, 4, 5, 8}
	- Parch {0, 1, 2, 3, 4, 5, 6}
	- Embarked {nan, 'S', 'Q', 'C'}
<KNImputer> Cols that will be OneHot encoded :
	- Cabin {'T', 'A', 'D', 'C', 'UNKNOWN', 'E', 'G', 'B', 'F'}
	- Pclass {1, 2, 3}
	- Sex {'male', 'female'}
	- SibSp {0, 1, 2, 3, 4, 5, 8}
	- Parch {0, 1, 2, 3, 4, 5}
	- Embarked {nan, 'S', 'Q', 'C'}
<KNImputer> Cols that will be OneHot encoded :
	- Cabin {'T', 'A', 'D', 'C', 'UNKNOWN', 'E', 'G', 'B', 'F'}
	- Pclass {1, 2, 3}
	- Sex {'male', 'female'}
	- SibSp {0, 1, 2, 3, 4, 5, 8}
	- Parch {0, 1, 2, 3, 4, 5, 6}
	- Embarked {nan, 'S', 'Q', 'C'}
<KNImputer> 

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[col] = value
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[col] = value
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[col] = value
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the docu

<KNImputer> Cols that will be OneHot encoded :
	- Cabin {'E', 'B', 'A', 'G', 'UNKNOWN', 'F', 'C', 'D', 'T'}
	- Pclass {1, 2, 3}
	- Sex {'female', 'male'}
	- SibSp {0, 1, 2, 3, 4, 5, 8}
	- Parch {0, 1, 2, 3, 4, 5, 6}
	- Embarked {nan, 'Q', 'S', 'C'}
<KNImputer> Cols that will be OneHot encoded :
	- Cabin {'E', 'B', 'A', 'G', 'UNKNOWN', 'F', 'C', 'D', 'T'}
	- Pclass {1, 2, 3}
	- Sex {'female', 'male'}
	- SibSp {0, 1, 2, 3, 4, 5, 8}
	- Parch {0, 1, 2, 3, 4, 5, 6}
	- Embarked {nan, 'Q', 'S', 'C'}
<KNImputer> Cols that will be OneHot encoded :
	- Cabin {'B', 'E', 'A', 'G', 'UNKNOWN', 'F', 'C', 'D', 'T'}
	- Pclass {1, 2, 3}
	- Sex {'female', 'male'}
	- SibSp {0, 1, 2, 3, 4, 5, 8}
	- Parch {0, 1, 2, 3, 4, 5}
	- Embarked {nan, 'Q', 'S', 'C'}
<KNImputer> Cols that will be OneHot encoded :
	- Cabin {'E', 'B', 'A', 'G', 'UNKNOWN', 'F', 'C', 'D'}
	- Pclass {1, 2, 3}
	- Sex {'female', 'male'}
	- SibSp {0, 1, 2, 3, 4, 5, 8}
	- Parch {0, 1, 2, 3, 4, 5, 6}
	- Embarked {nan, 'Q', 'S', 'C'}
<KNImputer> 

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[col] = value
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[col] = value
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[col] = value
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the docu

<KNImputer> Cols that will be OneHot encoded :
	- Cabin {'D', 'F', 'G', 'A', 'B', 'UNKNOWN', 'T', 'C', 'E'}
	- Pclass {1, 2, 3}
	- Sex {'male', 'female'}
	- SibSp {0, 1, 2, 3, 4, 5, 8}
	- Parch {0, 1, 2, 3, 4, 5, 6}
	- Embarked {'S', 'C', 'Q', nan}
<KNImputer> Cols that will be OneHot encoded :
	- Cabin {'D', 'F', 'G', 'B', 'A', 'UNKNOWN', 'C', 'E'}
	- Pclass {1, 2, 3}
	- Sex {'male', 'female'}
	- SibSp {0, 1, 2, 3, 4, 5, 8}
	- Parch {0, 1, 2, 3, 4, 5, 6}
	- Embarked {'S', 'C', 'Q', nan}
<KNImputer> Cols that will be OneHot encoded :
	- Cabin {'D', 'F', 'G', 'B', 'A', 'UNKNOWN', 'T', 'C', 'E'}
	- Pclass {1, 2, 3}
	- Sex {'male', 'female'}
	- SibSp {0, 1, 2, 3, 4, 5, 8}
	- Parch {0, 1, 2, 3, 4, 5, 6}
	- Embarked {'S', 'C', 'Q', nan}
<KNImputer> Cols that will be OneHot encoded :
	- Cabin {'D', 'F', 'G', 'B', 'A', 'UNKNOWN', 'T', 'C', 'E'}
	- Pclass {1, 2, 3}
	- Sex {'male', 'female'}
	- SibSp {0, 1, 2, 3, 4, 5, 8}
	- Parch {0, 1, 2, 3, 4, 5, 6}
	- Embarked {'Q', 'C', 'S', nan}
<KNImpute

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[col] = value
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[col] = value
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[col] = value
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the docu

<KNImputer> Cols that will be OneHot encoded :
	- Cabin {'UNKNOWN', 'C', 'D', 'E', 'A', 'F', 'G', 'B'}
	- Pclass {1, 2, 3}
	- Sex {'male', 'female'}
	- SibSp {0, 1, 2, 3, 4, 5, 8}
	- Parch {0, 1, 2, 3, 4, 5, 6}
	- Embarked {'Q', nan, 'C', 'S'}
<KNImputer> Cols that will be OneHot encoded :
	- Cabin {'UNKNOWN', 'C', 'D', 'E', 'T', 'A', 'G', 'F', 'B'}
	- Pclass {1, 2, 3}
	- Sex {'male', 'female'}
	- SibSp {0, 1, 2, 3, 4, 5, 8}
	- Parch {0, 1, 2, 3, 4, 5, 6}
	- Embarked {'Q', nan, 'C', 'S'}
<KNImputer> Cols that will be OneHot encoded :
	- Cabin {'UNKNOWN', 'C', 'D', 'E', 'T', 'A', 'F', 'G', 'B'}
	- Pclass {1, 2, 3}
	- Sex {'male', 'female'}
	- SibSp {0, 1, 2, 3, 4, 5, 8}
	- Parch {0, 1, 2, 3, 4, 5, 6}
	- Embarked {'Q', nan, 'C', 'S'}
<KNImputer> Cols that will be OneHot encoded :
	- Cabin {'UNKNOWN', 'C', 'D', 'E', 'A', 'F', 'G', 'B'}
	- Pclass {1, 2, 3}
	- Sex {'male', 'female'}
	- SibSp {0, 1, 2, 3, 4, 5, 8}
	- Parch {0, 1, 2, 3, 4, 5, 6}
	- Embarked {'Q', nan, 'C', 'S'}
<KNImputer> Co

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[col] = value
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[col] = value
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[col] = value
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the docu

<KNImputer> Cols that will be OneHot encoded :
	- Cabin {'T', 'F', 'UNKNOWN', 'B', 'E', 'A', 'G', 'D', 'C'}
	- Pclass {1, 2, 3}
	- Sex {'female', 'male'}
	- SibSp {0, 1, 2, 3, 4, 5, 8}
	- Parch {0, 1, 2, 3, 4, 5}
	- Embarked {nan, 'C', 'Q', 'S'}
<KNImputer> Cols that will be OneHot encoded :
	- Cabin {'T', 'F', 'UNKNOWN', 'B', 'E', 'A', 'G', 'D', 'C'}
	- Pclass {1, 2, 3}
	- Sex {'female', 'male'}
	- SibSp {0, 1, 2, 3, 4, 5, 8}
	- Parch {0, 1, 2, 3, 4, 5}
	- Embarked {nan, 'C', 'Q', 'S'}
<KNImputer> Cols that will be OneHot encoded :
	- Cabin {'T', 'F', 'UNKNOWN', 'B', 'E', 'G', 'A', 'D', 'C'}
	- Pclass {1, 2, 3}
	- Sex {'female', 'male'}
	- SibSp {0, 1, 2, 3, 4, 5, 8}
	- Parch {0, 1, 2, 3, 4, 5, 6}
	- Embarked {nan, 'C', 'Q', 'S'}
<KNImputer> Cols that will be OneHot encoded :
	- Cabin {'T', 'F', 'UNKNOWN', 'B', 'E', 'A', 'G', 'D', 'C'}
	- Pclass {1, 2, 3}
	- Sex {'female', 'male'}
	- SibSp {0, 1, 2, 3, 4, 5, 8}
	- Parch {0, 1, 2, 3, 4, 5}
	- Embarked {nan, 'C', 'Q', 'S'}
<KNImputer> C

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[col] = value
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[col] = value
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[col] = value
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the docu

	- Sex {'female', 'male'}
	- SibSp {0, 1, 2, 3, 4, 5, 8}
	- Parch {0, 1, 2, 3, 4, 5, 6}
	- Embarked {nan, 'C', 'Q', 'S'}
<KNImputer> Cols that will be OneHot encoded :
	- Cabin {'B', 'A', 'D', 'T', 'C', 'G', 'F', 'UNKNOWN', 'E'}
	- Pclass {1, 2, 3}
	- Sex {'female', 'male'}
	- SibSp {0, 1, 2, 3, 4, 5, 8}
	- Parch {0, 1, 2, 3, 4, 5, 6}
	- Embarked {nan, 'C', 'Q', 'S'}
<KNImputer> Cols that will be OneHot encoded :
	- Cabin {'B', 'A', 'D', 'T', 'C', 'G', 'F', 'UNKNOWN', 'E'}
	- Pclass {1, 2, 3}
	- Sex {'female', 'male'}
	- SibSp {0, 1, 2, 3, 4, 5, 8}
	- Parch {0, 1, 2, 3, 4, 5, 6}
	- Embarked {nan, 'C', 'Q', 'S'}
<KNImputer> Cols that will be OneHot encoded :
	- Cabin {'B', 'A', 'D', 'T', 'C', 'G', 'F', 'UNKNOWN', 'E'}
	- Pclass {1, 2, 3}
	- Sex {'female', 'male'}
	- SibSp {0, 1, 2, 3, 4, 5, 8}
	- Parch {0, 1, 2, 3, 4, 5, 6}
	- Embarked {nan, 'C', 'Q', 'S'}
<KNImputer> Cols that will be OneHot encoded :
	- Cabin {'B', 'A', 'D', 'T', 'C', 'G', 'F', 'UNKNOWN', 'E'}
	- Pclass {1, 2, 3}
	- S

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[col] = value
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[col] = value
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[col] = value
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the docu

# Model insights

# Code tests