<h1><center>Main Notebook - Logistic Regression</center></h1>

# Imports & Setup

In [39]:
%load_ext autoreload
%autoreload 2
from utils import *
import utils.transformers.transformers as tr
import utils.transformers.sk4pandas as s4p

import os
import pickle
import urllib.request

import git
from IPython.core.magic import register_cell_magic
import mlflow
import pandas as pd
from sklearn.base import clone
from sklearn.compose import make_column_selector
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import RandomizedSearchCV, train_test_split
from sklearn.pipeline import Pipeline
from scipy.stats import uniform, loguniform

# Repository handle for the kernel's current working directory; it is handed to
# get_lastcommit_infos() further down in the notebook.
git_repo = git.Repo(path=os.getcwd())

@register_cell_magic
def run_and_save(line, cell):
    """Save the cell source to a file, then execute it.

    Usage: put ``%%run_and_save path/to/file.py`` on the first line of a cell.

    Args:
        line: Text following the magic name; used as the destination file path.
        cell: Source code of the cell body.

    Raises:
        ValueError: If no destination path is given after the magic name.
    """
    path = line.strip()
    if not path:
        raise ValueError('usage: %%run_and_save <path>')

    # Create the destination folder on first use so a fresh checkout does not fail.
    parent_dir = os.path.dirname(path)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    # Python source files are UTF-8 by default; do not rely on the locale encoding.
    with open(path, 'wt', encoding='utf-8') as fd:
        fd.write(cell)

    # Compiling with the file name makes tracebacks point at the saved file.
    code = compile(cell, path, 'exec')
    # globals() is the namespace this function was defined in, so names created
    # by the cell stay available to the following cells.
    exec(code, globals())

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# Init Datasets

In [9]:
# train.csv carries the 'Survived' target; test.csv is the set predicted for the submission.
raw_main_df = pd.read_csv('data/train.csv')
raw_sub_df = pd.read_csv('data/test.csv')

# Hold out 100 rows of the labelled data; the fixed seed keeps the split stable.
raw_train_df, raw_test_df = train_test_split(
    raw_main_df,
    test_size=100,
    random_state=42,
)

# Separate the target column from the features for each split.
X_all = raw_main_df.drop(columns='Survived')
y_all = raw_main_df['Survived']

X_train = raw_train_df.drop(columns='Survived')
y_train = raw_train_df['Survived']

X_test = raw_test_df.drop(columns='Survived')
y_test = raw_test_df['Survived']

# Preview a few training rows (the last expression is rendered by the notebook).
raw_train_df.sample(5, random_state=42)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
678,679,0,3,"Goodwin, Mrs. Frederick (Augusta Tyler)",female,43.0,1,6,CA 2144,46.9,,S
49,50,0,3,"Arnold-Franchi, Mrs. Josef (Josefine Franchi)",female,18.0,1,0,349237,17.8,,S
691,692,1,3,"Karun, Miss. Manca",female,4.0,0,1,349256,13.4167,,C
499,500,0,3,"Svensson, Mr. Olof",male,24.0,0,0,350035,7.7958,,S
303,304,1,2,"Keane, Miss. Nora A",female,,0,0,226593,12.35,E101,Q


In [28]:
# Column dtypes of the training features; the pipeline cell selects columns by
# dtype (float64 / int64 / object) through make_column_selector.
X_train.dtypes

PassengerId      int64
Pclass           int64
Name            object
Sex             object
Age            float64
SibSp            int64
Parch            int64
Ticket          object
Fare           float64
Cabin           object
Embarked        object
dtype: object

# Prepare Pipeline

In [29]:
%%run_and_save blueprints/pipeline.py

# Every list below holds (name, transformer, columns) triples that are handed
# to an s4p.ColumnTransformer in `steps_main`.

# Applied to all float64 columns.
# NOTE(review): presumably clips values beyond 3 standard deviations - confirm in utils.transformers.
clip_outliers = [
    ('Float', tr.ClipOutliers(std_band=3.), make_column_selector(dtype_include=['float64'])),
]

# Missing values: mean for float64 columns, most frequent value for int64/object columns.
fillna = [
    ('Mean', s4p.SimpleImputer(strategy='mean'), make_column_selector(dtype_include=['float64'])),
    (
        'Most Frequent',
        s4p.SimpleImputer(strategy='most_frequent'),
        make_column_selector(dtype_include=['int64', 'object']),
    ),
]

# Scaling is restricted to two explicitly named columns.
standardize = [
    ('Floats Ints', s4p.StandardScaler(), ['Age', 'Fare']),
]

# Pipeline steps in execution order; note that 'Standardize' runs before 'Fill NaN'.
# Columns not selected by a ColumnTransformer are passed through.
steps_main = [
    ('Format Cabins', tr.Cabin()),
    (
        'Set working columns',
        tr.SetupFeatures(cols_ignore=['PassengerId', 'Name', 'Ticket', 'Parch']),
    ),
    ('Convert Types', tr.AsTypes(coltypes_overwrite={'Age': 'float64'})),
    ('Clip Outliers', s4p.ColumnTransformer(clip_outliers, remainder='passthrough')),
    ('Standardize', s4p.ColumnTransformer(standardize, remainder='passthrough')),
    ('Fill NaN', s4p.ColumnTransformer(fillna, remainder='passthrough')),
    (
        'One Hot',
        s4p.OneHotEncoder(cols_select=['Pclass', 'Sex', 'Cabin', 'SibSp', 'Embarked']),
    ),
    ('Logistic Regression', LogisticRegression()),
]

pipe = Pipeline(steps=steps_main)

# Grid Params

In [36]:
%%run_and_save blueprints/grid_params.py


# Search space, keyed as '<pipeline step name>__<parameter name>'.
# scipy's uniform(loc, scale) spans [loc, loc + scale], so C is drawn from [1, 11].
# NOTE(review): use uniform(1, 9) if [1, 10] was the intended range.
ml_params_distributions = {
    'Logistic Regression__C': uniform(1, 10),
}

# Randomized search over the full pipeline: 20 sampled candidates, all cores,
# best candidate refitted at the end.
rs = RandomizedSearchCV(
    pipe,
    param_distributions=ml_params_distributions,
    n_iter=20,
    n_jobs=-1,
    refit=True,
    cv=CVSplitter(5, 80),
    return_train_score=True,
    # Seed the parameter sampling so the same 20 candidates are drawn on every
    # run (same seed as the train/test split at the top of the notebook).
    random_state=42,
)

# MLflow Run

In [None]:
# Enable scikit-learn autologging before the search is fitted.
mlflow.sklearn.autolog(max_tuning_runs=3)

with mlflow.start_run(run_name='Logistic Regression') as run:
    # Tag the run with whatever get_lastcommit_infos reports for the repository.
    commit_tags = get_lastcommit_infos(git_repo)
    mlflow.set_tags(commit_tags)

    # Attach the blueprint files written by the %%run_and_save cells.
    mlflow.log_artifact('blueprints/pipeline.py', artifact_path='blueprints')
    mlflow.log_artifact('blueprints/grid_params.py', artifact_path='blueprints')

    # The search is fitted on the complete labelled dataset.
    rs.fit(X_all, y_all)

# Refit best model

In [40]:
# MLflow run whose logged estimator is reloaded below.
run_id = '50467b191ae0416b8e84b3467a17c307'

# A 'runs:/' URI is resolved through the active MLflow tracking store, so no
# machine-specific absolute path is needed. 'best_estimator' is the best
# estimator from rs; replace it with 'model' to load the other logged artifact.
best_model_uri = f'runs:/{run_id}/best_estimator'

# NOTE: loading a logged model unpickles it - only load runs you trust.
best_model = mlflow.sklearn.load_model(best_model_uri)

# Fit an unfitted copy with the same hyper-parameters on the complete labelled dataset.
best_model_allfit = clone(best_model)
best_model_allfit.fit(X_all, y_all)

2022/04/22 10:36:45 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID '336c66360d254f87b6c2dcc6bea5839d', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current sklearn workflow
                  Float__std_band=3.0,
                  transformers=[('Float', ClipOutliers(std_band=3.0),
                                 <sklearn.compose._column_transformer.make...`
                  Most Frequent__strategy='most_frequent',
                  transformers=[('Mean', SimpleImputer(strategy='mean'),
                                 <sklearn.compose._column_transformer.make...`


Pipeline(steps=[('Format Cabins', Cabin()),
                ('Set working columns',
                 SetupFeatures(cols_ignore=['PassengerId', 'Name', 'Ticket',
                                            'Parch'])),
                ('Convert Types',
                 AsTypes(coltypes_overwrite={'Age': 'float64'})),
                ('Clip Outliers',
                 ColumnTransformer(Float__cols_ignore=[], Float__cols_select=[],
                                   Float__std_band=3.0,
                                   transformers=[('Float',
                                                  ClipOutliers(std_band=3.0),
                                                  <sklearn....
                                                  <sklearn.compose._column_transformer.make_column_selector object at 0x7fb8c801f490>),
                                                 ('Most Frequent',
                                                  SimpleImputer(strategy='most_frequent'),
                  

# Submit model predictions

In [42]:
# The submission file is stored per git branch and named after the MLflow run id.
branch_name = get_lastcommit_infos(git_repo)['Branch']
submission_path = f'submissions/{branch_name}/{run_id}.csv'

# Predict on the submission set with the refitted model.
predictions = best_model_allfit.predict(raw_sub_df)

submit(submission_path, raw_sub_df.PassengerId, predictions)

Do you want to create <submissions/logistic_regression> ? (Y or N)


 Y


# Code tests