#### Step 1 - Train a baseline model and test

In [1]:
from mlpl import fe, prep, models, vis, utils
from mlpl.pipetools import pipe, pmodels, pdefaults, dt

import pandas as pd
import numpy as np
from sklearn.model_selection import KFold
import sklearn
import os
from hyperopt import hp
import warnings
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Silence library warnings so the notebook output stays readable.
warnings.filterwarnings('ignore')
# Use fully-qualified option names. The bare 'precision' pattern is ambiguous on
# pandas versions that also define 'styler.format.precision', where it raises
# OptionError ("Pattern matched multiple keys").
pd.set_option('display.precision', 3)
pd.set_option('display.float_format', '{:.3f}'.format)
# Seed NumPy's global RNG up front; reading the CSV below does not consume randomness.
np.random.seed(42)

# ---- Load data ----
# Example submission file; it is copied later as the template for our own submission.
example_sub_path = 'data/gender_submission.csv'
sub = pd.read_csv(example_sub_path)

### Create Pipeline

In [2]:
# Target column and locations of the raw train/test CSV files.
label_name = 'Survived'
trn_path = 'data/train.csv'
test_path = 'data/test.csv'

# Collect every pipeline setting in one mapping so the configuration can be
# read (and tweaked) at a glance before the Pipeline is constructed.
pipeline_settings = dict(
    label_name=label_name,
    overwrite=True,
    project_path='lr_pipeline',
    train_data_path=trn_path,
    test_data_path=test_path,
    minimize_metric=False,
    useful_limit=0.001,
    line_search_iter=1,
    n_random_seeds=1,
    bayesian_search_iter=50,
    bayesian_search_count=1,
    final_bayesian_search_iter=0,
    line_search_patience=2,
    line_search_params={'C': (1e-7, 1e3)},
)

# The Pipeline object keeps track of processed files, model metrics and experiments.
lr_pipeline = pipe.Pipeline(**pipeline_settings)

No project found, creating...
random_seed_count: 1
line_search_param_count: 1
line_search_iteration_count: 1

Model will run at least 2 times for each step.
Formula is: 2 * line_search_iteration_count * random_seed_count * line_search_param_count
Loading data...
cols_to_drop does not exist.
Saving baseline function
No baseline model was found.
No model_params was found.
Saved project to lr_pipeline


### Add baseline step

In [3]:
# Parameters handed unchanged to the baseline step (not searched over).
# Three independently shuffled 5-fold splits are used; only the shuffle seed differs.
fixed_params_lr = dict(score=accuracy_score,
                       model=sklearn.linear_model.LogisticRegression,
                       max_iter=5000,
                       verbose=0,
                       n_jobs=3,
                       model_type='linear',
                       folds=[KFold(n_splits=5, shuffle=True, random_state=seed)
                              for seed in (42, 13, 100)])


# Hyperopt search space for the tunable LogisticRegression parameters.
# Each hyperopt label is kept identical to its parameter key; the original
# 'solver ' label carried a trailing space and did not match the `solver` key.
#
# NOTE(review): hp.loguniform takes the bounds of log(C), so this samples C in
# roughly [e^-7, e^3] ~= [9.1e-4, 20.1]. That is much narrower than the
# (1e-7, 1e3) range given for C in line_search_params -- confirm which range is
# intended (np.log(1e-7), np.log(1e3) would match the latter).
lr_search_space = dict(C=hp.loguniform('C', -7, 3),
                       class_weight=hp.choice('class_weight', ['balanced', None]),
                       solver=hp.choice('solver', ['lbfgs', 'sag']),
                       )

# Register the baseline: default sklearn preprocessing followed by the
# sklearn training routine, using the fixed and searchable parameters above.
lr_pipeline.set_baseline_step(model=pmodels.train_sklearn_pipeline,
                              proc=pdefaults.default_sklearn_preprocess,
                              search_model_params=lr_search_space,
                              fixed_model_params=fixed_params_lr
                              )


### Run baseline and get test predictions

In [4]:
# Run the baseline step and keep the returned result; its 'test_preds'
# entry is used further down to build the submission file.
res = lr_pipeline.run_baseline(return_result = True)

HBox(children=(IntProgress(value=0, max=19), HTML(value='')))




#### Create submission

In [5]:
# The competition requires hard 0/1 labels, while res['test_preds'] holds
# probabilities, so a decision threshold has to be applied. The threshold is
# picked so that the mean of the thresholded predictions is close to the mean
# of the label in the training set.
# This step is not necessary in most projects.
decision_threshold = 0.55
is_positive = res['test_preds'] > decision_threshold
test_preds = is_positive.astype('int')

# Prepare the submission file from a copy of the example submission,
# so the original `sub` frame is left untouched.
to_sub = sub.copy()
to_sub[label_name] = test_preds
to_sub.to_csv('titanic_sub.csv', index = False)

# Baseline LB score: 0.76555

# Fraction of test rows predicted as 1 (shown as the cell output).
test_preds.mean()

0.3827751196172249

#### Save project for following stages

In [6]:
# Save the project (the output below reports it is written to 'lr_pipeline')
# so that the following stages can pick it up.
lr_pipeline.save_project()

Saving baseline function
Saved project to lr_pipeline
