### Step 2 - Add new steps and evaluate

In [1]:
import pandas as pd
import numpy as np
from mlpl import fe, prep, models, vis, utils, encodingpred
from mlpl.pipetools import pipe, pmodels, pdefaults, steps, putils
from sklearn.model_selection import KFold
import sklearn
import os
from hyperopt import hp
import warnings
from sklearn.linear_model import LogisticRegression

warnings.filterwarnings('ignore')
pd.set_option('precision', 3)
pd.set_option('float_format', '{:.3f}'.format)
np.random.seed(42)
#label_name = 'Survived'

example_sub_path = 'data/gender_submission.csv'
sub = pd.read_csv(example_sub_path)

### Load Pipeline

In [2]:
lr_pipeline = pipe.Pipeline(project_path = 'lr_pipeline')

A project exists on this path, loading...
Load cols_to_drop from: BASELINE_colstodrop
baseline model found.
loading model params
Project loaded from lr_pipeline


### Try New Steps

### Generalizable FE/Preprocessing
- Click on feature names to see steps

- A number of preprocessing steps are tried on columns. <br>
Right now, they mostly consist of alternative imputations to baseline.<br>

- I already coded some automated FE methods, but they are not in default steps yet.

- Note that some steps are considered useless despite they improve the<br>
cv metric. Steps that increase the cv less than **'useful_limit'** will not be applied.<br>
This parameter was set in baseline stage.

- The reason for that is some steps will overfit to the cv and you should keep only<br>
the ones that result in a significant improvement. This is especially the case for projects<br>
with a small dataset.

In [3]:
steps.try_default_nominal_steps(lr_pipeline,
                                ohe = True,
                                group_outliers = True,
                                ohe_max_unique = 5000)

In [4]:
# Not better than the baseline
steps.try_default_numeric_steps(lr_pipeline,
                                ohe = True,
                                binning = True)

#### Baseline score: 0.76555
#### Leaderboard score after default steps: 0.78468

### Custom FE

#### Add title

In [5]:
# Extract title from Name

def add_title(feature_properties, train, test, label_name):
    # From: https://www.kaggle.com/kpacocha/top-5-titanic-machine-learning-from-disaster
    def fe_title(df, col):
        title_col = df.Name.str.extract(' ([A-Za-z]+)\.', expand=False)
        title_col = np.where((title_col=='Capt') | (title_col=='Countess')
                           | (title_col=='Don') | (title_col=='Dona')
                           | (title_col=='Jonkheer') | (title_col=='Lady')
                           | (title_col=='Sir') | (title_col=='Major')
                           | (title_col=='Rev') | (title_col=='Col'),
                           'Other',title_col)
        
        title_col = pd.Series(title_col)
        title_col = title_col.replace('Ms','Miss')
        title_col = title_col.replace('Mlle','Miss')
        title_col = title_col.replace('Mme','Mrs')
        return title_col
    
    # utils.utilize is a python decorator that transforms a function from:
    # - takes dataframe, column name as input, returns pd.Series
    # to:
    # - takes multiple dataframes, can return a pd.Series, can add new column to
    #   dataframes with a new name or replaces the original.
    #   This behavior is controlled by 'mode' argument.
    #  mode:
    #  - 'add': add resulting column to the dataframe with a generated name
    #  - 'replace': replace original column. 
    #  - 'return' : return pd.Series for each df.
    #  
    # utilize also has join_dfs argument (default=True)
    # if join_dfs = True, operation is carried out after concatenating the column
    # from dataframes.
    
    # Process name, append result to train and test.
    utils.utilize(mode = 'add')(fe_title)([train, test], 'Name')
    
    # This is the name of the added column.
    # Names are generated by utilize using this template:
    #     '{function_name}_{col}'
    #
    # (This is if col is a single string. It can be a list)
    
    new_name = 'fe_title_Name'
    
    # Label encode new column and replace it.
    utils.utilize(mode = 'replace')(prep.label_encode)([train, test], new_name)
    
    # One hot encode new column
    train, test = prep.one_hot_encode([train, test], col = new_name, sparse = True)
    return [], train, test

res = lr_pipeline.add_step_apply_if_useful(proc = add_title)

HBox(children=(IntProgress(value=0, max=25), HTML(value='')))




#### Add ticket prefix

In [6]:
def add_prefix(feature_properties, train, test, label_name, col_name):
    def prefix(df, col):
        def get_prefix(x):
            x = str(x)
            if len(x) == 1:
                return x
            else:
                return x.split(' ')[0][0]
        return df[col].apply(lambda x: get_prefix(x))
    
    utils.utilize(mode = 'add')(prefix)([train, test], col_name)
    new_name = f'prefix_{col_name}'
    utils.utilize(mode = 'replace')(prep.label_encode)([train, test], new_name)
    train, test = prep.one_hot_encode([train, test],
                                      col = new_name,
                                      mode = 'replace')
    return [], train, test

def add_prefix_group_outliers(
        feature_properties, train, test,
        label_name, col_name, limit = 10):
    @utils.utilize(mode = 'add')
    def prefix(df, col):
        def get_prefix(x):
            x = str(x)
            if len(x) == 1:
                return x
            else:
                return x.split(' ')[0][0]
        return df[col].apply(lambda x: get_prefix(x))

    prefix([train, test], col_name)
    new_name = f'prefix_{col_name}'
    utils.utilize(mode = 'replace')(prep.label_encode)([train, test], new_name)
    prep.group_outliers_replace([train, test], new_name, limit = limit)
    train, test = prep.one_hot_encode([train, test],
                                      col = new_name,
                                      mode = 'add')
    
    # Don't drop the original column, but don't use it in training
    return [col_name], train, test

lr_pipeline.add_step(proc = add_prefix,
                     group = 'prefix_ticket',
                     proc_params= {'col_name': 'Ticket'})

lr_pipeline.add_step(proc = add_prefix_group_outliers,
                     group = 'prefix_ticket',
                     proc_params= {'col_name': 'Ticket'})

res = lr_pipeline.group_apply_useful('prefix_ticket')

#### Add ticket integer part

In [7]:
@utils.utilize(mode = 'add')
def add_integer(df, col):
    def get_int(x):
        try:
            return int(x.split(' ')[-1])
        except:
            return -9000
    return df[col].apply(lambda x: get_int(x))

def ticket_integer(feature_properties, train, test, label_name):
    add_integer([train, test], 'Ticket')
    return [], train, test

def bin_ticket_integer(feature_properties, train, test, label_name, bins = 10):
    add_integer([train, test], 'Ticket')
    new_name = 'add_integer_Ticket'
    
    prep.full_binning([train, test], new_name, bins = bins)
    train, test = prep.one_hot_encode([train, test],
                                      new_name,
                                      mode = 'replace')
    return [], train, test

def auto_bin_ticket_integer(feature_properties, train, test, label_name): 
    add_integer([train, test], 'Ticket')
    new_name = 'add_integer_Ticket'
    
    print(train[new_name].dtype)
    prep.auto_bin_cols([train, test], new_name)
    train, test = prep.one_hot_encode([train, test],
                                      new_name,
                                      mode = 'replace')
    return [], train, test

lr_pipeline.add_step(proc = ticket_integer,
                     group = 'ticket_integer')

lr_pipeline.add_step(proc = bin_ticket_integer,
                     group = 'ticket_integer',
                     proc_params = {'bins': 10})

lr_pipeline.add_step(proc = auto_bin_ticket_integer,
                     group = 'ticket_integer')

res = lr_pipeline.group_apply_useful('ticket_integer')

### Train models with bayesian search

### LR

In [8]:
# When model is not specified, it is the baseline model
lr_pipeline.add_model('lr')
res = lr_pipeline.run_model('lr',
                            hyperparam_search = False,
                            return_pred = True,
                            use_final_params = True)

### SVC

In [9]:
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

fixed_hparams = dict(model = SVC,
                     probability = True,
                     random_state = 42,
                     score = accuracy_score,
                     max_iter = 2000,
                     folds=[KFold(n_splits= 5, shuffle = True, random_state = 42),
                              KFold(n_splits= 5, shuffle = True, random_state = 13),
                              KFold(n_splits= 5, shuffle = True, random_state = 100)
                              ])


search_hparams = dict(C = hp.loguniform('C', -3, 7),
                      gamma = hp.loguniform('gamma', -3, 3),
                      class_weight =  hp.choice('class_weight', ['balanced', None]),
                      kernel = hp.choice('kernel', ['linear', 'rbf', 'poly'])
                      )

lr_pipeline.add_model('svc',
                      model = pmodels.train_sklearn_pipeline,
                      fixed_hparams = fixed_hparams,
                      search_hparams = search_hparams)

res = lr_pipeline.run_model('svc', return_pred = True, hyperparam_search = True)

### Random Forest

In [10]:
from sklearn.ensemble import RandomForestClassifier

fixed_hparams = dict(model = RandomForestClassifier,
                     folds = lr_pipeline.baseline_step['model_params']['folds'],
                     random_state = 42,
                     score = accuracy_score)


search_hparams = dict(min_samples_split = hp.uniform('min_samples_split', 0.05, 1.0),
                      min_samples_leaf = hp.uniform('min_samples_leaf', 0.05, 0.5),
                      class_weight =  hp.choice('class_weight', ['balanced', None]))

lr_pipeline.add_model('rf',
                      model = pmodels.train_sklearn_pipeline,
                      fixed_hparams = fixed_hparams,
                      search_hparams = search_hparams)

res = lr_pipeline.run_model('rf', hyperparam_search = True)

### KNeighbors

In [11]:
from sklearn.neighbors import KNeighborsClassifier

fixed_hparams = dict(model = KNeighborsClassifier,
                     folds = lr_pipeline.baseline_step['model_params']['folds'],
                     score = accuracy_score)


search_hparams = dict(n_neighbors  = hp.choice('n_neighbors', np.arange(4,25)),
                      leaf_size = hp.choice('leaf_size', np.arange(15,50)))

lr_pipeline.add_model('kn',
                      model = pmodels.train_sklearn_pipeline,
                      fixed_hparams = fixed_hparams,
                      search_hparams = search_hparams)

res = lr_pipeline.run_model('kn', hyperparam_search = True)

#### Create submission

In [16]:
# Convert to (1,0) from probabilities
test_preds = (res['test_preds'] > 0.7).astype('int')

# Print mean to adjust threshold
print(test_preds.mean())

# Save submission
sub = pd.read_csv(r'data/gender_submission.csv')
to_sub = sub.copy()
to_sub[lr_pipeline.label_name] = test_preds
to_sub.to_csv('titanic_sub.csv', index = False)

KeyError: 'test_preds'

### Blending

In [19]:
res = putils.blend_from_csv(directory = lr_pipeline.test_preds_path)

# Convert to (1,0) from probabilities
test_preds = (res > 0.655).astype('int')

# Print mean to adjust threshold
print(test_preds.mean())

# Save submission
sub = pd.read_csv(r'data/gender_submission.csv')
to_sub = sub.copy()
to_sub[lr_pipeline.label_name] = test_preds
to_sub.to_csv('titanic_sub.csv', index = False)

0.3827751196172249
