## Imports

In [2]:
import pandas as pd
import numpy as np
from datetime import datetime
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score
from sklearn.svm import SVC
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import GridSearchCV
from catboost import CatBoostClassifier
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold
from sklearn.model_selection import RandomizedSearchCV

## Timer

In [3]:
def timer(start_time=None):
    if not start_time:
        start_time = datetime.now()
        return start_time
    elif start_time:
        thour, temp_sec = divmod((datetime.now() - start_time).total_seconds(), 3600)
        tmin, tsec = divmod(temp_sec, 60)
        print('\n Time taken: %i hours %i minutes and %s seconds.' % (thour, tmin, round(tsec, 2)))

## setup from previous notebook

In [4]:
TRAIN_PATH = '../data/train.csv'
TEST_PATH = '../data/test.csv'
N_SPLITS = 5
RANDOM_STATE = 42


def extracting_title_age_imputing(data_df, train_df, test_df):
    '''
    Function to extract title and impute age

    Parameters:
        dataframe (pandas.DataFrame): DataFrame on which to operate
        dataframe (pandas.DataFrame): DataFrame on which to train
        dataframe (pandas.DataFrame): DataFrame on which to test
    '''
    data_df['Title'] = data_df['Name']

    for name_string in data_df['Name']:
        data_df['Title'] = data_df['Name'].str.extract('([A-Za-z]+)\.', expand=True)

    mapping = {'Mlle': 'Miss', 'Major': 'Mr', 'Col':
                'Mr', 'Sir': 'Mr', 'Don': 'Mr', 'Mme': 'Miss',
                'Jonkheer': 'Mr', 'Lady': 'Mrs', 'Capt':
                'Mr', 'Countess': 'Mrs', 'Ms': 'Miss', 'Dona': 'Mrs'}
    data_df.replace({'Title': mapping}, inplace=True)

    titles = ['Dr', 'Master', 'Miss', 'Mr', 'Mrs', 'Rev']
    for title in titles:
        age_to_impute = data_df.groupby('Title')['Age'].median()[titles.index(title)]
        data_df.loc[(data_df['Age'].isnull()) & (data_df['Title'] == title), 'Age'] = age_to_impute

    train_df['Age'] = data_df['Age'][:891]
    test_df['Age'] = data_df['Age'][891:]
    data_df.drop('Title', axis = 1, inplace = True)

def family_size(data_df, train_df, test_df):
    '''
    Function to combine Parch and Sibsp columns to get family size on board

    Parameters:
        dataframe (pandas.DataFrame): DataFrame on which to operate
        dataframe (pandas.DataFrame): DataFrame on which to train
        dataframe (pandas.DataFrame): DataFrame on which to test
    '''
    data_df['Family_Size'] = data_df['Parch'] + data_df['SibSp']
    train_df['Family_Size'] = data_df['Family_Size'][:891]
    test_df['Family_Size'] = data_df['Family_Size'][891:]

def spliting_name(data_df):
    '''
    Function to split name column to name and last name

    Parameters:
        dataframe (pandas.DataFrame): DataFrame on which to operate
    '''
    data_df['Last_Name'] = data_df['Name'].apply(lambda x: str.split(x, ",")[0])
    
def imputing_fare(data_df):
    '''
    Function to impute fare column

    Parameters:
        dataframe (pandas.DataFrame): DataFrame on which to operate
    '''
    data_df['Fare'].fillna(data_df['Fare'].mean(), inplace=True)


def family_survival(data_df, train_df, test_df):
    '''
    Function to get family survival rate

    Parameters:
        dataframe (pandas.DataFrame): DataFrame on which to operate
        dataframe (pandas.DataFrame): DataFrame on which to train
        dataframe (pandas.DataFrame): DataFrame on which to test
    '''
    DEFAULT_SURVIVAL_VALUE = 0.5
    data_df['Family_Survival'] = DEFAULT_SURVIVAL_VALUE

    for grp, grp_df in data_df[['Survived','Name', 'Last_Name', 'Fare', 'Ticket', 'PassengerId',
                            'SibSp', 'Parch', 'Age', 'Cabin']].groupby(['Last_Name', 'Fare']):

        if (len(grp_df) != 1):
            # A Family group is found.
            for ind, row in grp_df.iterrows():
                smax = grp_df.drop(ind)['Survived'].max()
                smin = grp_df.drop(ind)['Survived'].min()
                passID = row['PassengerId']
                if (smax == 1.0):
                    data_df.loc[data_df['PassengerId'] == passID, 'Family_Survival'] = 1
                elif (smin==0.0):
                    data_df.loc[data_df['PassengerId'] == passID, 'Family_Survival'] = 0
    for _, grp_df in data_df.groupby('Ticket'):
        if (len(grp_df) != 1):
            for ind, row in grp_df.iterrows():
                if (row['Family_Survival'] == 0) | (row['Family_Survival']== 0.5):
                    smax = grp_df.drop(ind)['Survived'].max()
                    smin = grp_df.drop(ind)['Survived'].min()
                    passID = row['PassengerId']
                    if (smax == 1.0):
                        data_df.loc[data_df['PassengerId'] == passID, 'Family_Survival'] = 1
                    elif (smin==0.0):
                        data_df.loc[data_df['PassengerId'] == passID, 'Family_Survival'] = 0

    # # Family_Survival in TRAIN_DF and TEST_DF:
    train_df['Family_Survival'] = data_df['Family_Survival'][:891]
    test_df['Family_Survival'] = data_df['Family_Survival'][891:]

def fare_bin(data_df, train_df, test_df):
    '''
    Function to get fare bin and encode it

    Parameters:
        dataframe (pandas.DataFrame): DataFrame on which to operate
        dataframe (pandas.DataFrame): DataFrame on which to train
        dataframe (pandas.DataFrame): DataFrame on which to test
    '''
    data_df['Fare'].fillna(data_df['Fare'].median(), inplace = True)
    data_df['FareBin'] = pd.qcut(data_df['Fare'], 5)

    label = LabelEncoder()
    data_df['FareBin_Code'] = label.fit_transform(data_df['FareBin'])

    train_df['FareBin_Code'] = data_df['FareBin_Code'][:891]
    test_df['FareBin_Code'] = data_df['FareBin_Code'][891:]

    train_df.drop(['Fare'], axis=1, inplace=True)
    test_df.drop(['Fare'], axis=1, inplace=True)

def age_bin(data_df, train_df, test_df):
    '''
    Function to get age bin and encode it

    Parameters:
        dataframe (pandas.DataFrame): DataFrame on which to operate
        dataframe (pandas.DataFrame): DataFrame on which to train
        dataframe (pandas.DataFrame): DataFrame on which to test
    '''
    data_df['AgeBin'] = pd.qcut(data_df['Age'], 4)

    label = LabelEncoder()
    data_df['AgeBin_Code'] = label.fit_transform(data_df['AgeBin'])

    train_df['AgeBin_Code'] = data_df['AgeBin_Code'][:891]
    test_df['AgeBin_Code'] = data_df['AgeBin_Code'][891:]

    train_df.drop(['Age'], axis=1, inplace=True)
    test_df.drop(['Age'], axis=1, inplace=True)

def replacing_sex(train_df, test_df):
    '''    
    Function to replace sex to male or female

    Parameters:
        dataframe (pandas.DataFrame): DataFrame on which to train
        dataframe (pandas.DataFrame): DataFrame on which to test
    '''
    train_df['Sex'].replace(['male','female'],[0,1],inplace=True)
    test_df['Sex'].replace(['male','female'],[0,1],inplace=True)

def aplying_preprocesing(data_df,train_df, test_df):
    '''
    Function to apply preprocesing and drop columns

    Parameters:
        dataframe (pandas.DataFrame): DataFrame on which to operate
        dataframe (pandas.DataFrame): DataFrame on which to train
        dataframe (pandas.DataFrame): DataFrame on which to test
    '''
    extracting_title_age_imputing(data_df, train_df, test_df)
    family_size(data_df, train_df, test_df)
    spliting_name(data_df)
    imputing_fare(data_df)
    family_survival(data_df, train_df, test_df)
    fare_bin(data_df, train_df, test_df)
    age_bin(data_df, train_df, test_df)
    replacing_sex(train_df, test_df)
    train_df.drop(['Name', 'PassengerId', 'SibSp', 'Parch', 'Ticket', 'Cabin',
               'Embarked'], axis = 1, inplace = True)
    test_df.drop(['Name','PassengerId', 'SibSp', 'Parch', 'Ticket', 'Cabin',
              'Embarked'], axis = 1, inplace = True)

## XGBoost

In [3]:
train_df = pd.read_csv(TRAIN_PATH)
test_df = pd.read_csv(TEST_PATH)
data_df = pd.concat([train_df, test_df])

aplying_preprocesing(data_df, train_df, test_df)

X = train_df.drop('Survived', axis=1)
y = train_df['Survived']

std_scaler = StandardScaler()
X = std_scaler.fit_transform(X)

In [54]:
parameters = {
'max_depth' : [3, 5, 10, 15],
'subsample' : [0.1, 0.3, 0.5, 0.8, 1],
'n_estimators' : [100,250, 500,750, 1000],
'learning_rate' : [0.02, 0.05,0.1]
}

In [55]:
folds = 5
param_comb = 10

kf = KFold(n_splits = folds, shuffle = True, random_state = 42)
xgb = XGBClassifier()
random_search = GridSearchCV(xgb, param_grid=parameters, scoring='roc_auc', cv = kf.split(X, y), verbose = 3)

start_time = timer(None) 
random_search.fit(X, y)
timer(start_time)

Fitting 5 folds for each of 300 candidates, totalling 1500 fits
[CV 1/5] END learning_rate=0.02, max_depth=3, n_estimators=100, subsample=0.1;, score=0.909 total time=   0.3s
[CV 2/5] END learning_rate=0.02, max_depth=3, n_estimators=100, subsample=0.1;, score=0.840 total time=   0.2s
[CV 3/5] END learning_rate=0.02, max_depth=3, n_estimators=100, subsample=0.1;, score=0.914 total time=   0.3s
[CV 4/5] END learning_rate=0.02, max_depth=3, n_estimators=100, subsample=0.1;, score=0.815 total time=   0.3s
[CV 5/5] END learning_rate=0.02, max_depth=3, n_estimators=100, subsample=0.1;, score=0.887 total time=   0.2s


KeyboardInterrupt: 

In [23]:
#print('\n All results:')
#print(random_search.cv_results_)
print('\n Best estimator:')
print(random_search.best_estimator_)
#print('\n Best normalized gini score for %d-fold search with %d parameter combinations:' % (folds, param_comb))
#print(random_search.best_score_ * 2 - 1)
print('\n Best hyperparameters:')
print(random_search.best_params_)
#results = pd.DataFrame(random_search.cv_results_)
#results.to_csv('xgb-random-grid-search-results-01.csv', index=False)


 Best estimator:
XGBClassifier(base_score=0.5, booster='gbtree', callbacks=None,
              colsample_bylevel=1, colsample_bynode=1, colsample_bytree=1,
              early_stopping_rounds=None, enable_categorical=False,
              eval_metric=None, gamma=0, gpu_id=-1, grow_policy='depthwise',
              importance_type=None, interaction_constraints='',
              learning_rate=0.05, max_bin=256, max_cat_to_onehot=4,
              max_delta_step=0, max_depth=10, max_leaves=0, min_child_weight=1,
              missing=nan, monotone_constraints='()', n_estimators=250,
              n_jobs=0, num_parallel_tree=1, predictor='auto', random_state=0,
              reg_alpha=0, reg_lambda=1, ...)

 Best hyperparameters:
{'learning_rate': 0.05, 'max_depth': 10, 'n_estimators': 250, 'subsample': 0.1}


### XGB test

In [36]:
train_df = pd.read_csv(TRAIN_PATH)
test_df = pd.read_csv(TEST_PATH)
data_df = pd.concat([train_df, test_df])

aplying_preprocesing(data_df, train_df, test_df)

X = train_df.drop('Survived', axis=1)
y = train_df['Survived']
X_test_kaggle = test_df.copy()

std_scaler = StandardScaler()
X = std_scaler.fit_transform(X)
X_test_kaggle = std_scaler.transform(X_test_kaggle)

k_fold = KFold(
    n_splits=N_SPLITS,
    shuffle=True,
    random_state=RANDOM_STATE
)

scores = []

for train_index, test_index in k_fold.split(X):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y.loc[train_index], y.loc[test_index]

    xgb = XGBClassifier(learning_rate=0.05, max_depth=10, n_estimators=250, subsample=0.1)
    xgb.fit(X_train, y_train)
    y_predict = xgb.predict(X_test)

    acc_score = round(accuracy_score(y_test, y_predict),3)

    print(acc_score)

    scores.append(acc_score)

print()
print("Average:", round(100*np.mean(scores), 1), "%")
print("Std:", round(100*np.std(scores), 1), "%")

0.877
0.826
0.888
0.815
0.843

Average: 85.0 %
Std: 2.8 %


## CatBoost

In [20]:
train_df = pd.read_csv(TRAIN_PATH)
test_df = pd.read_csv(TEST_PATH)
data_df = pd.concat([train_df, test_df])

aplying_preprocesing(data_df, train_df, test_df)

X = train_df.drop('Survived', axis=1)
y = train_df['Survived']

std_scaler = StandardScaler()
X = std_scaler.fit_transform(X)

In [56]:
parameters = {
'iterations' : [50,100,250,500, 750, 1000],
'depth' : [1, 3, 5,6, 7,8, 9, 10],
'learning_rate' : [0.01,0.025,0.05, 0.1, 0.3],
'l2_leaf_reg' : [1, 3, 5, 10, 100],
'random_strength' : [10,20,50,42,1512],
'bagging_temperature' : [0, 0.1, 0.5,1,10,100],
'logging_level' : ['Silent']
}

In [None]:
folds = 5
param_comb = 100

kf = KFold(n_splits = folds, shuffle = True, random_state = 42)
cbc = CatBoostClassifier()
random_search = RandomizedSearchCV(cbc, param_distributions=parameters, scoring='roc_auc', cv = kf.split(X, y), verbose = 3, n_iter=param_comb)

start_time = timer(None) 
random_search.fit(X, y)
timer(start_time)

Fitting 5 folds for each of 100 candidates, totalling 500 fits
[CV 1/5] END bagging_temperature=100, depth=8, iterations=50, l2_leaf_reg=100, learning_rate=0.025, logging_level=Silent, random_strength=1512;, score=0.887 total time=   0.3s
[CV 2/5] END bagging_temperature=100, depth=8, iterations=50, l2_leaf_reg=100, learning_rate=0.025, logging_level=Silent, random_strength=1512;, score=0.811 total time=   0.2s
[CV 3/5] END bagging_temperature=100, depth=8, iterations=50, l2_leaf_reg=100, learning_rate=0.025, logging_level=Silent, random_strength=1512;, score=0.904 total time=   0.2s
[CV 4/5] END bagging_temperature=100, depth=8, iterations=50, l2_leaf_reg=100, learning_rate=0.025, logging_level=Silent, random_strength=1512;, score=0.800 total time=   0.2s
[CV 5/5] END bagging_temperature=100, depth=8, iterations=50, l2_leaf_reg=100, learning_rate=0.025, logging_level=Silent, random_strength=1512;, score=0.881 total time=   0.2s
[CV 1/5] END bagging_temperature=100, depth=7, iterations

In [59]:
#print('\n All results:')
#print(random_search.cv_results_)
print('\n Best estimator:')
print(random_search.best_estimator_)
#print('\n Best normalized gini score for %d-fold search with %d parameter combinations:' % (folds, param_comb))
#print(random_search.best_score_ * 2 - 1)
print('\n Best hyperparameters:')
print(random_search.best_params_)
#results = pd.DataFrame(random_search.cv_results_)
#results.to_csv('xgb-random-grid-search-results-01.csv', index=False)


 Best estimator:
<catboost.core.CatBoostClassifier object at 0x7f2bd9553ac0>

 Best hyperparameters:
{'random_strength': 42, 'logging_level': 'Silent', 'learning_rate': 0.01, 'l2_leaf_reg': 1, 'iterations': 250, 'depth': 7, 'bagging_temperature': 0.1}


### CatBoost test

In [60]:
train_df = pd.read_csv(TRAIN_PATH)
test_df = pd.read_csv(TEST_PATH)
data_df = pd.concat([train_df, test_df])

aplying_preprocesing(data_df, train_df, test_df)

X = train_df.drop('Survived', axis=1)
y = train_df['Survived']
X_test_kaggle = test_df.copy()

std_scaler = StandardScaler()
X = std_scaler.fit_transform(X)
X_test_kaggle = std_scaler.transform(X_test_kaggle)

k_fold = KFold(
    n_splits=N_SPLITS,
    shuffle=True,
    random_state=RANDOM_STATE
)

scores = []

for train_index, test_index in k_fold.split(X):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y.loc[train_index], y.loc[test_index]

    xgb = CatBoostClassifier(learning_rate=0.01, depth=5, iterations=750,random_strength=42, logging_level= 'Silent', l2_leaf_reg=1, bagging_temperature=0 )
    xgb.fit(X_train, y_train)
    y_predict = xgb.predict(X_test)

    acc_score = round(accuracy_score(y_test, y_predict),3)

    print(acc_score)

    scores.append(acc_score)

print()
print("Average:", round(100*np.mean(scores), 1), "%")
print("Std:", round(100*np.std(scores), 1), "%")

0.855
0.82
0.888
0.809
0.854

Average: 84.5 %
Std: 2.8 %


## SVC

In [5]:
train_df = pd.read_csv(TRAIN_PATH)
test_df = pd.read_csv(TEST_PATH)
data_df = pd.concat([train_df, test_df])

aplying_preprocesing(data_df, train_df, test_df)

X = train_df.drop('Survived', axis=1)
y = train_df['Survived']

std_scaler = StandardScaler()
X = std_scaler.fit_transform(X)

In [6]:
parameters = {
'kernel' : ['linear', 'rbf', 'poly'],
'gamma' : [0.1, 0.5, 1, 5, 10, 25, 50 ,100],
'C' : [0.1, 1, 5, 10, 25, 50, 100, 250, 500, 1000],
'degree' : [0, 1, 2, 3, 4, 5, 6]
}

In [None]:
folds = 5
param_comb = 5000

kf = KFold(n_splits = folds, shuffle = True, random_state = 42)
svc = SVC()
random_search = RandomizedSearchCV(svc, param_distributions=parameters, scoring='roc_auc', cv = kf.split(X, y), verbose = 3, n_iter=param_comb)

start_time = timer(None) 
random_search.fit(X, y)
timer(start_time)



Fitting 5 folds for each of 1680 candidates, totalling 8400 fits
[CV 1/5] END C=0.1, degree=0, gamma=0.1, kernel=linear;, score=0.895 total time=   0.0s
[CV 2/5] END C=0.1, degree=0, gamma=0.1, kernel=linear;, score=0.838 total time=   0.0s
[CV 3/5] END C=0.1, degree=0, gamma=0.1, kernel=linear;, score=0.897 total time=   0.0s
[CV 4/5] END C=0.1, degree=0, gamma=0.1, kernel=linear;, score=0.802 total time=   0.0s
[CV 5/5] END C=0.1, degree=0, gamma=0.1, kernel=linear;, score=0.888 total time=   0.0s
[CV 1/5] END C=0.1, degree=0, gamma=0.1, kernel=rbf;, score=0.883 total time=   0.0s
[CV 2/5] END C=0.1, degree=0, gamma=0.1, kernel=rbf;, score=0.846 total time=   0.0s
[CV 3/5] END C=0.1, degree=0, gamma=0.1, kernel=rbf;, score=0.909 total time=   0.0s
[CV 4/5] END C=0.1, degree=0, gamma=0.1, kernel=rbf;, score=0.810 total time=   0.0s
[CV 5/5] END C=0.1, degree=0, gamma=0.1, kernel=rbf;, score=0.875 total time=   0.0s
[CV 1/5] END C=0.1, degree=0, gamma=0.1, kernel=poly;, score=0.500 tot

In [None]:
print('\n All results:')
print(random_search.cv_results_)
print('\n Best estimator:')
print(random_search.best_estimator_)
#print('\n Best normalized gini score for %d-fold search with %d parameter combinations:' % (folds, param_comb))
#print(random_search.best_score_ * 2 - 1)
print('\n Best hyperparameters:')
print(random_search.best_params_)
#results = pd.DataFrame(random_search.cv_results_)
#results.to_csv('xgb-random-grid-search-results-01.csv', index=False)

## GradientBoosting

In [None]:
train_df = pd.read_csv(TRAIN_PATH)
test_df = pd.read_csv(TEST_PATH)
data_df = pd.concat([train_df, test_df])

aplying_preprocesing(data_df, train_df, test_df)

X = train_df.drop('Survived', axis=1)
y = train_df['Survived']

std_scaler = StandardScaler()
X = std_scaler.fit_transform(X)

In [None]:
parameters = {
'min_samples_split' : [2, 5, 10, 25, 50, 100],
'max_depth' : [5,10,15,20,25,50],
'learning_rate' : [0.1, 0.01, 0.05, 0.025,0.3],
'n_estimators' : [25, 50,100, 250, 500, 1000, 2500],
'subsample' : [0.7, 0.75,0.8,0.85],
'min_samples_leaf' : [1, 5, 10, 25, 50, 100, 250]
}

In [None]:
folds = 5
param_comb = 5000

kf = KFold(n_splits = folds, shuffle = True, random_state = 42)
svc = GradientBoostingClassifier()
random_search = RandomizedSearchCV(svc, param_distributions=parameters, scoring='roc_auc', cv = kf.split(X, y), verbose = 3, n_iter=param_comb)

start_time = timer(None) 
random_search.fit(X, y)
timer(start_time)

In [None]:
print('\n All results:')
print(random_search.cv_results_)
print('\n Best estimator:')
print(random_search.best_estimator_)
#print('\n Best normalized gini score for %d-fold search with %d parameter combinations:' % (folds, param_comb))
#print(random_search.best_score_ * 2 - 1)
print('\n Best hyperparameters:')
print(random_search.best_params_)
#results = pd.DataFrame(random_search.cv_results_)
#results.to_csv('xgb-random-grid-search-results-01.csv', index=False)

## MLP

In [None]:
train_df = pd.read_csv(TRAIN_PATH)
test_df = pd.read_csv(TEST_PATH)
data_df = pd.concat([train_df, test_df])

aplying_preprocesing(data_df, train_df, test_df)

X = train_df.drop('Survived', axis=1)
y = train_df['Survived']

std_scaler = StandardScaler()
X = std_scaler.fit_transform(X)

In [None]:
parameters = {
'hidden_layer_sizes' : [(10,), (25,), (50,), (100), (250), (500,), (1000,)],
'activation' : ['identity','logistic','tanh','relu'],
'solver' : ['lbfgs', 'sgd', 'adam'],
'alpha' : [0.0001, 0.00001,0.000025,0.00005, 0.00025, 0.0005, 0.001],
'learning_rate' : ['constant', 'invscaling', 'adaptive'],
'min_samples_leaf' : [1, 5, 10, 25, 50, 100, 250],
}

In [None]:
folds = 5
param_comb = 5000

kf = KFold(n_splits = folds, shuffle = True, random_state = 42)
svc = MLPClassifier(max_iter=1500)
random_search = RandomizedSearchCV(svc, param_distributions=parameters, scoring='roc_auc', cv = kf.split(X, y), verbose = 3, n_iter=param_comb)

start_time = timer(None) 
random_search.fit(X, y)
timer(start_time)

In [None]:
print('\n All results:')
print(random_search.cv_results_)
print('\n Best estimator:')
print(random_search.best_estimator_)
#print('\n Best normalized gini score for %d-fold search with %d parameter combinations:' % (folds, param_comb))
#print(random_search.best_score_ * 2 - 1)
print('\n Best hyperparameters:')
print(random_search.best_params_)
#results = pd.DataFrame(random_search.cv_results_)
#results.to_csv('xgb-random-grid-search-results-01.csv', index=False)