# Apply learnings to Titanic

In [1]:
import os
from pathlib import Path

iskaggle = os.environ.get('KAGGLE_KERNEL_RUN_TYPE', '')
if iskaggle: 
    path = Path('../input/titanic')
else:
    path = Path('titanic')
    if not path.exists():
        import zipfile,kaggle
        kaggle.api.competition_download_cli(str(path))
        zipfile.ZipFile(f'{path}.zip').extractall(path)

In [2]:
import numpy as np, pandas as pd
from collections import Counter
import warnings

warnings.filterwarnings('ignore')
pd.options.display.max_columns = None

In [3]:
df = pd.read_csv(path/'train.csv')
print(df.shape)
df.head()

(891, 12)


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [4]:
print('Imbalanced dataset?')
print(pd.DataFrame(Counter(df['Survived']), index=['survived']).transpose().sort_index())
df.isna().sum()

Imbalanced dataset?
   survived
0       549
1       342


PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [5]:
from sklearn.preprocessing import OneHotEncoder

categorical_columns = df.columns[df.dtypes==object].tolist()
numerical_columns = df.columns[df.dtypes!=object].tolist()

print('categorical: ', categorical_columns)
print('numerical: ', numerical_columns)

categorical:  ['Name', 'Sex', 'Ticket', 'Cabin', 'Embarked']
numerical:  ['PassengerId', 'Survived', 'Pclass', 'Age', 'SibSp', 'Parch', 'Fare']


### Let's build the model

In [8]:
import re

def get_title(name):
    pattern = ',\s.*\.\s'
    match_results = re.search(pattern, name)
    result = match_results.group()[2:-2]
    if result not in ['Mr', 'Miss', 'Mrs', 'Master', 'Dr', 'Rev']:
        result = 'Other'
    return result

In [64]:
# gridsearch 
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, StratifiedKFold, RepeatedStratifiedKFold

def grid_search(params, random=False): 
    
    xgb = XGBClassifier(booster='gbtree', objective='binary:logistic', missing=-999.0,
                        random_state=2, use_label_encoder=False)
    
    kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=2)
    # kfold = RepeatedStratifiedKFold(n_splits=5, n_repeats=3, random_state=2)
    
    if random:
        grid = RandomizedSearchCV(xgb, params, scoring='accuracy', cv=kfold, n_iter=20, n_jobs=-1, random_state=2)
    else:
        grid = GridSearchCV(xgb, params, scoring='accuracy', cv=kfold, n_jobs=-1)
    
    grid.fit(X_train_transformed, y_train)
    best_params = grid.best_params_
    print("Best params:", best_params)
    best_score = grid.best_score_
    print("Best score: {:.5f}".format(best_score))

    print(xgb)

    xgb = XGBClassifier(booster='gbtree', objective='binary:logistic', 
                        random_state=2, use_label_encoder=False, 
                        params=best_params, missing=-999.0)
    print(xgb)
    xgb.fit(X_train_transformed, y_train)
    y_pred = xgb.predict(X_test_transformed)
    print('Test score: {0:0.8f}'.format(accuracy_score(y_pred, y_test)))

# Load data

In [10]:
df = pd.read_csv(path/'train.csv')
X = df.iloc[:, 2:]
y = df.iloc[:, 1]

In [11]:
from sklearn.base import TransformerMixin

class CustomPreprossessing(TransformerMixin):
    def __init__(self):
        None
    def fit(self, X, y=None):
        return self
    def transform(self, X, y=None):
        # my custom code
        X['title'] = X.apply(lambda x: get_title(x['Name']), axis=1)
        X['Cabin'].fillna('Empty', inplace=True)
        X['cab'] = [i if i in ['A', 'B', 'C', 'D', 'E', 'F'] else 'Other' for i in X['Cabin'].str[:1]]
        X = X.loc[:, ~X.columns.isin(['Name', 'Ticket', 'Cabin'])]
        return X      

In [12]:
from sklearn.base import TransformerMixin 

class NullValueImputer(TransformerMixin):
    def __init__(self):
        None
    def fit(self, X, y=None):
        return self
    def transform(self, X, y=None):
        for column in X.columns.tolist():
            if column in X.columns[X.dtypes==object].tolist():
                X[column] = X[column].fillna(X[column].mode()[0])     # omd: should I change to: .mode()[0] ?
            else:
                X[column]=X[column].fillna(-999.0)
        return X

In [13]:
from scipy.sparse import csr_matrix
from scipy.sparse import hstack

class SparseMatrix(TransformerMixin):
    def __init__(self):
        None
    def fit(self, X, y=None):
        return self
    def transform(self, X, y=None):
        categorical_columns= X.columns[X.dtypes==object].tolist()
        ohe = OneHotEncoder(handle_unknown='ignore')               # omd: may need to change later
        # ohe = OneHotEncoder() 
        hot = ohe.fit_transform(X[categorical_columns])
        cold_df = X.select_dtypes(exclude=["object"])
        cold = csr_matrix(cold_df)
        final_sparse_matrix = hstack((hot, cold))
        final_csr_matrix = final_sparse_matrix.tocsr()
        return final_csr_matrix

In [14]:
from sklearn.model_selection import cross_val_score, KFold, train_test_split
from xgboost import XGBClassifier
from sklearn.pipeline import Pipeline

In [40]:
from sklearn.pipeline import Pipeline

kfold = KFold(n_splits=5, shuffle=True, random_state=2)
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=2)

data_pipeline = Pipeline([('custom_prepross', CustomPreprossessing()), ('null_imputer', NullValueImputer()), ('sparse', SparseMatrix())])
X_train_transformed = data_pipeline.fit_transform(X_train)
X_test_transformed = data_pipeline.fit_transform(X_test)

In [16]:
def cross_val(model):
    # roc_scores = cross_val_score(model, X_train_transformed, y_train, scoring='roc_auc', cv=kfold)
    # roc = (roc_scores.mean())
    accuracy_scores = cross_val_score(model, X_train_transformed, y_train, scoring='accuracy', cv=kfold)
    acc = (accuracy_scores.mean())
    print(model)
    
    return acc

# Baseline

In [17]:
cross_val(XGBClassifier(missing=-999.0))

XGBClassifier(base_score=None, booster=None, colsample_bylevel=None,
              colsample_bynode=None, colsample_bytree=None,
              enable_categorical=False, gamma=None, gpu_id=None,
              importance_type=None, interaction_constraints=None,
              learning_rate=None, max_delta_step=None, max_depth=None,
              min_child_weight=None, missing=-999.0, monotone_constraints=None,
              n_estimators=100, n_jobs=None, num_parallel_tree=None,
              predictor=None, random_state=None, reg_alpha=None,
              reg_lambda=None, scale_pos_weight=None, subsample=None,
              tree_method=None, validate_parameters=None, verbosity=None)


0.7918303220738413

In [89]:
# booster='gblinear'
full_pipeline = Pipeline([('custom_prepross', CustomPreprossessing()),
                          ('null_imputer', NullValueImputer()), 
                          ('sparse', SparseMatrix()), 
                          ('xgb', XGBClassifier(booster='gblinear', missing=-999.0))])
full_pipeline.fit(X_train, y_train)



In [77]:
# random forest
from xgboost import XGBRFClassifier

full_pipeline = Pipeline([('custom_prepross', CustomPreprossessing()),
                          ('null_imputer', NullValueImputer()), 
                          ('sparse', SparseMatrix()), 
                          ('xgb', XGBRFClassifier(missing=-999.0))])
full_pipeline.fit(X_train, y_train)



In [74]:
# booster='dart'
full_pipeline = Pipeline([('custom_prepross', CustomPreprossessing()),
                          ('null_imputer', NullValueImputer()), 
                          ('sparse', SparseMatrix()), 
                          ('xgb', XGBClassifier(booster='dart', one_drop=True, missing=-999.0))])
full_pipeline.fit(X_train, y_train)



In [71]:
# params = {'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 300}     # from GridSearch below
params = {'subsample': 1, 'n_estimators': 200, 'min_child_weight': 3, 'max_depth': 5, 'learning_rate': 0.1, 'colsample_bytree': 0.7, 'colsample_bynode': 0.8}
full_pipeline = Pipeline([('custom_prepross', CustomPreprossessing()),
                          ('null_imputer', NullValueImputer()), 
                          ('sparse', SparseMatrix()), 
                          ('xgb', XGBClassifier(params=params, missing=-999.0))])
full_pipeline.fit(X_train, y_train)
# print(XGBClassifier(params=params, missing=-999.0))

Parameters: { "params" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.




In [90]:
from sklearn.metrics import accuracy_score 

y_pred = full_pipeline.predict(X_train)
print('Train accuracy: {0:0.6f}'.format(accuracy_score(y_train, y_pred)))
y_pred = full_pipeline.predict(X_test)
print('Test accuracy: {0:0.6f}'.format(accuracy_score(y_test, y_pred)))

Train accuracy: 0.835329
Test accuracy: 0.807175


### Score 'Hold-Out' Test dataset

| Kaggle score | hyperparameters | Accuracy Score |  Date  |
|----------|---------------- |--------|---------|
| 0.72966  |  'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 300 | train: 0.83388  test: 0.8116591928251121 | Jul 17, 2022 |
| 0.72966  |  'subsample': 1, 'n_estimators': 200, 'min_child_weight': 3, 'max_depth': 5, 'learning_rate': 0.1, 'colsample_bytree': 0.7, 'colsample_bynode': 0.8 | train: 0.83238 test: 0.8116591928251121 | Jul 17, 2022 |
| 0.73684  |  booster='dart', one_drop=True | train: 0.979042  test: 0.802691 | Jul 17, 2022 |
| 0.77272  |  XGBRFClassifier | train: 0.886228  test: 0.829596 | Jul 17, 2022 |
|  0.75358 | XGBClassifier(booster='gblinear') | train: 0.835329  test: 0.807175  | Jul 17, 2022 |



In [91]:
df_holdout = pd.read_csv(path/'test.csv')
X_holdout = df_holdout.iloc[:, 1:]
y_holdout = full_pipeline.predict(X_holdout)
df_submission = df_holdout[['PassengerId']]
df_submission['Survived'] = y_holdout.tolist()
print(df_submission.head())
df_submission.to_csv(path/'submission.csv', index=False)

   PassengerId  Survived
0          892         0
1          893         1
2          894         0
3          895         0
4          896         1


---

### GridSearch

In [88]:
# Best params: {'subsample': 1, 'n_estimators': 200, 'min_child_weight': 3, 'max_depth': 5, 'learning_rate': 0.1, 'colsample_bytree': 0.7, 'colsample_bynode': 0.8}
# Best score: 0.83238
# Test score: 0.81165919

grid_search(params={'learning_rate':[0.01],
                    'n_estimators':[100],
                    'scale_pos_weight':[1],
                    'max_depth':[6],
                    'gamma':[0.5], # Best score: 0.83532     #Test score: 0.81165919
                    'min_child_weight':[1],  # Best score: 0.83532   # Test score: 0.81165919
                    'subsample':[1],
                    'colsample_bytree':[0.5, 0.7, 0.8, 0.9, 1]
                    },
            random=False)

Best params: {'colsample_bytree': 1, 'gamma': 0.5, 'learning_rate': 0.01, 'max_depth': 6, 'min_child_weight': 1, 'n_estimators': 100, 'scale_pos_weight': 1, 'subsample': 1}
Best score: 0.83532
XGBClassifier(base_score=None, booster='gbtree', colsample_bylevel=None,
              colsample_bynode=None, colsample_bytree=None,
              enable_categorical=False, gamma=None, gpu_id=None,
              importance_type=None, interaction_constraints=None,
              learning_rate=None, max_delta_step=None, max_depth=None,
              min_child_weight=None, missing=-999.0, monotone_constraints=None,
              n_estimators=100, n_jobs=None, num_parallel_tree=None,
              predictor=None, random_state=2, reg_alpha=None, reg_lambda=None,
              scale_pos_weight=None, subsample=None, tree_method=None,
              use_label_encoder=False, validate_parameters=None,
              verbosity=None)
XGBClassifier(base_score=None, booster='gbtree', colsample_bylevel=None,
     

--- 

### What if test dataset has additional categories for a variable?  
[Regular Expressions in Python](http://chris35wills.github.io/courses/Intermediate_python/regexp/) 

In [76]:
df = pd.read_csv(path/'train.csv')
df.head(10)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S
5,6,0,3,"Moran, Mr. James",male,,0,0,330877,8.4583,,Q
6,7,0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.8625,E46,S
7,8,0,3,"Palsson, Master. Gosta Leonard",male,2.0,3,1,349909,21.075,,S
8,9,1,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",female,27.0,0,2,347742,11.1333,,S
9,10,1,2,"Nasser, Mrs. Nicholas (Adele Achem)",female,14.0,1,0,237736,30.0708,,C


In [83]:
import re

def get_title(name):
    pattern = ',\s.*\.\s'
    match_results = re.search(pattern, name)
    result = match_results.group()[2:-2]
    if result not in ['Mr', 'Miss', 'Mrs', 'Master', 'Dr', 'Rev']:
        result = 'Other'
    return result

In [84]:
df['title'] = df.apply(lambda x: get_title(x['Name']), axis=1)

In [21]:
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,title
0,1,0,3,male,22.0,1,0,7.25,S,Mr
1,2,1,1,female,38.0,1,0,71.2833,C,Mrs
2,3,1,3,female,26.0,0,0,7.925,S,Miss
3,4,1,1,female,35.0,1,0,53.1,S,Mrs
4,5,0,3,male,35.0,0,0,8.05,S,Mr


In [86]:
df['title'].value_counts()

Mr        517
Miss      182
Mrs       124
Master     40
Other      15
Dr          7
Rev         6
Name: title, dtype: int64