# Apply learnings to Titanic

In [20]:
import os
from pathlib import Path

# A non-empty KAGGLE_KERNEL_RUN_TYPE is taken to mean "running inside a Kaggle
# kernel" (presumably set by the Kaggle runtime -- confirm); otherwise work locally.
iskaggle = os.environ.get('KAGGLE_KERNEL_RUN_TYPE', '')
if iskaggle:
    path = Path('../input/titanic')
else:
    path = Path('titanic')
    if not path.exists():
        # Only needed for the one-off download, so imported lazily here.
        import zipfile,kaggle
        kaggle.api.competition_download_cli(str(path))
        # The context manager closes the archive handle once extraction is done
        # instead of leaving it open until garbage collection.
        with zipfile.ZipFile(f'{path}.zip') as archive:
            archive.extractall(path)

In [21]:
import numpy as np, pandas as pd
from collections import Counter
import warnings

# Notebook-wide display settings.
# NOTE(review): the blanket 'ignore' also hides pandas warnings such as
# SettingWithCopyWarning, so real problems in later cells stay invisible.
warnings.filterwarnings(action='ignore')
# None = never truncate the column list when a frame is displayed.
pd.set_option('display.max_columns', None)

In [22]:
# Load the training split from the data folder resolved in the setup cell,
# then show its (rows, columns) shape and the first five rows.
df = pd.read_csv(path/'train.csv')
print(df.shape)
df.head()

(891, 12)


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [23]:
# Class balance of the target, followed by the number of missing values per column.
print('Imbalanced dataset?')
survived_counts = Counter(df['Survived'])
balance_table = pd.DataFrame(survived_counts, index=['survived']).T.sort_index()
print(balance_table)
df.isna().sum()

Imbalanced dataset?
   survived
0       549
1       342


PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [24]:
from sklearn.preprocessing import OneHotEncoder

# Partition the column names by dtype: object columns are treated as
# categorical, every other column as numerical.
is_object = df.dtypes == object
categorical_columns = df.columns[is_object].tolist()
numerical_columns = df.columns[~is_object].tolist()

print('categorical: ', categorical_columns)
print('numerical: ', numerical_columns)

categorical:  ['Name', 'Sex', 'Ticket', 'Cabin', 'Embarked']
numerical:  ['PassengerId', 'Survived', 'Pclass', 'Age', 'SibSp', 'Parch', 'Fare']


### Let's build the model

In [25]:
from sklearn.base import TransformerMixin 

class NullValueImputer(TransformerMixin):
    """Fill missing values so that later pipeline steps never see NaN.

    Object-dtype columns are filled with their most frequent value; every
    other column is filled with ``fill_value`` (``-999.0`` by default).

    The most frequent values are learned in ``fit`` so that frames transformed
    later (for example a test split) are filled with the values seen during
    fitting rather than with their own. ``transform`` works on a copy and
    leaves the frame it is given untouched.

    Parameters
    ----------
    fill_value : scalar, default -999.0
        Sentinel written into missing cells of non-object columns.
    """

    def __init__(self, fill_value=-999.0):
        self.fill_value = fill_value

    @staticmethod
    def _object_modes(X):
        """Map each object column of ``X`` to its most frequent value.

        Columns that contain no values at all have no mode and are skipped.
        """
        modes = {}
        for column in X.columns[X.dtypes == object]:
            most_frequent = X[column].mode()
            if not most_frequent.empty:
                modes[column] = most_frequent.iloc[0]
        return modes

    def fit(self, X, y=None):
        """Learn the most frequent value of every object column of ``X``.

        Returns
        -------
        self
        """
        self.modes_ = self._object_modes(X)
        return self

    def transform(self, X, y=None):
        """Return a copy of ``X`` with missing values filled in.

        Values learned in ``fit`` take precedence. Object columns that were
        not seen during ``fit`` (or any object column when ``fit`` was never
        called) fall back to the most frequent value found in ``X`` itself.
        An object column with no usable fill value is left as is.
        """
        fills = {**self._object_modes(X), **getattr(self, 'modes_', {})}
        object_columns = set(X.columns[X.dtypes == object])

        # Work on a copy: the caller's frame (e.g. X_train) must not change
        # as a side effect of fitting or predicting.
        filled = X.copy()
        for column in filled.columns:
            if column in object_columns:
                if column in fills:
                    filled[column] = filled[column].fillna(fills[column])
            else:
                filled[column] = filled[column].fillna(self.fill_value)
        return filled

In [26]:
from scipy.sparse import csr_matrix
from scipy.sparse import hstack

class SparseMatrix(TransformerMixin):
    """One-hot encode object columns and stack everything into one CSR matrix.

    The output holds the one-hot columns first, followed by the non-object
    columns in their frame order.

    The encoder is fitted once, in ``fit``, and only applied in ``transform``.
    Every later call therefore yields the same columns in the same order, and
    categories that were not seen during fitting are encoded as all zeros
    (``handle_unknown='ignore'``) instead of changing the matrix layout.
    """

    def __init__(self):
        pass

    def fit(self, X, y=None):
        """Record the column split of ``X`` and fit the one-hot encoder on it.

        Returns
        -------
        self
        """
        self.categorical_columns_ = X.columns[X.dtypes == object].tolist()
        self.numerical_columns_ = X.select_dtypes(exclude=["object"]).columns.tolist()
        self.encoder_ = None
        # With no object columns there is nothing to encode, so no encoder is built.
        if self.categorical_columns_:
            self.encoder_ = OneHotEncoder(handle_unknown='ignore')
            self.encoder_.fit(X[self.categorical_columns_])
        return self

    def transform(self, X, y=None):
        """Encode ``X`` with the fitted encoder and return a CSR matrix."""
        if not hasattr(self, 'categorical_columns_'):
            # Used without a prior fit(): fit on this frame once so that direct
            # transform() calls keep working.
            self.fit(X)

        cold = csr_matrix(X[self.numerical_columns_])
        if self.encoder_ is None:
            return cold

        hot = self.encoder_.transform(X[self.categorical_columns_])
        return hstack((hot, cold)).tocsr()

In [27]:
# Re-read the raw training data and leave out the columns this model does not use.
df = pd.read_csv(path/'train.csv')
unused_columns = ['Name', 'Ticket', 'Cabin']
df = df.loc[:, ~df.columns.isin(unused_columns)]

# Pick the target and the features by name rather than by position, so a
# change in column order cannot silently move the target into the features.
y = df['Survived']
X = df.drop(columns=['PassengerId', 'Survived'])

In [28]:
from sklearn.model_selection import GridSearchCV, cross_val_score, KFold, train_test_split
from xgboost import XGBClassifier
from sklearn.pipeline import Pipeline

In [29]:
# Seeded 5-fold shuffled splitter for cross-validation, plus a seeded
# train/test split of the features and target.
kfold = KFold(n_splits=5, random_state=2, shuffle=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=2)

# Preprocessing only: fill missing values, then encode into a sparse matrix.
preprocessing_steps = [
    ('null_imputer', NullValueImputer()),
    ('sparse', SparseMatrix()),
]
data_pipeline = Pipeline(steps=preprocessing_steps)
X_train_transformed = data_pipeline.fit_transform(X_train)

In [30]:
# NOTE(review): this cell repeats the Pipeline import, the pipeline construction
# and the fit from the previous cell; running it rebuilds the same objects.
from sklearn.pipeline import Pipeline
data_pipeline = Pipeline(steps=[
    ('null_imputer', NullValueImputer()),
    ('sparse', SparseMatrix()),
])
X_train_transformed = data_pipeline.fit_transform(X_train)

In [31]:
def cross_val(model, X=None, y=None, scoring='accuracy', cv=None):
    """Print ``model`` and return its mean cross-validated score.

    Parameters
    ----------
    model : estimator
        Estimator handed to ``cross_val_score``.
    X, y : optional
        Features and target to validate on. When omitted, the notebook
        globals ``X_train_transformed`` and ``y_train`` are used.
    scoring : str, default 'accuracy'
        Metric name passed to ``cross_val_score`` (e.g. ``'roc_auc'``).
    cv : optional
        Cross-validation splitter. When omitted, the notebook global
        ``kfold`` is used.

    Returns
    -------
    float
        Mean of the per-fold scores.
    """
    # Resolve the defaults at call time so the function always sees the
    # current notebook globals.
    X = X_train_transformed if X is None else X
    y = y_train if y is None else y
    cv = kfold if cv is None else cv

    fold_scores = cross_val_score(model, X, y, scoring=scoring, cv=cv)
    print(model)

    return fold_scores.mean()

# Baseline

In [32]:
# Baseline: XGBClassifier with library defaults; missing=-999.0 is the same
# sentinel that NullValueImputer writes into missing numeric cells.
cross_val(XGBClassifier(missing=-999.0))

XGBClassifier(base_score=None, booster=None, colsample_bylevel=None,
              colsample_bynode=None, colsample_bytree=None,
              enable_categorical=False, gamma=None, gpu_id=None,
              importance_type=None, interaction_constraints=None,
              learning_rate=None, max_delta_step=None, max_depth=None,
              min_child_weight=None, missing=-999.0, monotone_constraints=None,
              n_estimators=100, n_jobs=None, num_parallel_tree=None,
              predictor=None, random_state=None, reg_alpha=None,
              reg_lambda=None, scale_pos_weight=None, subsample=None,
              tree_method=None, validate_parameters=None, verbosity=None)


0.8203344181348895

In [33]:
# End-to-end pipeline: fill missing values, encode into a sparse matrix, then
# classify with XGBoost. -999.0 matches the sentinel used by NullValueImputer.
full_pipeline = Pipeline(steps=[
    ('null_imputer', NullValueImputer()),
    ('sparse', SparseMatrix()),
    ('xgb', XGBClassifier(missing=-999.0, n_estimators=100)),
])

In [34]:
# Fit the preprocessing steps and the classifier on the training split.
full_pipeline.fit(X_train, y_train)



In [35]:
# Predicted class labels for the test split (displayed in full).
full_pipeline.predict(X_test)

array([1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1,
       0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0,
       0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0,
       1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0,
       1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 0, 0,
       1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1,
       0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0,
       1, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0,
       0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0,
       0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0,
       0, 0, 1])

In [39]:
# accuracy_score is first used in this cell and no earlier cell imports it, so
# import it here; otherwise a fresh "Restart & Run All" stops with a NameError.
from sklearn.metrics import accuracy_score

# Accuracy on the data the pipeline was fitted on.
y_pred = full_pipeline.predict(X_train)
accuracy_score(y_train, y_pred)

0.9805389221556886

In [40]:
from sklearn.metrics import accuracy_score

# Accuracy on the test split; note that y_pred is overwritten with the
# test-split predictions here.
y_pred = full_pipeline.predict(X_test)
accuracy_score(y_test, y_pred)

0.7982062780269058