In [150]:
import pandas as pd
import numpy

from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.pipeline import make_pipeline
from sklearn.compose import make_column_transformer
from sklearn.feature_extraction.text import CountVectorizer

from sklearn.preprocessing import FunctionTransformer

## Import data

In [151]:
# Short links to the Titanic train and test CSV files read in the next cell.
train_url = 'https://bit.ly/titanic-train-set'
test_url = 'https://bit.ly/titanic-test-set'

In [152]:
# Load both CSV files into DataFrames; the URLs are remote, so this needs network access.
df_train = pd.read_csv(train_url)
df_test = pd.read_csv(test_url)

In [153]:
# Column dtypes and non-null counts for the training set.
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [154]:
# Column dtypes and non-null counts for the test set (it has no 'Survived' column).
df_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  418 non-null    int64  
 1   Pclass       418 non-null    int64  
 2   Name         418 non-null    object 
 3   Sex          418 non-null    object 
 4   Age          332 non-null    float64
 5   SibSp        418 non-null    int64  
 6   Parch        418 non-null    int64  
 7   Ticket       418 non-null    object 
 8   Fare         417 non-null    float64
 9   Cabin        91 non-null     object 
 10  Embarked     418 non-null    object 
dtypes: float64(2), int64(4), object(5)
memory usage: 36.0+ KB


## Exploratory data analysis (easy steps)
- Missing data
- Categorical data
- Text/date data
- Drop columns

### Missing data
Let's find all the columns with missing data

In [155]:
# Count NaNs once per column, then keep only the columns that actually have some.
train_na_counts = df_train.isna().sum()
train_na_counts[train_na_counts > 0]

Age         177
Cabin       687
Embarked      2
dtype: int64

In [156]:
# Same check on the test set: per-column NaN counts, restricted to non-zero ones.
test_na_counts = df_test.isna().sum()
test_na_counts[test_na_counts > 0]

Age       86
Fare       1
Cabin    327
dtype: int64

The train set has missing values in Age, Cabin and Embarked. <br>
The test set has missing values in Age, Fare and Cabin.

### Categorical data

In [157]:
# Object-dtype columns are the candidates for categorical / text handling.
cat_cols = [name for name, dtype in df_train.dtypes.items() if dtype == "O"]
for col in cat_cols:
    # dropna=False keeps NaN in the count, matching len(Series.unique()).
    n_unique = df_train[col].nunique(dropna=False)
    print('Unique value in column ',col, ' : ',n_unique)

Unique value in column  Name  :  891
Unique value in column  Sex  :  2
Unique value in column  Ticket  :  681
Unique value in column  Cabin  :  148
Unique value in column  Embarked  :  4


Sex and Embarked are categorical data (the 4 unique values counted for Embarked include NaN). <br>
Name, Ticket and Cabin are probably text data

We can summarize all this by splitting all the columns in different categories

In [158]:
# Column to predict.
target = ['Survived']

# Columns discarded by the preprocessing, and columns forwarded unchanged.
drop_cols = ['Ticket', 'PassengerId']
passthrough_cols = ['Pclass', 'SibSp', 'Parch']

# Columns that need imputation according to the missing-value counts above:
# Age (train and test), Fare (test only), Embarked (train only).
num_cols_with_missing_data = ['Age', 'Fare']
cat_cols_with_missing_data = ['Embarked']
cat_cols_without_missing_data = ['Sex']

# Free-text columns; Cabin has missing values in both sets, Name has none.
text_cols_with_missing_data = ['Cabin']
text_cols_without_missing_data = ['Name']

### Verify that there are no missing or duplicate columns

In [159]:
 # Every column group defined above, gathered so they can be checked against df_train.columns.
 all_cols = [
    target,
    drop_cols,
    passthrough_cols,
    num_cols_with_missing_data,
    cat_cols_with_missing_data,
    cat_cols_without_missing_data,
    text_cols_with_missing_data,
    text_cols_without_missing_data
 ]

In [160]:
# Flatten the column groups, warning about any name that appears more than once.
all_cols_set = set()
for group in all_cols:
    for col in group:
        if col in all_cols_set:
            print('Warning, column ',col,' is duplicate')
        else:
            all_cols_set.add(col)

# Compare the declared names with the real DataFrame columns in both directions:
# names we declared that do not exist, and real columns we forgot to declare.
original_cols_set = set(df_train.columns)
badly_written_cols = all_cols_set.difference(original_cols_set)
missing_cols = original_cols_set.difference(all_cols_set)
print('Columns badly written :', badly_written_cols)
print('Missing columns :', missing_cols)

Columns badly written : set()
Missing columns : set()


## Exploratory data analysis (advanced steps)
- Outliers (removing outliers)
- Scaling and transform
    - Standard scaler
    - Robust scaler
    - Log-transform
- Feature selection / Dimensionality reduction

## Separate features and target

In [161]:
# Labels as a flat 1-D array; features are every remaining column.
Y = df_train[target].to_numpy().ravel()
X = df_train.drop(columns=target)

In [162]:
# Sanity check: X and Y must have the same number of rows, and Y must be 1-D.
X.shape, Y.shape

((891, 11), (891,))

## Create preprocessing pipeline
TODO: check `KNNImputer` (possible alternative to `SimpleImputer`)

In [163]:
# Categorical columns with gaps: fill with the most frequent value, then one-hot encode.
# Categories unseen during fit are encoded as all zeros (handle_unknown='ignore').
# NOTE(review): OneHotEncoder's `sparse` argument was renamed `sparse_output` in
# scikit-learn 1.2 and later removed — confirm the installed version before upgrading.
missing_cat_preprocessing = make_pipeline(
    SimpleImputer(strategy='most_frequent'),
    OneHotEncoder(handle_unknown='ignore', sparse=False)
)

In [164]:
import pandas as pd
def extract_first_letter(serie):
    """Return a one-column DataFrame holding the first character of each string.

    Parameters
    ----------
    serie : pandas.Series of strings (missing values allowed).

    Returns
    -------
    pandas.DataFrame with the same index and the same column name as `serie`;
    missing or empty entries stay missing.
    """
    first_chars = serie.str.get(0)
    return first_chars.to_frame()

In [165]:
# Cabin: reduce each value to its first letter, give missing cabins an explicit
# 'MISSING_CABIN' label, then one-hot encode the result.
# NOTE(review): OneHotEncoder's `sparse` argument was renamed `sparse_output` in
# scikit-learn 1.2 and later removed — confirm the installed version before upgrading.
preprocess_cabin = make_pipeline(
    FunctionTransformer(extract_first_letter),
    SimpleImputer(strategy='constant', fill_value='MISSING_CABIN'),
    OneHotEncoder(handle_unknown='ignore', sparse=False)
)

# Smoke test: run the Cabin pipeline on its own (this also fits it on X['Cabin']).
preprocess_cabin.fit_transform(X['Cabin'])

array([[0., 0., 0., ..., 0., 1., 0.],
       [0., 0., 1., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 1., 0.],
       ...,
       [0., 0., 0., ..., 0., 1., 0.],
       [0., 0., 1., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 1., 0.]])

In [166]:
# Route each group of columns to its own transformer and concatenate the outputs.
# 'Cabin' and 'Name' are passed as plain strings rather than lists so that the
# transformer receives a 1-D column: extract_first_letter relies on the Series
# `.str` accessor, and CountVectorizer expects an iterable of raw text documents.
preprocessing = make_column_transformer(
    ('drop' ,                                 drop_cols),
    ('passthrough' ,                          passthrough_cols),
    (SimpleImputer(strategy='median') ,       num_cols_with_missing_data),
    (missing_cat_preprocessing ,              cat_cols_with_missing_data),
    (OneHotEncoder(handle_unknown='ignore') , cat_cols_without_missing_data),
    (preprocess_cabin ,                       'Cabin'),
    (CountVectorizer() ,                      'Name')
)

## Complete pipeline

In [167]:
from sklearn.pipeline import Pipeline

from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import RidgeClassifier
from sklearn.neighbors import KNeighborsClassifier as KNN

In [253]:
# k-nearest-neighbours classifier fed by the shared preprocessing step.
# The step name 'knn' is what the grid-search parameter keys ('knn__...') refer to.
knn_pipeline = Pipeline(steps=[
    ('preprocessing', preprocessing),
    ('knn', KNN()),
])

In [169]:
# Ridge classifier fed by the shared preprocessing step.
# The step name 'ridge' is what the grid-search parameter keys ('ridge__...') refer to.
ridge_pipeline = Pipeline(steps=[
    ('preprocessing', preprocessing),
    ('ridge', RidgeClassifier()),
])

In [170]:
# Random forest fed by the shared preprocessing step.
# random_state is fixed because a random forest is stochastic: without a seed
# the training accuracy, cross-validation scores and grid-search winner change
# on every run, so the notebook cannot be reproduced with Restart & Run All.
rf_pipeline = Pipeline([
    ('preprocessing' , preprocessing),
    ('rf', RandomForestClassifier(random_state=0))
])

In [172]:
# Accuracy on the training data itself: an optimistic figure, not a generalisation estimate.
knn_pipeline.fit(X, Y)
pred_knn = knn_pipeline.predict(X)
numpy.mean(pred_knn == Y)

0.8249158249158249

In [173]:
# Accuracy on the training data itself: an optimistic figure, not a generalisation estimate.
ridge_pipeline.fit(X, Y)
pred_ridge = ridge_pipeline.predict(X)
numpy.mean(pred_ridge == Y)

0.9988776655443322

In [174]:
# Accuracy on the training data itself: an optimistic figure, not a generalisation estimate.
rf_pipeline.fit(X, Y)
pred_rf = rf_pipeline.predict(X)
numpy.mean(pred_rf == Y)

1.0

## K-fold Cross Validation

In [213]:
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV

In [208]:
# One 5-fold splitter reused by every cross-validation and grid search below;
# shuffling with a fixed random_state keeps the folds identical across models.
folds = KFold(n_splits=5, shuffle=True, random_state=0)

In [209]:
# Cross-validated score of the KNN pipeline: mean and spread over the folds.
cv_score = cross_val_score(estimator=knn_pipeline, X=X, y=Y, cv=folds)
numpy.mean(cv_score), numpy.std(cv_score)

(0.7238842508317118, 0.01921603325051323)

In [210]:
# Cross-validated score of the ridge pipeline: mean and spread over the folds.
cv_score = cross_val_score(estimator=ridge_pipeline, X=X, y=Y, cv=folds)
numpy.mean(cv_score), numpy.std(cv_score)

(0.8136902893729208, 0.012023241910489538)

In [211]:
# Cross-validated score of the random-forest pipeline: mean and spread over the folds.
cv_score = cross_val_score(estimator=rf_pipeline, X=X, y=Y, cv=folds)
numpy.mean(cv_score), numpy.std(cv_score)

(0.8237900947837549, 0.007687619488076668)

### GridSearch on KNN

In [221]:
# Bundle the KNN pipeline with its search space; keys are '<step name>__<parameter>'.
knn = {
    'pipeline': knn_pipeline,
    'hyperparameter': {
        'knn__n_neighbors': [1,3,5,7,9,13,17,21,25],
        'knn__weights': ['uniform', 'distance'],
    },
}

# Exhaustive search over the grid, scored by accuracy on the shared folds.
knn['gridsearch'] = GridSearchCV(
    estimator=knn['pipeline'],
    param_grid=knn['hyperparameter'],
    scoring='accuracy',
    cv=folds,
)

In [259]:
# Run the search (fit returns the search object itself) and report the winner.
knn_search = knn['gridsearch'].fit(X, Y)
knn_search.best_params_, knn_search.best_score_

({'knn__n_neighbors': 3, 'knn__weights': 'distance'}, 0.7329044002259746)

### GridSearch on ridge

In [243]:
# Bundle the ridge pipeline with its search space; keys are '<step name>__<parameter>'.
ridge = {
    'pipeline': ridge_pipeline,
    'hyperparameter': {
        'ridge__alpha': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 20, 30, 100],
    },
}

# Exhaustive search over the grid, scored by accuracy on the shared folds.
ridge['gridsearch'] = GridSearchCV(
    estimator=ridge['pipeline'],
    param_grid=ridge['hyperparameter'],
    scoring='accuracy',
    cv=folds,
)

In [260]:
# Run the search (fit returns the search object itself) and report the winner.
ridge_search = ridge['gridsearch'].fit(X, Y)
ridge_search.best_params_, ridge_search.best_score_

({'ridge__alpha': 5}, 0.8293955181721172)

### GridSearch on RandomForest

In [265]:
# Bundle the random-forest pipeline with its search space;
# keys are '<step name>__<parameter>'.
rf= {}
rf['pipeline'] = rf_pipeline

rf['hyperparameter'] = {}
rf['hyperparameter']['rf__n_estimators'] = [150, 200, 250]
rf['hyperparameter']['rf__criterion'] = ['gini', 'entropy']
# 'auto' is intentionally not listed: for classifiers it was only an alias of
# 'sqrt' (so it duplicated a grid point), it was deprecated in scikit-learn 1.1
# and it is rejected by scikit-learn >= 1.3.
rf['hyperparameter']['rf__max_features'] = ['sqrt', 'log2']

# Exhaustive search over the grid, scored by accuracy on the shared folds.
rf['gridsearch'] = GridSearchCV(estimator    = rf['pipeline'],
                                 param_grid  = rf['hyperparameter'],
                                 scoring     = 'accuracy',
                                 cv          = folds)

In [266]:
# Run the search (fit returns the search object itself) and report the winner.
rf_search = rf['gridsearch'].fit(X, Y)
rf_search.best_params_, rf_search.best_score_

({'rf__criterion': 'entropy',
  'rf__max_features': 'sqrt',
  'rf__n_estimators': 200},
 0.832772581758835)

In [267]:
# Final model: the shared preprocessing followed by a random forest configured
# with the hyperparameters reported by the RandomForest grid search above.
# random_state is fixed so that the test-set predictions are reproducible;
# without it every refit yields a slightly different forest.
final_pipeline = make_pipeline(
    preprocessing,
    RandomForestClassifier(criterion='entropy', 
                           max_features='sqrt', 
                           n_estimators=200,
                           random_state=0)
)

In [271]:
# Refit on the full training set, then label the test passengers.
final_pipeline.fit(X, Y)
test_predictions = final_pipeline.predict(df_test)
test_predictions

array([0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1,
       1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
       1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1,
       1, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0,
       1, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1,
       0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1,
       1, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1,
       0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0,
       1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
       0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1,
       0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0,
       0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0,