In [None]:
import os, sys
import json
import math
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import re
import seaborn as sns 

from os.path import join 
from sklearn import preprocessing
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import LabelEncoder
from IPython.display import display_html

import warnings
warnings.filterwarnings("ignore")

In [None]:
# Load train.csv and test.csv and stack them into one frame (df_data) so that
# every feature-engineering step below is applied to both splits at once.
DataPath = r'D:\GitWork\titanic\data'
OutPath  = r'D:\GitWork\titanic\output'

f_abspath = join(DataPath, 'train.csv')
df_train = pd.read_csv(f_abspath)

f_abspath = join(DataPath, 'test.csv')
df_test = pd.read_csv(f_abspath)

# DataFrame.append was removed in pandas 2.0; pd.concat is the supported API.
# ignore_index=True gives each row a unique label (0..n-1). Without it the test
# rows reuse the labels of the train rows, and label-based calls such as
# DataFrame.drop(label) then hit two rows instead of one.
df_data = pd.concat([df_train, df_test], ignore_index=True)
display(df_data.head())

# info() prints its report itself and returns None, so wrapping it in display()
# only adds a stray "None" to the cell output.
df_data.info()

In [None]:
# Preprocess: Name
#
# Split every Name string into four new columns:
#   Title  - text between the first ',' and the first '.' that follows it
#   Name_1 - text before the first ','
#   Name_2 - everything after the title, with the parenthesised part removed
#   Name_3 - first parenthesised group (plus an optional trailing quote), '' if none

col_selected = ['Title','Name_1','Name_2','Name_3']

_NAME_PAREN_RE = re.compile(r'\([A-Za-z \'\"\.-]+\)\"?')


def _parse_name(raw):
    """Return (title, name1, name2, name3) parsed from one raw name string.

    The string is split only at the FIRST ',' and the FIRST following '.'.
    Splitting on every occurrence and keeping element [1] would silently drop
    everything after a second comma or dot (e.g. an initial such as "T.").
    A missing ',' or '.' yields 'unknown' for the corresponding part.
    """
    rest = raw

    if ',' in rest:
        name1, rest = (part.strip() for part in rest.split(',', 1))
    else:
        name1 = 'unknown'

    if '.' in rest:
        title, rest = (part.strip() for part in rest.split('.', 1))
    else:
        title = 'unknown'

    name2, name3 = rest, ''
    found = _NAME_PAREN_RE.search(name2)
    if found:
        name3 = found.group(0).strip()
        name2 = name2.replace(name3, '').strip()

    return title, name1, name2, name3


# Parse every name once and write whole columns. Writing row by row through a
# boolean mask over the full frame costs O(n^2) for no benefit.
parsed = pd.DataFrame(
    [_parse_name(name) for name in df_data.Name],
    columns=col_selected,
)
for col in col_selected:
    # to_numpy() makes the assignment positional, independent of df_data's index.
    df_data[col] = parsed[col].to_numpy()

display(df_data.head())

In [None]:
# Preprocess: Family = SibSp + Parch

df_data['Family'] = df_data['SibSp'] + df_data['Parch']

# Shared pivot_table / crosstab options: add a 'Total' margin row and column.
kwargs = dict(margins=True, margins_name='Total')

# Passenger counts per Family value and Survived value.
pt = (
    df_data
    .pivot_table(
        index='Family',
        columns='Survived',
        values='PassengerId',
        aggfunc='count',
        **kwargs
    )
    .fillna(0)
    .astype(int)
)

# Share of Survived == 1.0 within each row.
pt['Rate_S1'] = pt[1.0].div(pt['Total']).round(4)

display(pt)

In [None]:
# Fill missing values

def display_side_by_side(*args):
    """Render several DataFrames next to each other in one notebook output.

    Each argument is converted with ``to_html()`` and its opening ``<table``
    tag is given an inline-display style so the tables flow horizontally.
    The concatenated markup is emitted through ``display_html(..., raw=True)``.

    Parameters
    ----------
    *args
        Objects exposing ``to_html()`` (e.g. pandas DataFrames).

    Returns
    -------
    None
    """
    inline_open = '<table style="display:inline;margin:10px"'
    # Patch only the opening tag. Replacing every occurrence of 'table' would
    # also rewrite the closing </table> tags and any column/index label or
    # cell text that happens to contain the word "table".
    html_str = ''.join(df.to_html().replace('<table', inline_open) for df in args)
    display_html(html_str, raw=True)

# Per-column count of missing values for the train split, the test split and
# the combined frame, shown as three tables side by side.
df1, df2, df3 = (
    frame.isnull().sum().to_frame().rename(columns={0: label})
    for frame, label in (
        (df_train, 'Train null values'),
        (df_test,  'Test null values'),
        (df_data,  'All null values'),
    )
)

display_side_by_side(df1, df2, df3)

In [None]:
# Fill missing values: Fare

display(df_data.Fare.describe())

# Rows whose fare is missing.
df = df_data[df_data.Fare.isnull()]
display(df)

# Fare statistics of Pclass 3, used as the reference group for the fill value.
mask = (df_data.Pclass==3)
print('Describe of Fare of Pclass 3:')
display(df_data[mask].Fare.describe())

median_fare = df_data[mask].Fare.median()
print('Median of Fare of Pclass 3:', median_fare)

# NOTE(review): the median is printed above but the MEAN is what gets used as
# the fill value - confirm this is intended.
mean_fare = df_data[mask].Fare.mean().round(4)
# The placeholder was previously never formatted (print received two arguments).
print('\nFill missing fare with {}'.format(mean_fare))

# Assign the filled Series back to the frame. fillna(inplace=True) on the
# attribute-accessed column is chained assignment: recent pandas versions warn
# that it will stop updating df_data, and this notebook suppresses warnings.
df_data['Fare_F'] = df_data.Fare.fillna(mean_fare)

display(df_data.Fare_F.describe())

In [None]:
# Fill missing values: Embarked

display(df_data.Embarked.describe(include=['O']))

# Rows whose Embarked value is missing.
df = df_data[df_data.Embarked.isnull()]
display(df)

# Only rows with a known cabin are compared, so the .str accessor used below
# never sees a missing Cabin.
df = df_data[df_data.Cabin.notnull()]

# Are there other tickets with the same fare?
mask = (df.Fare==80.0)
display(df[mask])

# Comparable passengers: Pclass 1, female, cabin starting with 'B2'.
mask = ((df.Pclass==1) & (df.Sex=='female') & df.Cabin.str.startswith('B2'))
display(df[mask])
df = df[mask].groupby('Embarked')['PassengerId'].agg('count').to_frame() 
display(df)

# Most frequent Embarked value among the comparable passengers.
most_embarked = df.PassengerId.idxmax()
print("Fill missing Embarked with '{}'".format(most_embarked))

# Assign the filled Series back to the frame. fillna(inplace=True) on the
# attribute-accessed column is chained assignment: recent pandas versions warn
# that it will stop updating df_data, and this notebook suppresses warnings.
df_data['Embarked_F'] = df_data.Embarked.fillna(most_embarked)

display(df_data.Embarked_F.describe(include=['O']))

In [None]:
# Explore missing values: Age

display(df_data['Age'].describe())

# Mean and median Age per Title.
age_stats = {'Age': ['mean', 'median']}
ct1 = df_data.groupby('Title').agg(age_stats).round(3)

# Title x Sex count tables; 'count' ignores missing values, so counting
# PassengerId vs. counting Age shows how many ages are missing per cell.
count_opts = dict(aggfunc=['count'], **kwargs)

ct2 = (
    pd.crosstab(df_data['Title'], df_data['Sex'], values=df_data['PassengerId'], **count_opts)
    .fillna(0)
    .astype(int)
)

ct3 = (
    pd.crosstab(df_data['Title'], df_data['Sex'], values=df_data['Age'], **count_opts)
    .fillna(0)
    .astype(int)
)

display_side_by_side(ct1, ct2, ct3)

In [None]:
# Fill missing values: Age
#
# Age_F starts as a copy of Age; missing entries are replaced by the median Age
# of passengers carrying the same Title, for the titles listed below.

df_data['Age_F'] = df_data.Age.copy()

fill_titles = ('Dr', 'Master', 'Miss', 'Mr', 'Mrs', 'Ms')

# All medians are computed before any value is written.
has_age = df_data['Age_F'].notnull()
median_Dr, median_Mas, median_Mis, median_Mr, median_Mrs, median_Ms = (
    df_data.loc[has_age & (df_data['Title'] == title), 'Age'].median()
    for title in fill_titles
)

title_medians = (median_Dr, median_Mas, median_Mis, median_Mr, median_Mrs, median_Ms)
for title, median_age in zip(fill_titles, title_medians):
    needs_fill = df_data['Age_F'].isnull() & (df_data['Title'] == title)
    df_data.loc[needs_fill, 'Age_F'] = median_age

# Title x Sex counts of non-missing values before (Age) and after (Age_F) the fill.
count_opts = dict(aggfunc=['count'], **kwargs)

ct2 = (
    pd.crosstab(df_data['Title'], df_data['Sex'], values=df_data['Age'], **count_opts)
    .fillna(0)
    .astype(int)
)

ct3 = (
    pd.crosstab(df_data['Title'], df_data['Sex'], values=df_data['Age_F'], **count_opts)
    .fillna(0)
    .astype(int)
)

display_side_by_side(ct2, ct3)

In [None]:
# Merge Titles
# There are 6 classes, i.e. Master, Mr, Miss, Mrs, Midlife (35-50), Mature (50-80)

df_data['Title_F'] = df_data.Title.copy()
print(sorted(df_data.Title_F.unique()))

# Merged class -> raw titles folded into it. 'Master' and 'Mr' are kept as-is.
# NOTE(review): 'Mme' is folded into 'Miss' here - confirm that 'Mrs' was not intended.
title_groups = {
    'Miss':    ['Mlle','Mme', 'Ms'],
    'Mrs':     ['the Countess'],
    'Midlife': ['Don','Dona','Dr','Jonkheer','Lady','Major','Rev','Sir'],
    'Mature':  ['Capt','Col'],
}
title_map = {raw: merged for merged, raws in title_groups.items() for raw in raws}

# Assign the replaced Series back to the frame. replace(inplace=True) on the
# attribute-accessed column is chained assignment: recent pandas versions warn
# that it will stop updating df_data, and this notebook suppresses warnings.
df_data['Title_F'] = df_data.Title_F.replace(title_map)

display(sorted(df_data.Title_F.unique()))

ct = df_data.groupby('Title_F').agg(Count=('Title_F','count')).round(3)
display(ct)

In [None]:
# Explore missing values: Cabin

display(df_data['Cabin'].describe(include=['O']))

# Cabin_F = first character of Cabin, or '-' when Cabin is missing.
df_data['Cabin_F'] = df_data['Cabin'].map(lambda cabin: cabin[0] if pd.notnull(cabin) else '-')

# Rows whose Survived value is known.
labelled = df_data[df_data['Survived'].notnull()]

# Row counts per Cabin_F x Pclass: all rows (pt1) and labelled rows only (pt2).
by_class = dict(index='Cabin_F', columns='Pclass', values='PassengerId', aggfunc=len, **kwargs)
pt1 = pd.pivot_table(df_data, **by_class).fillna('-')
pt2 = pd.pivot_table(labelled, **by_class).fillna('-')

# Row counts per Cabin_F x Survived, plus the share of Survived == 1.0 per row.
by_outcome = dict(index='Cabin_F', columns='Survived', values='PassengerId', aggfunc=len, **kwargs)
pt3 = pd.pivot_table(labelled, **by_outcome).fillna(0).astype(int)
pt3['Rate_S1'] = pt3[1.0].div(pt3['Total']).round(4)

display_side_by_side(pt1, pt2, pt3)

# pt4 = pd.pivot_table(
#     df_data,
#     index   = 'Cabin_Head',
#     columns = ['Survived','Pclass'],
#     values  = 'Cabin_F',
#     aggfunc = len,
#     **kwargs
# ).fillna('-')
# print()
# display_side_by_side(pt3, pt4)


In [None]:
# Fill missing values: Cabin
#
# Missing cabins (Cabin_F == '-') are filled from Pclass alone:
#   Pclass 1 -> 'B', Pclass 2 -> 'F', Pclass 3 -> 'G'
#
# The fill must not look at Survived. Choosing the letter from the outcome
# ('B' for survivors, 'G' otherwise) writes the target into the feature for
# labelled rows (target leakage), and the same rule cannot be applied to rows
# whose outcome is unknown, so the two groups would get different encodings.
fill_by_class = {1: 'B', 2: 'F', 3: 'G'}

mask = (df_data.Cabin_F=='-')
# Any Pclass outside the mapping keeps the '-' placeholder.
df_data.loc[mask, 'Cabin_F'] = df_data.loc[mask, 'Pclass'].map(fill_by_class).fillna('-')

pt = pd.pivot_table(
    df_data,
    index   = 'Cabin_F',
    columns = 'Pclass',
    values  = 'PassengerId',
    aggfunc = len,
    **kwargs
).fillna('-')
display(pt)

In [None]:
# Explore: Ticket
# Ticket_Head = the run of letters at the very start of Ticket ('' when the
# ticket does not start with a letter), upper-cased.

ticket_prefix = df_data['Ticket'].str.extract('([A-Za-z]*)', expand=False)
df_data['Ticket_Head'] = ticket_prefix.str.upper()
print(df_data['Ticket_Head'].unique())

In [None]:
# Overview of the engineered columns.
col_selected = [
    'PassengerId','Age_F','Cabin_F','Embarked_F','Family','Fare_F','Pclass','Sex','Survived','Ticket','Title'
]
df = df_data.loc[:, col_selected]
display(df.info())

# PPS (Predictive Power Score)
import ppscore as pps # importing ppscore

col_selected = ['Age_F','Cabin_F','Embarked_F','Family','Fare_F','Pclass','Sex','Survived','Ticket','Title']
df_pps = df_data.loc[:, col_selected]

fig, ax = plt.subplots(figsize=(10,8)) 

# Long (x, y, ppscore) table -> square matrix with x as columns and y as rows,
# rounded to two decimals for the annotations.
pps_long = pps.matrix(df_pps)[['x', 'y', 'ppscore']]
matrix_df = pps_long.pivot(columns='x', index='y', values='ppscore')
matrix_df = matrix_df.round(2)

sns.heatmap(matrix_df, vmin=0, vmax=1, cmap="Blues", linewidths=0.75, annot=True, ax=ax)

In [None]:
def trainAndTest(train_df, test_df, col_selected):
    """Fit a random forest on ``train_df`` and predict ``test_df``.

    Parameters
    ----------
    train_df : frame providing the ``col_selected`` columns and a ``Survived`` column.
    test_df : frame providing the ``col_selected`` columns.
    col_selected : list of feature column names.

    Returns
    -------
    (score, predictions)
        ``score`` is the fitted model's ``oob_score_``; ``predictions`` are the
        model's predictions for ``test_df`` cast to int.
    """
    forest = RandomForestClassifier(
        n_estimators=250,
        min_samples_split=20,
        oob_score=True,
        random_state=2,
    )
    forest.fit(train_df[col_selected], train_df.Survived)

    predicted = forest.predict(test_df[col_selected]).astype(int)
    return forest.oob_score_, predicted

In [None]:
# Transform: Sex -> integer code (Sex_Code)

le = LabelEncoder()
le.fit(df_data['Sex'])
df_data['Sex_Code'] = le.transform(df_data['Sex'])

# Lookup of original label -> assigned code, for reference.
sex_labels = le.classes_
sex_codes = le.transform(sex_labels)
dict_sex = dict(zip(sex_labels, sex_codes))
display(dict_sex)

In [None]:
# Transform: Title_F -> integer code (Title_Code)

le = LabelEncoder()
le.fit(df_data['Title_F'])
df_data['Title_Code'] = le.transform(df_data['Title_F'])

# Lookup of merged title -> assigned code, for reference.
title_labels = le.classes_
title_codes = le.transform(title_labels)
dict_title = dict(zip(title_labels, title_codes))
display(dict_title)

In [None]:
# Transform: Fare_F -> code of its quantile bin (5 bins), stored as Fare_BinCode5

# The interval bins are only an intermediate; they are not kept as a column.
fare_bins = pd.qcut(df_data['Fare_F'], 5)

le = LabelEncoder()
df_data['Fare_BinCode5'] = le.fit_transform(fare_bins)

# Lookup of bin -> assigned code, for reference.
bin_labels = le.classes_
bin_codes = le.transform(bin_labels)
dict_fare = dict(zip(bin_labels, bin_codes))
print('Dict fare:')
display(dict_fare)

In [None]:
# Add column: Connected_Survival
#
# For each passenger on a ticket shared by several rows, look at the Survived
# values of the OTHER rows on that ticket:
#   1   - the maximum of the others' Survived is 1.0
#   0   - the maximum of the others' Survived is 0.0
#   0.5 - default: ticket not shared, or no known Survived value among the others

connected = {}
for _, df_grp in df_data.groupby('Ticket'):
    if len(df_grp) < 2:
        continue
    for passId in df_grp.PassengerId:
        # Exclude the passenger by PassengerId rather than by row label:
        # drop(label) removes every row carrying that label, which is wrong
        # whenever df_data has duplicate index labels.
        others = df_grp.loc[df_grp.PassengerId != passId, 'Survived']
        smax = others.max()   # NaN when none of the others has a known outcome
        if smax == 1.0:
            connected[passId] = 1
        elif smax == 0.0:
            connected[passId] = 0

# One vectorised write instead of a full-frame mask per passenger.
df_data['Connected_Survival'] = (
    df_data.PassengerId.map(connected).fillna(0.5).astype(float)
)

df = df_data.groupby('Connected_Survival')['Connected_Survival'].agg(['count']).astype(int)
# The total row belongs to this summary; it was previously written to `pt`,
# an unrelated pivot table left over from an earlier cell.
df.loc['Total'] = df.sum(axis=0)
display(df)

# Number of passengers whose ticket appears on more than one row.
# len(value_counts() > 1) returned the number of distinct tickets instead.
len_tk_same = int(df_data.Ticket.duplicated(keep=False).sum())
print('People keep same ticket: {}'.format(len_tk_same))

len_tk_conn = df_data[df_data.Connected_Survival != 0.5]['Connected_Survival'].count()
print('People have connected infomation: {}'.format(len_tk_conn))

col_selected = ['PassengerId','Survived','Family','Ticket','Connected_Survival']
df = df_data[col_selected]

# Connected_Survival x Survived counts. fillna(0) keeps astype(int) from
# failing when a combination does not occur.
ct = pd.crosstab(
    df.Connected_Survival,
    df.Survived,
    values  = df.PassengerId,
    aggfunc = ['count'],
    **kwargs
).fillna(0).astype(int)
ct = ct.rename(columns=lambda x: x.replace('count','Count') if isinstance(x,str) else x)
ct[('Rate','S1')] = (ct[('Count',1.0)]/ct[('Count','Total')]).round(3)
display(ct)

In [None]:
# Explore: Sex

col_selected = ['PassengerId','Survived','Sex']
dset = df_data.loc[:, col_selected]

# Shared pivot options: add a 'Total' margin row and column.
kwargs = dict(margins=True, margins_name='Total')

# Row counts per Sex x Survived, plus the share of Survived == 1.0 per row.
pt = (
    dset
    .pivot_table(index='Sex', columns='Survived', values='PassengerId', aggfunc=len, **kwargs)
    .fillna(0)
    .astype(int)
)
pt['Rate_S1'] = pt[1.0].div(pt['Total']).round(3)

print('\nSurvived vs Sex:')
styled_html = pt.to_html().replace('table','table style="display:inline;margin:30px"')
display_html(styled_html, raw=True)

In [None]:
# Training: Sex
# Fit on the first len(df_train) rows of df_data and predict the remaining rows,
# using Sex_Code as the only feature.

len_train = len(df_train)
tran_set = df_data.iloc[:len_train]
# Independent copy: the predictions are written into test_set below, and that
# write must not target a slice of df_data (SettingWithCopy).
test_set = df_data.iloc[len_train:].copy()

col_selected = ['Sex_Code']
score, preds = trainAndTest(tran_set, test_set, col_selected)
print('Score:', score)

test_set['Survived'] = preds

# Predicted outcome per Sex on the test rows.
col_selected = ['PassengerId','Survived','Sex']
dset = test_set[col_selected]
pt = pd.pivot_table(
    dset,
    index   = 'Sex', 
    columns = 'Survived', 
    values  = 'PassengerId', 
    aggfunc = len,
    **kwargs
).fillna(0).astype(int)
pt['Rate_S1'] = (pt[1.0] / pt.Total).round(3)

print('\nTest result:')
display_html(pt.to_html().replace('table','table style="display:inline;margin:30px"'), raw=True)

# score_str = int(round(score_sex * 10000))
# submit = join(OutPath, 'Submission_v5-2-1_{}.csv'.format(score_str))
# print("'{}' has been saved.".format(submit))

# out_set = resu_set[['PassengerId', 'Survived']]
# out_set.to_csv(submit, index=False)

In [None]:
# Explore Pclass

# Row counts per Pclass x Survived, plus the share of Survived == 1.0 per row.
col_selected = ['PassengerId','Survived','Sex_Code','Pclass']
df = df_data.loc[:, col_selected]

pt = (
    df
    .pivot_table(index='Pclass', columns='Survived', values='PassengerId', aggfunc=len, **kwargs)
    .fillna(0)
    .astype(int)
)
pt['Rate_S1'] = pt[1.0].div(pt['Total']).round(4)

print('Pclass vs Survived:')
styled_html = pt.to_html().replace('table','table style="display:inline;margin:30px"')
display_html(styled_html, raw=True)


# Same table, broken down by (Sex, Pclass).
col_selected = ['PassengerId','Survived','Sex','Pclass']
dset = df_data.loc[:, col_selected]

pt = (
    dset
    .pivot_table(index=['Sex','Pclass'], columns='Survived', values='PassengerId', aggfunc=len, **kwargs)
    .fillna(0)
    .astype(int)
)
pt['Rate_S1'] = pt[1.0].div(pt['Total']).round(4)

print('\nSex & Pclass vs Survived:')
styled_html = pt.to_html().replace('table','table style="display:inline;margin:30px"')
display_html(styled_html, raw=True)

In [None]:
# Does splitting Pclass into more than 3 classes (by combining it with Sex) improve the prediction accuracy?
# The answer is no, so the 'Pclass_Code' column is dropped again at the end of this cell.

def getPclassCode(row):
    """Map a row's ``Sex`` and ``Pclass`` to a fixed numeric code.

    ==========  ==========  ====
    Sex         Pclass      code
    ==========  ==========  ====
    'female'    1 or 2      1
    'female'    anything    0.5
    other       1           0.3
    other       anything    0.1
    ==========  ==========  ====

    ``row`` only needs ``Sex`` and ``Pclass`` attributes.
    """
    if row.Sex != 'female':
        return 0.3 if row.Pclass == 1 else 0.1
    if row.Pclass in (1, 2):
        return 1
    return 0.5
    
# Temporary feature: one code per (Sex, Pclass) combination.
df_data['Pclass_Code'] = df_data.apply(getPclassCode, axis=1)
display(df_data['Pclass_Code'].describe())

# Training and predict
len_train = len(df_train)
tran_set, test_set = df_data[:len_train], df_data[len_train:]

col_selected = ['Sex_Code', 'Pclass_Code']
score, preds = trainAndTest(tran_set, test_set, col_selected)

print('Score:', score)

# The experimental column is removed again once the score has been printed.
df_data.drop(columns='Pclass_Code', inplace=True)