In [None]:
import os, sys
import json
import math
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import re
import seaborn as sns 

from os.path import join 
from sklearn import preprocessing
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import LabelEncoder
from IPython.display import display_html

import warnings
# NOTE: called without a category or module filter, so every warning is
# silenced for the rest of the session, including deprecation notices
# raised by imported libraries.
warnings.filterwarnings("ignore")

In [None]:
# Input / output locations.
# NOTE(review): these are absolute, machine-specific paths; adjust them before
# running the notebook on another machine.
DataPath = r'D:\GitWork\titanic\data'
OutPath  = r'D:\GitWork\titanic\output'

f_abspath = join(DataPath, 'train.csv')
df_train = pd.read_csv(f_abspath)

f_abspath = join(DataPath, 'test.csv')
df_test = pd.read_csv(f_abspath)

# Stack the training rows on top of the test rows. DataFrame.append was
# removed in pandas 2.0; pd.concat with default arguments builds the same
# frame (row labels are kept as-is, so index values repeat across both parts).
df_data = pd.concat([df_train, df_test])
display(df_data.head())
display(df_data.describe())
# info() prints its report itself and returns None, so it is called directly
# rather than wrapped in display(), which would additionally render "None".
df_data.info()

In [None]:
# PPS (Predictive Power Score)
import ppscore as pps # importing ppscore

# Restrict the PPS input to every column except PassengerId and Cabin.
col_selected = df_data.columns.drop(['PassengerId', 'Cabin'])
df_pps = df_data[col_selected]

# Long-form x / y / ppscore rows -> square matrix (rows: y, columns: x),
# rounded to two decimals so the heatmap annotations stay short.
pps_long = pps.matrix(df_pps)[['x', 'y', 'ppscore']]
matrix_df = pps_long.pivot(index='y', columns='x', values='ppscore').round(2)

fig, ax = plt.subplots(figsize=(10,8))
sns.heatmap(
    matrix_df,
    vmin=0,
    vmax=1,
    cmap="Blues",
    linewidths=0.75,
    annot=True,
    ax=ax,
)

In [None]:
# Show null-value counts for the train, test, and combined data sets
def display_side_by_side(*args):
    """Render several DataFrames next to each other in a single output.

    Each frame in ``args`` is converted with ``to_html()``, the markup is
    concatenated, every opening ``<table`` tag is given an inline style, and
    the result is handed to ``display_html`` as raw HTML.

    Only the opening tag is rewritten. Replacing the bare word ``table``
    would also corrupt the ``</table>`` closing tags and any column name or
    cell text that happens to contain "table".

    Parameters
    ----------
    *args : DataFrame
        Frames to show, left to right in the order given.

    Returns
    -------
    None
    """
    inline_open_tag = '<table style="display:inline;margin:10px"'
    html_str = ''.join(df.to_html() for df in args)
    display_html(html_str.replace('<table', inline_open_tag), raw=True)

# Per-column null counts, one single-column frame per data set
df1 = df_train.isna().sum().to_frame(name='Train null values')
df2 = df_test.isna().sum().to_frame(name='Test null values')
df3 = df_data.isna().sum().to_frame(name='All null values')

display_side_by_side(df1, df2, df3)

In [None]:
def trainAndTest(train_df, test_df, col_selected):
    """Fit a random forest on the training rows and predict the test rows.

    Parameters
    ----------
    train_df : DataFrame
        Must contain the columns listed in ``col_selected`` plus a
        ``Survived`` column, which is used as the target.
    test_df : DataFrame
        Must contain the columns listed in ``col_selected``. It is not
        modified.
    col_selected : list of str
        Names of the feature columns.

    Returns
    -------
    score : float
        Out-of-bag score of the fitted model (``model.oob_score_``).
    result_df : DataFrame
        Copy of ``test_df`` whose ``Survived`` column holds the integer
        predictions.
    """
    # Training
    trainX = train_df[col_selected]
    trainY = train_df.Survived

    model = RandomForestClassifier(
        random_state = 2,
        n_estimators = 250,
        min_samples_split = 20,
        oob_score = True
    ).fit(trainX, trainY)

    score = model.oob_score_

    # Test: predictions are written into a copy, so the caller's frame (here
    # typically a slice of a larger DataFrame) is left untouched and no
    # chained-assignment write takes place.
    result_df = test_df.copy()
    result_df['Survived'] = model.predict(result_df[col_selected]).astype(int)

    return score, result_df

# def pivotSurvivalRate(df, idx, cols, val):
#     kwargs = { 'margins': True, 'margins_name': 'Total'}
#     pt = pd.pivot_table(
#         df,
#         values  = val,
#         index   = idx,
#         columns = cols,
#         aggfunc=len,
#         **kwargs
#     ).fillna(0).rename(columns={0.0: 'S0', 1.0: 'S1'})
#     pt['Rate_S1'] = (pt.S1 / pt.Total).round(3)
#     return pt

In [None]:
# For Sex

# Encode Sex as integer codes
le_sex = LabelEncoder()
df_data['Sex_Code'] = le_sex.fit_transform(df_data.Sex)

# Dict Sex: integer code -> original label
dict_sex = dict(zip(le_sex.transform(le_sex.classes_), le_sex.classes_))
display(dict_sex)

# Training and predict
len_train = len(df_train)
# Positional split: the first len(df_train) rows go to training, the rest to test
train_set, test_set = df_data.iloc[:len_train], df_data.iloc[len_train:]

col_selected = ['Sex_Code']
score, resu_set = trainAndTest(train_set, test_set, col_selected)
print('Score:', score)

print( '\nTotal passengers is {}'.format(len(resu_set)) )

# Number of rows per Sex and predicted Survived value; missing combinations become 0
pt = (
    resu_set[['PassengerId','Survived','Sex']]
    .pivot_table(
        index='Sex',
        columns='Survived',
        values='PassengerId',
        aggfunc=len,
    )
    .fillna(0)
    .astype(int)
)
# Row total divided by the number of rows in resu_set
pt['Rate'] = (pt.sum(axis=1) / len(resu_set)).round(3)
print('\nTest result:')
display(pt)

# The score, scaled by 10000 and rounded to an integer, becomes part of the file name
score_str = int(round(score * 10000))
submit = join(OutPath, 'Submission_v5-2-1_{}.csv'.format(score_str))

out_set = resu_set[['PassengerId', 'Survived']]
# Create the output folder if needed, and report success only after the
# file has actually been written.
os.makedirs(OutPath, exist_ok=True)
out_set.to_csv(submit, index=False)
print("'{}' has been saved.".format(submit))

In [None]:
# For Pclass
kwargs = { 'margins': True, 'margins_name': 'Total'}

# Row counts per (Pclass, Sex) and Survived value, with 'Total' margins;
# the float column labels 0.0 / 1.0 are renamed to the integers 0 / 1.
pt = (
    df_data
    .pivot_table(
        values='PassengerId',
        index=['Pclass', 'Sex'],
        columns='Survived',
        aggfunc=len,
        **kwargs
    )
    .fillna(0)
    .rename(columns={0.0:0, 1.0:1})
)
# Count in column 1 divided by the row total
pt['Rate_S1'] = (pt[1] / pt['Total']).round(3)
display(pt)

# Add Pclass code column 
def pclassCode(row, pclass):
    """Return 1 if ``row.Sex`` is 'female' and ``row.Pclass`` equals ``pclass``, else 0."""
    # Both comparisons are evaluated up front, before they are combined.
    is_female = row.Sex == 'female'
    in_class = row.Pclass == pclass
    if is_female and in_class:
        return 1
    return 0
    
# One 0/1 column per passenger class, filled row by row with pclassCode
for flag_col, pclass_value in (('Pclass_1', 1), ('Pclass_2', 2)):
    df_data[flag_col] = df_data.apply(pclassCode, axis=1, args=(pclass_value,))

# Preview only the columns whose names match the pattern below
df_tmp = df_data.filter(regex='PassengerId|Survived|Sex.*|Pclass.*')
display(df_tmp.head())

# Training and predict
len_train = len(df_train)
# Positional split: the first len(df_train) rows go to training, the rest to test
train_set, test_set = df_data.iloc[:len_train], df_data.iloc[len_train:]

col_selected = ['Sex_Code', 'Pclass_1', 'Pclass_2']
score, resu_set = trainAndTest(train_set, test_set, col_selected)
print('Score:', score)

print( '\nTotal passengers is {}'.format(len(resu_set)) )

# Number of rows per (Pclass, Sex) and predicted Survived value; missing combinations become 0
pt = (
    resu_set[['PassengerId','Survived','Sex','Pclass']]
    .pivot_table(
        index=['Pclass', 'Sex'],
        columns='Survived',
        values='PassengerId',
        aggfunc=len,
    )
    .fillna(0)
    .astype(int)
)
# Row total divided by the number of rows in resu_set
pt['Rate'] = (pt.sum(axis=1) / len(resu_set)).round(3)
print('\nTest result:')
display(pt)

# The score, scaled by 10000 and rounded to an integer, becomes part of the file name
score_str = int(round(score * 10000))
submit = join(OutPath, 'Submission_v5-2-2_{}.csv'.format(score_str))

out_set = resu_set[['PassengerId', 'Survived']]
# Create the output folder if needed, and report success only after the
# file has actually been written.
os.makedirs(OutPath, exist_ok=True)
out_set.to_csv(submit, index=False)
print("'{}' has been saved.".format(submit))