In [1]:
import os, sys
import json
import math
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns 

from os.path import join 
from sklearn import preprocessing
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import LabelEncoder
from IPython.display import display_html

import warnings
warnings.filterwarnings("ignore")

In [3]:
# Data locations. The defaults are the original machine-specific folders, so
# behaviour is unchanged when nothing is configured; set TITANIC_DATA_DIR /
# TITANIC_OUT_DIR in the environment to run the notebook on another machine
# without editing this cell.
DataPath = os.environ.get('TITANIC_DATA_DIR', r'D:\GitWork\titanic\data')
OutPath  = os.environ.get('TITANIC_OUT_DIR', r'D:\GitWork\titanic\output')

# Training set: preview the first rows and the numeric summary.
f_abspath = join(DataPath, 'train.csv')
df_train = pd.read_csv(f_abspath)
display(df_train.head())
display(df_train.describe())

# Test set: same preview.
f_abspath = join(DataPath, 'test.csv')
df_test = pd.read_csv(f_abspath)
display(df_test.head())
display(df_test.describe())

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


Unnamed: 0,PassengerId,Pclass,Age,SibSp,Parch,Fare
count,418.0,418.0,332.0,418.0,418.0,417.0
mean,1100.5,2.26555,30.27259,0.447368,0.392344,35.627188
std,120.810458,0.841838,14.181209,0.89676,0.981429,55.907576
min,892.0,1.0,0.17,0.0,0.0,0.0
25%,996.25,1.0,21.0,0.0,0.0,7.8958
50%,1100.5,3.0,27.0,0.0,0.0,14.4542
75%,1204.75,3.0,39.0,1.0,0.0,31.5
max,1309.0,3.0,76.0,8.0,9.0,512.3292


In [39]:
# Preprocess Sex and Fare: label-encode Sex and bin Fare into quartiles.
features = ['PassengerId', 'Survived', 'Sex', 'Pclass', 'Fare']
# Take an explicit copy. Columns are added to df_data below, and assigning
# into a selection of df_train triggers pandas' chained-assignment warning
# (not visible here because warnings are globally ignored in this notebook).
df_data = df_train[features].copy()

# For Sex mapping
le = LabelEncoder()
df_data.insert(3, 'Sex_Code', le.fit_transform(df_data.Sex))
display(df_data.head())

# For fare mapping
# `le` is refit on the fare bins, so after these lines it holds the
# Bin4 interval -> code mapping and no longer the Sex mapping.
df_data['Bin4'] = pd.qcut(df_data.Fare, 4)
df_data['Bin4_Code'] = le.fit_transform(df_data.Bin4)

print('\nBin4 classes dict:')

def displayLeDict(le):
    """Display the code -> class mapping of a fitted label encoder.

    Builds a dict whose keys are the codes returned by ``le.transform`` for
    each entry of ``le.classes_`` and whose values are the classes
    themselves, then shows it with ``display``.

    Parameters
    ----------
    le : object exposing ``classes_`` and ``transform`` (e.g. a fitted
        LabelEncoder).

    Returns
    -------
    None
    """
    code_to_class = {}
    for label in le.classes_:
        # transform works on sequences, so wrap the single label and unwrap
        # the single resulting code.
        code = le.transform([label])[0]
        code_to_class[code] = label
    display(code_to_class)

# Show the code -> class mapping currently held by the encoder.
displayLeDict(le)

# Mean Fare for every (Pclass, Bin4_Code) row, split into Survived columns.
indexes = [df_data['Pclass'], df_data['Bin4_Code']]
columns = [df_data['Survived']]
# NOTE(review): kwargs is defined but never passed to crosstab below, so no
# 'Total' margins are computed here. Kept so the cell's variables are unchanged.
kwargs = dict(margins=True, margins_name='Total')
ct = pd.crosstab(
    index=indexes,
    columns=columns,
    values=df_data['Fare'],
    aggfunc='mean',
)
ct = ct.round(4)
display(ct)

Unnamed: 0,PassengerId,Survived,Sex,Sex_Code,Pclass,Fare
0,1,0,male,1,3,7.25
1,2,1,female,0,1,71.2833
2,3,1,female,0,3,7.925
3,4,1,female,0,1,53.1
4,5,0,male,1,3,8.05



Bin4 classes dict:


{0: Interval(-0.001, 7.91, closed='right'),
 1: Interval(7.91, 14.454, closed='right'),
 2: Interval(14.454, 31.0, closed='right'),
 3: Interval(31.0, 512.329, closed='right')}

Unnamed: 0_level_0,Survived,0,1
Pclass,Bin4_Code,Unnamed: 2_level_1,Unnamed: 3_level_1
1,0,0.8333,
1,2,27.9075,28.0449
1,3,89.9988,112.3438
2,0,0.0,
2,1,12.1892,12.3292
2,2,24.3621,24.0374
2,3,55.4833,41.8677
3,0,7.3967,7.4629
3,1,9.07,9.6964
3,2,20.411,18.2909


In [32]:
# Mean Fare for each (Pclass, Bin4_Code, Survived) combination, as a Series
# with a three-level index.
columns = [df_data['Pclass'], df_data['Bin4_Code'], df_data['Survived']]
fare_groups = df_data['Fare'].groupby(columns)
sector = fare_groups.mean()
display(sector)

# TODO(review): an earlier draft label-encoded Bin4 into a 'PclassBin_Code'
# column and previewed df_data.head(10); it was disabled. Decide whether that
# feature is still wanted.

Pclass  Bin4_Code  Survived
1       0          0             0.833333
        2          0            27.907458
                   1            28.044907
        3          0            89.998832
                   1           112.343848
2       0          0             0.000000
        1          0            12.189151
                   1            12.329167
        2          0            24.362050
                   1            24.037400
        3          0            55.483340
                   1            41.867717
3       0          0             7.396727
                   1             7.462884
        1          0             9.070025
                   1             9.696431
        2          0            20.410991
                   1            18.290878
        3          0            45.672094
                   1            47.080188
Name: Fare, dtype: float64

In [None]:
# Cross table of Pclass vs Survived with row/column totals, plus the share of
# survivors (column 1) per row.
# NOTE(review): df_tmp and plotBarUnstack are not defined in the cells shown
# above this one -- confirm they exist before this cell is run, otherwise it
# fails with NameError on a fresh kernel.
ct = pd.crosstab(
    index=df_tmp['Pclass'],
    columns=df_tmp['Survived'],
    margins=True,
    margins_name='Total',
)
ct['S1_Rate'] = ct[1].div(ct['Total']).round(3)
display(ct)

# Passenger counts per Pclass, one column per outcome.
df_plot = pd.DataFrame({
    'dead' : df_tmp.loc[df_tmp['Survived'] == 0, 'Pclass'].value_counts(),
    'alive': df_tmp.loc[df_tmp['Survived'] == 1, 'Pclass'].value_counts(),
})

fig, ax = plt.subplots(figsize=(8, 5))
plotBarUnstack(df_plot, ax, 'Pclass & Survived', 'Pclass', 'Count')
plt.show()