In [1]:
import os, sys
import json
import math
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import re
import seaborn as sns 

from os.path import join 
from sklearn import preprocessing
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import LabelEncoder
from IPython.display import display_html

import warnings
warnings.filterwarnings("ignore")

In [2]:
DataPath = r'D:\GitWork\titanic\data'
OutPath  = r'D:\GitWork\titanic\output'

f_abspath = join(DataPath, 'train.csv')
df_train = pd.read_csv(f_abspath)

f_abspath = join(DataPath, 'test.csv')
df_test = pd.read_csv(f_abspath)

df_data = df_train.append(df_test)
display(df_data.info())
display(df_data.head())

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1309 entries, 0 to 417
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  1309 non-null   int64  
 1   Survived     891 non-null    float64
 2   Pclass       1309 non-null   int64  
 3   Name         1309 non-null   object 
 4   Sex          1309 non-null   object 
 5   Age          1046 non-null   float64
 6   SibSp        1309 non-null   int64  
 7   Parch        1309 non-null   int64  
 8   Ticket       1309 non-null   object 
 9   Fare         1308 non-null   float64
 10  Cabin        295 non-null    object 
 11  Embarked     1307 non-null   object 
dtypes: float64(3), int64(4), object(5)
memory usage: 132.9+ KB


None

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0.0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1.0,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1.0,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1.0,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0.0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [3]:
# For Sex
col_select = ['PassengerId', 'Survived', 'Sex']
df_sex = df_data[col_select]

# Use pivot_table
kwargs = { 'margins': True, 'margins_name': 'Total' }
pt = df_sex.pivot_table(
    values='PassengerId',
    index='Sex',
    columns='Survived',
    aggfunc='count',
    **kwargs
)
pt.rename(columns={0.0:0, 1.0:1}, inplace=True)
pt['Rate_S1'] = (pt[1] / pt.Total).round(3)
display(pt)

# SexCode trandform
le_sex = LabelEncoder()
df_sex['SexCode'] = le_sex.fit_transform(df_sex.Sex)
print()
display(df_sex.info())
display(df_sex.head())

dict_sex = { le_sex.transform([x])[0]:x for x in le_sex.classes_ }
display(dict_sex)

# Merge SexCode
# col_select = ['PassengerId', 'SexCode']
# df_data = df_data.merge(df_sex[col_select], on="PassengerId", how="left")
df_data['SexCode'] = df_sex.SexCode.copy()
display(df_data.info())

Survived,0,1,Total,Rate_S1
Sex,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
female,81,233,314,0.742
male,468,109,577,0.189
Total,549,342,891,0.384



<class 'pandas.core.frame.DataFrame'>
Int64Index: 1309 entries, 0 to 417
Data columns (total 4 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  1309 non-null   int64  
 1   Survived     891 non-null    float64
 2   Sex          1309 non-null   object 
 3   SexCode      1309 non-null   int32  
dtypes: float64(1), int32(1), int64(1), object(1)
memory usage: 46.0+ KB


None

Unnamed: 0,PassengerId,Survived,Sex,SexCode
0,1,0.0,male,1
1,2,1.0,female,0
2,3,1.0,female,0
3,4,1.0,female,0
4,5,0.0,male,1


{0: 'female', 1: 'male'}

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1309 entries, 0 to 417
Data columns (total 13 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  1309 non-null   int64  
 1   Survived     891 non-null    float64
 2   Pclass       1309 non-null   int64  
 3   Name         1309 non-null   object 
 4   Sex          1309 non-null   object 
 5   Age          1046 non-null   float64
 6   SibSp        1309 non-null   int64  
 7   Parch        1309 non-null   int64  
 8   Ticket       1309 non-null   object 
 9   Fare         1308 non-null   float64
 10  Cabin        295 non-null    object 
 11  Embarked     1307 non-null   object 
 12  SexCode      1309 non-null   int32  
dtypes: float64(3), int32(1), int64(4), object(5)
memory usage: 138.1+ KB


None

In [16]:
# For Fare
col_select = ['PassengerId', 'Survived', 'Fare']
df_fare = df_data[col_select]

pid_list = df_data[df_data.Fare.isnull()]['PassengerId'].tolist()
display(df_data.loc[df_data.PassengerId.isin(pid_list)])

df_fare.Fare.fillna(df_fare.Fare.mean())
display(df_fare.Fare.describe())




# # Use crosstab
# kwargs = { 'margins': True, 'margins_name': 'Total' }
# ct = pd.crosstab(
#     index   = [df_pcl.Fare],
#     columns = df_pcl.Survived,
#     values  = df_pcl.PassengerId,
#     aggfunc = len, 
#     **kwargs
# )
# ct.rename(columns={0.0:0, 1.0:1}, inplace=True)
# ct['Ratio_S1'] = (ct[1] / ct.Total).round(3)
# display(ct)


# Survived vs Pclass & Sex
# idx_list = [x1*10+x2 for x1, x2 in ct.index.tolist() if isinstance(x1, int)]
# dict_pcl = dict(zip(idx_list, (ct[1] / ct.Total).round(1)))
# print(dict_pcl)

# df_pcl['PclassCode'] = None
# df_pcl.PclassCode = (df_pcl.Pclass*10+df_pcl.SexCode).map(dict_pcl)
# display(df_pcl)

# # Merge PclassCode
# # col_select = ['PassengerId', 'PclassCode']
# # df_data = df_data.merge(df_pcl[col_select], on="PassengerId", how="left")
# df_data['PclassCode'] = df_pcl.PclassCode.copy()
# display(df_data.info())

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,SexCode
152,1044,0.0,3,"Storey, Mr. Thomas",male,60.5,0,0,3701,,,S,1


count    1308.000000
mean       33.295479
std        51.758668
min         0.000000
25%         7.895800
50%        14.454200
75%        31.275000
max       512.329200
Name: Fare, dtype: float64

In [5]:
# For Age
display(df_data.Age.describe())

count    1046.000000
mean       29.881138
std        14.413493
min         0.170000
25%        21.000000
50%        28.000000
75%        39.000000
max        80.000000
Name: Age, dtype: float64

In [6]:
# Trainging
len_train = len(df_train)
ds_train = df_data[:len_train]

# If only use sex code to train, the survival prediction will be all 0. 
col_select = ['PassengerId', 'Pclass', 'SexCode']
inps = ds_train[col_select]
lbls = ds_train.Survived
display(inps.head())
display(inps.info())

model_rf = RandomForestClassifier(
    random_state = 2, 
    n_estimators = 250, 
    min_samples_split = 20, 
    oob_score = True
)
model_rf.fit(inps, lbls)

score = model_rf.oob_score_
print('oob score :{:f}'.format(score))

Unnamed: 0,PassengerId,Pclass,SexCode
0,1,3,1
1,2,1,0
2,3,3,0
3,4,1,0
4,5,3,1


<class 'pandas.core.frame.DataFrame'>
Int64Index: 891 entries, 0 to 890
Data columns (total 3 columns):
 #   Column       Non-Null Count  Dtype
---  ------       --------------  -----
 0   PassengerId  891 non-null    int64
 1   Pclass       891 non-null    int64
 2   SexCode      891 non-null    int32
dtypes: int32(1), int64(2)
memory usage: 24.4 KB


None

oob score :0.785634


In [7]:
# display(df_data.info())

# ds_train = df_data[:len_train]

# col_select = ['PassengerId', 'PclassCode', 'SexCode']
# inps = ds_train[col_select]
# lbls = ds_train.Survived
# display(inps.head())
# display(inps.info())

# model_rf = RandomForestClassifier(
#     random_state = 2, 
#     n_estimators = 250, 
#     min_samples_split = 20, 
#     oob_score = True
# )
# model_rf.fit(inps, lbls)

# score = model_rf.oob_score_
# print('oob score :{:f}'.format(score))

In [8]:
# Test

ds_test = df_data[len_train:]

col_select = ['PassengerId', 'Pclass', 'SexCode']
inps = ds_test[col_select]

display(inps.head())
display(inps.info())

ds_test['Survived'] = model_rf.predict(inps)
display(ds_test.head())

Unnamed: 0,PassengerId,Pclass,SexCode
0,892,3,1
1,893,3,0
2,894,2,1
3,895,3,1
4,896,3,0


<class 'pandas.core.frame.DataFrame'>
Int64Index: 418 entries, 0 to 417
Data columns (total 3 columns):
 #   Column       Non-Null Count  Dtype
---  ------       --------------  -----
 0   PassengerId  418 non-null    int64
 1   Pclass       418 non-null    int64
 2   SexCode      418 non-null    int32
dtypes: int32(1), int64(2)
memory usage: 11.4 KB


None

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,SexCode
0,892,0.0,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q,1
1,893,0.0,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S,0
2,894,0.0,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q,1
3,895,0.0,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S,1
4,896,0.0,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S,0


In [9]:
# Analysis of prediction

col_select = ['PassengerId', 'Survived', 'Sex']
ds_tmp = ds_test[col_select]

pt = ds_tmp.pivot_table(
    values='PassengerId',
    index='Sex',
    columns='Survived',
    aggfunc='count',
    **kwargs
).fillna(0).astype(int)

pt.rename(columns={0.0:0, 1.0:1}, inplace=True)
pt['Rate_S1'] = (pt[1] / pt.Total).round(3)
display(pt)

# Save prediction
col_selected = ['PassengerId', 'Survived']
f_abspath = join(OutPath, 'Submission_v5.csv')
ds_test[col_selected].to_csv(f_abspath, index=False)

Survived,0,1,Total,Rate_S1
Sex,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
female,72,80,152,0.526
male,266,0,266,0.0
Total,338,80,418,0.191


In [10]:
print('done')

done
