In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
# Let pandas show up to 100 rows before it truncates a displayed frame/series.
pd.set_option('display.max_rows', 100)

In [69]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

In [3]:
# Locations of the three CSV files, all inside the local `data` folder.
train_path, test_path, sub_path = (
    os.path.join('data', csv_name)
    for csv_name in ('train.csv', 'test.csv', 'gender_submission.csv')
)

In [4]:
# Load the training set, the test set and the sample submission as DataFrames.
train, test, sub = (
    pd.read_csv(csv_path) for csv_path in (train_path, test_path, sub_path)
)

# data 확인

In [5]:
# (rows, columns) of each loaded frame, in the order train / test / sub.
tuple(frame.shape for frame in (train, test, sub))

((891, 12), (418, 11), (418, 2))

In [6]:
# Peek at the first two training rows.
train.head(2)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C


In [7]:
# Peek at the first two test rows.
test.head(2)

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S


In [8]:
# Peek at the first two rows of the sample submission.
sub.head(2)

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,1


In [9]:
# Keep independent copies of the frames as loaded, before later cells modify them.
train_copy = train.copy()
test_copy = test.copy()

In [10]:
# Find the column to predict (the label): present in train, absent from test.
set(train.columns).difference(test.columns)

{'Survived'}

In [11]:
# Number of training rows; later used to split the combined frame back apart.
train_index = len(train)
# Target column, captured before it is removed from the feature frame.
label = train['Survived']

In [12]:
# Remove the label from train, then stack train and test into one feature frame.
# errors='ignore' keeps this cell re-runnable: on a second run 'Survived' is
# already gone and a plain drop would raise KeyError.
train = train.drop(columns='Survived', errors='ignore')
# ignore_index=True renumbers the combined rows 0..n-1 (train rows first).
total_df = pd.concat([train, test], ignore_index=True)

In [42]:
# Keep the passenger ids aside; the PassengerId column is dropped from total_df later.
ID = total_df['PassengerId']

In [13]:
# Missing values per column: absolute counts first, then as a percentage of all rows.
na_counts = total_df.isna().sum()
print(na_counts)
print(na_counts / len(total_df) * 100)

PassengerId       0
Pclass            0
Name              0
Sex               0
Age             263
SibSp             0
Parch             0
Ticket            0
Fare              1
Cabin          1014
Embarked          2
dtype: int64
PassengerId     0.000000
Pclass          0.000000
Name            0.000000
Sex             0.000000
Age            20.091673
SibSp           0.000000
Parch           0.000000
Ticket          0.000000
Fare            0.076394
Cabin          77.463713
Embarked        0.152788
dtype: float64


In [14]:
# Cabin is missing for about 77% of the rows -> drop the column.
# errors='ignore' makes the cell safe to re-run once Cabin has been removed.
total_df = total_df.drop(columns='Cabin', errors='ignore')

In [15]:
# Inspect summary statistics -> numerical dtypes only
total_df.describe()

Unnamed: 0,PassengerId,Pclass,Age,SibSp,Parch,Fare
count,1309.0,1309.0,1046.0,1309.0,1309.0,1308.0
mean,655.0,2.294882,29.881138,0.498854,0.385027,33.295479
std,378.020061,0.837836,14.413493,1.041658,0.86556,51.758668
min,1.0,1.0,0.17,0.0,0.0,0.0
25%,328.0,2.0,21.0,0.0,0.0,7.8958
50%,655.0,3.0,28.0,0.0,0.0,14.4542
75%,982.0,3.0,39.0,1.0,0.0,31.275
max,1309.0,3.0,80.0,8.0,9.0,512.3292


In [63]:
# Fill missing Fare values with the mean Fare of the training rows.
total_df['Fare'] = total_df['Fare'].fillna(train['Fare'].mean())

In [16]:
# Fill missing Age values with the mean age, truncated to a whole number.
# The mean is computed on the training rows only (the same source the Fare
# imputation uses) so that test-set values do not leak into the fill value.
# Other imputation strategies are possible.
age_fill = int(train['Age'].mean())
total_df['Age'] = total_df['Age'].fillna(age_fill)

In [17]:
# Show the rows whose Embarked value is missing.
missing_embarked = total_df['Embarked'].isna()
total_df[missing_embarked]

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked
61,62,1,"Icard, Miss. Amelie",female,38.0,0,0,113572,80.0,
829,830,1,"Stone, Mrs. George Nelson (Martha Evelyn)",female,62.0,0,0,113572,80.0,


In [18]:
# Mean Fare for each Embarked value.
total_df.groupby('Embarked')['Fare'].mean()

Embarked
C    62.336267
Q    12.409012
S    27.418824
Name: Fare, dtype: float64

In [19]:
# Fill missing Embarked with the port whose mean fare is closest to the fare
# paid by the rows that lack it.
total_df['Embarked'] = total_df['Embarked'].fillna('C')

In [20]:
# Number of distinct values in every column.
for col_name, n_unique in total_df.nunique().items():
    print(f'Column : {col_name} \nunique 수 : {n_unique}')

Column : PassengerId 
unique 수 : 1309
Column : Pclass 
unique 수 : 3
Column : Name 
unique 수 : 1307
Column : Sex 
unique 수 : 2
Column : Age 
unique 수 : 98
Column : SibSp 
unique 수 : 7
Column : Parch 
unique 수 : 8
Column : Ticket 
unique 수 : 929
Column : Fare 
unique 수 : 281
Column : Embarked 
unique 수 : 3


In [21]:
# Look at the distinct values of the categorical columns.
cat_cols = ['Pclass', 'Sex', 'SibSp', 'Parch', 'Embarked']
for col_name in cat_cols:
    distinct_values = total_df[col_name].unique()
    print(f'Column : {col_name} \nunique : {distinct_values}')

Column : Pclass 
unique : [3 1 2]
Column : Sex 
unique : ['male' 'female']
Column : SibSp 
unique : [1 0 3 4 2 5 8]
Column : Parch 
unique : [0 1 2 5 3 4 6 9]
Column : Embarked 
unique : ['S' 'C' 'Q']


# Feature Engineering

In [22]:
# Family size feature: SibSp plus Parch.
total_df['fam_size'] = total_df['SibSp'].add(total_df['Parch'])

In [23]:
# PassengerId and Name identify the same passenger -> drop Name.
# errors='ignore' makes the cell safe to re-run once Name has been removed.
total_df = total_df.drop(columns='Name', errors='ignore')

In [24]:
# One-hot encode Sex into two 0/1 indicator columns.
total_df['male'] = (total_df['Sex'] == 'male').astype(np.uint8)
total_df['female'] = (total_df['Sex'] == 'female').astype(np.uint8)

In [25]:
# One-hot encode Pclass: one 0/1 indicator column per class value.
for pclass_value in (1, 2, 3):
    total_df[f'Pclass-{pclass_value}'] = (total_df['Pclass'] == pclass_value).astype(np.uint8)

In [26]:
# One-hot encode Embarked: one 0/1 indicator column per port code.
for port_code in ('S', 'C', 'Q'):
    total_df[f'Embarked-{port_code}'] = (total_df['Embarked'] == port_code).astype(np.uint8)

In [27]:
# Check the frame after adding the family-size and one-hot columns.
total_df.head()

Unnamed: 0,PassengerId,Pclass,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked,fam_size,male,female,Pclass-1,Pclass-2,Pclass-3,Embarked-S,Embarked-C,Embarked-Q
0,1,3,male,22.0,1,0,A/5 21171,7.25,S,1,1,0,0,0,1,1,0,0
1,2,1,female,38.0,1,0,PC 17599,71.2833,C,1,0,1,1,0,0,0,1,0
2,3,3,female,26.0,0,0,STON/O2. 3101282,7.925,S,0,0,1,0,0,1,1,0,0
3,4,1,female,35.0,1,0,113803,53.1,S,1,0,1,1,0,0,1,0,0
4,5,3,male,35.0,0,0,373450,8.05,S,0,1,0,0,0,1,1,0,0


In [28]:
# Distribution of Age, used to pick the cut-off for the age groups below.
total_df['Age'].describe()

count    1309.000000
mean       29.704102
std        12.888034
min         0.170000
25%        22.000000
50%        29.000000
75%        35.000000
max        80.000000
Name: Age, dtype: float64

In [32]:
# 75% of the passengers are 35 or younger, so split Age into two groups:
# 35 and under ("young") versus over 35 ("old").
# The indicators are derived without overwriting Age itself; overwriting it
# with 0/1 made the cell non-idempotent (a second run would see only 0/1
# values, all <= 35, and flag every passenger as young).
age_cutoff = 35
is_old = ~(total_df['Age'] <= age_cutoff)
total_df['Age_young'] = (~is_old).astype(np.int64)
total_df['Age_old'] = is_old.astype(np.int64)

In [43]:
# Column cleanup: remove the columns that are not used as model inputs.
# errors='ignore' lets the cell be re-run without a KeyError once they are gone.
total_df = total_df.drop(
    columns=['PassengerId', 'Sex', 'SibSp', 'Parch', 'Ticket', 'Embarked', 'Pclass', 'Age'],
    errors='ignore',
)

In [46]:
# Check the final feature columns.
total_df.head()

Unnamed: 0,Fare,fam_size,male,female,Pclass-1,Pclass-2,Pclass-3,Embarked-S,Embarked-C,Embarked-Q,Age_young,Age_old
0,7.25,1,1,0,0,0,1,1,0,0,1,0
1,71.2833,1,0,1,1,0,0,0,1,0,0,1
2,7.925,0,0,1,0,0,1,1,0,0,1,0
3,53.1,1,0,1,1,0,0,1,0,0,1,0
4,8.05,0,1,0,0,0,1,1,0,0,1,0


In [64]:
# Split the combined feature frame back apart by position:
# the first train_index rows are the training features, the rest are the test features.
x_tr = total_df.iloc[:train_index]
x_tst = total_df.iloc[train_index:]

In [65]:
# Training labels. label holds exactly train_index rows, so this slice is the whole series.
y_tr = label[:train_index]
# NOTE: label comes from train only, so slicing from train_index onward yields an
# empty series -- there are no ground-truth labels for the test rows.
y_tst = label[train_index:]

In [66]:
# Baseline random forest with default hyper-parameters and a fixed random_state.
rf = RandomForestClassifier(random_state=1207)

In [67]:
# Train the forest on the training features and labels.
rf.fit(x_tr, y_tr)

RandomForestClassifier(random_state=1207)

In [68]:
# Predict Survived for the test feature rows.
y_pred = rf.predict(x_tst)

In [77]:
# Put the predictions into the sample-submission frame.
# The assignment is positional, so first verify that the submission rows line up
# one-to-one with the test rows the predictions were made for.
assert len(sub) == len(y_pred), 'prediction count does not match submission rows'
assert (sub['PassengerId'].to_numpy() == test['PassengerId'].to_numpy()).all(), \
    'submission PassengerId order differs from test set'
sub['Survived'] = y_pred

In [79]:
# Write the submission file. Create the output folder first: to_csv does not
# create missing directories and would fail on a fresh checkout.
os.makedirs('./submissions', exist_ok=True)
sub.to_csv('./submissions/titanic_submission_rf_baseline.csv',index=False)