## 데이터 전처리, 모델 제작용 변수 선택, 생성

- PassengerId int64  승객 id
- Survived int64      0:사망, 1: 생존
- Pclass int64         티켓등급: 1등석, 2등석, 3등석
- Name object       성명
- Sex object          성별
- Age float64         나이
- SibSp int64         승선중인 형제나 배우자의 수
- Parch int64         승선중인 부모나 자녀의 수     
- Ticket object       티켓 번호
- Fare float64         티켓 요금
- Cabin object        객실 번호
- Embarked object  승선한 항구의 이름 C: Cherbourg, Q:Queenstown, S: Southampton

In [53]:
import warnings
warnings.filterwarnings("ignore")

import random
import pandas as pd
import numpy as np

%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns

plt.style.use('ggplot')

In [54]:
# Read the two CSV files that make up the data set.
train_df = pd.read_csv('./data/train.csv')
test_df = pd.read_csv('./data/test.csv')

# Stack both frames row-wise into one frame with a fresh 0..n-1 index.
# sort=False keeps the columns in their existing order instead of sorting them
# (it does not sort rows or the index).
df = pd.concat([train_df, test_df], sort=False, ignore_index=True)
df.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,1309.0,891.0,1309.0,1046.0,1309.0,1309.0,1308.0
mean,655.0,0.383838,2.294882,29.881138,0.498854,0.385027,33.295479
std,378.020061,0.486592,0.837836,14.413493,1.041658,0.86556,51.758668
min,1.0,0.0,1.0,0.17,0.0,0.0,0.0
25%,328.0,0.0,2.0,21.0,0.0,0.0,7.8958
50%,655.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,982.0,1.0,3.0,39.0,1.0,0.0,31.275
max,1309.0,1.0,3.0,80.0,8.0,9.0,512.3292


In [55]:
# Count the missing values in each column.
df.isna().sum()

PassengerId       0
Survived        418
Pclass            0
Name              0
Sex               0
Age             263
SibSp             0
Parch             0
Ticket            0
Fare              1
Cabin          1014
Embarked          2
dtype: int64

### 티켓등급별 중위수를 구하여 Fare가 결측치인 승객의 Fare에 값을 할당

In [56]:
# Derived variable: a new variable built from existing ones (e.g. by arithmetic)
# with the aim of improving model accuracy.
# Mean fare per ticket class. The mean is sensitive to outliers, so the median
# is the recommended statistic.
fare_mean = df.groupby('Pclass')[['Fare']].mean()
fare_mean

Unnamed: 0_level_0,Fare
Pclass,Unnamed: 1_level_1
1,87.508992
2,21.179196
3,13.302889


In [57]:
# Derived variable: a new variable built from existing ones (e.g. by arithmetic)
# with the aim of improving model accuracy.
# Median fare per ticket class, indexed by Pclass.
Fare_median = df.groupby('Pclass')[['Fare']].median()
Fare_median

Unnamed: 0_level_0,Fare
Pclass,Unnamed: 1_level_1
1,60.0
2,15.0458
3,8.05


In [58]:
# Median fare per ticket class, with Pclass turned back into an ordinary column
# so the table can be joined onto df by Pclass.
Fare_median = df.groupby('Pclass')['Fare'].median().reset_index()
Fare_median

Unnamed: 0,Pclass,Fare
0,1,60.0
1,2,15.0458
2,3,8.05


In [59]:
# Rename the aggregated column so it does not clash with df's own 'Fare' column
# when the two frames are merged.
Fare_median = Fare_median.rename(columns={'Fare': 'Fare_median'})
Fare_median

Unnamed: 0,Pclass,Fare_median
0,1,60.0
1,2,15.0458
2,3,8.05


In [60]:
# Attach the derived variable to every passenger.
# Left join on Pclass: every row of df is kept and receives the Fare_median
# of its ticket class.
df = df.merge(Fare_median, on='Pclass', how='left')
df.head(2)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Fare_median
0,1,0.0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,8.05
1,2,1.0,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,60.0


In [61]:
# Boolean mask that is True for rows whose Fare is missing (NaN/null);
# these are the rows to be filled with the class median.
cdt = df['Fare'].isna()
cdt

0       False
1       False
2       False
3       False
4       False
        ...  
1304    False
1305    False
1306    False
1307    False
1308    False
Name: Fare, Length: 1309, dtype: bool

In [62]:
# Show the rows with a missing Fare; a row with Pclass 3 will receive the
# 3rd-class median fare.
df.loc[cdt]

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Fare_median
1043,1044,,3,"Storey, Mr. Thomas",male,60.5,0,0,3701,,,S,8.05


In [63]:
# Fill each missing Fare (NaN/null) with the median fare of the passenger's class.
df.loc[cdt, 'Fare'] = df.loc[cdt, 'Fare_median']
# Re-count missing values to confirm Fare no longer has any.
df.isna().sum()

PassengerId       0
Survived        418
Pclass            0
Name              0
Sex               0
Age             263
SibSp             0
Parch             0
Ticket            0
Fare              0
Cabin          1014
Embarked          2
Fare_median       0
dtype: int64

In [65]:
# Inspect index 1043 (the imputed row) together with its neighbours.
df.loc[1042:1044]

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Fare_median
1042,1043,,3,"Matinoff, Mr. Nicola",male,,0,0,349255,7.8958,,C,8.05
1043,1044,,3,"Storey, Mr. Thomas",male,60.5,0,0,3701,8.05,,S,8.05
1044,1045,,3,"Klasen, Mrs. (Hulda Kristina Eugenia Lofqvist)",female,36.0,0,2,350405,12.1833,,S,8.05


In [67]:
# The helper column is no longer needed once Fare has been filled.
df = df.drop(columns='Fare_median')

In [68]:
# Summary statistics of the numeric columns, re-checked after filling Fare
# and dropping the Fare_median helper column.
df.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,1309.0,891.0,1309.0,1046.0,1309.0,1309.0,1309.0
mean,655.0,0.383838,2.294882,29.881138,0.498854,0.385027,33.276193
std,378.020061,0.486592,0.837836,14.413493,1.041658,0.86556,51.743584
min,1.0,0.0,1.0,0.17,0.0,0.0,0.0
25%,328.0,0.0,2.0,21.0,0.0,0.0,7.8958
50%,655.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,982.0,1.0,3.0,39.0,1.0,0.0,31.275
max,1309.0,1.0,3.0,80.0,8.0,9.0,512.3292
