# 타이타닉 데이터 전처리

In [285]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn import model_selection, linear_model
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import r2_score, mean_squared_error, accuracy_score

import joblib

import warnings
warnings.filterwarnings("ignore")

In [286]:
# Load the raw Titanic passenger table and preview its first five rows.
df = pd.read_csv('titanic.csv')
df.head(n=5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


## X-data 전처리
**Pclass**: [1,2,3], 범주형 데이터   
**Name**: ['Master.','Miss.','Mr.','Mrs.'], 범주형 데이터   
**Sex**: ['male','female'], 범주형 데이터   
**Age**: 결측치는 예측 모델 적용, 수치형 데이터   
**SibSp**: 수치형 데이터   
**Parch**: 수치형 데이터   
**Fare**: 수치형 데이터   
**Cabin**: ['T','A','B','C','D','E','F','G'], 결측치는 예측 모델 적용, 범주형 데이터   
**Embarked**: ['S','C','Q'], 결측치는 최빈 단어 'S'로 채움, 범주형 데이터   
**Family**: [0,1], SibSp + Parch, 범주형 데이터   
<br>
모든 범주형 데이터는 int형 변환 후 One-Hot Encoding 적용   
**PassengerId** 및 **Survived**는 데이터프레임 저장 시에만 사용

In [287]:
# Build the encoded feature table `data_df` from the raw frame `df`.
# Every recoding below is scoped to its own column so that a value in an
# unrelated column can never be rewritten by accident.
data_df = df.copy()

# Ticket is not used as a feature.
del data_df['Ticket']

# Sex: male -> 0, female -> 1 (any other value becomes NaN).
data_df['Sex'] = data_df['Sex'].map({'male': 0, 'female': 1})

# Family: 1 if the passenger travelled with any sibling/spouse/parent/child.
data_df['Family'] = (data_df['SibSp'] + data_df['Parch']).ne(0).astype(int)

del data_df['SibSp']
del data_df['Parch']

# Fare is a relative indicator of economic standing, so it is split into
# 5 quantile-based bins.
data_df['Fare'] = pd.qcut(data_df['Fare'], 5, labels=list(range(5)))

# Embarked: missing values take the most frequent port 'S', then S/C/Q -> 0/1/2.
data_df['Embarked'] = data_df['Embarked'].fillna('S').map({'S': 0, 'C': 1, 'Q': 2})

# Cabin: keep only the deck letter. Deck T is the topmost; A through G are
# progressively lower decks. Missing cabins stay NaN for later imputation.
deck_codes = {'T': 0, 'A': 1, 'B': 2, 'C': 3, 'D': 4, 'E': 5, 'F': 6, 'G': 7}
data_df['Cabin'] = data_df['Cabin'].str[0].map(deck_codes)

# Name: keep only the title (first token after the comma).
# Master. and Miss. are males/females under 18; Mr. and Mrs. are
# males/females aged 18 or over.
# Mlle. is Mademoiselle (same as Miss.), Ms. is treated as an abbreviation of
# Miss., and Mme. is Madame (same as Mrs.).
title_codes = {
    'Master.': 0, 'Mr.': 1, 'Miss.': 2, 'Mrs.': 3,
    'Mlle.': 2, 'Ms.': 2, 'Mme.': 3,
}
titles = data_df['Name'].apply(lambda full: str(full).split(',')[1].split()[0])
name_codes = titles.map(title_codes)  # NaN for every other title

# Any other title gets its code from sex and age: 2 * Sex + adult, which gives
# 0/1 for a male minor/adult and 2/3 for a female minor/adult, matching the
# Master./Mr./Miss./Mrs. codes above.
# A missing age compares as "not under 18" and therefore counts as adult.
# Among the rows with other titles, the only one with a missing age (row 766)
# is a male 'Dr.', which implies Mr., so this default is the intended result.
is_adult = ~(data_df['Age'] < 18)
fallback_codes = 2 * data_df['Sex'] + is_adult.astype(int)
data_df['Name'] = name_codes.fillna(fallback_codes).astype(int)

data_df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,Fare,Cabin,Embarked,Family
0,1,0,3,1,0,22.0,0,,0,1
1,2,1,1,3,1,38.0,4,3.0,1,1
2,3,1,3,2,1,26.0,1,,0,0
3,4,1,1,3,1,35.0,4,3.0,0,1
4,5,0,3,1,0,35.0,1,,0,0


## 결측치 처리
**1**. 나이 결측치에 예측 모델 적용   
~~**2**. 그룹별 운임을 개별 운임으로 변경~~   
~~**3**. 선실 결측치에 [Pclass, Fare] 열을 기반으로 Rule 적용~~   
**2**. 선실 결측치에 예측 모델 적용

### 나이 예측 모델 생성 및 적용

In [288]:
# Preprocessing for the age-imputation model: one-hot encode the three
# categorical predictors; categories unseen during fit are ignored.
categorical_features = ['Name', 'Sex', 'Fare']
categorical_transformer = OneHotEncoder(categories='auto', handle_unknown='ignore')

preprocessor = ColumnTransformer(
    transformers=[('cat', categorical_transformer, categorical_features)],
)
age_pipe = Pipeline(steps=[('preprocessor', preprocessor)])

In [289]:
# Fit the age regressor on the rows that have an Age value
# (the `> 0` comparison is False for NaN, so missing ages are excluded).
age_df = data_df.loc[data_df['Age'] > 0].copy()

# Encoded predictors and a single-column 2-D target.
age_X = age_pipe.fit_transform(age_df)
age_Y = age_df[['Age']].to_numpy()

age_model = linear_model.LinearRegression()
age_model.fit(age_X, age_Y)

LinearRegression()

In [290]:
# Predict Age for the rows where it is missing and write the result back.
test_age_df = data_df[data_df['Age'].isnull()].copy()

test_age_X = age_pipe.transform(test_age_df)

pred_age_y = age_model.predict(test_age_X)

# Write all predictions in one .loc assignment. Chained indexing
# (data_df['Age'][i] = ...) may assign into a temporary copy and leave
# data_df unchanged. np.ravel flattens the (n, 1) prediction array.
data_df.loc[test_age_df.index, 'Age'] = np.ravel(pred_age_y)

# Truncate ages to whole years.
data_df['Age'] = data_df['Age'].astype(int)

# Age is an absolute indicator of health and evacuation priority, so the
# original plan was to split the full range into 5 equal-width bins; quantile
# bins (qcut) gave higher accuracy, so they are used instead.
data_df['Age'] = pd.qcut(data_df['Age'], 5, labels=list(range(5)))

data_df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,Fare,Cabin,Embarked,Family
0,1,0,3,1,0,1,0,,0,1
1,2,1,1,3,1,3,4,3.0,1,1
2,3,1,3,2,1,1,1,,0,0
3,4,1,1,3,1,3,4,3.0,0,1
4,5,0,3,1,0,3,1,,0,0


### 나이 예측 모델 결과 확인

In [291]:
# Store the predicted ages in the missing-age subset and cast them to int so
# the per-title distributions can be inspected in the following cells.
test_age_df['Age'] = pred_age_y
test_age_df['Age'] = test_age_df['Age'].astype(int)

In [292]:
test_age_df.loc[test_age_df['Name'] == 0, 'Age'].value_counts()  # males under 18

0    2
8    1
5    1
Name: Age, dtype: int64

In [293]:
test_age_df.loc[test_age_df['Name'] == 1, 'Age'].value_counts()  # males aged 18 or over

30    77
36    18
31    14
38    11
Name: Age, dtype: int64

In [294]:
test_age_df.loc[test_age_df['Name'] == 2, 'Age'].value_counts()  # females under 18

18    20
24     8
26     4
19     4
Name: Age, dtype: int64

In [295]:
test_age_df.loc[test_age_df['Name'] == 3, 'Age'].value_counts()  # females aged 18 or over

31    8
39    8
37    1
Name: Age, dtype: int64

### 선실 예측 모델 생성 및 적용

In [296]:
# Preprocessing for the cabin (deck) imputation model: every predictor is
# treated as categorical and one-hot encoded.
categorical_features = [
    'Pclass', 'Name', 'Sex', 'Age', 'Fare', 'Embarked', 'Family',
]

cabin_encoder_step = ('cat', categorical_transformer, categorical_features)
preprocessor = ColumnTransformer(transformers=[cabin_encoder_step])

cabin_pipe = Pipeline(steps=[('preprocessor', preprocessor)])

In [297]:
# Fit the cabin regressor on the rows whose deck code is known.
cabin_df = data_df.loc[data_df['Cabin'].notnull()].copy()

# Encoded predictors and a single-column 2-D target of deck codes.
cabin_X = cabin_pipe.fit_transform(cabin_df)
cabin_Y = cabin_df[['Cabin']].to_numpy()

cabin_model = linear_model.LinearRegression()
cabin_model.fit(cabin_X, cabin_Y)

LinearRegression()

In [298]:
# Predict the deck code for rows with a missing Cabin and write it back.
test_cabin_df = data_df[data_df['Cabin'].isnull()].copy()

test_cabin_X = cabin_pipe.transform(test_cabin_df)

pred_cabin_y = cabin_model.predict(test_cabin_X)

# Write all predictions in one .loc assignment. Chained indexing
# (data_df['Cabin'][i] = ...) may assign into a temporary copy and leave
# data_df unchanged. np.ravel flattens the (n, 1) prediction array.
data_df.loc[test_cabin_df.index, 'Cabin'] = np.ravel(pred_cabin_y)

# Truncate the regression output to an integer deck code.
data_df['Cabin'] = data_df['Cabin'].astype(int)

data_df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,Fare,Cabin,Embarked,Family
0,1,0,3,1,0,1,0,6,0,1
1,2,1,1,3,1,3,4,3,1,1
2,3,1,3,2,1,1,1,6,0,0
3,4,1,1,3,1,3,4,3,0,1
4,5,0,3,1,0,3,1,6,0,0


### 선실 예측 모델 결과 확인
선실(갑판)에 관한 정보는 [참고](https://ko.wikipedia.org/wiki/RMS_타이타닉#층별_구조)

In [299]:
# Store the predicted deck codes in the missing-cabin subset and cast them to
# int so the per-class distributions can be inspected in the following cells.
test_cabin_df['Cabin'] = pred_cabin_y
test_cabin_df['Cabin'] = test_cabin_df['Cabin'].astype(int)

In [300]:
test_cabin_df.loc[test_cabin_df['Pclass'] == 1, 'Cabin'].value_counts()  # predicted decks, 1st class

2    25
3    13
1     2
Name: Cabin, dtype: int64

In [301]:
test_cabin_df.loc[test_cabin_df['Pclass'] == 2, 'Cabin'].value_counts()  # predicted decks, 2nd class

4    65
6    53
5    50
Name: Cabin, dtype: int64

In [302]:
test_cabin_df.loc[test_cabin_df['Pclass'] == 3, 'Cabin'].value_counts()  # predicted decks, 3rd class

5    228
6    200
7     45
4      6
Name: Cabin, dtype: int64

## 데이터 확인 및 저장

In [282]:
# Preview the processed table before it is saved.
data_df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,Fare,Cabin,Embarked,Family
0,1,0,3,1,0,1,0,6,0,1
1,2,1,1,3,1,3,4,3,1,1
2,3,1,3,2,1,1,1,6,0,0
3,4,1,1,3,1,3,4,3,0,1
4,5,0,3,1,0,3,1,6,0,0


In [283]:
# Print column dtypes and non-null counts to check for remaining missing values.
data_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 10 columns):
 #   Column       Non-Null Count  Dtype   
---  ------       --------------  -----   
 0   PassengerId  891 non-null    int64   
 1   Survived     891 non-null    int64   
 2   Pclass       891 non-null    int64   
 3   Name         891 non-null    int64   
 4   Sex          891 non-null    int64   
 5   Age          891 non-null    category
 6   Fare         891 non-null    category
 7   Cabin        891 non-null    int64   
 8   Embarked     891 non-null    int64   
 9   Family       891 non-null    int64   
dtypes: category(2), int64(8)
memory usage: 58.0 KB


In [284]:
# Save the processed table with PassengerId as the CSV index column.
data_df.set_index('PassengerId').to_csv('titanic_data.csv')

## Pipeline 생성 및 저장

In [227]:
# Final preprocessing pipeline: one-hot encode all eight feature columns.
categorical_features = [
    'Pclass', 'Name', 'Sex', 'Age', 'Fare', 'Cabin', 'Embarked', 'Family',
]

final_encoder_step = ('cat', categorical_transformer, categorical_features)
preprocessor = ColumnTransformer(transformers=[final_encoder_step])

pipe = Pipeline(steps=[('preprocessor', preprocessor)])

In [228]:
# Serialize the pipeline to disk with compression.
# NOTE(review): no fit call on `pipe` is visible before this point, so it
# appears to be saved unfitted — confirm the loader fits it before use.
joblib.dump(pipe, 'titanic_pipe.pkl', compress=True)

['titanic_pipe.pkl']

## ========== 테스트 코드 ==========

### 데이터 세분화
1. 나이 결측치 행 제거

2. 나이 결측치 평균값 처리
3. 나이 및 선실 결측치 행 제거
4. 나이 및 선실 결측치 평균값 처리
5. 나이 결측치 행 제거 및 나이 구간화

### Data 1. 나이 결측치 행 제거

In [11]:
# Data 1: keep only rows with a known Age (NaN fails the `> 0` test),
# cast Age to int and drop the Cabin column.
data_1 = data_df.loc[data_df['Age'] > 0].copy()
data_1['Age'] = data_1['Age'].astype(int)
data_1 = data_1.drop(columns=['Cabin'])

data_1.shape

(714, 7)

### Data 2. 나이 결측치 평균값 처리

In [12]:
# Data 2: fill missing Age with the mean of the known ages, cast to int,
# and drop the Cabin column.
data_2 = data_df.copy()

mean_age = data_2['Age'].mean(skipna=True)
data_2['Age'] = data_2['Age'].fillna(mean_age).astype(int)

data_2 = data_2.drop(columns=['Cabin'])

data_2.shape

(891, 7)

### Data 3. 나이 및 선실 결측치 행 제거

In [13]:
# Data 3: keep only rows where both Age and Cabin are known, then cast both
# columns to int.
has_age_and_cabin = (data_df['Age'] > 0) & data_df['Cabin'].notnull()
data_3 = data_df.loc[has_age_and_cabin].copy()

data_3['Age'] = data_3['Age'].astype(int)
data_3['Cabin'] = data_3['Cabin'].astype(int)

data_3.shape

(185, 8)

### Data 4. 나이 및 선실 결측치 평균값 처리

In [19]:
# Data 4: fill missing Age with the mean age and missing Cabin with a
# class-specific default deck code.
data_4 = data_df.copy()

mean_age = data_4['Age'].mean(skipna=True)
data_4['Age'] = data_4['Age'].fillna(mean_age)
data_4['Age'] = data_4['Age'].astype(int)

# Default deck code per passenger class (1st -> 3, 2nd -> 5, 3rd -> 6).
# Only the Cabin column is filled: calling fillna on whole row subsets would
# also overwrite NaN in every other column with these deck codes.
default_deck_by_class = {1: 3, 2: 5, 3: 6}
data_4['Cabin'] = data_4['Cabin'].fillna(data_4['Pclass'].map(default_deck_by_class))
data_4['Cabin'] = data_4['Cabin'].astype(int)

data_4.shape
data_4.set_index('PassengerId').to_csv('titanic_data_4.csv')

### Data 5. 나이 결측치 행 제거 및 나이 구간화

In [15]:
# Data 5: keep only rows with a known Age, bucket Age into decades
# (floor division by 10) and drop the Cabin column.
data_5 = data_df.loc[data_df['Age'] > 0].copy()
data_5['Age'] = (data_5['Age'] // 10).astype(int)
data_5 = data_5.drop(columns=['Cabin'])

data_5.shape

(714, 7)

### Save Data

In [16]:
# Write each dataset variant to titanic_data_<n>.csv, indexed by PassengerId.
datasets = [data_1, data_2, data_3, data_4, data_5]
for number, dataset in enumerate(datasets, start=1):
    dataset.set_index('PassengerId').to_csv(f'titanic_data_{number}.csv')

## Pipeline

In [18]:
# Shared transformers for the processors below: one-hot encoding for
# categorical columns and standard scaling for numeric columns.
categorical_transformer = OneHotEncoder(categories='auto')
numeric_transformer = StandardScaler()

### Processor 1. Age(o), Cabin(x)

In [19]:
# Processor 1: scaled numeric Age plus one-hot categorical columns, no Cabin.
numeric_features = ['Age']
categorical_features = ['Pclass', 'Sex', 'Embarked', 'Family']

processor_1_steps = [
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features),
]
preprocessor = ColumnTransformer(transformers=processor_1_steps)

pipe_1 = Pipeline(steps=[('preprocessor', preprocessor)])

### Processor 2. Age(o), Cabin(o)

In [20]:
# Processor 2: scaled numeric Age plus one-hot categorical columns, with Cabin.
numeric_features = ['Age']
categorical_features = ['Pclass', 'Sex', 'Cabin', 'Embarked', 'Family']

processor_2_steps = [
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features),
]
preprocessor = ColumnTransformer(transformers=processor_2_steps)

pipe_2 = Pipeline(steps=[('preprocessor', preprocessor)])

### Processor 3. Age(구간), Cabin(x)

In [21]:
# Processor 3: Age is already binned, so every column (Age included) is
# one-hot encoded; Cabin is not used.
categorical_transformer = OneHotEncoder(categories='auto')
categorical_features = ['Pclass', 'Sex', 'Age', 'Embarked', 'Family']

processor_3_steps = [('cat', categorical_transformer, categorical_features)]
preprocessor = ColumnTransformer(transformers=processor_3_steps)

pipe_3 = Pipeline(steps=[('preprocessor', preprocessor)])

### Save Processor

In [22]:
import joblib

# Write each processor to titanic_pipe_<n>.pkl with compression.
processors = (pipe_1, pipe_2, pipe_3)
for number, pipe in enumerate(processors, start=1):
    joblib.dump(pipe, f'titanic_pipe_{number}.pkl', compress=True)

## 나이 모델 모든 조합 비교

In [None]:
from itertools import combinations

# Build one preprocessing pipeline per non-empty subset of the categorical
# features; only categorical columns are combined here.
categorical_transformer = OneHotEncoder(categories='auto', handle_unknown='ignore')
categorical_features = ['Pclass', 'Name', 'Sex', 'Embarked', 'Family', 'FareRange']

# All subsets ordered by size; the first entry is the empty subset.
categorical_comb = [
    subset
    for size in range(len(categorical_features) + 1)
    for subset in combinations(categorical_features, size)
]

# Skip the empty subset: it would yield a pipeline with no input columns.
pipe_list = [
    Pipeline(steps=[
        ('preprocessor',
         ColumnTransformer(transformers=[('cat', categorical_transformer, subset)])),
    ])
    for subset in categorical_comb[1:]
]

In [None]:
# For every candidate age-model feature subset in `pipe_list`: impute Age,
# impute Cabin, train a survival classifier and record (accuracy, pipe).
# NOTE(review): this cell expects `data_df` to still hold NaN in 'Age' and
# 'Cabin' and to have a 'FareRange' column; the preprocessing shown earlier in
# this notebook bins into 'Fare' and fills both columns — confirm which state
# of `data_df` this cell is meant to run against.
value_count_list = list()  # never filled in this cell
score_list = list()

for pipe in pipe_list:
    test_df = data_df.copy()

    # 1) Age imputation using the candidate feature subset.
    age_df = test_df[test_df['Age'] > 0].copy()

    age_X = pipe.fit_transform(age_df)
    age_Y = np.array(age_df[['Age']])

    age_model = linear_model.LinearRegression()
    age_model.fit(age_X, age_Y)

    test_age_df = test_df[test_df['Age'].isnull()].copy()

    test_age_X = pipe.transform(test_age_df)
    pred_age_y = age_model.predict(test_age_X)

    # One .loc assignment instead of chained indexing, which may write into a
    # temporary copy and leave test_df unchanged.
    test_df.loc[test_age_df.index, 'Age'] = np.ravel(pred_age_y)

    test_df['Age'] = test_df['Age'].astype(int)
    test_df['AgeRange'] = pd.qcut(test_df['Age'], 5, labels=[0, 1, 2, 3, 4])
    del test_df['Age']

    # 2) Cabin imputation from all features, including the new AgeRange.
    test_categorical_features = ['Pclass', 'Name', 'Sex', 'Embarked', 'Family', 'FareRange', 'AgeRange']

    test_preprocessor = ColumnTransformer(
        transformers=[
            ('cat', categorical_transformer, test_categorical_features)])

    test_pipe = Pipeline(steps=[('preprocessor', test_preprocessor)])

    cabin_df = test_df[test_df['Cabin'].notnull()].copy()

    cabin_X = test_pipe.fit_transform(cabin_df)
    cabin_Y = np.array(cabin_df[['Cabin']])

    cabin_model = linear_model.LinearRegression()
    cabin_model.fit(cabin_X, cabin_Y)

    test_cabin_df = test_df[test_df['Cabin'].isnull()].copy()

    test_cabin_X = test_pipe.transform(test_cabin_df)
    pred_cabin_y = cabin_model.predict(test_cabin_X)

    test_df.loc[test_cabin_df.index, 'Cabin'] = np.ravel(pred_cabin_y)

    test_df['Cabin'] = test_df['Cabin'].astype(int)

    # 3) Survival classifier on the fully imputed table.
    test2_categorical_features = ['Pclass', 'Name', 'Sex', 'Cabin', 'Embarked', 'Family', 'FareRange', 'AgeRange']

    test2_preprocessor = ColumnTransformer(
        transformers=[
            ('cat', categorical_transformer, test2_categorical_features)])

    test2_pipe = Pipeline(steps=[('preprocessor', test2_preprocessor)])

    titanic_Y = np.array(test_df[['Survived']])
    # The encoder is fitted on all rows before the split, so train and test
    # share one set of one-hot categories.
    test2_pipe.fit(test_df)

    x_train, x_test, y_train, y_test = model_selection.train_test_split(
        test_df, titanic_Y, test_size=0.3, random_state=0)

    x_train = test2_pipe.transform(x_train)
    x_test = test2_pipe.transform(x_test)

    test_model = linear_model.LogisticRegression()

    # The classifier expects a 1-D target, so flatten the (n, 1) arrays.
    test_model.fit(x_train, np.ravel(y_train))
    accuracy = accuracy_score(np.ravel(y_test), test_model.predict(x_test))
    score_list.append((accuracy, pipe))


In [None]:
# Rank the candidate pipelines by accuracy, lowest first.
sorted(score_list, key=lambda scored: scored[0])

[(0.8208955223880597,
  Pipeline(steps=[('preprocessor',
                   ColumnTransformer(transformers=[('cat',
                                                    OneHotEncoder(handle_unknown='ignore'),
                                                    ('Pclass', 'Family'))]))])),
 (0.8246268656716418,
  Pipeline(steps=[('preprocessor',
                   ColumnTransformer(transformers=[('cat',
                                                    OneHotEncoder(handle_unknown='ignore'),
                                                    ('Sex',))]))])),
 (0.8246268656716418,
  Pipeline(steps=[('preprocessor',
                   ColumnTransformer(transformers=[('cat',
                                                    OneHotEncoder(handle_unknown='ignore'),
                                                    ('Pclass', 'Sex'))]))])),
 (0.8246268656716418,
  Pipeline(steps=[('preprocessor',
                   ColumnTransformer(transformers=[('cat',
                                

## 선형 모델 모든 조합 비교

In [None]:
from sklearn import linear_model # LogisticRegression()
from sklearn import neighbors # KNeighborsClassifier()
from sklearn.svm import SVC # SVC(gamma='auto')
from sklearn import tree # DecisionTreeClassifier()
from sklearn import ensemble # RandomForestClassifier()
from sklearn import naive_bayes # BernoulliNB()
from sklearn import cluster # KMeans(random_state=0)
from sklearn import decomposition # PCA(n_components=1)
from sklearn.metrics import accuracy_score

In [None]:
from itertools import combinations

# Build one preprocessing pipeline per pairing of a numeric feature subset
# with a categorical feature subset.
numeric_features = ['Age', 'SibSp', 'Parch', 'Fare']
categorical_features = ['Pclass', 'Name', 'Sex', 'Embarked', 'Family']

# All subsets of each feature list, ordered by size (empty subset first).
numeric_comb = [
    subset
    for size in range(len(numeric_features) + 1)
    for subset in combinations(numeric_features, size)
]
categorical_comb = [
    subset
    for size in range(len(categorical_features) + 1)
    for subset in combinations(categorical_features, size)
]

# Numeric subsets vary slowest. The first pairing (both subsets empty) is
# dropped because it selects no columns.
pipe_list = [
    Pipeline(steps=[
        ('preprocessor',
         ColumnTransformer(transformers=[
             ('num', numeric_transformer, numeric_subset),
             ('cat', categorical_transformer, categorical_subset)])),
    ])
    for numeric_subset in numeric_comb
    for categorical_subset in categorical_comb
][1:]

In [None]:
# For every candidate cabin-model feature subset in `pipe_list`: impute Cabin,
# train a survival classifier on a fixed feature set and record
# (accuracy, pipe).
# NOTE(review): this cell expects `data_df` to still contain 'SibSp'/'Parch'
# and NaN in 'Cabin'; the preprocessing shown earlier in this notebook deletes
# those columns and fills Cabin — confirm which state of `data_df` this cell
# is meant to run against.
test_numeric_features = ['Age', 'SibSp', 'Parch', 'Fare']
test_categorical_features = ['Pclass', 'Name', 'Sex', 'Fare', 'Cabin', 'Embarked', 'Family']

test_preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, test_numeric_features),
        ('cat', categorical_transformer, test_categorical_features)])

test_pipe = Pipeline(steps=[('preprocessor', test_preprocessor)])

value_count_list = list()  # never filled in this cell
score_list = list()

for pipe in pipe_list:
    test_df = data_df.copy()

    # 1) Cabin imputation using the candidate feature subset.
    cabin_df = test_df[test_df['Cabin'].notnull()].copy()

    cabin_X = pipe.fit_transform(cabin_df)
    cabin_Y = np.array(cabin_df[['Cabin']])

    cabin_model = linear_model.LinearRegression()
    cabin_model.fit(cabin_X, cabin_Y)

    test_cabin_df = test_df[test_df['Cabin'].isnull()].copy()

    test_cabin_X = pipe.transform(test_cabin_df)

    pred_cabin_y = cabin_model.predict(test_cabin_X)

    # One .loc assignment instead of chained indexing, which may write into a
    # temporary copy and leave test_df unchanged.
    test_df.loc[test_cabin_df.index, 'Cabin'] = np.ravel(pred_cabin_y)

    test_df['Cabin'] = test_df['Cabin'].astype(int)

    # 2) Survival classifier on the imputed table.
    titanic_X = test_pipe.fit_transform(test_df)
    titanic_Y = np.array(test_df[['Survived']])

    x_train, x_test, y_train, y_test = model_selection.train_test_split(
        titanic_X, titanic_Y, test_size=0.3, random_state=0)

    test_model = linear_model.LogisticRegression()

    # The classifier expects a 1-D target, so flatten the (n, 1) arrays.
    test_model.fit(x_train, np.ravel(y_train))
    accuracy = accuracy_score(np.ravel(y_test), test_model.predict(x_test))
    score_list.append((accuracy, pipe))


In [None]:
# Rank the candidate pipelines by accuracy, lowest first.
sorted(score_list, key=lambda scored: scored[0])

[(0.8246268656716418,
  Pipeline(steps=[('preprocessor',
                   ColumnTransformer(transformers=[('num', StandardScaler(),
                                                    ('Parch', 'Fare')),
                                                   ('cat',
                                                    OneHotEncoder(handle_unknown='ignore'),
                                                    ('Sex', 'Embarked'))]))])),
 (0.8283582089552238,
  Pipeline(steps=[('preprocessor',
                   ColumnTransformer(transformers=[('num', StandardScaler(),
                                                    ('SibSp',)),
                                                   ('cat',
                                                    OneHotEncoder(handle_unknown='ignore'),
                                                    ('Sex', 'Family'))]))])),
 (0.8283582089552238,
  Pipeline(steps=[('preprocessor',
                   ColumnTransformer(transformers=[('num', StandardScaler(),
 