# 타이타닉 데이터 전처리

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn import model_selection, linear_model
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import r2_score, mean_squared_error

import joblib

# Shared, still-unfitted transformers referenced by the ColumnTransformers
# defined in later cells.
numeric_transformer = StandardScaler()
# handle_unknown='ignore': a category not seen during fit is encoded as an
# all-zero row at transform time instead of raising an error.
categorical_transformer = OneHotEncoder(categories='auto', handle_unknown='ignore')

import warnings
# Suppress every warning for the rest of the session.
# NOTE(review): this also hides pandas' chained-assignment warnings — confirm
# that silencing them is intended.
warnings.filterwarnings("ignore")

In [2]:
# Load the raw passenger table; the path is relative to the working directory.
df = pd.read_csv('titanic.csv')
# Preview the first rows.
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


## X-data 전처리
**Pclass**: [1,2,3], 범주형 데이터   
**Name**: ['Master.','Mr.','Miss.','Mrs.'], 범주형 데이터   
**Sex**: ['male','female'], 범주형 데이터   
**Age**: 결측치는 예측 모델 적용, 수치형 데이터   
**SibSp**: 수치형 데이터   
**Parch**: 수치형 데이터   
**Fare**: 수치형 데이터   
**Cabin**: ['T','A','B','C','D','E','F','G'], 결측치는 예측 모델 적용, 범주형 데이터   
**Embarked**: ['S','C','Q'], 결측치는 최빈 단어 'S'로 채움, 범주형 데이터   
**Family**: [0,1], SibSp + Parch가 1 이상이면 1, 0이면 0, 범주형 데이터   
<br>
모든 범주형 데이터는 int형 변환 후 One-Hot Encoding 적용   
**PassengerId** 및 **Survived**는 데이터프레임 저장 시에만 사용

In [12]:
# Encode the raw Titanic columns as integer codes.
#
# Each column is mapped with its own dictionary. A frame-wide
# DataFrame.replace would also rewrite matching values in unrelated columns
# and makes the result depend on the order of the statements.

# Work on a new frame so the raw `df` stays untouched; Ticket is not used.
data_df = df.drop(columns='Ticket')

sex_codes = {'male': 0, 'female': 1}
embarked_codes = {'S': 0, 'C': 1, 'Q': 2}
# Deck T is the top deck; A through G are progressively lower decks.
deck_codes = {deck: code for code, deck in enumerate('TABCDEFG')}
title_codes = {'Master.': 0, 'Mr.': 1, 'Miss.': 2, 'Mrs.': 3}

data_df['Sex'] = data_df['Sex'].map(sex_codes)

# Family flag: 1 when SibSp + Parch is non-zero, otherwise 0.
data_df['Family'] = ((data_df['SibSp'] + data_df['Parch']) != 0).astype(int)

# Missing ports are filled with 'S' before encoding.
data_df['Embarked'] = data_df['Embarked'].fillna('S').map(embarked_codes)

# Only the deck letter (first character) of the cabin is kept; missing cabins
# stay NaN and are imputed later.
data_df['Cabin'] = data_df['Cabin'].str[0].map(deck_codes)

# The title is the first token after the comma, e.g. "Braund, Mr. Owen" -> "Mr.".
titles = data_df['Name'].apply(lambda name: str(name).split(',')[1].split()[0])
known_title_codes = titles.map(title_codes)

# Titles outside the four known ones fall back to a code derived from sex and
# age: 2 * sex + adult, so males land on 0/1 (Master./Mr.) and females on 2/3
# (Miss./Mrs.). Sex must be weighted by 2; plain `sex + adult` would put
# females on 1/2 and mix them into the male "Mr." group.
# A missing age fails the `< 18` comparison and therefore counts as adult.
# Row 766 is the only non-standard-title row with a missing age; its title
# "Dr." and sex male imply "Mr.", which is exactly what this fallback yields.
is_adult = ~(data_df['Age'] < 18)
fallback_codes = 2 * data_df['Sex'] + is_adult.astype(int)

# Assign the whole column at once instead of `data_df['Name'][i] = ...`;
# chained assignment may write to a temporary copy and leave data_df unchanged.
data_df['Name'] = known_title_codes.fillna(fallback_codes).astype(int)

data_df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked,Family
0,1,0,3,1,0,22.0,1,0,7.25,,0,1
1,2,1,1,3,1,38.0,1,0,71.2833,3.0,1,1
2,3,1,3,2,1,26.0,0,0,7.925,,0,0
3,4,1,1,3,1,35.0,1,0,53.1,3.0,0,1
4,5,0,3,1,0,35.0,0,0,8.05,,0,0


## 결측치 처리
**1**. 나이 결측치에 예측 모델 적용   
~~**2**. 그룹별 운임을 개별 운임으로 변경~~   
~~**3**. 선실 결측치에 [Pclass, Fare] 열을 기반으로 Rule 적용~~   
**2**. 선실 결측치에 예측 모델 적용

### 나이 예측 모델 생성 및 적용

In [13]:
# Preprocessing for the age model: one-hot encoded categorical codes only.
# (A variant that additionally scaled SibSp, Parch and Fare as numeric
# features through numeric_transformer was tried and is not used.)
categorical_features = ['Pclass', 'Name', 'Sex', 'Embarked', 'Family']

age_transformers = [
    ('cat', categorical_transformer, categorical_features),
]
preprocessor = ColumnTransformer(age_transformers)

age_pipe = Pipeline([('preprocessor', preprocessor)])

In [14]:
# Training rows for the age model: passengers with a recorded age
# (a missing age fails the `> 0` comparison and is excluded).
has_age = data_df['Age'] > 0
age_df = data_df.loc[has_age].copy()

# Fit the encoder on the training rows and build the design matrix.
age_X = age_pipe.fit_transform(age_df)
# Selecting with a list keeps the target two-dimensional, shape (n_rows, 1).
age_Y = age_df[['Age']].to_numpy()

age_model = linear_model.LinearRegression()

age_model.fit(age_X, age_Y)

LinearRegression()

In [15]:
# Impute missing ages with the regression model fitted in the previous cell.
test_age_df = data_df[data_df['Age'].isnull()].copy()

test_age_X = age_pipe.transform(test_age_df)

pred_age_y = age_model.predict(test_age_X)

# Write all predictions through .loc in a single step. The previous
# per-row `data_df['Age'][i] = ...` was chained assignment, which may modify a
# temporary copy and is not guaranteed to update data_df.
# The model produced two slightly negative ages (-1.75), so the absolute value
# is stored. ravel() flattens the (n_rows, 1) prediction array to one value
# per row.
data_df.loc[test_age_df.index, 'Age'] = np.abs(pred_age_y).ravel()

# Ages are kept as whole years (fractional part dropped).
data_df['Age'] = data_df['Age'].astype(int)

data_df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked,Family
0,1,0,3,1,0,22,1,0,7.25,,0,1
1,2,1,1,3,1,38,1,0,71.2833,3.0,1,1
2,3,1,3,2,1,26,0,0,7.925,,0,0
3,4,1,1,3,1,35,1,0,53.1,3.0,0,1
4,5,0,3,1,0,35,0,0,8.05,,0,0


### 나이 예측 모델 결과 확인

In [16]:
# Put the raw predictions (no absolute value applied here) into the Age column
# of the previously age-less rows, then cast that column to whole years.
test_age_df['Age'] = pred_age_y
test_age_df = test_age_df.astype({'Age': int})

In [17]:
# Predicted ages for Name code 0 (group labelled: males under 18)
test_age_df.loc[test_age_df['Name'] == 0, 'Age'].value_counts()

0    2
2    2
Name: Age, dtype: int64

In [18]:
# Predicted ages for Name code 1 (group labelled: males aged 18 or over)
test_age_df.loc[test_age_df['Name'] == 1, 'Age'].value_counts()

30    42
33    20
27    15
45    15
34     7
23     7
41     6
26     5
31     2
20     1
Name: Age, dtype: int64

In [19]:
# Predicted ages for Name code 2 (group labelled: females under 18)
test_age_df.loc[test_age_df['Name'] == 2, 'Age'].value_counts()

23    17
13     7
16     5
10     2
20     2
27     1
31     1
24     1
Name: Age, dtype: int64

In [20]:
# Predicted ages for Name code 3 (group labelled: females aged 18 or over)
test_age_df.loc[test_age_df['Name'] == 3, 'Age'].value_counts()

44    4
33    3
41    3
26    3
30    2
48    1
36    1
Name: Age, dtype: int64

### 선실 예측 모델 생성 및 적용

In [21]:
# Preprocessing for the cabin-deck model: numeric columns go through
# numeric_transformer, integer-coded categorical columns through
# categorical_transformer.
numeric_features = ['Age', 'SibSp', 'Parch', 'Fare']
categorical_features = ['Pclass', 'Name', 'Sex', 'Embarked', 'Family']

cabin_transformers = [
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features),
]
preprocessor = ColumnTransformer(cabin_transformers)

cabin_pipe = Pipeline([('preprocessor', preprocessor)])

In [22]:
# Training rows for the cabin model: passengers whose deck code is known.
has_cabin = data_df['Cabin'].notnull()
cabin_df = data_df.loc[has_cabin].copy()

# Fit the preprocessing on the training rows and build the design matrix.
cabin_X = cabin_pipe.fit_transform(cabin_df)
# Selecting with a list keeps the target two-dimensional, shape (n_rows, 1).
cabin_Y = cabin_df[['Cabin']].to_numpy()

cabin_model = linear_model.LinearRegression()

cabin_model.fit(cabin_X, cabin_Y)

LinearRegression()

In [23]:
# Impute missing deck codes with the regression model fitted in the previous cell.
test_cabin_df = data_df[data_df['Cabin'].isnull()].copy()

test_cabin_X = cabin_pipe.transform(test_cabin_df)

pred_cabin_y = cabin_model.predict(test_cabin_X)

# Write all predictions through .loc in a single step. The previous
# per-row `data_df['Cabin'][i] = ...` was chained assignment, which may modify
# a temporary copy and is not guaranteed to update data_df.
# np.ravel flattens the (n_rows, 1) prediction array to one value per row.
data_df.loc[test_cabin_df.index, 'Cabin'] = np.ravel(pred_cabin_y)

# Continuous predictions are cut down to integer deck codes.
data_df['Cabin'] = data_df['Cabin'].astype(int)

data_df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked,Family
0,1,0,3,1,0,22,1,0,7.25,6,0,1
1,2,1,1,3,1,38,1,0,71.2833,3,1,1
2,3,1,3,2,1,26,0,0,7.925,6,0,0
3,4,1,1,3,1,35,1,0,53.1,3,0,1
4,5,0,3,1,0,35,0,0,8.05,5,0,0


### 선실 예측 모델 결과 확인
Pclass별 예측 선실 분포 확인: 1등석은 1~3, 2등석은 4~5, 3등석은 5~6으로 예측되어, 객실 등급이 높을수록 상층 선실이라는 관계가 반영된 것으로 보임

In [24]:
# Put the predictions into the Cabin column of the previously cabin-less rows,
# then cast that column to integer deck codes.
test_cabin_df['Cabin'] = pred_cabin_y
test_cabin_df = test_cabin_df.astype({'Cabin': int})

In [25]:
# Predicted deck codes among first-class passengers
test_cabin_df.loc[test_cabin_df['Pclass'] == 1, 'Cabin'].value_counts()

2    30
3     9
1     1
Name: Cabin, dtype: int64

In [26]:
# Predicted deck codes among second-class passengers
test_cabin_df.loc[test_cabin_df['Pclass'] == 2, 'Cabin'].value_counts()

5    161
4      7
Name: Cabin, dtype: int64

In [27]:
# Predicted deck codes among third-class passengers
test_cabin_df.loc[test_cabin_df['Pclass'] == 3, 'Cabin'].value_counts()

5    337
6    142
Name: Cabin, dtype: int64

## 데이터 확인 및 저장

In [28]:
# Preview the first rows of the processed frame.
data_df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked,Family
0,1,0,3,1,0,22,1,0,7.25,6,0,1
1,2,1,1,3,1,38,1,0,71.2833,3,1,1
2,3,1,3,2,1,26,0,0,7.925,6,0,0
3,4,1,1,3,1,35,1,0,53.1,3,0,1
4,5,0,3,1,0,35,0,0,8.05,5,0,0


In [29]:
# Print column dtypes and non-null counts to check for remaining missing values.
data_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    int64  
 4   Sex          891 non-null    int64  
 5   Age          891 non-null    int64  
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Fare         891 non-null    float64
 9   Cabin        891 non-null    int64  
 10  Embarked     891 non-null    int64  
 11  Family       891 non-null    int64  
dtypes: float64(1), int64(11)
memory usage: 83.7 KB


In [30]:
# Save the processed table with PassengerId as the index column.
# (Plain string literal: the previous f-string had no placeholders.)
data_df.set_index('PassengerId').to_csv('titanic_data.csv')

## Pipeline 생성 및 저장

In [36]:
# Final preprocessing pipeline: numeric columns go through numeric_transformer,
# integer-coded categorical columns through categorical_transformer.
numeric_features = ['Age', 'SibSp', 'Parch', 'Fare']
# 'Fare' is deliberately absent here: it is a continuous value that is already
# handled as a numeric feature above. Listing it as categorical as well would
# one-hot encode every distinct fare and feed the column to the model twice.
categorical_features = ['Pclass', 'Name', 'Sex', 'Cabin', 'Embarked', 'Family']

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)])

pipe = Pipeline(steps=[('preprocessor', preprocessor)])

In [37]:
# Write the pipeline object to disk with compression; it has not been fitted
# at this point, so only its configuration is stored.
joblib.dump(pipe, 'titanic_pipe.pkl', compress=True)

['titanic_pipe.pkl']

## ========== 테스트 코드 ==========

### 데이터 세분화
1. 나이 결측치 행 제거
2. 나이 결측치 평균값 처리
3. 나이 및 선실 결측치 행 제거
4. 나이 결측치 평균값 처리 및 선실 결측치 Pclass별 대표값 처리
5. 나이 결측치 행 제거 및 나이 구간화

### Data 1. 나이 결측치 행 제거

In [11]:
# Dataset 1: rows with a positive (i.e. recorded) age, Cabin column dropped,
# age stored as an integer.
known_age = data_df['Age'] > 0
data_1 = data_df.loc[known_age].drop(columns='Cabin')
data_1['Age'] = data_1['Age'].astype(int)

data_1.shape

(714, 7)

### Data 2. 나이 결측치 평균값 처리

In [12]:
# Dataset 2: all rows, Cabin column dropped; missing ages are replaced by the
# mean of the recorded ages and the column is stored as an integer.
data_2 = data_df.drop(columns='Cabin')

mean_age = data_2['Age'].mean(skipna=True)
data_2['Age'] = data_2['Age'].fillna(mean_age).astype(int)

data_2.shape

(891, 7)

### Data 3. 나이 및 선실 결측치 행 제거

In [13]:
# Dataset 3: only rows where both a positive age and a cabin code are present;
# both columns are stored as integers.
complete_rows = (data_df['Age'] > 0) & data_df['Cabin'].notnull()
data_3 = data_df.loc[complete_rows].copy()
data_3 = data_3.astype({'Age': int, 'Cabin': int})

data_3.shape

(185, 8)

### Data 4. 나이 및 선실 결측치 평균값 처리

In [19]:
# Dataset 4: all rows; missing ages become the mean recorded age and missing
# cabins get a fixed deck code chosen by passenger class.
data_4 = data_df.copy()

mean_age = data_4['Age'].mean(skipna=True)
data_4['Age'] = data_4['Age'].fillna(mean_age).astype(int)

# Default deck code per Pclass (1st -> 3, 2nd -> 5, 3rd -> 6).
# Only the Cabin column is filled. The previous row-wide
# `data_4[mask] = data_4[mask].fillna(value)` would have written the deck code
# into any other column that still contained a missing value in those rows.
default_deck_by_class = {1: 3, 2: 5, 3: 6}
data_4['Cabin'] = data_4['Cabin'].fillna(data_4['Pclass'].map(default_deck_by_class))
data_4['Cabin'] = data_4['Cabin'].astype(int)

data_4.set_index('PassengerId').to_csv('titanic_data_4.csv')

### Data 5. 나이 결측치 행 제거 및 나이 구간화

In [15]:
# Dataset 5: rows with a positive (i.e. recorded) age, Cabin column dropped;
# age is binned into decades by integer division (e.g. 35 -> 3).
known_age = data_df['Age'] > 0
data_5 = data_df.loc[known_age].drop(columns='Cabin')
data_5['Age'] = (data_5['Age'] // 10).astype(int)

data_5.shape

(714, 7)

### Save Data

In [16]:
# Write each experimental dataset to titanic_data_<n>.csv (n = 1..5) with
# PassengerId as the index column.
datasets = [data_1, data_2, data_3, data_4, data_5]
for number, data in enumerate(datasets, start=1):
    data.set_index('PassengerId').to_csv(f'titanic_data_{number}.csv')

## Pipeline

In [18]:
# Fresh transformer instances for the experimental pipelines defined below.
numeric_transformer = StandardScaler()
# NOTE(review): unlike the encoder created at the top of the notebook, this one
# does not pass handle_unknown='ignore' — confirm whether that is intended.
categorical_transformer = OneHotEncoder(categories='auto')

### Processor 1. Age(o), Cabin(x)

In [19]:
# Processor 1: Age as a numeric feature, no Cabin feature.
numeric_features = ['Age']
categorical_features = ['Pclass', 'Sex', 'Embarked', 'Family']

transformers_1 = [
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features),
]
preprocessor = ColumnTransformer(transformers_1)

pipe_1 = Pipeline([('preprocessor', preprocessor)])

### Processor 2. Age(o), Cabin(o)

In [20]:
# Processor 2: Age as a numeric feature, Cabin as a categorical feature.
numeric_features = ['Age']
categorical_features = ['Pclass', 'Sex', 'Cabin', 'Embarked', 'Family']

transformers_2 = [
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features),
]
preprocessor = ColumnTransformer(transformers_2)

pipe_2 = Pipeline([('preprocessor', preprocessor)])

### Processor 3. Age(구간), Cabin(x)

In [21]:
# Processor 3: Age (binned) is one-hot encoded with the other categorical
# columns; there are no numeric features and no Cabin feature.
categorical_features = ['Pclass', 'Sex', 'Age', 'Embarked', 'Family']
categorical_transformer = OneHotEncoder(categories='auto')

transformers_3 = [
    ('cat', categorical_transformer, categorical_features),
]
preprocessor = ColumnTransformer(transformers_3)

pipe_3 = Pipeline([('preprocessor', preprocessor)])

### Save Processor

In [22]:
import joblib

# Write each experimental pipeline to titanic_pipe_<n>.pkl (n = 1..3).
# The loop variable is deliberately not called `pipe`: that name holds the main
# pipeline built earlier in the notebook, and reusing it here would leave
# `pipe` pointing at pipe_3 after the loop.
test_pipes = [pipe_1, pipe_2, pipe_3]
for number, test_pipe in enumerate(test_pipes, start=1):
    joblib.dump(test_pipe, f'titanic_pipe_{number}.pkl', compress=True)