In [1]:
# 라이브러리 불러오기
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
# Load the Titanic passenger data from the shared CSV link.
path = 'https://bit.ly/TitanicFile'
titanic = pd.read_csv(filepath_or_buffer=path)

# Preview the first five rows to confirm the load worked.
titanic.head(5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [3]:
# Keep a copy of the original column labels so they can be restored
# after the renaming experiment in the following cells.
old_columns = titanic.columns.copy()
print(old_columns)

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')


In [7]:
# Rename the columns: convert every column label to lower case.
lowercase_columns = titanic.columns.str.lower()
titanic.columns = lowercase_columns

# Check the result on the first three rows.
titanic.head(n=3)

Unnamed: 0,passengerid,survived,pclass,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S


In [8]:
# Restore the original column labels saved earlier in `old_columns`.
titanic.columns = old_columns

# Check that the original capitalisation is back.
titanic.head(n=3)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S


In [10]:
# Remove columns that are not needed for the analysis.
# The result is assigned back instead of using `inplace=True`, and the
# `columns=` keyword replaces the positional `axis=1` so the intent is explicit.
# Note: re-running this cell after the columns are gone raises KeyError,
# because `drop` fails on missing labels by default.
drop_cols = ['PassengerId', 'Name', 'Ticket', 'Cabin']
titanic = titanic.drop(columns=drop_cols)

# Check the remaining columns.
titanic.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,male,22.0,1,0,7.25,S
1,1,1,female,38.0,1,0,71.2833,C
2,1,3,female,26.0,0,0,7.925,S
3,1,1,female,35.0,1,0,53.1,S
4,0,3,male,35.0,0,0,8.05,S


In [11]:
# Count the missing values in each column.
titanic.isnull().sum()

Survived      0
Pclass        0
Sex           0
Age         177
SibSp         0
Parch         0
Fare          0
Embarked      2
dtype: int64

In [12]:
# Fill missing Age values with the mean age.
# `Series.mean()` skips NaN by default, so the mean is taken over the
# known ages only.
age_mean = titanic['Age'].mean()

# Assign the filled Series back to the column. Calling
# `titanic['Age'].fillna(..., inplace=True)` is chained in-place assignment:
# it emits a FutureWarning in pandas 2.x and silently leaves the DataFrame
# unchanged once Copy-on-Write is active.
titanic['Age'] = titanic['Age'].fillna(age_mean)

# Check: Age should now report zero missing values.
titanic.isna().sum()

Survived    0
Pclass      0
Sex         0
Age         0
SibSp       0
Parch       0
Fare        0
Embarked    2
dtype: int64

In [13]:
# Fill missing Embarked values with the most frequent value.
# `mode()` returns a Series (there can be ties); `[0]` takes its first entry.
Embarked_freq = titanic['Embarked'].mode()[0]

# Assign the filled Series back to the column. The chained
# `titanic['Embarked'].fillna(..., inplace=True)` form emits a FutureWarning
# in pandas 2.x and has no effect on the DataFrame under Copy-on-Write.
titanic['Embarked'] = titanic['Embarked'].fillna(Embarked_freq)

# Check: no column should report missing values any more.
titanic.isna().sum()

Survived    0
Pclass      0
Sex         0
Age         0
SibSp       0
Parch       0
Fare        0
Embarked    0
dtype: int64

In [14]:
# Add an age-group column: floor-divide Age by 10 and multiply back by 10,
# which maps each age to the lower bound of its decade (e.g. 38.0 -> 30.0).
titanic['AgeGrp'] = titanic['Age'].floordiv(10).mul(10)

# Check how many passengers fall into each age group.
titanic['AgeGrp'].value_counts()

20.0    397
30.0    167
10.0    102
40.0     89
0.0      62
50.0     48
60.0     19
70.0      6
80.0      1
Name: AgeGrp, dtype: int64

In [15]:
# Remove the raw Age column now that AgeGrp has been derived from it.
# The result is assigned back instead of using `inplace=True`, and the
# `columns=` keyword replaces the positional `axis=1`.
drop_cols = ['Age']
titanic = titanic.drop(columns=drop_cols)

# Check the remaining columns.
titanic.head(3)

Unnamed: 0,Survived,Pclass,Sex,SibSp,Parch,Fare,Embarked,AgeGrp
0,0,3,male,1,0,7.25,S,20.0
1,1,1,female,1,0,71.2833,C,30.0
2,1,3,female,0,0,7.925,S,20.0


In [16]:
# Convert AgeGrp from float to integer.
# NOTE(review): the cast raises if AgeGrp still contains NaN, so it relies on
# the Age missing values having been filled before AgeGrp was built.
age_group_as_int = titanic['AgeGrp'].astype('int')
titanic['AgeGrp'] = age_group_as_int

# Check the converted column.
titanic.head(n=3)

Unnamed: 0,Survived,Pclass,Sex,SibSp,Parch,Fare,Embarked,AgeGrp
0,0,3,male,1,0,7.25,S,20
1,1,1,female,1,0,71.2833,C,30
2,1,3,female,0,0,7.925,S,20


In [17]:
# One-hot encode the categorical columns.
# `drop_first=True` drops the first level of each column to avoid a redundant
# indicator; `dtype=int` emits 0/1 integers rather than booleans.
dumm_cols = ['Pclass', 'Sex', 'Embarked']
titanic = pd.get_dummies(
    data=titanic,
    columns=dumm_cols,
    drop_first=True,
    dtype=int,
)

# Check the encoded frame.
titanic.head(5)

Unnamed: 0,Survived,SibSp,Parch,Fare,AgeGrp,Pclass_2,Pclass_3,Sex_male,Embarked_Q,Embarked_S
0,0,1,0,7.25,20,0,1,1,0,1
1,1,1,0,71.2833,30,0,0,0,0,0
2,1,0,0,7.925,20,0,1,0,0,1
3,1,1,0,53.1,30,0,0,0,0,1
4,0,0,0,8.05,30,0,1,1,0,1


In [18]:
# List the final column names.
titanic.columns.tolist()

['Survived',
 'SibSp',
 'Parch',
 'Fare',
 'AgeGrp',
 'Pclass_2',
 'Pclass_3',
 'Sex_male',
 'Embarked_Q',
 'Embarked_S']