### 타이타닉 데이터를 이용한 데이터 탐색

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler

In [3]:
# 데이터 불러오기
titanic = pd.read_csv('./data/titanic2.csv')
titanic.head()

Unnamed: 0,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,seat,live
0,"Allen, Miss. Elisabeth Walton",female,29.0,0.0,0.0,24160,211.3375,B5,S,1st,live
1,"Allison, Master. Hudson Trevor",male,0.9167,1.0,2.0,113781,151.55,C22 C26,S,1st,live
2,"Allison, Miss. Helen Loraine",female,2.0,1.0,2.0,113781,151.55,C22 C26,S,1st,dead
3,"Allison, Mr. Hudson Joshua Creighton",male,30.0,1.0,2.0,113781,151.55,C22 C26,S,1st,dead
4,"Allison, Mrs. Hudson J C (Bessie Waldo Daniels)",female,25.0,1.0,2.0,113781,151.55,C22 C26,S,1st,dead


#### 전처리

In [4]:
# 결측치 처리
# age : 대체값으로 보충
# cabin : 컬럼 삭제
# fare, embarked : 행 삭제
titanic.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1310 entries, 0 to 1309
Data columns (total 11 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   name      1309 non-null   object 
 1   sex       1309 non-null   object 
 2   age       1046 non-null   float64
 3   sibsp     1309 non-null   float64
 4   parch     1309 non-null   float64
 5   ticket    1309 non-null   object 
 6   fare      1308 non-null   float64
 7   cabin     295 non-null    object 
 8   embarked  1307 non-null   object 
 9   seat      1310 non-null   object 
 10  live      1310 non-null   object 
dtypes: float64(4), object(7)
memory usage: 112.7+ KB


In [5]:
titanic.age.fillna(np.mean(titanic.age), inplace=True)

In [6]:
titanic.drop(columns=['cabin'], inplace=True)

In [7]:
titanic.dropna(inplace=True)

In [8]:
titanic.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1306 entries, 0 to 1308
Data columns (total 10 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   name      1306 non-null   object 
 1   sex       1306 non-null   object 
 2   age       1306 non-null   float64
 3   sibsp     1306 non-null   float64
 4   parch     1306 non-null   float64
 5   ticket    1306 non-null   object 
 6   fare      1306 non-null   float64
 7   embarked  1306 non-null   object 
 8   seat      1306 non-null   object 
 9   live      1306 non-null   object 
dtypes: float64(4), object(6)
memory usage: 112.2+ KB


In [9]:
# 레이블 인코딩
encoder = LabelEncoder()
encoder.fit(titanic.sex)
titanic['gender'] = encoder.transform(titanic.sex)
titanic.loc[:, ['sex', 'gender']].head()

Unnamed: 0,sex,gender
0,female,0
1,male,1
2,female,0
3,male,1
4,female,0


In [10]:
encoder = LabelEncoder()
encoder.fit(titanic.embarked)
titanic['embark_town'] = encoder.transform(titanic.embarked)
titanic.loc[:, ['embarked', 'embark_town']].head()

Unnamed: 0,embarked,embark_town
0,S,2
1,S,2
2,S,2
3,S,2
4,S,2


In [11]:
encoder = LabelEncoder()
encoder.fit(titanic.seat)
titanic['pclass'] = encoder.transform(titanic.seat)
titanic.loc[:, ['seat', 'pclass']].head()

Unnamed: 0,seat,pclass
0,1st,0
1,1st,0
2,1st,0
3,1st,0
4,1st,0


In [12]:
encoder = LabelEncoder()
encoder.fit(titanic.live)
titanic['survived'] = encoder.transform(titanic.live)
titanic.loc[:, ['live', 'survived']].head()

Unnamed: 0,live,survived
0,live,1
1,live,1
2,dead,0
3,dead,0
4,dead,0


In [13]:
titanic.name[:5]

0                      Allen, Miss. Elisabeth Walton
1                     Allison, Master. Hudson Trevor
2                       Allison, Miss. Helen Loraine
3               Allison, Mr. Hudson Joshua Creighton
4    Allison, Mrs. Hudson J C (Bessie Waldo Daniels)
Name: name, dtype: object

In [14]:
titanic['title'] = titanic.name.str.split(' ')\
                    .str[1].replace('\.', '', regex=True)\
                    .replace(',', '', regex=True)
titanic.loc[:, ['name', 'title']].head()

Unnamed: 0,name,title
0,"Allen, Miss. Elisabeth Walton",Miss
1,"Allison, Master. Hudson Trevor",Master
2,"Allison, Miss. Helen Loraine",Miss
3,"Allison, Mr. Hudson Joshua Creighton",Mr
4,"Allison, Mrs. Hudson J C (Bessie Waldo Daniels)",Mrs


In [15]:
# Rev 신부, Dr 의사, Col 대령, Capt 선장, Magor 소령 
titanic.title.value_counts()

title
Mr             735
Miss           255
Mrs            190
Master          59
y                8
Rev              8
Dr               8
Planke           4
Col              4
Impe             3
Billiard         3
Messemaeker      2
Mlle             2
Major            2
Carlo            2
Ms               2
Gordon           2
Shawah           1
Cruyssen         1
Steen            1
Walle            1
Velde            1
Melkebeke        1
Palmquist        1
Capt             1
Pelsmaeker       1
Mulder           1
Khalil           1
Mme              1
der              1
Don              1
the              1
Jonkheer         1
Brito            1
Name: count, dtype: int64

In [16]:
encoder = LabelEncoder()
encoder.fit(titanic.title)
titanic['titles'] = encoder.transform(titanic.title)
titanic.loc[:, ['title', 'titles']].head()

Unnamed: 0,title,titles
0,Miss,16
1,Master,13
2,Miss,16
3,Mr,19
4,Mrs,20


In [22]:
# 분석에 사용할 컬럼 선택
data = titanic.loc[:, ['titles', 'age', 'sibsp', 'parch', 'fare', 'gender', 'embark_town', 'pclass','survived']]
target = titanic.survived  # 1:live, 0:dead

In [23]:
# 작업결과 파일로 저장
result = titanic.loc[:, ['titles', 'age', 'sibsp', 'parch', 'fare', 'gender','embark_town','pclass','survived']]
result.to_csv('titanic3.csv',index=False)

#### 탐색적 분석