### 타이타닉 데이터 분석
+ 다양한 머신러닝 알고리즘을 이용해서 교차검증 방식으로 모델을 훈련시키고 예측 정확도를 평가해 봄

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sklearn

In [2]:
# Read the prepared Titanic passenger table from disk and preview its first five rows.
titanic = pd.read_csv(filepath_or_buffer='csv/titanic2.csv')
titanic.head(n=5)

Unnamed: 0,pclass,survived,name,sex,age,sibsp,parch,ticket,fare,embarked,life,seat,port
0,1,1,"Allen, Miss. Elisabeth Walton",female,29.0,0,0,24160,211.3375,S,live,1st,southampthon
1,1,1,"Allison, Master. Hudson Trevor",male,0.9167,1,2,113781,151.55,S,live,1st,southampthon
2,1,0,"Allison, Miss. Helen Loraine",female,2.0,1,2,113781,151.55,S,dead,1st,southampthon
3,1,0,"Allison, Mr. Hudson Joshua Creighton",male,30.0,1,2,113781,151.55,S,dead,1st,southampthon
4,1,0,"Allison, Mrs. Hudson J C (Bessie Waldo Daniels)",female,25.0,1,2,113781,151.55,S,dead,1st,southampthon


In [3]:
# Summarize the frame: column names, non-null counts, dtypes and memory usage.
titanic.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1306 entries, 0 to 1305
Data columns (total 13 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   pclass    1306 non-null   int64  
 1   survived  1306 non-null   int64  
 2   name      1306 non-null   object 
 3   sex       1306 non-null   object 
 4   age       1306 non-null   float64
 5   sibsp     1306 non-null   int64  
 6   parch     1306 non-null   int64  
 7   ticket    1306 non-null   object 
 8   fare      1306 non-null   float64
 9   embarked  1306 non-null   object 
 10  life      1306 non-null   object 
 11  seat      1306 non-null   object 
 12  port      1306 non-null   object 
dtypes: float64(2), int64(4), object(7)
memory usage: 132.8+ KB


In [4]:
# Check how the label is distributed between the 'dead' and 'live' outcomes.
titanic.life.value_counts()

dead    808
live    498
Name: life, dtype: int64

In [5]:
# Check the distribution of the seat-class feature (`seat`: 1st / 2nd / 3rd).
# (This counts `seat`, not sex.)
titanic.seat.value_counts()

3rd    708
1st    321
2nd    277
Name: seat, dtype: int64

In [6]:
# Check the distribution of the embarkation-port feature (`port`).
# NOTE: the labels stored in the data are spelled 'southampthon' and
# 'qeenstown'; any filter on this column must use those exact spellings.
titanic.port.value_counts()

southampthon    913
cherbourg       270
qeenstown       123
Name: port, dtype: int64

In [7]:
# Models handle numeric values better than string values, so string-valued
# columns have to be converted to numbers before analysis.

# Label-encode sex into a derived `gender` column: 'female' -> 0, anything
# else -> 1. A vectorized comparison replaces the per-row lambda; the explicit
# int64 cast keeps the dtype the same on every platform.
titanic['gender'] = titanic['sex'].ne('female').astype('int64')
# Preview the source and derived columns side by side. Selecting by label
# (instead of by position) stays correct even if the column order changes.
titanic[['sex', 'gender']].head()

Unnamed: 0,sex,gender
0,female,0
1,male,1
2,female,0
3,male,1
4,female,0


In [8]:
# 승선 위치를 레이블인코딩으로 숫자형으로 변환 -> 파생변수
# Label-encode the embarkation code into a derived `harbor` column:
# 'C' -> 0, 'S' -> 1, and every other value (including missing) -> 2.
# NOTE(review): these integers are arbitrary labels, not an ordering; a linear
# model will still treat them as magnitudes - consider one-hot encoding.
titanic['harbor'] = (
    titanic['embarked']
    .map({'C': 0, 'S': 1})  # unmapped values become NaN ...
    .fillna(2)              # ... and fall into the catch-all code
    .astype('int64')
)
# Preview the source and derived columns side by side, selected by label so
# the preview does not depend on column positions.
titanic[['embarked', 'harbor']].head()

Unnamed: 0,embarked,harbor
0,S,1
1,S,1
2,S,1
3,S,1
4,S,1


In [9]:
# Count passengers per raw embarkation code, as a cross-check for the
# `harbor` encoding derived from this column.
titanic.embarked.value_counts()

S    913
C    270
Q    123
Name: embarked, dtype: int64

In [10]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score

### 분석에 필요한 컬럼을 뽑아 특성/레이블을 만들기

In [11]:
# Feature matrix, selected by column name rather than by position so that a
# change in column order (or running the encoding cells in a different order)
# cannot silently swap in the wrong columns.
data = titanic[[
    'pclass',
    'age',
    'sibsp',
    'parch',
    'fare',
    'gender',   # derived integer encoding of `sex`
    'harbor',   # derived integer encoding of `embarked`
]]
# Label vector: the integer `survived` column.
target = titanic['survived']

### 훈련/평가 데이터 분할

In [12]:
# Use 70% of the rows for training and the remainder for evaluation; the fixed
# seed makes the split repeatable.
# NOTE(review): no `stratify` argument is passed, so the class ratio in the two
# parts is left to chance - confirm that is intended.
Xtrain, Xtest, ytrain, ytest = train_test_split(
    data,
    target,
    train_size=0.7,
    random_state=2111041110,
)

### 의사결정나무

In [13]:
# Fit a decision tree with library-default hyperparameters on the training split.
# NOTE(review): no random_state is set here, so the fitted tree (and the
# accuracy computed from it) may differ between runs - confirm whether a seed
# should be fixed like it is for the train/test split.
dtclf = DecisionTreeClassifier()
dtclf.fit(Xtrain, ytrain)

DecisionTreeClassifier()

In [14]:
# Predict labels for the held-out rows with the fitted decision tree.
pred = dtclf.predict(Xtest)

In [15]:
# Fraction of held-out rows whose predicted label matches the true label.
accuracy_score(ytest, pred)

0.7780612244897959

### 로지스틱 회귀

In [16]:
# Fit a logistic regression with library-default hyperparameters on the
# training split.
# NOTE(review): the cross-validation cell further down raises max_iter to 300;
# check whether this default-iteration fit emits a convergence warning.
lrclf = LogisticRegression()
lrclf.fit(Xtrain, ytrain)

LogisticRegression()

In [17]:
# Predict the held-out rows with the fitted logistic regression and report
# the fraction of correct predictions.
pred2 = lrclf.predict(Xtest)
accuracy_score(ytest, pred2)

0.8010204081632653

### 교차검증

In [18]:
# Mean accuracy over 10 cross-validation folds for a depth-3 decision tree.
# This rebinds `dtclf` to a new, shallower estimator.
# NOTE(review): an integer `cv` does not shuffle the rows, and the preview of
# the table suggests it may be ordered by pclass - if so, the folds are not
# representative; consider StratifiedKFold(shuffle=True, random_state=...).
dtclf = DecisionTreeClassifier(max_depth=3)
scores = cross_val_score(
    estimator=dtclf,
    X=data,
    y=target,
    scoring='accuracy',
    cv=10,
)
scores.mean()

0.7487081620669407

In [19]:
# Mean accuracy over 10 cross-validation folds for logistic regression.
# max_iter is raised to 300 to give the solver more iterations to converge.
# This rebinds `lrclf` to a new estimator.
lrclf = LogisticRegression(max_iter=300)
scores = cross_val_score(
    estimator=lrclf,
    X=data,
    y=target,
    scoring='accuracy',
    cv=10,
)
scores.mean()

0.7501820317087493

In [20]:
# Mean accuracy over 10 cross-validation folds for a random forest with
# library-default hyperparameters.
# NOTE(review): no random_state is set, so this score may vary between runs.
rfclf = RandomForestClassifier()
scores = cross_val_score(
    estimator=rfclf,
    X=data,
    y=target,
    scoring='accuracy',
    cv=10,
)
scores.mean()

0.7348737522019965

In [21]:
# Evaluating machine-learning models:
# is accuracy alone a valid way to judge a model's performance?
# Revisit the label distribution - the two classes are not equally frequent.
titanic.life.value_counts()

dead    808
live    498
Name: life, dtype: int64

In [22]:
# Survival outcome broken down by sex.
titanic.groupby(['sex','life'])['life'].count()

# Women survived at a much higher rate than men, so even a simple conditional
# rule could serve as a model:
# input female -> predict survived // male -> predict dead

sex     life
female  dead    127
        live    337
male    dead    681
        live    161
Name: life, dtype: int64