# titanic 데이터 분류 - 순서
1. data 로드
2. 데이터 전체적 모양 확인(info, describe, null값, 시각화)
3. 데이터 결측치 처리(제거 또는 대체)
4. 카테고리 데이터/문자열 데이터 인코딩
5. 알고리즘에 불필요한 속성 제거
6. 교차 검증하고 정확도 높은 모델 선택

- 컬럼이 뭘 뜻하는지 알아보고
- 데이터 컬럼의 타입을 확인
- Null값 확인 후 처리(제거 또는 대체)
- Cabin 컬럼 수정
- 시각화(선택..)
- 문자열 컬럼 레이블인코딩
- train-test 분리
- DT, RF, LR 모델 사용, accuracy 평가
- 교차 검증(KFold, StratifiedKFold, cross_val_score, GridSearchCV)
- 하이퍼 파라미터 : {'max_depth': [2, 3, 5, 10], 'min_samples_split': [2, 3, 5], 'min_samples_leaf': [1, 5, 8]}

In [26]:
import numpy as np
import pandas as pd

# Load the Titanic training set into a DataFrame.
titanic = pd.read_csv("titanic/titanic_train.csv")
# Print each column's dtype and non-null count to spot missing values.
titanic.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.6+ KB


In [27]:
# Count the missing values in every column.
titanic.isna().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [28]:
# Missing values: 177 in Age, 687 in Cabin, 2 in Embarked

In [29]:
# Fill missing values by assigning the filled column back to the DataFrame.
# Calling fillna(..., inplace=True) on a column selected with titanic[...]
# is chained assignment: recent pandas versions warn about it, and under
# Copy-on-Write the original DataFrame is left unchanged.
# Age (numeric) is imputed with the column mean; Cabin and Embarked
# (strings) receive the placeholder category 'N'.
titanic['Age'] = titanic['Age'].fillna(titanic['Age'].mean())
titanic['Cabin'] = titanic['Cabin'].fillna('N')
titanic['Embarked'] = titanic['Embarked'].fillna('N')

In [30]:
# Re-inspect the non-null counts after the fill step.
titanic.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            891 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          891 non-null object
Embarked       891 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.6+ KB


In [31]:
# Frequency of each value in the Sex column.
titanic.Sex.value_counts()

male      577
female    314
Name: Sex, dtype: int64

In [32]:
# Frequency of each value in the Ticket column.
titanic.Ticket.value_counts()

347082               7
1601                 7
CA. 2343             7
CA 2144              6
3101295              6
347088               6
S.O.C. 14879         5
382652               5
347077               4
17421                4
19950                4
PC 17757             4
W./C. 6608           4
LINE                 4
4133                 4
113760               4
2666                 4
113781               4
349909               4
F.C.C. 13529         3
363291               3
35273                3
239853               3
C.A. 34651           3
371110               3
13502                3
345773               3
24160                3
PC 17582             3
SC/Paris 2123        3
                    ..
2649                 1
A/5. 2151            1
W./C. 6609           1
PC 17474             1
113051               1
364512               1
237798               1
315037               1
28220                1
STON/O 2. 3101292    1
28425                1
4138                 1
3101276    

In [33]:
# Frequency of each value in the Embarked column ('N' marks the filled-in missing entries).
titanic.Embarked.value_counts()

S    644
C    168
Q     77
N      2
Name: Embarked, dtype: int64

In [34]:
# Frequency of each value in the Cabin column ('N' marks the filled-in missing entries).
titanic.Cabin.value_counts()

N              687
G6               4
B96 B98          4
C23 C25 C27      4
E101             3
F33              3
C22 C26          3
F2               3
D                3
B35              2
C78              2
C126             2
B5               2
C92              2
E8               2
E67              2
B18              2
B49              2
C125             2
E25              2
D36              2
C124             2
C68              2
B20              2
F G73            2
C83              2
B77              2
F4               2
E24              2
D35              2
              ... 
B71              1
E40              1
C7               1
D46              1
C47              1
F E69            1
F38              1
B86              1
B102             1
D47              1
E58              1
C106             1
C103             1
E17              1
A31              1
C82              1
E36              1
A36              1
E34              1
C101             1
A7               1
C49         

In [35]:
# Keep only the leading character of each Cabin value.
titanic['Cabin'] = titanic['Cabin'].str.slice(stop=1)

In [36]:
# Frequency of each Cabin value after reducing it to its first character.
titanic.Cabin.value_counts()

N    687
C     59
B     47
D     33
E     32
A     15
F     13
G      4
T      1
Name: Cabin, dtype: int64

In [37]:
# Preview the first rows of the DataFrame.
titanic.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,N,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,N,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,N,S


In [38]:
# Helper that maps an age to an age-group label
def age_cat(age):
    """Return the age-group label for a single age value.

    Upper bounds are inclusive and checked in ascending order:
    <= 3 'Baby', <= 12 'Child', <= 20 'Teen', <= 35 'Young_Adult',
    <= 60 'Adult'. Any value that satisfies none of these comparisons
    (including NaN, for which every comparison is False) yields 'Elder'.
    """
    upper_bounds = (
        (3, 'Baby'),
        (12, 'Child'),
        (20, 'Teen'),
        (35, 'Young_Adult'),
        (60, 'Adult'),
    )
    for limit, label in upper_bounds:
        if age <= limit:
            return label
    return 'Elder'

In [39]:
# Label every row's Age with its age group and store it as a new column.
titanic['Age_cat'] = [age_cat(age) for age in titanic['Age']]

In [40]:
# Remove the Age column from the DataFrame in place.
titanic.drop(columns=['Age'], inplace=True)

In [41]:
# Categorical columns: Pclass, Sex, Cabin, Embarked, Age_cat
# Columns that can be discarded: PassengerId, Name, Ticket, Fare
titanic.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Age_cat
0,1,0,3,"Braund, Mr. Owen Harris",male,1,0,A/5 21171,7.25,N,S,Young_Adult
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,1,0,PC 17599,71.2833,C,C,Adult
2,3,1,3,"Heikkinen, Miss. Laina",female,0,0,STON/O2. 3101282,7.925,N,S,Young_Adult
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,1,0,113803,53.1,C,S,Young_Adult
4,5,0,3,"Allen, Mr. William Henry",male,0,0,373450,8.05,N,S,Young_Adult


In [44]:
# One-hot encode Age_cat: one indicator column per age-group label.
Age_cat_dummies = pd.get_dummies(titanic['Age_cat'])
Age_cat_dummies.head()

Unnamed: 0,Adult,Baby,Child,Elder,Teen,Young_Adult
0,0,0,0,0,0,1
1,1,0,0,0,0,0
2,0,0,0,0,0,1
3,0,0,0,0,0,1
4,0,0,0,0,0,1


In [46]:
# One-hot encode Sex: one indicator column per distinct value.
Sex_dummies = pd.get_dummies(titanic['Sex'])
Sex_dummies.head()

Unnamed: 0,female,male
0,0,1
1,1,0
2,1,0
3,1,0
4,0,1


In [47]:
# One-hot encode Cabin and Embarked, each into its own indicator DataFrame.
Cabin_dummies = pd.get_dummies(titanic['Cabin'])
Embarked_dummies = pd.get_dummies(titanic['Embarked'])

In [48]:
# Remove, in place, the columns noted as discardable plus Sex, Cabin and
# Embarked, whose one-hot encodings are held in separate DataFrames.
titanic.drop(
    columns=[
        'PassengerId', 'Name', 'Ticket', 'Fare',
        'Sex', 'Cabin', 'Embarked',
    ],
    inplace=True,
)