# 모듈, 데이터 가져오기

In [2]:
import numpy as np
import pandas as pd
import sklearn
import matplotlib.pyplot as plt
import seaborn as sns
import re

from sklearn.tree import DecisionTreeClassifier

In [3]:
DATA_PATH = "../data/titanic/"

In [4]:
df = pd.read_csv(DATA_PATH+"train.csv")
SEED = 42

In [5]:
df.shape

(891, 12)

In [6]:
df.columns

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')

In [7]:
df.columns = [col.lower() for col in df.columns]
df.columns

Index(['passengerid', 'survived', 'pclass', 'name', 'sex', 'age', 'sibsp',
       'parch', 'ticket', 'fare', 'cabin', 'embarked'],
      dtype='object')

In [8]:
df.head()

Unnamed: 0,passengerid,survived,pclass,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [9]:
target = df.pop("survived")
target

0      0
1      1
2      1
3      1
4      0
      ..
886    0
887    1
888    0
889    1
890    0
Name: survived, Length: 891, dtype: int64

In [10]:
df

Unnamed: 0,passengerid,pclass,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked
0,1,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...
886,887,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
887,888,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
888,889,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
889,890,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


# 수치형, 범주형 데이터 분리

In [11]:
df_numbers = df.select_dtypes(include=np.number)
df_numbers.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 6 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   passengerid  891 non-null    int64  
 1   pclass       891 non-null    int64  
 2   age          714 non-null    float64
 3   sibsp        891 non-null    int64  
 4   parch        891 non-null    int64  
 5   fare         891 non-null    float64
dtypes: float64(2), int64(4)
memory usage: 41.9 KB


In [12]:
df_objects = df.select_dtypes(include=np.object_)
df_objects.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   name      891 non-null    object
 1   sex       891 non-null    object
 2   ticket    891 non-null    object
 3   cabin     204 non-null    object
 4   embarked  889 non-null    object
dtypes: object(5)
memory usage: 34.9+ KB


In [13]:
df_numbers = df_numbers.drop(columns="passengerid")

# 결측치 조사

In [48]:
df_numbers.isnull().sum(), df_objects.isnull().sum()

(pclass      0
 age       177
 sibsp       0
 parch       0
 fare        0
 dtype: int64,
 sex           0
 ticket        0
 cabin       687
 embarked      2
 header        0
 family        0
 dtype: int64)

In [51]:
df_numbers[["pclass", "fare", "age"]].loc[df_numbers["age"].isnull()]

Unnamed: 0,pclass,fare,age
5,3,8.4583,
17,2,13.0000,
19,3,7.2250,
26,3,7.2250,
28,3,7.8792,
...,...,...,...
859,3,7.2292,
863,3,69.5500,
868,3,9.5000,
878,3,7.8958,


In [78]:
df_numbers["fare"].describe()

count    891.000000
mean      32.204208
std       49.693429
min        0.000000
25%        7.910400
50%       14.454200
75%       31.000000
max      512.329200
Name: fare, dtype: float64

In [87]:
df_numbers["fare_div_5"] = pd.Series([x//5 for x in df_numbers["fare"]])
df_numbers["fare_div_5"]

0       1.0
1      14.0
2       1.0
3      10.0
4       1.0
       ... 
886     2.0
887     6.0
888     4.0
889     6.0
890     1.0
Name: fare_div_5, Length: 891, dtype: float64

In [89]:
df_numbers["age_div_10"] = pd.Series([x//10 for x in df_numbers["age"]])
df_numbers["age_div_10"]

0      2.0
1      3.0
2      2.0
3      3.0
4      3.0
      ... 
886    2.0
887    1.0
888    NaN
889    2.0
890    3.0
Name: age_div_10, Length: 891, dtype: float64

In [91]:
pd.pivot_table(df_numbers, index="age_div_10", columns="pclass", values="fare", aggfunc="mean")

pclass,1,2,3
age_div_10,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0.0,128.319433,28.7402,24.33839
1.0,114.10625,20.934489,13.452276
2.0,101.603797,22.273113,10.273399
3.0,98.344334,19.616146,12.81473
4.0,67.985141,23.125,13.250124
5.0,74.60077,16.025,7.7007
6.0,64.262177,20.0,7.858333
7.0,51.719467,10.5,7.7625
8.0,30.0,,


In [92]:
df.loc[df["embarked"].isnull()]

Unnamed: 0,passengerid,pclass,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,survived
61,62,1,"Icard, Miss. Amelie",female,38.0,0,0,113572,80.0,B28,,1
829,830,1,"Stone, Mrs. George Nelson (Martha Evelyn)",female,62.0,0,0,113572,80.0,B28,,1


In [94]:
df.loc[df["pclass"] == 1]

Unnamed: 0,passengerid,pclass,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,survived
1,2,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,1
3,4,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S,1
6,7,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.8625,E46,S,0
11,12,1,"Bonnell, Miss. Elizabeth",female,58.0,0,0,113783,26.5500,C103,S,1
23,24,1,"Sloper, Mr. William Thompson",male,28.0,0,0,113788,35.5000,A6,S,1
...,...,...,...,...,...,...,...,...,...,...,...,...
871,872,1,"Beckwith, Mrs. Richard Leonard (Sallie Monypeny)",female,47.0,1,1,11751,52.5542,D35,S,1
872,873,1,"Carlsson, Mr. Frans Olof",male,33.0,0,0,695,5.0000,B51 B53 B55,S,0
879,880,1,"Potter, Mrs. Thomas Jr (Lily Alexenia Wilson)",female,56.0,0,1,11767,83.1583,C50,C,1
887,888,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S,1


In [68]:
df.loc[df_numbers["fare"] == max(df_numbers["fare"])]

Unnamed: 0,passengerid,pclass,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked
258,259,1,"Ward, Miss. Anna",female,35.0,0,0,PC 17755,512.3292,,C
679,680,1,"Cardeza, Mr. Thomas Drake Martinez",male,36.0,0,1,PC 17755,512.3292,B51 B53 B55,C
737,738,1,"Lesurer, Mr. Gustave J",male,35.0,0,0,PC 17755,512.3292,B101,C


In [14]:
df_objects.head()

Unnamed: 0,name,sex,ticket,cabin,embarked
0,"Braund, Mr. Owen Harris",male,A/5 21171,,S
1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,PC 17599,C85,C
2,"Heikkinen, Miss. Laina",female,STON/O2. 3101282,,S
3,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,113803,C123,S
4,"Allen, Mr. William Henry",male,373450,,S


In [16]:
name_header = re.compile(" [a-zA-Z]+\.")

In [17]:
nheaders = [name_header.search(x).group().strip() for x in df_objects["name"]]

In [18]:
df_objects['header'] = nheaders
df_objects

Unnamed: 0,name,sex,ticket,cabin,embarked,header
0,"Braund, Mr. Owen Harris",male,A/5 21171,,S,Mr.
1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,PC 17599,C85,C,Mrs.
2,"Heikkinen, Miss. Laina",female,STON/O2. 3101282,,S,Miss.
3,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,113803,C123,S,Mrs.
4,"Allen, Mr. William Henry",male,373450,,S,Mr.
...,...,...,...,...,...,...
886,"Montvila, Rev. Juozas",male,211536,,S,Rev.
887,"Graham, Miss. Margaret Edith",female,112053,B42,S,Miss.
888,"Johnston, Miss. Catherine Helen ""Carrie""",female,W./C. 6607,,S,Miss.
889,"Behr, Mr. Karl Howell",male,111369,C148,C,Mr.


In [19]:
df_objects["header"].unique()

array(['Mr.', 'Mrs.', 'Miss.', 'Master.', 'Don.', 'Rev.', 'Dr.', 'Mme.',
       'Ms.', 'Major.', 'Lady.', 'Sir.', 'Mlle.', 'Col.', 'Capt.',
       'Countess.', 'Jonkheer.'], dtype=object)

skit. 앤더슨 찾기...

In [74]:
df_objects.loc[list(map(lambda x: "Andersson" in x, df_objects["name"]))]

Unnamed: 0,sex,ticket,cabin,embarked,header,family,survived
13,male,347082,,S,Mr.,Andersson,0
68,female,3101281,,S,Miss.,Andersson,1
119,female,347082,,S,Miss.,Andersson,0
146,male,350043,,S,Mr.,Andersson,1
541,female,347082,,S,Miss.,Andersson,0
542,female,347082,,S,Miss.,Andersson,0
610,female,347082,,S,Mrs.,Andersson,0
813,female,347082,,S,Miss.,Andersson,0
850,male,347082,,S,Master.,Andersson,0


In [21]:
name_family = re.compile(".+\,")

In [22]:
nfamily = [name_family.search(x).group().replace(",", "") for x in df_objects["name"]]

In [23]:
df_objects["family"] = nfamily
df_objects["family"].nunique()

667

In [24]:
df_name = df_objects.pop("name")
df_name

0                                Braund, Mr. Owen Harris
1      Cumings, Mrs. John Bradley (Florence Briggs Th...
2                                 Heikkinen, Miss. Laina
3           Futrelle, Mrs. Jacques Heath (Lily May Peel)
4                               Allen, Mr. William Henry
                             ...                        
886                                Montvila, Rev. Juozas
887                         Graham, Miss. Margaret Edith
888             Johnston, Miss. Catherine Helen "Carrie"
889                                Behr, Mr. Karl Howell
890                                  Dooley, Mr. Patrick
Name: name, Length: 891, dtype: object

In [53]:
df_objects[["cabin", "embarked"]]

Unnamed: 0,cabin,embarked
0,,S
1,C85,C
2,,S
3,C123,S
4,,S
...,...,...
886,,S
887,B42,S
888,,S
889,C148,C


In [41]:
df_objects[["cabin", "embarked"]].loc[df_objects["embarked"].isnull()]

Unnamed: 0,cabin,embarked
61,B28,
829,B28,


In [52]:
df_objects[["cabin", "embarked"]].loc[df_objects["cabin"] == "B28"]

Unnamed: 0,cabin,embarked
61,B28,
829,B28,


In [69]:
pd.concat([df_numbers, df_objects, target], axis=1) 

Unnamed: 0,pclass,age,sibsp,parch,fare,sex,ticket,cabin,embarked,header,family,survived
0,3,22.0,1,0,7.2500,male,A/5 21171,,S,Mr.,Braund,0
1,1,38.0,1,0,71.2833,female,PC 17599,C85,C,Mrs.,Cumings,1
2,3,26.0,0,0,7.9250,female,STON/O2. 3101282,,S,Miss.,Heikkinen,1
3,1,35.0,1,0,53.1000,female,113803,C123,S,Mrs.,Futrelle,1
4,3,35.0,0,0,8.0500,male,373450,,S,Mr.,Allen,0
...,...,...,...,...,...,...,...,...,...,...,...,...
886,2,27.0,0,0,13.0000,male,211536,,S,Rev.,Montvila,0
887,1,19.0,0,0,30.0000,female,112053,B42,S,Miss.,Graham,1
888,3,,1,2,23.4500,female,W./C. 6607,,S,Miss.,Johnston,0
889,1,26.0,0,0,30.0000,male,111369,C148,C,Mr.,Behr,1


In [72]:
df

Unnamed: 0,passengerid,pclass,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,survived
0,1,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S,0
1,2,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,1
2,3,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S,1
3,4,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S,1
4,5,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S,0
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S,0
887,888,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S,1
888,889,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S,0
889,890,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C,1


### 생각 좀 해보자 해야할거 뭐있냐
- 결측치 제거
- feature 추가
    - 일단 name 관련은 그냥 날려

# 데이터 트레인, 테스트 세트 분리

In [23]:
X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(df, target, test_size=0.2, random_state=3)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((712, 11), (179, 11), (712,), (179,))

# 일단 모델을 돌려보자

In [17]:
dtc = DecisionTreeClassifier()

In [24]:
dtc.fit(X_train, y_train)
did_nothing_score = round(dtc.score(X_test, y_test), 4)
print(f'전처리 안한거 결과: {did_nothing_score}')

ValueError: could not convert string to float: 'Andersson, Mr. August Edvard ("Wennerstrom")'