# 제출물
```gender_submission.csv```를 확인해보면 여성 승객이 살아남았는지 아닌지를 확인하는 문제이다

# 라이브러리 import

In [2]:
import os
from glob import glob

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# 있고 없고의 차이를 알자
pd.set_option('display.max_rows', 100)

In [2]:
os.listdir()

['titanic.ipynb',
 'test.csv',
 'train.csv',
 '.ipynb_checkpoints',
 'titanic.zip',
 'gender_submission.csv',
 'titanic_pandas.ipynb']

In [3]:
train_path = 'train.csv'
test_path = 'test.csv'
sub_path = 'gender_submission.csv'

In [4]:
train = pd.read_csv(train_path)
test = pd.read_csv(test_path)
sub = pd.read_csv(sub_path)

# 데이터 확인

In [5]:
train.shape, test.shape, sub.shape

((891, 12), (418, 11), (418, 2))

In [6]:
train.head(2)
train.tail(2)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0,C148,C
890,891,0,3,"Dooley, Mr. Patrick",male,32.0,0,0,370376,7.75,,Q


In [7]:
test.head(2)

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S


In [8]:
sub.head(2)

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,1


In [9]:
# copy? deep copy?
train_copy = train.copy()
test_copy = test.copy()

In [10]:
# 예측해야하는 column 찾기 - label
# 아... 이런식으로도 확인할 수 있는건가
set(train.columns) - set(test.columns)

{'Survived'}

In [11]:
label = train['Survived']     # train데이터중에 survived column을 가져온다.
train_index = train.shape[0]  # (891, 12) 중에서 891

In [12]:
# train, test concat 후 결합
train.drop('Survived', axis=1, inplace=True)
total_df = pd.concat([train, test]).reset_index(drop=True) # 결합 후 index 리셋

- 컬럼 삭제
    - ```drop```
    - ```axis=1``` : column을 뜻한다. (0일 경우에는 row, 이건 default값)
    - ```inplace``` : drop한 후의 데이터프레임으로 기존 데이터프레임을 대체

In [13]:
# drop후 train을 확인해보면 survived가 없어져있다.
train.head(2)

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C


In [14]:
ID = total_df['PassengerId']

In [15]:
# NA값 확인
print(total_df.isna().sum())

PassengerId       0
Pclass            0
Name              0
Sex               0
Age             263
SibSp             0
Parch             0
Ticket            0
Fare              1
Cabin          1014
Embarked          2
dtype: int64


In [16]:
# 비율로 확인
print((total_df.isna().sum() / len(total_df))*100)

PassengerId     0.000000
Pclass          0.000000
Name            0.000000
Sex             0.000000
Age            20.091673
SibSp           0.000000
Parch           0.000000
Ticket          0.000000
Fare            0.076394
Cabin          77.463713
Embarked        0.152788
dtype: float64


In [17]:
# 77% NA -> 삭제한다
total_df.drop('Cabin', axis=1, inplace=True)

In [18]:
# 통계 수치 확인 -> numerical dtype only
# 숫자로 있는 데이터들만 확인 가능하다
total_df.describe()

Unnamed: 0,PassengerId,Pclass,Age,SibSp,Parch,Fare
count,1309.0,1309.0,1046.0,1309.0,1309.0,1308.0
mean,655.0,2.294882,29.881138,0.498854,0.385027,33.295479
std,378.020061,0.837836,14.413493,1.041658,0.86556,51.758668
min,1.0,1.0,0.17,0.0,0.0,0.0
25%,328.0,2.0,21.0,0.0,0.0,7.8958
50%,655.0,3.0,28.0,0.0,0.0,14.4542
75%,982.0,3.0,39.0,1.0,0.0,31.275
max,1309.0,3.0,80.0,8.0,9.0,512.3292


In [19]:
# Age는 평균 값으로 NA 값을 채운다
# 다른 방법도 가능
total_df['Age'] = total_df['Age'].fillna(np.int32(total_df['Age'].mean()))

In [20]:
# 이것도 뭐지 왜 확인하는 거지
total_df[total_df['Embarked'].isna() == True]

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked
61,62,1,"Icard, Miss. Amelie",female,38.0,0,0,113572,80.0,
829,830,1,"Stone, Mrs. George Nelson (Martha Evelyn)",female,62.0,0,0,113572,80.0,


In [21]:
# 이건 뭐지 요금하고 왜 뭐야
total_df.groupby('Embarked')['Fare'].mean()

Embarked
C    62.336267
Q    12.409012
S    27.418824
Name: Fare, dtype: float64

In [22]:
# Embarked는 평균 요금이 가장 비슷한 곳으로 선정하여 NA 값을 채움
total_df['Embarked'] = total_df['Embarked'].fillna('C')

In [23]:
# 변수 unique 확인
# unique가 아닌 nunique
for col in total_df.columns:
    print('Column : {} \nunique 수 : {}'.format(col, total_df[col].nunique()))

Column : PassengerId 
unique 수 : 1309
Column : Pclass 
unique 수 : 3
Column : Name 
unique 수 : 1307
Column : Sex 
unique 수 : 2
Column : Age 
unique 수 : 98
Column : SibSp 
unique 수 : 7
Column : Parch 
unique 수 : 8
Column : Ticket 
unique 수 : 929
Column : Fare 
unique 수 : 281
Column : Embarked 
unique 수 : 3


In [24]:
# 범주형 변수 확인
cat_cols = ['Pclass', 'Sex', 'SibSp', 'Parch', 'Embarked']
for col in cat_cols:
    print('Column : {} \nunique : {}'.format(col, total_df[col].unique()))

Column : Pclass 
unique : [3 1 2]
Column : Sex 
unique : ['male' 'female']
Column : SibSp 
unique : [1 0 3 4 2 5 8]
Column : Parch 
unique : [0 1 2 5 3 4 6 9]
Column : Embarked 
unique : ['S' 'C' 'Q']


# Feature Engineering

In [25]:
# Family size
total_df['fam_size'] = total_df['SibSp'] + total_df['Parch']

In [26]:
# PassengerId와 name이 가리키는 대상이 같음 -> Name 삭제
# 사람마다 고유한 ID가 있기 때문에? 이름을 삭제하는 것 같다.
total_df.drop('Name', axis=1, inplace=True)

In [27]:
# One-hot Encoding
total_df['male'] = total_df['Sex'].map(lambda x : 1 if x=='male' else 0).astype(np.uint8)
total_df['female'] = total_df['Sex'].map(lambda x : 1 if x=='female' else 0).astype(np.uint8)

In [28]:
total_df['Pclass-1'] = total_df['Pclass'].map(lambda x : 1 if x == 1 else 0).astype(np.uint8)
total_df['Pclass-2'] = total_df['Pclass'].map(lambda x : 1 if x == 2 else 0).astype(np.uint8)
total_df['Pclass-3'] = total_df['Pclass'].map(lambda x : 1 if x == 3 else 0).astype(np.uint8)

In [29]:
total_df['Embarked-S'] = total_df['Embarked'].map(lambda x : 1 if x == 'S' else 0).astype(np.uint8)
total_df['Embarked-C'] = total_df['Embarked'].map(lambda x : 1 if x == 'C' else 0).astype(np.uint8)
total_df['Embarked-Q'] = total_df['Embarked'].map(lambda x : 1 if x == 'Q' else 0).astype(np.uint8)

In [30]:
total_df.head()

Unnamed: 0,PassengerId,Pclass,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked,fam_size,male,female,Pclass-1,Pclass-2,Pclass-3,Embarked-S,Embarked-C,Embarked-Q
0,1,3,male,22.0,1,0,A/5 21171,7.25,S,1,1,0,0,0,1,1,0,0
1,2,1,female,38.0,1,0,PC 17599,71.2833,C,1,0,1,1,0,0,0,1,0
2,3,3,female,26.0,0,0,STON/O2. 3101282,7.925,S,0,0,1,0,0,1,1,0,0
3,4,1,female,35.0,1,0,113803,53.1,S,1,0,1,1,0,0,1,0,0
4,5,3,male,35.0,0,0,373450,8.05,S,0,1,0,0,0,1,1,0,0


In [31]:
total_df['Age'].describe()

count    1309.000000
mean       29.704102
std        12.888034
min         0.170000
25%        22.000000
50%        29.000000
75%        35.000000
max        80.000000
Name: Age, dtype: float64

In [32]:
# 승객의 75프로 35세 이하
# 35세 이하와 36세 이상으로 범주화
total_df['Age'] = total_df['Age'].map(lambda x : 0 if x <= 35 else 1)
total_df['Age_young'] = total_df['Age'].map(lambda x : 1 if x == 0 else 0)
total_df['Age_old'] = total_df['Age'].map(lambda x : 1 if x == 1 else 0)

In [33]:
total_df.drop(['PassengerId', 'Sex', 'SibSp', 'Parch', 'Ticket', 'Embarked', 'Pclass', 'Age'], axis=1, inplace=True)

In [34]:
total_df.head()

Unnamed: 0,Fare,fam_size,male,female,Pclass-1,Pclass-2,Pclass-3,Embarked-S,Embarked-C,Embarked-Q,Age_young,Age_old
0,7.25,1,1,0,0,0,1,1,0,0,1,0
1,71.2833,1,0,1,1,0,0,0,1,0,0,1
2,7.925,0,0,1,0,0,1,1,0,0,1,0
3,53.1,1,0,1,1,0,0,1,0,0,1,0
4,8.05,0,1,0,0,0,1,1,0,0,1,0


In [35]:
# 일반적인 리스트의 인덱스 슬라이싱하고는 다르다
# loc을 쓸때는 다르므로 주의
x_tr = total_df.loc[:train_index-1]
x_tst = total_df.loc[train_index:]

In [36]:
y_tr = label[:train_index]
y_tst = label[train_index:]


In [37]:
rf = RandomForestClassifier(random_state=2080)

In [38]:
rf.fit(x_tr, y_tr)

RandomForestClassifier(random_state=2080)

In [41]:
x_tst.isna().sum()

Fare          1
fam_size      0
male          0
female        0
Pclass-1      0
Pclass-2      0
Pclass-3      0
Embarked-S    0
Embarked-C    0
Embarked-Q    0
Age_young     0
Age_old       0
dtype: int64

In [42]:
x_tst['Fare'] = x_tst['Fare'].fillna(train['Fare'].mean())

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [43]:
y_pred = rf.predict(x_tst)

In [44]:
sub['Survived'] = y_pred

In [48]:
!mkdir -p ./submissions

In [49]:
sub.to_csv('./submissions/titanic_submission_rf_baseline.csv',index=False)

# EDA

In [3]:
train_path = 'train.csv'
test_path = 'test.csv'
sub_path = 'gender_submission.csv'

train = pd.read_csv(train_path)
test = pd.read_csv(test_path)
sub = pd.read_csv(sub_path)

In [4]:
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [5]:
train.shape

(891, 12)

In [6]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB
