In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Matplotlib 3.6 renamed the bundled 'seaborn' style sheet to 'seaborn-v0_8' and
# newer releases no longer accept the old name, so use whichever one is available.
_seaborn_style = 'seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn'
plt.style.use(_seaborn_style)
sns.set(font_scale=1.5)  # enlarge fonts on every figure drawn below

import missingno as msno

import warnings
warnings.filterwarnings('ignore')

%matplotlib inline

In [None]:
# Read the train and test CSV files from the relative ../input/titanic directory.
df_train = pd.read_csv('../input/titanic/train.csv')
df_test = pd.read_csv('../input/titanic/test.csv')

In [None]:
df_train.head()

- 각 피처(컬럼) 최대값을 확인

In [None]:
df_train.max()

In [None]:
df_train.describe()

In [None]:
df_train.info()

In [None]:
df_test.head()

## 결측치 비율(백분율) 확인

In [None]:
# Print, for every training column, the share of missing values as a percentage.
nan_report_line = 'column:{:>10}\t Percent of NaN value: {:.2f}%'
for col, values in df_train.items():
    missing_share = values.isnull().sum() / values.shape[0]
    msg = nan_report_line.format(col, 100 * missing_share)
    print(msg)

In [None]:
# Print, for every test column, the share of missing values as a percentage.
nan_report_line = 'column:{:>10}\t Percent of NaN value: {:.2f}%'
for col, values in df_test.items():
    missing_share = values.isnull().sum() / values.shape[0]
    msg = nan_report_line.format(col, 100 * missing_share)
    print(msg)

## 결측치 시각화

In [None]:
msno.matrix(df_train, figsize=(5,5))

In [None]:
msno.bar(df_train, figsize=(5,5))

## EDA : Survived

## Survived : 0 (사망), 1(생존)

In [None]:
# Two views of the target: share of each Survived value (pie) and raw counts (bars).
f, ax = plt.subplots(1,2,figsize=(18,8))

df_train['Survived'].value_counts().plot.pie(explode=[0,0.1], autopct='%1.1f%%', ax=ax[0], shadow=True)
ax[0].set_title('Pie plot - Survived')

# Pass the column as x=...: recent seaborn releases no longer accept it positionally.
sns.countplot(x='Survived', data=df_train, ax=ax[1])
ax[1].set_title('Count plot - Survived')
plt.show()

## EDA : Pclass

In [None]:
df_train[['Pclass', 'Survived']].groupby(['Pclass']).count()

In [None]:
df_train[['Pclass', 'Survived']].groupby(['Pclass'], as_index=True).count()

In [None]:
df_train[['Pclass', 'Survived']].groupby(['Pclass'], as_index=True).mean()

In [None]:
pd.crosstab(df_train['Pclass'], df_train['Survived'], margins=True).style.background_gradient(cmap='rocket_r')

In [None]:
df_train[['Pclass','Survived']].groupby(['Pclass'], as_index=True).mean()

In [None]:
# Mean of Survived per passenger class, drawn as bars from highest to lowest.
survival_by_class = df_train.groupby('Pclass')[['Survived']].mean()
survival_by_class.sort_values(by='Survived', ascending=False).plot(kind='bar')

In [None]:
# Left: passenger count per class. Right: the same counts split by Survived.
y_position = 1.02
f, ax = plt.subplots(1,2,figsize=(18,8))
df_train['Pclass'].value_counts().plot(kind='bar', ax=ax[0])
ax[0].set_title('Number of passengers by Pclass', y=y_position)
ax[0].set_ylabel('Count')
# Pass the column as x=...: recent seaborn releases no longer accept it positionally.
sns.countplot(x='Pclass', hue='Survived', data=df_train, ax=ax[1])
ax[1].set_title('Pclass: Survived vs Dead')
ax[1].set_ylabel('Count')
plt.show()

## EDA : Sex

In [None]:
# Left: mean of Survived per sex. Right: passenger counts per sex split by Survived.
f, ax = plt.subplots(1,2,figsize=(18,8))
df_train[['Sex','Survived']].groupby(['Sex'], as_index=True).mean().plot(kind='bar', ax=ax[0])
ax[0].set_title('Survived vs Sex')
# Pass the column as x=...: recent seaborn releases no longer accept it positionally.
sns.countplot(x='Sex', hue='Survived', data=df_train, ax=ax[1])
ax[1].set_title('Sex: Survived vs Dead')
plt.show()

In [None]:
pd.crosstab(df_train['Sex'], df_train['Survived'], margins=True).style.background_gradient(cmap='rocket_r')

## EDA : Both Sex and Pclass

In [None]:
# Mean of Survived per Pclass, one line per Sex.
# factorplot was renamed to catplot (its old default was kind='point') and the
# `size` argument was renamed to `height`; x/y must be passed by keyword.
sns.catplot(x='Pclass', y='Survived', hue='Sex', data=df_train, kind='point', height=6, aspect=1.5)

In [None]:
# Mean of Survived per Sex, one panel (column) per Pclass.
# factorplot -> catplot(kind='point'), size -> height. The misspelled `saturatoin`
# keyword was dropped: point plots take no such option, so it can only be ignored or raise.
sns.catplot(x='Sex', y='Survived', col='Pclass', data=df_train, kind='point', height=9, aspect=1)

In [None]:
# Mean of Survived per Sex, one line per Pclass.
# factorplot -> catplot(kind='point'), size -> height. The misspelled `saturatoin`
# keyword was dropped: point plots take no such option, so it can only be ignored or raise.
sns.catplot(x='Sex', y='Survived', hue='Pclass', data=df_train, kind='point', height=9, aspect=1)

## EDA : Age

In [None]:
print('제일 나이 많은 탑승객 : {:.1f} years'.format(df_train['Age'].max()))
print('제일 어린 탑승객 : {:.1f} years'.format(df_train['Age'].min()))
print('탑승객 평균 나이 : {:.1f} years'.format(df_train['Age'].mean()))

- TODO: KDE(커널 밀도 추정)에 대해 학습해 볼 것!

In [None]:
# Overlay the Age density of survivors and non-survivors on one axis.
f, ax = plt.subplots(1, 1, figsize=(16, 8))
# Survivors are drawn first so the curve order matches the legend labels below.
for outcome in (1, 0):
    ages_for_outcome = df_train[df_train['Survived'] == outcome]['Age']
    sns.kdeplot(ages_for_outcome, ax=ax)
plt.legend(['Survived == 1', 'Survived == 0'])
plt.show()

In [None]:
df_train[df_train['Survived'] == 0]['Age'].hist()

In [None]:
df_train.head()

In [None]:
df_train.loc[1]

In [None]:
df_train.iloc[1,3]

In [None]:
# Take only the first (index, row) pair that iterrows() yields and display it.
row = next(df_train.iterrows())
row

In [None]:
# f = plt.figure(figsize=(10,10))

# f, ax = plt.subplots(1,1,figsize=(10,10))

# plt.figure(figsize=(10,10))

In [None]:
# Age density for each passenger class, drawn on the axes created just above.
f, ax = plt.subplots(1, 1, figsize=(16, 8))
for pclass in (1, 2, 3):
    df_train['Age'][df_train['Pclass'] == pclass].plot(kind='kde')

ax.set_xlabel('Age')
ax.set_title('Age Distribution within classes')
# Legend labels follow the drawing order of the loop above.
ax.legend(['1st Class', '2nd Class', '3rd Class'])

In [None]:
# Age density of 1st-class passengers, non-survivors first and survivors second.
f, ax = plt.subplots(1, 1, figsize=(8, 5))
for outcome in (0, 1):
    in_group = (df_train['Survived'] == outcome) & (df_train['Pclass'] == 1)
    sns.kdeplot(df_train[in_group]['Age'], ax=ax)
plt.legend(['Survived == 0', 'Survived == 1'])
plt.title('1st Class')
plt.show()

In [None]:
# Age density of 2nd-class passengers, non-survivors first and survivors second.
f, ax = plt.subplots(1, 1, figsize=(8, 5))
for outcome in (0, 1):
    in_group = (df_train['Survived'] == outcome) & (df_train['Pclass'] == 2)
    sns.kdeplot(df_train[in_group]['Age'], ax=ax)
plt.legend(['Survived == 0', 'Survived == 1'])
plt.title('2nd Class')
plt.show()

In [None]:
# Age density of 3rd-class passengers, non-survivors first and survivors second.
f, ax = plt.subplots(1, 1, figsize=(8, 5))
for outcome in (0, 1):
    in_group = (df_train['Survived'] == outcome) & (df_train['Pclass'] == 3)
    sns.kdeplot(df_train[in_group]['Age'], ax=ax)
plt.legend(['Survived == 0', 'Survived == 1'])
plt.title('3rd Class')
plt.show()

In [None]:
# Survived flags of passengers younger than 10.
survived = df_train[df_train['Age'] < 10]['Survived']
# Show both numbers: a notebook cell only displays its last expression, so a bare
# `survived.sum()` on its own line was silently discarded.
survived.sum(), survived.shape[0]

In [None]:
# For each upper age limit X in 1..79, the fraction of passengers younger than X
# who survived.
age_limits = list(range(1, 80))
survived_ratio_age = []

for i in age_limits:
    survived = df_train[df_train['Age'] < i]['Survived']
    survived_ratio_age.append(survived.sum()/survived.shape[0])

plt.figure(figsize=(10,8))
# Plot against the actual age limit; plotting the list alone used its 0-based
# index as x and shifted the curve by one year relative to the axis label.
plt.plot(age_limits, survived_ratio_age)
plt.title('Survival ratio depending on range of Age', y=1.02)
plt.ylabel('Survival rate')
plt.xlabel('Range of Age(0-X)')
plt.show()

## EDA : Pclass, Sex, Age

In [None]:
# Split violins of Age by Survived: per passenger class (left) and per sex (right).
f, ax = plt.subplots(1,2,figsize=(18,8))
# x/y are passed by keyword: recent seaborn releases no longer accept them positionally.
# NOTE(review): newer seaborn spells `scale` as `density_norm`; `scale` is kept here
# for compatibility with older installs — confirm against the pinned version.
sns.violinplot(x='Pclass', y='Age', hue='Survived', data=df_train, scale='count', split=True, ax=ax[0])
ax[0].set_title('Pclass and Age vs Survived')
ax[0].set_yticks(range(0, 110, 10))

sns.violinplot(x='Sex', y='Age', hue='Survived', data=df_train, scale='count', split=True, ax=ax[1])
ax[1].set_title('Sex and Age vs Survived')
ax[1].set_yticks(range(0,110, 10))
plt.show()

## EDA : Embarked

## 배에 탑승한 위치

In [None]:
df_train[['Embarked','Survived']].groupby(['Embarked'], as_index=True).mean().sort_index(ascending=False)

In [None]:
f, ax = plt.subplots(1,1,figsize=(10,8))
df_train[['Embarked','Survived']].groupby(['Embarked'], as_index=True).mean().sort_values(by='Survived', ascending=False).plot(kind='bar', ax=ax, rot=0)

In [None]:
# 2x2 grid of passenger counts per Embarked port: overall, by Sex, by Survived, by Pclass.
f, ax = plt.subplots(2,2,figsize=(20,15))

# The column is passed as x=...: recent seaborn releases no longer accept it positionally.
sns.countplot(x='Embarked', data=df_train, ax=ax[0, 0])
ax[0, 0].set_title('(1) No. Of Passengers Board')

sns.countplot(x='Embarked', hue='Sex', data=df_train, ax=ax[0,1])
ax[0,1].set_title('(2) Male-Female split for Embarked')

sns.countplot(x='Embarked', hue='Survived', data=df_train, ax=ax[1,0])
ax[1,0].set_title('(3) Embarked vs Survived')

sns.countplot(x='Embarked', hue='Pclass', data=df_train, ax=ax[1,1])
ax[1,1].set_title('(4) Embarked vs Pclass')

plt.subplots_adjust(wspace=0.2, hspace=0.4)
plt.show()

## EDA : Family : SibSp + Parch

- SibSp : 함께 탑승한 형제자매, 배우자(아내, 남편)의 수
- Parch : 함께 탑승한 부모, 자식의 수

In [None]:
# FamilySize = siblings/spouses + parents/children + 1 (the passenger themself).
for frame in (df_train, df_test):
    frame['FamilySize'] = frame['SibSp'] + frame['Parch'] + 1

In [None]:
df_train.head()

In [None]:
print('Max size of Family : {:>2d}'.format(df_train['FamilySize'].max()))
print('Min size of Family : {:>2d}'.format(df_train['FamilySize'].min()))

In [None]:
# Three panels on FamilySize: passenger counts, counts split by Survived, and
# mean of Survived per family size (sorted from highest to lowest).
f, ax = plt.subplots(1,3,figsize=(40,10))
# The column is passed as x=...: recent seaborn releases no longer accept it positionally.
sns.countplot(x='FamilySize', data=df_train, ax=ax[0])
ax[0].set_title('(1) No. of Passenger Boarded', y=1.02)

sns.countplot(x='FamilySize', hue='Survived', data=df_train, ax=ax[1])
ax[1].set_title('(2) Survived countplot depend on FamilySize')

df_train[['FamilySize','Survived']].groupby(['FamilySize'], as_index=True).mean().sort_values(by='Survived', ascending=False).plot(kind='bar', ax=ax[2])
ax[2].set_title('(3) Survived rate depend on FamilySize')

plt.subplots_adjust(wspace=0.2, hspace=0.5)
plt.show()

# EDA : Fare, Cabin, Ticket

# 왜도, 첨도
- https://blog.naver.com/jxlove716/221830329612
- https://blog.naver.com/yk60park/222100758577

In [None]:
# Distribution of Fare, with its skewness shown in the legend.
f, ax = plt.subplots(1, 1, figsize=(8, 8))
skew_label = 'Skewness: {:.2f}'.format(df_train['Fare'].skew())
g = sns.distplot(df_train['Fare'], color='b', label=skew_label, ax=ax)
# `g` is rebound from the Axes to the Legend object returned by legend().
g = g.legend(loc='best')

In [None]:
# Replace each training Fare with its natural log; any value that is not > 0
# (zero, negative, or NaN) becomes 0 instead. Only df_train is changed in this cell.
# The transform overwrites the column, so re-running the cell applies the log again.
df_train['Fare'] = df_train['Fare'].map(lambda i : np.log(i) if i>0 else 0)

In [None]:
x = lambda i : i*i
x(3)

In [None]:
g

- 피처엔지니어링 : 모델 성능을 향상시키기 위한 작업

In [None]:
# Distribution of the current Fare column, with its skewness shown in the legend.
f, ax = plt.subplots(1, 1, figsize=(8, 8))
skew_label = 'Skewness: {:.2f}'.format(df_train['Fare'].skew())
g = sns.distplot(df_train['Fare'], color='b', label=skew_label, ax=ax)
# `g` is rebound from the Axes to the Legend object returned by legend().
g = g.legend(loc='best')

- 티켓에는 다양한 문자 정보가 있으므로, 향후 피처 엔지니어링에 활용해 볼 것 -> 새로운 피처를 만들어 모델의 성능을 높일 수 있을 것

In [None]:
df_train['Ticket'].value_counts()

# 피처엔지니어링
- null 값 처리 
- 테스트셋에도 동일하게 적용해야 함

In [None]:
df_train['Age'].isnull().sum()

- 나이 null 값 : 이름에 포함된 호칭(Mr., Miss, Mrs. 등)으로 그룹화하고, 그룹별 대표 나이로 결측치를 채운다

In [None]:
# Extract the title from Name: the run of letters immediately before a period.
# A raw string avoids the invalid "\." escape warning, and expand=False makes
# str.extract return a Series (not a one-column DataFrame) for the single group.
df_train['Initial'] = df_train['Name'].str.extract(r'([A-Za-z]+)\.', expand=False)
df_test['Initial'] = df_test['Name'].str.extract(r'([A-Za-z]+)\.', expand=False)

In [None]:
pd.crosstab(df_train['Initial'], df_train['Sex']).T.style.background_gradient(cmap='summer_r')

In [None]:
# Collapse rare titles into the five groups used below (Miss / Mr / Mrs / Other,
# plus Master which is left as is). One shared mapping keeps train and test in sync.
# NOTE(review): 'Dona' and 'Dr' are mapped to 'Mr' exactly as in the original
# lists — confirm this is intended for female passengers carrying those titles.
initial_groups = {
    'Mlle': 'Miss', 'Mme': 'Miss', 'Ms': 'Miss',
    'Dr': 'Mr', 'Major': 'Mr',
    'Lady': 'Mrs', 'Countess': 'Mrs',
    'Jonkheer': 'Other', 'Col': 'Other', 'Rev': 'Other',
    'Capt': 'Mr', 'Sir': 'Mr', 'Don': 'Mr', 'Dona': 'Mr',
}
# Assign the result back instead of calling replace(..., inplace=True) on the
# selected column: in-place edits of df[col] do not reach the frame under
# pandas Copy-on-Write.
df_train['Initial'] = df_train['Initial'].replace(initial_groups)
df_test['Initial'] = df_test['Initial'].replace(initial_groups)

In [None]:
# Per-title means of the numeric columns. numeric_only=True is required because
# the frame still holds string columns, on which pandas >= 2.0 raises instead of
# silently dropping them.
df_train.groupby('Initial').mean(numeric_only=True)

In [None]:
df_train.groupby('Initial')['Survived'].mean().plot(kind='bar')

In [None]:
# Stack train and test rows into one frame. ignore_index=True gives the result a
# fresh 0..n-1 index; otherwise the row labels of both inputs are kept and repeat.
df_all = pd.concat([df_train, df_test], ignore_index=True)

In [None]:
df_all.reset_index(drop=True)

In [None]:
# Per-title means of the numeric columns over train + test. numeric_only=True is
# required because the frame still holds string columns, on which pandas >= 2.0 raises.
df_all.groupby('Initial').mean(numeric_only=True)

In [None]:
df_train.loc[:1,:]

In [None]:
# Fill missing Age with a fixed representative age per title group, applied
# identically to train and test.
age_by_initial = {'Master': 5, 'Miss': 22, 'Mr': 33, 'Mrs': 37, 'Other': 45}
for frame in (df_train, df_test):
    for initial, fill_age in age_by_initial.items():
        needs_age = frame['Age'].isnull() & (frame['Initial'] == initial)
        frame.loc[needs_age, 'Age'] = fill_age

# Fill the missing test Fare with the author's constant.
# NOTE(review): the origin of 94.9 is not visible in this notebook — confirm it.
df_test.loc[(df_test['Fare'].isnull()), 'Fare'] = 94.9

# df_train['Fare'] was log-transformed in an earlier cell, but df_test['Fare'] was
# left on the raw scale. Apply the same transform here so the model sees the
# feature on one scale in both sets.
df_test['Fare'] = df_test['Fare'].map(lambda i : np.log(i) if i>0 else 0)

In [None]:
# df_test.dropna(axis=0, inplace=True)

In [None]:
print('{}, {}'.format(df_train['Age'].isnull().sum() , df_test['Age'].isnull().sum()))

- 피처엔지니어링 : Embarked 결측치는 가장 많은 값('S')으로 치환하자

In [None]:
df_train['Embarked'].isnull().sum()

In [None]:
# Fill missing Embarked values with 'S'. The result is assigned back instead of
# using fillna(..., inplace=True) on the selected column, which does not reach
# the frame under pandas Copy-on-Write.
df_train['Embarked'] = df_train['Embarked'].fillna('S')

- continuous feature 를 카테고리로 변환

In [None]:
df_train['Age_cat'] = 0

In [None]:
# Bucket Age into decade-wide categories: 0 = under 10, 1 = 10-19, ..., 6 = 60-69,
# 7 = 70 and over. Half-open bins [lower, upper) reproduce the original
# `lower <= Age < upper` comparisons.
age_bin_edges = [-np.inf, 10, 20, 30, 40, 50, 60, 70, np.inf]
for frame in (df_train, df_test):
    age_codes = pd.cut(frame['Age'], bins=age_bin_edges, right=False, labels=False)
    # Rows with a missing Age fall back to category 0, the default df_train was
    # given before binning. df_test never received that default, so its column
    # was implicitly created as float with NaN for such rows.
    frame['Age_cat'] = age_codes.fillna(0).astype(int)

In [None]:
def category_age(x):
    """Return the decade bucket (0-7) for an age.

    Ages below 10 map to 0, 10-19 to 1, and so on up to 60-69 -> 6. Any value
    that is not below 70 (including NaN, for which every `<` comparison is
    False) maps to 7.
    """
    for bucket, upper_bound in enumerate((10, 20, 30, 40, 50, 60, 70)):
        if x < upper_bound:
            return bucket
    return 7

In [None]:
df_train['Age_cat_2'] = df_train['Age'].apply(category_age)

In [None]:
(df_train['Age_cat'] == df_train['Age_cat_2']).all() # 전체값이 같은지 확인 할때 all 사용

In [None]:
(df_train['Age_cat'] == df_train['Age_cat_2']).any() # 하나라도 True 있으면 True

In [None]:
df_train.drop(['Age','Age_cat_2'], axis=1, inplace= True)
df_test.drop(['Age'], axis=1, inplace=True)

# 피처엔지니어링 2 - 카테고리화

In [None]:
df_train['Initial'].unique()

In [None]:
# Replace each title group with its integer code, using one mapping for both frames.
initial_codes = {'Master':0, 'Miss':1,'Mrs':2,'Mr':3,'Other':4}
for frame in (df_train, df_test):
    frame['Initial'] = frame['Initial'].map(initial_codes)

In [None]:
# Replace each Embarked port letter with its integer code, using one mapping for both frames.
embarked_codes = {'C':0, 'Q':1,'S':2}
for frame in (df_train, df_test):
    frame['Embarked'] = frame['Embarked'].map(embarked_codes)

In [None]:
df_train.head()

In [None]:
# Encode Sex as an integer: female -> 0, male -> 1, using one mapping for both frames.
sex_codes = {'female':0, 'male':1}
for frame in (df_train, df_test):
    frame['Sex'] = frame['Sex'].map(sex_codes)

- 피처들간의 상관관계

In [None]:
heatmap_data = df_train[['Survived','Pclass','Sex','Fare','Embarked','FamilySize','Initial','Age_cat']]

In [None]:
heatmap_data.corr()

In [None]:
# Annotated heatmap of the pairwise correlations between the selected features.
colormap = plt.cm.RdBu
plt.figure(figsize=(10,10))
# Descriptive title in place of the leftover 'test' placeholder.
plt.title('Correlation of Features')
sns.heatmap(heatmap_data.astype(float).corr(), linewidths=0.1, vmax=1.0, square=True, cmap=colormap, linecolor='white', annot=True, annot_kws={'size':16})

- one hot encoding

In [None]:
# One-hot encode the Initial column of each frame into Initial_<code> columns.
# NOTE(review): train and test are encoded independently, so a code absent from
# one frame yields no column there — verify both frames end up with the same columns.
df_train = pd.get_dummies(df_train, columns=['Initial'], prefix='Initial')
df_test = pd.get_dummies(df_test, columns=['Initial'], prefix='Initial')

In [None]:
df_test.head()

In [None]:
# One-hot encode the Embarked column of each frame into Embarked_<code> columns.
# NOTE(review): train and test are encoded independently, so a code absent from
# one frame yields no column there — verify both frames end up with the same columns.
df_train = pd.get_dummies(df_train, columns=['Embarked'], prefix='Embarked')
df_test = pd.get_dummies(df_test, columns=['Embarked'], prefix='Embarked')

In [None]:
# Remove, in place and from both frames, the columns that are not fed to the model.
unused_columns = ['PassengerId','Name','SibSp','Parch','Ticket','Cabin']
for frame in (df_train, df_test):
    frame.drop(unused_columns, axis=1, inplace=True)

In [None]:
df_train.head()

In [None]:
df_test.head()

# 머신러닝

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics
from sklearn.model_selection import train_test_split

In [None]:
# Build the model inputs. The feature list is taken from the training frame and
# the test frame is re-indexed to it, so both matrices have the same columns in
# the same order even if independent one-hot encoding produced a different set.
# A dummy column missing from the test frame is filled with 0.
feature_columns = df_train.columns.drop('Survived')
# An explicit float dtype avoids an object array when the frame mixes bool, int
# and float columns.
X_train = df_train[feature_columns].to_numpy(dtype=float)
target_label = df_train['Survived'].values
X_test = df_test.reindex(columns=feature_columns, fill_value=0).to_numpy(dtype=float)

In [None]:
X_tr, X_vid, y_tr, y_vid = train_test_split(X_train, target_label, test_size=0.3, random_state=2018)

In [None]:
# Fit a random forest on the training split. A fixed random_state (the same seed
# as the train/validation split) makes the fitted model, its accuracy and its
# feature importances reproducible across runs.
model = RandomForestClassifier(random_state=2018)
model.fit(X_tr, y_tr)

In [None]:
prediction = model.predict(X_vid)

In [None]:
# Report validation accuracy. The message fixes "aud wnd", which is "명 중"
# ("out of N people") typed on a Latin keyboard layout. accuracy_score takes
# (y_true, y_pred), so the arguments are passed in that order.
print('총 {}명 중 {:.2f}% 정확도로 생존 맞춤'.format(y_vid.shape[0], 100*metrics.accuracy_score(y_vid, prediction)))

# Feature Importance

In [None]:
from pandas import Series

In [None]:
model.feature_importances_

In [None]:
# Label each importance with the column it belongs to. The model was fitted on
# the training columns (everything except Survived), so the labels are taken
# from df_train rather than df_test.
feature_importance = model.feature_importances_
Series_feat_imp = Series(feature_importance, index=df_train.columns.drop('Survived'))

In [None]:
plt.figure(figsize=(5,5))
Series_feat_imp.sort_values(ascending=True).plot.barh()
plt.xlabel('Feature importance')
plt.ylabel('Feature')
plt.show()

In [None]:
submission = pd.read_csv('../input/titanic/gender_submission.csv')

In [None]:
submission.head()

In [None]:
submission.shape

In [None]:
prediction = model.predict(X_test)

In [None]:
prediction

In [None]:
submission['Survived'] = prediction

In [None]:
submission.to_csv('./titanic_submission.csv', index=False)