In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.filterwarnings('ignore')
%matplotlib inline

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

np.random.seed(42)

from datetime import datetime, timedelta
import time

from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.metrics import mean_squared_log_error, mean_squared_error 

from tqdm import tqdm_notebook
import gc

# 1. 데이터 Load

In [None]:
# Load the Kaggle Titanic CSV files: labelled training set, unlabelled test set, sample submission.
csv_paths = ('data/train.csv', 'data/test.csv', 'data/gender_submission.csv')
train, test, gender_submission = map(pd.read_csv, csv_paths)

In [None]:
# (rows, columns) of each loaded frame.
train.shape, test.shape, gender_submission.shape

In [None]:
# Preview the first three rows of each frame (every expression is displayed in this notebook).
train.head(3)
test.head(3)
gender_submission.head(3)

In [None]:
# Stack train and test into one frame for exploration.
# The index is not reset, so row labels from train and test repeat inside `df`.
df = pd.concat((train, test)) 
df.shape

# 2.  EDA & 전처리 -> Feature Engineering

## 공통 함수

In [None]:
def value_counts(df, cols):
    """Print each listed column name followed by its value frequency table.

    Args:
        df: DataFrame to inspect.
        cols: iterable of column names present in ``df``.
    """
    for column_name in cols:
        print(column_name)
        frequencies = df[column_name].value_counts()
        print(frequencies, '\n')

In [None]:
def importance_plotting(data, xlabel, ylabel, n=20):
    """Draw a horizontal bar chart of the last ``n`` rows of ``data``.

    Args:
        data: pandas object exposing ``tail`` and ``plot`` (e.g. a Series of importances).
        xlabel: label for the x axis.
        ylabel: label for the y axis.
        n: number of trailing rows to plot.
    """
    sns.set(style="whitegrid")
    last_rows = data.tail(n)
    axes = last_rows.plot(kind='barh')

    axes.set_xlabel(xlabel)
    axes.set_ylabel(ylabel)
    # Only horizontal grid lines are kept.
    axes.xaxis.grid(False)
    axes.yaxis.grid(True)
    plt.show()

In [None]:
# Relationship between a given column and the survival rate
def survpct(col):
    """Return the mean of ``Survived`` per value of ``col`` in the global ``train`` frame, highest first."""
    rate_by_group = train.groupby(col)['Survived'].mean()
    return rate_by_group.sort_values(ascending=False)

In [None]:
# Column dtypes and non-null counts of the training frame.
train.info()

 - categorical: Pclass, Sex, Embarked     
 - String: Name, Ticket   
 - ??: Cabin        

In [None]:
## Check for duplicated PassengerId values (an empty result means there are none)
train[train.duplicated(['PassengerId'])]
test[test.duplicated(['PassengerId'])]
# No duplicates

In [None]:
# Summary statistics for every column, including non-numeric ones.
train.describe(include='all')

- Missing Value: Age, Cabin, Embarked
- Categorical: Sex(2), Embarked(3)
- Name 중복 없음 

### Missing Value

In [None]:
# Null counts per column; the trailing comments list the feature columns the author noted as having nulls.
df.isnull().sum()    # Age, Fare, Cabin, Embarked
train.isnull().sum() # Age      , Cabin, Embarked
test.isnull().sum()  # Age, Fare, Cabin

#### Missing Value: Fare - only test(1건)

In [None]:
# Row label(s) and full row(s) of test where Fare is missing.
test[test.Fare.isna()].index
test[test.Fare.isna()]

- Fare 가 null 인 행(row)의 정보<br>
Pclass: 3<br>
Ticket: 3701<br>
Embarked: S<br>

In [None]:
# Look up the Fare of other rows that share the same conditions
cond = (df.Pclass==3) & (df.Ticket.str.contains('3701')) & (df.Embarked == 'S')
df[cond]
# The matching rows appear to share one Fare value -> use it to fill the missing Fare

In [None]:
# Fare: 20.2125
# Ticket: 3701 -> 370129 (treated as a mistyped entry for the same conditions, so it is updated)
# NOTE(review): row label 152 is hard-coded; presumably it is the test row with the missing Fare — verify.
test.loc[152,'Fare'] = 20.2125
test.loc[152,'Ticket'] = '370129'

#### Missing Value: Embarked - only train(2건)

In [None]:
# Row labels and full rows of train where Embarked is missing.
train[train.Embarked.isna()].index
train[train.Embarked.isna()]

In [None]:
# Search using other information shared by the rows whose Embarked is null
df[df.Cabin.str.contains('B2')==True]
# Author's observation: rows whose Cabin starts with "B..." have Embarked = 'S'

In [None]:
# Fill every missing Embarked value in train with 'S', then display rows 61 and 829 to check.
embarked_missing = train['Embarked'].isna()
train.loc[embarked_missing, 'Embarked'] = 'S'
train.loc[[61, 829]]

#### Missing Value: Age - train test (263건)

In [None]:
# Row labels and full rows of the combined frame where Age is missing.
df[df.Age.isna()].index
df[df.Age.isna()]
# Plan: extract information from Name

In [None]:
# Initial: honorific title taken from Name, i.e. the run of letters directly before a '.'.
# The pattern is a raw string because '\.' in a normal string literal is an invalid escape
# sequence; expand=False makes str.extract return a Series for the single capture group.
for frame in (train, test):
    frame['Initial'] = frame['Name'].str.extract(r'([A-Za-z]+)\.', expand=False)

In [None]:
# Name2: surname, captured as the letters that directly precede the first ',' in Name.
for frame in (train, test):
    surname_groups = frame['Name'].str.extract('(([A-Za-z])+(?=,))')
    frame['Name2'] = surname_groups[0]

In [None]:
# Family: family size = siblings/spouses + parents/children + the passenger
for frame in (train, test):
    relatives = frame['SibSp'] + frame['Parch']
    frame['Family'] = (relatives + 1).astype('int')

In [None]:
# Age_NA: helper column that is -1 where Age is null and 0 otherwise
for frame in (train, test):
    frame['Age_NA'] = 0
    frame.loc[frame['Age'].isna(), 'Age_NA'] = -1

# Row labels of the passengers with a missing Age, kept under their original names.
train_idx = train.index[train['Age'].isna()]
test_idx = test.index[test['Age'].isna()]

In [None]:
# An Initial whose Age_NA sum is not 0 has at least one passenger with a missing Age

train.groupby(by='Initial').agg({'Initial':['count'],'Age':['min','max', 'mean'], 'Age_NA':['sum']})
# Initial(null count/total): Dr(1/7), Master(4/40), Miss(36/182), Mr(119/517), Mrs(17/125)

test.groupby(by='Initial').agg({'Initial':['count'],'Age':['min','max', 'mean'], 'Age_NA':['sum']})
# Initial(null count/total): Master(4/21), Miss(14/78), Mr(240/517), Mrs(10/72), Ms(1/1)

# Rebuild the combined frame so it includes the new Initial/Name2/Family/Age_NA columns.
df = pd.concat((train, test)) 
df.groupby(by='Initial').agg({'Initial':['count'],'Age':['min','max', 'mean'], 'Age_NA':['sum']})

In [None]:
# 1. Ms (1 row)
## test contains one Ms row with a null Age
# Using the single Ms row in train as reference, set the test Ms age to 28.0
df[df.Initial == 'Ms']
# NOTE(review): row label 88 is hard-coded; presumably it is the test Ms row — verify.
test.loc[88, 'Age'] = 28.0
test.loc[test.Initial == 'Ms', 'Age']

In [None]:
# 2. Dr (1 row)
# 1 of 8 rows is null
# Age: 23.00~54.0, mean: 43.571429
df[df.Initial == 'Dr']

# Fill with the mean Age of the Dr rows matching the same conditions (Pclass == 1 & Sex == 'male')
# NOTE(review): row label 766 is hard-coded; presumably it is the train Dr row with the null Age — verify.
train.loc[766, 'Age'] = df[(df.Initial == 'Dr') & (df.Pclass==1) & (df.Sex=='male')]['Age'].mean() # 44.75

In [None]:
# 3. Master ~ aged 14 or under (careful: high survival rate!)
# 8 of 61 rows are null
# Age: 0.33~14.5, mean: 5.482642
df[((df.Initial == 'Master') & (df.Age.isna()))]

# Mean Age of the Master rows matching the null rows' conditions (Pclass == 3 & Sex == 'male')
df[((df.Initial == 'Master') & (df.Pclass==3) & (df.Sex=='male'))]['Age'].mean() # 6.09

# The displayed mean is applied as a hard-coded constant.
train.loc[((train.Initial == 'Master') & (train.Age.isna())), 'Age'] = 6.09
test.loc[((test.Initial == 'Master') & (test.Age.isna())), 'Age'] = 6.09

In [None]:
# 4. Mrs (Sex == 'female')
# 27 of 197 rows are null
# Age: 14.00~76.0, mean: 36.994118
df[((df.Initial == 'Mrs') & (df.Age.isna()))].head(3)

In [None]:
# When SibSp (siblings/spouses) is present, assume a similar age and copy it
df[((df.Initial == 'Mrs') & (df.Age.isna()) & (df.SibSp > 0))].SibSp.unique()
# Confirmed that SibSp is 1 for all of them

In [None]:
# Collect the surnames (Name2) of the Mrs rows with a null Age and SibSp > 0
SibSp_Name2 = df[((df.Initial == 'Mrs') & (df.Age.isna()) & (df.SibSp > 0))]['Name2'].unique()
SibSp_Name2

In [None]:
# Give passengers the same Age as the relative who shares their surname (Name2)
#df[((df['Name2'].isin(SibSp_Name2)) & (df.SibSp == 1))].sort_values('Name2')

# 5 of the 14 rows with a matching Name2 can be filled; train and test rows are mixed,
# so the values are written directly by row label.
ages_by_train_label = {334: 43.0, 849: 49.0, 457: 41.0, 375: 28.0}
for row_label, age in ages_by_train_label.items():
    train.loc[row_label, 'Age'] = age
test.loc[316, 'Age'] = 57.0

In [None]:
# Fill all remaining null Ages with the mean Age of the passenger's Initial

# | Initial | null count/total | min age~max age | mean age
# | Miss    | 50/260           | 0.17~63.0       | 21.774238
# | Mr      | 176/757          | 11.00~80.0      | 32.252151
# | Mrs     | 23/197           | 14.00~76.0      | 37.068966

mean_age_by_initial = {'Miss': 21.77, 'Mr': 32.25, 'Mrs': 37.07}
for frame in (train, test):
    for initial, mean_age in mean_age_by_initial.items():
        needs_age = (frame['Initial'] == initial) & frame['Age'].isna()
        frame.loc[needs_age, 'Age'] = mean_age

In [None]:
# Run the garbage collector; the returned count is displayed.
gc.collect()

#### Initial grouping

- Initial 유지
Mr          757<br>
Miss        260<br>
Mrs         197<br>
Master       61<br>
Rev           8 -> X (모든 성직자가 Survived=0)<br>
<br>
- 성별에 따라 Initial 분리
Dr            8 -> Mr(7) Miss(1)<br>
<br>
- Initial Mr 통일
Col           4 -> Mr<br>
Major         2 -> Mr<br>
Don           1 -> Mr<br>
Jonkheer      1 -> Mr<br>
Capt          1 -> Mr<br>
Sir           1 -> Mr<br>
<br>
- Initial Miss 통일
Dona          1 -> Miss<br>
Lady          1 -> Miss<br>
Countess      1 -> Miss<br>
Mme           1 -> Miss<br>
Ms            2 -> Miss<br>
Mlle          2 -> Miss<br>

In [None]:
value_counts(df, ['Initial'])

# Row 796 of train is set to 'Miss' first, because the mapping below sends every remaining 'Dr' to 'Mr'.
train.loc[796, 'Initial'] = 'Miss'

# One shared title -> group mapping for both frames; a title missing from it would become NaN.
title_to_group = {
    'Mr': 'Mr', 'Miss': 'Miss', 'Mrs': 'Mrs', 'Master': 'Master', 'Rev': 'Rev',
    'Dr': 'Mr', 'Col': 'Mr', 'Major': 'Mr', 'Don': 'Mr', 'Jonkheer': 'Mr', 'Capt': 'Mr', 'Sir': 'Mr',
    'Dona': 'Miss', 'Lady': 'Miss', 'Countess': 'Miss', 'Mme': 'Miss', 'Ms': 'Miss', 'Mlle': 'Miss',
}
for frame in (train, test):
    frame['Initial'] = frame['Initial'].map(title_to_group)

In [None]:
# Remove the temporary Age_NA helper column from both frames
for frame in (train, test):
    frame.drop(columns=['Age_NA'], inplace=True)

#### Missing Value: Cabin - train test

In [None]:
# Reduce Cabin to its first character
for frame in (train, test):
    frame['Cabin'] = frame['Cabin'].str.slice(0, 1)

In [None]:
# Rebuild the combined frame after the Age/Initial/Cabin changes.
df = pd.concat((train, test)) 
df.shape

In [None]:
# Cabin letter versus passenger class.
pd.crosstab(df.Cabin, df.Pclass)

# Per Cabin letter: row count, Fare range/mean and mean family size.
df.groupby(by='Cabin').agg({'Cabin':['count'], 'Fare':['min','max', 'mean'], 'Family': 'mean'})

# Conclusion: assign the null Cabin values based on Cabin and Class

In [None]:
# Fill missing Cabin letters from passenger class and family size, with the same rules for both frames.
for frame in (train, test):
    family = frame['Family']
    cabin_rules = (
        (1, family == 1, 'A'),
        (1, family == 2, 'B'),
        (1, family >= 3, 'C'),
        (2, family == 1, 'D'),
        (2, family >= 2, 'F'),
        (3, family <= 2, 'F'),
        (3, family >= 3, 'G'),
    )
    for pclass, family_matches, deck in cabin_rules:
        # Re-evaluated for every rule so that only rows still lacking a Cabin are touched.
        cabin_unknown = frame['Cabin'].isna()
        frame.loc[(frame['Pclass'] == pclass) & family_matches & cabin_unknown, 'Cabin'] = deck

In [None]:
# Check that all nulls are handled: number of columns that still contain a null
train.isna().any().sum()
test.isna().any().sum()

In [None]:
# Rebuild the combined frame after the Cabin imputation.
df = pd.concat((train, test)) 
df.shape

### Value Counts

In [None]:
# Distribution of the target column Survived
value_counts(df, ['Survived'])       

- categorical: Pclass, Sex, Embarked     
- String: Name, Ticket   
- ??: Cabin        

In [None]:
# Frequency tables of the plain categorical columns.
value_counts(df, ['Pclass', 'Sex', 'Embarked'])        

# one-hot candidates: 'Pclass', 'Sex', 'Embarked'

In [None]:
# Name -> two derived columns: Initial, Name2
value_counts(df, ['Name', 'Initial', 'Name2'])   
# Distinct values: Name(1307)  Initial(5)  Name2(872)
# => keep only Initial

train.drop(['Name', 'Name2'], axis=1, inplace=True)
test.drop(['Name', 'Name2'], axis=1, inplace=True)

# one-hot candidates: 'Pclass', 'Sex', 'Embarked', 'Initial'

In [None]:
# Ticket
value_counts(df, ['Ticket'])

# Each step assigns its result back to the column. The previous `train.Ticket.fillna('3', inplace=True)`
# mutated a temporary Series (chained in-place assignment), which pandas deprecates and which
# silently leaves the frame unchanged under Copy-on-Write.
for frame in (train, test):
    # Keep only the numeric part captured by the pattern (tickets without it become NaN)
    frame['Ticket'] = frame['Ticket'].str.extract('(([0-9])+(?!.)+)')[0]
    # Keep only its first digit
    frame['Ticket'] = frame['Ticket'].str.slice(start=0, stop=1)
    # All-text tickets (e.g. LINE) are null here; fill with '3', the most frequent value
    frame['Ticket'] = frame['Ticket'].fillna('3')

# one-hot candidates: 'Pclass', 'Sex', 'Embarked', 'Initial', 'Ticket'

In [None]:
# Cabin
value_counts(df, ['Cabin'])   

# Cabin 'T' occurs only once, so it is replaced with 'A', the Cabin of a passenger with similar information
df[df.Cabin=='T']
df[((df.Fare == 35.50) & (df.Ticket.str.contains('11378')) & (df.Embarked == 'S'))]
# NOTE(review): row label 339 is hard-coded; presumably it is the train row with Cabin 'T' — verify.
train.loc[339, 'Cabin'] = 'A'

# one-hot candidates: 'Pclass', 'Sex', 'Embarked', 'Initial', 'Ticket', 'Cabin'

In [None]:
# Rebuild the combined frame after the Name/Ticket/Cabin changes.
df = pd.concat((train, test)) 
df.shape

### Age Categorize

In [None]:
# Distinct decade bins (Age * 0.1 truncated to an integer) in each frame.
sorted( (train.Age*0.1).astype(np.int8).unique() )
sorted( (test.Age*0.1).astype(np.int8).unique() )
# One train passenger falls in the 80s bin, which makes the one-hot shapes differ, so it is moved to bin 7

In [None]:
# Show the train rows in decade bin 8, then lower Age 80.0 to 79.0 so they fall into bin 7.
train[(train.Age*0.1).astype(np.int8) == 8]
train.loc[train.Age == 80.0, 'Age'] = 79.0

In [None]:
# Replace Age with its decade bin: Age * 0.1 truncated to int8.
# Re-running this cell would bin the already-binned values again.
for frame in (train, test):
    scaled_age = frame['Age'] * 0.1
    frame['Age'] = scaled_age.astype(np.int8)

In [None]:
#'Cabin', 'Embarked'
# Distinct Embarked values in each frame before encoding.
train.Embarked.unique()
test.Embarked.unique()

In [None]:
# Encode the embarkation port as an integer: S -> 1, C -> 2, Q -> 3.
# The result is assigned back to the column: `frame['Embarked'].replace(..., inplace=True)` mutates
# a temporary Series (chained in-place assignment), which pandas deprecates and which silently
# leaves the frame unchanged under Copy-on-Write.
for frame in (train, test):
    frame['Embarked'] = frame['Embarked'].replace(['S', 'C', 'Q'], [1, 2, 3])

In [None]:
# Column names of both frames after preprocessing.
train.columns
test.columns

## 클러스터링

In [None]:
# Label encoding of the remaining string categoricals.
# Every replace() result is assigned back to its column: `frame[col].replace(..., inplace=True)`
# mutates a temporary Series (chained in-place assignment), which pandas deprecates and which
# silently leaves the frame unchanged under Copy-on-Write.
for frame in (train, test):
    # Sex: 1 = male, 2 = female, 3 = any passenger whose Age value is below 1.
    # NOTE(review): Age already holds decade bins at this point, so `Age < 1` presumably selects
    # every passenger under 10 years old — confirm this is the intended "child" group.
    frame.loc[frame['Sex'] == 'male', 'Sex'] = 1
    frame.loc[frame['Sex'] == 'female', 'Sex'] = 2
    frame.loc[frame['Age'] < 1, 'Sex'] = 3

    frame['Initial'] = frame['Initial'].replace(['Mr', 'Mrs', 'Miss', 'Master', 'Rev'], [1, 2, 3, 4, 5])
    frame['Cabin'] = frame['Cabin'].replace(['A', 'B', 'C', 'D', 'E', 'F', 'G'], [1, 2, 3, 4, 5, 6, 7])
    # replace() leaves values outside the list untouched, so this is a no-op when Embarked
    # has already been encoded by an earlier cell.
    frame['Embarked'] = frame['Embarked'].replace(['S', 'C', 'Q'], [1, 2, 3])

In [None]:
from sklearn.decomposition import PCA
from sklearn.cluster import MiniBatchKMeans

In [None]:
# Personal profile cluster from 'Pclass', 'Sex', 'Age'
personal_cols = ['Pclass', 'Sex', 'Age']
# Fit on train and test rows stacked together, then label each frame with the fitted model.
personal = np.vstack((train[personal_cols].values, test[personal_cols].values))
kmeans = MiniBatchKMeans(n_clusters=5, init='k-means++').fit(personal)
train.loc[:, 'personal_cluster'] = kmeans.predict(train[personal_cols])
test.loc[:, 'personal_cluster'] = kmeans.predict(test[personal_cols])

In [None]:
# Family cluster from 'SibSp', 'Parch', 'Initial'
fam_cols = ['SibSp', 'Parch', 'Initial']
fam = np.vstack((train[fam_cols].values, test[fam_cols].values))
# Fit on the family features themselves. The model was previously fitted on `personal`
# (Pclass/Sex/Age) and then asked to predict on these unrelated columns, so the labels were
# meaningless. random_state makes the labels stable when the cell is re-run.
# NOTE(review): cluster ids change with this fix; re-check the hard-coded fam_cluster_* feature
# names used for modelling.
kmeans = MiniBatchKMeans(n_clusters=5, init='k-means++', random_state=42).fit(fam)
train.loc[:, 'fam_cluster'] = kmeans.predict(train[fam_cols].values)
test.loc[:, 'fam_cluster'] = kmeans.predict(test[fam_cols].values)

In [None]:
# Port / cabin cluster from 'Fare', 'Cabin', 'Embarked'
ship_cols = ['Fare', 'Cabin', 'Embarked']
ship = np.vstack((train[ship_cols].values, test[ship_cols].values))
# Fit on the ship features themselves. The model was previously fitted on `personal`
# (Pclass/Sex/Age) and then asked to predict on these unrelated columns, so the labels were
# meaningless. random_state makes the labels stable when the cell is re-run.
# NOTE(review): cluster ids change with this fix; re-check the hard-coded ship_cluster_* feature
# names used for modelling.
kmeans = MiniBatchKMeans(n_clusters=5, init='k-means++', random_state=42).fit(ship)
train.loc[:, 'ship_cluster'] = kmeans.predict(train[ship_cols].values)
test.loc[:, 'ship_cluster'] = kmeans.predict(test[ship_cols].values)

### one-hot 대상: 
- 'Pclass', 'Sex', 'Embarked', 'Initial', 'Ticket', 'Cabin' + 'Age'
- + 'personal_cluster', 'fam_cluster', 'ship_cluster'

In [None]:
# Column names of both frames before one-hot encoding.
train.columns
test.columns

In [None]:
# One-hot encode the categorical, binned and cluster columns of both frames.
one_hot_columns = ['Pclass', 'Sex', 'Embarked', 'Initial', 'Ticket', 'Cabin',
                   'Age', 'personal_cluster', 'fam_cluster', 'ship_cluster']
train = pd.get_dummies(train, columns=one_hot_columns)
test = pd.get_dummies(test, columns=one_hot_columns)

# get_dummies only creates indicator columns for the categories present in the frame it is given,
# so train and test can end up with different columns. Add each missing column as all zeros
# (no row belongs to that category) so both frames expose the same features.
# Survived is the target and exists only in train.
for missing_col in [c for c in train.columns if c not in test.columns and c != 'Survived']:
    test[missing_col] = 0
for missing_col in [c for c in test.columns if c not in train.columns]:
    train[missing_col] = 0

In [None]:
# Shapes after one-hot encoding.
train.shape
test.shape

## 시각화

## 상관관계

In [None]:
# Annotated heatmap of the pairwise correlations between all columns of the encoded train frame.
plt.figure(figsize=(30, 8))
sns.heatmap(train.corr(),annot=True,cmap='RdYlGn', linewidths=0.2, annot_kws={'size':10})
plt.show()

# 3. 모델 학습

In [None]:
# 전처리 및 머신 러닝 알고리즘
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import VotingClassifier

## 데이터 준비

In [None]:
# Columns available for building the feature list.
train.columns

In [None]:
# Candidate model features. 'PassengerId', 'Survived' and the raw 'Age' column are deliberately
# left out; only three ship_cluster indicators are listed.
candidate_cols = [
    'SibSp', 'Parch', 'Fare', 'Family',
    'Pclass_1', 'Pclass_2', 'Pclass_3',
    'Sex_1', 'Sex_2', 'Sex_3',
    'Embarked_1', 'Embarked_2', 'Embarked_3',
    'Initial_1', 'Initial_2', 'Initial_3', 'Initial_4', 'Initial_5',
    'Ticket_1', 'Ticket_2', 'Ticket_3', 'Ticket_4', 'Ticket_5', 'Ticket_6', 'Ticket_7', 'Ticket_8', 'Ticket_9',
    'Cabin_1', 'Cabin_2', 'Cabin_3', 'Cabin_4', 'Cabin_5', 'Cabin_6', 'Cabin_7',
    'Age_0', 'Age_1', 'Age_2', 'Age_3', 'Age_4', 'Age_5', 'Age_6', 'Age_7',
    'personal_cluster_0', 'personal_cluster_1', 'personal_cluster_2', 'personal_cluster_3', 'personal_cluster_4',
    'fam_cluster_0', 'fam_cluster_1', 'fam_cluster_2', 'fam_cluster_3', 'fam_cluster_4',
    'ship_cluster_0', 'ship_cluster_1', 'ship_cluster_4',
]

# The one-hot names depend on which categories and cluster ids occur in the data, so keep only
# the candidates present in BOTH frames rather than failing later with a KeyError in
# train[cols] / test[cols]. Anything skipped is reported so it is not lost silently.
cols = [c for c in candidate_cols if c in train.columns and c in test.columns]
skipped_cols = [c for c in candidate_cols if c not in cols]
if skipped_cols:
    print('Feature columns missing from train or test (skipped):', skipped_cols)

In [None]:
# Hold out 20% of the labelled train rows; X_test / y_test come from `train`, not from the Kaggle test file.
X_train, X_test, y_train, y_test = train_test_split(train[cols], train['Survived'], test_size=0.2, random_state=42)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

## 하이퍼파라미터 튜닝

## 모델 훈련

In [None]:
# Baseline models; LightGBM is skipped because the dataset is small
ran = RandomForestClassifier(random_state=42)
log = LogisticRegression(random_state=42)
xgb = XGBClassifier(random_state=42)
gbc = GradientBoostingClassifier(random_state=42)
ext = ExtraTreesClassifier(random_state=42)

# Estimators and their display names, in matching order
models = [ran, log, xgb, gbc, ext]
model_names = ['Random Forest', 'Logistic Regression', 'XGBoost', 'Gradient Boosting', 'Extra Trees']
scores2 = {}

# Fit each model and store its 10-fold cross-validated accuracy scores under its display name
for display_name, estimator in zip(model_names, models):
    estimator.fit(X_train, y_train)
    scores2[display_name] = cross_val_score(estimator, X_train, y_train, scoring="accuracy", cv=10)

In [None]:
# Build the results table: one row per model, one column per CV fold, plus the row mean.
results = pd.DataFrame(scores2).T
results['mean'] = results.mean(1)

# Sort by mean accuracy, then drop the mean so the boxplot shows only the per-fold scores.
result_df = results.sort_values(by='mean', ascending=False)#.reset_index()
result_df.head(11)
result_df = result_df.drop(['mean'], axis=1)
sns.boxplot(data=result_df.T, orient='h')
plt.title('Machine Learning Algorithm Accuracy Score \n')
plt.xlabel('Accuracy Score (%)');

In [None]:
# Models ranked by mean cross-validated accuracy.
results.sort_values('mean', ascending=False)

### 하이퍼파라미터 튜닝

#RandomForestClassifier()

#cv15 - 

#cv 10 - 

#cv 5 - 0.8398896877770117
#{'criterion': 'gini', 'max_depth': 7, 'max_features': 0.4, 'min_samples_leaf': 3
#, 'min_samples_split': 2, 'n_estimators': 8}

criterion = ['gini','entropy']
max_depth = [6,7,8,9]
min_samples_leaf = [2,3,4]
min_samples_split = [1,2,3,4]
max_features = [0.3,0.4,0.5]
n_estimators = [7,8,9]

hyperparams = {'n_estimators':n_estimators
               ,'criterion':criterion
               ,'max_depth':max_depth
               ,'max_features':max_features
               ,'min_samples_split':min_samples_split
               ,'min_samples_leaf':min_samples_leaf
               ,'max_features': max_features}

gd=GridSearchCV(estimator = RandomForestClassifier(random_state=42
, criterion='entropy', max_depth=8, max_features='auto'
, min_samples_leaf=3, min_samples_split=3, n_estimators=11)
                , param_grid = hyperparams, 
                verbose=True, cv=5, scoring = "accuracy", n_jobs=-1)

gd.fit(X_train, y_train)
print(gd.best_score_)
print(gd.best_params_)

#LogisticRegression()

#cv=15 - 

#cv=10 - 0.8399843505477309
#{'C': 2.7825594022071245, 'max_iter': 10, 'multi_class': 'auto', 'penalty': 'l1', 'solver': 'liblinear'}

#cv=5 - 0.8413079877868611
#{'C': 464.15888336127773, 'max_iter': 100, 'multi_class': 'multinomial', 'penalty': 'l2', 'solver': 'lbfgs'}

penalty = ['l1','l2']
C = np.logspace(0, 4, 10)
solver = ['lbfgs', 'newton-cg', 'liblinear', 'sag', 'saga']
max_iter=[5, 10, 25]
multi_class=['auto', 'ovr', 'multinomial']

hyperparams = {'penalty': penalty
              ,'C': C
               ,'solver': solver
               ,'max_iter': max_iter
               ,'multi_class': multi_class}

gd=GridSearchCV(estimator = LogisticRegression(random_state=42)
                , param_grid = hyperparams, 
                verbose=True, cv=10, scoring = "accuracy", n_jobs=-1)

gd.fit(X_train, y_train)
print(gd.best_score_)
print(gd.best_params_)

#XGBClassifier

#cv=15 - 

#cv=10 - 

#cv=5 - 0.8454939426770413
#{'colsample_bytree': 0.85, 'gamma': 0.0, 'learning_rate': 0.5
#, 'max_depth': 2, 'min_child_weight': 2, 'n_estimators': 50
#, 'reg_alpha': 0.1, 'subsample': 1}

colsample_bytree = [0.75, 0.8, 0.85, 0.9]#0.6, 0.65, 0.7, 0.75, 0.8, 0.95
gamma = [i*0.1 for i in range(0,5)]
learning_rate = [0.3, 0.4, 0.5,0.6]#0.001, 0.005,  0.01,0.1, 
max_depth = [2, 3, 4] #  ,5, 6, 7 , 8, 9, 10
min_child_weight = [2,3,4]#, 3, 4, 5, 6
n_estimators = [25, 50, 100]# 10,, 250, 500, 1000]
reg_alpha = [0.1, 0.3, 0.5, 0.7, 1]#1e-5, , 1, 100]
subsample = [0.95, 1]#0.6, 0.65, 0.7, 0.75, 0.8, 0.85,0.9, , 1
    
hyperparams = {'learning_rate': learning_rate, 'n_estimators': n_estimators
               ,'max_depth': max_depth, 'min_child_weight': min_child_weight
               ,'gamma': gamma
               ,'subsample': subsample, 'colsample_bytree': colsample_bytree 
               ,'reg_alpha': reg_alpha
              }

gd=GridSearchCV(estimator = XGBClassifier(random_state=42)
                , param_grid = hyperparams, 
                verbose=True, cv=5, scoring = "accuracy", n_jobs=-1)

gd.fit(X_train, y_train)
print(gd.best_score_)
print(gd.best_params_)

#GradientBoostingClassifier

#cv=15 - 

#cv=10 -

#cv=5 - 0.8342164877376146
#{'learning_rate': 0.05, 'max_depth': 1, 'n_estimators': 120}

learning_rate = [0.01, 0.05, 0.15, 0.25] # 0.01, , 0.15, 0.1, 0.2, 0.5
max_depth = [1, 3, 5, 7] # 1, 8, 10, 15
n_estimators = [50, 75, 100, 120, 150] # , 100, 1000, 2000

hyperparams = {'learning_rate': learning_rate, 'n_estimators': n_estimators, 'max_depth':max_depth}

gd=GridSearchCV(estimator = GradientBoostingClassifier(random_state=42), param_grid = hyperparams, 
                verbose=True, cv=5, scoring = "accuracy", n_jobs=-1)

gd.fit(X_train, y_train)
print(gd.best_score_)
print(gd.best_params_)

In [None]:
# Run the garbage collector; the returned count is displayed.
gc.collect()

In [None]:
# Tuned models
# max_features='sqrt' replaces the former 'auto': for classifiers 'auto' meant sqrt(n_features),
# and the 'auto' option has been removed from scikit-learn, where it now raises an error.
ran = RandomForestClassifier(random_state=42,
                             criterion='entropy', max_depth=8, max_features='sqrt',
                             min_samples_leaf=3, min_samples_split=3, n_estimators=11)
log = LogisticRegression(random_state=42, C=1291.5496650148827)
xgb = XGBClassifier(random_state=42, learning_rate=0.2, n_estimators=10,
                    max_depth=6, min_child_weight=1, gamma=0.1,
                    subsample=1, colsample_bytree=1, reg_alpha=1e-05)
gbc = GradientBoostingClassifier(random_state=42)
ext = ExtraTreesClassifier(random_state=42)

# Estimators and their display names, in matching order
models = [ran, log, xgb, gbc, ext]
model_names = ['Random Forest', 'Logistic Regression', 'XGBoost', 'Gradient Boosting', 'Extra Trees']
scores2 = {}

# Fit each model and store its 10-fold cross-validated accuracy scores under its display name
for ind, mod in enumerate(models):
    mod.fit(X_train, y_train)
    acc = cross_val_score(mod, X_train, y_train, scoring="accuracy", cv=10)
    scores2[model_names[ind]] = acc

In [None]:
# Build the results table: one row per model, one column per CV fold, plus the row mean.
results = pd.DataFrame(scores2).T
results['mean'] = results.mean(1)

# Sort by mean accuracy, then drop the mean so the boxplot shows only the per-fold scores.
result_df = results.sort_values(by='mean', ascending=False)#.reset_index()
result_df.head(11)
result_df = result_df.drop(['mean'], axis=1)
sns.boxplot(data=result_df.T, orient='h')
plt.title('Machine Learning Algorithm Accuracy Score \n')
plt.xlabel('Accuracy Score (%)');

In [None]:
# Tuned models ranked by mean cross-validated accuracy.
results.sort_values('mean', ascending=False)

In [None]:
# Run the garbage collector; the returned count is displayed.
gc.collect()

In [None]:
# Parameter sets recorded from earlier grid searches:
# {'C': 2.7825594022071245, 'max_iter': 400, 'multi_class': 'auto', 'penalty': 'l2', 'solver': 'lbfgs'}
# {'C': 59.94842503189409, 'max_iter': 100, 'multi_class': 'auto', 'penalty': 'l2', 'solver': 'lbfgs'}
# {'C': 464.15888336127773, 'max_iter': 100, 'multi_class': 'auto', 'penalty': 'l2', 'solver': 'lbfgs'}
# 'C': 464.15888336127773, 'max_iter': 100, 'multi_class': 'multinomial', 'penalty': 'l2', 'solver': 'lbfgs'
# The last set is the one used for the final logistic-regression model.
log_params = {
    'C': 464.15888336127773,
    'max_iter': 100,
    'multi_class': 'multinomial',
    'penalty': 'l2',
    'solver': 'lbfgs',
}
log = LogisticRegression(random_state=42, **log_params)

In [None]:
# Fit the final logistic-regression model on all labelled rows.
log.fit(train[cols], train['Survived'])

In [None]:
# Parameter sets recorded from earlier grid searches:
#{'colsample_bytree': 0.8, 'gamma': 0.1, 'learning_rate': 0.3
#, 'max_depth': 2, 'min_child_weight': 3, 'n_estimators': 100, 'reg_alpha': 0.05, 'subsample': 1}
#'colsample_bytree': 0.85, 'gamma': 0.0, 'learning_rate': 0.5, 'max_depth': 2, 'min_child_weight': 2, 'n_estimators': 50
#, 'reg_alpha': 0.1, 'subsample': 1
# The second set is the one used for the final XGBoost model.
xgb_params = {
    'learning_rate': 0.5,
    'n_estimators': 50,
    'max_depth': 2,
    'min_child_weight': 2,
    'gamma': 0.0,
    'subsample': 1,
    'colsample_bytree': 0.85,
    'reg_alpha': 0.1,
}
xgb = XGBClassifier(random_state=42, **xgb_params)

In [None]:
# Fit the final XGBoost model on all labelled rows.
xgb.fit(train[cols], train['Survived'])

# 4. 예측

In [None]:
# Predict the Kaggle test set with the logistic-regression model.
# NOTE(review): the next cell assigns to the same `predictions` name, so on a top-to-bottom run
# this value is overwritten before any submission is written.
predictions = log.predict(test[cols])

In [None]:
# Predict the Kaggle test set with the XGBoost model (replaces the previous value of `predictions`).
predictions = xgb.predict(test[cols])

# 5. 제출

In [None]:
# Submission file for the logistic-regression model.
# The predictions are computed here rather than read from the shared `predictions` variable: on a
# top-to-bottom run that variable already holds the XGBoost output, so both submission files
# would contain identical predictions.
# NOTE(review): pairing this file with the logistic model is inferred from cell order — confirm.
log_predictions = log.predict(test[cols])
submission = pd.DataFrame({'PassengerId': test.PassengerId, 'Survived': log_predictions})
submission.to_csv("./output/20200910-2-1.csv", index=False)

In [None]:
# Submission file for the XGBoost model.
# The predictions are computed here so the file content does not depend on which prediction cell
# last assigned the shared `predictions` variable.
# NOTE(review): pairing this file with the XGBoost model is inferred from cell order — confirm.
xgb_predictions = xgb.predict(test[cols])
submission = pd.DataFrame({'PassengerId': test.PassengerId, 'Survived': xgb_predictions})
submission.to_csv("./output/20200910-3-2.csv", index=False)

In [None]:
# Run the garbage collector; the returned count is displayed.
gc.collect()