# 라이브러리 임포트, 경로 설정, 데이터 로드

In [1]:
import os
import numpy as np
import pandas as pd
import sweetviz as sv
import xgboost

from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.linear_model import LinearRegression, Ridge, Lasso, LogisticRegression
from sklearn.metrics import r2_score,mean_absolute_error, mean_squared_error
from sklearn.neural_network import MLPRegressor, MLPClassifier
from sklearn.ensemble import RandomForestClassifier,VotingClassifier
from sklearn.model_selection import StratifiedKFold, RandomizedSearchCV, GridSearchCV

from tqdm import trange, notebook



In [2]:
#나는 지금 어디...?
os.getcwd()

'C:\\Users\\frank\\data_analysis\\mystudy\\titanic_prac\\Titanic-main'

In [3]:
# 데이터 읽기, 테스트 데이터셋에 fare 결측치 딱 1개 있는거 미리 채워줌...(나중에 귀찮)
path = os.getcwd()
train = pd.read_csv(path + '/train.csv')
test = pd.read_csv(path + '/test.csv')
# test 데이터의 fare 컬럼에 있는 결측치 채우는 방법.
test["Fare"].fillna(test.groupby("Pclass")["Fare"].transform("median"), inplace=True)

# EDA(Exploratory Data Analysis)

In [5]:
#EDA(feat. SweetViz: EDA 자동화 툴)
#titanic EDA 라고 검색하면 많은 방법들이 나온다.
train_report = sv.analyze(train)
train_report.show_html(filepath = path + '/train_report.html')
test_report = sv.analyze(test)
test_report.show_html(filepath = path + '/test_report.html')

                                             |                                             | [  0%]   00:00 ->…

Report C:\Users\frank\data_analysis\mystudy\titanic_prac\Titanic-main/train_report.html was generated! NOTEBOOK/COLAB USERS: the web browser MAY not pop up, regardless, the report IS saved in your notebook/colab files.


                                             |                                             | [  0%]   00:00 ->…

Report C:\Users\frank\data_analysis\mystudy\titanic_prac\Titanic-main/test_report.html was generated! NOTEBOOK/COLAB USERS: the web browser MAY not pop up, regardless, the report IS saved in your notebook/colab files.


# Data Preprocessing

## Sex 컬럼이 text 데이터(male, female) 원핫인코딩(feat. 딕셔너리)

In [6]:
#성별 원핫인코딩
train['Sex'] = train['Sex'].map({'male' : 0, 'female' : 1})
test['Sex'] = test['Sex'].map({'male' : 0, 'female' : 1})

In [7]:
train['Sex']

0      0
1      1
2      1
3      1
4      0
      ..
886    0
887    1
888    1
889    0
890    0
Name: Sex, Length: 891, dtype: int64

In [8]:
#여기까지 얻은 정보만 가지고 모델 돌려보기
x = train[['Pclass', 'SibSp', 'Sex']]
y = train['Survived']

x_train, x_test, y_train, y_test = train_test_split(
    x, y, train_size =0.7, test_size=0.3)

model = MLPRegressor(
    hidden_layer_sizes=[512, 4], 
    max_iter=5000, 
    alpha=0.005, 
    random_state=42
)
model.fit(x_train, y_train)

print(r2_score(y_train, model.predict(x_train)))
print(r2_score(y_test, model.predict(x_test)))

test_x = test[['Pclass', 'SibSp', 'Sex']]
test_y_pred = model.predict(test_x)
submission = pd.read_csv(path + "/submission.csv")
submission['Survived'] = test_y_pred
submission.to_csv(
    path + '/submission_v0.01.csv', 
    index = False
)   #리더보드 AUC Score: 0.7872

0.38494971511477216
0.42863959715413613


## Age 컬럼 결측치 Pclass, SibSq, Parch, Fare데이터로 Ridge써서 예측하고 대체하기

In [9]:
#데이터셋에서 Age열에서 결측치 제외하고 추출: age_null_train
age_null_train = train.dropna(subset = ['Age'])

#age_null_train에서 학습데이터와 정답데이터 정의
train_xx = age_null_train[['Pclass', 'SibSp', 'Parch', 'Fare']]
train_yy = age_null_train['Age']

#학습데이터셋의 Age열에서 결측치가 있는 행 추출
age_null_at_train = train.loc[train['Age'].isnull()]

#테스트데이터셋의 Age열에서 결측치가 있는 행 추출
age_null_at_test = test.loc[train['Age'].isnull()]

#학습데이터셋에서 예측용 데이터 추출
age_null_at_train = age_null_at_train[['Pclass', 'SibSp', 'Parch', 'Fare']]

#테스트데이터셋에서 예측용 데이터 추출
age_null_at_test = age_null_at_test[['Pclass', 'SibSp', 'Parch', 'Fare']]

#모든 데이터를 실수 자료 형태로 전환
train_xx = train_xx.apply(pd.to_numeric)
train_xx = train_xx.astype(float)
train_yy = train_yy.apply(pd.to_numeric)
train_yy = train_yy.astype(float)

#모델 정의 및 학습
model = Ridge()
model.fit(train_xx, train_yy)

#예측
test_at_train_pred = abs(model.predict(age_null_at_train))
test_at_test_pred = abs(model.predict(age_null_at_test))
print(test_at_train_pred)
print(test_at_test_pred)

[27.5339525  34.20225953 27.5582485  27.5582485  27.54536076 27.54503374
 34.50649354 27.547906   27.55816576 27.54503374 27.54199601 23.57204547
 27.547906   19.62712968 40.51678816 40.67003835 22.37983821 27.54503374
 27.54199601 27.54716725 27.54199601 27.54199601 27.54503374 27.5474135
 23.40164052 27.54199601 27.547906   22.239722   25.0058088  27.55652475
 27.52992976  6.64946829 38.9354233  40.7054156  14.53211537  6.64946829
 34.16187454 40.23113826 23.57204547 27.547906   27.547906    6.64946829
 23.72472041 27.54503374 14.53211537 27.551846   23.59264772 23.57204547
 27.557756   39.65589845 27.547906   27.547906   40.60543813 27.547906
 34.45835944 40.7039381  40.67003835 40.61528813 27.547906   19.59618493
 34.21506452 27.54199601 39.03173765  6.64946829 19.59618493 34.76004822
 27.54503374 23.56022547 40.52663816 27.5582485  27.54536076 27.54536076
 23.57204547 27.55816576 27.547906   35.77418812 27.54503374 27.54831576
 14.53211537 27.54503374 27.56547249 34.45835944 27.54

In [10]:
print('학습데이터셋 행의 갯수: ', len(train))
print('예측한 학습데이터셋의 age 행의 결측치 갯수: ', len(test_at_train_pred))
print('테스트데이터셋 행의 갯수: ', len(test))
print('예측한 테스트데이터셋의 age 행의 결측치 갯수: ', len(test_at_test_pred))

학습데이터셋 행의 갯수:  891
예측한 학습데이터셋의 age 행의 결측치 갯수:  177
테스트데이터셋 행의 갯수:  418
예측한 테스트데이터셋의 age 행의 결측치 갯수:  83


In [11]:
#결측치 넣기 전 학습데이터셋
train.tail()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
886,887,0,2,"Montvila, Rev. Juozas",0,27.0,0,0,211536,13.0,,S
887,888,1,1,"Graham, Miss. Margaret Edith",1,19.0,0,0,112053,30.0,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",1,,1,2,W./C. 6607,23.45,,S
889,890,1,1,"Behr, Mr. Karl Howell",0,26.0,0,0,111369,30.0,C148,C
890,891,0,3,"Dooley, Mr. Patrick",0,32.0,0,0,370376,7.75,,Q


In [12]:
#결측치 넣기 전 테스트데이터셋
test.tail()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
413,1305,3,"Spector, Mr. Woolf",0,,0,0,A.5. 3236,8.05,,S
414,1306,1,"Oliva y Ocana, Dona. Fermina",1,39.0,0,0,PC 17758,108.9,C105,C
415,1307,3,"Saether, Mr. Simon Sivertsen",0,38.5,0,0,SOTON/O.Q. 3101262,7.25,,S
416,1308,3,"Ware, Mr. Frederick",0,,0,0,359309,8.05,,S
417,1309,3,"Peter, Master. Michael J",0,,1,1,2668,22.3583,,C


In [13]:
#데이터셋 결측치를 예측값으로 대체하기

for i in range(len(train)):
    if np.isnan(train.iloc[i, 5]) == True:
        for j in range(len(test_at_train_pred)):
            train.iloc[i, 5] = test_at_train_pred[j]
    else:
        pass
print('수정된 학습데이터셋 Age 값 확인: \n', train['Age'])

for i in range(len(test)):
    if np.isnan(test.iloc[i, 4]) == True:
        for j in range(len(test_at_test_pred)):
            test.iloc[i, 4] = test_at_test_pred[j]
    else:
        pass
print('\n 수정된 테스트데이터셋 Age 값 확인: \n', test['Age'])

수정된 학습데이터셋 Age 값 확인: 
 0      22.000000
1      38.000000
2      26.000000
3      35.000000
4      35.000000
         ...    
886    27.000000
887    19.000000
888    21.021001
889    26.000000
890    32.000000
Name: Age, Length: 891, dtype: float64

 수정된 테스트데이터셋 Age 값 확인: 
 0      34.500000
1      47.000000
2      62.000000
3      27.000000
4      22.000000
         ...    
413    27.557756
414    39.000000
415    38.500000
416    27.557756
417    27.557756
Name: Age, Length: 418, dtype: float64


In [14]:
# 결측치 대체 후 학습데이터셋
train.tail()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
886,887,0,2,"Montvila, Rev. Juozas",0,27.0,0,0,211536,13.0,,S
887,888,1,1,"Graham, Miss. Margaret Edith",1,19.0,0,0,112053,30.0,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",1,21.021001,1,2,W./C. 6607,23.45,,S
889,890,1,1,"Behr, Mr. Karl Howell",0,26.0,0,0,111369,30.0,C148,C
890,891,0,3,"Dooley, Mr. Patrick",0,32.0,0,0,370376,7.75,,Q


In [15]:
# 결측치 대체 후 테스트데이터셋
train.tail()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
886,887,0,2,"Montvila, Rev. Juozas",0,27.0,0,0,211536,13.0,,S
887,888,1,1,"Graham, Miss. Margaret Edith",1,19.0,0,0,112053,30.0,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",1,21.021001,1,2,W./C. 6607,23.45,,S
889,890,1,1,"Behr, Mr. Karl Howell",0,26.0,0,0,111369,30.0,C148,C
890,891,0,3,"Dooley, Mr. Patrick",0,32.0,0,0,370376,7.75,,Q


In [16]:
#age 데이터를 추가해서 아까 모델 다시 돌려보기
x = train[['Pclass', 'SibSp', 'Age', 'Sex']]
y = train['Survived']

x_train, x_test, y_train, y_test = train_test_split(
    x, y, train_size =0.7, test_size=0.3)

model = MLPRegressor(
    hidden_layer_sizes=[512, 4], 
    max_iter=5000, 
    alpha=0.005, 
    random_state=42)
model.fit(x_train, y_train)

print('train r2 score: ', r2_score(y_train, model.predict(x_train)))
print('test r2 score: ', r2_score(y_test, model.predict(x_test)))

test_x = test[['Pclass', 'SibSp', 'Age', 'Sex']]
test_y_pred = model.predict(test_x)
submission['Survived'] = test_y_pred
submission.to_csv(
    path + '/submission_v0.02.csv', 
    index = False
) #리더보드 AUC Score: 0.8092

train r2 score:  0.3876562486674936
test r2 score:  0.4069746192910507


## Embarked 컬럼 원핫인코딩(feat. pd.get_dummies())

In [17]:
#embarked 데이터 전처리: 원핫인코딩
train['Embarked'] = train['Embarked'].fillna('S')
test['Embarked'] = test['Embarked'].fillna('S')

train_embark_dummy = pd.get_dummies(train['Embarked'])
test_embark_dummy = pd.get_dummies(test['Embarked'])

train = pd.concat([train, train_embark_dummy], axis = 1)
test = pd.concat([test, test_embark_dummy], axis = 1)

#나중에 cabin데이터 결측치 채워줄 때 여기도 원핫인코딩 쓰는데 컬럼명 'C'가 겹쳐서 'C_' 이름 바꿔줌
train.rename(columns={'C' : 'C_'}, inplace = True)
test.rename(columns={'C' : 'C_'}, inplace = True)

In [18]:
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,C_,Q,S
0,1,0,3,"Braund, Mr. Owen Harris",0,22.0,1,0,A/5 21171,7.25,,S,0,0,1
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",1,38.0,1,0,PC 17599,71.2833,C85,C,1,0,0
2,3,1,3,"Heikkinen, Miss. Laina",1,26.0,0,0,STON/O2. 3101282,7.925,,S,0,0,1
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",1,35.0,1,0,113803,53.1,C123,S,0,0,1
4,5,0,3,"Allen, Mr. William Henry",0,35.0,0,0,373450,8.05,,S,0,0,1


In [19]:
test.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,C_,Q,S
0,892,3,"Kelly, Mr. James",0,34.5,0,0,330911,7.8292,,Q,0,1,0
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",1,47.0,1,0,363272,7.0,,S,0,0,1
2,894,2,"Myles, Mr. Thomas Francis",0,62.0,0,0,240276,9.6875,,Q,0,1,0
3,895,3,"Wirz, Mr. Albert",0,27.0,0,0,315154,8.6625,,S,0,0,1
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",1,22.0,1,1,3101298,12.2875,,S,0,0,1


## Cabin 컬럼 결측치를 Fare데이터로 RandomForest써서 예측해서 대체하기

In [20]:
#cabin 컬럼의 첫번째 글자만 남기고 날리기
train['Cabin'] = train['Cabin'].str.get(i=0)
test['Cabin'] = test['Cabin'].str.get(i=0)

In [21]:
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,C_,Q,S
0,1,0,3,"Braund, Mr. Owen Harris",0,22.0,1,0,A/5 21171,7.25,,S,0,0,1
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",1,38.0,1,0,PC 17599,71.2833,C,C,1,0,0
2,3,1,3,"Heikkinen, Miss. Laina",1,26.0,0,0,STON/O2. 3101282,7.925,,S,0,0,1
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",1,35.0,1,0,113803,53.1,C,S,0,0,1
4,5,0,3,"Allen, Mr. William Henry",0,35.0,0,0,373450,8.05,,S,0,0,1


In [22]:
df1_train = train[['Fare', 'Cabin']]
df1_test = test[['Fare', 'Cabin']]

In [23]:
#데이터셋의 Cabin열에서 결측치 제외하고 추출: cabin_null_train
cabin_null_train = df1_train.dropna(subset = ['Cabin'])

#cabin_null_train에서 학습데이터와 정답데이터 정의
train_xxx = cabin_null_train[['Fare']]
train_cabin_dummy = pd.get_dummies(cabin_null_train['Cabin'])
train_yyy = train_cabin_dummy[['A', 'B', 'C', 'D', 'E', 'F', 'G', 'T']]

#학습데이터셋의 Cabin열에서 결측치가 있는 행 추출
cabin_null_at_train = df1_train.loc[df1_train['Cabin'].isnull()]

#테스트데이터셋의 Cabin열에서 결측치가 있는 행 추출
cabin_null_at_test = df1_test.loc[df1_test['Cabin'].isnull()]

#학습데이터셋에서 예측용 데이터 추출
cabin_null_at_train = cabin_null_at_train[['Fare']]

#테스트데이터셋에서 예측용 데이터 추출 및 결측치 대체(평균)
cabin_null_at_test = cabin_null_at_test[['Fare']]
cabin_null_at_test = cabin_null_at_test.fillna(cabin_null_at_test.mean())

#모든 데이터를 실수 자료 형태로 전환
train_xxx = train_xxx.apply(pd.to_numeric)
train_xxx = train_xxx.astype(float)
train_yyy = train_yyy.apply(pd.to_numeric)
train_yyy = train_yyy.astype(float)

#모델 정의 및 학습
model = RandomForestClassifier()
model.fit(train_xxx, train_yyy)

#예측 및 결과값 반올림(0또는 1)
test_at_train_pred = model.predict(cabin_null_at_train)
test_at_test_pred = model.predict(cabin_null_at_test)
print(test_at_train_pred)
print(test_at_test_pred)

[[0. 0. 0. ... 1. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 1. 0. 0.]
 [0. 0. 0. ... 1. 0. 0.]]
[[0. 0. 0. ... 1. 0. 0.]
 [0. 0. 0. ... 1. 0. 0.]
 [0. 0. 0. ... 0. 1. 0.]
 ...
 [0. 0. 0. ... 1. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 1. 0. 0.]]


In [24]:
#예측한 test 데이터의 cabin 컬럼 데이터엔 T로 시작하는 객실이 안나와서 별도로 추가
df2_train = pd.get_dummies(df1_train['Cabin'])
df2_test = pd.get_dummies(df1_test['Cabin'])
df2_test['T'] = 0

#Cabin 결측치 채워주기 - Train 데이터셋
for i in notebook.tqdm(range(len(df1_train))):
    if df2_train.sum(axis=1)[i] == 0:
        for j in range(len(test_at_train_pred)):
            df2_train.iloc[i] = test_at_train_pred[j]
    else:
        pass
print('수정된 학습데이터셋 cabin 값 확인: \n', df2_train)

#Cabin 결측치 채워주기 - Test 데이터셋
for i in notebook.tqdm(range(len(df1_test))):
    if df2_test.sum(axis=1)[i] == 0:
        for j in range(len(test_at_test_pred)):
            df2_test.iloc[i] = test_at_test_pred[j]
    else:
        pass
print('수정된 테스트데이터셋 cabin 값 확인: \n', df2_test)

#합체
train = pd.concat([train, df2_train], axis = 1)
test = pd.concat([test, df2_test], axis = 1)

  0%|          | 0/891 [00:00<?, ?it/s]

수정된 학습데이터셋 cabin 값 확인: 
      A  B  C  D  E  F  G  T
0    0  0  0  0  0  1  0  0
1    0  0  1  0  0  0  0  0
2    0  0  0  0  0  1  0  0
3    0  0  1  0  0  0  0  0
4    0  0  0  0  0  1  0  0
..  .. .. .. .. .. .. .. ..
886  0  0  0  0  0  1  0  0
887  0  1  0  0  0  0  0  0
888  0  0  0  0  0  1  0  0
889  0  0  1  0  0  0  0  0
890  0  0  0  0  0  1  0  0

[891 rows x 8 columns]


  0%|          | 0/418 [00:00<?, ?it/s]

수정된 테스트데이터셋 cabin 값 확인: 
      A  B  C  D  E  F  G  T
0    0  0  0  0  0  1  0  0
1    0  0  0  0  0  1  0  0
2    0  0  0  0  0  1  0  0
3    0  0  0  0  0  1  0  0
4    0  0  0  0  0  1  0  0
..  .. .. .. .. .. .. .. ..
413  0  0  0  0  0  1  0  0
414  0  0  1  0  0  0  0  0
415  0  0  0  0  0  1  0  0
416  0  0  0  0  0  1  0  0
417  0  0  0  0  0  1  0  0

[418 rows x 8 columns]


In [43]:
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,...,Q,S,A,B,C,D,E,F,G,T
0,1,0,3,"Braund, Mr. Owen Harris",0,22.0,1,0,A/5 21171,7.25,...,0,1,0,0,0,0,0,1,0,0
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",1,38.0,1,0,PC 17599,71.2833,...,0,0,0,0,1,0,0,0,0,0
2,3,1,3,"Heikkinen, Miss. Laina",1,26.0,0,0,STON/O2. 3101282,7.925,...,0,1,0,0,0,0,0,1,0,0
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",1,35.0,1,0,113803,53.1,...,0,1,0,0,1,0,0,0,0,0
4,5,0,3,"Allen, Mr. William Henry",0,35.0,0,0,373450,8.05,...,0,1,0,0,0,0,0,1,0,0


In [44]:
#학습에 안 쓸 불필요해보이는 데이터들 제거
train_final = train.drop(['PassengerId', 'Name', 'Ticket', 'Cabin', 'Embarked'], axis = 1)
test_final = test.drop(['PassengerId', 'Name', 'Ticket', 'Cabin', 'Embarked'], axis = 1)

In [45]:
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,...,Q,S,A,B,C,D,E,F,G,T
0,1,0,3,"Braund, Mr. Owen Harris",0,22.0,1,0,A/5 21171,7.25,...,0,1,0,0,0,0,0,1,0,0
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",1,38.0,1,0,PC 17599,71.2833,...,0,0,0,0,1,0,0,0,0,0
2,3,1,3,"Heikkinen, Miss. Laina",1,26.0,0,0,STON/O2. 3101282,7.925,...,0,1,0,0,0,0,0,1,0,0
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",1,35.0,1,0,113803,53.1,...,0,1,0,0,1,0,0,0,0,0
4,5,0,3,"Allen, Mr. William Henry",0,35.0,0,0,373450,8.05,...,0,1,0,0,0,0,0,1,0,0


In [46]:
train_final.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,C_,Q,S,...,Q.1,S.1,A,B,C,D,E,F,G,T
0,0,3,0,22.0,1,0,7.25,0,0,1,...,0,1,0,0,0,0,0,1,0,0
1,1,1,1,38.0,1,0,71.2833,1,0,0,...,0,0,0,0,1,0,0,0,0,0
2,1,3,1,26.0,0,0,7.925,0,0,1,...,0,1,0,0,0,0,0,1,0,0
3,1,1,1,35.0,1,0,53.1,0,0,1,...,0,1,0,0,1,0,0,0,0,0
4,0,3,0,35.0,0,0,8.05,0,0,1,...,0,1,0,0,0,0,0,1,0,0


In [47]:
#train - test - split
x = train_final.drop(['Survived'], axis = 1)
y = train_final['Survived']

x_train, x_test, y_train, y_test = train_test_split(x, y, train_size =0.7, test_size=0.3)

#지금까지 전처리한 데이터로 아까 모델 다시 돌려보기
model = MLPRegressor(
    hidden_layer_sizes=[512, 4], 
    max_iter=5000, 
    alpha=0.005, 
    random_state=42
)
model.fit(x_train, y_train)

print(r2_score(y_train, model.predict(x_train)))   #output: 0.45
print(r2_score(y_test, model.predict(x_test)))   #output: 0.38

0.48243535720904374
0.29919289955210315


In [48]:
#예측과 제출파일 저장
test_x = test_final
test_y_pred = model.predict(test_x)
submission = pd.read_csv(path + "/submission.csv")
submission['Survived'] = test_y_pred
submission.to_csv(
    path + '/submission_v0.03.csv', 
    index = False
) # 리더보드 AUC Score : 0.7756

# Data Scaling(feat. MinMaxScaler(), StandardScaler())

StandardScaler: 기본 스케일. 평균과 표준편차 사용<br>
MinMaxScaler: 최대값과 최소값이 각각 1, 0 이 되도록 스케일링
![scaler](scaler.png)

In [49]:
#Data Scaling의 중요성
from sklearn.preprocessing import StandardScaler, MinMaxScaler

train_test_data = [train_final, test_final]

scaler = MinMaxScaler()
for dataset in train_test_data:
    array = dataset['Age'].values.reshape(-1,1) # 2D array로 변환
    scaler.fit(array) # 스케일링에 필요한 값(최소값, range 등) 계산
    dataset['AgeScale'] = pd.Series(scaler.transform(array).reshape(-1)) # 스케일링 후 series로 추가

for dataset in train_test_data:
    array = dataset['Pclass'].values.reshape(-1,1) # 2D array로 변환
    scaler.fit(array) # 스케일링에 필요한 값(최소값, range 등) 계산
    dataset['PclassScale'] = pd.Series(scaler.transform(array).reshape(-1)) # 스케일링 후 series로 추가
    
#Bias 있어서 스탠다드 씀
scaler = StandardScaler()
for dataset in train_test_data:
    array = dataset['Fare'].values.reshape(-1,1)
    scaler.fit(array)
    dataset['FareScale'] = pd.Series(scaler.transform(array).reshape(-1))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataset['AgeScale'] = pd.Series(scaler.transform(array).reshape(-1)) # 스케일링 후 series로 추가
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataset['AgeScale'] = pd.Series(scaler.transform(array).reshape(-1)) # 스케일링 후 series로 추가
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataset['PclassScale'] = pd

In [50]:
#진짜 본격적으로 여러 모델 돌려보기 전에 스케일링해서 필요없어졌거나 불필요해보이는 데이터들 날려주기
train_real_final = train_final.drop(['Age', 'Pclass', 'Fare', 'SibSp', 'Parch', 'C_', 'Q', 'S'], axis = 1)
test_real_final = test_final.drop(['Age', 'Pclass', 'Fare', 'SibSp', 'Parch', 'C_', 'Q', 'S'], axis = 1)

In [51]:
#MLPRegressor(계속 쓰고 있던 모델)

x = train_real_final.drop(['Survived'], axis = 1)
y = train_real_final['Survived']

x_train, x_test, y_train, y_test = train_test_split(x, y, train_size =0.7, test_size=0.3)

MLPR = MLPRegressor(hidden_layer_sizes=[512, 4], max_iter=5000, alpha=0.005, random_state=42)
MLPR.fit(x_train, y_train)

print(r2_score(y_train, MLPR.predict(x_train)))   #output: 0.48
print(r2_score(y_test, MLPR.predict(x_test)))   #output: 0.46

test_x = test_real_final
test_y_pred = np.around(MLPR.predict(test_x))
submission = pd.read_csv(path + "/submission.csv")
submission['Survived'] = test_y_pred
submission.to_csv(path + '/submission_v0.04.csv', index = False)

#예측한 결과값 반올림하고 acc도 측정해보기
preds = np.around(MLPR.predict(x_test))
accuracy = float(np.sum(preds==y_test))/y_test.shape[0]
print("accuracy: %f" % (accuracy))   #output: 0.82

0.5115267301793742
0.4115811581330052
accuracy: 0.820896


In [52]:
#MLPClassifier

MLPC = MLPClassifier(hidden_layer_sizes=(512, 4), max_iter=5000)
MLPC.fit(x_train, y_train)

print(r2_score(y_train, MLPC.predict(x_train)))   #output: 0.30
print(r2_score(y_test, MLPC.predict(x_test)))   #output: 0.33

test_x = test_real_final
test_y_pred = np.around(MLPC.predict(test_x))
submission = pd.read_csv(path + "/submission.csv")
submission['Survived'] = test_y_pred
submission.to_csv(path + '/submission_v0.05.csv', index = False)

preds = np.around(MLPC.predict(x_test))
accuracy = float(np.sum(preds==y_test))/y_test.shape[0]
print("accuracy: %f" % (accuracy))   #output: 0.84

#예측과 제출파일 저장
test_y_pred = MLPC.predict(test_x)
submission = pd.read_csv(path + "/submission.csv")
submission['Survived'] = test_y_pred
submission.to_csv(path + '/submission_v0.06.csv', index = False)

0.3252225201072385
0.2055335968379446
accuracy: 0.820896


In [53]:
#XGBoost Regressor
xgbR_model = xgboost.XGBRegressor(n_estimators=1000, learning_rate=0.01, gamma=0.1, subsample=0.7,
                           colsample_bytree=1, max_depth=9)

xgbR_model.fit(x_train, y_train)
print(r2_score(y_train, xgbR_model.predict(x_train)))   #output: 0.89  <-오버피팅일까...?
print(r2_score(y_test, xgbR_model.predict(x_test)))   #output: 0.46
test_y_pred = xgbR_model.predict(test_x)
submission = pd.read_csv(path + "/submission.csv")
submission['Survived'] = test_y_pred
submission.to_csv(path + '/submission_v0.11.csv', index = False)

0.9023899965075861
0.4364605423573561


In [54]:
#XGB Classifier와 parameter detector
params = {
        'eta': [0.1, 0.01, 0.02, 0.001],
        'min_child_weight': [1, 3, 5, 7, 10],
        'gamma': [0, 0.5, 1, 1.5, 2, 3, 4, 5],
        'subsample': [0.6, 0.7, 0.8, 0.9, 1.0],
        'colsample_bytree': [0.6, 0.7, 0.8, 0.9, 1.0],
        'max_depth': [3, 4, 5, 6, 7, 8, 9]
        }

xgbM = xgboost.XGBClassifier(n_estimators=600, objective='binary:logistic', silent=True, nthread=1)

folds = 3
param_comb = 5

skf = StratifiedKFold(n_splits=folds, shuffle = True, random_state = 1001)

random_search = RandomizedSearchCV(
    xgbM, param_distributions=params, 
    n_iter=param_comb, 
    scoring='roc_auc', 
    n_jobs=4, 
    cv=skf.split(x_train,y_train), 
    verbose=3, 
    random_state=1001)

random_search.fit(x_train, y_train)

Fitting 3 folds for each of 5 candidates, totalling 15 fits




Parameters: { "silent" } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.




RandomizedSearchCV(cv=<generator object _BaseKFold.split at 0x0000022211230890>,
                   estimator=XGBClassifier(base_score=None, booster=None,
                                           colsample_bylevel=None,
                                           colsample_bynode=None,
                                           colsample_bytree=None, gamma=None,
                                           gpu_id=None, importance_type='gain',
                                           interaction_constraints=None,
                                           learning_rate=None,
                                           max_delta_step=None, max_depth=None,
                                           min_child_weight=None, mis...
                                           subsample=None, tree_method=None,
                                           validate_parameters=None,
                                           verbosity=None),
                   n_iter=5, n_jobs=4,
                   p

In [56]:
'''
찾은 best parameter
{'subsample': 0.7,
 'min_child_weight': 1,
 'max_depth': 4,
 'gamma': 3,
 'eta': 0.01,
 'colsample_bytree': 0.8}
 '''
random_search.best_params_

{'subsample': 0.7,
 'min_child_weight': 1,
 'max_depth': 4,
 'gamma': 3,
 'eta': 0.01,
 'colsample_bytree': 0.8}

In [57]:
#best parameter로 XGBClassifer 돌리기

xgbM_model = xgboost.XGBClassifier(
    eta = random_search.best_params_['eta'], 
    n_estimators = 600, 
    objective = 'binary:logistic', 
    silent=True, 
    nthread=1, 
    subsample = random_search.best_params_['subsample'],
    min_child_weight = random_search.best_params_['min_child_weight'],
    max_depth = random_search.best_params_['max_depth'],
    gamma = random_search.best_params_['gamma'],
    colsample_bytree = random_search.best_params_['colsample_bytree'])

xgbM_model.fit(x_train, y_train)
print(r2_score(y_train, xgbM_model.predict(x_train)))   #output: 0.41
print(r2_score(y_test, xgbM_model.predict(x_test)))   #output: 0.39
preds = xgbM_model.predict(x_test)
accuracy = float(np.sum(preds==y_test))/y_test.shape[0]
print("accuracy: %f" % (accuracy))   #output: 0.85

test_x = test_real_final
test_y_pred = xgbM_model.predict(test_x)
submission = pd.read_csv(path + "/submission.csv")
submission['Survived'] = test_y_pred
submission.to_csv(
    path + '/submission_v0.07.csv', 
    index = False
) # 리더보드 AUC Score : 0.8121

Parameters: { "silent" } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


0.445479892761394
0.321393280632411
accuracy: 0.847015


In [58]:
train_real_final

Unnamed: 0,Survived,Sex,A,B,C,D,E,F,G,T,AgeScale,PclassScale,FareScale
0,0,0,0,0,0,0,0,1,0,0,0.271174,1.0,-0.502445
1,1,1,0,0,1,0,0,0,0,0,0.472229,0.0,0.786845
2,1,1,0,0,0,0,0,1,0,0,0.321438,1.0,-0.488854
3,1,1,0,0,1,0,0,0,0,0,0.434531,0.0,0.420730
4,0,0,0,0,0,0,0,1,0,0,0.434531,1.0,-0.486337
...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,0,0,0,0,0,0,0,1,0,0,0.334004,0.5,-0.386671
887,1,1,0,1,0,0,0,0,0,0,0.233476,0.0,-0.044381
888,0,1,0,0,0,0,0,1,0,0,0.258872,1.0,-0.176263
889,1,0,0,0,1,0,0,0,0,0,0.321438,0.0,-0.044381
