In [115]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score
import pandas as pd
import random
import os
import numpy as np
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import math
from sklearn.multioutput import MultiOutputRegressor
import matplotlib.pyplot as plt # 득점모델 변수 중요도
import seaborn as sns
%matplotlib inline
from sklearn.decomposition import PCA
from sklearn.preprocessing import MinMaxScaler

In [116]:
def seed_everything(seed):
    """Seed the random number sources this notebook relies on.

    Seeds Python's ``random`` module and NumPy's legacy global generator, and
    writes the seed into the ``PYTHONHASHSEED`` environment variable.

    Args:
        seed: Seed value; its ``str()`` form is stored in ``PYTHONHASHSEED``.
    """
    # NOTE(review): PYTHONHASHSEED is normally read at interpreter start-up, so
    # this assignment presumably only influences child processes - confirm.
    os.environ['PYTHONHASHSEED'] = str(seed)
    for seeder in (random.seed, np.random.seed):
        seeder(seed)


seed_everything(42)  # fix the seed

In [117]:
# Load the raw train/test tables and remove the same seven X columns from both.
_dropped_cols = ['X_02', 'X_48', 'X_47', 'X_23', 'X_11', 'X_10', 'X_04']
train_df = pd.read_csv('./train.csv').drop(columns=_dropped_cols)
test_df = pd.read_csv('./test.csv').drop(columns=_dropped_cols)

In [118]:
# Split each frame by column name: names matching 'X' are inputs, names
# matching 'Y' are outputs (Y_01, ...). The regex is unanchored, so any column
# whose name contains the letter is selected.
train_x, train_y = (train_df.filter(regex=pattern) for pattern in ('X', 'Y'))
test_x, test_y = (test_df.filter(regex=pattern) for pattern in ('X', 'Y'))

# 군집화 코드(GMM)
 - 전체 데이터 셋에서 군집화 실시 후 예측

In [119]:
# Stack train and test features row-wise; the original row labels are kept, so
# the combined index may contain duplicates.
data = pd.concat((train_x, test_x), axis='index')

In [120]:
from sklearn.mixture import GaussianMixture

# Fit a 5-component Gaussian mixture on the combined train+test feature matrix.
# random_state is pinned explicitly so that re-running this cell yields the same
# clustering no matter how much of NumPy's global RNG stream earlier cells have
# already consumed (without it the fit draws from the global generator).
gmm = GaussianMixture(n_components=5, random_state=42)
gmm.fit(data)

In [121]:
# Assign every train / test row to its most likely mixture component.
# NOTE(review): these labels are not consumed by any later cell visible in this
# notebook - confirm whether they were meant to be added as a feature.
gmm_cluster_labels_tr, gmm_cluster_labels_te = (
    gmm.predict(features) for features in (train_x, test_x)
)

## 구간화 후 XGBoost 실행
 - X_49 기준으로 구간화한 뒤 구간별로 XGBoost 모델 학습 (위에서 구한 GMM 군집 값은 아래 코드에서 입력으로 사용되지 않음)

In [122]:
# Binning: split the training rows into three bands by the value of X_49.
x49_train = train_df['X_49']
train_df1 = train_df[x49_train < 10000]                            # X_49 < 10000
train_df2 = train_df[(x49_train >= 10000) & (x49_train < 50000)]   # 10000 <= X_49 < 50000
train_df3 = train_df[x49_train >= 50000]                           # X_49 >= 50000

In [123]:
# Band 1: columns matching 'X' are the inputs, columns matching 'Y' the outputs.
train_x1, train_y1 = (train_df1.filter(regex=pattern) for pattern in ('X', 'Y'))

In [154]:
import xgboost as xgb
from sklearn.metrics import r2_score

# Band 1 model: one gradient-boosted regressor per Y column, all sharing the
# hyper-parameters below.
xgb_model = xgb.XGBRegressor(
    n_estimators=50,
    max_depth=10,
    learning_rate=0.1,
    n_jobs=-1,
)
XGB_1 = MultiOutputRegressor(estimator=xgb_model).fit(train_x1, train_y1)

In [155]:
# In-sample RMSE for band 1: the model is scored on the same rows it was fitted
# on, so this measures fit quality only, not generalisation.
train_predict = XGB_1.predict(train_x1)
# mean_squared_error expects (y_true, y_pred); pass the arguments in that order.
print("RMSE':{}".format(math.sqrt(mean_squared_error(train_y1, train_predict))))

RMSE':0.33414132942707003


In [156]:
# Band 2: columns matching 'X' are the inputs, columns matching 'Y' the outputs.
train_x2, train_y2 = (train_df2.filter(regex=pattern) for pattern in ('X', 'Y'))

In [157]:
# Band 2 model: one gradient-boosted regressor per Y column; this band uses
# more and deeper trees than the other two.
xgb_model = xgb.XGBRegressor(
    n_estimators=100,
    max_depth=15,
    learning_rate=0.1,
    n_jobs=-1,
)
XGB_2 = MultiOutputRegressor(estimator=xgb_model).fit(train_x2, train_y2)

In [158]:
# In-sample RMSE for band 2: the model is scored on the same rows it was fitted
# on, so this measures fit quality only, not generalisation.
train_predict = XGB_2.predict(train_x2)
# mean_squared_error expects (y_true, y_pred); pass the arguments in that order.
print("RMSE':{}".format(math.sqrt(mean_squared_error(train_y2, train_predict))))

RMSE':0.32244751841593683


In [159]:
# Band 3: columns matching 'X' are the inputs, columns matching 'Y' the outputs.
train_x3, train_y3 = (train_df3.filter(regex=pattern) for pattern in ('X', 'Y'))

In [160]:
# Band 3 model: one gradient-boosted regressor per Y column, with the same
# hyper-parameters as the band 1 model.
xgb_model = xgb.XGBRegressor(
    n_estimators=50,
    max_depth=10,
    learning_rate=0.1,
    n_jobs=-1,
)
XGB_3 = MultiOutputRegressor(estimator=xgb_model).fit(train_x3, train_y3)

In [161]:
# In-sample RMSE for band 3: the model is scored on the same rows it was fitted
# on, so this measures fit quality only, not generalisation.
train_predict = XGB_3.predict(train_x3)
# mean_squared_error expects (y_true, y_pred); pass the arguments in that order.
print("RMSE':{}".format(math.sqrt(mean_squared_error(train_y3, train_predict))))

RMSE':0.21727890333558217


In [162]:
# Reload the test data, this time without the ID column, and remove the same
# seven X columns that were dropped for training (described by the original
# author as uncorrelated columns).
test_df = (
    pd.read_csv('./test.csv')
    .drop(columns=['ID'])
    .drop(columns=['X_02', 'X_48', 'X_47', 'X_23', 'X_11', 'X_10', 'X_04'])
)


In [163]:
# Band the test rows by X_49 with the same cut points used for the training data.
x49_test = test_df['X_49']
test_df1 = test_df[x49_test < 10000]                           # X_49 < 10000
test_df2 = test_df[(x49_test >= 10000) & (x49_test < 50000)]   # 10000 <= X_49 < 50000
test_df3 = test_df[x49_test >= 50000]                          # X_49 >= 50000

In [164]:
# Keep each band's original row labels so the per-band predictions can be put
# back into the original test order later.
df1_index, df2_index, df3_index = (
    band.index for band in (test_df1, test_df2, test_df3)
)

In [175]:
# Predict every X_49 band with its own model, labelling each prediction frame
# with the original test row labels of that band.
pred1 = pd.DataFrame(XGB_1.predict(test_df1), index=df1_index)
pred2 = pd.DataFrame(XGB_2.predict(test_df2), index=df2_index)
pred3 = pd.DataFrame(XGB_3.predict(test_df3), index=df3_index)

# Stack the three bands, sort by index (ascending) to restore the original row
# order, and convert the result to a plain NumPy array.
pred = np.array(
    pd.concat([pred1, pred2, pred3], axis=0).sort_index(ascending=True)
)

In [176]:
# Display the assembled prediction array (NumPy abbreviates the middle rows and
# columns of a large array with "..." in its repr).
pred

array([[  1.4393586,   1.3504254,   1.1867129, ..., -25.9473   ,
        -25.804668 , -25.94142  ],
       [  1.5250287,   1.3046801,   1.2087998, ..., -26.164412 ,
        -26.045593 , -25.87236  ],
       [  1.2539355,   1.0829971,   1.1809623, ..., -25.925034 ,
        -25.815016 , -25.919876 ],
       ...,
       [  1.1715627,   0.644978 ,   0.8078467, ..., -26.110992 ,
        -26.124924 , -26.09455  ],
       [  1.1779459,   0.6571133,   1.073825 , ..., -26.217358 ,
        -26.391727 , -26.443634 ],
       [  1.4773206,   0.8834003,   0.9483063, ..., -26.153316 ,
        -26.047998 , -26.141565 ]], dtype=float32)

In [177]:
# Fill the sample-submission template with the predictions and write it out.
submit = pd.read_csv('./sample_submission.csv')

# Match prediction columns to the non-ID template columns by their order among
# the targets rather than by absolute position, so the mapping does not rely on
# 'ID' being the very first column of the template.
target_cols = [col for col in submit.columns if col != 'ID']

# Fail loudly on a row/column mismatch instead of writing a misaligned file.
if pred.shape != (len(submit), len(target_cols)):
    raise ValueError(
        "prediction shape {} does not match submission template ({} rows, {} target columns)".format(
            pred.shape, len(submit), len(target_cols)
        )
    )

for col_pos, col in enumerate(target_cols):
    submit[col] = pred[:, col_pos]
submit.to_csv('./xgb_1.csv', index=False)

In [142]:
from sklearn.model_selection import GridSearchCV

# Hyper-parameter grid for a single XGBRegressor, searched on the band 2 data.
param_grid = [
    dict(
        max_depth=[3, 4, 5, 6],
        n_estimators=[12, 24, 32],
        learning_rate=[0.01, 0.1],
        gamma=[0.5, 1, 2],
        random_state=[42],
    ),
]
# The variable name is historical: the estimator is an XGBoost regressor.
forest_reg = xgb.XGBRegressor()

# 5-fold cross-validated search scored by negative MSE (higher is better).
grid_search = GridSearchCV(
    estimator=forest_reg,
    param_grid=param_grid,
    cv=5,
    scoring='neg_mean_squared_error',
    return_train_score=True,
)

grid_search.fit(train_x2, train_y2)

In [143]:
# Show the best hyper-parameter combination found by the search.
# NOTE(review): the saved output of this cell reports random_state=99, while the
# grid defined in the search cell only lists 42 - the stored output looks stale
# (produced by an earlier version of the grid); re-run to refresh it.
print(grid_search.best_params_)

{'gamma': 0.5, 'learning_rate': 0.1, 'max_depth': 6, 'n_estimators': 32, 'random_state': 99}
