In [24]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.ensemble import AdaBoostRegressor
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error, accuracy_score
import time
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.preprocessing import StandardScaler
import xgboost as xgb
from xgboost import XGBClassifier
from sklearn.model_selection import KFold, GridSearchCV
from bayes_opt import BayesianOptimization
%matplotlib inline

In [25]:
# Notebook display limits: render at most 100 columns and 20 rows per frame.
pd.set_option('display.max_rows', 20)
pd.set_option('display.max_columns', 100)

# Load the training CSV into the frame every later cell derives from.
df = pd.read_csv('/home/workspace/data/.train/.task147/train.csv')

# features 변환

skewness가 1보다 큰 x 컬럼은 분포가 치우쳐(skew) 있어서 log1p를 씌워줌

In [26]:
# The dum_* columns, listed once so the selection and the drop below cannot drift apart.
# NOTE(review): presumably dummy-encoded categoricals (per the `cate_` name) - confirm.
_dummy_cols = [
    'dum_1_1', 'dum_1_2', 'dum_1_3', 'dum_1_4', 'dum_1_5',
    'dum_2_1', 'dum_2_2', 'dum_2_3', 'dum_2_4', 'dum_2_5', 'dum_2_6',
    'dum_2_7', 'dum_2_8', 'dum_2_9', 'dum_2_10', 'dum_2_11',
    'dum_3_1', 'dum_3_2', 'dum_3_3',
    'dum_4_1', 'dum_4_2', 'dum_4_3', 'dum_4_4', 'dum_4_5', 'dum_4_6', 'dum_4_7',
    'dum_4_8', 'dum_4_9', 'dum_4_10', 'dum_4_11', 'dum_4_12', 'dum_4_13', 'dum_4_14',
]

# Target column.
y_df = df['ad_periods']
# dum_* columns are set aside so the skew transform only touches the remaining features.
cate_x_df = df[_dummy_cols]
# Remaining features: everything except the id, the target and the dum_* columns.
x_df = df.drop(columns=['id', 'ad_periods'] + _dummy_cols)

In [27]:
# Skewness of every non-dum_* feature, most skewed first.
skewness = x_df.apply(lambda col: col.skew()).sort_values(ascending=False)
# Columns with skewness above 1 are the ones that receive the log1p transform.
skew_feature = skewness[skewness > 1].index

# Work on a copy so x_df keeps the untransformed values.
log_x_df = x_df.copy()
log_x_df[skew_feature] = np.log1p(log_x_df[skew_feature])

# Final training matrix: transformed features followed by the dum_* columns.
new_x_df = pd.concat([log_x_df, cate_x_df], axis=1)

# Post-transform skewness check. Kept as the last expression so the notebook
# actually displays it (a bare expression in the middle of a cell is discarded).
log_x_df.apply(lambda col: col.skew()).sort_values(ascending=False)

In [28]:
# Hold out 1% of the rows for validation; the fixed seed keeps the split repeatable.
X_train, X_val, y_train, y_val = train_test_split(
    new_x_df,
    y_df,
    test_size=0.01,
    random_state=42,
)

# Wrap both splits for the native xgboost training API.
d_train = xgb.DMatrix(data=X_train, label=y_train)
d_val = xgb.DMatrix(data=X_val, label=y_val)

# Evaluation sets reported during training; per the training log, 'valid-rmse'
# is the metric used for early stopping.
watchlist = [(d_train, 'train'), (d_val, 'valid')]

# 최적의 parameters 찾기

grid search, optuna 등보다 베이지안 최적화가 가장 성능이 좋았던 것으로 판단

In [29]:
def xgb_evaluate(max_depth, eta, gamma, colsample_bytree, subsample, min_child_weight,lamb):
    """Objective for the Bayesian search: negated 5-fold CV RMSE of an XGBoost Poisson model.

    Cross-validation runs on ``new_x_df`` / ``y_df``, i.e. the same feature
    matrix the final model is trained on, so the tuned hyper-parameters are
    scored on the data representation they will actually be used with.

    Args:
        max_depth: Tree depth; rounded to the nearest integer before use.
        eta: Learning rate, passed through as ``eta``.
        gamma: Passed through as ``gamma``.
        colsample_bytree: Passed through as ``colsample_bytree``.
        subsample: Passed through as ``subsample``.
        min_child_weight: Passed through as ``min_child_weight``.
        lamb: Passed through as ``lambda`` (named ``lamb`` because ``lambda``
            is a Python keyword).

    Returns:
        ``-1.0`` times the ``test-rmse-mean`` value in the last row of the CV
        history. The sign is flipped because the optimiser maximises.
    """
    params = {'eval_metric': 'rmse',
              'objective': 'count:poisson',
              'max_depth': int(round(max_depth)),
              'subsample': subsample,
              'eta': eta,
              'gamma': gamma,
              'colsample_bytree': colsample_bytree,
              'min_child_weight' : min_child_weight,
              # Requests the GPU histogram tree method on device 0.
              'gpu_id' : 0,
              'tree_method' : 'gpu_hist',
              'lambda': lamb,
              'grow_policy': 'lossguide'}

    # Same features/target as the training cell (log1p-transformed columns + dum_* columns),
    # instead of the raw, untransformed `df`.
    d_full = xgb.DMatrix(new_x_df, label=y_df)
    cv_result = xgb.cv(
        params,
        d_full,
        num_boost_round=10000,
        nfold=5,
        early_stopping_rounds=100,
    )

    return -1.0 * cv_result['test-rmse-mean'].iloc[-1]

In [None]:
# Bayesian search over the XGBoost hyper-parameters scored by xgb_evaluate.
# Each entry is a (lower, upper) bound for the matching xgb_evaluate argument.
xgb_bo = BayesianOptimization(
    f=xgb_evaluate,
    pbounds={
        'max_depth': (3, 10),
        'eta': (0.02, 0.2),
        'gamma': (0, 1),
        'colsample_bytree': (0.5, 1),
        'subsample': (0.5, 1),
        'min_child_weight': (0, 5),
        'lamb': (0, 1),
    },
    # Fixed seed (same value as the train/validation split) so the sequence of
    # probed points is repeatable across runs.
    random_state=42,
)

# 10 random initial probes followed by 10 guided iterations.
xgb_bo.maximize(init_points=10, n_iter=10)

In [30]:
# Hyper-parameters for the final model, written as one literal.
# The GPU settings ('gpu_id', 'tree_method') and 'silent' are intentionally left out,
# matching the entries that were disabled here before.
params = {
    'eta': 0.05,
    'objective': 'count:poisson',
    'eval_metric': 'rmse',
    'booster': 'gbtree',
    'colsample_bytree': 0.7798,
    'subsample': 0.76,
    'max_depth': 6,
    'min_child_weight': 3.833,
    'gamma': 0.4928,
    'alpha': 0.1171,
    'lambda': 0.7742,
    'grow_policy': 'lossguide',
}

[4868]	train-rmse:19.51641	valid-rmse:30.88939 (0.01 test size)

# 모델 학습

In [31]:
# Train for up to 10000 rounds, logging every 10th round and stopping once the
# early-stopping metric has not improved for 100 rounds.
clf = xgb.train(
    params,
    d_train,
    num_boost_round=10000,
    evals=watchlist,
    early_stopping_rounds=100,
    verbose_eval=10,
)

[0]	train-rmse:63.22253	valid-rmse:60.60445
Multiple eval metrics have been passed: 'valid-rmse' will be used for early stopping.

Will train until valid-rmse hasn't improved in 100 rounds.
[10]	train-rmse:63.07414	valid-rmse:60.45978
[20]	train-rmse:62.86452	valid-rmse:60.25521
[30]	train-rmse:62.56820	valid-rmse:59.96645
[40]	train-rmse:62.15101	valid-rmse:59.55974
[50]	train-rmse:61.56197	valid-rmse:58.98895
[60]	train-rmse:60.74238	valid-rmse:58.19258
[70]	train-rmse:59.59946	valid-rmse:57.08936
[80]	train-rmse:58.03898	valid-rmse:55.58149
[90]	train-rmse:55.93890	valid-rmse:53.57225
[100]	train-rmse:53.22480	valid-rmse:51.02578
[110]	train-rmse:49.95577	valid-rmse:48.00435
[120]	train-rmse:46.47187	valid-rmse:44.92057
[130]	train-rmse:43.48129	valid-rmse:42.39636
[140]	train-rmse:41.21701	valid-rmse:40.56541
[150]	train-rmse:39.64874	valid-rmse:39.28156
[160]	train-rmse:38.42936	valid-rmse:38.39461
[170]	train-rmse:37.48954	valid-rmse:37.75269
[180]	train-rmse:36.87345	valid-rmse:

[1740]	train-rmse:26.83563	valid-rmse:32.32089
[1750]	train-rmse:26.80466	valid-rmse:32.30825
[1760]	train-rmse:26.77302	valid-rmse:32.29460
[1770]	train-rmse:26.74213	valid-rmse:32.28200
[1780]	train-rmse:26.70901	valid-rmse:32.27076
[1790]	train-rmse:26.67620	valid-rmse:32.26108
[1800]	train-rmse:26.64425	valid-rmse:32.25243
[1810]	train-rmse:26.61222	valid-rmse:32.25153
[1820]	train-rmse:26.57903	valid-rmse:32.25398
[1830]	train-rmse:26.54377	valid-rmse:32.25503
[1840]	train-rmse:26.51959	valid-rmse:32.24288
[1850]	train-rmse:26.48542	valid-rmse:32.23479
[1860]	train-rmse:26.45150	valid-rmse:32.22402
[1870]	train-rmse:26.41357	valid-rmse:32.21915
[1880]	train-rmse:26.37877	valid-rmse:32.19879
[1890]	train-rmse:26.34438	valid-rmse:32.17828
[1900]	train-rmse:26.31507	valid-rmse:32.16949
[1910]	train-rmse:26.27994	valid-rmse:32.16626
[1920]	train-rmse:26.24721	valid-rmse:32.13634
[1930]	train-rmse:26.21505	valid-rmse:32.12419
[1940]	train-rmse:26.18521	valid-rmse:32.10482
[1950]	train-

[3490]	train-rmse:22.16330	valid-rmse:31.37063
[3500]	train-rmse:22.14426	valid-rmse:31.37512
[3510]	train-rmse:22.12154	valid-rmse:31.37348
[3520]	train-rmse:22.10042	valid-rmse:31.36960
[3530]	train-rmse:22.07613	valid-rmse:31.34200
[3540]	train-rmse:22.05203	valid-rmse:31.34514
[3550]	train-rmse:22.02696	valid-rmse:31.35136
[3560]	train-rmse:22.00250	valid-rmse:31.34874
[3570]	train-rmse:21.98412	valid-rmse:31.35101
[3580]	train-rmse:21.95776	valid-rmse:31.34283
[3590]	train-rmse:21.93246	valid-rmse:31.33617
[3600]	train-rmse:21.91328	valid-rmse:31.34214
[3610]	train-rmse:21.89170	valid-rmse:31.34387
[3620]	train-rmse:21.86916	valid-rmse:31.32010
[3630]	train-rmse:21.85136	valid-rmse:31.30925
[3640]	train-rmse:21.83081	valid-rmse:31.29985
[3650]	train-rmse:21.81279	valid-rmse:31.30409
[3660]	train-rmse:21.79189	valid-rmse:31.30301
[3670]	train-rmse:21.76964	valid-rmse:31.30502
[3680]	train-rmse:21.74868	valid-rmse:31.29770
[3690]	train-rmse:21.72750	valid-rmse:31.29342
[3700]	train-

In [32]:
# Write the trained booster to disk, then read that file back into the same object.
clf.save_model(fname='model_last.json')
clf.load_model(fname='model_last.json')

# 모델 예측

In [33]:
# Load the test CSV (no target column is dropped below, only the id and dum_* columns).
test_df = pd.read_csv('/home/workspace/data/.train/.task147/test.csv')

# The dum_* columns, listed once and reused for both the selection and the drop.
_test_dummy_cols = [
    'dum_1_1', 'dum_1_2', 'dum_1_3', 'dum_1_4', 'dum_1_5',
    'dum_2_1', 'dum_2_2', 'dum_2_3', 'dum_2_4', 'dum_2_5', 'dum_2_6',
    'dum_2_7', 'dum_2_8', 'dum_2_9', 'dum_2_10', 'dum_2_11',
    'dum_3_1', 'dum_3_2', 'dum_3_3',
    'dum_4_1', 'dum_4_2', 'dum_4_3', 'dum_4_4', 'dum_4_5', 'dum_4_6', 'dum_4_7',
    'dum_4_8', 'dum_4_9', 'dum_4_10', 'dum_4_11', 'dum_4_12', 'dum_4_13', 'dum_4_14',
]

# dum_* columns kept aside, untouched by the log1p step.
test_cate_x_df = test_df[_test_dummy_cols]
# Remaining features: everything except the id and the dum_* columns.
test_x_df = test_df.drop(columns=['id'] + _test_dummy_cols)

#### x만 train과 같이 log1p 씌워줌

In [34]:
# Skewness of the test features, kept only as a diagnostic. It must NOT decide
# which columns are transformed: the model was fitted on data where exactly the
# columns in `skew_feature` (derived from the training set) were log1p-transformed.
test_skewness = test_x_df.apply(lambda col: col.skew()).sort_values(ascending=False)

# Reuse the training-derived column list so train and test get the identical transform.
# Selecting by test skewness could pick a different set of columns and feed the
# model features on a different scale than it was trained on.
test_skew_feature = skew_feature

# Work on a copy so test_x_df keeps the untransformed values.
test_log_x_df = test_x_df.copy()
test_log_x_df[test_skew_feature] = np.log1p(test_log_x_df[test_skew_feature])

# Same layout as the training matrix: transformed features followed by the dum_* columns.
test_new_x_df = pd.concat([test_log_x_df, test_cate_x_df], axis=1)

In [35]:
# Ids of the test rows, carried into the submission unchanged.
id_list = test_df['id']

# Score the prepared test matrix with the trained booster.
d_test = xgb.DMatrix(data=test_new_x_df)
pred = clf.predict(d_test)

# Two-column submission frame: row id and predicted ad_periods.
submit_df = pd.DataFrame(data={'id': id_list, 'ad_periods': pred})

In [36]:
# to_csv does not create missing folders, so make sure the output directory exists first.
# NOTE(review): the path is relative to the notebook's working directory; the upload
# cell points at /home/workspace/user-workspace/prediction/ - confirm they are the same place.
os.makedirs('prediction', exist_ok=True)
submit_df.to_csv('prediction/차차차_제출_last.csv', index=False)

In [22]:
# Values handed to nipa_submit: team id, task number and the prediction file to upload.
team_id, task_no = "1100", "147"
prediction_path = '/home/workspace/user-workspace/prediction/차차차_제출_last.csv'

In [23]:
from nipa.taskSubmit import nipa_submit

# Upload the prediction file for this team and task.
nipa_submit(
    team_id=team_id,
    task_no=task_no,
    result=prediction_path,
)

20201115125400405941_5Kxy.csv: 200
