In [4]:
import pandas as pd
import numpy as np
import tensorflow as tf
import tensorflow.keras as keras
from keras.models import Sequential
from keras.callbacks import EarlyStopping, ModelCheckpoint
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error

import warnings
warnings.filterwarnings('ignore')
import import_ipynb
import Preprocessing
from Preprocessing import pre_buy_num, pre_buy_amount_round, data_split

importing Jupyter notebook from Preprocessing.ipynb


In [5]:
def RF_gridSearch(x_train, x_test, y_train, y_test):
    algorithm = RandomForestRegressor(random_state=2022)
    algorithm = algorithm.fit(x_train, y_train)

    depth_lst = list()
    for es in algorithm.estimators_:
        depth_lst.append(es.get_depth())
    depth = np.median(depth_lst)
    
    algorithm = RandomForestRegressor(random_state=2022)
    params = {'max_depth' :[depth-1, depth, depth+1], 'n_estimators': [90,100,110], 'min_samples_split' :[2,3,4]}
    score = 'neg_mean_squared_error'

    df_grid = GridSearchCV(algorithm, param_grid = params, cv=5, scoring= score, n_jobs=-1)
    df_grid.fit(x_train, y_train)

    best_mse = (-1) * df_grid.best_score_
    best_rmse = np.sqrt(best_mse)
    print('Best score: {}, Best params: {}'.format(round(best_rmse,4), df_grid.best_params_))
    
    estimator = df_grid.best_estimator_
    pred = estimator.predict(x_test)
    print("학습 데이터셋 정확도: ", estimator.score(x_train, y_train))
    print("테스트 데이터셋 정확도: ", estimator.score(x_test, y_test))
    print("MSE: ", mean_squared_error(y_test, pred))
    print("RMSE: ", np.sqrt(mean_squared_error(y_test, pred)))
    print("R2: ", r2_score(y_test, pred))
    print("MAE: ", mean_absolute_error(y_test, pred))

In [6]:
def Boost_GridSearch(x_train, x_test, y_train, y_test, Boost):
    if Boost == 'XGB':
        algorithm = XGBRegressor(random_state=2022)
    elif Boost == 'LGB':
        algorithm = LGBMRegressor(random_state=2022)
    else:
        algorithm = CatBoostRegressor(random_state=2022, silent=True)
        
    params = {'max_depth' :[3,5,7,11], 'learning_rate' :[0.01,0.05,0.1,0.5]}
    score = 'neg_mean_squared_error'

    df_grid= GridSearchCV(algorithm, param_grid = params, cv=5, scoring=score, n_jobs=-1)
    df_grid.fit(x_train, y_train)

    best_mse = (-1) * df_grid.best_score_
    best_rmse = np.sqrt(best_mse)
    print('Best score: {}, Best params: {}'.format(round(best_rmse,4), df_grid.best_params_))
    
    estimator = df_grid.best_estimator_
    pred = estimator.predict(x_test)
    print("학습 데이터셋 정확도: ", estimator.score(x_train, y_train))
    print("테스트 데이터셋 정확도: ", estimator.score(x_test, y_test))
    print("MSE: ", mean_squared_error(y_test, pred))
    print("RMSE: ", np.sqrt(mean_squared_error(y_test, pred)))
    print("R2: ", r2_score(y_test, pred))
    print("MAE: ", mean_absolute_error(y_test, pred))

In [7]:
def DNN_model(x_train, y_train, hidden, units):
    np.random.seed(2022)
    tf.random.set_seed(2022)
    initializer = tf.keras.initializers.GlorotUniform(seed=2022)
    
    early_stopping = EarlyStopping(monitor='val_loss', mode='min', verbose=0, patience=5)
    check_point = ModelCheckpoint('temp/DNN_temp.h5',monitor='val_loss',mode='min',save_best_only=True)
    
    model = Sequential()
    model.add(keras.layers.Dense(units=units, activation='relu', input_shape=(x_train.shape[1],), kernel_initializer=initializer))
    for i in range(hidden):
        model.add(keras.layers.Dense(units=units, activation='relu', kernel_initializer=initializer))
    model.add(keras.layers.Dense(units=1, activation='linear', kernel_initializer=initializer))
    model.compile(optimizer='adam', loss='mse', metrics='mae')
    model.fit(x_train, y_train, epochs=50, batch_size=128, validation_split=0.2, callbacks=[early_stopping,check_point], verbose=0)

    return model

def DNN_param(x_train, x_test, y_train, y_test):
    count = 0
    hyperparam = pd.DataFrame(index=range(0), columns = ['hiddenlayer_num', 'units_num', 'r2_score', 'MSE', 'MAE'])
    for hidden in range(1,5):
        for units in [200, 300, 400, 500]:
            model = DNN_model(x_train, y_train, hidden, units)
            pred = model.predict(x_test)
            r2 = r2_score(y_test, pred)
            score = model.evaluate(x_test, y_test)
            hyperparam.loc[count] = [hidden, units, r2, score[0], score[1]]
            count += 1
    
    best_hyperparam = hyperparam[hyperparam['r2_score'] == max(hyperparam['r2_score'])]
    
    print('Best params: hiddenlayer_num = {}, units_num = {}'.format(best_hyperparam.iloc[0,0], best_hyperparam.iloc[0,1]))
    print("MSE: ", best_hyperparam.iloc[0,3])
    print("RMSE: ", np.sqrt(best_hyperparam.iloc[0,3]))
    print("R2: ", best_hyperparam.iloc[0,2])
    print("MAE: ", best_hyperparam.iloc[0,4])

# 1번 주제: 총 구매횟수

In [None]:
data02_offline, data02_online, data03_offline, data03_online = pre_buy_num()

data02_offline

In [None]:
x_train, x_test, y_train, y_test = data_split(data02_offline)

In [None]:
RF_gridSearch(x_train, x_test, y_train, y_test)

In [None]:
Boost_GridSearch(x_train, x_test, y_train, y_test, 'XGB')

Best score: 5.4663, Best params: {'learning_rate': 0.5, 'max_depth': 11}
학습 데이터셋 정확도:  0.9115277006811993
테스트 데이터셋 정확도:  0.8834281058882767
MSE:  28.742463273083946
RMSE:  5.361199797907549
R2:  0.8834281058882767
MAE:  3.345179077300532


In [None]:
Boost_GridSearch(x_train, x_test, y_train, y_test, 'LGB')

Best score: 7.9502, Best params: {'learning_rate': 0.5, 'max_depth': 11}
학습 데이터셋 정확도:  0.750959622042766
테스트 데이터셋 정확도:  0.744076175749653
MSE:  63.10166936270983
RMSE:  7.9436559192043195
R2:  0.744076175749653
MAE:  4.7570279034979555


In [None]:
Boost_GridSearch(x_train, x_test, y_train, y_test, 'CAT')

Best score: 5.3047, Best params: {'learning_rate': 0.5, 'max_depth': 11}
학습 데이터셋 정확도:  0.9119866031226268
테스트 데이터셋 정확도:  0.8883046180735571
MSE:  27.540089635302632
RMSE:  5.24786524553581
R2:  0.8883046180735571
MAE:  3.3641174677513015


In [None]:
DNN_param(x_train, x_test, y_train, y_test)

data02_online

In [None]:
x_train, x_test, y_train, y_test = data_split(data02_online)

In [None]:
RF_gridSearch(x_train, x_test, y_train, y_test)

Best score: 3.0531, Best params: {'max_depth': 45.0, 'min_samples_split': 2, 'n_estimators': 110}
학습 데이터셋 정확도:  0.9831909748721949
테스트 데이터셋 정확도:  0.8900722076220451
MSE:  7.6723592022630225
RMSE:  2.7699023813598598
R2:  0.8900722076220451
MAE:  1.366625534426022


In [None]:
Boost_GridSearch(x_train, x_test, y_train, y_test, 'XGB')

Best score: 3.3278, Best params: {'learning_rate': 0.5, 'max_depth': 11}
학습 데이터셋 정확도:  0.9670823969387203
테스트 데이터셋 정확도:  0.8618586643113534
MSE:  9.641510351082438
RMSE:  3.105078155390366
R2:  0.8618586643113534
MAE:  1.7899007289270314


In [None]:
Boost_GridSearch(x_train, x_test, y_train, y_test, 'LGB')

Best score: 4.4399, Best params: {'learning_rate': 0.5, 'max_depth': 11}
학습 데이터셋 정확도:  0.7478108597662267
테스트 데이터셋 정확도:  0.7175956737423911
MSE:  19.710278760732855
RMSE:  4.439625970814755
R2:  0.7175956737423911
MAE:  2.741841197125275


In [None]:
Boost_GridSearch(x_train, x_test, y_train, y_test, 'CAT')

Best score: 3.3019, Best params: {'learning_rate': 0.5, 'max_depth': 11}
학습 데이터셋 정확도:  0.9620957446577683
테스트 데이터셋 정확도:  0.8574512432196884
MSE:  9.949124258715731
RMSE:  3.154223241737295
R2:  0.8574512432196884
MAE:  1.865262517478538


In [None]:
DNN_param(x_train, x_test, y_train, y_test)

Best params: hiddenlayer_num = 4.0, units_num = 300.0
MSE:  24.150588989257812
RMSE:  4.9143248762426985
R2:  0.6539760864188346
MAE:  2.84924054145813


data03_offline

In [None]:
x_train, x_test, y_train, y_test = data_split(data03_offline)

In [None]:
RF_gridSearch(x_train, x_test, y_train, y_test)

Best score: 4.7304, Best params: {'max_depth': 44.0, 'min_samples_split': 2, 'n_estimators': 110}
학습 데이터셋 정확도:  0.9872328932265991
테스트 데이터셋 정확도:  0.906086845701141
MSE:  20.317747135546394
RMSE:  4.507521174165063
R2:  0.906086845701141
MAE:  2.3440311014307085


In [None]:
Boost_GridSearch(x_train, x_test, y_train, y_test, 'XGB')

Best score: 5.5646, Best params: {'learning_rate': 0.5, 'max_depth': 11}
학습 데이터셋 정확도:  0.9744086440719864
테스트 데이터셋 정확도:  0.870838068619467
MSE:  27.943683511867142
RMSE:  5.286178535754079
R2:  0.870838068619467
MAE:  3.2558130704369175


In [None]:
Boost_GridSearch(x_train, x_test, y_train, y_test, 'LGB')

Best score: 6.9744, Best params: {'learning_rate': 0.5, 'max_depth': 11}
학습 데이터셋 정확도:  0.8106503900221929
테스트 데이터셋 정확도:  0.786868758634066
MSE:  46.1101184502623
RMSE:  6.790443170387504
R2:  0.786868758634066
MAE:  4.779878302225002


In [None]:
Boost_GridSearch(x_train, x_test, y_train, y_test, 'CAT')

Best score: 5.3427, Best params: {'learning_rate': 0.1, 'max_depth': 11}
학습 데이터셋 정확도:  0.9202689560183823
테스트 데이터셋 정확도:  0.8797587519661142
MSE:  26.01372822640551
RMSE:  5.100365499295664
R2:  0.8797587519661142
MAE:  3.389803629488527


In [None]:
DNN_param(x_train, x_test, y_train, y_test)

Best params: hiddenlayer_num = 3.0, units_num = 300.0
MSE:  49.74653625488281
RMSE:  7.053122447177762
R2:  0.7700604093841336
MAE:  4.79890775680542


data03_online

In [None]:
x_train, x_test, y_train, y_test = data_split(data03_online)

In [None]:
RF_gridSearch(x_train, x_test, y_train, y_test)

Best score: 3.645, Best params: {'max_depth': 35.0, 'min_samples_split': 2, 'n_estimators': 110}
학습 데이터셋 정확도:  0.9880346966084476
테스트 데이터셋 정확도:  0.9119370787000942
MSE:  12.28855545907168
RMSE:  3.50550359564381
R2:  0.9119370787000942
MAE:  2.015202506061392


In [None]:
Boost_GridSearch(x_train, x_test, y_train, y_test, 'XGB')

Best score: 3.7995, Best params: {'learning_rate': 0.1, 'max_depth': 11}
학습 데이터셋 정확도:  0.9807490065765517
테스트 데이터셋 정확도:  0.9109091426465747
MSE:  12.431996637442706
RMSE:  3.525903662530034
R2:  0.9109091426465747
MAE:  2.1570471556630877


In [None]:
Boost_GridSearch(x_train, x_test, y_train, y_test, 'LGB')

Best score: 4.4253, Best params: {'learning_rate': 0.5, 'max_depth': 11}
학습 데이터셋 정확도:  0.9242694656137396
테스트 데이터셋 정확도:  0.8645867244699867
MSE:  18.895961225021296
RMSE:  4.34694849578659
R2:  0.8645867244699867
MAE:  2.97938748583994


In [None]:
Boost_GridSearch(x_train, x_test, y_train, y_test, 'CAT')

Best score: 3.5719, Best params: {'learning_rate': 0.1, 'max_depth': 11}
학습 데이터셋 정확도:  0.9860148480863536
테스트 데이터셋 정확도:  0.9146379614078266
MSE:  11.911666452296796
RMSE:  3.4513282156724525
R2:  0.9146379614078266
MAE:  2.23701604440326


In [None]:
DNN_param(x_train, x_test, y_train, y_test)

Best params: hiddenlayer_num = 4.0, units_num = 200.0
MSE:  51.19654846191406
RMSE:  7.155176340378626
R2:  0.6331124285135352
MAE:  5.0160722732543945


# 2번 주제: 총 구매금액

In [8]:
data02_offline, data02_online, data03_offline, data03_online = pre_buy_amount_round()

data02_offline

In [9]:
x_train, x_test, y_train, y_test = data_split(data02_offline)

In [None]:
RF_gridSearch(x_train, x_test, y_train, y_test)

In [11]:
Boost_GridSearch(x_train, x_test, y_train, y_test, 'XGB')

Best score: 0.2276, Best params: {'learning_rate': 0.5, 'max_depth': 11}
학습 데이터셋 정확도:  0.9719614598954888
테스트 데이터셋 정확도:  0.9629054480973273
MSE:  0.05065156790369528
RMSE:  0.22505903204202954
R2:  0.9629054480973273
MAE:  0.14691330544262815


In [12]:
Boost_GridSearch(x_train, x_test, y_train, y_test, 'LGB')

Best score: 0.2777, Best params: {'learning_rate': 0.5, 'max_depth': 11}
학습 데이터셋 정확도:  0.9448247900452864
테스트 데이터셋 정확도:  0.9437297019938173
MSE:  0.07683551018218307
RMSE:  0.2771921899732802
R2:  0.9437297019938173
MAE:  0.18499609896692054


In [10]:
Boost_GridSearch(x_train, x_test, y_train, y_test, 'CAT')

Best score: 0.22, Best params: {'learning_rate': 0.5, 'max_depth': 11}
학습 데이터셋 정확도:  0.9736622022119307
테스트 데이터셋 정확도:  0.9655197868887309
MSE:  0.04708176177250162
RMSE:  0.2169833214154987
R2:  0.9655197868887309
MAE:  0.14395205806129727


In [None]:
DNN_param(x_train, x_test, y_train, y_test)

data02_online

In [None]:
x_train, x_test, y_train, y_test = data_split(data02_online)

In [None]:
RF_gridSearch(x_train, x_test, y_train, y_test)

Best score: 0.182, Best params: {'max_depth': 44.0, 'min_samples_split': 2, 'n_estimators': 110}
학습 데이터셋 정확도:  0.996770150791519
테스트 데이터셋 정확도:  0.9772672332867431
MSE:  0.029055899396589722
RMSE:  0.1704579109240452
R2:  0.9772672332867431
MAE:  0.09768761249372114


In [None]:
Boost_GridSearch(x_train, x_test, y_train, y_test, 'XGB')

Best score: 0.1988, Best params: {'learning_rate': 0.5, 'max_depth': 11}
학습 데이터셋 정확도:  0.9918790602413553
테스트 데이터셋 정확도:  0.9719551844061946
MSE:  0.03584549785637506
RMSE:  0.1893290729295822
R2:  0.9719551844061946
MAE:  0.12439445908391916


In [None]:
Boost_GridSearch(x_train, x_test, y_train, y_test, 'LGB')

Best score: 0.264, Best params: {'learning_rate': 0.5, 'max_depth': 11}
학습 데이터셋 정확도:  0.9499271601646609
테스트 데이터셋 정확도:  0.9460068172893995
MSE:  0.06901141883561501
RMSE:  0.262700245214227
R2:  0.9460068172893995
MAE:  0.18356988091689577


In [None]:
Boost_GridSearch(x_train, x_test, y_train, y_test, 'CAT')

Best score: 0.1963, Best params: {'learning_rate': 0.5, 'max_depth': 11}
학습 데이터셋 정확도:  0.9934483517426782
테스트 데이터셋 정확도:  0.9725828782275513
MSE:  0.03504321062247813
RMSE:  0.18719831896274639
R2:  0.9725828782275513
MAE:  0.1263140168193499


In [None]:
DNN_param(x_train, x_test, y_train, y_test)

Best params: hiddenlayer_num = 4.0, units_num = 400.0
MSE:  0.08890886604785919
RMSE:  0.2981758978318992
R2:  0.9304395233923007
MAE:  0.21759279072284698


data03_offline

In [None]:
x_train, x_test, y_train, y_test = data_split(data03_offline)

In [None]:
RF_gridSearch(x_train, x_test, y_train, y_test)

Best score: 0.2167, Best params: {'max_depth': 41.0, 'min_samples_split': 2, 'n_estimators': 110}
학습 데이터셋 정확도:  0.9958018562813906
테스트 데이터셋 정확도:  0.9705357430420247
MSE:  0.041353814686917394
RMSE:  0.20335637360780556
R2:  0.9705357430420247
MAE:  0.11581736716833854


In [None]:
Boost_GridSearch(x_train, x_test, y_train, y_test, 'XGB')

Best score: 0.2267, Best params: {'learning_rate': 0.1, 'max_depth': 11}
학습 데이터셋 정확도:  0.9816114247684077
테스트 데이터셋 정확도:  0.9659059979321144
MSE:  0.04785177666152163
RMSE:  0.21875048951150172
R2:  0.9659059979321144
MAE:  0.1406740984681834


In [None]:
Boost_GridSearch(x_train, x_test, y_train, y_test, 'LGB')

Best score: 0.2749, Best params: {'learning_rate': 0.5, 'max_depth': 11}
학습 데이터셋 정확도:  0.954790833879181
테스트 데이터셋 정확도:  0.9483527861730556
MSE:  0.07248814428754634
RMSE:  0.2692362239512847
R2:  0.9483527861730556
MAE:  0.18731542415857141


In [None]:
Boost_GridSearch(x_train, x_test, y_train, y_test, 'CAT')

Best score: 0.2266, Best params: {'learning_rate': 0.5, 'max_depth': 11}
학습 데이터셋 정확도:  0.9964595535416878
테스트 데이터셋 정확도:  0.9678750837839101
MSE:  0.04508811587978421
RMSE:  0.21233962390421673
R2:  0.9678750837839101
MAE:  0.1414326029499509


In [None]:
DNN_param(x_train, x_test, y_train, y_test)

Best params: hiddenlayer_num = 4.0, units_num = 300.0
MSE:  0.09453966468572617
RMSE:  0.3074730308266502
R2:  0.9326412573088358
MAE:  0.2176033854484558


data03_online

In [None]:
x_train, x_test, y_train, y_test = data_split(data03_online)

In [None]:
RF_gridSearch(x_train, x_test, y_train, y_test)

Best score: 0.2201, Best params: {'max_depth': 31.0, 'min_samples_split': 2, 'n_estimators': 110}
학습 데이터셋 정확도:  0.9928448478568042
테스트 데이터셋 정확도:  0.949819935593933
MSE:  0.043473145387550184
RMSE:  0.20850214720129426
R2:  0.949819935593933
MAE:  0.12738636315082927


In [None]:
Boost_GridSearch(x_train, x_test, y_train, y_test, 'XGB')

Best score: 0.2091, Best params: {'learning_rate': 0.1, 'max_depth': 11}
학습 데이터셋 정확도:  0.9915067306876965
테스트 데이터셋 정확도:  0.9542644464189564
MSE:  0.03962267473631243
RMSE:  0.19905445168674935
R2:  0.9542644464189564
MAE:  0.12653519876657424


In [None]:
Boost_GridSearch(x_train, x_test, y_train, y_test, 'LGB')

Best score: 0.2388, Best params: {'learning_rate': 0.5, 'max_depth': 11}
학습 데이터셋 정확도:  0.9648054997513242
테스트 데이터셋 정확도:  0.9387600323637674
MSE:  0.0530548146577692
RMSE:  0.2303363077280028
R2:  0.9387600323637674
MAE:  0.16611832333383686


In [None]:
Boost_GridSearch(x_train, x_test, y_train, y_test, 'CAT')

Best score: 0.2099, Best params: {'learning_rate': 0.1, 'max_depth': 11}
학습 데이터셋 정확도:  0.9940475580379253
테스트 데이터셋 정확도:  0.9548544743292341
MSE:  0.039111508211721074
RMSE:  0.19776629695608167
R2:  0.9548544743292341
MAE:  0.13529552347000315


In [None]:
DNN_param(x_train, x_test, y_train, y_test)

Best params: hiddenlayer_num = 4.0, units_num = 200.0
MSE:  0.09639185667037964
RMSE:  0.3104703796989008
R2:  0.888737026844638
MAE:  0.23775334656238556
