In [None]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings(action='ignore')
from sklearn.model_selection import train_test_split, cross_validate
from sklearn import preprocessing
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm.notebook import tqdm
from lightgbm import LGBMRegressor

In [None]:
# Load the full data table; the path is relative to the notebook's working directory.
df = pd.read_csv("../data/17051_total_data.csv")

# Keep the G_VRH column as the regression target before it is removed from the frame.
G = df['G_VRH']

# Remove both label columns so that `df` holds only the remaining (feature) columns.
for column in ('G_VRH', 'K_VRH'):
    del df[column]

df

In [None]:
def prepro(df, target, scaling=True, test=0.2, seed=2021):
    """Split features/target into train and test arrays, optionally standardised.

    Parameters
    ----------
    df : pandas.DataFrame
        Feature table; converted with ``to_numpy()``.
    target : pandas.Series
        Target values aligned row-by-row with ``df``; converted with ``to_numpy()``.
    scaling : bool, default True
        If True, standardise the features with ``StandardScaler``. The scaler
        is fitted on the training rows only and then applied to the test rows.
    test : float, default 0.2
        Passed to ``train_test_split`` as ``test_size``.
    seed : int, default 2021
        Passed to ``train_test_split`` as ``random_state``.

    Returns
    -------
    tuple
        ``(x_train, x_test, y_train, y_test)`` as NumPy arrays.
    """
    x = df.to_numpy()
    y = target.to_numpy()

    # Split BEFORE scaling. Fitting the scaler on the whole matrix would let the
    # test rows influence the mean/variance used to transform the training rows
    # (data leakage). Which rows land in each split is unaffected by this order.
    x_train, x_test, y_train, y_test = train_test_split(
        x, y, test_size=test, shuffle=True, random_state=seed
    )

    if scaling:
        scaler = preprocessing.StandardScaler()
        x_train = scaler.fit_transform(x_train)
        # Reuse the training statistics for the held-out rows.
        x_test = scaler.transform(x_test)

    return x_train, x_test, y_train, y_test

In [None]:
def predict(model, x_train, x_test, y_train, y_test):
    """Fit the requested regressor and evaluate it on the test split.

    Parameters
    ----------
    model : str
        Name of the regressor to use. Only ``'LGBM'`` is supported.
    x_train, y_train
        Training features and targets passed to ``reg.fit``.
    x_test, y_test
        Test features passed to ``reg.predict`` and the matching true targets
        used for scoring.

    Returns
    -------
    tuple
        ``(r2, mse, mae, y_test, y_pred, feat)`` where ``r2``, ``mse`` and
        ``mae`` come from ``r2_score``, ``mean_squared_error`` and
        ``mean_absolute_error``, ``y_pred`` are the test predictions and
        ``feat`` is the fitted model's ``feature_importances_``.

    Raises
    ------
    ValueError
        If ``model`` is not a supported model name.
    """
    if model == 'LGBM':
        # NOTE(review): feature_fraction / bagging_fraction / bagging_freq are
        # LightGBM aliases of colsample_bytree / subsample / subsample_freq,
        # and the two spellings are passed with different values here --
        # confirm which values are intended.
        # NOTE(review): `silent` appears to have been removed from the
        # scikit-learn wrapper in newer LightGBM releases -- confirm against
        # the installed version.
        reg = LGBMRegressor(boosting_type='gbdt',
                            class_weight=None,
                            colsample_bytree=1.0,
                            importance_type='split',
                            learning_rate=0.1,
                            max_depth=-1,
                            min_child_samples=21,
                            min_child_weight=0.001,
                            min_split_gain=0.9,
                            n_estimators=140,
                            n_jobs=-1,
                            num_leaves=70,
                            objective=None,
                            random_state=2175,
                            reg_alpha=1e-06,
                            reg_lambda=0.5,
                            silent='warn',
                            subsample=1.0,
                            subsample_for_bin=200000,
                            subsample_freq=0,
                            feature_fraction=0.5,
                            bagging_freq=0,
                            bagging_fraction=0.5)
    else:
        # Fail loudly instead of reaching `reg.fit` with `reg` undefined.
        raise ValueError(f"Unsupported model: {model!r}. Supported models: 'LGBM'.")

    reg.fit(x_train, y_train)
    y_pred = reg.predict(x_test)

    r2 = r2_score(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)

    feat = reg.feature_importances_

    return r2, mse, mae, y_test, y_pred, feat

In [None]:
def plot(y_test, y_pred):
    """Show a 4x4 inch scatter plot of predicted values against true values.

    Parameters
    ----------
    y_test
        True target values, drawn on the x axis.
    y_pred
        Predicted values, drawn on the y axis.
    """
    _, ax = plt.subplots(figsize=(4, 4))
    ax.scatter(y_test, y_pred, c='blue', alpha=0.4)
    # Reference diagonal from (0, 0) to (200, 200); points on it have pred == true.
    ax.plot([0, 200], [0, 200], c='black', alpha=0.5)
    ax.set_xlabel('True')
    ax.set_ylabel('Pred')
    plt.show()

In [None]:
# Names of the regressors to benchmark; each entry is handed to predict() as `model`.
model_list = ["LGBM"]

In [None]:
# Per-run bookkeeping for the G target: one entry per (model, seed) combination.
G_model_list, G_r2_list, G_mse_list, G_mae_list, G_rand_list = [], [], [], [], []

for model in tqdm(model_list):
    print(model)
    # Ten different split seeds: 0, 100, ..., 900.
    for seed in tqdm(range(0, 1000, 100)):
        print(seed)

        x_train, x_test, y_train, y_test = prepro(df, G, scaling=True, test=0.2, seed=seed)
        r2, mse, mae, y_test, y_pred, feat = predict(model, x_train, x_test, y_train, y_test)

        # Report the three metrics rounded to two decimals, then show the parity plot.
        for label, value in (('R2 :', r2), ('MSE :', mse), ('MAE :', mae)):
            print(label, round(value, 2))
        plot(y_test, y_pred)

        # Store the metrics with four decimals alongside the model name and seed.
        G_model_list.append(model)
        G_r2_list.append(round(r2, 4))
        G_mse_list.append(round(mse, 4))
        G_mae_list.append(round(mae, 4))
        G_rand_list.append(seed)

        # Save the feature importances of this run, most important first.
        feat_df = (
            pd.DataFrame({'Features': df.columns, 'Importances': feat})
            .sort_values(['Importances'], ascending=False)
            .reset_index(drop=True)
        )
        feat_df.to_csv(f'../data/G_feature_importance_by_{model}_{seed}.csv', index=False)