In [None]:
# Mount Google Drive at /content/drive so files stored there can be read by later cells.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Extract the competition archive from the mounted Drive into the working directory.
!unzip /content/drive/MyDrive/Project/Dacon/lgauto/open.zip
# Optional (disabled): build LightGBM from source with GPU support.
# !git clone --recursive https://github.com/Microsoft/LightGBM
# !cd LightGBM && rm -rf build && mkdir build && cd build && cmake -DUSE_GPU=1 ../../LightGBM && make -j4 && cd ../python-package && python3 setup.py install --precompile --gpu;
# Install extra packages.
# NOTE(review): versions are not pinned, and optuna is not imported in the cells visible here.
!pip install optuna
!pip install catboost
!pip install skranger
!pip install ngboost

Archive:  /content/drive/MyDrive/Project/Dacon/lgauto/open.zip
   creating: meta/
  inflating: meta/x_feature_info.csv  
  inflating: meta/y_feature_info.csv  
  inflating: meta/y_feature_spec_info.csv  
  inflating: sample_submission.csv   
  inflating: test.csv                
  inflating: train.csv               
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting optuna
  Downloading optuna-2.10.1-py3-none-any.whl (308 kB)
[K     |████████████████████████████████| 308 kB 5.1 MB/s 
Collecting cliff
  Downloading cliff-3.10.1-py3-none-any.whl (81 kB)
[K     |████████████████████████████████| 81 kB 10.4 MB/s 
Collecting colorlog
  Downloading colorlog-6.6.0-py2.py3-none-any.whl (11 kB)
Collecting alembic
  Downloading alembic-1.8.1-py3-none-any.whl (209 kB)
[K     |████████████████████████████████| 209 kB 72.7 MB/s 
[?25hCollecting cmaes>=0.8.2
  Downloading cmaes-0.8.2-py3-none-any.whl (15 kB)
Collecting Mako
  Downloading

In [None]:
import os
import random
import warnings
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd
import tqdm

from catboost import CatBoostRegressor, Pool
from hyperopt import fmin, hp, space_eval, tpe
from lightgbm import LGBMRegressor
from ngboost import NGBRegressor
from skranger.ensemble import RangerForestRegressor
from xgboost import XGBRegressor

from sklearn import metrics
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, ExtraTreesRegressor
from sklearn.inspection import permutation_importance
from sklearn.linear_model import ElasticNet, LinearRegression, Lasso, Ridge
from sklearn.metrics import make_scorer
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split, KFold
from sklearn.model_selection import KFold
from sklearn.multioutput import MultiOutputRegressor
from sklearn.neighbors import RadiusNeighborsRegressor
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor

In [None]:
class Config:
    """Global experiment settings read by the notebook cells.

    `cv` and `test_size` are declared here as well so that cells which read
    `Config.cv` work from this first definition and do not depend on the
    class being re-declared further down.
    """
    seed = 42        # seed handed to seed_everything
    epochs = 200     # not referenced by the cells visible here
    cv = 10          # number of cross-validation folds
    test_size = 0.2  # hold-out fraction; not referenced by the cells visible here

In [None]:
def seed_everything(seed):
    """
    @Description: seed Python's `random`, numpy and PYTHONHASHSEED with one value
    @Param: seed, integer seed
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    for seeder in (random.seed, np.random.seed):
        seeder(seed)

In [None]:
def dataset_split_X_y(df):
    """
    @Description: split data into features and labels
    @Param: df, pandas dataframe with columns starting with X for features and Y for labels
    @Return: features and labels in pandas dataframes
    """
    # The patterns are anchored with '^': an unanchored regex='X' would also
    # select any column that merely contains the letter (e.g. 'INDEX', 'MAX_Y').
    xs = df.filter(regex='^X') # Input : X Feature
    ys = df.filter(regex='^Y') # Output : Y Feature
    return xs, ys

In [None]:
def check_for_NAs(df, show=False):
    """
    @Description: locate missing values in the dataframe
    @Param1: df, pandas dataframe
    @Param2: show, when True return the rows that contain NaN instead of the column names
    @Return: list of column names with NaN, or the rows with NaN when show is True
    """
    if show:
        rows_with_nan = df.isna().any(axis=1)
        return df[rows_with_nan]
    cols_with_nan = df.isnull().any()
    return [name for name, has_nan in cols_with_nan.items() if has_nan]

In [None]:
def check_for_label_bound(df, labels, bound):
    """
    @Description: find the columns that have values outside their [min, max] bound
    @Param1: df, pandas dataframe
    @Param2: labels, list of column names
    @Param3: bound, list of [min, max] pairs, one per entry of labels
    @Return: names of the columns not within the bound
    """
    out_of_bound = []
    for position, name in enumerate(labels):
        limits = bound[position]
        within = df[name].between(limits[0], limits[1])
        if within.all():
            continue
        out_of_bound.append(name)
    if not out_of_bound:
        print('everything is within the bound')
    return out_of_bound

In [None]:
def zero_variance(df):
    """
    @Description: list the columns whose variance is exactly zero
    @Param1: df, pandas dataframe
    @Return: names of the columns with zero variance
    """
    return [name for name in df.columns if df[name].var() == 0]

In [None]:
def get_top_correlation(df, n=10):
    """
    @Description: rank column pairs by absolute correlation
    @Param1: df, pandas dataframe
    @Param2: n, number of pairs to return
    @Return: pandas series indexed by (column, column) holding the n largest absolute correlations
    """
    names = df.columns
    # Self-pairs plus one orientation of every pair, so each pair is reported once.
    redundant = {
        (names[row], names[col])
        for row in range(df.shape[1])
        for col in range(row + 1)
    }
    ranked = df.corr().abs().unstack().drop(labels=redundant)
    ranked = ranked.sort_values(ascending=False)
    return ranked[0:n]

In [None]:
def adjacent_histogram_boxplot(feature_var, figsize = (7, 5)):
    """
    @Description: plot a histogram and a boxplot of one feature in two rows sharing the x-axis
    @Param1: feature_var, pandas series 
    @Param2: figsize, size of the figure 
    @Return: None, the figure is drawn as a side effect
    """
    # NOTE(review): `plt` and `sns` are not imported in the cells visible here;
    # presumably matplotlib.pyplot and seaborn - confirm they are imported before calling.
    # Histogram takes 85% of the figure height, the boxplot the remaining 15%.
    fig, (hist_plot, box_plot) = plt.subplots(nrows=2, sharex=True, gridspec_kw={'height_ratios':(.85,.15)}, figsize=figsize)
    # NOTE(review): sns.distplot is deprecated in newer seaborn releases - check the installed version.
    sns.distplot(feature_var, kde=True, ax=hist_plot, kde_kws= {"linewidth":1.5}) 
    sns.boxplot(feature_var, ax=box_plot, linewidth = 1, width = 0.5)
    # Clear the axis labels; the series name is shown once as the figure title.
    hist_plot.set_ylabel('')    
    hist_plot.set_xlabel('')
    box_plot.set_xlabel('')
    hist_plot.tick_params(labelsize=8)
    box_plot.tick_params(labelsize=8)
    fig.suptitle(feature_var.name, fontsize = 10)
    # Mean as a solid red line, median as a dashed green line.
    hist_plot.axvline(np.mean(feature_var),color='red',linestyle='-',lw = 1.5)
    hist_plot.axvline(np.median(feature_var),color='green',linestyle='--',lw = 1.5)
    

In [None]:
def lg_nrmse(gt, preds):
    """
    @Description: Metric used in this project
    @Params1: gt, pandas dataframe (or 2-D array) whose first 14 columns are the true targets
    @Param2: preds, pandas dataframe (or 2-D array) of predictions with the same layout
    @Return: nrmse score
    @Raises: IndexError when fewer than 14 columns are given, ValueError when row counts differ
    """
    # Sum of the NRMSE of each Y feature.
    # Y_01 ~ Y_08 receive a 20% extra weight.
    n_targets, n_weighted = 14, 8

    # Convert to plain arrays so rows are compared by position (no index
    # alignment) and so that array inputs are accepted for gt as well.
    truth = pd.DataFrame(gt).to_numpy(dtype=float)
    guess = pd.DataFrame(preds).to_numpy(dtype=float)
    if truth.shape[1] < n_targets or guess.shape[1] < n_targets:
        raise IndexError('lg_nrmse expects at least {} target columns'.format(n_targets))
    if truth.shape[0] != guess.shape[0]:
        raise ValueError('gt and preds must have the same number of rows')
    truth, guess = truth[:, :n_targets], guess[:, :n_targets]

    # RMSE is computed with numpy: mean_squared_error(..., squared=False) is no
    # longer available in recent scikit-learn releases.
    rmse = np.sqrt(np.mean((truth - guess) ** 2, axis=0))
    all_nrmse = rmse / np.mean(np.abs(truth), axis=0)
    score = 1.2 * np.sum(all_nrmse[:n_weighted]) + 1.0 * np.sum(all_nrmse[n_weighted:])
    return score

In [None]:
def lg_individual_nrmse(gt, preds):
    """
    @Description: Metric used in this project (individual)
    @Params1: gt, true values of one target (series, list or array)
    @Param2: preds, predicted values, one per true value
    @Return: nrmse score
    @Raises: ValueError when the inputs are empty or their shapes do not match
    """
    # NRMSE of a single Y feature: RMSE divided by the mean absolute true value.
    truth = np.atleast_1d(np.asarray(gt, dtype=float))
    guess = np.atleast_1d(np.asarray(preds, dtype=float))
    if truth.size == 0 or guess.size == 0:
        raise ValueError('gt and preds must not be empty')
    # Bring both to (n_samples, n_outputs) so that a (n,) target and a (n, 1)
    # prediction line up instead of broadcasting to an (n, n) matrix.
    truth = truth.reshape(len(truth), -1)
    guess = guess.reshape(len(guess), -1)
    if truth.shape != guess.shape:
        raise ValueError('gt and preds must have the same shape')

    # RMSE is computed with numpy: mean_squared_error(..., squared=False) is no
    # longer available in recent scikit-learn releases.
    rmse = np.mean(np.sqrt(np.mean((truth - guess) ** 2, axis=0)))
    nrmse = rmse / np.mean(np.abs(truth))
    return nrmse

In [None]:
def find_outlier_zscore(data, threshold = 3):
    """
    @Description: locate outliers by z-score
    @Param1: data, 1-D sequence of numeric values
    @Param2: threshold, absolute z-score above which a value counts as an outlier
    @Return: numpy array with the positions of the outliers
    """
    center, spread = np.mean(data), np.std(data)
    z_scores = (np.asarray(data) - center) / spread
    return np.flatnonzero(np.abs(z_scores) > threshold)

In [None]:
# Names of the 14 target columns, Y_01 .. Y_14.
ys = ['Y_{:02d}'.format(number) for number in range(1, 15)]

# One [lower, upper] pair per target, in the same order as `ys`.
ys_bounds = [
    [0.2, 2],        # Y_01
    [0.2, 2.1],      # Y_02
    [0.2, 2.1],      # Y_03
    [7, 19],         # Y_04
    [22, 36.5],      # Y_05
    [-19.2, 19],     # Y_06
    [2.4, 4],        # Y_07
    [-29.2, -24],    # Y_08
    [-29.2, -24],    # Y_09
    [-30.6, -20],    # Y_10
    [19.6, 26.6],    # Y_11
    [-29.2, -24],    # Y_12
    [-29.2, -24],    # Y_13
    [-29.2, -24],    # Y_14
]

In [None]:
seed_everything(Config.seed)

# Read the raw files and separate the training features from the targets.
train_df = pd.read_csv('./train.csv')
test_x = pd.read_csv('./test.csv')
train_x, train_y = dataset_split_X_y(train_df)

# Features with zero variance (pass/fail flags) are dropped from both sets.
cols_with_zero_variance = zero_variance(train_x)
train_x = train_x.drop(columns=cols_with_zero_variance)
test_x = test_x.drop(columns=cols_with_zero_variance)

# X_10 and X_11 have many missing values (missing = 0, per the competition notice).
train_x = train_x.drop(columns=['X_10', 'X_11'])
test_x = test_x.drop(columns=['X_10', 'X_11'])

# The test file also carries an ID column that is not a feature.
test_x = test_x.drop(columns='ID')

In [None]:
class Config:
    # Re-declaration of the settings class; adds the cross-validation options.
    seed = 42
    epochs = 200
    # Number of folds and hold-out fraction.
    cv = 10
    test_size = 0.2

In [None]:
def lgbm_objective(params):
    """
    @Description: hyperopt objective for LGBMRegressor on target Y_01
    @Param: params, dict sampled from the hyperopt search space
    @Return: mean NRMSE over the cross-validation folds
    """
    int_keys = ('n_estimators', 'max_depth', 'num_leaves', 'min_child_samples')
    float_keys = ('colsample_bytree', 'subsample', 'min_split_gain',
                  'scale_pos_weight', 'reg_alpha', 'reg_lambda', 'learning_rate')
    # hyperopt samples every value as a float. Integer-valued settings are cast
    # and the others rounded to 5 decimals, but they stay numeric: formatting
    # them with '{:.5f}'.format(...) handed strings to the estimator.
    # NOTE(review): scale_pos_weight is documented by LightGBM for binary
    # classification - confirm it is wanted for a regressor.
    params = {
        **{key: int(params[key]) for key in int_keys},
        **{key: round(float(params[key]), 5) for key in float_keys},
    }

    model = LGBMRegressor(
        n_jobs = -1,
        random_state = 1,
        verbose = 100,
        **params
    )

    target = train_y['Y_01']
    losses = np.sqrt(-cross_val_score(model, train_x, target, cv=Config.cv, scoring='neg_mean_squared_error'))
    # Normalise each fold's RMSE by the mean absolute target value.
    losses = losses / np.mean(np.abs(target))
    return losses.mean()


def xgb_objective(params):
    """
    @Description: hyperopt objective for XGBRegressor on target Y_01
    @Param: params, dict sampled from the hyperopt search space
    @Return: mean NRMSE over the cross-validation folds
    """
    # Forward the sampled values to the model; they used to be replaced by an
    # empty dict, so every trial evaluated the same default model.
    # hyperopt samples floats, so integer-valued settings are cast.
    int_keys = ('n_estimators', 'max_depth', 'max_leaves', 'max_bin')
    params = {
        key: int(value) if key in int_keys else value
        for key, value in params.items()
    }

    model = XGBRegressor(
        n_jobs = -1,
        # XGBoost's logging switch is `verbosity` (0-3); `verbose` is not a model parameter.
        verbosity = 1,
        random_state = 1,
        **params
    )

    target = train_y['Y_01']
    losses = np.sqrt(-cross_val_score(model, train_x, target, cv=Config.cv, scoring='neg_mean_squared_error'))
    # Normalise each fold's RMSE by the mean absolute target value.
    losses = losses / np.mean(np.abs(target))
    return losses.mean()


def cat_objective(params):
    """
    @Description: hyperopt objective for CatBoostRegressor on target Y_04
    @Param: params, dict sampled from the hyperopt search space
    @Return: mean NRMSE over the cross-validation folds
    """
    ordered_keys = ('n_estimators', 'depth', 'learning_rate', 'l2_leaf_reg',
                    'max_bin', 'min_data_in_leaf', 'random_strength',
                    'fold_len_multiplier')
    int_keys = ('n_estimators', 'depth', 'max_bin', 'min_data_in_leaf')
    # Integer-valued settings are cast; the others are passed as sampled.
    model_params = {}
    for key in ordered_keys:
        sampled = params[key]
        model_params[key] = int(sampled) if key in int_keys else sampled

    model = CatBoostRegressor(logging_level='Silent', **model_params)

    target = train_y['Y_04']
    fold_mse = -cross_val_score(model, train_x, target, cv=Config.cv, scoring='neg_mean_squared_error')
    # Per-fold RMSE normalised by the mean absolute target value.
    fold_nrmse = np.sqrt(fold_mse) / np.mean(np.abs(target))
    return fold_nrmse.mean()

def random_objective(params):
    """
    @Description: hyperopt objective for RangerForestRegressor on target Y_01
    @Param: params, dict sampled from the hyperopt search space
    @Return: mean NRMSE over the cross-validation folds

    NOTE(review): a later cell defines `random_objective` again; once that cell
    has run, this definition is shadowed.
    """
    # Forward the sampled values to the model; they used to be replaced by an
    # empty dict. hyperopt samples floats, so integer-valued settings are cast.
    int_keys = ('n_estimators', 'mtry', 'min_node_size', 'max_depth')
    params = {
        key: int(value) if key in int_keys else value
        for key, value in params.items()
    }

    model = RangerForestRegressor(
        n_jobs = -1,
        # The comma after this argument was missing, which turned the call into
        # `verbose=100 ** params` and raised a TypeError.
        verbose = 100,
        **params
    )

    target = train_y['Y_01']
    losses = np.sqrt(-cross_val_score(model, train_x, target, cv=Config.cv, scoring='neg_mean_squared_error'))
    # Normalise each fold's RMSE by the mean absolute target value.
    losses = losses / np.mean(np.abs(target))
    return losses.mean()

def gradient_objective(params):
    """
    @Description: hyperopt objective for GradientBoostingRegressor on target Y_07
    @Param: params, dict sampled from the hyperopt search space
    @Return: mean NRMSE over the cross-validation folds (also printed with the parameters)
    """
    ordered_keys = ('n_estimators', 'max_depth', 'subsample', 'learning_rate',
                    'min_samples_split', 'min_samples_leaf',
                    'min_weight_fraction_leaf', 'min_impurity_decrease',
                    'max_features', 'alpha', 'max_leaf_nodes', 'ccp_alpha')
    int_keys = ('n_estimators', 'max_depth', 'min_samples_split',
                'min_samples_leaf', 'max_leaf_nodes')
    # Integer-valued settings are cast; the others are passed as sampled.
    params = {
        key: int(params[key]) if key in int_keys else params[key]
        for key in ordered_keys
    }

    model = GradientBoostingRegressor(random_state=1, **params)

    target = train_y['Y_07']
    fold_mse = -cross_val_score(model, train_x, target, cv=Config.cv, scoring='neg_mean_squared_error')
    # Per-fold RMSE normalised by the mean absolute target value.
    losses = np.sqrt(fold_mse) / np.mean(np.abs(target))
    mean_loss = losses.mean()
    print("NRMSE Loss {:.5f} params {}".format(mean_loss, params))

    return mean_loss

def extra_objective(params):
    """
    @Description: hyperopt objective for ExtraTreesRegressor on target Y_01
    @Param: params, dict sampled from the hyperopt search space
    @Return: mean NRMSE over the cross-validation folds
    """
    int_keys = ('n_estimators', 'max_depth', 'min_samples_split',
                'min_samples_leaf', 'max_leaf_nodes')
    ordered_keys = ('n_estimators', 'max_depth', 'min_samples_split',
                    'min_samples_leaf', 'min_weight_fraction_leaf',
                    'max_features', 'max_leaf_nodes', 'min_impurity_decrease',
                    'bootstrap', 'ccp_alpha')
    # Integer-valued settings are cast; the others are passed as sampled.
    model_params = {}
    for key in ordered_keys:
        sampled = params[key]
        model_params[key] = int(sampled) if key in int_keys else sampled

    model = ExtraTreesRegressor(n_jobs=-1, verbose=0, random_state=1, **model_params)

    target = train_y['Y_01']
    fold_mse = -cross_val_score(model, train_x, target, cv=Config.cv, scoring='neg_mean_squared_error')
    # Per-fold RMSE normalised by the mean absolute target value.
    fold_nrmse = np.sqrt(fold_mse) / np.mean(np.abs(target))
    return fold_nrmse.mean()

def ngbr_objective(params):
    """
    @Description: hyperopt objective for NGBRegressor on target Y_01
    @Param: params, dict sampled from the hyperopt search space
    @Return: mean NRMSE over the cross-validation folds (also printed with the parameters)
    """
    # Cast each sampled value to the type used for the model argument.
    casts = (
        ('n_estimators', int),
        ('learning_rate', None),
        ('natural_gradient', None),
        ('col_sample', float),
        ('minibatch_frac', float),
        ('tol', float),
    )
    params = {
        key: params[key] if cast is None else cast(params[key])
        for key, cast in casts
    }

    model = NGBRegressor(verbose=100, random_state=1, **params)

    target = train_y['Y_01']
    fold_mse = -cross_val_score(model, train_x, target, cv=Config.cv, scoring='neg_mean_squared_error')
    # Per-fold RMSE normalised by the mean absolute target value.
    losses = np.sqrt(fold_mse) / np.mean(np.abs(target))
    mean_loss = losses.mean()
    print("NRMSE Loss {:.5f} params {}".format(mean_loss, params))
    return mean_loss

## Catboost Regressor

In [None]:
## Reference: https://catboost.ai/en/docs/concepts/parameter-tuning
space_catboost = dict(
    n_estimators=hp.quniform('n_estimators', 10, 20, 1),
    depth=hp.quniform("depth", 2, 16, 1),
    learning_rate=hp.uniform('learning_rate', 0.01, 0.3),
    l2_leaf_reg=hp.uniform('l2_leaf_reg', 3, 8),
    max_bin=hp.quniform('max_bin', 1, 254, 1),
    min_data_in_leaf=hp.quniform('min_data_in_leaf', 2, 700, 1),
    random_strength=hp.loguniform('random_strength', np.log(0.005), np.log(5)),
    fold_len_multiplier=hp.loguniform('fold_len_multiplier', np.log(1.01), np.log(2.5)),
)

# Search the space with TPE for 200 evaluations of the CatBoost objective.
best = fmin(cat_objective, space_catboost, algo=tpe.suggest, max_evals=200, verbose=1)


100%|██████████| 200/200 [35:40<00:00, 10.70s/it, best loss: 0.1916584045864259]


In [None]:
print(best)

## Extra Trees Regressor

In [None]:
space_extra = {
    'n_estimators' : hp.quniform('n_estimators', 100, 1500, 50),
    'max_depth': hp.quniform('max_depth', 3, 50, 1),
    'min_samples_split': hp.quniform('min_samples_split', 5, 50, 5),
    'min_samples_leaf': hp.quniform('min_samples_leaf', 5, 50, 1),
    'min_weight_fraction_leaf': hp.uniform('min_weight_fraction_leaf', 0.01, 0.5),
    # 1.0 (use every feature) stands in for 'auto': it selects the same number
    # of features for this regressor, and 'auto' is rejected by recent
    # scikit-learn releases.
    'max_features': hp.choice('max_features', ['sqrt', 'log2', None, 1.0]),
    'max_leaf_nodes': hp.quniform('max_leaf_nodes', 3, 30, 1),
    'min_impurity_decrease': hp.uniform('min_impurity_decrease', 0, 200),
    'bootstrap':  hp.choice('bootstrap', [True, False]),
    'ccp_alpha': hp.uniform('ccp_alpha', 0.01, 1.0),
}

# NOTE(review): only 2 evaluations are run here, unlike the other searches - confirm this is intended.
best = fmin(fn = extra_objective,
            space = space_extra,
            algo = tpe.suggest,
            verbose = 1,
            max_evals = 2)

# fmin reports hp.choice parameters by their index in the option list;
# space_eval maps the result back to the actual values.
best = dict(space_eval(space_extra, best))
# hyperopt samples floats, so every integer-valued setting is cast
# (min_samples_split and min_samples_leaf included).
best.update({
    key: int(best[key])
    for key in ('n_estimators', 'max_depth', 'min_samples_split',
                'min_samples_leaf', 'max_leaf_nodes')
})


## RangerForest

In [None]:
def random_objective(params):
    """
    @Description: hyperopt objective for RangerForestRegressor on target Y_04
    @Param: params, dict sampled from the hyperopt search space
    @Return: mean NRMSE over the cross-validation folds
    """
    # 'num_random_splits' and 'split_rule' are currently left out of the search.
    int_keys = ('n_estimators', 'mtry', 'min_node_size', 'max_depth')
    ordered_keys = int_keys + ('sample_fraction', 'alpha')
    # Integer-valued settings are cast; the others are passed as sampled.
    model_params = {
        key: int(params[key]) if key in int_keys else params[key]
        for key in ordered_keys
    }

    model = RangerForestRegressor(n_jobs=-1, **model_params)

    target = train_y['Y_04']
    fold_mse = -cross_val_score(model, train_x, target, cv=Config.cv, scoring='neg_mean_squared_error')
    # Per-fold RMSE normalised by the mean absolute target value.
    fold_nrmse = np.sqrt(fold_mse) / np.mean(np.abs(target))
    return fold_nrmse.mean()

space_random = dict(
    n_estimators=hp.quniform('n_estimators', 100, 1500, 1),
    # mtry ranges up to the number of feature columns.
    mtry=hp.quniform('mtry', 5, train_x.shape[1], 1),
    min_node_size=hp.quniform('min_node_size', 10, 200, 5),
    max_depth=hp.quniform('max_depth', 10, 350, 5),
    # num_random_splits=hp.quniform('num_random_splits', 5, 200, 5),
    sample_fraction=hp.uniform('sample_fraction', 0.3, 1.0),
    alpha=hp.uniform('alpha', 0.3, 1.0),
    # split_rule=hp.choice('reg_lambda', ['variance', 'extratrees', 'maxstat', 'beta']),
)

# Search the space with TPE for 200 evaluations of the Ranger objective.
best = fmin(random_objective, space_random, algo=tpe.suggest, max_evals=200)

print(best)

## Gradient Boost Objective


In [None]:
space_gradient = {
    'n_estimators' : hp.quniform('n_estimators', 100, 2000, 10),
    'max_depth': hp.quniform('max_depth', 5, 250, 1),
    'subsample': hp.uniform('subsample', 0.3, 1.0),
    'learning_rate': hp.loguniform('learning_rate', np.log(0.01), np.log(0.5)),
    'min_samples_split': hp.quniform('min_samples_split', 5, 50, 5),
    'min_samples_leaf': hp.quniform('min_samples_leaf', 5, 50, 1),
    'min_weight_fraction_leaf': hp.uniform('min_weight_fraction_leaf', 0.01, 0.5),
    'min_impurity_decrease': hp.uniform('min_impurity_decrease', 0, 200),
    # 1.0 (use every feature) stands in for 'auto': it selects the same number
    # of features for this regressor, and 'auto' is rejected by recent
    # scikit-learn releases.
    'max_features': hp.choice('max_features', ['sqrt', 'log2', None, 1.0]),
    'alpha': hp.uniform('alpha', 0.001, 0.999),
    'max_leaf_nodes': hp.quniform('max_leaf_nodes', 3, 30, 1),
    'ccp_alpha': hp.uniform('ccp_alpha', 0.01, 0.999),

}

best = fmin(fn = gradient_objective,
            space = space_gradient,
            algo = tpe.suggest,
            verbose = 1,
            max_evals = 200)

# fmin reports hp.choice parameters (max_features) by their index in the
# option list; print the resolved values so the output is readable.
print(space_eval(space_gradient, best))

## LGBM

In [None]:
space_lgbm = dict(
    n_estimators=hp.quniform('n_estimators', 100, 1500, 1),
    max_depth=hp.quniform('max_depth', 5, 250, 1),
    num_leaves=hp.quniform('num_leaves', 20, 200, 5),
    min_child_samples=hp.quniform('min_child_samples', 10, 150, 5),
    colsample_bytree=hp.uniform('colsample_bytree', 0.3, 1.0),
    subsample=hp.uniform('subsample', 0.3, 1.0),
    min_split_gain=hp.uniform('min_split_gain', 0, 0.7),
    scale_pos_weight=hp.uniform('scale_pos_weight', 1, 10),
    reg_alpha=hp.uniform('reg_alpha', 0, 500),
    reg_lambda=hp.uniform('reg_lambda', 0, 500),
    learning_rate=hp.loguniform('learning_rate', np.log(0.01), np.log(0.5)),
)

# Search the space with TPE for 200 evaluations of the LGBM objective.
best = fmin(lgbm_objective, space_lgbm, algo=tpe.suggest, max_evals=200, verbose=10)

print(best)
# hyperopt returns floats; cast the integer-valued settings so `best` can be
# passed to the model constructor.
best.update({
    key: int(best[key])
    for key in ('n_estimators', 'num_leaves', 'max_depth', 'min_child_samples')
})

## NGBR

In [None]:
space_ngboost = {
    'n_estimators': hp.quniform('n_estimators', 100, 2000, 10),
    'learning_rate': hp.uniform('learning_rate', 0.01, 0.3),
    'natural_gradient': hp.choice('natural_gradient', [True, False]),
    # Lower bounds start at 0.1: a sampled 0 would ask the model to train on
    # no columns / no rows.
    'col_sample': hp.quniform('col_sample', 0.1, 1, 0.01),
    'minibatch_frac': hp.quniform('minibatch_frac', 0.1, 1, 0.01),
    'tol': hp.uniform('tol', 1e-6, 3e-4),
}

best = fmin(fn = ngbr_objective,
            space = space_ngboost,
            algo = tpe.suggest,
            verbose = 10,
            max_evals = 100)

print(best)
# fmin reports hp.choice parameters (natural_gradient) by their index in the
# option list - index 0 means True here - so map them back to real values.
best = dict(space_eval(space_ngboost, best))
# Only n_estimators is integer-valued in this space. The num_leaves, max_depth
# and min_child_samples casts belonged to the LGBM search and raised KeyError.
best['n_estimators'] = int(best['n_estimators'])

In [None]:
def get_stacking_base_datasets(model, train_x, train_y, col,test):
    """
    @Description: build stacking features from one base model with K-fold cross-validation
    @Param1: model, estimator with fit/predict; it is refitted on every fold
    @Param2: train_x, pandas dataframe of training features
    @Param3: train_y, pandas dataframe of training targets
    @Param4: col, name of the target column in train_y to fit
    @Param5: test, pandas dataframe of test features
    @Return: out-of-fold predictions for train_x with shape (n_train, 1) and
             the fold-averaged predictions for test with shape (n_test, 1)
    """
    kf = KFold(n_splits=Config.cv, shuffle=False)
    train_fold_pred = np.zeros((train_x.shape[0],1))
    test_pred = np.zeros((test.shape[0],Config.cv))
    target = train_y[col]

    for folder_counter, (train_index, valid_index) in enumerate(kf.split(train_x)):
        print('Fold : ', folder_counter, ' Start')
        # KFold yields positional indices, so rows are selected with .iloc;
        # .loc would treat them as index labels and pick wrong rows (or fail)
        # whenever the frame does not have a default 0..n-1 index.
        X_tr = train_x.iloc[train_index]
        y_tr = target.iloc[train_index]
        X_te = train_x.iloc[valid_index]

        model.fit(X_tr, y_tr)
        # Each training row is predicted by the model that did not see it.
        train_fold_pred[valid_index, :] = np.asarray(model.predict(X_te)).reshape(-1,1)
        test_pred[:, folder_counter] = np.asarray(model.predict(test)).ravel()

    # Average the test predictions of the per-fold models.
    test_pred_mean = np.mean(test_pred, axis=1).reshape(-1,1)

    return train_fold_pred, test_pred_mean

In [None]:
# NOTE(review): `best` holds the result of whichever search cell ran last;
# make sure it comes from the LGBM search before building this model.
model = LGBMRegressor(
        n_jobs = -1,
        random_state = 1,
        verbose = 100,
        **best
    )


# Eight base models are planned; for now the same model/target pair is
# stacked four times as a placeholder.
xx_train, xx_test = get_stacking_base_datasets(model, train_x, train_y, col='Y_01', test=test_x)
yy_train, yy_test = get_stacking_base_datasets(model, train_x, train_y, col='Y_01', test=test_x)
zz_train, zz_test = get_stacking_base_datasets(model, train_x, train_y, col='Y_01', test=test_x)
qq_train, qq_test = get_stacking_base_datasets(model, train_x, train_y, col='Y_01', test=test_x)

# One column per base model.
Stack_final_X_train = np.concatenate((xx_train,yy_train,zz_train,qq_train), axis=1)
Stack_final_X_test = np.concatenate((xx_test,yy_test,zz_test,qq_test), axis=1)

# TODO: the final meta-model still has to be chosen; a plain linear regression
# is used as a placeholder so that the cell runs (final_model was undefined).
final_model = LinearRegression()
# Fit on the target the base models were trained on (y_train was never defined).
final_model.fit(Stack_final_X_train, train_y['Y_01'])
stack_final = final_model.predict(Stack_final_X_test)




In [None]:
## Set col1 and col2 before running this cell; they form the output file name.
# predict() returns a numpy array, which has no to_csv method, so the
# predictions are wrapped in a DataFrame before being written.
pd.DataFrame(stack_final).to_csv(f'{col1}_{col2}.csv', index=False)