### Import Libraries

In [1]:
import pandas as pd
import random
import os
import numpy as np
from functools import partial
from lightgbm import LGBMRegressor
from hyperopt import fmin, hp, tpe, Trials, STATUS_OK
from sklearn.model_selection import cross_val_score
from sklearn import metrics
from sklearn.metrics import mean_squared_error
from sklearn.metrics import make_scorer
from sklearn.multioutput import MultiOutputRegressor
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler

In [2]:
def seed_everything(seed):
    """
    @Description: seed the random number sources used in this notebook
    @Param1: seed, int written to PYTHONHASHSEED and used to seed `random` and `numpy.random`
    @Return: None
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    for seeder in (random.seed, np.random.seed):
        seeder(seed)


seed_everything(42)  # fix the seed

In [3]:
def dataset_split_X_y(df):
    """
    @Description: split data into features and labels
    @Param: df, pandas dataframe with columns starting with X for features and Y for labels
    @Return: features and labels in pandas dataframes
    """
    # Anchor the patterns: an unanchored 'X' / 'Y' also matches columns that merely
    # contain the letter somewhere in their name, which contradicts the contract above.
    xs = df.filter(regex='^X')  # Input : X features
    ys = df.filter(regex='^Y')  # Output : Y targets
    return xs, ys

In [4]:
def zero_variance(df):
    """
    @Description: find columns whose variance is exactly zero
    @Param1: df, pandas dataframe
    @Return: list with the names of the zero-variance columns, in column order
    """
    return [name for name in df.columns if df[name].var() == 0]

In [5]:
def get_top_correlation(df, n=10):
    """
    @Description: rank distinct column pairs by absolute correlation
    @Param1: df, pandas dataframe
    @Param2: n, number of pairs to return
    @Return: pandas series indexed by column pair, sorted from strongest to weakest
    """
    names = df.columns
    # Self-pairs and one mirrored half of the symmetric matrix add no information.
    redundant = {
        (names[outer], names[inner])
        for outer in range(df.shape[1])
        for inner in range(outer + 1)
    }
    flattened = df.corr().abs().unstack()
    ranked = flattened.drop(labels=redundant).sort_values(ascending=False)
    return ranked.iloc[:n]

In [6]:
def lg_nrmse(gt, preds):
    """
    @Description: Metric used in this project (weighted sum of per-target NRMSE)
    @Params1: gt, ground truth - pandas dataframe or 2-D array with at least 14 target columns
    @Param2: preds, predictions - pandas dataframe or 2-D array with the same layout as gt
    @Return: nrmse score (lower is better)
    @Raises: ValueError if the inputs are not 2-D, have fewer than 14 columns,
             or differ in their number of rows
    """
    n_targets = 14    # the metric is computed over the first 14 target columns (Y_01 ~ Y_14)
    n_weighted = 8    # Y_01 ~ Y_08 receive a 20% higher weight

    # Work on plain arrays: this avoids the `squared=False` argument of
    # sklearn's mean_squared_error (deprecated and later removed) and lets
    # callers pass either dataframes or numpy arrays for both arguments.
    gt_values = np.asarray(gt, dtype=float)
    pred_values = np.asarray(preds, dtype=float)
    if gt_values.ndim != 2 or pred_values.ndim != 2:
        raise ValueError("gt and preds must both be 2-dimensional")
    if gt_values.shape[1] < n_targets or pred_values.shape[1] < n_targets:
        raise ValueError("gt and preds must both have at least {} columns".format(n_targets))
    if gt_values.shape[0] != pred_values.shape[0]:
        raise ValueError("gt and preds must have the same number of rows")

    gt_values = gt_values[:, :n_targets]
    pred_values = pred_values[:, :n_targets]

    # Per-target RMSE, normalised by the mean absolute ground-truth value.
    rmse = np.sqrt(np.mean((gt_values - pred_values) ** 2, axis=0))
    all_nrmse = rmse / np.mean(np.abs(gt_values), axis=0)

    score = 1.2 * np.sum(all_nrmse[:n_weighted]) + 1.0 * np.sum(all_nrmse[n_weighted:])
    return score

In [7]:
def get_binary_target(df, y_range=None):
    """
    @Description: transform numeric targets to binary out-of-range flags
    @Param1 df, pandas dataframe containing the columns Y_01 ~ Y_14
    @Param2 y_range, optional list of [min, max] pairs, one per target in Y_01 ~ Y_14 order;
            defaults to the bounds hard-coded below
    @return labels, pandas dataframe of binary labels (1 = outside [min, max], 0 = inside, bounds inclusive)
    @Raises ValueError if y_range does not contain exactly one pair per target
    """

    ys = ['Y_01', 'Y_02', 'Y_03', 'Y_04', 'Y_05',
          'Y_06', 'Y_07', 'Y_08', 'Y_09', 'Y_10',
          'Y_11', 'Y_12', 'Y_13', 'Y_14']
    default_bounds = [[0.2, 2], [0.2, 2.1], [0.2, 2.1], [7, 19], [22, 36.5], [-19.2, 19],
                      [2.4, 4], [-29.2, -24], [-29.2, -24], [-30.6, -20], [19.6, 26.6],
                      [-29.2, -24], [-29.2, -24], [-29.2, -24]]
    ys_bounds = default_bounds if y_range is None else y_range
    if len(ys_bounds) != len(ys):
        raise ValueError("y_range must contain one [min, max] pair per target ({} expected)".format(len(ys)))

    # Build every flag column first and concatenate once, rather than growing
    # a frame column-by-column from an empty DataFrame inside the loop.
    flags = [
        (~df[name].between(low, high, inclusive='both')).astype(int)
        for name, (low, high) in zip(ys, ys_bounds)
    ]
    labels = pd.concat(flags, axis = 1)
    return labels


In [16]:
# Load the raw tables; the training table is split into X_* features and Y_* targets.
train_df = pd.read_csv('./train.csv')
test_x = pd.read_csv('./test.csv')
train_x, train_y = dataset_split_X_y(train_df)

# Columns removed from both feature sets:
#   - zero-variance columns (pass/fail flags)
#   - X_10 / X_11: many missing values (missing values are recorded as 0, per the announcement)
cols_with_zero_variance = zero_variance(train_x)
sparse_cols = ['X_10', 'X_11']
train_x = train_x.drop(columns=cols_with_zero_variance).drop(columns=sparse_cols)
test_x = (
    test_x
    .drop(columns=cols_with_zero_variance)
    .drop(columns=sparse_cols)
    .drop(columns='ID')
)

# Binary out-of-spec label for every target column of the training data.
y_binary_label = get_binary_target(train_y)


In [45]:

def _normal_to_defective_ratio(labels):
    """Return (#labels equal to 0) / (#labels equal to 1); inf when no label equals 1."""
    defective = int((labels == 1).sum())
    normal = int((labels == 0).sum())
    return normal / defective if defective else float('inf')


def get_splitted_data(binary_target, col, train_x_df, train_y_df, test_size = 0.2, random_state = 1):
    """
    @Description: stratified train/test split for a single target column
    @Param1: binary_target, pandas dataframe of binary labels (1 = defective, 0 = normal), used for stratification
    @Param2: col, name of the target column to split on
    @Param3: train_x_df, pandas dataframe with the features
    @Param4: train_y_df, pandas dataframe with the numeric targets
    @Param5: test_size, fraction of rows placed in the test part
    @Param6: random_state, seed forwarded to train_test_split (default 1, the previously hard-coded value)
    @Return: train_feature, train_numerical_target, test_feature, test_numerical_target
    """
    # Carry the numeric target through the split next to the features so that
    # it stays row-aligned with them; it is separated again below.
    features_with_target = pd.concat([train_x_df, train_y_df[col]], axis = 1)
    target = binary_target[col]  # binary label of this column (defective vs. normal)
    X_train, X_test, y_train, y_test = train_test_split(
        features_with_target, target,
        random_state=random_state, test_size=test_size, stratify=target)

    # The binary labels are only used for stratification and for this sanity check.
    # The printed value is (#normal / #defective), so the label says exactly that.
    print("학습 데이터에서의 정상/불량 Ratio : ", _normal_to_defective_ratio(y_train))
    print("테스트 데이터에서의 정상/불량 Ratio: ", _normal_to_defective_ratio(y_test))

    train_numerical_target = X_train[col]           # numeric target of the train part
    train_feature = X_train.drop([col], axis = 1)   # train features without the target

    test_numerical_target = X_test[col]             # numeric target of the test part
    test_feature = X_test.drop([col], axis = 1)     # test features without the target

    return train_feature, train_numerical_target, test_feature, test_numerical_target


In [46]:

# Stratified 80/20 split for target Y_01.
train_feature, train_target, test_feature, test_target = get_splitted_data(
    binary_target=y_binary_label,
    col='Y_01',
    train_x_df=train_x,
    train_y_df=train_y,
    test_size=0.2,
)


학습 데이터에서의 불량/정상 Ratio :  25.82895850973751
테스트 데이터에서의 불량/정상 Ratio:  25.854237288135593


In [None]:
# Defective rows: rows whose value is out of range for at least one target.
# The flags come from y_binary_label (built from train_y by get_binary_target);
# the name `df_indicator` used here before is not defined at this point of the
# notebook, so the cell could not run on a fresh kernel.
# NOTE(review): confirm that y_binary_label is the indicator frame that was intended.
defective_rows = (y_binary_label == 1).any(axis=1)

# Unique, sorted row labels of the defective rows.
ans = sorted(y_binary_label.index[defective_rows].unique())
train_data_spec = train_df.loc[ans, :]  # defective data

In [None]:
# Normal data: every training row that is not among the defective rows.
train_data_norm = train_df.drop(index=train_data_spec.index)

In [None]:
# Report the number of normal rows, then separate their features from their targets.
print(train_data_norm.shape[0])

train_x_norm, train_y_norm = dataset_split_X_y(df=train_data_norm)

In [None]:
# Separate features from targets for the defective rows and report how many there are.
train_x_spec, train_y_spec = dataset_split_X_y(df=train_data_spec)
print(train_x_spec.shape[0])


In [None]:
# Stack the normal rows on top of the defective rows (features and targets in the same order).
new_train_x_spec_plus_norm = pd.concat((train_x_norm, train_x_spec), axis='index')
new_train_y_spec_plus_norm = pd.concat((train_y_norm, train_y_spec), axis='index')

In [None]:
# Display the stacked (normal + defective) feature frame as the cell output.
new_train_x_spec_plus_norm

In [None]:
# Train/validation split: roughly 80% of the normal rows and 80% of the defective
# rows go to training, the remainder to validation. The tv_* frames defined here
# are the ones the following cells work with, so this cell has to be executable.
# A dedicated seeded generator makes the split identical every time the cell runs.
split_rng = np.random.default_rng(42)
msk1 = split_rng.random(len(train_x_norm)) < 0.8
msk2 = split_rng.random(len(train_x_spec)) < 0.8

tv_train_x_norm = train_x_norm[msk1]
tv_valid_x_norm = train_x_norm[~msk1]
tv_train_y_norm = train_y_norm[msk1]
tv_valid_y_norm = train_y_norm[~msk1]

tv_train_x_spec = train_x_spec[msk2]
tv_valid_x_spec = train_x_spec[~msk2]
tv_train_y_spec = train_y_spec[msk2]
tv_valid_y_spec = train_y_spec[~msk2]

# reset_index() renumbers the rows and keeps the old row labels as a leading
# 'index' column (same effect as the former in-place reset_index calls).
tv_train_x = pd.concat([tv_train_x_norm, tv_train_x_spec], axis=0).reset_index()
tv_valid_x = pd.concat([tv_valid_x_norm, tv_valid_x_spec], axis=0).reset_index()
tv_train_y = pd.concat([tv_train_y_norm, tv_train_y_spec], axis=0).reset_index()
tv_valid_y = pd.concat([tv_valid_y_norm, tv_valid_y_spec], axis=0).reset_index()

In [None]:
# Display the training part of the normal-row features as the cell output.
tv_train_x_norm

In [None]:
# Row counts of the training frames, a separator, then the validation frames.
print(len(tv_train_x), len(tv_train_y), sep='\n')
print('-------------------------------------------')
print(len(tv_valid_x), len(tv_valid_y), sep='\n')

In [None]:
# Remove the helper 'index' column that reset_index() leaves at the front of each frame.
# Dropping it by name instead of slicing off "the first column" keeps the cell
# idempotent: running it a second time no longer removes a real feature/target column.
# NOTE(review): assumes the leading column is named 'index' — confirm against the split cell.
tv_train_x = tv_train_x.drop(columns='index', errors='ignore')
tv_train_y = tv_train_y.drop(columns='index', errors='ignore')
tv_valid_x = tv_valid_x.drop(columns='index', errors='ignore')
tv_valid_y = tv_valid_y.drop(columns='index', errors='ignore')

In [None]:
# Display the column labels of the training features.
tv_train_x.columns

In [None]:
# Zero-variance filtering and the 'ID' drop are currently disabled for the tv_* frames.
# Only X_10 / X_11 are removed: many missing values (missing values are recorded as 0,
# per the announcement).
sparse_feature_cols = ['X_10', 'X_11']
tv_train_x = tv_train_x.drop(columns=sparse_feature_cols)
tv_valid_x = tv_valid_x.drop(columns=sparse_feature_cols)

In [None]:
# Total number of rows across train + validation, for features and for targets.
print(tv_valid_x.shape[0] + tv_train_x.shape[0])
print(tv_train_y.shape[0] + tv_valid_y.shape[0])

In [None]:
def objective(params):
    """
    @Description: hyperopt objective, 10-fold cross-validated lg_nrmse of a multi-output LightGBM model
                  fitted on the notebook-level frames tv_train_x / tv_train_y
    @Param1: params, dict of sampled hyperparameters (keys as listed below)
    @Return: loss, mean cross-validated NRMSE (lower is better)
    """
    integer_keys = ('n_estimators', 'max_depth', 'num_leaves', 'min_child_samples')
    float_keys = ('colsample_bytree', 'subsample', 'min_split_gain', 'scale_pos_weight',
                  'reg_alpha', 'reg_lambda', 'learning_rate')

    # Integer-valued settings are cast to int; real-valued ones are rounded to three
    # decimals and passed as numbers (they used to be handed over as formatted strings).
    model_params = {key: int(params[key]) for key in integer_keys}
    model_params.update({key: round(float(params[key]), 3) for key in float_keys})
    params = model_params

    model = MultiOutputRegressor(LGBMRegressor(n_jobs = -1, random_state = 1, **params))

    # make_scorer(..., greater_is_better=False) flips the sign of lg_nrmse, so the mean
    # is negated again to obtain a positive loss for fmin to minimise.
    # NOTE(review): cv=10 is passed as a plain integer — confirm that the row order of
    # tv_train_x is suitable for folds built without shuffling.
    scorer = make_scorer(lg_nrmse, greater_is_better=False)
    loss = -cross_val_score(model, tv_train_x, tv_train_y, cv=10, scoring=scorer).mean()
    print("NRMSE Loss {:.5f} params {}".format(loss, params))
    return loss

In [None]:
# Hyperopt search space, grouped by sampling distribution.
quantized_ranges = [
    # (name, low, high, step)
    ('n_estimators', 100, 1000, 50),
    ('max_depth', 3, 100, 1),
    ('num_leaves', 20, 100, 10),
    ('min_child_samples', 10, 300, 10),
]
uniform_ranges = [
    # (name, low, high)
    ('colsample_bytree', 0.3, 1.0),
    ('subsample', 0.3, 1.0),
    ('min_split_gain', 0, 0.7),
    ('scale_pos_weight', 1, 10),
    ('reg_alpha', 0, 100),
    ('reg_lambda', 0, 100),
]

space = {name: hp.quniform(name, low, high, step) for name, low, high, step in quantized_ranges}
space.update({name: hp.uniform(name, low, high) for name, low, high in uniform_ranges})
space['learning_rate'] = hp.loguniform('learning_rate', np.log(0.01), np.log(0.5))

# Run 500 TPE evaluations of `objective` with a fixed random state.
best = fmin(
    fn=objective,
    space=space,
    algo=tpe.suggest,
    max_evals=500,
    rstate=np.random.default_rng(1),
)

In [None]:
# Hard-coded hyperparameters, re-scored with 10-fold cross-validation on the training split.
best = {
    'colsample_bytree': 0.572280100273023,
    'learning_rate': 0.010283635038627429,
    'max_depth': 180,
    'min_child_samples': 135,
    'min_split_gain': 0.04511227284338413,
    'n_estimators': 900,
    'num_leaves': 70,
    'reg_alpha': 4.406681827912319,
    'reg_lambda': 20.4785600448913,
    'scale_pos_weight': 8.302374117433086,
    'subsample': 0.1688669888026464,
}
model = MultiOutputRegressor(LGBMRegressor(n_jobs=-1, random_state=1, **best))

# The scorer negates lg_nrmse (greater_is_better=False); negate the mean to get the loss.
nrmse_scorer = make_scorer(lg_nrmse, greater_is_better=False)
fold_scores = cross_val_score(model, tv_train_x, tv_train_y, cv=10, scoring=nrmse_scorer)
loss = -fold_scores.mean()
print("NRMSE Loss {:.5f} params {}".format(loss, best))

In [None]:
# Number of feature columns in the training and validation frames.
print(tv_train_x.shape[1])
print(tv_valid_x.shape[1])

In [None]:
# Fit the multi-output model on the training split and predict the validation split.
best = {
    'colsample_bytree': 0.572280100273023,
    'learning_rate': 0.010283635038627429,
    'max_depth': 180,
    'min_child_samples': 135,
    'min_split_gain': 0.04511227284338413,
    'n_estimators': 900,
    'num_leaves': 70,
    'reg_alpha': 4.406681827912319,
    'reg_lambda': 20.4785600448913,
    'scale_pos_weight': 8.302374117433086,
    'subsample': 0.1688669888026464,
}
base_regressor = LGBMRegressor(n_jobs=-1, random_state=1, **best)
model = MultiOutputRegressor(base_regressor)
model.fit(X=tv_train_x, y=tv_train_y)
preds = model.predict(X=tv_valid_x)

In [None]:
# Wrap the prediction array in a dataframe (one column per predicted target).
a = pd.DataFrame(data=preds)

In [None]:
# Print the value range of every prediction column.
# The second label used to read "Min" although it shows the maximum.
for i in a.columns:
    print('{}  /  Min : {}    ,   Max : {}'.format(i, a[i].min(), a[i].max()))

In [None]:
# Fill the submission template with the validation predictions, one column at a time.
submit = pd.read_csv('data/validation_test_submission.csv')
for position, column_name in enumerate(submit.columns):
    if column_name != 'ID':
        # position - 1 selects the matching prediction column;
        # this presumes 'ID' is the first template column — TODO confirm.
        submit[column_name] = preds[:, position - 1]
# Writing the file is currently disabled:
#submit.to_csv('data/param_test.csv', index = False)

In [None]:
submit.head()

# Per-row indicator: is any predicted target outside its [min, max] spec range?
# NOTE(review): y_feature_spec_info is not defined anywhere in this notebook as shown;
# it is indexed by the columns '최소' (min) and '최대' (max) and by target position.
df_indicator = pd.DataFrame()

for position, column_name in enumerate(submit.columns):
    if column_name != 'ID':
        lower = y_feature_spec_info['최소'][position-1]
        upper = y_feature_spec_info['최대'][position-1]
        out_of_spec = ~submit[column_name].between(lower, upper)
        # The first target column starts the accumulation; later ones are added on top.
        df_indicator = out_of_spec if position == 1 else df_indicator + out_of_spec


In [None]:
# Display the per-row out-of-spec indicator built from the predictions.
df_indicator

In [None]:
# Print how often each indicator value occurs, then overwrite the flags in place:
# entries equal to True become 1, entries equal to False become 0.
# NOTE(review): the dtype after these masked assignments depends on pandas' coercion
# rules — verify it before relying on it (an explicit int cast is applied later).
print(df_indicator.value_counts())
df_indicator[df_indicator==True] = 1
df_indicator[df_indicator==False] = 0

In [None]:
# Display the indicator again after the True/False -> 1/0 replacement.
df_indicator

In [None]:
# Store the out-of-spec indicator in the validation features under the column name 'X_57'.
# NOTE(review): the assignment aligns on the index — confirm that tv_valid_x and
# df_indicator share the same row labels, otherwise unmatched rows become NaN.
tv_valid_x['X_57'] = df_indicator



In [None]:
# Display the validation features, now including the 'X_57' column.
tv_valid_x

In [None]:
# Make the indicator column an integer column.
tv_valid_x['X_57'] = tv_valid_x['X_57'].astype(int)

In [None]:
# Display the dtype of every validation feature column.
tv_valid_x.dtypes

In [None]:
# Row counts of the training and validation feature frames.
print(tv_train_x.shape[0])
print(tv_valid_x.shape[0])

In [None]:
# Same hard-coded hyperparameters, scored with 10-fold cross-validation on the
# validation frames (tv_valid_x includes the 'X_57' indicator column).
best = {
    'colsample_bytree': 0.572280100273023,
    'learning_rate': 0.010283635038627429,
    'max_depth': 180,
    'min_child_samples': 135,
    'min_split_gain': 0.04511227284338413,
    'n_estimators': 900,
    'num_leaves': 70,
    'reg_alpha': 4.406681827912319,
    'reg_lambda': 20.4785600448913,
    'scale_pos_weight': 8.302374117433086,
    'subsample': 0.1688669888026464,
}
model = MultiOutputRegressor(LGBMRegressor(n_jobs=-1, random_state=1, **best))

# The scorer negates lg_nrmse (greater_is_better=False); negate the mean to get the loss.
nrmse_scorer = make_scorer(lg_nrmse, greater_is_better=False)
fold_scores = cross_val_score(model, tv_valid_x, tv_valid_y, cv=10, scoring=nrmse_scorer)
loss = -fold_scores.mean()
print("NRMSE Loss {:.5f} params {}".format(loss, best))

In [None]:
# Display the filled submission frame.
submit

In [None]:
# Keep only the prediction columns so the frame lines up with the target frame.
submit = submit.drop(columns='ID')

In [None]:
# Score the predictions against the validation targets.
# lg_nrmse expects (ground truth, predictions): the RMSE of each target is normalised
# by the mean absolute *ground-truth* value, so the argument order matters.
lg_nrmse(tv_valid_y, submit)