### Import Libraries

In [1]:
import pandas as pd
import random
import os
import numpy as np
from functools import partial
from lightgbm import LGBMRegressor
from hyperopt import fmin, hp, tpe, Trials, STATUS_OK
from sklearn.model_selection import cross_val_score
from sklearn import metrics
from sklearn.metrics import mean_squared_error
from sklearn.metrics import make_scorer
from sklearn.multioutput import MultiOutputRegressor
from sklearn.model_selection import KFold

In [2]:
def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
seed_everything(42) # Seed 고정

In [3]:
def dataset_split_X_y(df):    
    """
    @Description: split data into features and labels
    @Param: df, pandas dataframe with columns starting with X for features and Y for labels
    @Return: features and labels in pandas dataframes
    """
    xs = df.filter(regex='X') # Input : X Feature
    ys = df.filter(regex='Y') # Output : Y Feature
    return xs, ys

In [4]:
def zero_variance(df):
    """
    @Description: check for zero_variance
    @Param1: df, pandas dataframe
    @Return: names of the columns with zero variance
    """
    result = []
    for col in df.columns:
        if df[col].var() == 0:
            result.append(col)
    return result

In [5]:
def get_top_correlation(df, n=10):
    """
    @Description: print out top correlated features
    @Param1: df, pandas dataframe
    @Param2: n, number of lines to print 
    @Return: pandas series
    """
    pairs = set()
    for idx1 in range(0, df.shape[1]):
        for idx2 in range(0, idx1+1):
            pairs.add((df.columns[idx1], df.columns[idx2]))
    corr = df.corr().abs().unstack()
    corr = corr.drop(labels=pairs).sort_values(ascending=False)
    return corr[0:n]

In [6]:
def lg_nrmse(gt, preds):
    """
    @Description: Metric used in this project
    @Params1: gt, pandas dataframe
    @Param2: preds, pandas dataframe
    @Return: nrmse score
    """
    # 각 Y Feature별 NRMSE 총합
    # Y_01 ~ Y_08 까지 20% 가중치 부여
    preds = pd.DataFrame(preds)
    all_nrmse = []
    for idx in range(0,14):
        rmse = mean_squared_error(gt.iloc[:,idx], preds.iloc[:,idx], squared=False)
        nrmse = rmse/np.mean(np.abs(gt.iloc[:,idx]))
        all_nrmse.append(nrmse)
    score = 1.2 * np.sum(all_nrmse[:8]) + 1.0 * np.sum(all_nrmse[8:15])
    return score

In [7]:

train_df = pd.read_csv('data/train.csv')
test_x = pd.read_csv('data/test.csv')
train_x, train_y = dataset_split_X_y(train_df)

cols_with_zero_variance = zero_variance(train_x) # 분산이 0 (통과 여부)
train_x = train_x.drop(cols_with_zero_variance, axis = 1)
test_x = test_x.drop(cols_with_zero_variance, axis = 1)

train_x = train_x.drop(['X_10', 'X_11'], axis = 1) # 결측치가 많음 (결측치 = 0, 공지사항)
test_x = test_x.drop(['X_10', 'X_11'], axis = 1)

test_x = test_x.drop('ID', axis=1)

In [8]:
def outlier_iqr(tmp, i):
    data = tmp[i]
    
    print(np.percentile(data,75))
    print(np.percentile(data,25))
    q25, q75 = np.percentile(data, 25), np.percentile(data,75)
    iqr = q75 - q25
    
    cut_off = iqr * 1.5
    lower, upper = q25 - cut_off, q75 + cut_off
    print('변수 명 : ',i)
    print('IQR : ', iqr)
    print('lower bound : ', lower)
    print('upper bound : ', upper)
    
    tmp['X_57'] = np.where(data < upper, 
                    np.where(tmp['X_57'] == 0, 0, 1), 
                    np.where(tmp['X_57'] == 0, 1, 1))
    tmp['X_57'] = np.where(data>lower, 
                    np.where(tmp['X_57'] == 0, 0, 1), 
                    np.where(tmp['X_57'] == 0, 1, 1))
    
    print("tmp['X_58'].value_counts() : ", tmp['X_57'].value_counts());
    return tmp

In [9]:
X_57 = [0 for i in range(0, train_x.shape[0])]
len(X_57)
X_57 = pd.DataFrame(X_57)
train_x['X_57'] = X_57

In [10]:
# 2차 시도 (y와 corr 기준) fixed 2 !!!!!!!!!!
cols = ["X_14","X_15", "X_17", "X_18", "X_19", "X_20", "X_21","X_22",
        "X_24", "X_25", "X_26", "X_27", "X_28","X_29", "X_39", "X_40","X_41","X_42","X_43", "X_44","X_45"]


In [11]:
# train_x
for i in cols:
    train_x = outlier_iqr(train_x, i)

# test_x
X_57 = [0 for i in range(0, test_x.shape[0])]
X_57 = pd.DataFrame(X_57)
test_x['X_57'] = X_57

for i in cols:
    test_x = outlier_iqr(test_x, i)


13.39
13.35
변수 명 :  X_14
IQR :  0.040000000000000924
lower bound :  13.29
upper bound :  13.450000000000003
tmp['X_58'].value_counts() :  0    39143
1      464
Name: X_57, dtype: int64
13.41
13.36
변수 명 :  X_15
IQR :  0.05000000000000071
lower bound :  13.284999999999998
upper bound :  13.485000000000001
tmp['X_58'].value_counts() :  0    39129
1      478
Name: X_57, dtype: int64
13.53
13.5
변수 명 :  X_17
IQR :  0.02999999999999936
lower bound :  13.455000000000002
upper bound :  13.575
tmp['X_58'].value_counts() :  0    38680
1      927
Name: X_57, dtype: int64
13.47
13.43
변수 명 :  X_18
IQR :  0.040000000000000924
lower bound :  13.369999999999997
upper bound :  13.530000000000001
tmp['X_58'].value_counts() :  0    38639
1      968
Name: X_57, dtype: int64
3.31
3.16
변수 명 :  X_19
IQR :  0.1499999999999999
lower bound :  2.9350000000000005
upper bound :  3.535
tmp['X_58'].value_counts() :  0    38488
1     1119
Name: X_57, dtype: int64
3.27
3.1
변수 명 :  X_20
IQR :  0.16999999999999993
lower 

In [19]:
#Feature Selection

# 스크류 삽입 깊이 차이
train_x['X_58'] = train_x[['X_19', 'X_20', 'X_21', 'X_22']].max(axis=1) - train_x[['X_19', 'X_20', 'X_21', 'X_22']].min(axis=1)

# 커넥터 핀 치수
train_x['X_59'] = train_x[['X_24', 'X_25', 'X_26', 'X_27', 'X_28', 'X_29']].max(axis=1) - train_x[['X_24', 'X_25', 'X_26', 'X_27', 'X_28', 'X_29']].min(axis=1)

# 스크류 삽입 시 분당 회전 수
train_x['X_60'] = train_x[['X_34', 'X_35', 'X_36', 'X_37']].max(axis=1) - train_x[['X_34', 'X_35', 'X_36', 'X_37']].min(axis=1)

# 하우징 PCB 안착부 치수
train_x['X_61'] = train_x[['X_38', 'X_39', 'X_40']].max(axis=1) - train_x[['X_38', 'X_39', 'X_40']].min(axis=1)

# 레이돔 치수
train_x['X_62'] = train_x[['X_41', 'X_42', 'X_43', 'X_44']].max(axis=1) - train_x[['X_41', 'X_42', 'X_43', 'X_44']].min(axis=1)

# RF 부분 SMT 납 량
train_x['X_63'] = train_x[['X_50', 'X_51', 'X_52', 'X_53', 'X_54', 'X_55', 'X_56']].max(axis=1) - train_x[['X_50', 'X_51', 'X_52', 'X_53', 'X_54', 'X_55', 'X_56']].min(axis=1)



In [20]:
#Feature Selection

# 스크류 삽입 깊이 차이
test_x['X_58'] = test_x[['X_19', 'X_20', 'X_21', 'X_22']].max(axis=1) - test_x[['X_19', 'X_20', 'X_21', 'X_22']].min(axis=1)

# 커넥터 핀 치수
test_x['X_59'] = test_x[['X_24', 'X_25', 'X_26', 'X_27', 'X_28', 'X_29']].max(axis=1) - test_x[['X_24', 'X_25', 'X_26', 'X_27', 'X_28', 'X_29']].min(axis=1)

# 스크류 삽입 시 분당 회전 수
test_x['X_60'] = test_x[['X_34', 'X_35', 'X_36', 'X_37']].max(axis=1) - test_x[['X_34', 'X_35', 'X_36', 'X_37']].min(axis=1)

# 하우징 PCB 안착부 치수
test_x['X_61'] = test_x[['X_38', 'X_39', 'X_40']].max(axis=1) - test_x[['X_38', 'X_39', 'X_40']].min(axis=1)

# 레이돔 치수
test_x['X_62'] = test_x[['X_41', 'X_42', 'X_43', 'X_44']].max(axis=1) - test_x[['X_41', 'X_42', 'X_43', 'X_44']].min(axis=1)

# RF 부분 SMT 납 량
test_x['X_63'] = test_x[['X_50', 'X_51', 'X_52', 'X_53', 'X_54', 'X_55', 'X_56']].max(axis=1) - test_x[['X_50', 'X_51', 'X_52', 'X_53', 'X_54', 'X_55', 'X_56']].min(axis=1)



In [21]:
def outlier_iqr_cols(tmp, i):
    data = tmp[i]
    
    print(np.percentile(data,75))
    print(np.percentile(data,25))
    q25, q75 = np.percentile(data, 25), np.percentile(data,75)
    iqr = q75 - q25
    
    cut_off = iqr * 1.5
    lower, upper = q25 - cut_off, q75 + cut_off
    print('변수 명 : ',i)
    print('IQR : ', iqr)
    print('lower bound : ', lower)
    print('upper bound : ', upper)
    
    tmp['X_57'] = np.where(data < upper, 0, 1) 
    tmp['X_57'] = np.where(data > lower, 0, 1)
    
    print("tmp['X_58'].value_counts() : ", tmp['X_57'].value_counts());
    return tmp

In [22]:
# train_x
for i in cols:
    train_x = outlier_iqr_cols(train_x, i)

# test_x
X_57 = [0 for i in range(0, test_x.shape[0])]
X_57 = pd.DataFrame(X_57)
test_x['X_57'] = X_57

for i in cols:
    test_x = outlier_iqr_cols(test_x, i)

13.39
13.35
변수 명 :  X_14
IQR :  0.040000000000000924
lower bound :  13.29
upper bound :  13.450000000000003
tmp['X_58'].value_counts() :  0    39288
1      319
Name: X_57, dtype: int64
13.41
13.36
변수 명 :  X_15
IQR :  0.05000000000000071
lower bound :  13.284999999999998
upper bound :  13.485000000000001
tmp['X_58'].value_counts() :  0    39551
1       56
Name: X_57, dtype: int64
13.53
13.5
변수 명 :  X_17
IQR :  0.02999999999999936
lower bound :  13.455000000000002
upper bound :  13.575
tmp['X_58'].value_counts() :  0    39220
1      387
Name: X_57, dtype: int64
13.47
13.43
변수 명 :  X_18
IQR :  0.040000000000000924
lower bound :  13.369999999999997
upper bound :  13.530000000000001
tmp['X_58'].value_counts() :  0    39431
1      176
Name: X_57, dtype: int64
3.31
3.16
변수 명 :  X_19
IQR :  0.1499999999999999
lower bound :  2.9350000000000005
upper bound :  3.535
tmp['X_58'].value_counts() :  0    39592
1       15
Name: X_57, dtype: int64
3.27
3.1
변수 명 :  X_20
IQR :  0.16999999999999993
lower 

In [23]:
def objective(params):
    params = {
        'n_estimators': int(params['n_estimators']),
        'max_depth': int(params['max_depth']),
        'num_leaves': int(params['num_leaves']),
        'min_child_samples': int(params['min_child_samples']),
        'colsample_bytree': '{:.3f}'.format(params['colsample_bytree']),
        'subsample': '{:.3f}'.format(params['subsample']),
        'min_split_gain': '{:.3f}'.format(params['min_split_gain']),
        'scale_pos_weight': '{:.3f}'.format(params['scale_pos_weight']),
        'reg_alpha': '{:.3f}'.format(params['reg_alpha']),
        'reg_lambda': '{:.3f}'.format(params['reg_lambda']),
        'learning_rate': '{:.3f}'.format(params['learning_rate']),
        
    }
    
    model = MultiOutputRegressor(LGBMRegressor(n_jobs = -1, random_state = 1, **params))
    
    loss = -cross_val_score(model, train_x, train_y, cv=10, scoring=make_scorer(lg_nrmse, greater_is_better=False)).mean()
    print("NRMSE Loss {:.5f} params {}".format(loss, params))
    return loss

In [None]:
space = {
    'n_estimators' : hp.quniform('n_estimators', 100, 1500, 50),
    'max_depth': hp.quniform('max_depth', 3, 100, 1),
    'num_leaves': hp.quniform('num_leaves', 20, 100, 10),
    'min_child_samples': hp.quniform('min_child_samples', 10, 300, 10),
    'colsample_bytree': hp.uniform('colsample_bytree', 0.3, 1.0),
    'subsample': hp.uniform('subsample', 0.3, 1.0),
    'min_split_gain': hp.uniform('min_split_gain', 0, 0.7),
    'scale_pos_weight': hp.uniform('scale_pos_weight', 1, 10),
    'reg_alpha': hp.uniform('reg_alpha', 0, 100),
    'reg_lambda': hp.uniform('reg_lambda', 0, 100),
    'learning_rate': hp.loguniform('learning_rate', np.log(0.01), np.log(0.5)),
}

best = fmin(fn = objective,
            space = space,
            algo = tpe.suggest,
            max_evals = 200,
            rstate=np.random.default_rng(1))

  0%|                                                                          | 0/200 [00:00<?, ?trial/s, best loss=?]

In [25]:
model = MultiOutputRegressor(LGBMRegressor(n_jobs = -1, random_state = 1, **best))
model.fit(train_x, train_y)
preds = model.predict(test_x)

LightGBMError: Parameter num_iterations should be of type int, got "1100.0"

In [None]:
submit = pd.read_csv('./sample_submission.csv')
for idx, col in enumerate(submit.columns):
    if col=='ID':
        continue
    submit[col] = preds[:,idx-1]
submit.to_csv('./submission_3.csv', index = False)