# 필요한 package Load

In [None]:
import pandas as pd
from pandas import Series, DataFrame
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import sys, warnings
import seaborn as sns
# Silence warnings unless warning options were given on the command line.
if not sys.warnoptions:
    warnings.simplefilter("ignore")
# Plot with the 'Malgun Gothic' font family and draw minus signs as ASCII hyphens.
plt.rc('font', family='Malgun Gothic')
plt.rc('axes', unicode_minus=False)
# Show up to 1040 columns and 140 rows when a frame is displayed.
# Use the fully qualified option names: the short forms 'max_columns' /
# 'max_rows' match more than one option on newer pandas and raise OptionError.
pd.set_option('display.max_columns', 1040, 'display.max_rows', 140)

In [None]:
from tqdm import tqdm

In [None]:
# Load the training table from train.csv in the working directory.
train = pd.read_csv('train.csv')

In [None]:
# Load the test table from test.csv in the working directory.
test = pd.read_csv('test.csv')

In [None]:
# Load the submission template; the prediction columns are filled in further down.
submission = pd.read_csv('sample_submission.csv')

#### 변수
- id : 구분자
- rho : 측정 거리 (단위: mm)
- src : 광원 스펙트럼 (650 nm ~ 990 nm)
- dst : 측정 스펙트럼 (650 nm ~ 990 nm)
- hhb : 디옥시헤모글로빈 농도
- hbo2 : 옥시헤모글로빈 농도
- ca : 칼슘 농도
- na : 나트륨 농도

In [None]:
train_dst = train.filter(regex='_dst$', axis=1) # keep only the columns whose name ends in "_dst"
test_dst = test.filter(regex='_dst$', axis=1) # same column selection for the test set; no rows or values are removed here
test_dst.head(1)

In [None]:
# Plot the dst values of the row at position 8 as a quick visual check.
train_dst.iloc[8].plot()

In [None]:
# Add an all-zero helper column; it is moved to the front of the frame in the next cell,
# before the row-wise interpolation runs.
train_dst['first'] = 0

In [None]:
# Put the helper column 'first' in front of the 35 dst bands (650..990 in steps of 10).
ordered_cols = ['first'] + [f'{band}_dst' for band in range(650, 1000, 10)]
train_dst = train_dst[ordered_cols]

In [None]:
# Linearly interpolate missing dst readings along each row (axis=1).
train_dst = train_dst.interpolate(method='linear', axis=1)

# train_dst carries a leading all-zero 'first' column, so NaNs at the start of
# a spectrum are interpolated from 0. Give the test rows the same anchor so
# both sets are pre-processed identically, then drop the helper column again
# so test_dst keeps exactly the columns it had before.
test_anchor = test_dst.copy()
test_anchor['first'] = 0
test_anchor = test_anchor[['first'] + list(test_dst.columns)]
test_dst = test_anchor.interpolate(method='linear', axis=1).drop(columns='first')

In [None]:
# Forward-fill along each row any value the interpolation left missing.
# DataFrame.ffill replaces the deprecated fillna(method='ffill') spelling.
train_dst = train_dst.ffill(axis=1)

In [None]:
# Forward-fill along each row any value the interpolation left missing.
# DataFrame.ffill replaces the deprecated fillna(method='ffill') spelling.
test_dst = test_dst.ffill(axis=1)

In [None]:
train.update(train_dst) # write the interpolated values back into the original frame in place
test.update(test_dst) # same for the test frame

In [None]:
# Names of the four target columns.
y_list = ['hhb', 'hbo2','ca','na']

### Feature2

In [None]:
# Stack the training rows (without the target columns) on top of the test rows
# so that features are engineered once for both sets.
df = pd.concat([train.drop(y_list,axis=1),test])

In [None]:
# Use the id column as the row index (the id column itself stays in the frame).
df.index = df.id

In [None]:
# Per-row mean over the src bands and over the dst bands (label-based column slices).
df['src_mean'] = df.loc[:,'650_src':'990_src'].mean(axis=1)
df['dst_mean'] = df.loc[:,'650_dst':'990_dst'].mean(axis=1)

In [None]:
# Per-band ratio: measured (dst) value divided by source (src) value.
for band in range(650, 1000, 10):
    measured = df[f'{band}_dst']
    emitted = df[f'{band}_src']
    df[f'rat_{band}'] = measured / emitted

In [None]:
# Fourier features of the dst spectrum: real and imaginary parts of the
# orthonormal FFT of each mean-centred row.
dst_list = list(df.loc[:,'650_dst':'990_dst'].columns)

# Subtract each row's own mean (NaNs are skipped by DataFrame.mean).
dst_centered = df[dst_list].sub(df[dst_list].mean(axis=1), axis=0)

# One FFT call over every row (axis=1) instead of a Python loop with
# per-row .loc reads and writes; the per-row result is the same.
spectrum = np.fft.fft(dst_centered.values.astype(float), axis=1, norm='ortho')

real_part = [col + '_fft_real' for col in dst_list]
imag_part = [col + '_fft_imag' for col in dst_list]

df_real = pd.DataFrame(spectrum.real, index=df.index, columns=real_part)
df_imag = pd.DataFrame(spectrum.imag, index=df.index, columns=imag_part)
df_p = pd.concat((df_real, df_imag), axis=1)


df=pd.concat((df, df_p), axis=1)

In [None]:
# First difference of the src spectrum: the next band minus the current band.
for band in range(650, 990, 10):
    following = band + 10
    df[f'src_diff{band}'] = df[f'{following}_src'] - df[f'{band}_src']

In [None]:
# First difference of the dst spectrum: the next band minus the current band.
for band in range(650, 990, 10):
    following = band + 10
    df[f'dst_diff{band}'] = df[f'{following}_dst'] - df[f'{band}_dst']

In [None]:
# Ratio of each band to the preceding band, first for src and then for dst.
for band in range(650, 990, 10):
    following = band + 10
    df[f'src_rat{band}'] = df[f'{following}_src'] / df[f'{band}_src']

for band in range(650, 990, 10):
    following = band + 10
    df[f'dst_rat{band}'] = df[f'{following}_dst'] / df[f'{band}_dst']

In [None]:
# Ratio of each dst band to the dst band 17 steps (170) further along the spectrum.
for near in range(650, 830, 10):
    far = 170 + near
    df[f'rev2_dst_rat{near}'] = df[f'{near}_dst'] / df[f'{far}_dst']

In [None]:
# Mean dst value divided by the measurement distance rho.
df['rho_dst_mean'] = df['dst_mean'] / df['rho']

In [None]:
# Mean src value divided by the measurement distance rho.
df['rho_src_mean'] = df['src_mean'] / df['rho']

In [None]:
# Divide every dst band by its row total so each row becomes per-band shares.
dst_block = df[dst_list]
dst_rate = dst_block.div(dst_block.sum(axis=1), axis=0)
dst_rate.columns = ['rate_' + name for name in dst_rate.columns]

df = pd.concat([df, dst_rate], axis=1)

In [None]:
# Divide every src band by its row total so each row becomes per-band shares.
# `src_list` is not defined anywhere earlier in the notebook, so build it here
# the same way `dst_list` is built for the dst bands.
src_list = list(df.loc[:,'650_src':'990_src'].columns)
src_rate = df[src_list].div(df[src_list].sum(axis=1),axis=0)

src_rate.columns = 'rate_' + src_rate.columns

df = pd.concat([df,src_rate],axis=1)

In [None]:
# dst share divided by src share, paired band by band via column position.
for pos in range(src_rate.shape[1]):
    dst_share = dst_rate.iloc[:, pos]
    src_share = src_rate.iloc[:, pos]
    df[f'div_rate{pos}'] = dst_share / src_share


### Feature3

In [None]:
# Row-wise mean and standard deviation of the dst/src ratios; +inf is replaced by 1 first.
rat_block = df.loc[:, 'rat_650':'rat_990'].replace(np.inf, 1)
df['rat_mean'] = rat_block.mean(axis=1)
df['rat_std'] = rat_block.std(axis=1)

In [None]:
# Row-wise max, min and standard deviation of the src and dst spectra.
# Select the bands by label: the former positional slices 1:36 / 36:71 were
# shifted by one column relative to the 2:37 / 37:72 slices used for the row
# sums, so they pulled in `rho` and left out the last band of each spectrum.
src_bands = df.loc[:, '650_src':'990_src']
dst_bands = df.loc[:, '650_dst':'990_dst']
df['src_max'] = src_bands.max(axis=1)
df['src_min'] = src_bands.min(axis=1)
df['dst_max'] = dst_bands.max(axis=1)
df['dst_min'] = dst_bands.min(axis=1)
df['src_std'] = src_bands.std(axis=1)
df['dst_std'] = dst_bands.std(axis=1)

In [None]:
# Row sums of the src and dst spectra, selected by column position.
# NOTE(review): assumes columns 2..36 are the src bands and 37..71 the dst bands — confirm against the CSV layout.
df['src_sum'] = df.iloc[:,2:37].sum(axis=1)
df['dst_sum'] = df.iloc[:,37:72].sum(axis=1)

In [None]:
# Ratio of the src row sum to the dst row sum.
df['sum_rate'] = df['src_sum'] / df['dst_sum']

## 흡광계수 만들기 (빛의 흡수율 활용)

src는 광원의 빛의 세기, dst는 측정된 빛의 세기이며, 이 두 값을 활용하여 흡광계수를 만든다.

In [None]:
# Absorption coefficient per band: log10(src / dst) divided by the distance rho.
for band in range(650, 1000, 10):
    intensity_ratio = df[f'{band}_src'] / df[f'{band}_dst']
    df[f'흡수계수_{band}'] = np.log10(intensity_ratio) / df.rho

In [None]:
# Row-wise mean over the last 35 columns of df, with +inf -> 1 and -inf -> -1.
last_35 = df.iloc[:, -35:]
last_35 = last_35.replace(np.inf, 1).replace(-np.inf, -1)
df['흡수평균'] = last_35.mean(axis=1)

In [None]:
# Row-wise standard deviation of the absorption coefficients (+inf -> 1, -inf -> -1).
# Select the coefficient columns by label: at this point the last 35 columns of
# df already include the appended '흡수평균' column, so the positional slice
# iloc[:, -35:] no longer covers exactly '흡수계수_650'..'흡수계수_990'.
df['흡수편차'] = df.loc[:, '흡수계수_650':'흡수계수_990'].replace(np.inf,1).replace(-np.inf,-1).std(axis=1)

In [None]:
# Row-wise sum of the absorption coefficients (+inf -> 1, -inf -> -1).
# Select the coefficient columns by label: summary columns have been appended
# after them, so the positional slice iloc[:, -35:] would include those
# summaries and miss the first coefficient columns.
df['흡수합'] = df.loc[:, '흡수계수_650':'흡수계수_990'].replace(np.inf,1).replace(-np.inf,-1).sum(axis=1)

In [None]:
# Inverse of the rat_* features: src value divided by dst value for every band.
for band in range(650, 1000, 10):
    emitted = df[f'{band}_src']
    measured = df[f'{band}_dst']
    df[f'rat_{band}_r'] = emitted / measured

In [None]:
# Inverse of the div_rate* features: src share divided by dst share, paired by column position.
for pos in range(src_rate.shape[1]):
    src_share = src_rate.iloc[:, pos]
    dst_share = dst_rate.iloc[:, pos]
    df[f'div_rate{pos}_r'] = src_share / dst_share

In [None]:
# Ratio of the dst row sum to the src row sum.
df['r_sum_rate'] = df['dst_sum']/ df['src_sum'] 

In [None]:
# Ratios between absorption coefficients at every band offset j (10..340):
# coefficient at band i+j divided by the coefficient at band i. The original
# author notes that these features gave a large score improvement.
# The 595 columns are collected first and attached with a single concat;
# inserting them one at a time fragments the frame.
nextrat_cols = {}
for j in range(10,350,10) :
    for i in range(650,1000-j,10) :
        nextrat_cols['흡수nextrat{}_{}'.format(i,j)] = df['흡수계수_{}'.format(i+j)] / df['흡수계수_{}'.format(i)]
df = pd.concat([df, pd.concat(nextrat_cols, axis=1)], axis=1)

###  hhb

In [None]:
# Work on a separate copy of the feature table for the hhb model.
df_hhb = df.copy()

In [None]:
# Replace every remaining NaN with 0 before training.
df_hhb = df_hhb.fillna(0)

In [None]:
# lgbm 모델을 돌리는데 dart를 사용 많은 양을 학습시킨 후 최소의 mae num_rounds를 찾아내 재학습 시행 이를 반복함.
# 모델을 학습시키며 성능의 감소를 가져오는 피쳐를 찾아내 제거. -> feature selection
# Columns excluded from the hhb model: the id, the raw src and dst bands
# (650..990) and the first-difference features (650..980).
# The names are generated instead of typed out; the hand-written list repeated
# '960_dst'..'990_dst'.
drop_col = (
    ['id']
    + ['{}_src'.format(band) for band in range(650, 1000, 10)]
    + ['{}_dst'.format(band) for band in range(650, 1000, 10)]
    + ['src_diff{}'.format(band) for band in range(650, 990, 10)]
    + ['dst_diff{}'.format(band) for band in range(650, 990, 10)]
)

In [None]:
# The first train.shape[0] rows of df_hhb are the training rows; the remaining rows are the test rows.
X_train = df_hhb.iloc[:train.shape[0],:].drop(columns=drop_col)
y_train = train.loc[:,"hhb"]
X_test = df_hhb.iloc[train.shape[0]:,:].drop(columns=drop_col)

In [None]:
import lightgbm as lgb#0.842541
from sklearn.model_selection import train_test_split

# Hold out 15% of the training rows as a validation split (fixed random_state).
train_x, train_cv, y, y_cv = train_test_split(X_train,y_train, test_size=0.15, random_state=14414)

def lgbm_regressor(train_X, train_y, test_X, test_y, feature_names=None, seed_val=288, num_rounds=49748):
    """Train a LightGBM DART regressor and return the fitted model.

    Parameters
    ----------
    train_X, train_y
        Features and target used to fit the model.
    test_X, test_y
        Hold-out features and target passed to ``lgb.train`` as the
        validation set.
    feature_names
        Unused; kept so existing callers keep working.
    seed_val
        Value stored under the ``seed`` parameter.
    num_rounds
        Number of boosting rounds requested from ``lgb.train``.

    Returns
    -------
    The object returned by ``lgb.train``.
    """
    param = {
        'boosting': 'dart',
        'objective': 'regression',
        'learning_rate': 0.05,
        'max_depth': 10,
        'metric': 'mae',
        'is_training_metric': True,
        'min_child_weight': 1,
        'bagging_fraction': 0.8,
        'num_leaves': 128,
        'feature_fraction': 0.8,
        'bagging_freq': 6,
        'seed': seed_val,
        'min_split_gain': 0.01,
    }

    train_ds = lgb.Dataset(train_X, label=train_y)
    test_ds = lgb.Dataset(test_X, label=test_y)

    # NOTE(review): LightGBM reports that early stopping is not available in
    # dart mode, and newer releases take early stopping through callbacks
    # rather than this keyword — confirm against the installed version.
    model = lgb.train(param, train_ds, num_rounds, test_ds, early_stopping_rounds=180)

    return model

# Re-apply the plot font settings (same calls as in the first cell).
plt.rc('font', family='Malgun Gothic')
plt.rc('axes', unicode_minus=False)


# Fit on the training split; the hold-out split is passed as the validation set.
model = lgbm_regressor(train_X = train_x, train_y = y, test_X = train_cv, test_y = y_cv)

# NOTE: from here on the name `plt` refers to matplotlib.pylab.
from matplotlib import pylab as plt

fig, ax = plt.subplots(figsize=(12,18))

# Plot at most 50 features on the importance chart.
lgb.plot_importance(model, max_num_features=50, height=0.8, ax=ax)

plt.show()

In [None]:
#49748 0.510459

In [None]:
# Predict hhb for the test rows and store the result in the submission frame.
y_test =  model.predict(X_test)
submission['hhb'] = y_test

###  hbo2

In [None]:
# Work on a separate copy of the feature table for the hbo2 model.
df_hbo2 = df.copy()

In [None]:
# Replace every remaining NaN with 0 before training.
df_hbo2 = df_hbo2.fillna(0)

In [None]:
# lgbm 모델을 돌리는데 dart를 사용 많은 양을 학습시킨 후 최소의 mae num_rounds를 찾아내 재학습 시행 이를 반복함.
# 모델을 학습시키며 성능의 감소를 가져오는 피쳐를 찾아내 제거.

# Columns excluded from the hbo2 model: the id, the raw src and dst bands
# (650..990), 'rho_src_mean', the first-difference features (650..980) and
# the row-sum features. The list is built once and used for both the training
# and the test frame, so the two selections cannot drift apart.
hbo2_drop_col = (
    ['id']
    + ['{}_src'.format(band) for band in range(650, 1000, 10)]
    + ['rho_src_mean']
    + ['{}_dst'.format(band) for band in range(650, 1000, 10)]
    + ['src_diff{}'.format(band) for band in range(650, 990, 10)]
    + ['dst_diff{}'.format(band) for band in range(650, 990, 10)]
    + ['src_sum', 'dst_sum', 'sum_rate']
)

# The first train.shape[0] rows are the training rows; the rest are the test rows.
X_train = df_hbo2.iloc[:train.shape[0],:].drop(columns=hbo2_drop_col)
y_train = train.loc[:,"hbo2"]
X_test = df_hbo2.iloc[train.shape[0]:,:].drop(columns=hbo2_drop_col)

In [None]:
# Display the shape of the hbo2 feature matrix (the trailing note presumably records its column count).
X_train.shape#451

In [None]:
import lightgbm as lgb#0.217199

from sklearn.model_selection import train_test_split

# Hold out 15% of the training rows as a validation split (fixed random_state).
train_x, train_cv, y, y_cv = train_test_split(X_train,y_train, test_size=0.15, random_state=14414)

def lgbm_regressor(train_X, train_y, test_X, test_y, feature_names=None, seed_val=288, num_rounds=15427):
    """Train a LightGBM DART regressor and return the fitted model.

    Parameters
    ----------
    train_X, train_y
        Features and target used to fit the model.
    test_X, test_y
        Hold-out features and target passed to ``lgb.train`` as the
        validation set.
    feature_names
        Unused; kept so existing callers keep working.
    seed_val
        Value stored under the ``seed`` parameter.
    num_rounds
        Number of boosting rounds requested from ``lgb.train``.

    Returns
    -------
    The object returned by ``lgb.train``.
    """
    param = {
        'boosting': 'dart',
        'objective': 'regression',
        'learning_rate': 0.05,
        'max_depth': 10,
        'metric': 'mae',
        'is_training_metric': True,
        'min_child_weight': 1,
        'bagging_fraction': 0.8,
        'num_leaves': 128,
        'feature_fraction': 0.8,
        'bagging_freq': 6,
        'seed': seed_val,
        'min_split_gain': 0.01,
    }

    train_ds = lgb.Dataset(train_X, label=train_y)
    test_ds = lgb.Dataset(test_X, label=test_y)

    # NOTE(review): LightGBM reports that early stopping is not available in
    # dart mode, and newer releases take early stopping through callbacks
    # rather than this keyword — confirm against the installed version.
    model = lgb.train(param, train_ds, num_rounds, test_ds, early_stopping_rounds=180)

    return model

# Re-apply the plot font settings (same calls as in the first cell).
plt.rc('font', family='Malgun Gothic')
plt.rc('axes', unicode_minus=False)


# Fit on the training split; the hold-out split is passed as the validation set.
model = lgbm_regressor(train_X = train_x, train_y = y, test_X = train_cv, test_y = y_cv)

# NOTE: from here on the name `plt` refers to matplotlib.pylab.
from matplotlib import pylab as plt

fig, ax = plt.subplots(figsize=(12,18))

# Plot at most 50 features on the importance chart.
lgb.plot_importance(model, max_num_features=50, height=0.8, ax=ax)

plt.show()

In [None]:
#15427 0.390309

In [None]:
# Predict hbo2 for the test rows and store the result in the submission frame.
y_test =  model.predict(X_test)
submission['hbo2'] = y_test

###  ca

In [None]:
# Work on a separate copy of the feature table for the ca model.
df_ca = df.copy()

In [None]:
# Keep the columns up to and including '흡수계수_990'; every column after it is dropped.
df_ca = df_ca.loc[:,:"흡수계수_990"]

In [None]:
# Ratios between absorption coefficients at every band offset j (10..340):
# coefficient at band i+j divided by the coefficient at band i.
# The 595 columns are collected first and attached with a single concat;
# inserting them one at a time fragments the frame.
ca_nextrat_cols = {}
for j in range(10,350,10) :
    for i in range(650,1000-j,10) :
        ca_nextrat_cols['흡수nextrat{}_{}'.format(i,j)] = df_ca['흡수계수_{}'.format(i+j)] / df_ca['흡수계수_{}'.format(i)]
df_ca = pd.concat([df_ca, pd.concat(ca_nextrat_cols, axis=1)], axis=1)

In [None]:
# Replace every remaining NaN with 0 before training.
df_ca = df_ca.fillna(0)

In [None]:
# lgbm 모델을 돌리는데 dart를 사용 많은 양을 학습시킨 후 최소의 mae num_rounds를 찾아내 재학습 시행 이를 반복함.
# 모델을 학습시키며 성능의 감소를 가져오는 피쳐를 찾아내 제거.

# Columns excluded from the ca model: the id, the raw src bands (650..990),
# the first-difference features (650..980), 'rho_src_mean', the raw dst bands
# (650..990) and the row-sum features. The list is built once and used for
# both the training and the test frame, so the two selections cannot drift apart.
ca_drop_col = (
    ['id']
    + ['{}_src'.format(band) for band in range(650, 1000, 10)]
    + ['src_diff{}'.format(band) for band in range(650, 990, 10)]
    + ['dst_diff{}'.format(band) for band in range(650, 990, 10)]
    + ['rho_src_mean']
    + ['{}_dst'.format(band) for band in range(650, 1000, 10)]
    + ['src_sum', 'dst_sum', 'sum_rate']
)

# The first train.shape[0] rows are the training rows; the rest are the test rows.
X_train = df_ca.iloc[:train.shape[0],:].drop(columns=ca_drop_col)
y_train = train.loc[:,"ca"]
X_test = df_ca.iloc[train.shape[0]:,:].drop(columns=ca_drop_col)

In [None]:
import lightgbm as lgb#0.217199

from sklearn.model_selection import train_test_split

# Hold out 15% of the training rows as a validation split (fixed random_state).
train_x, train_cv, y, y_cv = train_test_split(X_train,y_train, test_size=0.15, random_state=14414)

def lgbm_regressor(train_X, train_y, test_X, test_y, feature_names=None, seed_val=288, num_rounds=46909):
    """Train a LightGBM DART regressor and return the fitted model.

    Parameters
    ----------
    train_X, train_y
        Features and target used to fit the model.
    test_X, test_y
        Hold-out features and target passed to ``lgb.train`` as the
        validation set.
    feature_names
        Unused; kept so existing callers keep working.
    seed_val
        Value stored under the ``seed`` parameter.
    num_rounds
        Number of boosting rounds requested from ``lgb.train``.

    Returns
    -------
    The object returned by ``lgb.train``.
    """
    param = {
        'boosting': 'dart',
        'objective': 'regression',
        'learning_rate': 0.05,
        'max_depth': 10,
        'metric': 'mae',
        'is_training_metric': True,
        'min_child_weight': 1,
        'bagging_fraction': 0.8,
        'num_leaves': 128,
        'feature_fraction': 0.8,
        'bagging_freq': 6,
        'seed': seed_val,
        'min_split_gain': 0.01,
    }

    train_ds = lgb.Dataset(train_X, label=train_y)
    test_ds = lgb.Dataset(test_X, label=test_y)

    # NOTE(review): LightGBM reports that early stopping is not available in
    # dart mode, and newer releases take early stopping through callbacks
    # rather than this keyword — confirm against the installed version.
    model = lgb.train(param, train_ds, num_rounds, test_ds, early_stopping_rounds=180)

    return model

# Re-apply the plot font settings (same calls as in the first cell).
plt.rc('font', family='Malgun Gothic')
plt.rc('axes', unicode_minus=False)


# Fit on the training split; the hold-out split is passed as the validation set.
model = lgbm_regressor(train_X = train_x, train_y = y, test_X = train_cv, test_y = y_cv)

# NOTE: from here on the name `plt` refers to matplotlib.pylab.
from matplotlib import pylab as plt

fig, ax = plt.subplots(figsize=(12,18))

# Plot at most 50 features on the importance chart.
lgb.plot_importance(model, max_num_features=50, height=0.8, ax=ax)

plt.show()

In [None]:
# Predict ca for the test rows and store the result in the submission frame.
y_test =  model.predict(X_test)
submission['ca'] = y_test

In [None]:
#46909 1.2701

###  na

In [None]:
# Work on a separate copy of the feature table for the na model.
df_na = df.copy()

In [None]:
# Keep the columns up to and including '흡수계수_990'; every column after it is dropped.
df_na = df_na.loc[:,:"흡수계수_990"]

In [None]:
# Replace NaN with 0. Unlike the ca section, this runs before the ratio columns
# are rebuilt, so NaN or inf values produced by those ratios are not replaced here.
df_na = df_na.fillna(0)

In [None]:
# Ratios between absorption coefficients at every band offset j (10..340):
# coefficient at band i+j divided by the coefficient at band i.
# The 595 columns are collected first and attached with a single concat;
# inserting them one at a time fragments the frame.
na_nextrat_cols = {}
for j in range(10,350,10) :
    for i in range(650,1000-j,10) :
        na_nextrat_cols['흡수nextrat{}_{}'.format(i,j)] = df_na['흡수계수_{}'.format(i+j)] / df_na['흡수계수_{}'.format(i)]
df_na = pd.concat([df_na, pd.concat(na_nextrat_cols, axis=1)], axis=1)

In [None]:
# lgbm 모델을 돌리는데 dart를 사용 많은 양을 학습시킨 후 최소의 mae num_rounds를 찾아내 재학습 시행 이를 반복함.
# 모델을 학습시키며 성능의 감소를 가져오는 피쳐를 찾아내 제거.

# Columns excluded from the na model: the id, the raw src bands (650..990),
# the FFT imaginary and real parts of the dst bands, the first-difference
# features (650..980), 'rho_src_mean', 'rho_dst_mean', the raw dst bands
# (650..990) and the row-sum features. The list is built once and used for
# both the training and the test frame, so the two selections cannot drift apart.
na_drop_col = (
    ['id']
    + ['{}_src'.format(band) for band in range(650, 1000, 10)]
    + ['{}_dst_fft_imag'.format(band) for band in range(650, 1000, 10)]
    + ['{}_dst_fft_real'.format(band) for band in range(650, 1000, 10)]
    + ['src_diff{}'.format(band) for band in range(650, 990, 10)]
    + ['dst_diff{}'.format(band) for band in range(650, 990, 10)]
    + ['rho_src_mean', 'rho_dst_mean']
    + ['{}_dst'.format(band) for band in range(650, 1000, 10)]
    + ['src_sum', 'dst_sum', 'sum_rate']
)

# The first train.shape[0] rows are the training rows; the rest are the test rows.
X_train = df_na.iloc[:train.shape[0],:].drop(columns=na_drop_col)
y_train = train.loc[:,"na"]
X_test = df_na.iloc[train.shape[0]:,:].drop(columns=na_drop_col)

In [None]:
import lightgbm as lgb#0.217199

from sklearn.model_selection import train_test_split

# Hold out 15% of the training rows as a validation split (fixed random_state).
train_x, train_cv, y, y_cv = train_test_split(X_train,y_train, test_size=0.15, random_state=14414)

def lgbm_regressor(train_X, train_y, test_X, test_y, feature_names=None, seed_val=288, num_rounds=24713):
    """Train a LightGBM DART regressor and return the fitted model.

    Parameters
    ----------
    train_X, train_y
        Features and target used to fit the model.
    test_X, test_y
        Hold-out features and target passed to ``lgb.train`` as the
        validation set.
    feature_names
        Unused; kept so existing callers keep working.
    seed_val
        Value stored under the ``seed`` parameter.
    num_rounds
        Number of boosting rounds requested from ``lgb.train``.

    Returns
    -------
    The object returned by ``lgb.train``.
    """
    param = {
        'boosting': 'dart',
        'objective': 'regression',
        'learning_rate': 0.05,
        'max_depth': 10,
        'metric': 'mae',
        'is_training_metric': True,
        'min_child_weight': 1,
        'bagging_fraction': 0.8,
        'num_leaves': 128,
        'feature_fraction': 0.8,
        'bagging_freq': 6,
        'seed': seed_val,
        'min_split_gain': 0.01,
    }

    train_ds = lgb.Dataset(train_X, label=train_y)
    test_ds = lgb.Dataset(test_X, label=test_y)

    # NOTE(review): LightGBM reports that early stopping is not available in
    # dart mode, and newer releases take early stopping through callbacks
    # rather than this keyword — confirm against the installed version.
    model = lgb.train(param, train_ds, num_rounds, test_ds, early_stopping_rounds=180)

    return model

# Re-apply the plot font settings (same calls as in the first cell).
plt.rc('font', family='Malgun Gothic')
plt.rc('axes', unicode_minus=False)


# Fit on the training split; the hold-out split is passed as the validation set.
model = lgbm_regressor(train_X = train_x, train_y = y, test_X = train_cv, test_y = y_cv)

# NOTE: from here on the name `plt` refers to matplotlib.pylab.
from matplotlib import pylab as plt

fig, ax = plt.subplots(figsize=(12,18))

# Plot at most 50 features on the importance chart.
lgb.plot_importance(model, max_num_features=50, height=0.8, ax=ax)

plt.show()

In [None]:
# 24713 1.02729

In [None]:
# Predict na for the test rows and store the result in the submission frame.
y_test =  model.predict(X_test)
submission['na'] = y_test

In [None]:
def to_zero(x):
    """Return 0 when x is zero or negative; otherwise return x unchanged."""
    return 0 if x <= 0 else x

In [None]:
# Apply to_zero to every column of the submission so that no value is negative.
# NOTE(review): this also runs over any non-target column (e.g. an id column) — presumably harmless, confirm.
for i in submission.columns : 
    submission[i] = submission[i].apply(to_zero)

In [None]:
# Write the clipped predictions to edit_dst.csv without the index column.
submission.to_csv('edit_dst.csv',index=False)

In [None]:
# Keep a copy of this submission under the name `a` before `submission` is overwritten below.
a = submission.copy()

# CASE 2

###  hhb

In [None]:
# Work on a separate copy of the feature table for the second hhb model.
df_hhb = df.copy()

In [None]:
# Replace every remaining NaN with 0 before training.
df_hhb = df_hhb.fillna(0)

In [None]:
# lgbm 모델을 돌리는데 dart를 사용 많은 양을 학습시킨 후 최소의 mae num_rounds를 찾아내 재학습 시행 이를 반복함.
# 모델을 학습시키며 성능의 감소를 가져오는 피쳐를 찾아내 제거.

# Columns excluded from the hhb model: the id, the raw src and dst bands
# (650..990) and the first-difference features (650..980).
# The names are generated instead of typed out; the hand-written list repeated
# '960_dst'..'990_dst'.
drop_col = (
    ['id']
    + ['{}_src'.format(band) for band in range(650, 1000, 10)]
    + ['{}_dst'.format(band) for band in range(650, 1000, 10)]
    + ['src_diff{}'.format(band) for band in range(650, 990, 10)]
    + ['dst_diff{}'.format(band) for band in range(650, 990, 10)]
)

In [None]:
# The first train.shape[0] rows of df_hhb are the training rows; the remaining rows are the test rows.
X_train = df_hhb.iloc[:train.shape[0],:].drop(columns=drop_col)
y_train = train.loc[:,"hhb"]
X_test = df_hhb.iloc[train.shape[0]:,:].drop(columns=drop_col)

In [None]:
# X_train.columns

In [None]:
import lightgbm as lgb#0.842541
from sklearn.model_selection import train_test_split

# Hold out 15% of the training rows as a validation split (fixed seed).
train_x, train_cv, y, y_cv = train_test_split(X_train, y_train, test_size=0.15, random_state=998)


def lgbm_regressor(train_X, train_y, test_X, test_y, feature_names=None, seed_val=288, num_rounds=52956):
    """Train a LightGBM DART regressor and return the trained model.

    Args:
        train_X: Features used for fitting.
        train_y: Targets aligned with ``train_X``.
        test_X: Features of the validation split.
        test_y: Targets aligned with ``test_X``.
        feature_names: Unused; kept so existing call sites keep working.
        seed_val: Value passed as the LightGBM ``seed`` parameter.
        num_rounds: Number of boosting rounds passed to ``lgb.train``.

    Returns:
        Whatever ``lgb.train`` returns for these settings.
    """
    param = {
        'boosting': 'dart',
        'objective': 'regression',
        'learning_rate': 0.05,
        'max_depth': 10,
        'metric': 'mae',
        'is_training_metric': True,
        'min_child_weight': 1,
        'bagging_fraction': 0.8,
        'num_leaves': 128,
        'feature_fraction': 0.8,
        'bagging_freq': 6,
        'seed': seed_val,
        'min_split_gain': 0.01,
    }

    train_ds = lgb.Dataset(train_X, label=train_y)
    test_ds = lgb.Dataset(test_X, label=test_y)

    # NOTE(review): LightGBM is documented to skip early stopping when
    # boosting is 'dart', so early_stopping_rounds likely has no effect and
    # all num_rounds rounds are trained — confirm. Newer LightGBM releases
    # also expect callbacks=[lgb.early_stopping(...)] instead of this keyword.
    return lgb.train(param, train_ds, num_rounds, test_ds, early_stopping_rounds=180)


# Korean-capable font so labels render; keep the minus sign as ASCII.
plt.rc('font', family='Malgun Gothic')
plt.rc('axes', unicode_minus=False)

model = lgbm_regressor(train_X=train_x, train_y=y, test_X=train_cv, test_y=y_cv)

# Rebinds plt to matplotlib.pylab from here on (kept as in the original).
from matplotlib import pylab as plt

# Plot the 50 most important features of the trained model.
fig, ax = plt.subplots(figsize=(12, 18))
lgb.plot_importance(model, max_num_features=50, height=0.8, ax=ax)
plt.show()

In [None]:
# Predict on the test features with the model trained above and store the
# result in the 'hhb' column of the submission frame.
y_test = model.predict(X_test)
submission['hhb'] = y_test

###  hbo2

In [None]:
# Work on an independent copy so df itself is left untouched.
df_hbo2 = df.copy(deep=True)

In [None]:
# Replace every missing value with 0.
df_hbo2 = df_hbo2.fillna(value=0)

In [None]:
# Workflow: run LightGBM in dart mode for many rounds, look up the
# num_rounds with the lowest MAE, retrain with it, and repeat.
# Features that hurt performance during these runs are removed below.

# Columns excluded from the hbo2 model. The list is built once and shared by
# the train and test frames so both always end up with the same features.
hbo2_drop_col = (
    ['id']
    + ['{}_src'.format(w) for w in range(650, 1000, 10)]      # 650_src .. 990_src
    + ['rho_src_mean']
    + ['{}_dst'.format(w) for w in range(650, 1000, 10)]      # 650_dst .. 990_dst
    + ['src_diff{}'.format(w) for w in range(650, 990, 10)]   # src_diff650 .. src_diff980
    + ['dst_diff{}'.format(w) for w in range(650, 990, 10)]   # dst_diff650 .. dst_diff980
    + ['src_sum', 'dst_sum', 'sum_rate']
)

# The first train.shape[0] rows are used for training, the rest for testing.
X_train = df_hbo2.iloc[:train.shape[0], :].drop(columns=hbo2_drop_col)
y_train = train.loc[:, "hbo2"]
X_test = df_hbo2.iloc[train.shape[0]:, :].drop(columns=hbo2_drop_col)

In [None]:
import lightgbm as lgb#0.217199

from sklearn.model_selection import train_test_split

# Hold out 15% of the training rows as a validation split (fixed seed).
train_x, train_cv, y, y_cv = train_test_split(X_train, y_train, test_size=0.15, random_state=998)


def lgbm_regressor(train_X, train_y, test_X, test_y, feature_names=None, seed_val=288, num_rounds=32581):
    """Train a LightGBM DART regressor and return the trained model.

    Args:
        train_X: Features used for fitting.
        train_y: Targets aligned with ``train_X``.
        test_X: Features of the validation split.
        test_y: Targets aligned with ``test_X``.
        feature_names: Unused; kept so existing call sites keep working.
        seed_val: Value passed as the LightGBM ``seed`` parameter.
        num_rounds: Number of boosting rounds passed to ``lgb.train``.

    Returns:
        Whatever ``lgb.train`` returns for these settings.
    """
    param = {
        'boosting': 'dart',
        'objective': 'regression',
        'learning_rate': 0.05,
        'max_depth': 10,
        'metric': 'mae',
        'is_training_metric': True,
        'min_child_weight': 1,
        'bagging_fraction': 0.8,
        'num_leaves': 128,
        'feature_fraction': 0.8,
        'bagging_freq': 6,
        'seed': seed_val,
        'min_split_gain': 0.01,
    }

    train_ds = lgb.Dataset(train_X, label=train_y)
    test_ds = lgb.Dataset(test_X, label=test_y)

    # NOTE(review): LightGBM is documented to skip early stopping when
    # boosting is 'dart', so early_stopping_rounds likely has no effect and
    # all num_rounds rounds are trained — confirm. Newer LightGBM releases
    # also expect callbacks=[lgb.early_stopping(...)] instead of this keyword.
    return lgb.train(param, train_ds, num_rounds, test_ds, early_stopping_rounds=180)


# Korean-capable font so labels render; keep the minus sign as ASCII.
plt.rc('font', family='Malgun Gothic')
plt.rc('axes', unicode_minus=False)

model = lgbm_regressor(train_X=train_x, train_y=y, test_X=train_cv, test_y=y_cv)

# Rebinds plt to matplotlib.pylab from here on (kept as in the original).
from matplotlib import pylab as plt

# Plot the 50 most important features of the trained model.
fig, ax = plt.subplots(figsize=(12, 18))
lgb.plot_importance(model, max_num_features=50, height=0.8, ax=ax)
plt.show()

In [None]:
# Predict on the test features with the model trained above and store the
# result in the 'hbo2' column of the submission frame.
y_test = model.predict(X_test)
submission['hbo2'] = y_test

###  ca

In [None]:
# Work on an independent copy so df itself is left untouched.
df_ca = df.copy(deep=True)

In [None]:
# Keep every column from the first one through '흡수계수_990'
# (label slices with .loc include the end label).
df_ca = df_ca.loc[:, :"흡수계수_990"]

In [None]:
# For every wavelength gap (10 .. 340) and every start wavelength whose
# partner stays at or below 990, add the ratio
# 흡수계수_{start + gap} / 흡수계수_{start} as a new column.
for gap in range(10, 350, 10):
    for start in range(650, 1000 - gap, 10):
        upper = df_ca['흡수계수_{}'.format(start + gap)]
        lower = df_ca['흡수계수_{}'.format(start)]
        df_ca['흡수nextrat{}_{}'.format(start, gap)] = upper / lower

In [None]:
# Replace every missing value with 0.
# NOTE(review): fillna does not replace +/-inf, which a division by a zero
# 흡수계수 value would produce — confirm zeros cannot occur there.
df_ca = df_ca.fillna(value=0)

In [None]:
# Workflow: run LightGBM in dart mode for many rounds, look up the
# num_rounds with the lowest MAE, retrain with it, and repeat.
# Features that hurt performance during these runs are removed below.

# Columns excluded from the ca model. The list is built once and shared by
# the train and test frames so both always end up with the same features.
ca_drop_col = (
    ['id']
    + ['{}_src'.format(w) for w in range(650, 1000, 10)]      # 650_src .. 990_src
    + ['src_diff{}'.format(w) for w in range(650, 990, 10)]   # src_diff650 .. src_diff980
    + ['dst_diff{}'.format(w) for w in range(650, 990, 10)]   # dst_diff650 .. dst_diff980
    + ['rho_src_mean']
    + ['{}_dst'.format(w) for w in range(650, 1000, 10)]      # 650_dst .. 990_dst
    + ['src_sum', 'dst_sum', 'sum_rate']
)

# The first train.shape[0] rows are used for training, the rest for testing.
X_train = df_ca.iloc[:train.shape[0], :].drop(columns=ca_drop_col)
y_train = train.loc[:, "ca"]
X_test = df_ca.iloc[train.shape[0]:, :].drop(columns=ca_drop_col)

In [None]:
import lightgbm as lgb#0.217199

from sklearn.model_selection import train_test_split

# Hold out 15% of the training rows as a validation split (fixed seed).
train_x, train_cv, y, y_cv = train_test_split(X_train, y_train, test_size=0.15, random_state=998)


def lgbm_regressor(train_X, train_y, test_X, test_y, feature_names=None, seed_val=288, num_rounds=30530):
    """Train a LightGBM DART regressor and return the trained model.

    Args:
        train_X: Features used for fitting.
        train_y: Targets aligned with ``train_X``.
        test_X: Features of the validation split.
        test_y: Targets aligned with ``test_X``.
        feature_names: Unused; kept so existing call sites keep working.
        seed_val: Value passed as the LightGBM ``seed`` parameter.
        num_rounds: Number of boosting rounds passed to ``lgb.train``.

    Returns:
        Whatever ``lgb.train`` returns for these settings.
    """
    param = {
        'boosting': 'dart',
        'objective': 'regression',
        'learning_rate': 0.05,
        'max_depth': 10,
        'metric': 'mae',
        'is_training_metric': True,
        'min_child_weight': 1,
        'bagging_fraction': 0.8,
        'num_leaves': 128,
        'feature_fraction': 0.8,
        'bagging_freq': 6,
        'seed': seed_val,
        'min_split_gain': 0.01,
    }

    train_ds = lgb.Dataset(train_X, label=train_y)
    test_ds = lgb.Dataset(test_X, label=test_y)

    # NOTE(review): LightGBM is documented to skip early stopping when
    # boosting is 'dart', so early_stopping_rounds likely has no effect and
    # all num_rounds rounds are trained — confirm. Newer LightGBM releases
    # also expect callbacks=[lgb.early_stopping(...)] instead of this keyword.
    return lgb.train(param, train_ds, num_rounds, test_ds, early_stopping_rounds=180)


# Korean-capable font so labels render; keep the minus sign as ASCII.
plt.rc('font', family='Malgun Gothic')
plt.rc('axes', unicode_minus=False)

model = lgbm_regressor(train_X=train_x, train_y=y, test_X=train_cv, test_y=y_cv)

# Rebinds plt to matplotlib.pylab from here on (kept as in the original).
from matplotlib import pylab as plt

# Plot the 50 most important features of the trained model.
fig, ax = plt.subplots(figsize=(12, 18))
lgb.plot_importance(model, max_num_features=50, height=0.8, ax=ax)
plt.show()

In [None]:
# Predict on the test features with the model trained above and store the
# result in the 'ca' column of the submission frame.
y_test = model.predict(X_test)
submission['ca'] = y_test

###  na

In [None]:
# Work on an independent copy so df itself is left untouched.
df_na = df.copy(deep=True)

In [None]:
# Keep every column from the first one through '흡수계수_990'
# (label slices with .loc include the end label).
df_na = df_na.loc[:, :"흡수계수_990"]

In [None]:
# Replace every missing value with 0.
# NOTE(review): unlike the ca section, this fill runs BEFORE the ratio
# features are created, so NaN/inf produced by those divisions are not
# replaced — confirm this ordering is intended.
df_na = df_na.fillna(value=0)

In [None]:
# For every wavelength gap (10 .. 340) and every start wavelength whose
# partner stays at or below 990, add the ratio
# 흡수계수_{start + gap} / 흡수계수_{start} as a new column.
for gap in range(10, 350, 10):
    for start in range(650, 1000 - gap, 10):
        upper = df_na['흡수계수_{}'.format(start + gap)]
        lower = df_na['흡수계수_{}'.format(start)]
        df_na['흡수nextrat{}_{}'.format(start, gap)] = upper / lower

In [None]:
# Workflow: run LightGBM in dart mode for many rounds, look up the
# num_rounds with the lowest MAE, retrain with it, and repeat.
# Features that hurt performance during these runs are removed below.

# Columns excluded from the na model. The list is built once and shared by
# the train and test frames so both always end up with the same features.
na_drop_col = (
    ['id']
    + ['{}_src'.format(w) for w in range(650, 1000, 10)]            # 650_src .. 990_src
    + ['{}_dst_fft_imag'.format(w) for w in range(650, 1000, 10)]   # 650 .. 990
    + ['{}_dst_fft_real'.format(w) for w in range(650, 1000, 10)]   # 650 .. 990
    + ['src_diff{}'.format(w) for w in range(650, 990, 10)]         # src_diff650 .. src_diff980
    + ['dst_diff{}'.format(w) for w in range(650, 990, 10)]         # dst_diff650 .. dst_diff980
    + ['rho_src_mean', 'rho_dst_mean']
    + ['{}_dst'.format(w) for w in range(650, 1000, 10)]            # 650_dst .. 990_dst
    + ['src_sum', 'dst_sum', 'sum_rate']
)

# The first train.shape[0] rows are used for training, the rest for testing.
X_train = df_na.iloc[:train.shape[0], :].drop(columns=na_drop_col)
y_train = train.loc[:, "na"]
X_test = df_na.iloc[train.shape[0]:, :].drop(columns=na_drop_col)

In [None]:
import lightgbm as lgb#0.217199

from sklearn.model_selection import train_test_split

# Hold out 15% of the training rows as a validation split (fixed seed).
train_x, train_cv, y, y_cv = train_test_split(X_train, y_train, test_size=0.15, random_state=998)


def lgbm_regressor(train_X, train_y, test_X, test_y, feature_names=None, seed_val=288, num_rounds=11274):
    """Train a LightGBM DART regressor and return the trained model.

    Args:
        train_X: Features used for fitting.
        train_y: Targets aligned with ``train_X``.
        test_X: Features of the validation split.
        test_y: Targets aligned with ``test_X``.
        feature_names: Unused; kept so existing call sites keep working.
        seed_val: Value passed as the LightGBM ``seed`` parameter.
        num_rounds: Number of boosting rounds passed to ``lgb.train``.

    Returns:
        Whatever ``lgb.train`` returns for these settings.
    """
    param = {
        'boosting': 'dart',
        'objective': 'regression',
        'learning_rate': 0.05,
        'max_depth': 10,
        'metric': 'mae',
        'is_training_metric': True,
        'min_child_weight': 1,
        'bagging_fraction': 0.8,
        'num_leaves': 128,
        'feature_fraction': 0.8,
        'bagging_freq': 6,
        'seed': seed_val,
        'min_split_gain': 0.01,
    }

    train_ds = lgb.Dataset(train_X, label=train_y)
    test_ds = lgb.Dataset(test_X, label=test_y)

    # NOTE(review): LightGBM is documented to skip early stopping when
    # boosting is 'dart', so early_stopping_rounds likely has no effect and
    # all num_rounds rounds are trained — confirm. Newer LightGBM releases
    # also expect callbacks=[lgb.early_stopping(...)] instead of this keyword.
    return lgb.train(param, train_ds, num_rounds, test_ds, early_stopping_rounds=180)


# Korean-capable font so labels render; keep the minus sign as ASCII.
plt.rc('font', family='Malgun Gothic')
plt.rc('axes', unicode_minus=False)

model = lgbm_regressor(train_X=train_x, train_y=y, test_X=train_cv, test_y=y_cv)

# Rebinds plt to matplotlib.pylab from here on (kept as in the original).
from matplotlib import pylab as plt

# Plot the 50 most important features of the trained model.
fig, ax = plt.subplots(figsize=(12, 18))
lgb.plot_importance(model, max_num_features=50, height=0.8, ax=ax)
plt.show()

In [None]:
# Predict on the test features with the model trained above and store the
# result in the 'na' column of the submission frame.
y_test = model.predict(X_test)
submission['na'] = y_test

In [None]:
def to_zero(x):
    """Return 0 when x is zero or negative, otherwise return x unchanged."""
    return 0 if x <= 0 else x

In [None]:
# Clamp values at or below zero to 0 in every column of the submission frame
# (the loop visits all columns, not only the four predicted targets).
for column in submission.columns:
    submission[column] = submission[column].apply(to_zero)

In [None]:
# Write the clamped predictions to disk without the index column.
submission.to_csv('edit_dst_998.csv', index=False)

In [None]:
# Snapshot the current submission; the cells that follow overwrite its columns.
b = submission.copy(deep=True)

## case3

###  hhb

In [None]:
# Work on an independent copy so df itself is left untouched.
df_hhb = df.copy(deep=True)

In [None]:
# Replace every missing value with 0.
df_hhb = df_hhb.fillna(value=0)

In [None]:
# Workflow: run LightGBM in dart mode for many rounds, look up the
# num_rounds with the lowest MAE, retrain with it, and repeat.
# Features that hurt performance during these runs are removed below.

# Columns excluded from the hhb model. Built programmatically so every name
# appears exactly once (the hand-written list repeated 960_dst .. 990_dst).
drop_col = (
    ['id']
    + ['{}_src'.format(w) for w in range(650, 1000, 10)]      # 650_src .. 990_src
    + ['{}_dst'.format(w) for w in range(650, 1000, 10)]      # 650_dst .. 990_dst
    + ['src_diff{}'.format(w) for w in range(650, 990, 10)]   # src_diff650 .. src_diff980
    + ['dst_diff{}'.format(w) for w in range(650, 990, 10)]   # dst_diff650 .. dst_diff980
)

In [None]:
# Split the stacked feature frame back into its train / test parts by row
# position (train rows come first in df) and remove the columns in drop_col.
X_train = df_hhb.iloc[:len(train)].drop(columns=drop_col)
y_train = train["hhb"]
X_test = df_hhb.iloc[len(train):].drop(columns=drop_col)

In [None]:
import lightgbm as lgb#0.842541
from sklearn.model_selection import train_test_split

# Hold out 15% of the training rows as a validation split (fixed seed).
train_x, train_cv, y, y_cv = train_test_split(X_train, y_train, test_size=0.15, random_state=22882)


def lgbm_regressor(train_X, train_y, test_X, test_y, feature_names=None, seed_val=288, num_rounds=41419):
    """Train a LightGBM DART regressor and return the trained model.

    Args:
        train_X: Features used for fitting.
        train_y: Targets aligned with ``train_X``.
        test_X: Features of the validation split.
        test_y: Targets aligned with ``test_X``.
        feature_names: Unused; kept so existing call sites keep working.
        seed_val: Value passed as the LightGBM ``seed`` parameter.
        num_rounds: Number of boosting rounds passed to ``lgb.train``.

    Returns:
        Whatever ``lgb.train`` returns for these settings.
    """
    param = {
        'boosting': 'dart',
        'objective': 'regression',
        'learning_rate': 0.05,
        'max_depth': 10,
        'metric': 'mae',
        'is_training_metric': True,
        'min_child_weight': 1,
        'bagging_fraction': 0.8,
        'num_leaves': 128,
        'feature_fraction': 0.8,
        'bagging_freq': 6,
        'seed': seed_val,
        'min_split_gain': 0.01,
    }

    train_ds = lgb.Dataset(train_X, label=train_y)
    test_ds = lgb.Dataset(test_X, label=test_y)

    # NOTE(review): LightGBM is documented to skip early stopping when
    # boosting is 'dart', so early_stopping_rounds likely has no effect and
    # all num_rounds rounds are trained — confirm. Newer LightGBM releases
    # also expect callbacks=[lgb.early_stopping(...)] instead of this keyword.
    return lgb.train(param, train_ds, num_rounds, test_ds, early_stopping_rounds=180)


# Korean-capable font so labels render; keep the minus sign as ASCII.
plt.rc('font', family='Malgun Gothic')
plt.rc('axes', unicode_minus=False)

model = lgbm_regressor(train_X=train_x, train_y=y, test_X=train_cv, test_y=y_cv)

# Rebinds plt to matplotlib.pylab from here on (kept as in the original).
from matplotlib import pylab as plt

# Plot the 50 most important features of the trained model.
fig, ax = plt.subplots(figsize=(12, 18))
lgb.plot_importance(model, max_num_features=50, height=0.8, ax=ax)
plt.show()

In [None]:
# Predict on the test features with the model trained above and store the
# result in the 'hhb' column of the submission frame.
y_test = model.predict(X_test)
submission['hhb'] = y_test

###  hbo2

In [None]:
# Work on an independent copy so df itself is left untouched.
df_hbo2 = df.copy(deep=True)

In [None]:
# Replace every missing value with 0.
df_hbo2 = df_hbo2.fillna(value=0)

In [None]:
# Workflow: run LightGBM in dart mode for many rounds, look up the
# num_rounds with the lowest MAE, retrain with it, and repeat.
# Features that hurt performance during these runs are removed below.

# Columns excluded from the hbo2 model. The list is built once and shared by
# the train and test frames so both always end up with the same features.
hbo2_drop_col = (
    ['id']
    + ['{}_src'.format(w) for w in range(650, 1000, 10)]      # 650_src .. 990_src
    + ['rho_src_mean']
    + ['{}_dst'.format(w) for w in range(650, 1000, 10)]      # 650_dst .. 990_dst
    + ['src_diff{}'.format(w) for w in range(650, 990, 10)]   # src_diff650 .. src_diff980
    + ['dst_diff{}'.format(w) for w in range(650, 990, 10)]   # dst_diff650 .. dst_diff980
    + ['src_sum', 'dst_sum', 'sum_rate']
)

# The first train.shape[0] rows are used for training, the rest for testing.
X_train = df_hbo2.iloc[:train.shape[0], :].drop(columns=hbo2_drop_col)
y_train = train.loc[:, "hbo2"]
X_test = df_hbo2.iloc[train.shape[0]:, :].drop(columns=hbo2_drop_col)

In [None]:
import lightgbm as lgb#0.217199

from sklearn.model_selection import train_test_split

# Hold out 15% of the training rows as a validation split (fixed seed).
train_x, train_cv, y, y_cv = train_test_split(X_train, y_train, test_size=0.15, random_state=22882)


def lgbm_regressor(train_X, train_y, test_X, test_y, feature_names=None, seed_val=288, num_rounds=21921):
    """Train a LightGBM DART regressor and return the trained model.

    Args:
        train_X: Features used for fitting.
        train_y: Targets aligned with ``train_X``.
        test_X: Features of the validation split.
        test_y: Targets aligned with ``test_X``.
        feature_names: Unused; kept so existing call sites keep working.
        seed_val: Value passed as the LightGBM ``seed`` parameter.
        num_rounds: Number of boosting rounds passed to ``lgb.train``.

    Returns:
        Whatever ``lgb.train`` returns for these settings.
    """
    param = {
        'boosting': 'dart',
        'objective': 'regression',
        'learning_rate': 0.05,
        'max_depth': 10,
        'metric': 'mae',
        'is_training_metric': True,
        'min_child_weight': 1,
        'bagging_fraction': 0.8,
        'num_leaves': 128,
        'feature_fraction': 0.8,
        'bagging_freq': 6,
        'seed': seed_val,
        'min_split_gain': 0.01,
    }

    train_ds = lgb.Dataset(train_X, label=train_y)
    test_ds = lgb.Dataset(test_X, label=test_y)

    # NOTE(review): LightGBM is documented to skip early stopping when
    # boosting is 'dart', so early_stopping_rounds likely has no effect and
    # all num_rounds rounds are trained — confirm. Newer LightGBM releases
    # also expect callbacks=[lgb.early_stopping(...)] instead of this keyword.
    return lgb.train(param, train_ds, num_rounds, test_ds, early_stopping_rounds=180)


# Korean-capable font so labels render; keep the minus sign as ASCII.
plt.rc('font', family='Malgun Gothic')
plt.rc('axes', unicode_minus=False)

model = lgbm_regressor(train_X=train_x, train_y=y, test_X=train_cv, test_y=y_cv)

# Rebinds plt to matplotlib.pylab from here on (kept as in the original).
from matplotlib import pylab as plt

# Plot the 50 most important features of the trained model.
fig, ax = plt.subplots(figsize=(12, 18))
lgb.plot_importance(model, max_num_features=50, height=0.8, ax=ax)
plt.show()

In [None]:
# Predict hbo2 for the test rows and store the result in the submission frame.
y_test = submission['hbo2'] = model.predict(X_test)

###  ca

In [None]:
# Build the ca features on a copy so the shared df is left untouched.
df_ca = df.copy()

In [None]:
# Keep the columns up to and including '흡수계수_990' (label slices are
# inclusive); every column after it is discarded.
df_ca = df_ca.loc[:,:"흡수계수_990"]

In [None]:
# Ratio features: for each offset j and base index i, divide the
# 흡수계수 (absorption coefficient) column at i+j by the one at i.
for j in range(10, 350, 10):
    for i in range(650, 1000 - j, 10):
        numer = df_ca['흡수계수_{}'.format(i + j)]
        denom = df_ca['흡수계수_{}'.format(i)]
        df_ca['흡수nextrat{}_{}'.format(i, j)] = numer / denom

In [None]:
# Replace every NaN with 0, including NaN ratios produced by the division above.
df_ca = df_ca.fillna(0)

In [None]:
# Run LightGBM with DART boosting: train for a large number of rounds, find the
# num_rounds with the lowest MAE, retrain with it, and repeat.
# While training, identify features that reduce performance and remove them.

# Columns excluded from the ca model. Built once from the wavelength grid so
# the train and test frames are guaranteed to drop the same columns.
ca_bands = range(650, 1000, 10)      # 650, 660, ..., 990
ca_diff_bands = range(650, 990, 10)  # 650, 660, ..., 980
ca_drop_cols = (
    ['id']
    + ['{}_src'.format(w) for w in ca_bands]
    + ['src_diff{}'.format(w) for w in ca_diff_bands]
    + ['dst_diff{}'.format(w) for w in ca_diff_bands]
    + ['rho_src_mean']
    + ['{}_dst'.format(w) for w in ca_bands]
    + ['src_sum', 'dst_sum', 'sum_rate']
)

# The first train.shape[0] rows are the training rows, the rest are test rows.
X_train = df_ca.iloc[:train.shape[0], :].drop(columns=ca_drop_cols)
y_train = train.loc[:, "ca"]
X_test = df_ca.iloc[train.shape[0]:, :].drop(columns=ca_drop_cols)

In [None]:
import lightgbm as lgb#0.217199

from sklearn.model_selection import train_test_split

# Hold out 15% of the training rows as a validation split; the fixed
# random_state makes the shuffle reproducible.
train_x, train_cv, y, y_cv = train_test_split(
    X_train,
    y_train,
    test_size=0.15,
    random_state=22882,
)

def lgbm_regressor(train_X, train_y, test_X, test_y, feature_names=None, seed_val=288, num_rounds=24330):
    """Train a LightGBM regressor with DART boosting and return the result.

    Args:
        train_X: Features used for fitting.
        train_y: Targets aligned with ``train_X``.
        test_X: Features of the validation split.
        test_y: Targets aligned with ``test_X``.
        feature_names: Accepted for call compatibility; not used.
        seed_val: Value passed to LightGBM as ``seed``.
        num_rounds: Number of boosting rounds requested from ``lgb.train``.

    Returns:
        The object returned by ``lgb.train``.
    """
    params = {
        'boosting': 'dart',
        'objective': 'regression',
        'learning_rate': 0.05,
        'max_depth': 10,
        'metric': 'mae',
        'is_training_metric': True,
        'min_child_weight': 1,
        'bagging_fraction': 0.8,
        'num_leaves': 128,
        'feature_fraction': 0.8,
        'bagging_freq': 6,
        'seed': seed_val,
        'min_split_gain': 0.01,
    }

    train_ds = lgb.Dataset(train_X, label=train_y)
    valid_ds = lgb.Dataset(test_X, label=test_y)

    # NOTE(review): LightGBM's early-stopping callback reports that early
    # stopping is not available in dart mode, so early_stopping_rounds is
    # presumably ignored and all num_rounds iterations run -- confirm against
    # the installed LightGBM version.
    return lgb.train(
        params,
        train_ds,
        num_boost_round=num_rounds,
        valid_sets=valid_ds,
        early_stopping_rounds=180,
    )

# Font able to render the Hangul feature names; keep an ASCII minus sign.
plt.rc('font', family='Malgun Gothic')
plt.rc('axes', unicode_minus=False)

# Fit on the training split and evaluate on the held-out split.
model = lgbm_regressor(train_X=train_x, train_y=y, test_X=train_cv, test_y=y_cv)

from matplotlib import pylab as plt  # NOTE: rebinds plt to matplotlib.pylab

# Bar chart of the 50 most important features.
fig, ax = plt.subplots(figsize=(12, 18))
lgb.plot_importance(model, ax=ax, height=0.8, max_num_features=50)
plt.show()

In [None]:
# Predict ca for the test rows and store the result in the submission frame.
y_test = submission['ca'] = model.predict(X_test)

###  na

In [None]:
# Build the na features on a copy so the shared df is left untouched.
df_na = df.copy()

In [None]:
# Keep the columns up to and including '흡수계수_990' (label slices are
# inclusive); every column after it is discarded.
df_na = df_na.loc[:,:"흡수계수_990"]

In [None]:
# Replace NaN with 0 before the ratio features are built; the ratios created
# afterwards are not filled again in this section.
df_na = df_na.fillna(0)

In [None]:
# Ratio features: for each offset j and base index i, divide the
# 흡수계수 (absorption coefficient) column at i+j by the one at i.
for j in range(10, 350, 10):
    for i in range(650, 1000 - j, 10):
        numer = df_na['흡수계수_{}'.format(i + j)]
        denom = df_na['흡수계수_{}'.format(i)]
        df_na['흡수nextrat{}_{}'.format(i, j)] = numer / denom

In [None]:
# Run LightGBM with DART boosting: train for a large number of rounds, find the
# num_rounds with the lowest MAE, retrain with it, and repeat.
# While training, identify features that reduce performance and remove them.

# Columns excluded from the na model. Built once from the wavelength grid so
# the train and test frames are guaranteed to drop the same columns.
na_bands = range(650, 1000, 10)      # 650, 660, ..., 990
na_diff_bands = range(650, 990, 10)  # 650, 660, ..., 980
na_drop_cols = (
    ['id']
    + ['{}_src'.format(w) for w in na_bands]
    + ['{}_dst_fft_imag'.format(w) for w in na_bands]
    + ['{}_dst_fft_real'.format(w) for w in na_bands]
    + ['src_diff{}'.format(w) for w in na_diff_bands]
    + ['dst_diff{}'.format(w) for w in na_diff_bands]
    + ['rho_src_mean', 'rho_dst_mean']
    + ['{}_dst'.format(w) for w in na_bands]
    + ['src_sum', 'dst_sum', 'sum_rate']
)

# The first train.shape[0] rows are the training rows, the rest are test rows.
X_train = df_na.iloc[:train.shape[0], :].drop(columns=na_drop_cols)
y_train = train.loc[:, "na"]
X_test = df_na.iloc[train.shape[0]:, :].drop(columns=na_drop_cols)

In [None]:
import lightgbm as lgb#0.217199

from sklearn.model_selection import train_test_split

# Hold out 15% of the training rows as a validation split; the fixed
# random_state makes the shuffle reproducible.
train_x, train_cv, y, y_cv = train_test_split(
    X_train,
    y_train,
    test_size=0.15,
    random_state=22882,
)

def lgbm_regressor(train_X, train_y, test_X, test_y, feature_names=None, seed_val=288, num_rounds=22076):
    """Train a LightGBM regressor with DART boosting and return the result.

    Args:
        train_X: Features used for fitting.
        train_y: Targets aligned with ``train_X``.
        test_X: Features of the validation split.
        test_y: Targets aligned with ``test_X``.
        feature_names: Accepted for call compatibility; not used.
        seed_val: Value passed to LightGBM as ``seed``.
        num_rounds: Number of boosting rounds requested from ``lgb.train``.

    Returns:
        The object returned by ``lgb.train``.
    """
    params = {
        'boosting': 'dart',
        'objective': 'regression',
        'learning_rate': 0.05,
        'max_depth': 10,
        'metric': 'mae',
        'is_training_metric': True,
        'min_child_weight': 1,
        'bagging_fraction': 0.8,
        'num_leaves': 128,
        'feature_fraction': 0.8,
        'bagging_freq': 6,
        'seed': seed_val,
        'min_split_gain': 0.01,
    }

    train_ds = lgb.Dataset(train_X, label=train_y)
    valid_ds = lgb.Dataset(test_X, label=test_y)

    # NOTE(review): LightGBM's early-stopping callback reports that early
    # stopping is not available in dart mode, so early_stopping_rounds is
    # presumably ignored and all num_rounds iterations run -- confirm against
    # the installed LightGBM version.
    return lgb.train(
        params,
        train_ds,
        num_boost_round=num_rounds,
        valid_sets=valid_ds,
        early_stopping_rounds=180,
    )

# Font able to render the Hangul feature names; keep an ASCII minus sign.
plt.rc('font', family='Malgun Gothic')
plt.rc('axes', unicode_minus=False)

# Fit on the training split and evaluate on the held-out split.
model = lgbm_regressor(train_X=train_x, train_y=y, test_X=train_cv, test_y=y_cv)

from matplotlib import pylab as plt  # NOTE: rebinds plt to matplotlib.pylab

# Bar chart of the 50 most important features.
fig, ax = plt.subplots(figsize=(12, 18))
lgb.plot_importance(model, ax=ax, height=0.8, max_num_features=50)
plt.show()

In [None]:
# Predict na for the test rows and store the result in the submission frame.
y_test = submission['na'] = model.predict(X_test)

In [None]:
def to_zero(x):
    """Clamp a value at zero.

    Returns the integer 0 when ``x <= 0`` and ``x`` itself otherwise. Values
    for which the comparison is false (including NaN) are returned unchanged.
    """
    return 0 if x <= 0 else x

In [None]:
# Clamp every column of the submission at zero, one column at a time.
for i in submission.columns:
    column = submission[i]
    submission[i] = column.apply(to_zero)

In [None]:
# Write the CASE 3 submission to CSV without the index column.
submission.to_csv('edit_dst22882.csv',index=False)

## CASE 4

###  hhb

In [None]:
# Build the hhb features on a copy so the shared df is left untouched.
df_hhb = df.copy()

In [None]:
# Replace every NaN in the hhb feature frame with 0.
df_hhb = df_hhb.fillna(0)

In [None]:
# Run LightGBM with DART boosting: train for a large number of rounds, find the
# num_rounds with the lowest MAE, retrain with it, and repeat.
# While training, identify features that reduce performance and remove them.

# Columns excluded from the hhb model, generated from the wavelength grid.
# The hand-typed list repeated '960_dst'..'990_dst'; each name now appears once.
drop_col = (
    ['id']
    + ['{}_src'.format(w) for w in range(650, 1000, 10)]     # 650_src .. 990_src
    + ['{}_dst'.format(w) for w in range(650, 1000, 10)]     # 650_dst .. 990_dst
    + ['src_diff{}'.format(w) for w in range(650, 990, 10)]  # src_diff650 .. src_diff980
    + ['dst_diff{}'.format(w) for w in range(650, 990, 10)]  # dst_diff650 .. dst_diff980
)

In [None]:
# Drop the excluded columns once, then cut the frame at the train/test
# boundary: the first train.shape[0] rows are training rows.
hhb_features = df_hhb.drop(columns=drop_col)
X_train = hhb_features.iloc[:train.shape[0], :]
X_test = hhb_features.iloc[train.shape[0]:, :]
y_train = train.loc[:, "hhb"]

In [None]:
import lightgbm as lgb#0.842541
from sklearn.model_selection import train_test_split

# Hold out 15% of the training rows as a validation split; the fixed
# random_state makes the shuffle reproducible.
train_x, train_cv, y, y_cv = train_test_split(
    X_train,
    y_train,
    test_size=0.15,
    random_state=3402,
)

def lgbm_regressor(train_X, train_y, test_X, test_y, feature_names=None, seed_val=288, num_rounds=50000):
    """Train a LightGBM regressor with DART boosting and return the result.

    Args:
        train_X: Features used for fitting.
        train_y: Targets aligned with ``train_X``.
        test_X: Features of the validation split.
        test_y: Targets aligned with ``test_X``.
        feature_names: Accepted for call compatibility; not used.
        seed_val: Value passed to LightGBM as ``seed``.
        num_rounds: Number of boosting rounds requested from ``lgb.train``.

    Returns:
        The object returned by ``lgb.train``.
    """
    params = {
        'boosting': 'dart',
        'objective': 'regression',
        'learning_rate': 0.05,
        'max_depth': 10,
        'metric': 'mae',
        'is_training_metric': True,
        'min_child_weight': 1,
        'bagging_fraction': 0.8,
        'num_leaves': 128,
        'feature_fraction': 0.8,
        'bagging_freq': 6,
        'seed': seed_val,
        'min_split_gain': 0.01,
    }

    train_ds = lgb.Dataset(train_X, label=train_y)
    valid_ds = lgb.Dataset(test_X, label=test_y)

    # NOTE(review): LightGBM's early-stopping callback reports that early
    # stopping is not available in dart mode, so early_stopping_rounds is
    # presumably ignored and all num_rounds iterations run -- confirm against
    # the installed LightGBM version.
    return lgb.train(
        params,
        train_ds,
        num_boost_round=num_rounds,
        valid_sets=valid_ds,
        early_stopping_rounds=180,
    )

# Font able to render the Hangul feature names; keep an ASCII minus sign.
plt.rc('font', family='Malgun Gothic')
plt.rc('axes', unicode_minus=False)

# Fit on the training split and evaluate on the held-out split.
model = lgbm_regressor(train_X=train_x, train_y=y, test_X=train_cv, test_y=y_cv)

from matplotlib import pylab as plt  # NOTE: rebinds plt to matplotlib.pylab

# Bar chart of the 50 most important features.
fig, ax = plt.subplots(figsize=(12, 18))
lgb.plot_importance(model, ax=ax, height=0.8, max_num_features=50)
plt.show()

In [None]:
# Predict hhb for the test rows and store the result in the submission frame.
y_test = submission['hhb'] = model.predict(X_test)

###  hbo2

In [None]:
# Build the hbo2 features on a copy so the shared df is left untouched.
df_hbo2 = df.copy()

In [None]:
# Replace every NaN in the hbo2 feature frame with 0.
df_hbo2 = df_hbo2.fillna(0)

In [None]:
# Run LightGBM with DART boosting: train for a large number of rounds, find the
# num_rounds with the lowest MAE, retrain with it, and repeat.
# While training, identify features that reduce performance and remove them.

# Columns excluded from the hbo2 model. Built once from the wavelength grid so
# the train and test frames are guaranteed to drop the same columns.
hbo2_bands = range(650, 1000, 10)      # 650, 660, ..., 990
hbo2_diff_bands = range(650, 990, 10)  # 650, 660, ..., 980
hbo2_drop_cols = (
    ['id']
    + ['{}_src'.format(w) for w in hbo2_bands]
    + ['rho_src_mean']
    + ['{}_dst'.format(w) for w in hbo2_bands]
    + ['src_diff{}'.format(w) for w in hbo2_diff_bands]
    + ['dst_diff{}'.format(w) for w in hbo2_diff_bands]
    + ['src_sum', 'dst_sum', 'sum_rate']
)

# The first train.shape[0] rows are the training rows, the rest are test rows.
X_train = df_hbo2.iloc[:train.shape[0], :].drop(columns=hbo2_drop_cols)
y_train = train.loc[:, "hbo2"]
X_test = df_hbo2.iloc[train.shape[0]:, :].drop(columns=hbo2_drop_cols)

In [None]:
import lightgbm as lgb#0.217199

from sklearn.model_selection import train_test_split

# Hold out 15% of the training rows as a validation split; the fixed
# random_state makes the shuffle reproducible.
train_x, train_cv, y, y_cv = train_test_split(
    X_train,
    y_train,
    test_size=0.15,
    random_state=3402,
)

def lgbm_regressor(train_X, train_y, test_X, test_y, feature_names=None, seed_val=288, num_rounds=22222):
    """Train a LightGBM regressor with DART boosting and return the result.

    Args:
        train_X: Features used for fitting.
        train_y: Targets aligned with ``train_X``.
        test_X: Features of the validation split.
        test_y: Targets aligned with ``test_X``.
        feature_names: Accepted for call compatibility; not used.
        seed_val: Value passed to LightGBM as ``seed``.
        num_rounds: Number of boosting rounds requested from ``lgb.train``.

    Returns:
        The object returned by ``lgb.train``.
    """
    params = {
        'boosting': 'dart',
        'objective': 'regression',
        'learning_rate': 0.05,
        'max_depth': 10,
        'metric': 'mae',
        'is_training_metric': True,
        'min_child_weight': 1,
        'bagging_fraction': 0.8,
        'num_leaves': 128,
        'feature_fraction': 0.8,
        'bagging_freq': 6,
        'seed': seed_val,
        'min_split_gain': 0.01,
    }

    train_ds = lgb.Dataset(train_X, label=train_y)
    valid_ds = lgb.Dataset(test_X, label=test_y)

    # NOTE(review): LightGBM's early-stopping callback reports that early
    # stopping is not available in dart mode, so early_stopping_rounds is
    # presumably ignored and all num_rounds iterations run -- confirm against
    # the installed LightGBM version.
    return lgb.train(
        params,
        train_ds,
        num_boost_round=num_rounds,
        valid_sets=valid_ds,
        early_stopping_rounds=180,
    )

# Font able to render the Hangul feature names; keep an ASCII minus sign.
plt.rc('font', family='Malgun Gothic')
plt.rc('axes', unicode_minus=False)

# Fit on the training split and evaluate on the held-out split.
model = lgbm_regressor(train_X=train_x, train_y=y, test_X=train_cv, test_y=y_cv)

from matplotlib import pylab as plt  # NOTE: rebinds plt to matplotlib.pylab

# Bar chart of the 50 most important features.
fig, ax = plt.subplots(figsize=(12, 18))
lgb.plot_importance(model, ax=ax, height=0.8, max_num_features=50)
plt.show()

In [None]:
# Predict hbo2 for the test rows and store the result in the submission frame.
y_test = submission['hbo2'] = model.predict(X_test)

###  ca

In [None]:
# Build the ca features on a copy so the shared df is left untouched.
df_ca = df.copy()

In [None]:
# Keep the columns up to and including '흡수계수_990' (label slices are
# inclusive); every column after it is discarded.
df_ca = df_ca.loc[:,:"흡수계수_990"]

In [None]:
# Ratio features: for each offset j and base index i, divide the
# 흡수계수 (absorption coefficient) column at i+j by the one at i.
for j in range(10, 350, 10):
    for i in range(650, 1000 - j, 10):
        numer = df_ca['흡수계수_{}'.format(i + j)]
        denom = df_ca['흡수계수_{}'.format(i)]
        df_ca['흡수nextrat{}_{}'.format(i, j)] = numer / denom

In [None]:
# Replace every NaN with 0, including NaN ratios produced by the division above.
df_ca = df_ca.fillna(0)

In [None]:
# Run LightGBM with DART boosting: train for a large number of rounds, find the
# num_rounds with the lowest MAE, retrain with it, and repeat.
# While training, identify features that reduce performance and remove them.

# Columns excluded from the ca model. Built once from the wavelength grid so
# the train and test frames are guaranteed to drop the same columns.
ca_bands = range(650, 1000, 10)      # 650, 660, ..., 990
ca_diff_bands = range(650, 990, 10)  # 650, 660, ..., 980
ca_drop_cols = (
    ['id']
    + ['{}_src'.format(w) for w in ca_bands]
    + ['src_diff{}'.format(w) for w in ca_diff_bands]
    + ['dst_diff{}'.format(w) for w in ca_diff_bands]
    + ['rho_src_mean']
    + ['{}_dst'.format(w) for w in ca_bands]
    + ['src_sum', 'dst_sum', 'sum_rate']
)

# The first train.shape[0] rows are the training rows, the rest are test rows.
X_train = df_ca.iloc[:train.shape[0], :].drop(columns=ca_drop_cols)
y_train = train.loc[:, "ca"]
X_test = df_ca.iloc[train.shape[0]:, :].drop(columns=ca_drop_cols)

In [None]:
import lightgbm as lgb#0.217199

from sklearn.model_selection import train_test_split

# Hold out 15% of the training rows as a validation split; the fixed
# random_state makes the shuffle reproducible.
train_x, train_cv, y, y_cv = train_test_split(
    X_train,
    y_train,
    test_size=0.15,
    random_state=3402,
)

def lgbm_regressor(train_X, train_y, test_X, test_y, feature_names=None, seed_val=288, num_rounds=45530):
    """Train a LightGBM regressor with DART boosting and return the result.

    Args:
        train_X: Features used for fitting.
        train_y: Targets aligned with ``train_X``.
        test_X: Features of the validation split.
        test_y: Targets aligned with ``test_X``.
        feature_names: Accepted for call compatibility; not used.
        seed_val: Value passed to LightGBM as ``seed``.
        num_rounds: Number of boosting rounds requested from ``lgb.train``.

    Returns:
        The object returned by ``lgb.train``.
    """
    params = {
        'boosting': 'dart',
        'objective': 'regression',
        'learning_rate': 0.05,
        'max_depth': 10,
        'metric': 'mae',
        'is_training_metric': True,
        'min_child_weight': 1,
        'bagging_fraction': 0.8,
        'num_leaves': 128,
        'feature_fraction': 0.8,
        'bagging_freq': 6,
        'seed': seed_val,
        'min_split_gain': 0.01,
    }

    train_ds = lgb.Dataset(train_X, label=train_y)
    valid_ds = lgb.Dataset(test_X, label=test_y)

    # NOTE(review): LightGBM's early-stopping callback reports that early
    # stopping is not available in dart mode, so early_stopping_rounds is
    # presumably ignored and all num_rounds iterations run -- confirm against
    # the installed LightGBM version.
    return lgb.train(
        params,
        train_ds,
        num_boost_round=num_rounds,
        valid_sets=valid_ds,
        early_stopping_rounds=180,
    )

# Font able to render the Hangul feature names; keep an ASCII minus sign.
plt.rc('font', family='Malgun Gothic')
plt.rc('axes', unicode_minus=False)

# Fit on the training split and evaluate on the held-out split.
model = lgbm_regressor(train_X=train_x, train_y=y, test_X=train_cv, test_y=y_cv)

from matplotlib import pylab as plt  # NOTE: rebinds plt to matplotlib.pylab

# Bar chart of the 50 most important features.
fig, ax = plt.subplots(figsize=(12, 18))
lgb.plot_importance(model, ax=ax, height=0.8, max_num_features=50)
plt.show()

In [None]:
# Predict on the test features with the model fitted above and store the result in the 'ca' column.
y_test =  model.predict(X_test)
submission['ca'] = y_test

###  na

In [None]:
# Work on a copy so the feature edits below do not modify df itself.
df_na = df.copy()

In [None]:
# Keep the columns from the first one through '흡수계수_990' (label slice, end label included).
df_na = df_na.loc[:,:"흡수계수_990"]

In [None]:
# Replace every missing value with 0.
df_na = df_na.fillna(0)

In [None]:
# For every wavelength gap (10..340) and every start wavelength that keeps
# start + gap <= 990, store the ratio of the two absorption-coefficient columns.
for gap in range(10, 350, 10):
    for start in range(650, 1000 - gap, 10):
        numerator = df_na['흡수계수_{}'.format(start + gap)]
        denominator = df_na['흡수계수_{}'.format(start)]
        df_na['흡수nextrat{}_{}'.format(start, gap)] = numerator / denominator

In [None]:
# The LightGBM model uses DART: train for many rounds, find the num_rounds with
# the lowest MAE, retrain with it, and repeat.
# Features that reduced performance during training were identified and are dropped here.

_wavelengths = range(650, 1000, 10)       # 650 .. 990
_diff_wavelengths = range(650, 990, 10)   # 650 .. 980 (diff features stop one step earlier)

# Columns excluded from the 'na' model; the same list is used for train and test.
_na_drop_cols = (
    ['id']
    + ['{}_src'.format(w) for w in _wavelengths]
    + ['{}_dst_fft_imag'.format(w) for w in _wavelengths]
    + ['{}_dst_fft_real'.format(w) for w in _wavelengths]
    + ['src_diff{}'.format(w) for w in _diff_wavelengths]
    + ['dst_diff{}'.format(w) for w in _diff_wavelengths]
    + ['rho_src_mean', 'rho_dst_mean']
    + ['{}_dst'.format(w) for w in _wavelengths]
    + ['src_sum', 'dst_sum', 'sum_rate']
)

# The first train.shape[0] rows of df_na are the training rows; the rest are test rows.
_n_train = train.shape[0]
X_train = df_na.iloc[:_n_train, :].drop(columns=_na_drop_cols)
y_train = train.loc[:, "na"]
X_test = df_na.iloc[_n_train:, :].drop(columns=_na_drop_cols)

In [None]:
import lightgbm as lgb  # author's note: 0.217199 (presumably a recorded score; TODO confirm)

from sklearn.model_selection import train_test_split

# Hold out 15% of the training rows as a validation set (fixed random_state).
train_x, train_cv, y, y_cv = train_test_split(X_train, y_train, test_size=0.15, random_state=3402)


def lgbm_regressor(train_X, train_y, test_X, test_y, feature_names=None, seed_val=288, num_rounds=21000):
    """Train a LightGBM DART regressor and return the fitted model.

    Args:
        train_X: Training feature matrix.
        train_y: Training target values.
        test_X: Validation feature matrix, passed to LightGBM as the validation set.
        test_y: Validation target values.
        feature_names: Unused; kept so existing call sites keep working.
        seed_val: Value for LightGBM's ``seed`` parameter.
        num_rounds: Number of boosting rounds requested.

    Returns:
        The model object returned by ``lgb.train``.
    """
    param = {
        'boosting': 'dart',
        'objective': 'regression',
        'learning_rate': 0.05,
        'max_depth': 10,
        'metric': 'mae',
        'is_training_metric': True,
        'min_child_weight': 1,
        'bagging_fraction': 0.8,
        'num_leaves': 128,
        'feature_fraction': 0.8,
        'bagging_freq': 6,
        'seed': seed_val,
        'min_split_gain': 0.01,
    }

    train_ds = lgb.Dataset(train_X, label=train_y)
    test_ds = lgb.Dataset(test_X, label=test_y)

    # NOTE(review): LightGBM's documentation says early stopping is not available
    # in dart mode, so early_stopping_rounds is presumably ignored here; confirm
    # against the installed LightGBM version.
    model = lgb.train(param, train_ds, num_rounds, test_ds, early_stopping_rounds=180)
    return model


plt.rc('font', family='Malgun Gothic')
plt.rc('axes', unicode_minus=False)

model = lgbm_regressor(train_X=train_x, train_y=y, test_X=train_cv, test_y=y_cv)

# Rebinds plt to matplotlib.pylab for the plotting calls below.
from matplotlib import pylab as plt

# Plot the 50 most important features of the fitted model.
fig, ax = plt.subplots(figsize=(12, 18))
lgb.plot_importance(model, max_num_features=50, height=0.8, ax=ax)
plt.show()

In [None]:
# Predict on the test features with the model fitted above and store the result in the 'na' column.
y_test =  model.predict(X_test)
submission['na'] = y_test

In [None]:
def to_zero(x):
    """Return 0 when x is less than or equal to 0, otherwise return x unchanged."""
    # Written with `<=` so values that fail the comparison (e.g. NaN) pass through.
    return 0 if x <= 0 else x

In [None]:
# Clamp every column of the submission frame element-wise with to_zero.
for column_name in submission.columns:
    clamped = submission[column_name].map(to_zero)
    submission[column_name] = clamped

In [None]:
# Write the submission frame to CSV without the DataFrame index.
submission.to_csv('edit_dst_3402.csv',index=False)

# CASE 5

### 필요한 패키지 import 

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import lightgbm as lgb
from sklearn.model_selection import train_test_split

### 최종 제출 형태 확인

In [None]:
# Load the sample submission file and preview its first rows.
sample = pd.read_csv('data/sample_submission.csv',engine='python')
sample.head()

### test 데이터 확인

In [None]:
# Load the test set and preview its first rows.
test = pd.read_csv('data/test.csv',engine='python').iloc[:,1:] # drop the first column (the author calls it the index column)
test.head()

### train 데이터 확인

In [None]:
# Load the training set and preview its first rows.
# NOTE(review): sample and test are read from 'data/', but train is read from the
# working directory; confirm this path is intended.
train = pd.read_csv('train.csv',engine='python').iloc[:,1:] # drop the first column (the author calls it the index column)
train.head()

In [None]:
# Extract the prediction targets (the last four columns of train), then remove them from train.
train_y = train.iloc[:,-4:]
train = train.iloc[:,:-4]

In [None]:
# train data에 존재하는 Null값 확인
train.info()

#### Dacon에 있는 결측치 처리방법 사용

In [None]:
# Only the dst columns were found to contain nulls, so pull out just the '*_dst' columns.
# Zeros are turned into NaN so they are treated as missing in the same way.
# np.nan is used because the np.NaN alias was removed in NumPy 2.0.
train_dst = train.filter(regex='_dst$', axis=1).replace(0, np.nan)
test_dst = test.filter(regex='_dst$', axis=1).replace(0, np.nan)
test_dst.head(1)

In [None]:
# Fill NaNs by linear interpolation along each row.
# The keyword is `method`; the original `methods=` typo only worked because
# linear is the default.
train_dst = train_dst.interpolate(method='linear', axis=1)
test_dst = test_dst.interpolate(method='linear', axis=1)
# Interpolation does not fill every NaN (the author cites runs of consecutive NaNs
# in the spectrum), so whatever is still missing is set to 0.
train_dst.fillna(0, inplace=True)
test_dst.fillna(0, inplace=True)
test_dst.head(1)

In [None]:
train.update(train_dst) # write the interpolated dst values back into the original DataFrame
test.update(test_dst)

In [None]:
# 다 채워진 모습을 볼 수 있다.
train.info()

In [None]:
# Element-wise add / subtract / divide / multiply between the src and dst spectra.
# The author reports some performance gain; dst can be 0, so 1e-18 is added to the divisor.

_wl = range(650, 1000, 10)
add_list = ['{}_add'.format(w) for w in _wl]
diff_list = ['{}_diff'.format(w) for w in _wl]
div_list = ['{}_div'.format(w) for w in _wl]
multi_list = ['{}_multi'.format(w) for w in _wl]

# Columns 1..35 and 36..70 of train, taken as raw arrays.
_src_vals = train.iloc[:, 1:36].values
_dst_vals = train.iloc[:, 36:71].values

dst_src_add = pd.DataFrame(_src_vals + _dst_vals, columns=add_list)
dst_src_diff = pd.DataFrame(_src_vals - _dst_vals, columns=diff_list)
dst_src_div = pd.DataFrame(_src_vals / (_dst_vals + 1e-18), columns=div_list)
dst_src_mul = pd.DataFrame(_src_vals * _dst_vals, columns=multi_list)

# Append the four feature groups in the order add, diff, div, mul.
train = pd.concat([train, dst_src_add, dst_src_diff, dst_src_div, dst_src_mul], axis=1)
train.shape

In [None]:
# Same src/dst arithmetic features as the train cell, built for the test set.
_src_vals = test.iloc[:, 1:36].values
_dst_vals = test.iloc[:, 36:71].values

dst_src_add = pd.DataFrame(_src_vals + _dst_vals, columns=add_list)
dst_src_diff = pd.DataFrame(_src_vals - _dst_vals, columns=diff_list)
dst_src_mul = pd.DataFrame(_src_vals * _dst_vals, columns=multi_list)
# Use the same 1e-18 guard as for train. The previous 1e-16 made the *_div
# features of rows with dst == 0 differ by a factor of 100 between train and test.
dst_src_div = pd.DataFrame(_src_vals / (_dst_vals + 1e-18), columns=div_list)

# Append in the same column order as train: add, diff, div, mul.
test = pd.concat([test, dst_src_add], axis=1)
test = pd.concat([test, dst_src_diff], axis=1)
test = pd.concat([test, dst_src_div], axis=1)
test = pd.concat([test, dst_src_mul], axis=1)
test.shape

### 푸리에 변환 실시

In [None]:
# Column names of the source (src) and measured (dst) spectra, 650 .. 990 in steps of 10.
src_list = ['{}_src'.format(wavelength) for wavelength in range(650, 1000, 10)]

dst_list = ['{}_dst'.format(wavelength) for wavelength in range(650, 1000, 10)]

In [None]:
def _dst_fft_features(frame, real_names, imag_names):
    """Return (real, imag) FFT feature frames for the dst spectra of *frame*.

    Each row of the dst columns is centred on its own mean, then transformed with
    an orthonormal FFT across the columns. The previous per-row loop centred the
    train imaginary part with the mean of the already-centred real part (about 0),
    which disagreed with the test loop; both sets now go through this one code path.
    """
    spectra = frame[dst_list]
    centred = spectra.sub(spectra.mean(axis=1), axis=0).values
    transformed = np.fft.fft(centred, axis=1, norm='ortho')
    real = pd.DataFrame(transformed.real, index=frame.index, columns=real_names)
    imag = pd.DataFrame(transformed.imag, index=frame.index, columns=imag_names)
    return real, imag


# Output column names: '<dst column>_fft_real' / '<dst column>_fft_imag'.
real_part = [col + '_fft_real' for col in dst_list]
imag_part = [col + '_fft_imag' for col in dst_list]

# alpha holds the train features, beta the test features (real block first, then imag).
alpha_real, alpha_imag = _dst_fft_features(train, real_part, imag_part)
alpha = pd.concat((alpha_real, alpha_imag), axis=1)

beta_real, beta_imag = _dst_fft_features(test, real_part, imag_part)
beta = pd.concat((beta_real, beta_imag), axis=1)

In [None]:
# Append the alpha / beta feature columns to train / test.
train=pd.concat((train, alpha), axis=1)
test=pd.concat((test, beta), axis=1)

In [None]:
del alpha, beta

In [None]:
train.shape, test.shape

### 흡광계수 만들기 (빛의 흡수율 활용)

src는 광원의 빛의 세기, dst는 측정된 빛의 세기이며, 이 두 값을 활용하여 흡광계수를 만든다.

In [None]:
# Absorption coefficient per wavelength: log10(src / dst) divided by rho.
# A variant without the division by rho (named 투과율_*) was tried by the author
# and left disabled.
for frame in (train, test):
    for wavelength in range(650, 1000, 10):
        intensity_ratio = frame['{}_src'.format(wavelength)] / frame['{}_dst'.format(wavelength)]
        frame['흡수계수_{}'.format(wavelength)] = np.log10(intensity_ratio) / frame.rho

# Replace the NaNs with 0, then show the remaining null counts.
train = train.fillna(0)
test = test.fillna(0)

train.isnull().sum().sum(), test.isnull().sum().sum()

### 흡수량 만들어 삽입

src의 실제 세기 대비 흡수된 빛의 세기를 계산하여 활용

In [None]:
# Names for the rho-scaled features: 35 '<n>_srrho' names followed by 35 '<n>_dsrho' names, n = 65 .. 99.
srrho_lisrho = (
    ['{}_srrho'.format(n) for n in range(65, 100)]
    + ['{}_dsrho'.format(n) for n in range(65, 100)]
)

In [None]:
# Absorbed fraction per wavelength: (src - dst) / src, with 1e-20 guarding a zero src.
for frame in (train, test):
    for wavelength in range(650, 1000, 10):
        src_col = frame['{}_src'.format(wavelength)]
        dst_col = frame['{}_dst'.format(wavelength)]
        frame['흡수량_{}'.format(wavelength)] = (src_col - dst_col) / (src_col + 1e-20)

# Divide columns 1..70 by rho squared, row by row.
# The original used `rho ^ 2`, which is bitwise XOR in Python, not a square.
a = train.iloc[:, 1:71].div(train['rho'] ** 2, axis=0)
b = test.iloc[:, 1:71].div(test['rho'] ** 2, axis=0)
a.columns = srrho_lisrho
b.columns = srrho_lisrho
train = pd.concat([train, a], axis=1)
test = pd.concat([test, b], axis=1)

In [None]:
# Row-wise mean, std and sum over columns 281..315 (per the author, the absorption
# coefficients computed above), with +inf mapped to 1 and -inf mapped to -1 first.
_train_block = train.iloc[:, 281:316].replace(np.inf, 1).replace(-np.inf, -1)
train['흡수평균'] = _train_block.mean(axis=1)
train['흡수편차'] = _train_block.std(axis=1)
train['흡수합'] = _train_block.sum(axis=1)

_test_block = test.iloc[:, 281:316].replace(np.inf, 1).replace(-np.inf, -1)
test['흡수평균'] = _test_block.mean(axis=1)
test['흡수편차'] = _test_block.std(axis=1)
test['흡수합'] = _test_block.sum(axis=1)

In [None]:
# Sliding ratios over the absorption coefficient, src and dst columns: for every
# gap (10..340) and start wavelength, value at (start + gap) / value at start.
# The author reports a large performance gain from these features.
def _clip_inf(series):
    """Map +inf to 1 and -inf to -1, leaving every other value untouched."""
    return series.replace(np.inf, 1).replace(-np.inf, -1)


# (output name template, input column template), in the order the columns are created.
_ratio_specs = (
    ('흡수nextrat{}_{}', '흡수계수_{}'),
    ('srcnextrat{}_{}', '{}_src'),
    ('dstnextrat{}_{}', '{}_dst'),
)

for frame in (train, test):
    for gap in range(10, 350, 10):
        for start in range(650, 1000 - gap, 10):
            for out_template, in_template in _ratio_specs:
                upper = _clip_inf(frame[in_template.format(start + gap)])
                lower = _clip_inf(frame[in_template.format(start)])
                # 1e-20 keeps the divisor away from an exact zero.
                frame[out_template.format(start, gap)] = upper / (lower + 1e-20)

In [None]:
# Names for the adjacent-column ratio features: 34 names each, n = 65 .. 98.
srr_liss = ['{}srr'.format(n) for n in range(65, 99)]
dsr_liss = ['{}dss'.format(n) for n in range(65, 99)]

In [None]:
# Ratio of each spectrum column to the column right after it; 1e-20 is added to
# the divisor in case it is 0.
def _adjacent_ratio(frame, first, last, names):
    """Return column[k] / (column[k + 1] + 1e-20) for k in first .. last - 2.

    *first* and *last* are positional bounds (last exclusive) of one spectrum
    block inside *frame*; *names* labels the resulting last - first - 1 columns.
    The result shares the index of *frame*, so no row count is hard-coded and
    the columns are numeric rather than object dtype.
    """
    numerator = frame.iloc[:, first:last - 1].values
    denominator = frame.iloc[:, first + 1:last].values
    return pd.DataFrame(numerator / (denominator + 1e-20), columns=names, index=frame.index)


# Columns 1..35 and 36..70 are the two spectrum blocks used throughout this notebook.
# The original used offsets i+36 / i+37 for the second block, which skipped its first
# column and divided its last column by the first column of the next feature group.
a = _adjacent_ratio(train, 1, 36, srr_liss)
b = _adjacent_ratio(test, 1, 36, srr_liss)
c = _adjacent_ratio(train, 36, 71, dsr_liss)
d = _adjacent_ratio(test, 36, 71, dsr_liss)

train = pd.concat([train, a], axis=1)
train = pd.concat([train, c], axis=1)
test = pd.concat([test, b], axis=1)
test = pd.concat([test, d], axis=1)

In [None]:
# Row-wise standardisation of the src and dst values.
srstd_list = (
    ['{}srstd'.format(n) for n in range(65, 100)]
    + ['{}dststd'.format(n) for n in range(65, 100)]
)


def _row_standardize(block):
    """Subtract each row's mean and divide by that row's std (pandas default ddof)."""
    return block.sub(block.mean(axis=1), axis=0).div(block.std(axis=1), axis=0)


# Columns 1..35 and 36..70 are standardised separately, then placed side by side.
tra = pd.concat(
    [_row_standardize(train.iloc[:, 1:36]), _row_standardize(train.iloc[:, 36:71])],
    axis=1,
)
tra.columns = srstd_list

tes = pd.concat(
    [_row_standardize(test.iloc[:, 1:36]), _row_standardize(test.iloc[:, 36:71])],
    axis=1,
)
tes.columns = srstd_list

train = pd.concat([train, tra], axis=1)
test = pd.concat([test, tes], axis=1)

In [None]:
# Row-wise min-max scaling of the src and dst values.
srmax_list = (
    ['{}srmax'.format(n) for n in range(65, 100)]
    + ['{}dstmax'.format(n) for n in range(65, 100)]
)


def _row_minmax(block):
    """Scale each row as (x - row_min) / (row_max - row_min)."""
    row_min = block.min(axis=1)
    row_max = block.max(axis=1)
    return block.sub(row_min, axis=0).div(row_max - row_min, axis=0)


# Columns 1..35 and 36..70 are scaled separately, then placed side by side.
tra = pd.concat(
    [_row_minmax(train.iloc[:, 1:36]), _row_minmax(train.iloc[:, 36:71])],
    axis=1,
)
tra.columns = srmax_list

tes = pd.concat(
    [_row_minmax(test.iloc[:, 1:36]), _row_minmax(test.iloc[:, 36:71])],
    axis=1,
)
tes.columns = srmax_list

train = pd.concat([train, tra], axis=1)
test = pd.concat([test, tes], axis=1)

In [None]:
# src and dst values divided by their row maximum.
srmaratio_list = (
    ['{}srmaratio'.format(n) for n in range(65, 100)]
    + ['{}dstmaratio'.format(n) for n in range(65, 100)]
)


def _row_max_ratio(block):
    """Divide every value of a row by that row's maximum."""
    return block.div(block.max(axis=1), axis=0)


# Columns 1..35 and 36..70 are handled separately, then placed side by side.
tra = pd.concat(
    [_row_max_ratio(train.iloc[:, 1:36]), _row_max_ratio(train.iloc[:, 36:71])],
    axis=1,
)
tra.columns = srmaratio_list

tes = pd.concat(
    [_row_max_ratio(test.iloc[:, 1:36]), _row_max_ratio(test.iloc[:, 36:71])],
    axis=1,
)
tes.columns = srmaratio_list

train = pd.concat([train, tra], axis=1)
test = pd.concat([test, tes], axis=1)

In [None]:
# Per-row max, min, mean and standard deviation of src and dst, their max-min
# ranges, and the ratio of the src range to the dst range.
for frame in (train, test):
    src_block = frame.iloc[:, 1:36]
    dst_block = frame.iloc[:, 36:71]
    frame['src_max'] = src_block.max(axis=1)
    frame['src_min'] = src_block.min(axis=1)
    frame['dst_max'] = dst_block.max(axis=1)
    frame['dst_min'] = dst_block.min(axis=1)
    frame['src_mean'] = src_block.mean(axis=1)
    frame['dst_mean'] = dst_block.mean(axis=1)
    frame['src_std'] = src_block.std(axis=1)
    frame['dst_std'] = dst_block.std(axis=1)
    frame['max_min_src'] = frame['src_max'] - frame['src_min']
    frame['max_min_dst'] = frame['dst_max'] - frame['dst_min']
    # 0.01 is added to the dst range before dividing.
    frame['dst_src_max_min_ratio'] = frame['max_min_src'] / (frame['max_min_dst'] + 0.01)
train.head()

In [None]:
# Names for the rho-multiplied features: 35 '<n>srmul' names followed by 35 '<n>dstmul' names, n = 65 .. 99.
srmul_list = (
    ['{}srmul'.format(n) for n in range(65, 100)]
    + ['{}dstmul'.format(n) for n in range(65, 100)]
)

In [None]:
# Multiply columns 1..70 by rho squared; the author's intent is a light
# intensity that does not depend on the distance.
# The original used `rho ^ 2`, which is bitwise XOR in Python, not a square.
a = train.iloc[:, 1:71].mul(train['rho'] ** 2, axis=0)
a.columns = srmul_list
b = test.iloc[:, 1:71].mul(test['rho'] ** 2, axis=0)
b.columns = srmul_list

train = pd.concat([train, a], axis=1)
test = pd.concat([test, b], axis=1)

In [None]:
train.shape, test.shape

In [None]:
# Names for the (src * rho^2) * dst features, n = 65 .. 99.
srmui_list = ['{}srmui'.format(n) for n in range(65, 100)]

In [None]:
# Multiply src (columns 1..35) by rho squared, then multiply element-wise by dst (columns 36..70).
# The original used `rho ^ 2`, which is bitwise XOR in Python, not a square.
a = train.iloc[:, 1:36].mul(train['rho'] ** 2, axis=0).values * train.iloc[:, 36:71].values
a = pd.DataFrame(a, columns=srmui_list)
b = test.iloc[:, 1:36].mul(test['rho'] ** 2, axis=0).values * test.iloc[:, 36:71].values
b = pd.DataFrame(b, columns=srmui_list)

train = pd.concat([train, a], axis=1)
test = pd.concat([test, b], axis=1)

In [None]:
# Divide src (columns 1..35) by rho squared, which the author describes as a
# distance-independent intensity, then multiply element-wise by dst (columns 36..70).
srmuii_list = ['{}srmuii'.format(n) for n in range(65, 100)]

# The original used `rho ^ 2`, which is bitwise XOR in Python, not a square.
a = train.iloc[:, 1:36].div(train['rho'] ** 2, axis=0).values * train.iloc[:, 36:71].values
a = pd.DataFrame(a, columns=srmuii_list)
b = test.iloc[:, 1:36].div(test['rho'] ** 2, axis=0).values * test.iloc[:, 36:71].values
b = pd.DataFrame(b, columns=srmuii_list)

train = pd.concat([train, a], axis=1)
test = pd.concat([test, b], axis=1)

In [None]:
train.shape, test.shape

In [None]:
train.head()

In [None]:
train_y.head()

In [None]:
# lgbm 모델을 돌리는데 dart를 사용 많은 양을 학습시킨 후 최소의 mae num_rounds를 찾아내 재학습 시행 이를 반복함.

In [None]:
# Hold out 15% of the training rows as a validation set for the 'hhb' target.
train_x, train_cv, y, y_cv = train_test_split(train.values, train_y['hhb'], test_size=0.15, random_state=14414)


def lgbm_regressor(train_X, train_y, test_X, test_y, feature_names=None, seed_val=288, num_rounds=46741):
    """Train a LightGBM DART regressor and return the fitted model.

    Args:
        train_X: Training feature matrix.
        train_y: Training target values.
        test_X: Validation feature matrix, passed to LightGBM as the validation set.
        test_y: Validation target values.
        feature_names: Unused; kept so existing call sites keep working.
        seed_val: Value for LightGBM's ``seed`` parameter.
        num_rounds: Number of boosting rounds requested.

    Returns:
        The model object returned by ``lgb.train``.
    """
    param = {
        'boosting': 'dart',
        'objective': 'regression',
        'learning_rate': 0.05,
        'max_depth': 10,
        'metric': 'mae',
        'is_training_metric': True,
        'min_child_weight': 1,
        'bagging_fraction': 0.8,
        'num_leaves': 128,
        'feature_fraction': 0.8,
        'bagging_freq': 6,
        'seed': seed_val,
        'min_split_gain': 0.01,
    }

    train_ds = lgb.Dataset(train_X, label=train_y)
    test_ds = lgb.Dataset(test_X, label=test_y)

    # NOTE(review): LightGBM's documentation says early stopping is not available
    # in dart mode, so early_stopping_rounds is presumably ignored here; confirm
    # against the installed LightGBM version.
    model = lgb.train(param, train_ds, num_rounds, test_ds, early_stopping_rounds=180)
    return model


plt.rc('font', family='Malgun Gothic')
plt.rc('axes', unicode_minus=False)

model = lgbm_regressor(train_X=train_x, train_y=y, test_X=train_cv, test_y=y_cv)

In [None]:
# Predict on the test feature matrix with the model fitted above and store the result in the 'hhb' column.
y_test =  model.predict(test.values)
sample['hhb'] = y_test

In [None]:
# Hold out 15% of the training rows as a validation set for the 'hbo2' target.
train_x, train_cv, y, y_cv = train_test_split(train.values, train_y['hbo2'], test_size=0.15, random_state=14414)


def lgbm_regressor(train_X, train_y, test_X, test_y, feature_names=None, seed_val=288, num_rounds=29668):
    """Train a LightGBM DART regressor and return the fitted model.

    Args:
        train_X: Training feature matrix.
        train_y: Training target values.
        test_X: Validation feature matrix, passed to LightGBM as the validation set.
        test_y: Validation target values.
        feature_names: Unused; kept so existing call sites keep working.
        seed_val: Value for LightGBM's ``seed`` parameter.
        num_rounds: Number of boosting rounds requested.

    Returns:
        The model object returned by ``lgb.train``.
    """
    param = {
        'boosting': 'dart',
        'objective': 'regression',
        'learning_rate': 0.05,
        'max_depth': 10,
        'metric': 'mae',
        'is_training_metric': True,
        'min_child_weight': 1,
        'bagging_fraction': 0.8,
        'num_leaves': 128,
        'feature_fraction': 0.8,
        'bagging_freq': 6,
        'seed': seed_val,
        'min_split_gain': 0.01,
    }

    train_ds = lgb.Dataset(train_X, label=train_y)
    test_ds = lgb.Dataset(test_X, label=test_y)

    # NOTE(review): LightGBM's documentation says early stopping is not available
    # in dart mode, so early_stopping_rounds is presumably ignored here; confirm
    # against the installed LightGBM version.
    model = lgb.train(param, train_ds, num_rounds, test_ds, early_stopping_rounds=180)
    return model


plt.rc('font', family='Malgun Gothic')
plt.rc('axes', unicode_minus=False)

model = lgbm_regressor(train_X=train_x, train_y=y, test_X=train_cv, test_y=y_cv)

In [None]:
# Predict 'hbo2' for the test features and store the result in `sample`.
y_test =  model.predict(test.values)
sample['hbo2'] = y_test

In [None]:
# Hold out 15% of the rows to monitor validation MAE for the 'ca' target.
train_x, train_cv, y, y_cv = train_test_split(
    train.values, train_y['ca'], test_size=0.15, random_state=12300)


def lgbm_regressor(train_X, train_y, test_X, test_y, feature_names=None,
                   seed_val=288, num_rounds=29652):
    """Train a DART-boosted LightGBM regressor and return the fitted booster.

    Args:
        train_X: Training feature matrix.
        train_y: Training target values.
        test_X: Validation feature matrix, scored with MAE during training.
        test_y: Validation target values.
        feature_names: Optional column names forwarded to LightGBM;
            ``None`` keeps LightGBM's automatic naming.
        seed_val: Value stored in the ``seed`` parameter.
        num_rounds: Maximum number of boosting rounds.

    Returns:
        The booster returned by ``lgb.train``.
    """
    param = {
        'boosting': 'dart',
        'objective': 'regression',
        'learning_rate': 0.05,
        'max_depth': 10,
        'metric': 'mae',
        'is_training_metric': True,
        'min_child_weight': 1,
        'bagging_fraction': 0.8,
        'num_leaves': 128,
        'feature_fraction': 0.8,
        'bagging_freq': 6,
        'seed': seed_val,
        'min_split_gain': 0.01,
    }

    feature_name = 'auto' if feature_names is None else feature_names
    train_ds = lgb.Dataset(train_X, label=train_y, feature_name=feature_name)
    test_ds = lgb.Dataset(test_X, label=test_y, feature_name=feature_name)

    # The callback form works on LightGBM releases with and without the
    # `early_stopping_rounds` keyword of lgb.train.
    # NOTE(review): LightGBM reports early stopping as unavailable in dart
    # mode, so presumably all `num_rounds` rounds are run — confirm.
    return lgb.train(param, train_ds, num_boost_round=num_rounds,
                     valid_sets=[test_ds],
                     callbacks=[lgb.early_stopping(stopping_rounds=180)])


plt.rc('font', family='Malgun Gothic')
plt.rc('axes', unicode_minus=False)

model = lgbm_regressor(train_X=train_x, train_y=y, test_X=train_cv, test_y=y_cv)

In [None]:
# Predict 'ca' for the test features and store the result in `sample`.
y_test =  model.predict(test.values)
sample['ca'] = y_test

In [None]:
# Hold out 15% of the rows to monitor validation MAE for the 'na' target.
train_x, train_cv, y, y_cv = train_test_split(
    train.values, train_y['na'], test_size=0.15, random_state=12300)


def lgbm_regressor(train_X, train_y, test_X, test_y, feature_names=None,
                   seed_val=288, num_rounds=12017):
    """Train a DART-boosted LightGBM regressor and return the fitted booster.

    Args:
        train_X: Training feature matrix.
        train_y: Training target values.
        test_X: Validation feature matrix, scored with MAE during training.
        test_y: Validation target values.
        feature_names: Optional column names forwarded to LightGBM;
            ``None`` keeps LightGBM's automatic naming.
        seed_val: Value stored in the ``seed`` parameter.
        num_rounds: Maximum number of boosting rounds.

    Returns:
        The booster returned by ``lgb.train``.
    """
    param = {
        'boosting': 'dart',
        'objective': 'regression',
        'learning_rate': 0.05,
        'max_depth': 10,
        'metric': 'mae',
        'is_training_metric': True,
        'min_child_weight': 1,
        'bagging_fraction': 0.8,
        'num_leaves': 128,
        'feature_fraction': 0.8,
        'bagging_freq': 6,
        'seed': seed_val,
        'min_split_gain': 0.01,
    }

    feature_name = 'auto' if feature_names is None else feature_names
    train_ds = lgb.Dataset(train_X, label=train_y, feature_name=feature_name)
    test_ds = lgb.Dataset(test_X, label=test_y, feature_name=feature_name)

    # The callback form works on LightGBM releases with and without the
    # `early_stopping_rounds` keyword of lgb.train.
    # NOTE(review): LightGBM reports early stopping as unavailable in dart
    # mode, so presumably all `num_rounds` rounds are run — confirm.
    return lgb.train(param, train_ds, num_boost_round=num_rounds,
                     valid_sets=[test_ds],
                     callbacks=[lgb.early_stopping(stopping_rounds=180)])


plt.rc('font', family='Malgun Gothic')
plt.rc('axes', unicode_minus=False)

model = lgbm_regressor(train_X=train_x, train_y=y, test_X=train_cv, test_y=y_cv)

In [None]:
# Predict 'na' for the test features and store the result in `sample`.
y_test =  model.predict(test.values)
sample['na'] = y_test

In [None]:
# sample 값에 제대로 값이 삽입 됫는지 확인 - 음수 값이 존재함을 볼 수 있음
sample.describe()

In [None]:
# Negative values cannot occur for these targets, so they are replaced by 0.
def zero_fill(x):
    """Return 0.0 when ``x`` is less than or equal to zero, else ``x`` itself."""
    return .0 if x <= 0 else x

In [None]:
# Replace non-positive predictions with 0.0 in the 'hhb' and 'na' columns.
# NOTE(review): 'hbo2' and 'ca' are not clipped here — confirm that is intended.
sample['hhb'] = sample['hhb'].map(zero_fill)
sample['na'] = sample['na'].map(zero_fill)
sample.describe()

In [None]:
sample.to_csv('Making.csv',index=False)

# Case 6

## 필요한 패키지 import

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import lightgbm as lgb
from sklearn.model_selection import train_test_split

## 최종 제출 형태 확인

In [None]:
# Load the sample submission file; its target columns are overwritten with predictions later.
sample = pd.read_csv('sample_submission.csv',engine='python')
sample.head()

## test 데이터 확인

In [None]:
# Load the test set and drop its first column (presumably the id column — confirm).
test = pd.read_csv('test.csv',engine='python').iloc[:,1:]
test.head()

## train 데이터 확인

In [None]:
# Load the training set and drop its first column (presumably the id column — confirm).
train = pd.read_csv('train.csv',engine='python').iloc[:,1:]
train.head()

In [None]:
# Split off the last four columns of train as the prediction targets and
# keep the remaining columns as features.
train_y = train.iloc[:,-4:]
train = train.iloc[:,:-4]

In [None]:
# train data에 존재하는 Null값 확인
train.info()

#### Dacon에 있는 결측치 처리방법 사용

In [None]:
# Only the dst columns were found to contain nulls, so extract just those.
# Zeros are also marked as NaN so that they get interpolated below.
# `np.nan` is used because the `np.NaN` alias no longer exists in NumPy 2.
train_dst = train.filter(regex='_dst$', axis=1).replace(0, np.nan)
test_dst = test.filter(regex='_dst$', axis=1).replace(0, np.nan)
test_dst.head(1)

In [None]:
# Fill NaN gaps in each spectrum by linear interpolation along the row
# (wavelength axis). The keyword is `method`; the previous `methods=` spelling
# was not the documented parameter name.
train_dst = train_dst.interpolate(method='linear', axis=1)
test_dst = test_dst.interpolate(method='linear', axis=1)
# Any NaN still left after interpolation (the original note attributes these
# to runs of consecutive NaNs in a spectrum) is replaced with 0.
train_dst.fillna(0, inplace=True)
test_dst.fillna(0, inplace=True)
test_dst.head(1)

In [None]:
# Write the interpolated dst values back into the original data frames.
train.update(train_dst)
test.update(test_dst)

In [None]:
# 다 채워진 모습을 볼 수 있다
train.info()

In [None]:
# Column names for the element-wise src/dst arithmetic features
# (sum, difference, quotient, product), one per wavelength from 650 to 990
# in steps of 10. The original note reports a modest score gain from these
# features and that 1e-18 is added to the divisor because dst can be 0.
_wavelengths = range(650, 1000, 10)
add_list = ['{}_add'.format(w) for w in _wavelengths]
diff_list = ['{}_diff'.format(w) for w in _wavelengths]
div_list = ['{}_div'.format(w) for w in _wavelengths]
multi_list = ['{}_multi'.format(w) for w in _wavelengths]
# Element-wise arithmetic between columns 1:36 and columns 36:71 of train
# (presumably the 35 src and 35 dst columns — confirm against the CSV layout).
_src_block = train.iloc[:, 1:36].values
_dst_block = train.iloc[:, 36:71].values
dst_src_add = pd.DataFrame(_src_block + _dst_block, columns=add_list)
dst_src_diff = pd.DataFrame(_src_block - _dst_block, columns=diff_list)
# 1e-18 keeps the quotient finite when the divisor is exactly 0.
dst_src_div = pd.DataFrame(_src_block / (_dst_block + 1e-18), columns=div_list)
dst_src_mul = pd.DataFrame(_src_block * _dst_block, columns=multi_list)
train = pd.concat(
    [train, dst_src_add, dst_src_diff, dst_src_div, dst_src_mul], axis=1)
train.shape

In [None]:
# Same src/dst arithmetic features for the test set. The divisor offset is
# 1e-18, matching the value used for the training features (it was 1e-16 here).
dst_src_add = pd.DataFrame(test.iloc[:,1:36].values + test.iloc[:,36:71].values,columns=add_list)
dst_src_diff = pd.DataFrame(test.iloc[:,1:36].values - test.iloc[:,36:71].values,columns=diff_list)
dst_src_mul = pd.DataFrame(test.iloc[:,1:36].values * test.iloc[:,36:71].values,columns=multi_list)
dst_src_div = pd.DataFrame(test.iloc[:,1:36].values / (test.iloc[:,36:71].values+1e-18),columns=div_list)
test = pd.concat([test,dst_src_add],axis=1)
test = pd.concat([test,dst_src_diff],axis=1)
test = pd.concat([test,dst_src_div],axis=1)
test = pd.concat([test,dst_src_mul],axis=1)
test.shape

# 푸리에 변환 실시

In [None]:
# Names of the src and dst spectrum columns, 650 to 990 in steps of 10.
src_list = ['{}_src'.format(w) for w in range(650, 1000, 10)]

dst_list = ['{}_dst'.format(w) for w in range(650, 1000, 10)]

In [None]:
def _centered_fft_parts(frame, columns):
    """Return (real, imag) frames of the row-wise orthonormal FFT of ``frame[columns]``.

    Every row is mean-centred before the transform. Both results keep the
    index of ``frame`` and use ``columns`` as their column labels.
    """
    values = frame[columns].values.astype(float)
    centered = values - values.mean(axis=1, keepdims=True)
    spectrum = np.fft.fft(centered, axis=1, norm='ortho')
    real = pd.DataFrame(spectrum.real, index=frame.index, columns=columns)
    imag = pd.DataFrame(spectrum.imag, index=frame.index, columns=columns)
    return real, imag


# Train and test go through the same helper, so both the real and imaginary
# inputs are centred with their own row mean (the train imaginary part used
# to subtract the mean of the already-centred real part).
alpha_real, alpha_imag = _centered_fft_parts(train, dst_list)
beta_real, beta_imag = _centered_fft_parts(test, dst_list)

real_part = [col + '_fft_real' for col in dst_list]
imag_part = [col + '_fft_imag' for col in dst_list]

alpha_real.columns = real_part
alpha_imag.columns = imag_part
alpha = pd.concat((alpha_real, alpha_imag), axis=1)

beta_real.columns = real_part
beta_imag.columns = imag_part
beta = pd.concat((beta_real, beta_imag), axis=1)

In [None]:
# Append the FFT real/imaginary feature columns to train and test.
train=pd.concat((train, alpha), axis=1)
test=pd.concat((test, beta), axis=1)

In [None]:
del alpha, beta

## 흡광계수 만들기 (빛의 흡수율 활용)

src는 광원의 빛의 세기, dst는 측정된 빛의 세기이며, 이 둘을 활용하여 흡광계수를 만든다.

In [None]:
# Absorption coefficient per wavelength: log10(src / dst) divided by rho.
# A variant without the division by rho ('투과율_{}') was tried and is disabled.
for wavelength in range(650, 1000, 10):
    src_col = '{}_src'.format(wavelength)
    dst_col = '{}_dst'.format(wavelength)
    coef_col = '흡수계수_{}'.format(wavelength)
    train[coef_col] = np.log10(train[src_col] / train[dst_col]) / train.rho
    test[coef_col] = np.log10(test[src_col] / test[dst_col]) / test.rho

# Any NaN produced above is replaced with 0; infinities are left in place.
train = train.fillna(0)
test = test.fillna(0)

train.isnull().sum().sum(), test.isnull().sum().sum()

## 흡수량 만들어 삽입

src의 실제 세기 대비 흡수된 빛의 세기를 계산하여 활용

In [None]:
# Column names for the rho-scaled copies of the src ('_srrho') and dst
# ('_dsrho') columns; the numeric prefix runs from 65 to 99.
srrho_lisrho = (
    ['{}_srrho'.format(k) for k in range(65, 100)]
    + ['{}_dsrho'.format(k) for k in range(65, 100)]
)

In [None]:
# Absorbed fraction per wavelength: (src - dst) / src, with 1e-20 added to
# the divisor so a zero src does not divide by zero.
for i in range(650,1000,10) :
    train['흡수량_{}'.format(i)] = (train['{}_src'.format(i)]-train['{}_dst'.format(i)])/(train['{}_src'.format(i)]+1e-20)
    test['흡수량_{}'.format(i)] =  (test['{}_src'.format(i)]-test['{}_dst'.format(i)])/(test['{}_src'.format(i)]+1e-20)
# Columns 1:71 divided by the squared distance. `** 2` is required here:
# `^` is bitwise XOR in Python, not exponentiation.
a = train.iloc[:,1:71].apply(lambda x: x/(train['rho']**2))
b = test.iloc[:,1:71].apply(lambda x: x/(test['rho']**2))
a.columns = srrho_lisrho
b.columns = srrho_lisrho
train = pd.concat([train,a],axis=1)
test = pd.concat([test,b],axis=1)

In [None]:
train.isnull().sum().sum()

In [None]:
# Row-wise mean, standard deviation and sum over columns 281:316
# (presumably the 35 absorption-coefficient columns — confirm the offsets).
# +inf / -inf are replaced by 1 / -1 before aggregating.
_train_coef = train.iloc[:, 281:316].replace(np.inf, 1).replace(-np.inf, -1)
train['흡수평균'] = _train_coef.mean(axis=1)
train['흡수편차'] = _train_coef.std(axis=1)
train['흡수합'] = _train_coef.sum(axis=1)

_test_coef = test.iloc[:, 281:316].replace(np.inf, 1).replace(-np.inf, -1)
test['흡수평균'] = _test_coef.mean(axis=1)
test['흡수편차'] = _test_coef.std(axis=1)
test['흡수합'] = _test_coef.sum(axis=1)

In [None]:
# Column names for features obtained by multiplying by the absorption
# coefficient. The names are generated so every entry follows the same
# pattern (the hand-typed list contained the misspelt label '71rsrr').
srr_liss = ['{}srrr'.format(k) for k in range(65, 100)]
dsr_liss = ['{}dsss'.format(k) for k in range(65, 100)]

# Multiply columns 1:36 and 36:71 by columns 281:316, with +inf / -inf in
# the latter replaced by 1 / -1 first.
_train_coef = train.iloc[:, 281:316].replace(np.inf, 1).replace(-np.inf, -1).values
_test_coef = test.iloc[:, 281:316].replace(np.inf, 1).replace(-np.inf, -1).values

a = pd.DataFrame(train.iloc[:, 1:36].values * _train_coef, columns=srr_liss)
c = pd.DataFrame(train.iloc[:, 36:71].values * _train_coef, columns=dsr_liss)
b = pd.DataFrame(test.iloc[:, 1:36].values * _test_coef, columns=srr_liss)
d = pd.DataFrame(test.iloc[:, 36:71].values * _test_coef, columns=dsr_liss)

train = pd.concat([train, a, c], axis=1)
test = pd.concat([test, b, d], axis=1)

In [None]:
# Column names for features obtained by dividing by the absorption
# coefficient. The names are generated so every entry follows the same
# pattern (the hand-typed list contained the misspelt label '71rsrr').
# NOTE(review): these labels are the same ones the multiplication features
# use, so train/test end up with duplicate column labels — confirm that
# only positional access (.values / .iloc) is used downstream.
srr_liss = ['{}srrr'.format(k) for k in range(65, 100)]
dsr_liss = ['{}dsss'.format(k) for k in range(65, 100)]

# Divide columns 1:36 and 36:71 by columns 281:316 (+inf / -inf replaced by
# 1 / -1 first); 1e-20 is added to the divisor to avoid division by zero.
_train_div = train.iloc[:, 281:316].replace(np.inf, 1).replace(-np.inf, -1).values + 1e-20
_test_div = test.iloc[:, 281:316].replace(np.inf, 1).replace(-np.inf, -1).values + 1e-20

a = pd.DataFrame(train.iloc[:, 1:36].values / _train_div, columns=srr_liss)
c = pd.DataFrame(train.iloc[:, 36:71].values / _train_div, columns=dsr_liss)
b = pd.DataFrame(test.iloc[:, 1:36].values / _test_div, columns=srr_liss)
d = pd.DataFrame(test.iloc[:, 36:71].values / _test_div, columns=dsr_liss)

train = pd.concat([train, a, c], axis=1)
test = pd.concat([test, b, d], axis=1)

In [None]:
train.isnull().sum().sum(),test.isnull().sum().sum()

In [None]:
# Column names for the adjacent-wavelength ratio features: 34 names each
# (prefix 65 to 98), one per pair of neighbouring columns.
srr_liss = ['{}srr'.format(k) for k in range(65, 99)]
dsr_liss = ['{}dss'.format(k) for k in range(65, 99)]

In [None]:
# Ratio of every src / dst column to the column that follows it.
# src uses columns 1..34 over 2..35 and dst uses columns 36..69 over 37..70,
# so both stay inside their own 35-column block. (The dst ratios used to
# start one column late and divided the last dst column by column 71, which
# lies outside the dst block.)
# The frames take the index of train / test instead of a fixed 10000 rows;
# 1e-20 is added to the divisor to avoid division by zero.
a = pd.DataFrame(
    train.iloc[:, 1:35].values / (train.iloc[:, 2:36].values + 1e-20),
    columns=srr_liss, index=train.index)
b = pd.DataFrame(
    test.iloc[:, 1:35].values / (test.iloc[:, 2:36].values + 1e-20),
    columns=srr_liss, index=test.index)
c = pd.DataFrame(
    train.iloc[:, 36:70].values / (train.iloc[:, 37:71].values + 1e-20),
    columns=dsr_liss, index=train.index)
d = pd.DataFrame(
    test.iloc[:, 36:70].values / (test.iloc[:, 37:71].values + 1e-20),
    columns=dsr_liss, index=test.index)
train = pd.concat([train, a], axis=1)
train = pd.concat([train, c], axis=1)
test = pd.concat([test, b], axis=1)
test = pd.concat([test, d], axis=1)

In [None]:
train.isnull().sum().sum(),test.isnull().sum().sum()

In [None]:
# Ratios between wavelengths that are j apart, for every gap j from 10 to 340.
# The original note reports a large score improvement from widening the gap.
def _finite(series):
    """Replace +inf with 1 and -inf with -1 in a Series."""
    return series.replace(np.inf, 1).replace(-np.inf, -1)


# (output column pattern, input column pattern) for the three feature families.
_ratio_patterns = (
    ('흡수nextrat{}_{}', '흡수계수_{}'),
    ('srcnextrat{}_{}', '{}_src'),
    ('dstnextrat{}_{}', '{}_dst'),
)

for j in range(10, 350, 10):
    for i in range(650, 1000 - j, 10):
        for frame in (train, test):
            for out_pattern, in_pattern in _ratio_patterns:
                numerator = _finite(frame[in_pattern.format(i + j)])
                # 1e-20 keeps the division defined when the divisor is 0.
                denominator = _finite(frame[in_pattern.format(i)]) + 1e-20
                frame[out_pattern.format(i, j)] = numerator / denominator

In [None]:
train.isnull().sum().sum(),test.isnull().sum().sum()

In [None]:
# Row-wise standardisation of the src and dst values: output column names
# ('..srstd' for the first 35 columns, '..dststd' for the next 35).
srstd_list = (
    ['{}srstd'.format(k) for k in range(65, 100)]
    + ['{}dststd'.format(k) for k in range(65, 100)]
)

# Standardise each row of columns 1:36 and 36:71 with that row's own mean
# and standard deviation, separately for train and test.
mean_1 = train.iloc[:, 1:36].mean(axis=1)
std_1 = train.iloc[:, 1:36].std(axis=1)
mean_2 = train.iloc[:, 36:71].mean(axis=1)
std_2 = train.iloc[:, 36:71].std(axis=1)

mean_1t = test.iloc[:, 1:36].mean(axis=1)
std_1t = test.iloc[:, 1:36].std(axis=1)
mean_2t = test.iloc[:, 36:71].mean(axis=1)
std_2t = test.iloc[:, 36:71].std(axis=1)

tra_1 = train.iloc[:, 1:36].sub(mean_1, axis=0).div(std_1, axis=0)
tra_2 = train.iloc[:, 36:71].sub(mean_2, axis=0).div(std_2, axis=0)
tra = pd.concat([tra_1, tra_2], axis=1)
tra.columns = srstd_list

tes_1 = test.iloc[:, 1:36].sub(mean_1t, axis=0).div(std_1t, axis=0)
tes_2 = test.iloc[:, 36:71].sub(mean_2t, axis=0).div(std_2t, axis=0)
tes = pd.concat([tes_1, tes_2], axis=1)
tes.columns = srstd_list

train = pd.concat([train, tra], axis=1)
test = pd.concat([test, tes], axis=1)

In [None]:
# Row-wise min-max scaling of the src and dst values: output column names
# ('..srmax' for the first 35 columns, '..dstmax' for the next 35).
srmax_list = (
    ['{}srmax'.format(k) for k in range(65, 100)]
    + ['{}dstmax'.format(k) for k in range(65, 100)]
)

# Min-max scale each row of columns 1:36 and 36:71 with that row's own
# minimum and maximum, separately for train and test.
max_1 = train.iloc[:, 1:36].max(axis=1)
min_1 = train.iloc[:, 1:36].min(axis=1)
max_2 = train.iloc[:, 36:71].max(axis=1)
min_2 = train.iloc[:, 36:71].min(axis=1)

max_1t = test.iloc[:, 1:36].max(axis=1)
min_1t = test.iloc[:, 1:36].min(axis=1)
max_2t = test.iloc[:, 36:71].max(axis=1)
min_2t = test.iloc[:, 36:71].min(axis=1)

tra_1 = train.iloc[:, 1:36].sub(min_1, axis=0).div(max_1 - min_1, axis=0)
tra_2 = train.iloc[:, 36:71].sub(min_2, axis=0).div(max_2 - min_2, axis=0)
tra = pd.concat([tra_1, tra_2], axis=1)
tra.columns = srmax_list

tes_1 = test.iloc[:, 1:36].sub(min_1t, axis=0).div(max_1t - min_1t, axis=0)
tes_2 = test.iloc[:, 36:71].sub(min_2t, axis=0).div(max_2t - min_2t, axis=0)
tes = pd.concat([tes_1, tes_2], axis=1)
tes.columns = srmax_list

train = pd.concat([train, tra], axis=1)
test = pd.concat([test, tes], axis=1)

In [None]:
# src and dst values divided by their row maximum: output column names
# ('..srmaratio' for the first 35 columns, '..dstmaratio' for the next 35).
srmaratio_list = (
    ['{}srmaratio'.format(k) for k in range(65, 100)]
    + ['{}dstmaratio'.format(k) for k in range(65, 100)]
)
# Divide each row of columns 1:36 and 36:71 by that row's maximum.
max_1 = train.iloc[:, 1:36].max(axis=1)
max_2 = train.iloc[:, 36:71].max(axis=1)

tra_1 = train.iloc[:, 1:36].div(max_1, axis=0)
tra_2 = train.iloc[:, 36:71].div(max_2, axis=0)
tra = pd.concat([tra_1, tra_2], axis=1)
tra.columns = srmaratio_list

max_1t = test.iloc[:, 1:36].max(axis=1)
max_2t = test.iloc[:, 36:71].max(axis=1)

# The temporaries tra_1 / tra_2 are reused for the test frame here.
tra_1 = test.iloc[:, 1:36].div(max_1t, axis=0)
tra_2 = test.iloc[:, 36:71].div(max_2t, axis=0)
tes = pd.concat([tra_1, tra_2], axis=1)
tes.columns = srmaratio_list

train = pd.concat([train, tra], axis=1)
test = pd.concat([test, tes], axis=1)

In [None]:
train.shape, test.shape

In [None]:
# Row-wise summary statistics of columns 1:36 (src) and 36:71 (dst):
# max, min, mean, standard deviation, range (max - min), and the ratio of
# the src range to the dst range.
for frame in (train, test):
    src_block = frame.iloc[:, 1:36]
    dst_block = frame.iloc[:, 36:71]
    frame['src_max'] = src_block.max(axis=1)
    frame['src_min'] = src_block.min(axis=1)
    frame['dst_max'] = dst_block.max(axis=1)
    frame['dst_min'] = dst_block.min(axis=1)
    frame['src_mean'] = src_block.mean(axis=1)
    frame['dst_mean'] = dst_block.mean(axis=1)
    frame['src_std'] = src_block.std(axis=1)
    frame['dst_std'] = dst_block.std(axis=1)
    frame['max_min_src'] = frame['src_max'] - frame['src_min']
    frame['max_min_dst'] = frame['dst_max'] - frame['dst_min']
    # Despite the column name, this is src range / (dst range + 0.01).
    frame['dst_src_max_min_ratio'] = frame['max_min_src'] / (frame['max_min_dst'] + 0.01)
train.head()

In [None]:
# Column names for the rho-multiplied copies of the src ('..srmul') and
# dst ('..dstmul') columns; the numeric prefix runs from 65 to 99.
srmul_list = (
    ['{}srmul'.format(k) for k in range(65, 100)]
    + ['{}dstmul'.format(k) for k in range(65, 100)]
)

In [None]:
# Multiply columns 1:71 by the squared distance rho, intended to give light
# intensities that do not depend on the measurement distance.
# `** 2` is required here: `^` is bitwise XOR in Python, not exponentiation.

a = train.iloc[:,1:71].apply(lambda x: x*(train['rho']**2))
a.columns = srmul_list
b = test.iloc[:,1:71].apply(lambda x: x*(test['rho']**2))
b.columns = srmul_list

train =pd.concat([train,a],axis=1)
test =pd.concat([test,b],axis=1)

In [None]:
train.shape, test.shape

In [None]:
# Column names for the 35 '..srmui' features (prefix 65 to 99).
srmui_list = ['{}srmui'.format(k) for k in range(65, 100)]

In [None]:
# src multiplied by the squared distance, then multiplied element-wise by dst.
# `** 2` is required here: `^` is bitwise XOR in Python, not exponentiation.
a = train.iloc[:,1:36].apply(lambda x: x*(train['rho']**2)).values*train.iloc[:,36:71].values
a = pd.DataFrame(a,columns=srmui_list)
b = test.iloc[:,1:36].apply(lambda x: x*(test['rho']**2)).values*test.iloc[:,36:71].values
b = pd.DataFrame(b,columns=srmui_list)

train =pd.concat([train,a],axis=1)
test =pd.concat([test,b],axis=1)

In [None]:
# src divided by the squared distance (intended as a distance-independent
# intensity) and then multiplied by dst: names of the 35 output columns.
srmuii_list = ['{}srmuii'.format(k) for k in range(65, 100)]

# src divided by the squared distance, then multiplied element-wise by dst.
# `** 2` is required here: `^` is bitwise XOR in Python, not exponentiation.
a = train.iloc[:,1:36].apply(lambda x: x/(train['rho']**2)).values*train.iloc[:,36:71].values
a = pd.DataFrame(a,columns=srmuii_list)
b = test.iloc[:,1:36].apply(lambda x: x/(test['rho']**2)).values*test.iloc[:,36:71].values
b = pd.DataFrame(b,columns=srmuii_list)

train =pd.concat([train,a],axis=1)
test =pd.concat([test,b],axis=1)

In [None]:
# Absorbed amount, (src - dst) / src: names of the 35 output columns.
# NOTE(review): these are the same '..srmuii' labels as the previous feature
# set, so the frames end up with duplicate column labels — confirm intent.
srmuii_list = ['{}srmuii'.format(k) for k in range(65, 100)]

# (src - dst) / (src + 1e-20) for train and test; the small offset avoids
# division by zero.
_train_src = train.iloc[:, 1:36].values
_train_dst = train.iloc[:, 36:71].values
_test_src = test.iloc[:, 1:36].values
_test_dst = test.iloc[:, 36:71].values

a = pd.DataFrame((_train_src - _train_dst) / (_train_src + 1e-20), columns=srmuii_list)
b = pd.DataFrame((_test_src - _test_dst) / (_test_src + 1e-20), columns=srmuii_list)
train = pd.concat([train, a], axis=1)
test = pd.concat([test, b], axis=1)

In [None]:
train.shape, test.shape

In [None]:
train.isnull().sum().sum()

In [None]:
train_y.head()

#### lgbm 모델은 dart 부스팅을 사용한다. 충분히 많은 라운드를 학습시킨 뒤 MAE가 최소가 되는 num_rounds를 찾아 그 값으로 재학습하며, 이를 반복한다.

In [None]:
# Hold out 15% of the rows to monitor validation MAE for the 'hhb' target.
train_x, train_cv, y, y_cv = train_test_split(
    train.values, train_y['hhb'], test_size=0.15, random_state=14414)


def lgbm_regressor(train_X, train_y, test_X, test_y, feature_names=None,
                   seed_val=288, num_rounds=27068):
    """Train a DART-boosted LightGBM regressor and return the fitted booster.

    Args:
        train_X: Training feature matrix.
        train_y: Training target values.
        test_X: Validation feature matrix, scored with MAE during training.
        test_y: Validation target values.
        feature_names: Optional column names forwarded to LightGBM;
            ``None`` keeps LightGBM's automatic naming.
        seed_val: Value stored in the ``seed`` parameter.
        num_rounds: Maximum number of boosting rounds.

    Returns:
        The booster returned by ``lgb.train``.
    """
    param = {
        'boosting': 'dart',
        'objective': 'regression',
        'learning_rate': 0.05,
        'max_depth': 10,
        'metric': 'mae',
        'is_training_metric': True,
        'min_child_weight': 1,
        'bagging_fraction': 0.8,
        'num_leaves': 128,
        'feature_fraction': 0.8,
        'bagging_freq': 6,
        'seed': seed_val,
        'min_split_gain': 0.01,
    }

    feature_name = 'auto' if feature_names is None else feature_names
    train_ds = lgb.Dataset(train_X, label=train_y, feature_name=feature_name)
    test_ds = lgb.Dataset(test_X, label=test_y, feature_name=feature_name)

    # The callback form works on LightGBM releases with and without the
    # `early_stopping_rounds` keyword of lgb.train.
    # NOTE(review): LightGBM reports early stopping as unavailable in dart
    # mode, so presumably all `num_rounds` rounds are run — confirm.
    return lgb.train(param, train_ds, num_boost_round=num_rounds,
                     valid_sets=[test_ds],
                     callbacks=[lgb.early_stopping(stopping_rounds=180)])


plt.rc('font', family='Malgun Gothic')
plt.rc('axes', unicode_minus=False)

model = lgbm_regressor(train_X=train_x, train_y=y, test_X=train_cv, test_y=y_cv)

# from matplotlib import pylab as plt

# fig, ax = plt.subplots(figsize=(12,18))

# lgb.plot_importance(model, max_num_featurs=50, height=0.8, ax=ax)

# plt.show()

In [None]:
# Predict 'hhb' for the test features and store the result in `sample`.
y_test =  model.predict(test.values)
sample['hhb'] = y_test

In [None]:
# Hold out 15% of the rows to monitor validation MAE for the 'hbo2' target.
train_x, train_cv, y, y_cv = train_test_split(
    train.values, train_y['hbo2'], test_size=0.15, random_state=14414)


def lgbm_regressor(train_X, train_y, test_X, test_y, feature_names=None,
                   seed_val=288, num_rounds=20396):
    """Train a DART-boosted LightGBM regressor and return the fitted booster.

    Args:
        train_X: Training feature matrix.
        train_y: Training target values.
        test_X: Validation feature matrix, scored with MAE during training.
        test_y: Validation target values.
        feature_names: Optional column names forwarded to LightGBM;
            ``None`` keeps LightGBM's automatic naming.
        seed_val: Value stored in the ``seed`` parameter.
        num_rounds: Maximum number of boosting rounds.

    Returns:
        The booster returned by ``lgb.train``.
    """
    param = {
        'boosting': 'dart',
        'objective': 'regression',
        'learning_rate': 0.05,
        'max_depth': 10,
        'metric': 'mae',
        'is_training_metric': True,
        'min_child_weight': 1,
        'bagging_fraction': 0.8,
        'num_leaves': 128,
        'feature_fraction': 0.8,
        'bagging_freq': 6,
        'seed': seed_val,
        'min_split_gain': 0.01,
    }

    feature_name = 'auto' if feature_names is None else feature_names
    train_ds = lgb.Dataset(train_X, label=train_y, feature_name=feature_name)
    test_ds = lgb.Dataset(test_X, label=test_y, feature_name=feature_name)

    # The callback form works on LightGBM releases with and without the
    # `early_stopping_rounds` keyword of lgb.train.
    # NOTE(review): LightGBM reports early stopping as unavailable in dart
    # mode, so presumably all `num_rounds` rounds are run — confirm.
    return lgb.train(param, train_ds, num_boost_round=num_rounds,
                     valid_sets=[test_ds],
                     callbacks=[lgb.early_stopping(stopping_rounds=180)])


plt.rc('font', family='Malgun Gothic')
plt.rc('axes', unicode_minus=False)

model = lgbm_regressor(train_X=train_x, train_y=y, test_X=train_cv, test_y=y_cv)

In [None]:
# Predict 'hbo2' for the test features and store the result in `sample`.
y_test =  model.predict(test.values)
sample['hbo2'] = y_test

In [None]:
train_x, train_cv, y, y_cv = train_test_split(train.values,train_y['ca'], test_size=0.15, random_state=12300)

def lgbm_regressor(train_X, train_y, test_X, test_y, feature_names=None, seed_val=288, num_rounds=43259):
    """Train a LightGBM regressor in DART mode and return the fitted model.

    Args:
        train_X: Feature matrix used for fitting.
        train_y: Target values aligned with ``train_X``.
        test_X: Held-out feature matrix passed to LightGBM as the validation set.
        test_y: Target values aligned with ``test_X``.
        feature_names: Unused; kept so existing callers keep working.
        seed_val: Value stored under the ``seed`` training parameter.
        num_rounds: Number of boosting rounds requested from ``lgb.train``.

    Returns:
        The object returned by ``lgb.train``.
    """
    param = {
        'boosting': 'dart',
        'objective': 'regression',
        'learning_rate': 0.05,
        'max_depth': 10,
        'metric': 'mae',
        'is_training_metric': True,
        'min_child_weight': 1,
        'bagging_fraction': 0.8,
        'num_leaves': 128,
        'feature_fraction': 0.8,
        'bagging_freq': 6,
        'seed': seed_val,
        'min_split_gain': 0.01,
    }

    train_ds = lgb.Dataset(train_X, label=train_y)
    test_ds = lgb.Dataset(test_X, label=test_y)

    # The `early_stopping_rounds=` keyword was removed from lgb.train in
    # LightGBM 4.0; the callback form is accepted by older and newer releases.
    # NOTE(review): LightGBM reports that early stopping is not available in
    # dart mode, so this 180-round patience likely has no effect while
    # 'boosting' is 'dart' -- confirm before relying on it.
    return lgb.train(
        param,
        train_ds,
        num_boost_round=num_rounds,
        valid_sets=[test_ds],
        callbacks=[lgb.early_stopping(stopping_rounds=180)],
    )

# Plot defaults for this cell: keep ASCII minus signs on the axes and use the
# Malgun Gothic font for text.
plt.rc('axes', unicode_minus=False)
plt.rc('font', family='Malgun Gothic')

# Fit the regressor on the training split, handing the held-out split to
# lgbm_regressor as its test_X / test_y pair.
model = lgbm_regressor(
    train_X=train_x,
    train_y=y,
    test_X=train_cv,
    test_y=y_cv,
)

In [None]:
# Predict on the raw test matrix; the same array is kept in `y_test` and
# written to the 'ca' column of `sample`.
sample['ca'] = y_test = model.predict(test.values)

In [None]:
# Hold out 15% of the rows (fixed seed) to validate the 'na' model.
train_x, train_cv, y, y_cv = train_test_split(
    train.values,
    train_y['na'],
    test_size=0.15,
    random_state=12300,
)

def lgbm_regressor(train_X, train_y, test_X, test_y, feature_names=None, seed_val=288, num_rounds=15248):
    """Train a LightGBM regressor in DART mode and return the fitted model.

    Args:
        train_X: Feature matrix used for fitting.
        train_y: Target values aligned with ``train_X``.
        test_X: Held-out feature matrix passed to LightGBM as the validation set.
        test_y: Target values aligned with ``test_X``.
        feature_names: Unused; kept so existing callers keep working.
        seed_val: Value stored under the ``seed`` training parameter.
        num_rounds: Number of boosting rounds requested from ``lgb.train``.

    Returns:
        The object returned by ``lgb.train``.
    """
    param = {
        'boosting': 'dart',
        'objective': 'regression',
        'learning_rate': 0.05,
        'max_depth': 10,
        'metric': 'mae',
        'is_training_metric': True,
        'min_child_weight': 1,
        'bagging_fraction': 0.8,
        'num_leaves': 128,
        'feature_fraction': 0.8,
        'bagging_freq': 6,
        'seed': seed_val,
        'min_split_gain': 0.01,
    }

    train_ds = lgb.Dataset(train_X, label=train_y)
    test_ds = lgb.Dataset(test_X, label=test_y)

    # The `early_stopping_rounds=` keyword was removed from lgb.train in
    # LightGBM 4.0; the callback form is accepted by older and newer releases.
    # NOTE(review): LightGBM reports that early stopping is not available in
    # dart mode, so this 180-round patience likely has no effect while
    # 'boosting' is 'dart' -- confirm before relying on it.
    return lgb.train(
        param,
        train_ds,
        num_boost_round=num_rounds,
        valid_sets=[test_ds],
        callbacks=[lgb.early_stopping(stopping_rounds=180)],
    )

# Plot defaults for this cell: keep ASCII minus signs on the axes and use the
# Malgun Gothic font for text.
plt.rc('axes', unicode_minus=False)
plt.rc('font', family='Malgun Gothic')

# Fit the regressor on the training split, handing the held-out split to
# lgbm_regressor as its test_X / test_y pair.
model = lgbm_regressor(
    train_X=train_x,
    train_y=y,
    test_X=train_cv,
    test_y=y_cv,
)

In [None]:
# Predict on the raw test matrix; the same array is kept in `y_test` and
# written to the 'na' column of `sample`.
sample['na'] = y_test = model.predict(test.values)

In [None]:
# Negative values cannot occur for these targets, so they are replaced with 0.
def zero_fill(x):
    """Return ``0.0`` when ``x`` is zero or negative, otherwise ``x`` unchanged.

    The comparison is written as ``x <= 0`` so that a NaN input (for which
    the comparison is false) is passed through untouched.
    """
    return .0 if x <= 0 else x

In [None]:
# Clamp every predicted target at zero. Previously only 'hhb' and 'na' were
# clamped, so a negative 'hbo2' or 'ca' prediction would have been written out
# unchanged even though none of the targets can be negative.
for target_col in ('hhb', 'hbo2', 'ca', 'na'):
    sample[target_col] = sample[target_col].map(zero_fill)

# Last expression of the cell: show summary statistics as a sanity check.
sample.describe()

In [None]:
# Write the predictions to disk without the DataFrame index column.
sample.to_csv(path_or_buf='TaeHyun.csv', index=False)

# final CSV Making 

## 가중평균 기법 사용

random_state만 바꿔서 만든 csv들을 하나의 집단으로, 다른 피쳐들로 만든 csv들을 또 하나의 집단으로 묶음

나온 결과값에 대하여 성능이 좋은 모델에 가중치를 더 두어 평균을 냄

In [None]:
# First group: equal-weight average of the four edit_dst* prediction files.
a = pd.read_csv('edit_dst.csv')
b = pd.read_csv('edit_dst_998.csv')
c = pd.read_csv('edit_dst22882.csv')
d = pd.read_csv('edit_dst_3402.csv')

sub = (a + b + c + d) / 4

# Second group: equal-weight average of the two remaining prediction files.
t1 = pd.read_csv('TaeHyun.csv')
t2 = pd.read_csv('Making.csv')

t = (t1 + t2) / 2

# Weighted blend of the two groups: 70% first group, 30% second group.
sub2 = (0.7 * sub) + (0.3 * t)

# The arithmetic above was applied to every column, so overwrite 'id' with the
# ids from `submission`. Item assignment is used instead of `sub2.id = ...`
# because attribute assignment only sets an attribute (not a column) when the
# column does not already exist.
sub2['id'] = submission['id']

sub2.to_csv('es_final.csv', index=False)