# Feature Engineering
1. 푸리에 급수이용   
[데이콘 대회 참고](https://dacon.io/competitions/official/235608/codeshare/1130?page=1&dtype=recent&ptype=pub)  
[푸리에 변환 설명 참고](https://www.youtube.com/watch?v=spUNpyF58BY&feature=youtu.be)   


    - 푸리에 급수: 주기 함수를 삼각함수의 가중치로 분해한 급수
    - 푸리에 변환: 입력 신호를 다양한 주파수를 갖는 주기함수들의 합으로 분해하여 표현하는 방법 
    
![image](https://user-images.githubusercontent.com/33725048/85993286-0ea3c880-ba31-11ea-9dab-418597e6b786.png)


[주파수 영역 이해 참고 사진](https://en.wikipedia.org/wiki/Fourier_transform)
    - 푸리에 스펙트럼: 주파수 성분이 원 신호에 얼마나 강하게 포함되어 있는가(주파수 성분의 강도)



2. 통계적 특질 사용
    - 신호를 분석할 때 통계 특질, 피크 특질, 주파수 특질 등을 통해 특질을 추출해낼 수 있는데 그 중 통계적  특질을 이용 
    - [신호데이터 분석을 할 때 특질을 공부하며 작성했던 다소 미흡한 블로그 링크입니다.(R Programming)](https://ssung-22.tistory.com/10)

In [None]:
import pandas as pd
import warnings
import missingno as msno

import seaborn as sns
import matplotlib.pyplot as plt
from tqdm import tqdm

from sklearn.model_selection import train_test_split
import lightgbm as lgbm

# Install an 'ignore' filter with no category or module restriction, so
# warnings raised from this point on are suppressed.
warnings.filterwarnings('ignore')

In [4]:
# Read the training and test tables from the local data/ directory.
train = pd.read_csv('data/train.csv')
test = pd.read_csv('data/test.csv')

# Show the column labels of both tables, training table first.
for frame in (train, test):
    print(frame.columns)

Index(['layer_1', 'layer_2', 'layer_3', 'layer_4', '0', '1', '2', '3', '4',
       '5',
       ...
       '216', '217', '218', '219', '220', '221', '222', '223', '224', '225'],
      dtype='object', length=230)
Index(['id', '0', '1', '2', '3', '4', '5', '6', '7', '8',
       ...
       '216', '217', '218', '219', '220', '221', '222', '223', '224', '225'],
      dtype='object', length=227)


In [5]:
# Separate the independent variables (features) from the dependent
# variables (targets).
# Training table: the first four columns are the targets, the rest are features.
train_Y = train.iloc[:, :4]
train_X = train.iloc[:, 4:]
# Test table: everything after the leading column is a feature.
test_X = test.iloc[:, 1:]

### 푸리에 급수 적용 

In [None]:
import numpy as np

feature_col = list(train_X)

# Row-wise discrete Fourier transform of the training features.
# Each row is centred on its own mean and transformed with the orthonormal
# FFT; the real and imaginary parts become two separate sets of columns.
#
# The rows are processed a chunk at a time with vectorised NumPy calls
# instead of one ``.loc`` assignment per row, which keeps the temporary
# complex spectrum small while avoiding the slow per-row pandas loop.
# Both parts now come from the same centred signal (previously the
# imaginary copy was shifted by the mean of the already-centred row).
signal = np.asarray(train_X[feature_col], dtype=float)
fft_real = np.empty_like(signal)
fft_imag = np.empty_like(signal)

chunk_rows = 10000
for start in range(0, len(signal), chunk_rows):
    stop = start + chunk_rows
    rows = signal[start:stop]
    # Remove each row's mean before transforming it.
    centered = rows - rows.mean(axis=1, keepdims=True)
    spectrum = np.fft.fft(centered, axis=1, norm='ortho')
    fft_real[start:stop] = spectrum.real
    fft_imag[start:stop] = spectrum.imag

# Column labels: original feature name plus a suffix naming the part.
real_part = [col + '_fft_real' for col in feature_col]
imag_part = [col + '_fft_imag' for col in feature_col]

alpha_real = pd.DataFrame(fft_real, index=train_X.index, columns=real_part)
alpha_imag = pd.DataFrame(fft_imag, index=train_X.index, columns=imag_part)

# Real columns first, imaginary columns second, aligned on train_X's index.
alpha = pd.concat((alpha_real, alpha_imag), axis=1)

In [None]:
import numpy as np

feature_col = list(test_X)

# Row-wise discrete Fourier transform of the test features.
# Each row is centred on its own mean and transformed with the orthonormal
# FFT; the real and imaginary parts become two separate sets of columns.
#
# The rows are processed a chunk at a time with vectorised NumPy calls
# instead of one ``.loc`` assignment per row, which keeps the temporary
# complex spectrum small while avoiding the slow per-row pandas loop.
# Both parts now come from the same centred signal (previously the
# imaginary copy was shifted by the mean of the already-centred row).
signal = np.asarray(test_X[feature_col], dtype=float)
fft_real = np.empty_like(signal)
fft_imag = np.empty_like(signal)

chunk_rows = 10000
for start in range(0, len(signal), chunk_rows):
    stop = start + chunk_rows
    rows = signal[start:stop]
    # Remove each row's mean before transforming it.
    centered = rows - rows.mean(axis=1, keepdims=True)
    spectrum = np.fft.fft(centered, axis=1, norm='ortho')
    fft_real[start:stop] = spectrum.real
    fft_imag[start:stop] = spectrum.imag

# Column labels: original feature name plus a suffix naming the part.
real_part = [col + '_fft_real' for col in feature_col]
imag_part = [col + '_fft_imag' for col in feature_col]

alpha_real = pd.DataFrame(fft_real, index=test_X.index, columns=real_part)
alpha_imag = pd.DataFrame(fft_imag, index=test_X.index, columns=imag_part)
alpha_test = pd.concat((alpha_real, alpha_imag), axis=1)

# Append the Fourier columns to the raw test features.
test_X = pd.concat((test_X, alpha_test), axis=1)


In [None]:
# Append the Fourier feature columns to the raw training features.
train_X = pd.concat([train_X, alpha], axis='columns')

In [None]:
import pickle

# Save the training-set Fourier features to disk.
# The context manager closes the file as soon as the dump finishes instead
# of leaving the handle open until garbage collection.
# (pandas' own ``alpha.to_pickle(...)`` is an alternative way to write it.)
with open('DFT.pkl', 'wb') as dft_file:
    pickle.dump(alpha, dft_file)

### train test split

In [None]:
# Hold out 5% of the rows for validation.
# The shuffle is seeded so the same rows are held out on every run, which
# makes validation scores comparable between experiments.
trn_X, tst_X, trn_y, tst_y = train_test_split(
    train_X, train_Y, test_size=0.05, shuffle=True, random_state=42
)

In [None]:
# Dataset setup
# NOTE(review): trn_y / tst_y still hold every target column here, and both
# train_set and valid_set are reassigned per target column inside the
# training loop further down — confirm whether this cell is still needed.
train_set = lgbm.Dataset(trn_X, trn_y)
valid_set = lgbm.Dataset(tst_X, tst_y)

In [None]:

# Baseline parameter settings passed to every lgbm.train call.
# NOTE(review): the training calls also pass num_boost_round=1000 while
# n_estimators is 500 here — check which of the two LightGBM honours.
lgb_param = dict(
    objective='regression',
    n_estimators=500,
    drop_rate=0.8,
    skip_drop=0.8,
    learning_rate=0.5,
    max_depth=6,
    random_state=42,
    metric='l1',
    colsample_bytree=0.7,
    subsample=0.7,
)

In [None]:
# Down-cast both splits to 32-bit floats, one after the other, then report
# the training split's memory footprint.
compact_dtype = 'float32'
trn_X = trn_X.astype(compact_dtype)
tst_X = tst_X.astype(compact_dtype)

trn_X.info(memory_usage='deep')

In [None]:
# Fit one LightGBM model per target column and keep them keyed by column name.
models = {}

for col in trn_y.columns:
    # Fresh train/validation datasets whose label is just this target column.
    train_set = lgbm.Dataset(data=trn_X, label=trn_y[col])
    valid_set = lgbm.Dataset(data=tst_X, label=tst_y[col])
    model = lgbm.train(
        params=lgb_param,
        train_set=train_set,
        num_boost_round=1000,
        valid_sets=valid_set,
        verbose_eval=10,
    )
    models[col] = model

In [None]:
# Load the submission template, using its first column as the index.
sample_sub = pd.read_csv('data/sample_submission.csv', index_col=0)
# Bare expression as the last line of the cell (presumably to display the frame).
sample_sub

In [None]:
# Fill each target column of the submission with that target's predictions
# on the test features.
for col in train_Y.columns:
    sample_sub[col] = models[col].predict(test_X)

In [None]:
sample_sub.to_csv('lgbm_baseline.csv')  # scored 33.9 points

### 통계적 특질 적용 

In [None]:
# 데이터 수를 고려하여 신호의 크기를 나타내는 값 
import numpy as np
from scipy.stats import kurtosis, iqr

def rms(x):
    """Return the root mean square of ``x``.

    ``x`` must support ``** 2`` element-wise (e.g. a NumPy array or a
    pandas Series); the result is the square root of the mean of the
    squared values.
    """
    mean_square = np.mean(x**2)
    return np.sqrt(mean_square)

def rss(x):
    """Return the root sum of squares of ``x``, i.e. ``sqrt(sum(x**2))``.

    This equals the root mean square scaled by ``sqrt(len(x))``. The
    previous implementation scaled by ``len(x)`` itself, which overstated
    the value by a factor of ``sqrt(len(x))``.

    Args:
        x: One-dimensional numeric data supporting ``** 2`` and ``len``
            (e.g. a NumPy array or a pandas Series).

    Returns:
        The root sum of squares of ``x``.

    Raises:
        TypeError: If ``x`` has no length (for example a scalar).
    """
    # len() is taken up front so inputs without a length still raise
    # TypeError, as they did before.
    count = len(x)
    return np.sqrt(np.mean(x**2)) * np.sqrt(count)

# Skewness

def skewness(x):
    """Return the skewness of ``x``.

    Computed as the third central moment divided by the second central
    moment raised to the power 3/2, both moments being divided by
    ``len(x)``.
    """
    count = len(x)
    deviations = x - np.mean(x)
    third_moment = sum(deviations**3) / count
    second_moment = sum(deviations**2) / count
    return third_moment / second_moment**(3/2)

In [None]:
# Summary statistics handed to DataFrame.aggregate below: four are given by
# name as strings, the last two are the skewness and rss callables.
function_list = ['mean', 'min', 'max', 'std', skewness, rss]

In [None]:
# Reduce every training row to the statistics listed in function_list.
# NOTE(review): an earlier cell already appended the Fourier columns to
# train_X, so the statistics span those columns too — confirm this is intended.
train_X2 = train_X.agg(func=function_list, axis=1)

In [None]:
# Reduce every test row to the statistics listed in function_list.
# NOTE(review): an earlier cell already appended the Fourier columns to
# test_X, so the statistics span those columns too — confirm this is intended.
test_X2 = test_X.agg(func=function_list, axis=1)

In [None]:
import pickle

# Reload the Fourier features written to DFT.pkl.
# The context manager closes the file once loading is done instead of
# leaving the handle open.
# NOTE: unpickling can execute arbitrary code — only load files that were
# produced locally by this notebook.
with open('DFT.pkl', 'rb') as dft_file:
    alpha = pickle.load(dft_file)

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 5% of the rows of the statistical-feature table for validation.
# The shuffle is seeded so the same rows are held out on every run, which
# makes validation scores comparable between experiments.
trn_X, tst_X, trn_y, tst_y = train_test_split(
    train_X2, train_Y, test_size=0.05, shuffle=True, random_state=42
)

In [None]:
import lightgbm as lgbm

In [None]:
# Fit one LightGBM model per target column and keep them keyed by column name.
models = {}

for col in trn_y.columns:
    # Fresh train/validation datasets whose label is just this target column.
    train_set = lgbm.Dataset(data=trn_X, label=trn_y[col])
    valid_set = lgbm.Dataset(data=tst_X, label=tst_y[col])
    model = lgbm.train(
        params=lgb_param,
        train_set=train_set,
        num_boost_round=1000,
        valid_sets=valid_set,
        verbose_eval=10,
    )
    models[col] = model

In [None]:
# Load the submission template, using its first column as the index.
sample_sub = pd.read_csv('data/sample_submission.csv', index_col=0)
# Bare expression as the last line of the cell (presumably to display the frame).
sample_sub

In [None]:
# Fill each target column of the submission with that target's predictions
# on the statistical test features.
for col in train_Y.columns:
    sample_sub[col] = models[col].predict(test_X2)

In [None]:
sample_sub.to_csv('lgbm_baseline_summary0627.csv')  # scored 63 points

통계적 특질만 적용하면 성능이 매우 낮음 (푸리에 급수 적용: 33.9점 → 통계적 특질만 적용: 63점).

푸리에급수를 적용한 것과 통계적 특질 적용한 것 모두 사용하면 메모리 에러 발생.