<a href="https://colab.research.google.com/github/ssumannb/Rader_performance_prediction/blob/main/train.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import pandas as pd
import random
import os
import numpy as np
import matplotlib.pyplot as plt

from sklearn import linear_model
from sklearn.model_selection import train_test_split

In [3]:
def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
seed_everything(42) # Seed 고정

In [4]:
PATH = "./drive/MyDrive/Colab Notebooks/radarsensor_fault_prediction"

In [5]:
train_df = pd.read_csv(f'{PATH}/train.csv')
train_x = train_df.filter(regex='X') # Input : X Featrue
train_y = train_df.filter(regex='Y') # Output : Y Feature

In [6]:
train_x, valid_x, train_y, valid_y = train_test_split(train_x, train_y, test_size=0.2)

In [None]:
COL_CAT = []
COL_CON = []

In [None]:
for col in train_x:
  print(col,len(train_x[col].unique()))

#### 데이터 전처리
1. 결측값, 이상값 처리
2. 변수 변환 (범주형, 연속형, 스케일링)

In [9]:
# feature scaling
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
scaler.fit(train_x)
train_x_scaled = scaler.transform(train_x)
train_x_scaled = pd.DataFrame(train_x_scaled, columns=train_x.columns)

# xy_corr(train_x_scaled,'standard scaling')
train_x_scaled.T.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,31675,31676,31677,31678,31679,31680,31681,31682,31683,31684
X_01,0.801827,-0.349655,0.801827,-0.349655,-0.349655,-1.885089,1.185403,-0.349655,-0.733607,0.417874,...,0.033922,-0.349655,-1.11756,-0.733607,0.033922,0.417874,0.033922,1.185403,0.033922,-0.349655
X_02,-0.443966,-0.443966,-0.443966,-0.443966,-0.443966,-0.443966,-0.443966,-0.443966,-0.443966,-0.443966,...,-0.443966,-0.443966,2.252424,-0.443966,-0.443966,-0.443966,-0.443966,-0.443966,-0.443966,-0.443966
X_03,-0.514182,-0.708495,0.690559,-0.63077,-0.708495,-1.038828,-0.319869,-0.727927,-0.572476,-0.339301,...,0.807146,-0.980534,-1.272003,-1.07769,-0.669633,0.612833,0.554539,-0.397594,0.360226,-0.805652
X_04,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
X_05,1.479525,-0.528687,1.486815,-0.814793,-0.716387,-0.705453,-0.630738,-0.699986,-0.639849,-0.543266,...,-0.949646,-0.517753,-0.481306,-0.528687,-0.643494,-0.853063,-0.734611,1.490459,-0.678118,-0.592469


#### Y값 정상/비정상으로 범주화

In [None]:
y_spec_info = pd.read_csv(f'{PATH}/meta/y_feature_spec_info.csv', index_col='Feature')
y_spec_info.columns=['min_val','max_val']
y_spec_info.head(14)

In [None]:
train_y_class = pd.DataFrame()

for col in list(train_y.columns):
  train_y_class[col] = train_y[col].apply(lambda x: 1 if x >= y_spec_info.loc[col, 'min_val'] and x <= y_spec_info.loc[col, 'max_val'] else 0)

train_y_class.head()

In [None]:
train_y_class.info()

In [None]:
train_y_class['label'] = train_y_class.sum(axis=1)

In [None]:
train_y_class['label'] = train_y_class['label'].apply(lambda x: True if x == 14.0 else False)

In [None]:
train_y_class['label'].value_counts()

True     28547
False     3138
Name: label, dtype: int64

In [None]:
train_y['label'] = train_y_class['label']

In [None]:
train_y.columns

Index(['Y_01', 'Y_02', 'Y_03', 'Y_04', 'Y_05', 'Y_06', 'Y_07', 'Y_08', 'Y_09',
       'Y_10', 'Y_11', 'Y_12', 'Y_13', 'Y_14', 'label'],
      dtype='object')

#### X Feature and Y Feature 상관관계 체크
* sklearn.feature_selectio.r_regression()  
: target에 대한 각 feature의 pearson's correlation을 계산한다.  

** 함수화 하여 feature engineering 결과 시각화 시 사용하기

In [None]:
from sklearn.feature_selection import r_regression


def xy_corr(x:pd.DataFrame, title:str):
  correlations = []
  Y = train_y[:]
  for y_ in list(Y.columns):
    correlations.append(r_regression(x, train_y[y_]))

  plt.figure(figsize=(15, 5))
  im = plt.imshow(correlations, cmap='cool', interpolation='nearest', aspect='auto')
  plt.colorbar(im)
  plt.xticks(np.arange(0, len(x.columns)-1), rotation=45, labels=list(x.columns))    
  plt.yticks(np.arange(0,len(train_y.columns)-1), labels=list(train_y.columns))
  plt.title(title)
  plt.savefig(f'{PATH}/features/xy_corr({title}).png')
  # plt.show()

In [None]:
xy_corr(train_x, "original x features")

In [None]:
print(train_x.info())
print(valid_x.info())

In [None]:
train_x_scaled_ = train_x_scaled.drop(['X_04', 'X_23', 'X_47', 'X_48'], axis=1)
xy_corr(train_x_scaled_, 'drop n차검사여부')

In [None]:
alpha_range = np.arange(0.1, 1, 0.1)
print(alpha_range)

for _alpha in alpha_range:
  MTLReg = linear_model.MultiTaskLasso(alpha=_alpha)
  MTLReg.fit(train_x, train_y)
  print(f'alpha({_alpha})',MTLReg.score(valid_x, valid_y))
  print(MTLReg.coef_)
  plt.imshow(MTLReg.coef_, cmap='cool', interpolation='nearest')
  plt.show()

#### 파이프라인

데이터 스케일링  
PCA  
모델(XGBoost, RF Regressor, LigntGBM, SVR, KernelRidge)  
*https://data-newbie.tistory.com/186

In [7]:
from sklearn.pipeline import Pipeline
from sklearn.model_selection import ShuffleSplit
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.linear_model import LassoCV , ElasticNetCV , RidgeCV
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import PolynomialFeatures
from sklearn.decomposition import PCA 
from sklearn.metrics import mean_squared_error as mse
from sklearn.metrics import r2_score as r2
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import GradientBoostingRegressor as GBR
from sklearn.ensemble import RandomForestRegressor as RFR
from sklearn.cross_decomposition import PLSRegression as  PLS
from sklearn.svm import SVR
from sklearn.kernel_ridge import KernelRidge
import seaborn as sns

In [8]:
cv = ShuffleSplit(n_splits=5 , test_size=0.3, random_state=42)
pipe_linear = Pipeline([('scl', StandardScaler()),
                        ('poly', PolynomialFeatures()),
                        ('fit', LinearRegression())])
pipe_lasso = Pipeline([('scl', StandardScaler()),
                       ('poly', PolynomialFeatures()),
                       ('fit', Lasso(random_state = 42))])
pipe_ridge = Pipeline([('scl', StandardScaler()),
                       ('poly', PolynomialFeatures()),
                       ('fit', Ridge(random_state = 42))])
pipe_pca = Pipeline([('scl', StandardScaler()),
                     ('pca', PCA()),
                     ('fit', Ridge(random_state = 42))])
pipe_pls = Pipeline([('scl', StandardScaler()),
                     ('fit', PLS())])
pipe_gbr = Pipeline([('scl', StandardScaler()),
                     ('fit', GBR())])
pipe_rfr = Pipeline([('scl', StandardScaler()),
                     ('fit', RFR())])
pipe_svr = Pipeline([('scl', StandardScaler()),
                     ('fit', SVR())])
pipe_KR = Pipeline([('scl', StandardScaler()),
                    ('fit', KernelRidge())])

pipes = [
    pipe_linear , pipe_lasso ,  pipe_pca ,
    pipe_ridge , pipe_pls , pipe_gbr , 
    pipe_rfr , pipe_svr , pipe_KR 
]
pipes_label = [
    'linear', 'lasso', 'pca', 'ridge',
    'pls', 'gbr', 'rfr', 'svr', 'KR'
]

In [9]:
pipes_dict = {}
for i, pipe in enumerate(pipes):
  _pipes = []
  for _ in range(14):
    _pipes.append(pipe)
  
  pipes_dict[pipes_label[i]] = _pipes

pipes_dict

{'linear': [Pipeline(steps=[('scl', StandardScaler()), ('poly', PolynomialFeatures()),
                  ('fit', LinearRegression())]),
  Pipeline(steps=[('scl', StandardScaler()), ('poly', PolynomialFeatures()),
                  ('fit', LinearRegression())]),
  Pipeline(steps=[('scl', StandardScaler()), ('poly', PolynomialFeatures()),
                  ('fit', LinearRegression())]),
  Pipeline(steps=[('scl', StandardScaler()), ('poly', PolynomialFeatures()),
                  ('fit', LinearRegression())]),
  Pipeline(steps=[('scl', StandardScaler()), ('poly', PolynomialFeatures()),
                  ('fit', LinearRegression())]),
  Pipeline(steps=[('scl', StandardScaler()), ('poly', PolynomialFeatures()),
                  ('fit', LinearRegression())]),
  Pipeline(steps=[('scl', StandardScaler()), ('poly', PolynomialFeatures()),
                  ('fit', LinearRegression())]),
  Pipeline(steps=[('scl', StandardScaler()), ('poly', PolynomialFeatures()),
                  ('fit', Linea

In [None]:
pipes_dict

In [10]:
def lg_nrmse(gt, preds):
    # 각 Y Feature별 NRMSE 총합
    # Y_01 ~ Y_08 까지 20% 가중치 부여
    all_nrmse = []
    for idx in range(1,15): # ignore 'ID'
        rmse = metrics.mean_squared_error(gt[:,idx], preds[:,idx], squared=False)
        nrmse = rmse/np.mean(np.abs(gt[:,idx]))
        all_nrmse.append(nrmse)
    score = 1.2 * np.sum(all_nrmse[:8]) + 1.0 * np.sum(all_nrmse[8:14])
    return score

In [None]:
scores_dict = {}
preds_dict = {}
nrmses_dict = {}

for pipe_key, pipelines in pipes_dict.items():
  scores = []
  preds = []
  for i, pipe in enumerate(pipelines):
    train_y_1d = train_y[train_y.columns[i]]
    valid_y_1d = valid_y[valid_y.columns[i]]
    pipe.fit(train_x, train_y_1d)
    pred = pipe.predict(valid_x)
    preds.append(pred)
    scores.append(pipe.score(valid_x, valid_y_1d))
  
  scores_dict[pipe_key] = scores
  preds_dict[pipe_key] = pipe_key
#  nrmses_dict[pipe_key] = lg_nrmse(valid_y, preds)

scores_dict
preds_dict

In [20]:
scores = []
preds = []
nrmses = []

for pipe in pipes:
  pipe.fit(train_x, train_y)
  preds.append(pipe.predict(valid_x))
  scores.append(pipe.score(valid_x, valid_y))

for pred in preds:
  nrmses.append(lg_nrmse(valid_y, pred))

ValueError: ignored

In [None]:
### 
grid_params_linear = [{
    "poly__degree" : np.arange(1,3), 
    "fit__fit_intercept" : [True, False], 
}]
grid_params_lasso = [{
    "poly__degree" : np.arange(1,3),
    "fit__tol" : np.logspace(-5,0,10) ,
    "fit__alpha" : np.logspace(-5,1,10) ,     
                     }]
grid_params_pca = [{
    "pca__n_components" : np.arange(2,8)
}]
grid_params_ridge = [{
    "poly__degree" : np.arange(1,3),
    "fit__alpha" : np.linspace(2,5,10) ,
    "fit__solver" : [ "cholesky","lsqr","sparse_cg"] ,
    "fit__tol" : np.logspace(-5,0,10) ,
                     }]
grid_params_pls = [{
    "fit__n_components" : np.arange(2,8)
}]
min_samples_split_range = [0.5, 0.7 , 0.9]

grid_params_gbr =[{
    "fit__max_features" : ["sqrt","log2"] ,
    "fit__loss" : ["ls","lad","huber","quantile"] , 
    "fit__max_depth" : [5,6,7,8] ,
    "fit__min_samples_split" : min_samples_split_range ,
}]
grid_params_rfr =[{
    "fit__max_features" : ["sqrt","log2"] , 
    "fit__max_depth" : [5,6,7,8] ,
    "fit__min_samples_split" : min_samples_split_range ,
}]
grid_params_svr =[{
    "fit__kernel" : ["rbf", "linear"] ,
    "fit__degree" : [2, 3, 5] , 
    "fit__gamma" : np.logspace(-5,1,10) ,
}]
grid_params_KR =[{
    "fit__kernel" : ["rbf","linear"] , 
    "fit__gamma" : np.logspace(-5,1,10) ,
}]
pipe = [
    pipe_linear , pipe_lasso ,  pipe_pca ,
    pipe_ridge , pipe_pls , pipe_gbr , 
    pipe_rfr , pipe_svr , pipe_KR 
]

params = [
    grid_params_linear , grid_params_lasso , grid_params_pca,
    grid_params_ridge , grid_params_pls , grid_params_gbr ,
    grid_params_rfr , grid_params_svr , grid_params_KR
]

In [None]:
del train_y['label']

In [None]:
train_y0 = train_y['Y_01']
valid_y0 = valid_y['Y_01']

In [None]:
jobs = 20

grid_dict = {
    0: 'Linear', 
    1: 'Lasso', 
    2: 'pca regression' , 
    3: 'Ridge' ,
    4: 'PLSRegression',
    5: "GradientDescentRegressor" ,
    6: "RandomForestRegressor" ,
    7: "SupportVectorRegressor" ,
    8: "Kernel RidgeRegression"
            }

model_mse = {}
model_r2 = {}
model_best_params = {}

for idx , (param , model) in enumerate(zip(params , pipe)) :
    search = GridSearchCV(model, param, scoring  = "neg_mean_squared_error", cv=cv, n_jobs=jobs , verbose=-1)
    search.fit(train_x, train_y0)
    y_pred = search.predict(valid_x)
    model_mse[grid_dict.get(idx)] = mse(valid_y0, y_pred)  
    model_r2[grid_dict.get(idx)] = r2(valid_y0, y_pred)  
    model_best_params[grid_dict.get(idx)] = search.best_params_
print("finish")

fig ,ax = plt.subplots(figsize=(20, 10))
sns.set(font_scale = 2)
output = pd.DataFrame([model_r2.keys() , model_r2.values()], index = ["algo","r2"]).T
output.sort_values(["r2"], ascending= False ,inplace=True)
ax = sns.barplot(y="algo", x="r2", data=output)
plt.show()

#### 결과 저장

In [None]:
def lg_nrmse(gt, preds):
    # 각 Y Feature별 NRMSE 총합
    # Y_01 ~ Y_08 까지 20% 가중치 부여
    all_nrmse = []
    for idx in range(1,15): # ignore 'ID'
        rmse = metrics.mean_squared_error(gt[:,idx], preds[:,idx], squared=False)
        nrmse = rmse/np.mean(np.abs(gt[:,idx]))
        all_nrmse.append(nrmse)
    score = 1.2 * np.sum(all_nrmse[:8]) + 1.0 * np.sum(all_nrmse[8:14])
    return score

In [None]:
test_x = pd.read_csv('./test.csv').drop(columns=['ID'])

In [None]:
submit = pd.read_csv('./sample_submission.csv')

In [None]:
for idx, col in enumerate(submit.columns):
    if col=='ID':
        continue
    submit[col] = preds[:,idx-1]
print('Done.')

In [None]:
submit.to_csv('./submit.csv', index=False)