In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as stats
from sklearn.model_selection import train_test_split

from sklearn.svm import SVR
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.model_selection import GridSearchCV

from sklearn.metrics import mean_absolute_error

import warnings
warnings.filterwarnings(action='ignore')

In [3]:
os.chdir("/content/drive/MyDrive/암빅데이터_경진대회/최종코드")

## [1] 데이터 로드

In [4]:
# 학습데이터 불러오기 (for Training)
df_x_tr = pd.read_csv("./data/preprocessed/data_x_tr.csv")
df_y_tr = pd.read_csv("./data/preprocessed/data_y_tr.csv")
df_x_tr.drop(['Unnamed: 0'],axis=1, inplace=True)  # (10000,9)
df_y_tr.drop(['Unnamed: 0'],axis=1, inplace=True)  # (10000,1)

In [5]:
# 검증데이터 불러오기 (for Test)
df_x_ts = pd.read_csv("./data/preprocessed/data_x_ts.csv")
df_y_ts = pd.read_csv("./data/preprocessed/data_y_ts.csv")
df_x_ts.drop(['Unnamed: 0'],axis=1, inplace=True)  # (5000,9)
df_y_ts.drop(['Unnamed: 0'],axis=1, inplace=True)  # (5000,1)

In [6]:
# Train, Valid 분할 - train_test_split 사용
x_train, x_val, y_train, y_val = train_test_split(df_x_tr, df_y_tr, test_size=0.1, shuffle=True, stratify=df_y_tr, random_state=23)

In [7]:
# index 번호 0부터 순서대로 reset.
x_train.reset_index(drop=True, inplace=True)
x_val.reset_index(drop=True, inplace=True)
y_train.reset_index(drop=True, inplace=True)
y_val.reset_index(drop=True, inplace=True)  

In [8]:
print("x_train의 shape:",x_train.shape)
print("x_val의 shape:",x_val.shape)
print("y_train의 shape:",y_train.shape)
print("y_val의 shape:",y_val.shape)

x_train의 shape: (9000, 9)
x_val의 shape: (1000, 9)
y_train의 shape: (9000, 1)
y_val의 shape: (1000, 1)


## [2] Modeling

### Support Vector Machine 

In [28]:
## RBF kernel SVM
# gamma와 C 조절하며 최적화하기

def get_best_params(C_range, gamma_range, reg):
    param_grid = dict(gamma = gamma_range, C = C_range)
    cv = StratifiedShuffleSplit(n_splits = 5, test_size = 0.2, random_state = 32)

    grid = GridSearchCV(reg, param_grid = param_grid, cv = cv, scoring = 'neg_mean_absolute_error')
    grid.fit(x_train, y_train)

    print("최적 하이퍼 파라미터 :",grid.best_params_)
    mae= -1*grid.best_score_
    print("최적 mae :", mae)

    return grid.best_params_, grid.best_estimator_


C_range = np.array([1.e+00, 1.e+03])  # 1.e+05
gamma_range = np.array([1.e-06, 1.e-02])  # 1.e+00
svr_reg = SVR(kernel='rbf')

svc_params, best_svc = get_best_params(C_range, gamma_range, svr_reg)

최적 하이퍼 파라미터 : {'C': 1.0, 'gamma': 0.01}
최적 mae : 0.9828635811201947


In [11]:
## Best인 경우
svr_reg = SVR(kernel='rbf', C=1000, gamma=0.01)
svr_reg.fit(x_train, y_train)

# Validation (학습데이터의 1/10)
y_pred1 = svr_reg.predict(x_val)
print("Validation MAE :", mean_absolute_error(y_val, y_pred1))

# External Validation (검증데이터)
y_pred2 = svr_reg.predict(df_x_ts)
print("External Validation MAE :", mean_absolute_error(df_y_ts, y_pred2))

Validation MAE : 0.9814829164986304
External Validation MAE : 0.369215432928282
