In [None]:
import os
import random
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')

from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

from xgboost import XGBRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR 
import math
import joblib 

# Seed both Python's and NumPy's global RNGs. pandas and scikit-learn calls
# that are not given an explicit random_state draw from NumPy's global state,
# so seeding only the stdlib `random` module does not make them reproducible.
random.seed(42)
np.random.seed(42)

In [None]:
# Load the final merged dataset (the CSV is stored in cp949 encoding).
merged_raw = pd.read_csv('data/total_df.csv', encoding='cp949')

# Derive a `month` column from the parsed timestamp column.
timestamps = pd.to_datetime(merged_raw['기준_날짜시간'], format="%Y-%m-%d %H:%M:%S")
merged_raw['month'] = timestamps.dt.month

# Keep only the columns used for training; '전체_건수' is the column the
# train/test split cell separates out as the target.
feature_columns = [
    'month', 'time', '요일',
    '전체_건수', '버스 수', '버스_하차승객수', '지하철하차승객수',
    '강수', '기온', '풍속', 'dem',
]
# One-hot encode month, time and day-of-week ('요일').
categorical_columns = ['month', 'time', '요일']
mergedata = pd.get_dummies(merged_raw[feature_columns], columns=categorical_columns)

In [None]:
# Show the one-hot encoded frame as the cell output to inspect its columns.
mergedata

In [None]:
# Subsample at most 10,000 rows to keep SVR training tractable.
# - min(...) avoids a ValueError when the frame has fewer than 10,000 rows.
# - random_state pins the draw so a fresh kernel reproduces the same subsample.
# - sort_index() restores a canonical row order, so re-running this cell on the
#   already-sampled frame yields the same frame (and the same split below).
mergedata = mergedata.sample(n=min(10000, len(mergedata)), random_state=42).sort_index()

# 80/20 train/test split; '전체_건수' is the regression target.
X_train, X_test, y_train, y_test = train_test_split(
    mergedata.drop('전체_건수', axis=1),
    mergedata['전체_건수'],
    test_size=0.2,
    random_state=42,
)

In [None]:
# SVR: tune kernel and C by cross-validation on a scale-then-fit pipeline.
pipeline = Pipeline([('Scaler', StandardScaler()), ('SVR', SVR())])

kernel_list = ['linear', 'rbf']
C_list = [1, 10, 20]
parameters = {'SVR__kernel': kernel_list, 'SVR__C': C_list}

# 5-fold cross-validation scored by (negated) RMSE; n_jobs=-1 runs the fits in
# parallel on all available cores.
svm_grid = GridSearchCV(pipeline, parameters, cv=5, scoring='neg_root_mean_squared_error', n_jobs=-1)
svm_grid.fit(X_train, y_train)

svr_best_C = svm_grid.best_params_['SVR__C']
svr_best_kernel = svm_grid.best_params_['SVR__kernel']

# Use the pipeline that GridSearchCV already refit on all of X_train with the
# best parameters (refit=True is the default). The hyper-parameters were
# selected on standardized features, so the final model must keep the scaler;
# fitting a bare SVR on the unscaled data would evaluate and save a different
# model from the one that was tuned.
reg = svm_grid.best_estimator_

y_pred = reg.predict(X_test)
RMSE = math.sqrt(mean_squared_error(y_test, y_pred))

print("SVR RMSE:{}".format(RMSE))
print("SVR best kernel:{}, best C:{}".format(svr_best_kernel, svr_best_C))
# Persist the whole pipeline (scaler + SVR) so loaded models apply the same
# preprocessing at prediction time.
joblib.dump(reg, f'SVR_{round(RMSE,2)}_k{svr_best_kernel}_C{svr_best_C}.pkl')