In [37]:
# 필요한 라이브러리 설치 및 임포트

import pandas as pd

from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import LinearRegression, Lasso, Ridge, ElasticNet
from sklearn.tree import DecisionTreeRegressor as DTR
from sklearn.ensemble import RandomForestRegressor as RFR
from sklearn.ensemble import GradientBoostingRegressor as GBR
from sklearn.metrics import mean_squared_error as mse
from sklearn.metrics import r2_score
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score as acc
from sklearn.metrics import recall_score as recall
import joblib

# [Step 3] 머신러닝 모델링1

#### **<span style="color:blue">[3-1] 학습 및 테스트 데이터 로딩</span>**

In [38]:
# train_x.csv / train_y.csv / test_x.csv / test_y.csv 4개의 데이터 로딩
x_train = pd.read_csv('./train_x.csv')
y_train = pd.read_csv('./train_y.csv')
x_test = pd.read_csv('./test_x.csv')
y_test = pd.read_csv('./test_y.csv')
x_train.tail(2)

Unnamed: 0,time,SO2,CO,O3,NO2,PM10,PM25,기온(°C),강수량(mm),풍속(m/s),풍향(16방위),습도(%),증기압(hPa),현지기압(hPa),지면온도(°C),month,day,hour,PM10_lag1
8730,2021-09-30 21:00:00,0.003,0.6,0.012,0.036,30.0,19.0,20.9,0.0,0.4,0.0,83,20.4,999.2,19.2,9,30,21,15.0
8731,2021-09-30 22:00:00,0.003,0.6,0.004,0.042,33.0,19.0,20.4,0.0,0.8,70.0,81,19.4,999.2,18.6,9,30,22,14.0


In [39]:
x_train = x_train.drop('time',axis=1)
y_train = y_train.drop('time',axis=1)
x_test = x_test.drop('time',axis=1)
y_test = y_test.drop('time',axis=1)

In [40]:
x_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8732 entries, 0 to 8731
Data columns (total 18 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   SO2        8732 non-null   float64
 1   CO         8732 non-null   float64
 2   O3         8732 non-null   float64
 3   NO2        8732 non-null   float64
 4   PM10       8732 non-null   float64
 5   PM25       8732 non-null   float64
 6   기온(°C)     8732 non-null   float64
 7   강수량(mm)    8732 non-null   float64
 8   풍속(m/s)    8732 non-null   float64
 9   풍향(16방위)   8732 non-null   float64
 10  습도(%)      8732 non-null   int64  
 11  증기압(hPa)   8732 non-null   float64
 12  현지기압(hPa)  8732 non-null   float64
 13  지면온도(°C)   8732 non-null   float64
 14  month      8732 non-null   int64  
 15  day        8732 non-null   int64  
 16  hour       8732 non-null   int64  
 17  PM10_lag1  8732 non-null   float64
dtypes: float64(14), int64(4)
memory usage: 1.2 MB


In [41]:
# 풍향은 범주형인데 상관없나?
import seaborn as sns
import matplotlib.pyplot as plt

In [42]:
#sns.kdeplot(x_train['풍향(16방위)'],hue=y_train)


---

In [43]:
x_test

Unnamed: 0,SO2,CO,O3,NO2,PM10,PM25,기온(°C),강수량(mm),풍속(m/s),풍향(16방위),습도(%),증기압(hPa),현지기압(hPa),지면온도(°C),month,day,hour,PM10_lag1
0,0.004,0.8,0.002,0.052,38.0,24.0,-2.8,0.0,2.3,50,55,2.7,1016.3,-3.3,1,2,0,23.0
1,0.004,0.8,0.002,0.052,34.0,23.0,-2.9,0.0,2.1,50,57,2.8,1015.8,-3.4,1,2,1,20.0
2,0.004,0.8,0.002,0.052,35.0,26.0,-2.6,0.0,1.9,50,57,2.9,1015.3,-2.5,1,2,2,20.0
3,0.004,0.6,0.002,0.046,33.0,24.0,-2.1,0.0,2.5,50,56,2.9,1015.1,-2.1,1,2,3,19.0
4,0.003,0.5,0.005,0.039,33.0,25.0,-1.9,0.0,2.0,50,55,2.9,1014.3,-2.1,1,2,4,24.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2130,0.002,0.4,0.044,0.010,10.0,9.0,13.2,0.0,4.3,340,45,6.8,1013.1,12.4,3,31,18,29.0
2131,0.002,0.4,0.036,0.017,11.0,8.0,12.3,0.0,2.9,340,47,6.7,1013.4,10.0,3,31,19,34.0
2132,0.002,0.4,0.032,0.018,10.0,7.0,11.6,0.0,2.7,340,48,6.5,1014.6,8.9,3,31,20,49.0
2133,0.003,0.3,0.038,0.013,11.0,5.0,10.5,0.0,3.5,320,51,6.4,1015.4,7.8,3,31,21,51.0


In [44]:
x_train

Unnamed: 0,SO2,CO,O3,NO2,PM10,PM25,기온(°C),강수량(mm),풍속(m/s),풍향(16방위),습도(%),증기압(hPa),현지기압(hPa),지면온도(°C),month,day,hour,PM10_lag1
0,0.002,0.3,0.046,0.012,10.0,7.0,17.0,0.6,2.9,50.0,90,17.4,999.6,17.8,10,2,0,31.0
1,0.003,0.3,0.041,0.014,9.0,6.0,16.6,0.0,1.8,50.0,90,16.9,999.8,17.6,10,2,1,27.0
2,0.003,0.3,0.039,0.015,9.0,8.0,16.8,0.0,0.9,20.0,91,17.4,1000.0,17.9,10,2,2,28.0
3,0.003,0.4,0.029,0.016,10.0,7.0,16.6,0.0,0.9,50.0,93,17.5,1000.5,17.7,10,2,3,26.0
4,0.002,0.4,0.029,0.016,11.0,9.0,16.7,0.0,1.3,20.0,93,17.6,1001.0,17.4,10,2,4,26.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8727,0.003,0.5,0.038,0.024,36.0,24.0,24.2,0.0,2.3,290.0,70,21.0,998.3,22.5,9,30,18,11.0
8728,0.003,0.7,0.020,0.036,35.0,24.0,22.7,0.0,0.2,0.0,71,19.5,998.4,20.6,9,30,19,15.0
8729,0.003,0.6,0.016,0.035,34.0,21.0,21.7,0.0,0.9,320.0,79,20.4,998.8,19.9,9,30,20,18.0
8730,0.003,0.6,0.012,0.036,30.0,19.0,20.9,0.0,0.4,0.0,83,20.4,999.2,19.2,9,30,21,15.0


#### **<span style="color:blue">[3-2] 모델링: LinearRegression</span>**

* Train과 Test로 나눈 데이터를 기준으로 LinearRegression 모델링을 진행하고 평가를 해주세요.
* 그리고 모델 파일을 pkl로 저장해주세요.
* 성능지표 : MSE, R-squared Score(소수점 5째자리까지)

In [45]:
# LinearRegression 코드를 작성하세요.
model = LinearRegression()
model.fit(x_train,y_train)

LinearRegression()

In [46]:
# predict로 예측하여 y_pred_LR에 저장해보세요.
y_pred_LR = model.predict(x_test)

In [47]:
# test_y 데이터와 y_pred_LR 데이터로 성능을 평가하여 출력해보세요.
# 성능지표는 mse와 r2를 이용하세요.
print('mse:',mse(y_test,y_pred_LR))
print('r2 :',r2_score(y_test,y_pred_LR))


mse: 37.43321087025133
r2 : 0.932947722502021


In [48]:
# 학습한 모델을 pkl 파일로 저장해보세요.
joblib.dump(model, './LR_model.pkl')

['./LR_model.pkl']

# [Step 4] 머신러닝 모델링2

#### **<span style="color:blue">(선택 수행)[4-1] 모델링: 랜덤포레스트</span>**

* 랜덤포레스트: 배깅의 일종으로 의사결정나무(Decision Tree) 여러 개를 모아서 숲을 랜덤으로 구성하고 이를 종합해서 최종 모델을 산출하는 기법
* Train과 Test로 나눈 데이터를 기준으로 렌덤포레스트로 모델을 학습을 진행하고 평가를 해주세요.
* 그리고 모델 파일을 pkl로 저장해주세요.
* 성능지표 : MSE, R-squared Score(소수점 5째자리까지)

In [49]:
# 아래에 필요한 코드를 작성하고 결과를 확인합니다.
model = RFR(max_depth=5, random_state=1)
model.fit(x_train,y_train)

  This is separate from the ipykernel package so we can avoid doing imports until


RandomForestRegressor(max_depth=5, random_state=1)

In [50]:
# Test 데이터로 성능을 평가해보세요.
y_pred_RFR = model.predict(x_test)
print('mse:',mse(y_pred_RFR,y_test))
print('r2 :',r2_score(y_pred_RFR,y_test))

mse: 40.48350989072956
r2 : 0.9223579574536955


In [51]:
# 학습한 모델을 파일로 저장해보세요.
joblib.dump(model, './RFR_model.pkl')

['./RFR_model.pkl']

In [52]:
# (다음 진행은 선택입니다)
# 렌덤포레스트로 학습한 모델의 feature_importances를 구해보세요.
# 확인할 수 있는 내용으로 우리 모델에서의 인사이트를 정리해보세요.
# Feature의 중요도 확인
import numpy as np
print(x_train.columns)
print(np.round(model.feature_importances_,3))



Index(['SO2', 'CO', 'O3', 'NO2', 'PM10', 'PM25', '기온(°C)', '강수량(mm)',
       '풍속(m/s)', '풍향(16방위)', '습도(%)', '증기압(hPa)', '현지기압(hPa)', '지면온도(°C)',
       'month', 'day', 'hour', 'PM10_lag1'],
      dtype='object')
[0.    0.001 0.001 0.    0.962 0.02  0.    0.    0.    0.    0.003 0.001
 0.004 0.    0.    0.001 0.002 0.001]


In [53]:
import matplotlib.pyplot as plt
import seaborn as sns
# sns.scatterplot(x_test['PM10'])
# sns.kdeplot(y_test) 
# plt.show()

In [54]:
# (선택) 확인할 수 있는 인사이트
# 1. 현재시간의 미세먼지 농도는 다음시간의 미세먼지 농도에 영향을 미친다
# 2.
# 3.

In [55]:
model = RFR(max_depth=5, random_state=1)
model.fit(x_train.drop('PM10',axis=1),y_train)
e = model.predict(x_test.drop('PM10',axis=1))
print('mse:',mse(e,y_test))
print('r2 :',r2_score(e,y_test))
print(x_train.drop('PM10',axis=1).columns)
print(np.round(model.feature_importances_,3))

  


mse: 283.057833719622
r2 : 0.5043466026941981
Index(['SO2', 'CO', 'O3', 'NO2', 'PM25', '기온(°C)', '강수량(mm)', '풍속(m/s)',
       '풍향(16방위)', '습도(%)', '증기압(hPa)', '현지기압(hPa)', '지면온도(°C)', 'month',
       'day', 'hour', 'PM10_lag1'],
      dtype='object')
[0.    0.061 0.002 0.001 0.575 0.001 0.    0.002 0.001 0.004 0.011 0.272
 0.002 0.019 0.008 0.002 0.039]


In [56]:
model = RFR(max_depth=5, random_state=1)
model.fit(x_train.drop(['PM10','PM25'],axis=1),y_train)
f = model.predict(x_test.drop(['PM10','PM25'],axis=1))
print('mse:',mse(f,y_test))
print('r2 :',r2_score(f,y_test))
print(x_train.drop(['PM10','PM25'],axis=1).columns)
print(np.round(model.feature_importances_,3))

  


mse: 461.19730077399583
r2 : -0.5037124322398856
Index(['SO2', 'CO', 'O3', 'NO2', '기온(°C)', '강수량(mm)', '풍속(m/s)', '풍향(16방위)',
       '습도(%)', '증기압(hPa)', '현지기압(hPa)', '지면온도(°C)', 'month', 'day', 'hour',
       'PM10_lag1'],
      dtype='object')
[0.003 0.133 0.098 0.035 0.009 0.    0.029 0.012 0.015 0.053 0.141 0.01
 0.023 0.249 0.002 0.188]


In [57]:
model = RFR(max_depth=5, random_state=1)
model.fit(x_train.drop(['PM10','증기압(hPa)'],axis=1),y_train)
g = model.predict(x_test.drop(['PM10','증기압(hPa)'],axis=1))
print('mse:',mse(g,y_test))
print('r2 :',r2_score(g,y_test))
print(x_train.drop(['PM10','증기압(hPa)'],axis=1).columns)
print(np.round(model.feature_importances_,3))

  


mse: 285.98444733223016
r2 : 0.5146627777715659
Index(['SO2', 'CO', 'O3', 'NO2', 'PM25', '기온(°C)', '강수량(mm)', '풍속(m/s)',
       '풍향(16방위)', '습도(%)', '현지기압(hPa)', '지면온도(°C)', 'month', 'day', 'hour',
       'PM10_lag1'],
      dtype='object')
[0.    0.059 0.003 0.002 0.576 0.003 0.    0.002 0.002 0.005 0.271 0.002
 0.024 0.009 0.002 0.039]


In [58]:
model = RFR(max_depth=5, random_state=1)
model.fit(x_train.drop(['PM10','CO'],axis=1),y_train)
h = model.predict(x_test.drop(['PM10','CO'],axis=1))
print('mse:',mse(h,y_test))
print('r2 :',r2_score(h,y_test))
print(x_train.drop(['PM10','CO'],axis=1).columns)
print(np.round(model.feature_importances_,3))

  


mse: 270.95453625070303
r2 : 0.5263622220974027
Index(['SO2', 'O3', 'NO2', 'PM25', '기온(°C)', '강수량(mm)', '풍속(m/s)', '풍향(16방위)',
       '습도(%)', '증기압(hPa)', '현지기압(hPa)', '지면온도(°C)', 'month', 'day', 'hour',
       'PM10_lag1'],
      dtype='object')
[0.    0.001 0.001 0.605 0.004 0.    0.002 0.002 0.005 0.017 0.295 0.001
 0.023 0.009 0.002 0.032]


In [59]:


model = RFR(max_depth=5, random_state=1)
model.fit(x_train.drop(['PM10','PM10_lag1'],axis=1),y_train)
i = model.predict(x_test.drop(['PM10','PM10_lag1'],axis=1))
print('mse:',mse(i,y_test))
print('r2 :',r2_score(i,y_test))
print(x_train.drop(['PM10','PM10_lag1'],axis=1).columns)
print(np.round(model.feature_importances_,3))

  


mse: 278.8400216054299
r2 : 0.526817974437586
Index(['SO2', 'CO', 'O3', 'NO2', 'PM25', '기온(°C)', '강수량(mm)', '풍속(m/s)',
       '풍향(16방위)', '습도(%)', '증기압(hPa)', '현지기압(hPa)', '지면온도(°C)', 'month',
       'day', 'hour'],
      dtype='object')
[0.    0.062 0.003 0.001 0.578 0.002 0.    0.003 0.002 0.005 0.011 0.283
 0.003 0.022 0.021 0.003]


In [76]:
model = RFR(max_depth=5, random_state=1)
model.fit(x_train.drop(['CO','PM10_lag1'],axis=1),y_train)
r = model.predict(x_test.drop(['CO','PM10_lag1'],axis=1))
print('mse:',mse(r,y_test))
print('r2 :',r2_score(r,y_test))
print(x_train.drop(['CO','PM10_lag1'],axis=1).columns)
print(np.round(model.feature_importances_,3))

  


mse: 40.40165713163392
r2 : 0.9224528224344097
Index(['SO2', 'O3', 'NO2', 'PM10', 'PM25', '기온(°C)', '강수량(mm)', '풍속(m/s)',
       '풍향(16방위)', '습도(%)', '증기압(hPa)', '현지기압(hPa)', '지면온도(°C)', 'month',
       'day', 'hour'],
      dtype='object')
[0.    0.001 0.    0.966 0.017 0.    0.    0.    0.    0.004 0.001 0.005
 0.    0.    0.002 0.002]


In [60]:
y_test

Unnamed: 0,PM10_1
0,34.0
1,35.0
2,33.0
3,33.0
4,32.0
...,...
2130,11.0
2131,10.0
2132,11.0
2133,11.0


In [61]:
e

array([41.86324802, 41.66394223, 50.31065835, ..., 13.40469409,
        9.29121611,  9.37752618])

#### **<span style="color:blue">(선택 수행)[4-2] 모델링: GradientBoosting</span>**

* GradientBoosting: 앞선 모델의 에러를 다음 모델의 예측 값으로 활용하면서 가중치 업데이트 하는데 경사하강법(Gradient Descent)를 활용해서 최적 모델을 만드는 기법
* Train과 Test로 나눈 데이터를 기준으로 그라디언트부스팅으로 모델을 학습을 진행하고 평가를 해주세요.
* 그리고 모델 파일을 pkl로 저장해주세요.
* 성능지표 : RMSE, R-squared Score(소수점 5째자리까지)

In [62]:
# 아래에 필요한 코드를 작성하고 결과를 확인합니다.
model = GBR(max_depth=5,random_state=1)
model.fit(x_train,y_train)

  y = column_or_1d(y, warn=True)


GradientBoostingRegressor(max_depth=5, random_state=1)

In [63]:
# Test 데이터로 성능을 평가해보세요.
y_pred_GBR = model.predict(x_test)
print('rmse: ',mse(y_test,y_pred_GBR)**0.5)
print('r2 : ',r2_score(y_test,y_pred_GBR))

rmse:  6.281779295244021
r2 :  0.9293158889533608


In [64]:
# 학습한 모델을 파일로 저장해보세요.
joblib.dump(model, './GBR_model.pkl')

['./GBR_model.pkl']

In [65]:
# (다음 진행은 선택입니다)
# 그라디언트부스팅으로 학습한 모델의 feature_importances를 구해보세요.
# 확인할 수 있는 내용으로 우리 모델에서의 인사이트를 정리해보세요.
# Feature의 중요도 확인
print(x_train.columns)
np.around(model.feature_importances_,5)


Index(['SO2', 'CO', 'O3', 'NO2', 'PM10', 'PM25', '기온(°C)', '강수량(mm)',
       '풍속(m/s)', '풍향(16방위)', '습도(%)', '증기압(hPa)', '현지기압(hPa)', '지면온도(°C)',
       'month', 'day', 'hour', 'PM10_lag1'],
      dtype='object')


array([5.3000e-04, 2.7900e-03, 1.8200e-03, 8.0000e-04, 9.6987e-01,
       5.8400e-03, 1.7400e-03, 5.1000e-04, 9.3000e-04, 7.7000e-04,
       3.0400e-03, 1.1600e-03, 3.7200e-03, 1.6000e-03, 1.9000e-04,
       2.0200e-03, 9.7000e-04, 1.7100e-03])

In [66]:
# 확인할 수 있는 인사이트
# 1.PM10의 가중치가 높음
# 2.PM10 >>>>>>>>>>>>>>>>>>> PM2.5 > CO
# 3.

#### **<span style="color:blue">(선택 수행)[4-3] 모델링: Self Choice Model</span>**

* Self Choice Model: 앞선 교육과정에서 배운 머신러닝 모델 중 어떤 것이든 좋습니다. 원하는 모델을 선택해서 학습을 시켜보세요.
* Train과 Test로 나눈 데이터를 기준으로 Self Choice Model로 모델을 학습을 진행하고 평가를 해주세요.
* 그리고 모델 파일을 pkl로 저장해주세요.
* 성능지표 : RMSE, R-squared Score(소수점 5째자리까지)

In [67]:
# 아래에 필요한 코드를 작성하고 결과를 확인합니다.
from sklearn.svm import SVR
model = SVR(kernel='rbf',C=0.01,gamma=0.5)
model.fit(x_train,y_train)

  y = column_or_1d(y, warn=True)


SVR(C=0.01, gamma=0.5)

In [68]:
y_pred_svm = model.predict(x_test)

In [69]:
# Test 데이터로 성능을 평가해보세요.
print('rmse:',mse(y_test,y_pred_svm)**0.5)
print('r2:',r2_score(y_test,y_pred_svm))

rmse: 26.08935486514733
r2: -0.21922296131352903


In [70]:
y_pred_svm

array([29.89083679, 29.89083679, 29.89083679, ..., 29.89083679,
       29.89083679, 29.89083679])

In [71]:
# 학습한 모델을 파일로 저장해보세요.




In [72]:
# (다음 진행은 선택입니다)
# 그라디언트부스팅으로 학습한 모델의 feature_importances를 구해보세요.
# 확인할 수 있는 내용으로 우리 모델에서의 인사이트를 정리해보세요.
# Feature의 중요도 확인



In [73]:
# Feature의 중요도 확인





In [74]:
# 확인할 수 있는 인사이트
# 1.
# 2.
# 3.

In [86]:
from sklearn.neighbors import KNeighborsRegressor
model = KNeighborsRegressor(n_neighbors=100)
model.fit(x_train,y_train)
y_pred_knn = model.predict(x_test)
print('rmse:',mse(y_test,y_pred_knn)**0.5)
print('r2:',r2_score(y_test,y_pred_knn))

rmse: 7.971402763256385
r2: 0.8861780665179587


In [96]:
drop_cols = ['SO2'] 
model = GBR(max_depth=5,random_state=3)
model.fit(x_train.drop(drop_cols,axis=1),y_train)
y_pred_GBR = model.predict(x_test.drop(drop_cols,axis=1))
print('rmse: ',mse(y_test,y_pred_GBR)**0.5)
print('r2 : ',r2_score(y_test,y_pred_GBR))
joblib.dump(model, './self_model.pkl')
print(x_train.drop(drop_cols,axis=1).columns)
print(np.around(model.feature_importances_,6))

# 기존
# rmse:  6.281779295244021
# r2 :  0.9293158889533608
# 전체드랍
# rmse:  6.410372200428976
# r2 :  0.926392351188135
# drop so2
# rmse:  6.251752339460786
# r2 :  0.9299900151728114


rmse:  6.2451105615304385
r2 :  0.9301386914948764
Index(['CO', 'O3', 'NO2', 'PM10', 'PM25', '기온(°C)', '강수량(mm)', '풍속(m/s)',
       '풍향(16방위)', '습도(%)', '증기압(hPa)', '현지기압(hPa)', '지면온도(°C)', 'month',
       'day', 'hour', 'PM10_lag1'],
      dtype='object')
[2.20000e-03 1.95100e-03 8.17000e-04 9.69742e-01 6.22600e-03 1.07100e-03
 5.38000e-04 7.21000e-04 1.85700e-03 3.84600e-03 9.77000e-04 3.71700e-03
 1.66000e-03 9.30000e-05 2.06700e-03 9.03000e-04 1.61500e-03]


#### **<span style="color:blue">(선택 수행)[4-4] 머신러닝 모델에 대해 성능 최적화 진행</span>**

* 위 머신러닝 모델들에 대해 성능 최적화를 진행해보세요.

In [79]:
x_train.columns

Index(['SO2', 'CO', 'O3', 'NO2', 'PM10', 'PM25', '기온(°C)', '강수량(mm)',
       '풍속(m/s)', '풍향(16방위)', '습도(%)', '증기압(hPa)', '현지기압(hPa)', '지면온도(°C)',
       'month', 'day', 'hour', 'PM10_lag1'],
      dtype='object')

In [91]:
import warnings
warnings.filterwarnings(action='ignore') 
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
params = {'max_depth':range(3,15),
          'min_samples_leaf':range(1,10),
          'min_samples_split':range(1,10),
          }

drop_cols = ['SO2'] 
gbr_model = GBR(random_state=1)
model = RandomizedSearchCV(
    gbr_model,
    params,
    cv=3
)
model.fit(x_train.drop(drop_cols,axis=1),y_train)
y_pred_GBR = model.predict(x_test.drop(drop_cols,axis=1))
print('rmse: ',mse(y_test,y_pred_GBR)**0.5)
print('r2 : ',r2_score(y_test,y_pred_GBR))
joblib.dump(model, './self_model.pkl')
print(x_train.drop(drop_cols,axis=1).columns)
print(np.around(model.feature_importances_,6))

# 기존
# rmse:  6.281779295244021
# r2 :  0.9293158889533608
# 전체드랍
# rmse:  6.410372200428976
# r2 :  0.926392351188135
# drop so2
# rmse:  6.251752339460786
# r2 :  0.9299900151728114


rmse:  6.797688122972817
r2 :  0.9172288625041064
Index(['CO', 'O3', 'NO2', 'PM10', 'PM25', '기온(°C)', '강수량(mm)', '풍속(m/s)',
       '풍향(16방위)', '습도(%)', '증기압(hPa)', '현지기압(hPa)', '지면온도(°C)', 'month',
       'day', 'hour', 'PM10_lag1'],
      dtype='object')


AttributeError: 'RandomizedSearchCV' object has no attribute 'feature_importances_'

In [92]:
model.cv_results_

{'mean_fit_time': array([2.75107066e+00, 3.05785004e+00, 3.98945808e-03, 3.06148632e+00,
        1.00465902e+00, 1.32978034e+00, 1.01794608e+00, 3.01228340e+00,
        4.15031544e+00, 3.32427025e-03]),
 'std_fit_time': array([0.07775491, 0.05691946, 0.0008142 , 0.03386213, 0.00124096,
        0.0038497 , 0.01849865, 0.01969529, 0.05349332, 0.00047002]),
 'mean_score_time': array([0.01063903, 0.01096805, 0.        , 0.01195693, 0.00432046,
        0.00498811, 0.00465838, 0.01296639, 0.01760666, 0.        ]),
 'std_score_time': array([1.24364545e-03, 8.13810597e-04, 0.00000000e+00, 8.02831108e-04,
        4.71149592e-04, 6.00743332e-06, 4.73050513e-04, 1.41360587e-03,
        9.46913844e-04, 0.00000000e+00]),
 'param_min_samples_split': masked_array(data=[5, 5, 1, 2, 8, 7, 7, 5, 7, 1],
              mask=[False, False, False, False, False, False, False, False,
                    False, False],
        fill_value='?',
             dtype=object),
 'param_min_samples_leaf': masked_array(d

In [None]:
# cv_result = {'mean_fit_time': array([2.75107066e+00, 3.05785004e+00, 3.98945808e-03, 3.06148632e+00,
#         1.00465902e+00, 1.32978034e+00, 1.01794608e+00, 3.01228340e+00,
#         4.15031544e+00, 3.32427025e-03]),
#  'std_fit_time': array([0.07775491, 0.05691946, 0.0008142 , 0.03386213, 0.00124096,
#         0.0038497 , 0.01849865, 0.01969529, 0.05349332, 0.00047002]),
#  'mean_score_time': array([0.01063903, 0.01096805, 0.        , 0.01195693, 0.00432046,
#         0.00498811, 0.00465838, 0.01296639, 0.01760666, 0.        ]),
#  'std_score_time': array([1.24364545e-03, 8.13810597e-04, 0.00000000e+00, 8.02831108e-04,
#         4.71149592e-04, 6.00743332e-06, 4.73050513e-04, 1.41360587e-03,
#         9.46913844e-04, 0.00000000e+00]),
#  'param_min_samples_split': masked_array(data=[5, 5, 1, 2, 8, 7, 7, 5, 7, 1],
#               mask=[False, False, False, False, False, False, False, False,
#                     False, False],
#         fill_value='?',
#              dtype=object),
#  'param_min_samples_leaf': masked_array(data=[4, 6, 2, 4, 8, 7, 3, 1, 6, 5],
#               mask=[False, False, False, False, False, False, False, False,
#                     False, False],
#         fill_value='?',
#              dtype=object),
#  'param_max_depth': masked_array(data=[8, 9, 13, 9, 3, 4, 3, 9, 13, 4],
#               mask=[False, False, False, False, False, False, False, False,
#                     False, False],
#         fill_value='?',
#              dtype=object),
#  'params': [{'min_samples_split': 5, 'min_samples_leaf': 4, 'max_depth': 8},
#   {'min_samples_split': 5, 'min_samples_leaf': 6, 'max_depth': 9},
#   {'min_samples_split': 1, 'min_samples_leaf': 2, 'max_depth': 13},
#   {'min_samples_split': 2, 'min_samples_leaf': 4, 'max_depth': 9},
#   {'min_samples_split': 8, 'min_samples_leaf': 8, 'max_depth': 3},
#   {'min_samples_split': 7, 'min_samples_leaf': 7, 'max_depth': 4},
#   {'min_samples_split': 7, 'min_samples_leaf': 3, 'max_depth': 3},
#   {'min_samples_split': 5, 'min_samples_leaf': 1, 'max_depth': 9},
#   {'min_samples_split': 7, 'min_samples_leaf': 6, 'max_depth': 13},
#   {'min_samples_split': 1, 'min_samples_leaf': 5, 'max_depth': 4}],
#  'split0_test_score': array([0.92404611, 0.92281582,        nan, 0.92255512, 0.92956097,
#         0.94103371, 0.93543319, 0.91084647, 0.92028578,        nan]),
#  'split1_test_score': array([0.50243465, 0.48789379,        nan, 0.5151733 , 0.45775399,
#         0.46341084, 0.46727758, 0.53448192, 0.49320829,        nan]),
#  'split2_test_score': array([0.86316221, 0.87673163,        nan, 0.86439537, 0.89418752,
#         0.88650765, 0.88701024, 0.86451808, 0.87060016,        nan]),
#  'mean_test_score': array([0.76321432, 0.76248041,        nan, 0.76737459, 0.76050082,
#         0.76365073, 0.76324034, 0.76994882, 0.76136475,        nan]),
#  'std_test_score': array([0.18606673, 0.19507144,        nan, 0.17990694, 0.21456088,
#         0.21346548, 0.21020888, 0.16757103, 0.1906971 ,        nan]),
#  'rank_test_score': array([ 5,  6,  9,  2,  8,  3,  4,  1,  7, 10])}

In [93]:
model.best_params_

{'min_samples_split': 5, 'min_samples_leaf': 1, 'max_depth': 9}

In [97]:
model.best_score_

AttributeError: 'GradientBoostingRegressor' object has no attribute 'best_score_'