# 과제
#### 주어진 데이터는 여러 column 정보를 이용해 자전거의 수요(count)를 예측하는 데이터입니다.  지금까지의 세션에서 알아본 내용을 바탕으로 자유롭게 모델링을 해주세요

#### 필수 포함 내용
* 오늘 알아본 회귀 모델들 전부 사용(로지스틱 회귀 제외)하고 결과 비교
* 규제가 있는 모델의 alpha 파라미터 값 변화에 따른 평가 결과 비교 (단순 값 비교, 시각화 등 자유)
* train test 분할은 train_test_split(X_features, y_target, test_size=0.2, random_state=1004)
* 평가지표는 RMSE 사용

## ++
* 본 과제는 성능보다는 오늘 알아본 모델의 사용법을 알아보는데 의의를 두었습니다. 성능이 과제의 점수를 매기진 않지만 전체에서 최고 성능을 낸 분에게 제가 커피를 사 드립니다.

### 자전거 수요 예측 데이터
    * datetime: hourly date + timestamp
    * season: 1=봄, 2=여름, 3=가을, 4=겨울
    * holiday: 1=주말을 제외한 국경일 등의 휴일, 0=휴일이 아닌 날
    * workingday: 1=주말 및 휴일이 아닌 주중, 0=주말 및 휴일
    * weather:
    * 1=맑음, 약간 구름 낀 흐림
    * 2=안개, 안개 + 흐림
    * 3=가벼운 눈, 가벼운 비 + 천둥
    * 4=심한 눈/비, 천둥/번개
    * temp: 온도(섭씨)
    * atemp: 체감온도(섭씨)
    * humidity: 상대습도
    * windspeed: 풍속
    * casual: 사전에 등록되지 않은 사용자가 대여한 횟수
    * registered: 사전에 등록된 사용자가 대여한 횟수
    * count: 대여 횟수 (target)

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor

bike_df = pd.read_csv("./data/bike.csv")

In [2]:
bike_df.head(5)

Unnamed: 0,datetime,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,casual,registered,count
0,2011-01-01 00:00:00,1,0,0,1,9.84,14.395,81,0.0,3,13,16
1,2011-01-01 01:00:00,1,0,0,1,9.02,13.635,80,0.0,8,32,40
2,2011-01-01 02:00:00,1,0,0,1,9.02,13.635,80,0.0,5,27,32
3,2011-01-01 03:00:00,1,0,0,1,9.84,14.395,75,0.0,3,10,13
4,2011-01-01 04:00:00,1,0,0,1,9.84,14.395,75,0.0,0,1,1


In [3]:
bike_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10886 entries, 0 to 10885
Data columns (total 12 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   datetime    10886 non-null  object 
 1   season      10886 non-null  int64  
 2   holiday     10886 non-null  int64  
 3   workingday  10886 non-null  int64  
 4   weather     10886 non-null  int64  
 5   temp        10886 non-null  float64
 6   atemp       10886 non-null  float64
 7   humidity    10886 non-null  int64  
 8   windspeed   10886 non-null  float64
 9   casual      10886 non-null  int64  
 10  registered  10886 non-null  int64  
 11  count       10886 non-null  int64  
dtypes: float64(3), int64(8), object(1)
memory usage: 1020.7+ KB


In [4]:
#  문자형 datatime 칼럼을 datetime형으로 대체 후 연/월/일/시간으로 쪼개기
bike_df['datetime'] = bike_df.datetime.apply(pd.to_datetime)
bike_df['year'] = bike_df.datetime.apply(lambda x: x.year)
bike_df['month'] = bike_df.datetime.apply(lambda x : x.month)
bike_df['day'] = bike_df.datetime.apply(lambda x : x.day)
bike_df['hour'] = bike_df.datetime.apply(lambda x: x.hour)
bike_df.head(10)
bike_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10886 entries, 0 to 10885
Data columns (total 16 columns):
 #   Column      Non-Null Count  Dtype         
---  ------      --------------  -----         
 0   datetime    10886 non-null  datetime64[ns]
 1   season      10886 non-null  int64         
 2   holiday     10886 non-null  int64         
 3   workingday  10886 non-null  int64         
 4   weather     10886 non-null  int64         
 5   temp        10886 non-null  float64       
 6   atemp       10886 non-null  float64       
 7   humidity    10886 non-null  int64         
 8   windspeed   10886 non-null  float64       
 9   casual      10886 non-null  int64         
 10  registered  10886 non-null  int64         
 11  count       10886 non-null  int64         
 12  year        10886 non-null  int64         
 13  month       10886 non-null  int64         
 14  day         10886 non-null  int64         
 15  hour        10886 non-null  int64         
dtypes: datetime64[ns](1), 

In [5]:
drop_columns = ['datetime','casual','registered']
bike_df.drop(drop_columns, axis=1,inplace=True)
bike_df

Unnamed: 0,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,count,year,month,day,hour
0,1,0,0,1,9.84,14.395,81,0.0000,16,2011,1,1,0
1,1,0,0,1,9.02,13.635,80,0.0000,40,2011,1,1,1
2,1,0,0,1,9.02,13.635,80,0.0000,32,2011,1,1,2
3,1,0,0,1,9.84,14.395,75,0.0000,13,2011,1,1,3
4,1,0,0,1,9.84,14.395,75,0.0000,1,2011,1,1,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...
10881,4,0,1,1,15.58,19.695,50,26.0027,336,2012,12,19,19
10882,4,0,1,1,14.76,17.425,57,15.0013,241,2012,12,19,20
10883,4,0,1,1,13.94,15.910,61,15.0013,168,2012,12,19,21
10884,4,0,1,1,13.94,17.425,61,6.0032,129,2012,12,19,22


In [6]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression , Ridge , Lasso

y_target = bike_df['count']
X_features = bike_df.drop(['count'],axis=1,inplace=False)

X_train, X_test, y_train, y_test = train_test_split(X_features, y_target, test_size=0.2, random_state=1004)

lr_reg = LinearRegression()
lr_reg.fit(X_train, y_train)
pred = lr_reg.predict(X_test)

np.sqrt(mean_squared_error(y_test, pred))

142.9359009335816

In [7]:
# 타깃 칼럼인 count 값을 log1p로 로그 변환
y_target_log = np.log1p(y_target)

# 로그 변환된 y_target_log를 반영하여 학습/테스트 데이터 셋 분할
X_train, X_test, y_train, y_test = train_test_split(X_features, y_target_log, test_size=0.3, random_state=0)
lr_reg = LinearRegression()
lr_reg.fit(X_train, y_train)
pred = lr_reg.predict(X_test)

# 테스트 데이터 셋의 Target 값은 Log 변환되었으므로 다시 expm1를 이용하여 원래 scale로 변환
y_test_exp = np.expm1(y_test)

# 예측 값 역시 Log 변환된 타깃 기반으로 학습되어 예측되었으므로 다시 exmpl으로 scale변환
pred_exp = np.expm1(pred)

np.sqrt(mean_squared_error(y_test_exp ,pred_exp))

162.59426809004614

In [42]:
X_features_ohe = pd.get_dummies(X_features, columns=['year', 'month','day', 'hour', 'holiday',
                                              'workingday','season','weather'])
X_features_ohe.head(10)

Unnamed: 0,temp,atemp,humidity,windspeed,year_2011,year_2012,month_1,month_2,month_3,month_4,...,workingday_0,workingday_1,season_1,season_2,season_3,season_4,weather_1,weather_2,weather_3,weather_4
0,9.84,14.395,81,0.0,1,0,1,0,0,0,...,1,0,1,0,0,0,1,0,0,0
1,9.02,13.635,80,0.0,1,0,1,0,0,0,...,1,0,1,0,0,0,1,0,0,0
2,9.02,13.635,80,0.0,1,0,1,0,0,0,...,1,0,1,0,0,0,1,0,0,0
3,9.84,14.395,75,0.0,1,0,1,0,0,0,...,1,0,1,0,0,0,1,0,0,0
4,9.84,14.395,75,0.0,1,0,1,0,0,0,...,1,0,1,0,0,0,1,0,0,0
5,9.84,12.88,75,6.0032,1,0,1,0,0,0,...,1,0,1,0,0,0,0,1,0,0
6,9.02,13.635,80,0.0,1,0,1,0,0,0,...,1,0,1,0,0,0,1,0,0,0
7,8.2,12.88,86,0.0,1,0,1,0,0,0,...,1,0,1,0,0,0,1,0,0,0
8,9.84,14.395,75,0.0,1,0,1,0,0,0,...,1,0,1,0,0,0,1,0,0,0
9,13.12,17.425,76,0.0,1,0,1,0,0,0,...,1,0,1,0,0,0,1,0,0,0


In [43]:
def get_rmse(model):
# 모델이 들어왔을때
    pred = model.predict(X_test)
# X_test로 모델을 예측하고
    mse = mean_squared_error(y_test , pred)
    rmse = np.sqrt(mse)
    print('{0} RMSE: {1}'.format(model ,np.round(rmse, 3)))

def get_rmses(models):
    rmses = [ ]
    for model in models:
        rmse = get_rmse(model)
        rmses.append(rmse)

In [63]:
X_train, X_test, y_train, y_test = train_test_split(X_features_ohe, y_target, test_size=0.2, random_state=1004)

lr_reg = LinearRegression()
lr_reg.fit(X_train, y_train)

ridge_reg_a0 = Ridge(alpha=0)
ridge_reg_a0.fit(X_train, y_train)

ridge_reg_a10 = Ridge(alpha=10)
ridge_reg_a10.fit(X_train, y_train)

ridge_reg_a100 = Ridge(alpha=100)
ridge_reg_a100.fit(X_train, y_train)

lasso_reg_a0 = Lasso(alpha=0)
lasso_reg_a0.fit(X_train, y_train)

lasso_reg_a01 = Lasso(alpha=0.06)
lasso_reg_a01.fit(X_train, y_train)

lasso_reg_a1 = Lasso(alpha=1)
lasso_reg_a1.fit(X_train, y_train)

ElasticNet_a0 = ElasticNet(alpha=0)
ElasticNet_a0.fit(X_train, y_train)

ElasticNet_a01 = ElasticNet(alpha=0.1)
ElasticNet_a01.fit(X_train, y_train)

ElasticNet_a1 = ElasticNet(alpha=1)
ElasticNet_a1.fit(X_train, y_train)

models = [lr_reg, ridge_reg_a0, ridge_reg_a10, ridge_reg_a100, 
          lasso_reg_a0, lasso_reg_a01, lasso_reg_a1, 
          ElasticNet_a0, ElasticNet_a01, ElasticNet_a1]
get_rmses(models)

  lasso_reg_a0.fit(X_train, y_train)
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  ElasticNet_a0.fit(X_train, y_train)
  model = cd_fast.enet_coordinate_descent(


LinearRegression() RMSE: 101.814
Ridge(alpha=0) RMSE: 101.875
Ridge(alpha=10) RMSE: 101.866
Ridge(alpha=100) RMSE: 105.623
Lasso(alpha=0) RMSE: 101.814
Lasso(alpha=0.06) RMSE: 101.793
Lasso(alpha=1) RMSE: 105.49
ElasticNet(alpha=0) RMSE: 101.814
ElasticNet(alpha=0.1) RMSE: 121.28
ElasticNet(alpha=1) RMSE: 149.706


  model = cd_fast.enet_coordinate_descent(


In [64]:
#원-핫 인코딩이 적용된 피처 데이터 세트 기반으로 학습/예측 데이터 분할 
X_train, X_test, y_train, y_test = train_test_split(X_features_ohe, y_target_log, test_size = 0.2, random_state = 1004)

#모델과 학습/ 테스트 데이터 세트를 입력하면 성능 평가 수치를 반환 
def get_model_predict(model, X_train, X_test, y_train, y_test, is_expm1=False):
    model.fit(X_train, y_train)
    pred = model.predict(X_test)
    if is_expm1 :
        y_test = np.expm1(y_test)
        pred = np.expm1(pred)
    print("###", model, '###')
    print(np.sqrt(mean_squared_error(y_test, pred)))
#end of function get_model_predict

# 모델별로 평가 수행 
lr_reg = LinearRegression()
ridge_reg_a0 = Ridge(alpha=0)
ridge_reg_a10 = Ridge(alpha=10)
ridge_reg_a100 = Ridge(alpha=100)
lasso_reg_a0 = Lasso(alpha=0)
lasso_reg_a01 = Lasso(alpha=0.06)
lasso_reg_a1 = Lasso(alpha=1)
ElasticNet_a0 = ElasticNet(alpha=0)
ElasticNet_a01 = ElasticNet(alpha=0.1)
ElasticNet_a1 = ElasticNet(alpha=1)

models = [lr_reg, ridge_reg_a0, ridge_reg_a10, ridge_reg_a100, 
          lasso_reg_a0, lasso_reg_a01, lasso_reg_a1, 
          ElasticNet_a0, ElasticNet_a01, ElasticNet_a1]

for model in models:
    get_model_predict(model, X_train, X_test, y_train, y_test, is_expm1=True)

### LinearRegression() ###
95.4821491403464
### Ridge(alpha=0) ###
95.6137241914359
### Ridge(alpha=10) ###
96.6200751892072
### Ridge(alpha=100) ###
111.24919097353386


  model.fit(X_train, y_train)
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model.fit(X_train, y_train)
  model = cd_fast.enet_coordinate_descent(


### Lasso(alpha=0) ###
95.4821496069303
### Lasso(alpha=0.06) ###
168.1908540681255
### Lasso(alpha=1) ###
179.47634411960686
### ElasticNet(alpha=0) ###
95.4821496069303
### ElasticNet(alpha=0.1) ###
169.23459562445197
### ElasticNet(alpha=1) ###
177.98450864780455


  model = cd_fast.enet_coordinate_descent(


In [72]:
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor

rf_reg = RandomForestRegressor(n_estimators=500)
gbm_reg = GradientBoostingRegressor(n_estimators=500)

for model in [rf_reg, gbm_reg]:
    get_model_predict(model,X_train.values, X_test.values, y_train.values, y_test.values,is_expm1=True)

### RandomForestRegressor(n_estimators=500) ###
47.38599850829775
### GradientBoostingRegressor(n_estimators=500) ###
51.04625223681326
