In [10]:
import numpy as np
import pandas as pd
import os

import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

# 관련 라이브러리 임포트 
import matplotlib.font_manager as fm

#  한글글꼴로 변경
# plt.rcParams['font.family'] = '한글글꼴명'
plt.rcParams['font.size'] = 11.0
# plt.rcParams['font.family'] = 'batang'
# plt.rcParams['font.family'] = 'Malgun Gothic'

# 그래프에서 마이너스 폰트 깨지는 문제에 대한 대처
matplotlib.rcParams['axes.unicode_minus'] = False

# 그래프 기본 크기 설정 
plt.rcParams['figure.figsize'] = [10, 6]

from sklearn.ensemble import VotingClassifier, VotingRegressor

from sklearn.linear_model import LinearRegression, Ridge, Lasso, LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier

from sklearn.metrics import f1_score, recall_score, precision_score, classification_report, r2_score

from sklearn.preprocessing import PolynomialFeatures, StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV

from sklearn.pipeline import Pipeline

# 퀴즈  : 회기문제 
- 자전거 대여 수요 예측 
- 앙상블 voting 방식으로 여러 모델을 그룹화 시켜  모델을 새로 정의하고 테스트하여라 

```
datetime - hourly date + timestamp (날짜와 시간)
season - 1 = spring, 2 = summer, 3 = fall, 4 = winter (계절)
holiday - whether the day is considered a holiday (휴일)
workingday - whether the day is neither a weekend nor holiday (평일)
weather - 
1: Clear, Few clouds, Partly cloudy, Partly cloudy (맑은 날씨)
2: Mist + Cloudy, Mist + Broken clouds, Mist + Few clouds, Mist (안개)
3: Light Snow, Light Rain + Thunderstorm + Scattered clouds, Light Rain + Scattered clouds (가벼운 눈, 비)
4: Heavy Rain + Ice Pallets + Thunderstorm + Mist, Snow + Fog (폭설, 폭우)
temp - temperature in Celsius (온도)
atemp - “feels like” temperature in Celsius (체감 온도)
humidity - relative humidity (상대 습도)
windspeed - wind speed (풍속)
casual - number of non-registered user rentals initiated (비회원 대여량)
registered - number of registered user rentals initiated (회원 대여량)
count - number of total rentals (총 대여량)
```

In [11]:
bike_df = pd.read_csv('data/bike_train.csv')
print(bike_df.shape)
bike_df.head(3)

(10886, 12)


Unnamed: 0,datetime,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,casual,registered,count
0,2011-01-01 00:00:00,1,0,0,1,9.84,14.395,81,0.0,3,13,16
1,2011-01-01 01:00:00,1,0,0,1,9.02,13.635,80,0.0,8,32,40
2,2011-01-01 02:00:00,1,0,0,1,9.02,13.635,80,0.0,5,27,32


In [12]:
# NULL 값은 없음 
bike_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10886 entries, 0 to 10885
Data columns (total 12 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   datetime    10886 non-null  object 
 1   season      10886 non-null  int64  
 2   holiday     10886 non-null  int64  
 3   workingday  10886 non-null  int64  
 4   weather     10886 non-null  int64  
 5   temp        10886 non-null  float64
 6   atemp       10886 non-null  float64
 7   humidity    10886 non-null  int64  
 8   windspeed   10886 non-null  float64
 9   casual      10886 non-null  int64  
 10  registered  10886 non-null  int64  
 11  count       10886 non-null  int64  
dtypes: float64(3), int64(8), object(1)
memory usage: 1020.7+ KB


In [13]:
# 문자열을 datetime 타입으로 변경. 
# 시계열 형식으로 변경하여 년월일시간 등으로 후에 사용
bike_df['datetime'] = bike_df.datetime.apply(pd.to_datetime)
bike_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10886 entries, 0 to 10885
Data columns (total 12 columns):
 #   Column      Non-Null Count  Dtype         
---  ------      --------------  -----         
 0   datetime    10886 non-null  datetime64[ns]
 1   season      10886 non-null  int64         
 2   holiday     10886 non-null  int64         
 3   workingday  10886 non-null  int64         
 4   weather     10886 non-null  int64         
 5   temp        10886 non-null  float64       
 6   atemp       10886 non-null  float64       
 7   humidity    10886 non-null  int64         
 8   windspeed   10886 non-null  float64       
 9   casual      10886 non-null  int64         
 10  registered  10886 non-null  int64         
 11  count       10886 non-null  int64         
dtypes: datetime64[ns](1), float64(3), int64(8)
memory usage: 1020.7 KB


In [14]:
# datetime 타입에서 년, 월, 일, 시간 추출
bike_df['year'] = bike_df.datetime.apply(lambda x : x.year)
bike_df['month'] = bike_df.datetime.apply(lambda x : x.month)
bike_df['day'] = bike_df.datetime.apply(lambda x : x.day)
bike_df['hour'] = bike_df.datetime.apply(lambda x: x.hour)
print(bike_df.info())
bike_df.head(3)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10886 entries, 0 to 10885
Data columns (total 16 columns):
 #   Column      Non-Null Count  Dtype         
---  ------      --------------  -----         
 0   datetime    10886 non-null  datetime64[ns]
 1   season      10886 non-null  int64         
 2   holiday     10886 non-null  int64         
 3   workingday  10886 non-null  int64         
 4   weather     10886 non-null  int64         
 5   temp        10886 non-null  float64       
 6   atemp       10886 non-null  float64       
 7   humidity    10886 non-null  int64         
 8   windspeed   10886 non-null  float64       
 9   casual      10886 non-null  int64         
 10  registered  10886 non-null  int64         
 11  count       10886 non-null  int64         
 12  year        10886 non-null  int64         
 13  month       10886 non-null  int64         
 14  day         10886 non-null  int64         
 15  hour        10886 non-null  int64         
dtypes: datetime64[ns](1), 

Unnamed: 0,datetime,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,casual,registered,count,year,month,day,hour
0,2011-01-01 00:00:00,1,0,0,1,9.84,14.395,81,0.0,3,13,16,2011,1,1,0
1,2011-01-01 01:00:00,1,0,0,1,9.02,13.635,80,0.0,8,32,40,2011,1,1,1
2,2011-01-01 02:00:00,1,0,0,1,9.02,13.635,80,0.0,5,27,32,2011,1,1,2


In [15]:
# 회원, 비회원 대여량은 삭제 
drop_columns = ['datetime','casual','registered']
bike_df.drop(drop_columns, axis=1,inplace=True)

In [16]:
# X y 설정
y_target = bike_df['count']
X_features = bike_df.drop(['count'],axis=1,inplace=False)

In [17]:
# 'year','month','hour','season','weather' feature들을 One Hot Encoding
X_features = pd.get_dummies(X_features, columns=['year','month','hour', 'holiday',
                                              'workingday','season','weather'])

In [18]:
X_features.columns

Index(['temp', 'atemp', 'humidity', 'windspeed', 'day', 'year_2011',
       'year_2012', 'month_1', 'month_2', 'month_3', 'month_4', 'month_5',
       'month_6', 'month_7', 'month_8', 'month_9', 'month_10', 'month_11',
       'month_12', 'hour_0', 'hour_1', 'hour_2', 'hour_3', 'hour_4', 'hour_5',
       'hour_6', 'hour_7', 'hour_8', 'hour_9', 'hour_10', 'hour_11', 'hour_12',
       'hour_13', 'hour_14', 'hour_15', 'hour_16', 'hour_17', 'hour_18',
       'hour_19', 'hour_20', 'hour_21', 'hour_22', 'hour_23', 'holiday_0',
       'holiday_1', 'workingday_0', 'workingday_1', 'season_1', 'season_2',
       'season_3', 'season_4', 'weather_1', 'weather_2', 'weather_3',
       'weather_4'],
      dtype='object')

In [19]:
# Train test 분리
X_train, X_test, y_train, y_test = train_test_split(X_features, y_target, test_size=0.3, random_state=0)

In [20]:
# from sklearn.ensemble import VotingRegressor
from sklearn.ensemble import RandomForestRegressor

model_linear = LinearRegression()
model_lasso = Lasso(alpha=0.001)
model_ridge = Ridge(alpha=0.001)

model_vo_r = VotingRegressor( estimators=[('LINEAR', model_linear),('LASSO', model_lasso), ('RIDGE', model_ridge)])

model_vo_r.fit(X_train , y_train)
model_vo_r.score(X_test , y_test), model_vo_r.score(X_train , y_train)

  model = cd_fast.enet_coordinate_descent(


(0.6824857410220659, 0.6977477304913294)

In [21]:
r2_score(y_test, model_vo_r.predict(X_test))

0.6824857410220659

In [22]:
regressor = [model_linear, model_lasso, model_ridge]
for r in regressor:
    r.fit(X_train , y_train)
    print(r, r.score(X_test , y_test), r.score(X_train , y_train))
    print()

LinearRegression() 0.6824850960662128 0.6977477015479163

Lasso(alpha=0.001) 0.6824875811389104 0.6977475255657988

Ridge(alpha=0.001) 0.6824845215232669 0.6977477889294663



  model = cd_fast.enet_coordinate_descent(
