<font size=6><b>Bike Sharing Demand - ML

<img src="./logo.png">
* ref : https://www.kaggle.com/competitions/bike-sharing-demand/data <br>
* ref : https://dacon.io/competitions/official/235985/data

In [1]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns


sns.set()

#-------------------- 차트 관련 속성 (한글처리, 그리드) -----------
plt.rcParams['font.family']= 'Malgun Gothic'
plt.rcParams['axes.unicode_minus'] = False

#-------------------- 주피터 , 출력결과 넓이 늘리기 ---------------
# from IPython.core.display import display, HTML
from IPython.display import display, HTML

display(HTML("<style>.container{width:100% !important;}</style>"))
pd.set_option('display.max_rows', 100)
pd.set_option('display.max_columns', 100)
pd.set_option('max_colwidth', None)

import warnings
warnings.filterwarnings(action='ignore')

In [2]:
from sklearn.model_selection import train_test_split, cross_val_score, cross_validate, GridSearchCV, StratifiedKFold, KFold
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_squared_log_error

from sklearn.ensemble     import RandomForestRegressor
from sklearn.tree         import DecisionTreeRegressor
from sklearn.linear_model import LinearRegression, Ridge, Lasso
# ---- 추가 모델
from sklearn.ensemble     import AdaBoostRegressor, VotingRegressor
from xgboost              import XGBRegressor
from lightgbm             import LGBMRegressor



# Data Load

In [3]:
train = pd.read_csv("./train.csv", parse_dates=['datetime'])
test  = pd.read_csv("./test.csv" , parse_dates=['datetime'])

In [4]:
df_list = [train, test]
for df in df_list:
    df.rename(columns = {'datetime' : 'regdate', 'count' : 'regcount'}, inplace = True)
    df.columns = df.columns.str.lower()
    print(df.info())    
    print("====="*10)


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10886 entries, 0 to 10885
Data columns (total 12 columns):
 #   Column      Non-Null Count  Dtype         
---  ------      --------------  -----         
 0   regdate     10886 non-null  datetime64[ns]
 1   season      10886 non-null  int64         
 2   holiday     10886 non-null  int64         
 3   workingday  10886 non-null  int64         
 4   weather     10886 non-null  int64         
 5   temp        10886 non-null  float64       
 6   atemp       10886 non-null  float64       
 7   humidity    10886 non-null  int64         
 8   windspeed   10886 non-null  float64       
 9   casual      10886 non-null  int64         
 10  registered  10886 non-null  int64         
 11  regcount    10886 non-null  int64         
dtypes: datetime64[ns](1), float64(3), int64(8)
memory usage: 1020.7 KB
None
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6493 entries, 0 to 6492
Data columns (total 9 columns):
 #   Column      Non-Null Count  Dty

## 점수보기

# Feature Engineering

## 파생피쳐
* regdate
* day_type

In [5]:
df_list = [train, test]
for df in df_list:
    df['y'] = df['regdate'].dt.year
    df['m'] = df['regdate'].dt.month
    df['d'] = df['regdate'].dt.day
    # train과 test 날짜가 다르다 / 있으면 안 된다
    df['h'] = df['regdate'].dt.hour
    df['w'] = df['regdate'].dt.dayofweek
    #df['woy'] = df['regdate'].dt.weekofyear
    
    
    
    df['day_type'] = 0
    df['day_type'] = np.where( (df['holiday']==0) & (df['workingday'] == 1),   1,  df['day_type'])
    df['day_type'] = np.where( (df['holiday']==1) & (df['workingday'] == 0),   2,  df['day_type'])
    
    # df['peak'] = df[['h', 'workingday']].apply(lambda x: (0, 1)[(x['workingday'] == 1 and  ( x['h'] == 8 or 17 <= x['h'] <= 18 or 12 <= x['h'] <= 12)) or (x['workingday'] == 0 and  10 <= x['h'] <= 19)], axis = 1)
    # df['ideal'] = df[['temp', 'windspeed']].apply(lambda x: (0, 1)[x['temp'] > 27 and x['windspeed'] < 30], axis = 1)
    # df['sticky'] = df[['humidity', 'workingday']].apply(lambda x: (0, 1)[x['workingday'] == 1 and x['humidity'] >= 60], axis = 1)
    
#---------------------------------------------------------------------   
    #sandy
    df['holiday'] = df[['m', 'd', 'holiday', 'y']].apply(lambda x: (x['holiday'], 1)[x['y'] == 2012 and x['m'] == 10 and (x['d'] in [30])], axis = 1)
    #christmas day and others
    df['holiday'] = df[['m', 'd', 'holiday']].apply(lambda x: (x['holiday'], 1)[x['m'] == 12 and (x['d'] in [24, 26, 31])], axis = 1)
    df['workingday'] = df[['m', 'd', 'workingday']].apply(lambda x: (x['workingday'], 0)[x['m'] == 12 and x['d'] in [24, 31]], axis = 1)
    df.set_index('regdate', inplace=True)
    df.drop('d', axis = 1, inplace = True)
    
#---------------------------------------------------------------------
    # oh - encoding
    df =pd.get_dummies(train, columns=['season','weather','m','y', 'h','w', 'day_type'])
    # df.info()
#---------------------------------------------------------------------
    # target scaler : 회귀에서는 target 반드시 스케일링 한다
    
#---------------------------------------------------------------------    
    

In [6]:
test.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 6493 entries, 2011-01-20 00:00:00 to 2012-12-31 23:00:00
Data columns (total 13 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   season      6493 non-null   int64  
 1   holiday     6493 non-null   int64  
 2   workingday  6493 non-null   int64  
 3   weather     6493 non-null   int64  
 4   temp        6493 non-null   float64
 5   atemp       6493 non-null   float64
 6   humidity    6493 non-null   int64  
 7   windspeed   6493 non-null   float64
 8   y           6493 non-null   int64  
 9   m           6493 non-null   int64  
 10  h           6493 non-null   int64  
 11  w           6493 non-null   int64  
 12  day_type    6493 non-null   int64  
dtypes: float64(3), int64(10)
memory usage: 710.2 KB


## windspeed 0 채우기

In [7]:
train.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 10886 entries, 2011-01-01 00:00:00 to 2012-12-19 23:00:00
Data columns (total 16 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   season      10886 non-null  int64  
 1   holiday     10886 non-null  int64  
 2   workingday  10886 non-null  int64  
 3   weather     10886 non-null  int64  
 4   temp        10886 non-null  float64
 5   atemp       10886 non-null  float64
 6   humidity    10886 non-null  int64  
 7   windspeed   10886 non-null  float64
 8   casual      10886 non-null  int64  
 9   registered  10886 non-null  int64  
 10  regcount    10886 non-null  int64  
 11  y           10886 non-null  int64  
 12  m           10886 non-null  int64  
 13  h           10886 non-null  int64  
 14  w           10886 non-null  int64  
 15  day_type    10886 non-null  int64  
dtypes: float64(3), int64(13)
memory usage: 1.4 MB


## windspeed 0 채우기

In [8]:
train.shape, test.shape

((10886, 16), (6493, 13))

In [9]:
def my_fill_windspeed(df):
    target = df[['regcount','casual','registered']]
    df = df.drop(['regcount','casual','registered'], axis=1)

    df1   = df[df['windspeed'] != 0]
    y_df1 = df1['windspeed']
    X_df1 = df1.drop('windspeed', axis=1)
    
    rf = RandomForestRegressor(random_state=11)
    X_df1_8, X_df1_2, y_df1_8, y_df1_2 = train_test_split(X_df1, y_df1, test_size=0.2,random_state=11)
    rf.fit(X_df1_8, y_df1_8)
    
    pred = rf.predict(X_df1_2)
    mse_score = mean_squared_error(y_df1_2, pred)
    print("RMSE : ", np.sqrt(mse_score) )

    df0 = df[df['windspeed'] == 0]
    y_df0 = df0['windspeed']
    X_df0 = df0.drop('windspeed', axis=1)
    pred = rf.predict(X_df0)
    widx = X_df0.index.values
    
    df.loc[widx, 'windspeed'] = pred
    return pd.concat([df, target], axis=1)

In [10]:
df = pd.concat([train, test], axis=0, ignore_index=True)
train_idx = df.iloc[:train.shape[0], :].index
test_idx  = df.iloc[train.shape[0]:, :].index
#---------------------------------
df = my_fill_windspeed(df)
#---------------------------------
train = df.loc[train_idx]
test  = df.loc[test_idx]
test = test.reset_index(drop=True)

# train분리해서 학습시켜라/ train에는 없고, test에 있는 경우도 있다/ 없는 내용도 예측할 수 있냐 테스트

RMSE :  4.859093922505261


## 점수보기

* class sklearn.linear_model.<b>Ridge</b>(alpha=1.0, *, fit_intercept=True, copy_X=True, max_iter=None, tol=0.0001, solver='auto', positive=False, random_state=None)
* class sklearn.linear_model.<b>Lasso</b>(alpha=1.0, *, fit_intercept=True, precompute=False, copy_X=True, max_iter=1000, tol=0.0001, warm_start=False, positive=False, random_state=None, selection='cyclic')

★CART 모델 : Classifacationa And Regression Tree model/ 대부분의 Tree모델은 회귀, 분류 둘다 제공

★sklearn : estimator - model - regressor/classifier

In [11]:
import sklearn
# sklearn.metrics.get_scorer_names()

In [12]:
modelsss = Ridge(alpha=1.0, random_state=0)
modelsss.__class__.__name__
# 모델 이름 확인하기!!

'Ridge'

In [13]:
#-----------------모델의 가중치(coef_ 혹은 feature_importances_ ) 차트로 확인
def my_view_chart(model_name, model, X_train):
    if tpl[0] == "RIDGE" :
        ax1 = plt.subplot(4,1,1)
        s = pd.Series(model.coef_, index=X_train.columns).sort_values()
        sns.barplot(x=s.values, y=s.index, ax=ax1)
        plt.show()
    elif tpl[0] == "RF" :
        ax2 = plt.subplot(4,1,2)
        s = pd.Series(model.feature_importances_, index=X_train.columns).sort_values()
        sns.barplot(x=s.values, y=s.index, ax=ax2)
        plt.show()
    elif tpl[0] == "LGBM" :
        ax3 = plt.subplot(4,1,3)
        s = pd.Series(model.feature_importances_, index=X_train.columns).sort_values()
        sns.barplot(x=s.values, y=s.index, ax=ax3)
        plt.show()
    elif tpl[0] == "XGB" :
        ax4 = plt.subplot(4,1,4)
        # pd.Series(model.get_booster().get_score(importance_type='weight'), index=X_train.columns).sort_values().plot(kind='bar')
        s = pd.Series(model.get_booster().get_score(importance_type='weight'), index=X_train.columns).sort_values()
        # weight,gain,
        sns.barplot(x=s.values, y=s.index, ax=ax4)
        plt.show()
        
#---------------------------------------------------------------------
# tree는 무조건 feature_importance있다. regression이 문제      
# feature_importance확인은 무조건 fit한 이후에!!!!
# VotingRegressor, ansemble은 각각 피쳐 중요도 다른 애들 섞어야 점수가 좋다, 서로가 서로를 보완 > overfitting 방지
# "RF"-"LGBM" : 둘다 'h'에 치중한 학습 / 오버피팅은 당연
#---------------------------------------------------------------------

* 원핫 인코딩 하고 히트맵 그려보기

In [14]:
# plt.figure(figsize=(8,8))
# sns.heatmap(df[['casual', 'registered','holiday', 'workingday','day_type_0',
#        'day_type_1', 'day_type_2']].corr(), annot = True, fmt = '2f', cmap = 'coolwarm')
# plt.show()

#--> 그나마 'holiday', 'workingday', 'day_type'별로 연관성이 있어보임 
#--> 'holiday', 'workingday' 과 'day_type' 다중공선 걸림  --> 칼럼 특징 다 살리는 'day_type'살리는 것이 의미있다고 판단
#--> 그러나 점수보기 전까지는 모른다 뭘 살려야 할지

# # 상감분석
# 0~0.3미만은 연관성이 거의 없다
# 0.3~0.7 연관성이 높다
# 0.7~1 연관성이 매우 높다 다중공성 O
# linear 연관성만 보여줌, 맹신하면 안 됨

In [15]:
# plt.figure(figsize=(20,20))
# sns.heatmap(df[['casual', 'registered','h_0', 'h_1', 'h_2', 'h_3', 'h_4',
#        'h_5', 'h_6', 'h_7', 'h_8', 'h_9', 'h_10', 'h_11', 'h_12', 'h_13',
#        'h_14', 'h_15', 'h_16', 'h_17', 'h_18', 'h_19', 'h_20', 'h_21', 'h_22',
#        'h_23', ]].corr(), annot = True, fmt = '2f', cmap = 'coolwarm')
# plt.show()
# 0~7시, 17~18시 타겟과 연관관계 있어보인다

In [16]:
# plt.figure(figsize=(20,20))
# sns.heatmap(train[['season', 'holiday', 'workingday', 'weather', 'temp', 'atemp',
#        'humidity', 'windspeed', 'y', 'm', 'd', 'h', 'w', 'day_type',
#        'regcount', 'casual', 'registered']].corr(), annot = True, fmt = '2f', cmap = 'coolwarm')
# plt.show()

## 점수보기

In [17]:
# my_fit_score(train)
# # my_fit_score(train, chart_view=True)

# 타겟스케일링 - 아웃라이어삭제
# VR-XGB-LGBM :0.28827793271329355

## feature_importance 확인

In [18]:
# train.columns

## 원핫인코딩

In [19]:
for col in train.columns:
    print(col,df[col].nunique())
# 이산형 데이터 가지는 컬럼확인

season 4
holiday 2
workingday 2
weather 4
temp 50
atemp 65
humidity 89
windspeed 2191
y 2
m 12
h 24
w 7
day_type 3
regcount 822
casual 309
registered 731


In [20]:
df.columns

Index(['season', 'holiday', 'workingday', 'weather', 'temp', 'atemp',
       'humidity', 'windspeed', 'y', 'm', 'h', 'w', 'day_type', 'regcount',
       'casual', 'registered'],
      dtype='object')

## 점수보기
* RF
RMSLE:  0.3288546947603633

In [21]:
train.columns

Index(['season', 'holiday', 'workingday', 'weather', 'temp', 'atemp',
       'humidity', 'windspeed', 'y', 'm', 'h', 'w', 'day_type', 'regcount',
       'casual', 'registered'],
      dtype='object')

In [22]:
def my_fit_score(df, chart_view=False) :
    model_list = [ 
                   # ("RIDGE"  , Ridge(alpha=1.0, random_state=0)),
                   # ("LASSO"  , Lasso(alpha=1.0, random_state=0)),
                   ("DTR"    , DecisionTreeRegressor(random_state=0)),
                   # ("RF"     , RandomForestRegressor(random_state=0)),
                   # ("LR"     , LinearRegression()                   ),
                   # ("ABOOST" , AdaBoostRegressor(random_state=0)                  ),
                 #   ("XGB"    , XGBRegressor(random_state=0)                       ),  # booster = gblinear
                   ("LGBM"   , LGBMRegressor(random_state=0)                      )
                 #   ("VR-XGB-LGBM"    , VotingRegressor([("XGB", XGBRegressor(random_state=0)), ("LGBM", LGBMRegressor(random_state=0))]) ) ,
                 #   ("VR-RF-LGBM"    , VotingRegressor([("DTR", RandomForestRegressor(random_state=0)), ("LGBM", LGBMRegressor(random_state=0))]) )
                 ]
    
    # y_c = df['casual'] 
    # y_r = df['registered'] 
    
#-------------- 아웃라이어 삭제 : ★★★train만 삭제
# 스케일링 하기 전에 아웃라이어 삭제해야 이쁘게 된다
# windspeed = 0채우고 이상치 삭제
    del_idx_list = []
    idx = df[df['weather']==4].index
    del_idx_list.extend(idx)
    idx = df[df['temp']>40].index
    del_idx_list.extend(idx)
    idx = df[df['windspeed']>50].index
    del_idx_list.extend(idx)
    train.drop(del_idx_list, axis=0, inplace=True)

#--------------------------------------------------------------------- 



#------------------타겟 피쳐 로그스케일링 > O / (다른 피쳐는 안 해도 타겟 피쳐는 무조건 스케일링!!!!)
    y_c = np.log1p( df['casual'] )
    y_r = np.log1p( df['registered'] )
    X = df.drop(['regcount','casual','registered'], axis=1)
    
    
#-----------------연속형 피쳐 스케일링 >o
    X['temp']      = np.log1p( X['temp'] )
#     #   X['temp'] = np.log(   X['temp'] )+1
    X['atemp']     = np.log1p( X['atemp'] )
    X['humidity']  = np.log1p( X['humidity'] )
    X['windspeed'] = np.log1p( X['windspeed'] )
    
# #---------------------------------------------------------------------

    
    for tpl in model_list :
        # print( tpl[0] ) 
        
        # model = tpl[1]
        # X_train, X_test, y_train, y_r_test = train_test_split(X, y_r, random_state=0, test_size=0.2)
        # model.fit(X_train, y_train)
        # y_r_pred = model.predict(X_test)
        
        
        # model = tpl[1]
        # score_list_r = cross_val_score(model, X, y_r, scoring = 'neg_mean_squared_log_error', cv = 5)
        # r_sc_mean=np.mean(score_list_r)
        # score_list_c = cross_val_score(model, X, y_c, scoring = 'neg_mean_squared_log_error', cv = 5) #-----에러가 큰데, best로 판다/음수를 붙여서 error가 클수록 안 좋은 것으로 판단
        # c_sc_mean = np.sqrt( np.mean(score_list_c) )  #------------RMSE
        
        
        kf = KFold(n_splits=3, shuffle=False)
        for i,  (train_index, test_index) in enumerate(kf.split(X)):
            X_train = X.iloc[train_index] 
            y_r_train = y_r.iloc[train_index]
            X_test  = X.iloc[test_index]  
            y_r_test  = y_r.iloc[test_index] 
            
            
        # kf = KFold(n_splits=5, shuffle=False)    
            X_train = X.iloc[train_index] 
            y_c_train = y_c.iloc[train_index] 
            X_test  = X.iloc[test_index]  
            y_c_test  = y_c.iloc[test_index] 
        
        
        
        
        
        if tpl[0]=='DTR' or tpl[0]=='RF' :
            model = tpl[1]
            mydic = {"min_samples_split": [1,2,3,4], 
                     "min_samples_leaf": [1,2,3,4]}
            models = GridSearchCV(model, scoring = 'neg_mean_squared_error',cv = 5, param_grid = mydic, refit=True )
            
            models.fit(X_train, y_r_train)
            # pred_r = models.predict(X_test)
            sc_r = (models.best_score_)

            models.fit(X_train, y_c_train)
            # pred_c = models.predict(X_test)
            sc_c = (models.best_score_)
#             print(sc_r, sc_c, models.best_params_) 
#             print("-"*30)
            
            

        elif tpl[0] == 'LGBM' :
            model = tpl[1]
            mydic = {"min_child_samples": [10,20,30,40], 
                     "subsample_for_bin": [100000,200000,300000,400000]}
            models_LGBM = GridSearchCV(model, scoring = 'neg_mean_squared_error',cv = 5, param_grid = mydic, refit=True )
            
            models_LGBM.fit(X_train, y_r_train)
            # pred_r = models.predict(X_test)
            sc_r = (models_LGBM .best_score_)

            models_LGBM.fit(X_train, y_c_train)
            # pred_c = models.predict(X_test)
            sc_c = (models_LGBM.best_score_)
            # print(sc_r, sc_c,models_LGBM.best_params_) 
            # print("-"*30)
            
    return models_LGBM


          
        
        
        
#-----------------모델의 가중치(coef_ 혹은 feature_importances_ ) 차트로 확인
       
            # my_view_chart(tpl[0], model, X-train)

#---------------------------------------------------------------------    
       
        
#------------------타겟 피쳐 로그스케일링 복원        
        # y_pred_comb = np.expm1(y_r_pred)+np.expm1(y_c_pred)
        # y_real_comb = np.expm1(y_r_test)+np.expm1(y_c_test)
        # y_pred_comb[y_pred_comb < 0] = 0
        # msle_score = mean_squared_log_error(y_real_comb, y_pred_comb)
        
#---------------------------------------------------------------------
# 다중공선 (원핫인코딩 하기전 컬럼으로 확인)
# m-----season
# temp--atemp
# w-----day_type 
# 지워봐도 점수가 좋아지지 않음
#---------------------------------------------------------------------

#-------------------원핫 인코더('weather', 'season', 'y', 'm', 'h', 'w')
# 피쳐간의 상감도 분석/ 컬럼 중에 한 값만 유난히 특별히 두드러지게 다른 칼럼과 연관있을 거 같을 때/연관있다고 모델이 인식 
# > 여름,가을,겨울은 별로 대여수랑 관련 없다/봄에 대여수 많다 > 봄과 대여수 연관도 있다고 판단
# 컬럼과 컬럼 상감도 분석하면 > 칼럼 간 연관도 없다고 판단
# 바이너리 카테고리 칼럼(여,남)은 원핫인코더 하지 말아라 > 다중공성 > 오버핏팅 
# 다중공성 걸리더라도 유난히 한 값이 연관성이 크다면 원핫인코딩 유지
#---------------------------------------------------------------------

    


In [23]:
models_LGBM=my_fit_score(train)

* coef --> 가중치(w)
* intercept --> 절편
* 주는 모델이 있고, 아닌 모델이 있다

## Outlier 처리
* <font color=red size=4><b>train에만 처리
*  점수 낮아짐

In [24]:
train.shape, test.shape

((10880, 16), (6493, 16))

In [25]:
# del_idx_list = []
# idx = train[train['weather']==4].index
# del_idx_list.extend(idx)
# idx = train[train['temp']>40].index
# del_idx_list.extend(idx)
# idx = train[train['windspeed']>50].index
# del_idx_list.extend(idx)
# train.drop(del_idx_list, axis=0, inplace=True)

In [26]:
train.shape, test.shape

((10880, 16), (6493, 16))

In [27]:
my_fit_score(train)

GridSearchCV(cv=5, estimator=LGBMRegressor(random_state=0),
             param_grid={'min_child_samples': [10, 20, 30, 40],
                         'subsample_for_bin': [100000, 200000, 300000, 400000]},
             scoring='neg_mean_squared_error')

## 점수보기

# 학습

## 타켓 선정
* 답안지 : count 제출
* regcount(A패턴)  = registered(A2패턴) + casual (B패턴) 
* <font color=red><b>registered(A2패턴) + casual (B패턴)  --> 이 값을 답안으로 제출

# 최종피쳐 선정

In [28]:
train.columns

Index(['season', 'holiday', 'workingday', 'weather', 'temp', 'atemp',
       'humidity', 'windspeed', 'y', 'm', 'h', 'w', 'day_type', 'regcount',
       'casual', 'registered'],
      dtype='object')

<pre>
타겟(1) : 'casual' + 'registered'
타겟(2) : 'regcount'


'd' 피쳐 삭제
('m', 'h'),  'y', 'd' -->'w'


(+)3개 : 'holiday', 'workingday', 'day_type'
(+)2개 : 'holiday', 'workingday'
(+)1개 : 'day_type'


'weather', 'humidity',  'season', 
(+)'windspeed' : 'h' 비례? / 'm' 반비례?


다중공선 : (상관계수가 높은 경우)
'regcount' 'registered' 'casual' : 0.97  --> 'regcount' vs. 'registered'+'casual'
'temp' 'atemp'                   : 0.98  --> 'atemp' drop

선택적으로 모델에 적용
(+)'w' 'day_type' : -0.78
(+)'season' 'm'   : 0.97


* ('m', 'h'),  'y', 'w', 'weather', 'humidity',  'season', 'temp'
* (+)'windspeed'
* (+)('holiday', 'workingday'), ('day_type')
* (+)('w' 'day_type') : -0.78
* (+)('m') 'season'    : 0.97


## 점수보기

# 학습 & 평가
* ref : https://suboptimal.wiki/explanation/mse/

*  $ RMSLE = \sqrt{\frac{1}{n}\Sigma_{i=1}^{n}{\Big(\frac{log(Y_i+1) - log(\hat{Y_i}+1)}{N}\Big)^2}}$

In [29]:
# ! pip install xgboost
# ! pip install lightgbm 

In [30]:
       # score_list.append([tpl[0], y_col[i], mse_score, np.sqrt(mse_score) ] )

In [31]:
# score_df = pd.DataFrame(score_list, columns=["model","col","mse","rmse"])
# score_df

In [32]:
# plt.figure(figsize=(8,3))
# ax1 = plt.subplot(1,2,1)
# ax1.set_title("RMSE")
# sns.barplot(
#     data= score_df,
#     x= "model",
#     y= "rmse" ,
#     hue = "col",
#     ax = ax1
    
# )
# ax2 = plt.subplot(1,2,2)
# ax2.set_title("MSE")
# sns.barplot(
#     data= score_df,
#     x= "model",
#     y= "mse" ,
#     hue = "col",
#     ax = ax2
# )
# plt.show()

* by 규환

In [33]:
# y = train[['regcount','casual','registered']]
# X = train.drop(['regcount','casual','registered'], axis=1)


In [34]:
# X.head(), y.head()

In [35]:
# X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0, test_size=0.2)
# print(X_train.shape, y_train.shape, X_test.shape, y_test.shape)

In [36]:
# model = DecisionTreeRegressor(random_state=0)
# model.fit(X_train, y_train)
# y_pred = model.predict(X_test)


In [37]:
# print( y_test.shape )
# y_test.head(3)

In [38]:
# print( y_pred.shape )

In [39]:
# y_pred[0]

In [40]:
# print( y_pred.T[0].shape )
# y_pred.T

In [41]:
# y_test['casual']

In [42]:
# # regcount(A패턴)  = registered(A2패턴) + casual (B패턴) 
# score_regcount   = mean_squared_error(y_test['regcount']    , y_pred.T[0])
# score_casual     = mean_squared_error(y_test['casual'], y_pred.T[1])
# score_registered = mean_squared_error(y_test['registered']  , y_pred.T[2])

# print(score_regcount, score_casual, score_registered)

# 실제 점수내기

In [68]:
ss = pd.read_csv("./sampleSubmission.csv" , parse_dates=['datetime'])
ss.head(2)

Unnamed: 0,datetime,count
0,2011-01-20 00:00:00,0
1,2011-01-20 01:00:00,0


In [69]:
train.head(2)

Unnamed: 0,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,y,m,h,w,day_type
0,0.693147,0.0,0.0,0.693147,2.383243,2.734043,4.406719,2.099526,7.606885,0.693147,0.0,1.791759,0.0
1,0.693147,0.0,0.0,0.693147,2.304583,2.683416,4.394449,2.069682,7.606885,0.693147,0.693147,1.791759,0.0


In [70]:
test_c =  np.log1p(test['casual'] )
test_r =  np.log1p(test['registered'] )
test = np.log1p(test)
test_1 = test.drop(['casual','registered','regcount'], axis = 1)

In [71]:
train_c = np.log1p( train['casual'] )
train_r =  np.log1p(train['registered'] )
train = np.log1p( train)
train = train.drop(['casual','registered','regcount'], axis = 1)

KeyError: 'casual'

In [None]:
test.head(2)

In [72]:
models_LGBM.fit(train, train_c)
res_c = models_LGBM.predict(test_1)
res_c

array([0.20968394, 0.07602109, 0.07602109, ..., 0.34902993, 0.34902993,
       0.34902993])

In [73]:
models_LGBM.fit(train, train_r)
res_r = models_LGBM.predict(test_1)
res_r

array([1.96813163, 1.38718235, 1.38718235, ..., 1.09960962, 1.09960962,
       1.09960962])

In [83]:
res_sum = res_c + res_r
res_sum =np.expm1(res_sum)
res_sum=res_sum.astype('int')

In [84]:
res_sum = pd.Series( res_sum)
res_sum

0       7
1       3
2       3
3       3
4       1
       ..
6488    3
6489    3
6490    3
6491    3
6492    3
Length: 6493, dtype: int32

In [85]:
ss['count'] = res_sum

In [86]:
ss.head()

Unnamed: 0,datetime,count
0,2011-01-20 00:00:00,7
1,2011-01-20 01:00:00,3
2,2011-01-20 02:00:00,3
3,2011-01-20 03:00:00,3
4,2011-01-20 04:00:00,1


In [87]:
ss.to_csv("submission.csv", index = False)


In [None]:
print(test.index.values)

In [None]:
train.shape, test.shape

In [None]:
test.info()