In [1]:
import pandas as pd
from sklearn import preprocessing
import numpy as np
import pickle

In [17]:
from pathlib import Path

import ipynb_path

# Notebook file name without directory or extension; it tags every artifact written below.
# Path(...).stem splits on the platform's path separator and removes only the trailing
# suffix, unlike split('/') + replace('.ipynb', '') which mis-handles other separators and
# strips '.ipynb' wherever it occurs in the name.
now_file_name = Path(ipynb_path.get()).stem

# Input locations (relative to the notebook's working directory).
input_path = '../input/'
status_file_name = 'status.csv'
station_file_name = 'station.csv'
trip_file_name = 'trip.csv'
weather_file_name = 'weather.csv'

# Output locations for submissions and pickled models.
output_path = '../output/'
model_path = '../model/'

# TODO: these numbered names could be produced by a generator instead of being listed by hand.
submit_file_name_1 = f'submission_1_{now_file_name}.csv'
submit_file_name_2 = f'submission_2_{now_file_name}.csv'

# File name under which the fitted model is pickled.
model_name_1 = f'model_1_{now_file_name}.sav'

In [18]:
# Read the three raw tables from the input directory.
status = pd.read_csv(input_path + status_file_name)
station = pd.read_csv(input_path + station_file_name)
weather = pd.read_csv(input_path + weather_file_name)

# Build a zero-padded 'YYYY/MM/DD' string from status' year / month / day columns and
# parse it into a datetime column.
status['date'] = pd.to_datetime(
    status['year'].astype(str)
    + '/' + status['month'].astype(str).str.zfill(2)
    + '/' + status['day'].astype(str).str.zfill(2)
)

# Parse the weather dates as well so both frames share a datetime join key.
weather['date'] = pd.to_datetime(weather['date'])

# Attach the weather of the day to every status row (status rows are always kept).
train_sta_wea = status.merge(weather, on='date', how='left')

# Missing weather events get an explicit placeholder category, then every category is
# mapped to an integer code.
train_sta_wea['events'] = train_sta_wea['events'].fillna('なし')
le = preprocessing.LabelEncoder()
train_sta_wea['events'] = le.fit_transform(train_sta_wea['events'].values.tolist())

# Add each station's city. No explicit key is given, so pandas joins on the columns the
# two frames share (presumably just station_id -- verify against the CSV schemas).
train_sta_wea = train_sta_wea.merge(station[['station_id', 'city']], how='left')

# Snapshots of the rows flagged for prediction (predict == 1) and the others (predict == 0).
# They are taken before 'weekday' is added below, so they do not carry that column.
train_sta_wea_pre_1 = train_sta_wea.loc[train_sta_wea['predict'] == 1]
train_sta_wea_pre_0 = train_sta_wea.loc[train_sta_wea['predict'] == 0]

# Add the day of week (Monday = 0) as a feature.
train_sta_wea['date'] = pd.to_datetime(train_sta_wea['date'])
train_sta_wea['weekday'] = train_sta_wea['date'].dt.dayofweek

In [70]:
# XGBoost section
from datetime import date
from datetime import datetime as dt
from dateutil.relativedelta import relativedelta
class make_tr_va_te():
    """Month-based train / validation / test splitter plus XGBoost regression trainer.

    The frame holds rows with ``predict == 0`` and rows with ``predict == 1``. All splits
    are anchored on ``train_end_next_date``:

    * train: ``predict == 0`` rows dated strictly before ``train_end_next_date``
    * valid: ``predict == 0`` rows in the one-month window starting at ``train_end_next_date``
    * test:  ``predict == 1`` rows in the one-month window that follows the validation month

    Train and validation rows with a missing ``bikes_available`` are discarded.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``id``, ``predict``, ``bikes_available``, ``city`` and ``date``; every
        other column is passed to the model as a feature.
    train_end_next_date : datetime-like
        First day of the validation window (i.e. the day after the training period ends).
    """

    # Identifier, flag, target and non-feature columns removed before building a DMatrix.
    _NON_FEATURE_COLUMNS = ['id', 'predict', 'bikes_available', 'city', 'date']

    def __init__(self, df, train_end_next_date):
        self.df = df
        self.train_end_next_date = train_end_next_date

    def _months_after_split(self, months):
        """Return ``train_end_next_date`` shifted forward by ``months`` calendar months."""
        return self.train_end_next_date + relativedelta(months = months)

    def make_train_data(self):
        """Return ``predict == 0`` rows before the split date that have a target value."""
        train_all = self.df[self.df['predict'] == 0]
        train = train_all[train_all['date'] < self.train_end_next_date]
        return train[train['bikes_available'].notna()]

    def make_valid_data(self):
        """Return ``predict == 0`` rows of the validation month that have a target value."""
        valid_all = self.df[self.df['predict'] == 0]
        in_window = (
            (self.train_end_next_date <= valid_all['date'])
            & (valid_all['date'] < self._months_after_split(1))
        )
        valid = valid_all[in_window]
        return valid[valid['bikes_available'].notna()]

    def make_test_data(self):
        """Return ``predict == 1`` rows of the month following the validation month."""
        test_all = self.df[self.df['predict'] == 1]
        in_window = (
            (self._months_after_split(1) <= test_all['date'])
            & (test_all['date'] < self._months_after_split(2))
        )
        return test_all[in_window]

    def model_for_data(self):
        """Return ``(tr_X, tr_y, va_X, va_y)``: features and targets for train and validation."""
        train = self.make_train_data()
        valid = self.make_valid_data()

        tr_X = train.drop(self._NON_FEATURE_COLUMNS, axis=1)
        tr_y = train['bikes_available']
        va_X = valid.drop(self._NON_FEATURE_COLUMNS, axis=1)
        va_y = valid['bikes_available']
        return tr_X, tr_y, va_X, va_y

    def predict_for_data(self):
        """Return the feature frame of the test split."""
        return self.make_test_data().drop(self._NON_FEATURE_COLUMNS, axis=1)

    def make_fit_model(self):
        """Train an XGBoost regressor with early stopping on the validation month.

        Returns
        -------
        (model, valid_best_score)
            ``model`` is the booster returned by ``xgb.train``. ``valid_best_score`` is the
            validation RMSE recorded at the *last* boosting round that was run. Despite its
            name it is not the minimum over rounds: with early stopping the last round can
            be up to ``early_stopping_rounds`` past the best one. It is the score of the
            full booster, which is also what ``predict`` uses.
        """
        tr_X, tr_y, va_X, va_y = self.model_for_data()
        # Imported here because the notebook does not import xgboost at the top level.
        import xgboost as xgb

        feature_names = list(tr_X.columns)
        dtrain = xgb.DMatrix(tr_X, label=tr_y, feature_names=feature_names)
        dvalid = xgb.DMatrix(va_X, label=va_y, feature_names=feature_names)

        # Training parameters, defined up front.
        xgb_params = {
            # Objective function
            'objective': 'reg:squarederror',
            # Metric evaluated during training
            'eval_metric': 'rmse',
            # Booster type
            'booster': 'gbtree',
            # Same as learning_rate
            'eta': 0.1,
            # Maximum tree depth
            'max_depth': 10,
            # Same as random_state
            'seed': 0,
        }

        # Filled by xgb.train with the per-round metric history.
        evals_result = {}
        model = xgb.train(
            params=xgb_params,
            dtrain=dtrain,
            # Upper bound on the number of boosting rounds
            num_boost_round=1000,
            # Stop once the last eval set has not improved for this many rounds
            early_stopping_rounds=20,
            # The last entry ('eval') is the one used for early stopping
            evals=[(dtrain, 'train'), (dvalid, 'eval')],
            evals_result=evals_result,
        )
        valid_best_score = evals_result['eval']['rmse'][-1]

        return model, valid_best_score

    def model_and_valid_score(self):
        """Return ``(model, valid_best_score)`` exactly as ``make_fit_model`` does.

        The previous implementation treated the returned tuple as a booster and indexed
        ``best_score['valid_1']``, which raised on every call.
        """
        model, valid_best_score = self.make_fit_model()
        return model, valid_best_score

    def predict(self, model_file_name):
        """Fit a model, pickle it and predict the test split.

        Parameters
        ----------
        model_file_name : str
            File name appended to the module-level ``model_path``; an existing file with
            that name is overwritten.

        Returns
        -------
        (sub_df, valid_best_score)
            ``sub_df`` has two unnamed columns: the test ``id`` and the prediction.
        """
        import xgboost as xgb

        # Build the test split once and derive both the ids and the features from it.
        test = self.make_test_data()
        te_X = test.drop(self._NON_FEATURE_COLUMNS, axis=1)
        dtest = xgb.DMatrix(te_X)

        model, valid_best_score = self.make_fit_model()
        # Context manager guarantees the file handle is closed even if pickling fails.
        with open(model_path + model_file_name, 'wb') as model_file:
            pickle.dump(model, model_file)

        y_pred = model.predict(dtest)
        sub_df = pd.DataFrame(list(zip(test['id'], y_pred)))
        print('*****')
        print(valid_best_score)
        return sub_df,valid_best_score
    


def month_range(start, stop, step = relativedelta(months = 1)):
    """Generate ``start``, ``start + step``, ... while the value is still below ``stop``.

    ``stop`` itself is never produced. With the default step the values are one calendar
    month apart. Nothing is yielded when ``start`` is not below ``stop``.
    """
    cursor = start
    while True:
        # Same exit condition as a `while cursor < stop` loop, written as a guard.
        if not cursor < stop:
            return
        yield cursor
        cursor += step

In [71]:
# NOTE(review): a function with this exact name and body is also defined at the end of the
# previous cell; this later definition shadows it. One of the two should be removed.
def month_range(start, stop, step = relativedelta(months = 1)):
    """Generate ``start``, ``start + step``, ... while the value is still below ``stop``.

    ``stop`` itself is never produced. With the default step the values are one calendar
    month apart. Nothing is yielded when ``start`` is not below ``stop``.
    """
    cursor = start
    while True:
        # Same exit condition as a `while cursor < stop` loop, written as a guard.
        if not cursor < stop:
            return
        yield cursor
        cursor += step

In [72]:
from datetime import date
from datetime import datetime as dt
from dateutil.relativedelta import relativedelta
# Per-month prediction frames and validation scores, collected in lists so the frames are
# concatenated once instead of re-copying a growing DataFrame on every iteration.
sub_df_list = []
valid_score_list = []
# train_end_next_date must be the first day of the validation month.
# NOTE(review): every iteration pickles its model under the same name (model_name_1), so
# only the model of the last month remains on disk.
for d in month_range(dt(2014,8,1), dt(2015,8,1)):
    make_data = make_tr_va_te(train_sta_wea, d)
    sub_df, valid_best_score = make_data.predict(model_name_1)
    sub_df_list.append(sub_df)
    valid_score_list.append(valid_best_score)
sub_df_all = pd.concat(sub_df_list)
# Unweighted mean of the monthly validation RMSEs.
cv_score = sum(valid_score_list)/len(valid_score_list)
print(f'CV score is {cv_score}')

[0]	train-rmse:8.13668	eval-rmse:8.04548
[1]	train-rmse:7.43331	eval-rmse:7.38141
[2]	train-rmse:6.81056	eval-rmse:6.79679
[3]	train-rmse:6.25986	eval-rmse:6.28094
[4]	train-rmse:5.77403	eval-rmse:5.82594
[5]	train-rmse:5.34604	eval-rmse:5.43324
[6]	train-rmse:4.97401	eval-rmse:5.10494
[7]	train-rmse:4.64948	eval-rmse:4.80917
[8]	train-rmse:4.36603	eval-rmse:4.55817
[9]	train-rmse:4.11974	eval-rmse:4.34253
[10]	train-rmse:3.91010	eval-rmse:4.16414
[11]	train-rmse:3.72832	eval-rmse:4.00585
[12]	train-rmse:3.57529	eval-rmse:3.88616
[13]	train-rmse:3.44135	eval-rmse:3.77770
[14]	train-rmse:3.32636	eval-rmse:3.67919
[15]	train-rmse:3.22789	eval-rmse:3.60590
[16]	train-rmse:3.14507	eval-rmse:3.53777
[17]	train-rmse:3.07327	eval-rmse:3.48359
[18]	train-rmse:3.01096	eval-rmse:3.44039
[19]	train-rmse:2.96050	eval-rmse:3.40422
[20]	train-rmse:2.91777	eval-rmse:3.37326
[21]	train-rmse:2.88024	eval-rmse:3.34884
[22]	train-rmse:2.84739	eval-rmse:3.32708
[23]	train-rmse:2.81846	eval-rmse:3.30997
[2

In [73]:
# sub_df_all.to_csv(output_path+submit_file_name_1, index=False, header=False)

In [37]:
# Fit one model with 2014-08-01 as the split date: rows dated before it are used for
# training and the month starting on it for validation.
make_data = make_tr_va_te(train_sta_wea, dt(2014,8,1))
# NOTE(review): make_fit_model() as defined in this notebook returns (model, valid_best_score),
# so `evals_result` receives a scalar RMSE here, not the per-round history dict its name suggests.
model, evals_result = make_data.make_fit_model()

[0]	train-rmse:8.13668	eval-rmse:8.04548
[1]	train-rmse:7.43331	eval-rmse:7.38141
[2]	train-rmse:6.81056	eval-rmse:6.79679
[3]	train-rmse:6.25986	eval-rmse:6.28094
[4]	train-rmse:5.77403	eval-rmse:5.82594
[5]	train-rmse:5.34604	eval-rmse:5.43324
[6]	train-rmse:4.97401	eval-rmse:5.10494
[7]	train-rmse:4.64948	eval-rmse:4.80917
[8]	train-rmse:4.36603	eval-rmse:4.55817
[9]	train-rmse:4.11974	eval-rmse:4.34253
[10]	train-rmse:3.91010	eval-rmse:4.16414
[11]	train-rmse:3.72832	eval-rmse:4.00585
[12]	train-rmse:3.57529	eval-rmse:3.88616
[13]	train-rmse:3.44135	eval-rmse:3.77770
[14]	train-rmse:3.32636	eval-rmse:3.67919
[15]	train-rmse:3.22789	eval-rmse:3.60590
[16]	train-rmse:3.14507	eval-rmse:3.53777
[17]	train-rmse:3.07327	eval-rmse:3.48359
[18]	train-rmse:3.01096	eval-rmse:3.44039
[19]	train-rmse:2.96050	eval-rmse:3.40422
[20]	train-rmse:2.91777	eval-rmse:3.37326
[21]	train-rmse:2.88024	eval-rmse:3.34884
[22]	train-rmse:2.84739	eval-rmse:3.32708
[23]	train-rmse:2.81846	eval-rmse:3.30997
[2

In [42]:
# Display the second value returned by make_fit_model() in the cell above.
# NOTE(review): the saved output below is a per-round history dict, whereas make_fit_model()
# as defined in this notebook returns a scalar score -- the output looks stale; re-run to confirm.
evals_result

{'train': OrderedDict([('rmse',
               [8.136678,
                7.43331,
                6.81056,
                6.259858,
                5.774028,
                5.346041,
                4.974012,
                4.649478,
                4.366031,
                4.119739,
                3.910095,
                3.728321,
                3.575285,
                3.441346,
                3.326364,
                3.227893,
                3.145071,
                3.073267,
                3.010957,
                2.960499,
                2.917767,
                2.880244,
                2.847389,
                2.818463,
                2.793426,
                2.774729,
                2.749967,
                2.730716,
                2.71147,
                2.700451,
                2.688217,
                2.679865,
                2.667901,
                2.656528,
                2.652183,
                2.639302,
                2.625795,
         

In [44]:
# Final-round validation RMSE. make_fit_model() as defined in this notebook already returns
# that scalar, so indexing it like a history dict raises TypeError on a fresh run. The dict
# form (presumably produced by an earlier version of make_fit_model) is still accepted.
evals_result['eval']['rmse'][-1] if isinstance(evals_result, dict) else evals_result

3.219704