In [12]:
import pandas as pd
from sklearn import preprocessing
import numpy as np
import pickle

In [13]:
import os

import ipynb_path

# Notebook file name without directory or extension; it is embedded in the
# names of the saved model and submission files.
# os.path is used instead of split('/') so the platform's path separator is
# honoured, and splitext removes only the trailing extension rather than
# every '.ipynb' substring.
now_file_name = os.path.splitext(os.path.basename(ipynb_path.get()))[0]

# Input data location and file names.
input_path = '../input/'
status_file_name = 'status.csv'
station_file_name = 'station.csv'
trip_file_name = 'trip.csv'
weather_file_name = 'weather.csv'

# Output locations for submissions and pickled models.
output_path = '../output/'
model_path = '../model/'


In [14]:
# Comment this cell out once it has been run:
# running it again can lead to duplicated save-file names.
###########################################################
# When running the notebook again, set start_num and end_num
# of the model list accordingly.
###########################################################
st_in = input('初めの数字を入力してください')
end_in = input('終わりの数字を入力してください')
start_num, end_num = int(st_in), int(end_in)

# Map each model file name to its submission file name; the run numbers
# cover start_num (inclusive) up to end_num (exclusive).
model_submit_dict = {
    f'model_{n}_{now_file_name}.sav': f'submission_{n}_{now_file_name}.csv'
    for n in range(start_num, end_num)
}
model_name_list = list(model_submit_dict.keys())
submit_file_name_list = list(model_submit_dict.values())

初めの数字を入力してください 1
終わりの数字を入力してください 21


In [15]:
# Load the raw tables.
status = pd.read_csv(input_path + status_file_name)
station = pd.read_csv(input_path + station_file_name)
weather = pd.read_csv(input_path + weather_file_name)

# Combine status' year / month / day columns into a zero-padded
# "YYYY/MM/DD" string and parse it into a datetime column.
status['date'] = pd.to_datetime(
    status['year'].astype(str)
    + '/' + status['month'].astype(str).str.zfill(2)
    + '/' + status['day'].astype(str).str.zfill(2)
)

weather['date'] = pd.to_datetime(weather['date'])

# Attach the weather of the same date to every status row.
train_sta_wea = status.merge(weather, on='date', how='left')

# Missing weather events get an explicit placeholder label before encoding.
train_sta_wea['events'] = train_sta_wea['events'].fillna('なし')

# Encode the event labels as integers.
le = preprocessing.LabelEncoder()
event_labels = train_sta_wea['events'].values.tolist()
train_sta_wea['events'] = le.fit_transform(event_labels)

# Add the station's city (joined on the columns the two frames share).
train_sta_wea = train_sta_wea.merge(station[['station_id', 'city']], how='left')

# NOTE(review): these subsets are taken before 'weekday' is added below,
# so they do not contain that column - confirm this is intended.
train_sta_wea_pre_1 = train_sta_wea[train_sta_wea['predict'] == 1]
train_sta_wea_pre_0 = train_sta_wea[train_sta_wea['predict'] == 0]

# Add the day of the week.
train_sta_wea['date'] = pd.to_datetime(train_sta_wea['date'])
train_sta_wea['weekday'] = train_sta_wea['date'].dt.weekday

In [46]:
# CatBoost version
# Relay-style class: each method hands its result on to the next one
from datetime import date
from datetime import datetime as dt
from dateutil.relativedelta import relativedelta

from catboost import CatBoost
from catboost import Pool

class make_tr_va_te():
    """Build train / validation / test splits by month and fit CatBoost on them.

    Rows with ``predict == 0`` are used for training and validation, rows
    with ``predict == 1`` for testing.  The validation window is the one
    month starting at ``train_end_next_date``; training rows are everything
    before it; the test window is the month right after the validation one.
    """

    # Columns dropped from every frame before it is passed to the model.
    _NON_FEATURE_COLUMNS = ['id', 'predict', 'bikes_available', 'city', 'date']

    def __init__(self, df, train_end_next_date):
        """Store the full frame and the first day of the validation month."""
        self.df = df
        self.train_end_next_date = train_end_next_date

    def make_train_data(self):
        """Return labelled rows (predict == 0) dated before the validation month."""
        train_all = self.df[self.df['predict'] == 0]
        train = train_all[train_all['date'] < self.train_end_next_date]
        return train[train['bikes_available'].notna()]

    def make_valid_data(self):
        """Return labelled rows (predict == 0) inside the one-month validation window."""
        valid_all = self.df[self.df['predict'] == 0]
        window_end = self.train_end_next_date + relativedelta(months = 1)
        in_window = (self.train_end_next_date <= valid_all['date']) & (valid_all['date'] < window_end)
        valid = valid_all[in_window]
        return valid[valid['bikes_available'].notna()]

    def make_test_data(self):
        """Return rows to predict (predict == 1) in the month after the validation window."""
        test_all = self.df[self.df['predict'] == 1]
        window_start = self.train_end_next_date + relativedelta(months = 1)
        window_end = self.train_end_next_date + relativedelta(months = 2)
        in_window = (window_start <= test_all['date']) & (test_all['date'] < window_end)
        return test_all[in_window]

    def model_for_data(self, train, valid):
        """Split train / valid frames into features and the 'bikes_available' target.

        Returns:
            tuple: (tr_X, tr_y, va_X, va_y)
        """
        tr_X = train.drop(self._NON_FEATURE_COLUMNS, axis=1)
        tr_y = train['bikes_available']
        va_X = valid.drop(self._NON_FEATURE_COLUMNS, axis=1)
        va_y = valid['bikes_available']
        return tr_X, tr_y, va_X, va_y

    def predict_for_data(self, test):
        """Return the feature columns of the test frame."""
        return test.drop(self._NON_FEATURE_COLUMNS, axis=1)

    def make_fit_model(self, tr_X, tr_y, va_X, va_y ):
        """Fit a CatBoost RMSE model with early stopping and return it.

        The evaluation sets are passed as [train, valid], so the validation
        metrics are reported under the second entry ('validation_1').
        """
        ptrain = Pool(tr_X, label=tr_y)
        pvalid = Pool(va_X, label=va_y)

        # Define the CatBoost parameters up front.
        cat_params = {
            # number of boosting rounds
            'num_boost_round' : 1000,
            # objective function
            'objective': 'RMSE',
            # metric evaluated during training
            'eval_metric': 'RMSE',
            # same meaning as learning_rate
            'eta': 0.1,
            # maximum tree depth
            'max_depth': 10,
            # same meaning as random_state
            'random_seed': 0}

        model = CatBoost(cat_params)
        model.fit(
            ptrain,
            # number of rounds for early stopping
            early_stopping_rounds=20,
            # evaluation data: training pool first, validation pool second
            eval_set=[ptrain, pvalid],
            verbose_eval = 100
            )

        return model

    def model_and_valid_score(self):
        """Fit a model on this instance's splits and return it with its validation RMSE.

        Returns:
            tuple: (model, valid_best_score)
        """
        train = self.make_train_data()
        valid = self.make_valid_data()
        tr_X, tr_y, va_X, va_y = self.model_for_data(train, valid)
        model = self.make_fit_model(tr_X, tr_y, va_X, va_y)
        # 'validation_1' is the validation pool (second eval_set entry).
        valid_best_score = model.get_best_score()['validation_1']['RMSE']
        return model, valid_best_score

    def predict(self, model_file_name):
        """Train, pickle the model under ``model_path`` and predict the test month.

        Args:
            model_file_name: file name (inside ``model_path``) for the pickled model.

        Returns:
            tuple: (sub_df, valid_best_score) where sub_df has one row per
            test record holding its id and the predicted value.
        """
        import pickle
        test = self.make_test_data()
        te_X = self.predict_for_data(test)
        ptest = Pool(te_X)
        model, valid_best_score = self.model_and_valid_score()
        # Use a context manager so the file handle is closed after dumping.
        with open(model_path + model_file_name, 'wb') as model_file:
            pickle.dump(model, model_file)
        y_pred = model.predict(ptest)
        sub_index = test['id']
        sub_df = pd.DataFrame(list(zip(sub_index, y_pred)))
        print('*****')
        print(valid_best_score)
        return sub_df,valid_best_score
    


In [47]:
def month_range(start, stop, step = relativedelta(months = 1)):
    """Yield start, start + step, ... for as long as the value is before stop.

    Args:
        start: first value to yield.
        stop: exclusive upper bound.
        step: increment added after each yielded value (one month by default).
    """
    cursor = start
    while True:
        if not (cursor < stop):
            return
        yield cursor
        cursor += step

In [48]:
from datetime import date
from datetime import datetime as dt
from dateutil.relativedelta import relativedelta
# Walk forward one month at a time and collect each fold's test predictions
# and validation score.
sub_df_parts = []
valid_score_list = []
# Every fold is saved under the first remaining model name, so a later fold
# overwrites the file written by the previous one.
model_name = model_name_list[0]
# train_end_next_date must be the first day of the validation month.
for d in month_range(dt(2014,8,1), dt(2015,8,1)):
    make_data = make_tr_va_te(train_sta_wea, d)
    # predict() builds the splits, fits and saves the model itself; fitting
    # here as well would train every fold twice.
    sub_df, valid_best_score = make_data.predict(model_name)
    sub_df_parts.append(sub_df)
    valid_score_list.append(valid_best_score)
# Concatenate once instead of growing the frame inside the loop.
sub_df_all = pd.concat(sub_df_parts) if sub_df_parts else pd.DataFrame()
cv_score = sum(valid_score_list)/len(valid_score_list)
print('***')
print('***')
print(f'CV score is {cv_score}')
print(f'model_name is {model_name}')
# Consume the name that was just used so a re-run picks the next one.
model_name_list.pop(0)

0:	learn: 3.6957461	test: 3.6957461	test1: 3.8974706	best: 3.8974706 (0)	total: 93.9ms	remaining: 1m 33s
1:	learn: 3.6543711	test: 3.6543711	test1: 3.8553479	best: 3.8553479 (1)	total: 177ms	remaining: 1m 28s
2:	learn: 3.6152254	test: 3.6152254	test1: 3.8160633	best: 3.8160633 (2)	total: 257ms	remaining: 1m 25s
3:	learn: 3.5826030	test: 3.5826030	test1: 3.7818416	best: 3.7818416 (3)	total: 339ms	remaining: 1m 24s
4:	learn: 3.5534975	test: 3.5534975	test1: 3.7508701	best: 3.7508701 (4)	total: 424ms	remaining: 1m 24s
5:	learn: 3.5215916	test: 3.5215916	test1: 3.7266194	best: 3.7266194 (5)	total: 507ms	remaining: 1m 23s
6:	learn: 3.4892089	test: 3.4892089	test1: 3.6962400	best: 3.6962400 (6)	total: 595ms	remaining: 1m 24s
7:	learn: 3.4666576	test: 3.4666576	test1: 3.6808297	best: 3.6808297 (7)	total: 675ms	remaining: 1m 23s
8:	learn: 3.4458956	test: 3.4458956	test1: 3.6600125	best: 3.6600125 (8)	total: 754ms	remaining: 1m 23s
9:	learn: 3.4263381	test: 3.4263381	test1: 3.6395712	best: 3.63

'model_1_No_3_my.sav'

In [51]:
# Report the cross-validation score together with the model file it belongs to.
print('***', '***', sep='\n')
print(f'CV score is {cv_score}')
print(f'model_name is {model_name}')

***
***
CV score is 3.4664585436021453
model_name is model_1_No_3_my.sav


In [50]:
# submit_file_name = model_submit_dict[model_name]
# sub_df_all.to_csv(output_path+submit_file_name, index=False, header=False)
#print(submit_file_name)

submission_1_No_3_my.csv
