https://mp.weixin.qq.com/s/0qOGevwTltu2SJ-QanW0VA

# 工具包导入&数据读取
## 工具包导入

In [1]:
import numpy as np
import pandas as pd
import os
from datetime import date, timedelta
from time import time
from tqdm import tqdm_notebook, tqdm
from sklearn.metrics import f1_score
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import StratifiedKFold, KFold, RepeatedKFold
from sklearn.svm import NuSVR, SVR
from sklearn.linear_model import LinearRegression
from sklearn.kernel_ridge import KernelRidge
from sklearn.metrics import mean_absolute_error

import lightgbm as lgb
import xgboost as xgb
from catboost import CatBoostRegressor

import gc

from scipy.signal import hilbert
from scipy.signal import hann
from scipy.signal import convolve
from scipy import stats

from dateutil.parser import parse

import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

import warnings
warnings.filterwarnings("ignore")

from itertools import combinations, product

import ast

## 数据读取

### air_reserve
- air_store_id - the restaurant's id in the air system
- visit_datetime - the time of the reservation
- reserve_datetime - the time the reservation was made
- reserve_visitors - the number of visitors for that reservation

### hpg_reserve
- hpg_store_id - the restaurant's id in the hpg system
- visit_datetime - the time of the reservation
- reserve_datetime - the time the reservation was made
- reserve_visitors - the number of visitors for that reservation

### air_store_info
- air_store_id
- air_genre_name
- air_area_name
- latitude
- longitude

### hpg_store_info
- hpg_store_id
- hpg_genre_name
- hpg_area_name
- latitude
- longitude

### date_info
- calendar_date
- day_of_week
- holiday_flg - is the day a holiday in Japan

### store_id_relation
This file allows you to join select restaurants that have both the air and hpg system.
- hpg_store_id
- air_store_id

### air_visit_data
- air_store_id
- visit_date - the date
- visitors - the number of visitors to the restaurant on the date

### sample_submission
This file shows a submission in the correct format, including the days for which you must forecast.

- id - the id is formed by concatenating the air_store_id and visit_date with an underscore
- visitors - the number of visitors forecasted for the store and date combination

In [2]:
path = '../input/'

air_visit_data      =  pd.read_csv(path + 'air_visit_data.csv') 

hpg_store_info      =  pd.read_csv(path + 'hpg_store_info.csv')
air_store_info      =  pd.read_csv(path + 'air_store_info.csv')

hpg_reserve         =  pd.read_csv(path + 'hpg_reserve.csv')
air_reserve         =  pd.read_csv(path + 'air_reserve.csv') 

date_info           =  pd.read_csv(path + 'date_info.csv')  

store_id_relation   =  pd.read_csv(path + 'store_id_relation.csv')     

sample_submission   =  pd.read_csv(path + 'sample_submission.csv')

**air_visit_data.csv**

- air_store_id
- visit_date - the date
- visitors - the number of visitors to the restaurant on the date

In [3]:
air_visit_data.head()

Unnamed: 0,air_store_id,visit_date,visitors
0,air_ba937bf13d40fb24,2016-01-13,25
1,air_ba937bf13d40fb24,2016-01-14,32
2,air_ba937bf13d40fb24,2016-01-15,29
3,air_ba937bf13d40fb24,2016-01-16,22
4,air_ba937bf13d40fb24,2016-01-18,6


**hpg_store_info.csv**

- hpg_store_id
- hpg_genre_name
- hpg_area_name
- latitude
- longitude

Note: latitude and longitude are the latitude and longitude of the area to which the store belongs

In [4]:
hpg_store_info.head()

Unnamed: 0,hpg_store_id,hpg_genre_name,hpg_area_name,latitude,longitude
0,hpg_6622b62385aec8bf,Japanese style,Tōkyō-to Setagaya-ku Taishidō,35.643675,139.668221
1,hpg_e9e068dd49c5fa00,Japanese style,Tōkyō-to Setagaya-ku Taishidō,35.643675,139.668221
2,hpg_2976f7acb4b3a3bc,Japanese style,Tōkyō-to Setagaya-ku Taishidō,35.643675,139.668221
3,hpg_e51a522e098f024c,Japanese style,Tōkyō-to Setagaya-ku Taishidō,35.643675,139.668221
4,hpg_e3d0e1519894f275,Japanese style,Tōkyō-to Setagaya-ku Taishidō,35.643675,139.668221


**air_store_info.csv**

- air_store_id
- air_genre_name
- air_area_name
- latitude
- longitude

Note: latitude and longitude are the latitude and longitude of the area to which the store belongs

In [5]:
air_store_info.head()

Unnamed: 0,air_store_id,air_genre_name,air_area_name,latitude,longitude
0,air_0f0cdeee6c9bf3d7,Italian/French,Hyōgo-ken Kōbe-shi Kumoidōri,34.695124,135.197852
1,air_7cc17a324ae5c7dc,Italian/French,Hyōgo-ken Kōbe-shi Kumoidōri,34.695124,135.197852
2,air_fee8dcf4d619598e,Italian/French,Hyōgo-ken Kōbe-shi Kumoidōri,34.695124,135.197852
3,air_a17f0778617c76e2,Italian/French,Hyōgo-ken Kōbe-shi Kumoidōri,34.695124,135.197852
4,air_83db5aff8f50478e,Italian/French,Tōkyō-to Minato-ku Shibakōen,35.658068,139.751599


**hpg_reserve.csv**

This file contains reservations made in the hpg system.

- hpg_store_id - the restaurant's id in the hpg system
- visit_datetime - the time of the reservation
- reserve_datetime - the time the reservation was made
- reserve_visitors - the number of visitors for that reservation

In [6]:
hpg_reserve.head()

Unnamed: 0,hpg_store_id,visit_datetime,reserve_datetime,reserve_visitors
0,hpg_c63f6f42e088e50f,2016-01-01 11:00:00,2016-01-01 09:00:00,1
1,hpg_dac72789163a3f47,2016-01-01 13:00:00,2016-01-01 06:00:00,3
2,hpg_c8e24dcf51ca1eb5,2016-01-01 16:00:00,2016-01-01 14:00:00,2
3,hpg_24bb207e5fd49d4a,2016-01-01 17:00:00,2016-01-01 11:00:00,5
4,hpg_25291c542ebb3bc2,2016-01-01 17:00:00,2016-01-01 03:00:00,13


**air_reserve.csv**
- air_store_id - the restaurant's id in the air system
- visit_datetime - the time of the reservation
- reserve_datetime - the time the reservation was made
- reserve_visitors - the number of visitors for that reservation

In [7]:
air_reserve.head()

Unnamed: 0,air_store_id,visit_datetime,reserve_datetime,reserve_visitors
0,air_877f79706adbfb06,2016-01-01 19:00:00,2016-01-01 16:00:00,1
1,air_db4b38ebe7a7ceff,2016-01-01 19:00:00,2016-01-01 19:00:00,3
2,air_db4b38ebe7a7ceff,2016-01-01 19:00:00,2016-01-01 19:00:00,6
3,air_877f79706adbfb06,2016-01-01 20:00:00,2016-01-01 16:00:00,2
4,air_db80363d35f10926,2016-01-01 20:00:00,2016-01-01 01:00:00,5


**date_info.csv**
- calendar_date
- day_of_week
- holiday_flg - is the day a holiday in Japan

In [8]:
date_info.head()

Unnamed: 0,calendar_date,day_of_week,holiday_flg
0,2016-01-01,Friday,1
1,2016-01-02,Saturday,1
2,2016-01-03,Sunday,1
3,2016-01-04,Monday,0
4,2016-01-05,Tuesday,0


**store_id_relation.csv**

This file allows you to join select restaurants that have both the air and hpg system.

- hpg_store_id
- air_store_id

In [9]:
store_id_relation.head()

Unnamed: 0,air_store_id,hpg_store_id
0,air_63b13c56b7201bd9,hpg_4bc649e72e2a239a
1,air_a24bf50c3e90d583,hpg_c34b496d0305a809
2,air_c7f78b4f3cba33ff,hpg_cd8ae0d9bbd58ff9
3,air_947eb2cae4f3e8f2,hpg_de24ea49dc25d6b8
4,air_965b2e0cf4119003,hpg_653238a84804d8e7


**sample_submission.csv**

This file shows a submission in the correct format, including the days for which you must forecast.

- id - the id is formed by concatenating the air_store_id and visit_date with an underscore
- visitors - the number of visitors forecasted for the store and date combination

In [10]:
sample_submission.head()

Unnamed: 0,id,visitors
0,air_00a91d42b08b08d9_2017-04-23,0
1,air_00a91d42b08b08d9_2017-04-24,0
2,air_00a91d42b08b08d9_2017-04-25,0
3,air_00a91d42b08b08d9_2017-04-26,0
4,air_00a91d42b08b08d9_2017-04-27,0


## 数据合并

In [11]:
sample_submission['air_store_id'] = sample_submission['id'].apply(lambda x: x[:-11]).values
sample_submission['visit_date'] = sample_submission['id'].apply(lambda x: x[-10:]).values

del sample_submission['id']
gc.collect()

60

In [12]:
df_meta = pd.concat([air_visit_data, sample_submission],
                    axis=0,
                    ignore_index=True)

# 合并date_info
df_meta = df_meta.merge(date_info,
                        left_on=['visit_date'],
                        right_on=['calendar_date'],
                        how='left')
# 合并air_store_info
df_meta = df_meta.merge(air_store_info,
                        on=['air_store_id'],
                        how='left')

del df_meta['calendar_date'], df_meta['day_of_week']
gc.collect()

0

In [13]:
df_meta['visit_date'] = pd.to_datetime(df_meta['visit_date'])
# Monday=0, Sunday=6
df_meta['weekday'] = df_meta['visit_date'].dt.weekday

### 类别层次特征

In [14]:
df_meta['air_store_id_weekday'] = df_meta['air_store_id'].astype(str) + '_' + df_meta['weekday'].astype(str)

df_meta['air_store_id_holiday'] = df_meta['air_store_id'].astype(str) + '_' + df_meta['holiday_flg'].astype(str)
df_meta['air_store_id_weekday_holiday'] = df_meta['air_store_id_holiday'].astype(str) + '_' + df_meta['weekday'].astype(str)

df_meta['air_genre_name_weekday'] = df_meta['air_area_name'].astype(str) + df_meta['weekday'].astype(str)
df_meta['air_genre_name_weekday_holiday'] = df_meta['air_genre_name_weekday'].astype(str) + df_meta['holiday_flg'].astype(str)

In [15]:
df_meta['visitors'] = df_meta['visitors'].apply(np.log1p).values

df_meta.head()

Unnamed: 0,air_store_id,visit_date,visitors,holiday_flg,air_genre_name,air_area_name,latitude,longitude,weekday,air_store_id_weekday,air_store_id_holiday,air_store_id_weekday_holiday,air_genre_name_weekday,air_genre_name_weekday_holiday
0,air_ba937bf13d40fb24,2016-01-13,3.258097,0,Dining bar,Tōkyō-to Minato-ku Shibakōen,35.658068,139.751599,2,air_ba937bf13d40fb24_2,air_ba937bf13d40fb24_0,air_ba937bf13d40fb24_0_2,Tōkyō-to Minato-ku Shibakōen2,Tōkyō-to Minato-ku Shibakōen20
1,air_ba937bf13d40fb24,2016-01-14,3.496508,0,Dining bar,Tōkyō-to Minato-ku Shibakōen,35.658068,139.751599,3,air_ba937bf13d40fb24_3,air_ba937bf13d40fb24_0,air_ba937bf13d40fb24_0_3,Tōkyō-to Minato-ku Shibakōen3,Tōkyō-to Minato-ku Shibakōen30
2,air_ba937bf13d40fb24,2016-01-15,3.401197,0,Dining bar,Tōkyō-to Minato-ku Shibakōen,35.658068,139.751599,4,air_ba937bf13d40fb24_4,air_ba937bf13d40fb24_0,air_ba937bf13d40fb24_0_4,Tōkyō-to Minato-ku Shibakōen4,Tōkyō-to Minato-ku Shibakōen40
3,air_ba937bf13d40fb24,2016-01-16,3.135494,0,Dining bar,Tōkyō-to Minato-ku Shibakōen,35.658068,139.751599,5,air_ba937bf13d40fb24_5,air_ba937bf13d40fb24_0,air_ba937bf13d40fb24_0_5,Tōkyō-to Minato-ku Shibakōen5,Tōkyō-to Minato-ku Shibakōen50
4,air_ba937bf13d40fb24,2016-01-18,1.94591,0,Dining bar,Tōkyō-to Minato-ku Shibakōen,35.658068,139.751599,0,air_ba937bf13d40fb24_0,air_ba937bf13d40fb24_0,air_ba937bf13d40fb24_0_0,Tōkyō-to Minato-ku Shibakōen0,Tōkyō-to Minato-ku Shibakōen00


## 预定表，包括air_reserve和hpg_reserve

In [16]:
# air系统
air_reserve['visit_date'] = air_reserve['visit_datetime'].apply(lambda x: x.split(' ')[0]).values
air_reserve['air_store_id_visit_date'] = air_reserve['air_store_id'] + '_' + air_reserve['visit_datetime'].astype(str)

air_reserve['visit_datetime'] = pd.to_datetime(air_reserve['visit_datetime'])
air_reserve['reserve_datetime'] = pd.to_datetime(air_reserve['reserve_datetime'])

# hpg系统
hpg_reserve['visit_date'] = hpg_reserve['visit_datetime'].apply(lambda x: x.split(' ')[0]).values


# hpg系统关联得到air_store_id
hpg_to_air_storeid = store_id_relation.set_index('hpg_store_id')
hpg_reserve['air_store_id'] = hpg_reserve['hpg_store_id'].map(hpg_to_air_storeid['air_store_id']).values
hpg_air_reserve = hpg_reserve.loc[hpg_reserve.air_store_id.notnull()]

hpg_air_reserve['air_store_id_visit_date'] = hpg_air_reserve['air_store_id'] + '_' + hpg_air_reserve['visit_date'].astype(str)
hpg_air_reserve['visit_datetime'] = pd.to_datetime(hpg_air_reserve['visit_datetime'])
hpg_air_reserve['reserve_datetime'] = pd.to_datetime(hpg_air_reserve['reserve_datetime'])

# 特征工程
## 一阶特征
### store_id的特征

1. store_id的基础信息，包括类型，地址，经纬度
2. store_id的营业天数，开业到现在的时间，最后一天到现在的时间

In [17]:
def get_storeid_fea(df_visit_data, df_store_info, df_store_id_relation):
    df_storeid = df_store_info.copy()
    
    # 对air_genre_name和air_area_name做LabelEncoder
    df_storeid['air_genre_name'] = LabelEncoder().fit_transform(df_storeid['air_genre_name'].values)
    df_storeid['air_area_name'] = LabelEncoder().fit_transform(df_storeid['air_area_name'].values)
    
    # 每个air_store_id日期的最小值和最大值
    first_day_dict = df_visit_data.groupby(['air_store_id'])['visit_date'].min().to_dict()
    last_day_dict = df_visit_data.groupby(['air_store_id'])['visit_date'].max().to_dict()
    
    # 每个air_store_id的营业天数
    run_days_dict = df_visit_data.groupby(['air_store_id'])['visit_date'].nunique().to_dict()
    
    # 每个air_store_id (2017-04-22 - first_day)相差的天数
    df_storeid['first_day_to_now'] = df_storeid['air_store_id'].map(first_day_dict).values
    df_storeid['first_day_to_now'] = df_storeid['first_day_to_now'].apply(lambda x: (parse('2017-04-22') - parse(x)).days)
    
    # 每个air_store_id (2017-04-22 - last_day)相差的天数
    df_storeid['last_day_to_now'] = df_storeid['air_store_id'].map(last_day_dict).values
    df_storeid['last_day_to_now'] = df_storeid['last_day_to_now'].apply(lambda x: (parse('2017-04-22') - parse(x)).days)
    
    # 每个air_store_id的营业天数
    df_storeid['run_days'] = df_storeid['air_store_id'].map(run_days_dict).values
    
    # air_store_id是否在store_id_relation里面
    df_storeid['two_systems'] = df_storeid['air_store_id'].isin(df_store_id_relation['air_store_id']).values
    df_storeid['two_systems'] = df_storeid['two_systems'].astype(int)
    
    return df_storeid

In [18]:
df_storeid = get_storeid_fea(air_visit_data, air_store_info, store_id_relation)

### date的特征
1. 星期几
2. 是否节假日
3. 距离月末，月初的时间
4. 距离最近的节假日的时间

In [19]:
def get_last_next_holiday(dates, holiday_flags):
    last_holiday = []
    next_holiday = []
    len_ = len(dates)
    for i, (d, f) in enumerate(zip(dates, holiday_flags)):
        if i == 0:
            last_holiday.append(dates[i])
        else:
            # last holiday
            last_flag = 0
            for j in range(i - 1, 0, -1):
                if holiday_flags[j] == 1:
                    last_holiday.append(dates[j])
                    last_flag = 1
                    break
            if last_flag == 0:
                last_holiday.append(dates[0])
                
        # next holiday
        next_flag = 0
        for j in range(i + 1, len_):
            if holiday_flags[j] == 1:
                next_holiday.append(dates[j])
                next_flag = 1
                break
        if next_flag == 0:
            next_holiday.append(dates[-1])
    return last_holiday, next_holiday

In [20]:
def get_days_to_month_end(x):
    """
    计算每个日期距离月底的天数
    """
    month = int(x.split('-')[1])
    day = int(x.split('-')[2])
    if month in [1, 3, 5, 7, 8, 10, 12]:
        return 31 - day
    elif month in [2]:
        return 28 - day
    else:
        return 30 - day

In [21]:
def get_time_fea(df_date_info):
    """
    date特征
    1. 每个日期距月底、月初的天数
    2. 每个日期是周几，是否是周末
    3. 每个日期距离上一个和下一个节假日的天数
    4. 每个日期的前几天、后几天是否是节假日
    """
    df_date = df_date_info.copy()
    
    # 每个日期距离月底的天数
    df_date['day_to_month_end'] = df_date['calendar_date'].apply(lambda x: get_days_to_month_end(x)).values
    # 每个日期距离月初的天数
    # df_date['day_to_month_start'] = df_date['calendar_date'].apply(lambda x: int(x.split('-')[2])).values
    df_date['day_to_month_start'] = df_date['calendar_date'].apply(lambda x: int(x.split('-')[0])).values
    
    # 每个日期是周几
    df_date['day_of_week'] = df_date['calendar_date'].apply(lambda x: parse(x).weekday()).values
    # 每个日期是否是周末
    df_date['is_weekend'] = df_date['day_of_week'].apply(lambda x: x > 4).values
    
    # 每个日期的上一个和下一个节假日日期
    last_holiday, next_holiday = get_last_next_holiday(df_date['calendar_date'].values, df_date['holiday_flg'].values)
    df_date['last_holiday'] = last_holiday
    df_date['next_holiday'] = next_holiday

    # 每个日期距离上一个节假日的天数
    df_date['day_to_last_holiday'] = (df_date['calendar_date'].apply(lambda x: parse(x))
                                      - df_date['last_holiday'].apply(lambda x: parse(x)))
    df_date['day_to_last_holiday'] = df_date['day_to_last_holiday'].apply(lambda x: x.days).values
    
    # 每个日期距离下一个节假日的天数
    df_date['day_to_next_holiday'] = (df_date['next_holiday'].apply(lambda x: parse(x))
                                      - df_date['calendar_date'].apply(lambda x: parse(x)))
    df_date['day_to_next_holiday'] = df_date['day_to_next_holiday'].apply(lambda x: x.days).values
    
    df_date.rename(columns={'calendar_date': 'visit_date'}, inplace=True)
    
    df_date['visit_date'] = pd.to_datetime(df_date['visit_date'])
    
    # 每个日期的前几天，后几天是否是节假日
    for i in [3, 2, 1, -1]:
        date_info_temp = df_date.copy()
        # visit_date = visit_date + i
        date_info_temp['visit_date'] = date_info_temp['visit_date'].apply(lambda x: x + timedelta(i))
        date_info_temp.rename(columns={'holiday_flg': 'ahead_holiday_{}'.format(i)},
                              inplace=True)
        
        df_date = df_date.merge(date_info_temp[['visit_date', 'ahead_holiday_{}'.format(i)]],
                                on=['visit_date'],
                                how='left')
    del df_date['last_holiday'], df_date['next_holiday']
    gc.collect()
    return df_date

In [22]:
df_date = get_time_fea(date_info)

## 二阶特征：store_id & day

因为day这个东西只有一个,而且未来和之前的是不一致的,那么做特征的时候就需要找day相关的属性,同样的store_id的属性也可以构建特征

### store_id+day(related特征/holiday/weekday)

In [23]:
def get_st_date(en_date, days):
    """
    对每个日期减几天
    """
    # st_date <= date < end_date
    st_date = en_date - timedelta(days)
    return st_date

def get_label_st_date(st_date, days):
    """
    对每个日期加几天
    """
    # st_date <= date < end_date
    end_date = st_date + timedelta(days)
    return st_date, end_date

def get_df_label(df, date, days=39, is_te=False):
    """
    时间窗口，窗口在date和date+days之间
    """
    st_date, en_date = get_label_st_date(date, days)  # 对date加days，返回st_date和en_date
    print('label date, ', st_date, en_date)
    ind1 = df['visit_date'] < en_date
    ind2 = df['visit_date'] >= st_date
    if is_te:
        return df.loc[(ind2), ['air_store_id', 'visit_date', 'air_store_id_holiday',
                               'air_genre_name', 'air_genre_name_weekday', 'air_genre_name_weekday_holiday']].copy()
    return df.loc[(ind1 & ind2), ['air_store_id', 'visit_date', 'air_store_id_holiday',
                                  'air_genre_name', 'air_genre_name_weekday', 'air_genre_name_weekday_holiday',
                                  'visitors']].copy()

### 特征工程1：visitors相关特征
#### air_store_id+visitors特征（recent days）

最近$N(N=14,28,56,1000)$天air_store_id的统计特征

In [24]:
def get_store_id_visitors_sts_features(df, en_date, key='air_store_id', date_col='visit_date', days_list=[1000, 56, 28, 14,]):
    """
    时间滑窗
    计算每个窗口内，每个店铺的visitors的统计量
    """
    t0 = time()
    df_features       = pd.DataFrame()
    df_features[key]  = df[key].unique()

    df_tmp            = df.copy()
    #######################  Recent days sts features #######################
    for days in days_list:
        # 时间窗口
        st_date           = get_st_date(en_date, days)  # 对日期减几天 
        df_tmp            = df_tmp.loc[((df[date_col] >= st_date) &  (df[date_col] < en_date))].copy() 
        df_features_tmp   = df_tmp.groupby([key], as_index=False)['visitors'].agg({
            '{}_visitors_min_{}'.format(key,days): 'min',
            '{}_visitors_mean_{}'.format(key,days): 'mean',
            '{}_visitors_sum_{}'.format(key,days): 'sum',
            '{}_visitors_median_{}'.format(key,days): 'median',
            '{}_visitors_max_{}'.format(key,days): 'max',
            '{}_visitors_count_{}'.format(key,days): 'count',
            '{}_visitors_std_{}'.format(key,days): 'std',
            '{}_visitors_quantile_{}'.format(key,days): 'quantile',
            '{}_visitors_skew_{}'.format(key,days): 'skew'
        }) 
        df_features  = df_features.merge(df_features_tmp, on =key, how = 'left')
    print('For store id sts features, we spend {} seconds.'.format(time() - t0)) 

    return df_features

#### air_store_id + visitors diff特征(recent days)
最近N(N=28,56,1000)天的趋势统计特征

In [25]:
def get_store_id_visitors_diff_sts_features(df, en_date, key='air_store_id', date_col='visit_date', days_list=[1000, 56, 28]):
    """
    visitors_diff的统计量
    1. 时间窗口
    2. 对每个日期减1，在用日期去关联，相当于得到昨天的visitors
    3. visitors_diff = 当天的vistors - 昨天的visitors
    4. visitors_diff的统计量
    5. visitors_diff的绝对值
    6. visitors_diff的绝对值的平均值
    """
    t0 = time()
    df_features = pd.DataFrame()
    df_features[key] = df[key].unique()
    
    for days in days_list:
        df_tmp = df.copy()
        st_date = get_st_date(en_date, days)  # 对日期减几天
        # 时间窗口
        df_tmp = df_tmp.loc[((df[date_col] >= st_date) & (df[date_col] < en_date))].copy()
        df_old_tmp = df_tmp.copy()
        
        # 每个air_store_id昨天的visitors
        df_old_tmp[date_col] = df_tmp[date_col].apply(lambda x: x + timedelta(1)).values  # 用昨天的日期去关联
        df_old_tmp.rename(columns={'visitors': 'visitors_yesterday'},
                          inplace=True)
        df_tmp = df_tmp.merge(df_old_tmp[[key, date_col, 'visitors_yesterday']],
                              on=[key, date_col],
                              how='left')
        
        # 当天的visitors - 昨天的visitors
        df_tmp['visitors_diff'] = df_tmp['visitors'].values - df_tmp['visitors_yesterday'].values
        
        df_tmp = df_tmp.loc[df_tmp['visitors_diff'].notnull()]
        
        df_features_tmp = df_tmp.groupby([key], as_index=False)['visitors_diff'].agg({
            '{}_visitors_diff_min_{}'.format(key, days): 'min',
            '{}_visitors_diff_mean_{}'.format(key, days): 'mean',
            '{}_visitors_diff_median_{}'.format(key, days): 'median',
            '{}_visitors_diff_sum_{}'.format(key, days): 'sum',
            '{}_visitors_diff_max_{}'.format(key, days): 'max',
            '{}_visitors_diff_count_{}'.format(key, days): 'count',
            '{}_visitors_diff_std_{}'.format(key, days): 'std',
            '{}_visitors_diff_quantile_{}'.format(key, days): 'quantile',
            '{}_visitors_diff_skew_{}'.format(key, days): 'skew',
        })
        
        df_tmp['visitors_diff_abs'] = df_tmp['visitors_diff'].map(abs).values
        
        # 绝对值的平均值
        df_features_tmp2 = df_tmp.groupby([key], as_index=False)['visitors_diff_abs'].agg({
            '{}_visitors_diff_abs_mean_{}'.format(key, days): 'mean'
        })
        
        df_features = df_features.merge(df_features_tmp, on=key, how='left')
        df_features = df_features.merge(df_features_tmp2, on=key, how='left')
    print('For store id diff sts features, we spend {} seconds.'.format(time() - t0))
    
    return df_features

#### air_store_id + weekday + visitors特征(周期特征)
过去每段时间的工作日的的统计特征

In [26]:
def get_weekday_visitors_sts_features(df, key, end_date, date_col='visit_date', val_name='visitor'):
    """
    时间滑窗，groupby对象不同，这里用的是key=air_store_id_weekday
    1. 时间滑窗，前4、8、12、100周
    2. 时间窗内的visitor的统计量
    3. 最大值-最小值
    """
    df_features = pd.DataFrame()
    df_features[key] = df[key].unique()
    
    # 日期小于end_date
    ind1 = df[date_col] < end_date
    
    # 时间滑窗，日期前4周，8周，12周，100周
    for i in [4, 8, 12, 100]:
        t_st_date = end_date - timedelta(7 * i)
        ind2 = df[date_col] >= t_st_date
        df_grp = df.loc[(ind1 & ind2)].groupby(key)
        dic_mean = df_grp[val_name].mean().to_dict()
        dic_std = df_grp[val_name].std().to_dict()
        dic_max = df_grp[val_name].max().to_dict()
        dic_min = df_grp[val_name].min().to_dict()
        dic_count = df_grp[val_name].count().to_dict()
        dic_sum = df_grp[val_name].sum().to_dict()
        dic_quantile = df_grp[val_name].quantile().to_dict()
        
        df_features['{}_weekdays_mean_{}_{}'.format(key, i + 1, val_name)] = df_features[key].map(dic_mean).values
        df_features['{}_weekdays_std_{}_{}'.format(key, i + 1, val_name)] = df_features[key].map(dic_std).values
        df_features['{}_weekdays_max_{}_{}'.format(key, i + 1, val_name)] = df_features[key].map(dic_max).values
        df_features['{}_weekdays_min_{}_{}'.format(key, i + 1, val_name)] = df_features[key].map(dic_min).values
        df_features['{}_weekdays_count_{}_{}'.format(key, i + 1, val_name)] = df_features[key].map(dic_count).values
        df_features['{}_weekdays_sum_{}_{}'.format(key, i + 1, val_name)] = df_features[key].map(dic_sum).values
        df_features['{}_weekdays_quantile_{}_{}'.format(key, i + 1, val_name)] = df_features[key].map(dic_quantile).values
        
        # 最大值 - 最小值
        df_features['{}_weekdays_gap_{}_{}'.format(key, i + 1, val_name)] = (df_features['{}_weekdays_max_{}_{}'.format(key, i + 1, val_name)] -
                                                                             df_features['{}_weekdays_min_{}_{}'.format(key, i + 1, val_name)])
        
        return df_features

#### air_store_id + weekday + visitors diff特征(周期特征)
过去每段时间的工作日的差值统计特征

In [27]:
def get_weekday_visitors_diff_sts_features(df, key, end_date, date_col='visit_date', val_name='visitors'): 
    """
    时间滑窗，groupby对象不同，这里用的是key=air_store_id_weekday
    1. 时间滑窗，前4周、8周、12周、100周
    2. 前1周那天的visitors_diff
    3. visitors_diff的统计量
    """
    df_features = pd.DataFrame() 
    df_features[key] = df[key].unique()

    ind1 = df[date_col] < end_date  
    # 时间滑窗，前4周，8周，12周，100周
    for i in [4, 8, 12, 100]:
        t_st_date = end_date - timedelta(7 * i) 
        ind2 = df[date_col] >= t_st_date 

        df_tmp = df.loc[ind1 & ind2].copy() 

        df_old_tmp = df_tmp.copy()
        # 每个air_store_id 前1周的那天的visitors
        df_old_tmp[date_col] = df_tmp[date_col].apply(lambda x: x + timedelta(7)).values
        df_old_tmp.rename(columns={'visitors': 'visitors_lastweek'},
                          inplace=True)
        df_tmp = df_tmp.merge(df_old_tmp[[key, date_col, 'visitors_lastweek']],
                              on=[key, date_col],
                              how='left')
        # 当天visitors - 前1周的那天的visitors
        df_tmp['visitors_diff'] = df_tmp['visitors'].values - df_tmp['visitors_lastweek'].values
        df_tmp = df_tmp.loc[df_tmp['visitors_diff'].notnull()]

        df_tmp['visitors_diff_abs'] = df_tmp['visitors_diff'].map(abs).values

        df_grp              = df_tmp.groupby(key) 
        dic_mean            = df_grp['visitors_diff'].mean().to_dict()
        dic_std             = df_grp['visitors_diff'].std().to_dict()
        dic_max             = df_grp['visitors_diff'].max().to_dict()
        dic_min             = df_grp['visitors_diff'].min().to_dict()
        dic_count           = df_grp['visitors_diff'].count().to_dict()
        dic_sum             = df_grp['visitors_diff'].sum().to_dict()   
        dic_quantile        = df_grp['visitors_diff'].quantile().to_dict()   
        dic_abs_mean        = df_grp['visitors_diff_abs'].mean().to_dict()   
        
        df_features['{}_weekday_quantile_{}_{}'.format(key, i + 1, 'visitorsdiff')]  = df_features[key].map(dic_quantile).values 
        df_features['{}_weekday_mean_{}_{}'.format(key, i + 1, 'visitorsdiff')]      = df_features[key].map(dic_mean).values 
        df_features['{}_weekday_std_{}_{}'.format(key, i + 1, 'visitorsdiff')]       = df_features[key].map(dic_std).values 
        df_features['{}_weekday_max_{}_{}'.format(key, i + 1, 'visitorsdiff')]       = df_features[key].map(dic_max).values 
        df_features['{}_weekday_min_{}_{}'.format(key, i + 1, 'visitorsdiff')]       = df_features[key].map(dic_min).values 
        df_features['{}_weekday_count_{}_{}'.format(key, i + 1, 'visitorsdiff')]     = df_features[key].map(dic_count).values 
        df_features['{}_weekday_sum_{}_{}'.format(key, i + 1, 'visitorsdiff')]       = df_features[key].map(dic_sum).values  
        df_features['{}_weekday_absmean_{}_{}'.format(key, i + 1, 'visitorsdiff')]   = df_features[key].map(dic_abs_mean).values  

    return df_features

#### air_store_id + visitors exp特征(周期特征)
加权指数衰减统计特征

权重可以表示为店铺的运营的时间信息，所以记录一次即可

In [28]:
def get_store_id_visitors_exp_features(df, en_date, key='air_store_id', date_col='visit_date', days_list=[1000]): #, 56, 28, 14,
    """
    时间窗口，groupby对象不同，这里用的是key=air_store_id
    1. 时间窗口
    2. day_lags = visit_date与en_date相差的天数
    3. day_lags的权重 = weight ** day_lags
    4. visitors_weight = visitors * weight
    5. 对weight(加权日期差之和)求和
    6. 加权平均每天的visitors
    """
    t0 = time()
    df_features       = pd.DataFrame()
    df_features[key]  = df[key].unique()

    df_tmp            = df.copy()
    for weight in [0.9, 0.95, 0.975, 0.985]:
        for days in days_list:
            st_date            = get_st_date(en_date, days) 
            df_tmp             = df_tmp.loc[((df[date_col] >= st_date) & (df[date_col] < en_date))].copy()

            # en_date与visit_date两个日期相差的天数
            df_tmp['day_lags']        = df_tmp['visit_date'].apply(lambda x: (en_date - (x)).days)
            # 相差的天数做一个权重，weight ** day_lags
            df_tmp['weight']          = df_tmp['day_lags'].apply(lambda x: weight ** x)
            # visitors * weight
            df_tmp['visitors_weight'] = df_tmp['visitors'] * df_tmp['weight']
            
            # groupby(air_store_id), 对visitors_weight(加权visitors之和)求和
            result1 = df_tmp.groupby([key], as_index=False)['visitors_weight'].agg({
                '{}_exp_mean{}_w_{}'.format(key, days, weight): 'sum'
            })
            
            # groupby(air_store_id), 对weight(加权日期差之和)求和
            result2 = df_tmp.groupby([key], as_index=False)['weight'].agg({
                '{}_exp_weight_sum{}_w_{}'.format(key, days, weight): 'sum'
            })

            result = result1.merge(result2, on=key, how='left')
            # 加权平均每天的visitors
            result['{}_exp_mean2{}_w_{}'.format(key, days, weight)] = (result['{}_exp_mean{}_w_{}'.format(key, days, weight)]
                                                                       / result['{}_exp_weight_sum{}_w_{}'.format(key, days, weight)])

            if weight == 0.9:
                df_features = df_features.merge(result[[
                    '{}_exp_mean2{}_w_{}'.format(key, days, weight),  # 加权平均每天的visitors
                    '{}_exp_weight_sum{}_w_{}'.format(key, days, weight),  # 对weight(加权日期差之和)求和，只需记录一次
                    key  # air_store_id
                ]], on=key, how='left')
            else:
                df_features = df_features.merge(result[[
                    '{}_exp_mean2{}_w_{}'.format(key, days, weight),  # 加权平均每天的visitors
                    key  # air_store_id
                ]], on=key, how='left')
    print('For store id exp features, we spend {} seconds.'.format(time() - t0)) 

    return df_features

#### air_store_id + weekday + visitors exp特征(7)(周期特征)
加权指数衰减(按周进行衰减)统计特征

权重可以表示为店铺的运营的时间信息(周)，所以记录一次即可

In [29]:
def get_weekday_visitors_exp_sts_features(df, en_date, key, date_col='visit_date', days_list=[1000]): #, 56, 28, 14,
    """
    时间窗口
    1. 时间窗口
    2. day_lags = (visit_date与en_date相差的天数) // 7
    3. weight = weight ** day_lags
    4. visitors_weight = visitors * weight
    5. visitors_weight求和
    6. weight求和
    7. 加权平均每天的visitors
    """
    t0 = time()
    df_features       = pd.DataFrame()
    df_features[key]  = df[key].unique()
    
    df_tmp            = df.copy()
    for weight in [0.95, 0.975, 0.985]:   
        for days in days_list:
            # 时间窗口
            st_date            = get_st_date(en_date, days) 
            df_tmp             = df_tmp.loc[((df[date_col] >= st_date) & (df[date_col] < en_date))].copy()

            # day_lags = (en_date - visit_date) / 7
            df_tmp['day_lags']        = df_tmp['visit_date'].apply(lambda x: (en_date - (x)).days // 7)
            # day_lags加权
            df_tmp['weight']          = df_tmp['day_lags'].apply(lambda x: weight ** x)
            # visitors_weight = visitors * weight
            df_tmp['visitors_weight'] = df_tmp['visitors'] * df_tmp['weight']
            
            # visitors_weight求和
            result1 = df_tmp.groupby([key], as_index=False)['visitors_weight'].agg({
                '{}_exp_mean{}_{}'.format(key, days, weight): 'sum'
            })
            # weight求和
            result2 = df_tmp.groupby([key], as_index=False)['weight'].agg({
                '{}_exp_weight_sum{}_{}'.format(key,days,weight): 'sum'
            })

            result = result1.merge(result2, on=key, how='left')
            
            # 加权平均每天的visitors
            result['{}_exp_mean2{}_{}'.format(key,days,weight)] = (result['{}_exp_mean{}_{}'.format(key,days,weight)]
                                                                   / result['{}_exp_weight_sum{}_{}'.format(key,days,weight)])
            
            df_features = df_features.merge(result, on=key, how='left')
    print('For store id exp features, we spend {} seconds.'.format(time() - t0)) 
    
    return df_features

#### air_store_id + holiday +visitors特征
过去每个节假日的的统计特征

In [30]:
def get_holiday_visitors_sts_features(df, key, end_date, date_col='visit_date', val_name='visitors'): 
    """
    时间窗口，groupby('air_store_id_holiday')
    时间窗口内的统计量
    """
    df_features      = pd.DataFrame() 
    df_features[key] = df[key].unique() 
    ind1    = df[date_col] < end_date  
    for i in [4, 8, 12, 100]: 
        # 时间窗口，前4、8、12、100周
        t_st_date           = end_date - timedelta(7 * i) 
        ind2                = df[date_col]  >= t_st_date 
        df_grp              = df.loc[(ind1 & ind2)].groupby(key) 
        
        # 统计量
        dic_mean            = df_grp[val_name].mean().to_dict()
        dic_std             = df_grp[val_name].std().to_dict()
        dic_max             = df_grp[val_name].max().to_dict()
        dic_min             = df_grp[val_name].min().to_dict()
        dic_count           = df_grp[val_name].count().to_dict()
        dic_sum             = df_grp[val_name].sum().to_dict()   
        df_features['{}_holiday_mean_{}_{}'.format(key, i + 1, val_name)]  = df_features[key].map(dic_mean).values 
        df_features['{}_holiday_std_{}_{}'.format(key, i + 1, val_name)]   = df_features[key].map(dic_std).values 
        df_features['{}_holiday_max_{}_{}'.format(key, i + 1, val_name)]   = df_features[key].map(dic_max).values 
        df_features['{}_holiday_min_{}_{}'.format(key, i + 1, val_name)]   = df_features[key].map(dic_min).values 
        df_features['{}_holiday_count_{}_{}'.format(key, i + 1, val_name)] = df_features[key].map(dic_count).values 
        df_features['{}_holiday_sum_{}_{}'.format(key, i + 1, val_name)]   = df_features[key].map(dic_sum).values  
    return df_features

#### air_store_id + weekday + holiday +visitors特征
过去每个节假日与工作日的交叉统计特征

In [31]:
def get_weekday_holiday_visitors_sts_features(df, key, end_date, date_col='visit_date', val_name='visitors'): 
    """
    时间窗口，groupby(air_store_id_weekday_holiday)
    窗口内的统计量
    """
    df_features      = pd.DataFrame() 
    df_features[key] = df[key].unique() 
    ind1    = df[date_col] < end_date  
    for i in [4, 8, 12, 100]: 
        t_st_date           = end_date - timedelta(7 * i) 
        ind2                = df[date_col] >= t_st_date 
        df_grp              = df.loc[(ind1 & ind2)].groupby(key) 
        dic_mean            = df_grp[val_name].mean().to_dict()
        dic_std             = df_grp[val_name].std().to_dict()
        dic_max             = df_grp[val_name].max().to_dict()
        dic_min             = df_grp[val_name].min().to_dict()
        dic_quantile        = df_grp[val_name].quantile().to_dict()
        # dic_count           = df_grp[val_name].count().to_dict()
        dic_sum             = df_grp[val_name].sum().to_dict()   
        df_features['{}_weekday_holiday_mean_{}_{}'.format(key, i + 1, val_name)]       = df_features[key].map(dic_mean).values 
        df_features['{}_weekday_holiday_std_{}_{}'.format(key, i + 1, val_name)]        = df_features[key].map(dic_std).values 
        df_features['{}_weekday_holiday_max_{}_{}'.format(key, i + 1, val_name)]        = df_features[key].map(dic_max).values 
        df_features['{}_weekday_holiday_min_{}_{}'.format(key, i + 1, val_name)]        = df_features[key].map(dic_min).values 
        # df_features['{}_weekday_holiday_count_{}_{}'.format(key, i + 1, val_name)]    = df_features[key].map(dic_count).values 
        df_features['{}_weekday_holiday_sum_{}_{}'.format(key, i + 1, val_name)]        = df_features[key].map(dic_sum).values  
        df_features['{}_weekday_holiday_quantile_{}_{}'.format(key, i + 1, val_name)]   = df_features[key].map(dic_quantile).values  
    return df_features

#### air_store_id + timediff
过去每个商店关于时间的差值统计特征

In [32]:
def get_store_id_time_diff_sts_features(df, en_date, key='air_store_id', date_col='visit_date', days_list=[1000, 56, 28]): #, 14,
    """
    时间窗口，时间差
    1. 时间窗口
    2. 窗口内visit_date的diff
    3. diff的统计量
    """
    t0 = time()
    df_features       = pd.DataFrame()
    df_features[key]  = df[key].unique()
    df = df.sort_values([key, date_col])
    for days in days_list:    
        df_tmp                = df.copy()       
        # 时间窗口
        st_date               = get_st_date(en_date, days) 
        df_tmp                = df_tmp.loc[((df[date_col] >= st_date) & (df[date_col] < en_date))].copy()
        
        # 根据值排序
        df_tmp                = df_tmp.sort_values([key, date_col])
        # 当前项减前一项
        df_tmp['time_diff']   = df_tmp.groupby(key)[date_col].diff().values
        df_tmp['time_diff']   = df_tmp['time_diff'].apply(lambda x: x.days) 
        
        df_tmp                = df_tmp.loc[df_tmp['time_diff'].notnull()] 
        # 统计量
        df_features_tmp   = df_tmp.groupby([key], as_index=False)['time_diff'].agg({
            '{}_time_diff_min{}'.format(key,days): 'min',
            '{}_time_diff_mean{}'.format(key,days): 'mean',
            '{}_time_diff_median{}'.format(key,days): 'median',
            '{}_time_diff_max{}'.format(key,days): 'max',
            '{}_time_diff_std{}'.format(key,days): 'std',
            '{}_time_diff_quantile{}'.format(key,days): 'quantile',
            '{}_time_diff_skew{}'.format(key,days): 'skew'
        }) 
        df_features  = df_features.merge(df_features_tmp, on=key, how='left')
    print('For store id diff sts features, we spend {} seconds.'.format(time() - t0)) 
    
    return df_features

### 特征工程2（reserve相关特征）
####  air_store_id + visit_date + timediff & reserve_visitors特征
每天的预定的统计信息,包括预定人数以及时间差的统计信息

In [33]:
def get_store_date_reserve_features(df_air_reserves, df_hpg_reserves, en_date, days, key='air_store_id_visit_date'):
    """
    时间窗口，预定信息相关特征，分为air系统和hpg系统
    1. 时间窗口
    2. 预定入店日期与预定日期相差的天数，相差天数的统计量
    3. 预定人数的统计量
    """
    t0 = time()
    st_date, en_date = get_label_st_date(en_date, days)
    
    # 时间窗口
    # st_date <= 预定入店时间 < en_date and 预定时间 < st_date
    df_air_reserve = df_air_reserves.loc[((df_air_reserves['visit_datetime'] < en_date)
                                          & (df_air_reserves['visit_datetime'] >= st_date)
                                          & (df_air_reserves['reserve_datetime'] < st_date))].copy()
    
    df_hpg_reserve = df_hpg_reserves.loc[((df_hpg_reserves['visit_datetime'] < en_date)
                                          & (df_hpg_reserves['visit_datetime'] >= st_date)
                                          & (df_hpg_reserves['reserve_datetime'] < st_date))].copy()
    
    df_features       = pd.DataFrame()
    df_features[key]  = df_air_reserve[key].unique()  
    
    # air系统
    try:  
        # 预定入店日期与预定日期相差的天数
        df_air_reserve['time_diff'] = df_air_reserve['visit_datetime'] - df_air_reserve['reserve_datetime']
        df_air_reserve['time_diff'] = df_air_reserve['time_diff'].apply(lambda x: x.days).values
        
        # time_diff的统计量
        df_air_timediff_fea = df_air_reserve.groupby(key, as_index=False)['time_diff'].agg({
            'store_air_timediff_mean': 'mean',
            'store_air_timediff_median': 'median',
            'store_air_timediff_max': 'max',
            'store_air_timediff_std': 'std'
        }) 
        
        # reserve_visitors的统计量
        df_air_reserve_fea = df_air_reserve.groupby(['air_store_id', 'visit_date'])['reserve_visitors'].agg({
            'store_air_reserve_visitors_sum': 'sum',
            'store_air_reserve_visitors_mean': 'mean',
            # 'store_air_reserve_visitors_std': 'std',
            'store_air_reserve_visitors_max': 'max',
            'store_air_reserve_visitors_count': 'count',
        }) 

        df_air_reserve_fea          = df_air_reserve_fea.unstack().fillna(0).stack().reset_index()    
        df_air_reserve_fea[key]     = df_air_reserve_fea['air_store_id'].astype(str) + '_' + df_air_reserve_fea['visit_date'].astype(str)
        del df_air_reserve_fea['air_store_id']
        del df_air_reserve_fea['visit_date']
        df_features = df_features.merge(df_air_timediff_fea, on=key, how='left')
        df_features = df_features.merge(df_air_reserve_fea,  on=key, how='left')

    except:
        pass
    
    # hpg系统
    try:
        df_hpg_reserve['time_diff'] = df_hpg_reserve['visit_datetime'] - df_hpg_reserve['reserve_datetime']
        df_hpg_reserve['time_diff'] = df_hpg_reserve['time_diff'].apply(lambda x: x.days).values
        
        df_hpg_timediff_fea = df_hpg_reserve.groupby(key, as_index=False)['time_diff'].agg({
            'store_hpg_timediff_mean': 'mean',
            'store_hpg_timediff_median': 'median',
            'store_hpg_timediff_max': 'max',
            'store_hpg_timediff_std': 'std',
        }) 

        df_hpg_reserve_fea = df_hpg_reserve.groupby(['air_store_id', 'visit_date'])['reserve_visitors'].agg({
            'store_hpg_reserve_visitors_sum': 'sum',
            'store_hpg_reserve_visitors_mean': 'mean',
            # 'store_hpg_reserve_visitors_std': 'std',
            'store_hpg_reserve_visitors_max': 'max',
            'store_hpg_reserve_visitors_count': 'count',
        }) 

        df_hpg_reserve_fea = df_hpg_reserve_fea.unstack().fillna(0).stack().reset_index()    
        df_hpg_reserve_fea[key] = df_hpg_reserve_fea['air_store_id'].astype(str) + '_' + df_hpg_reserve_fea['visit_date'].astype(str)
        del df_hpg_reserve_fea['air_store_id']
        del df_hpg_reserve_fea['visit_date']
        df_features = df_features.merge(df_hpg_timediff_fea, on=key, how='left')
        df_features = df_features.merge(df_hpg_reserve_fea, on=key, how='left')
        
    except:
        pass
    
    print('For store id reserve visitors features, we spend {} seconds.'.format(time() - t0)) 
    
    return df_features

#### air_store_id + timediff & reserve_visitors特征
整体的预定统计信息,包括预定人数以及时间差的统计信息

In [34]:
def get_store_reserve_features(df_air_reserves, df_hpg_reserves, en_date, days, key='air_store_id'):
    """
    时间窗口，预定信息相关特征，分为air系统和hpg系统
    与上面不同的是groupby对象不同，这里groupby('air_store_id')，上面groupby('air_store_id_visit_date')
    1. 时间窗口
    2. 预定入店日期与预定日期相差的天数，相差天数的统计量
    3. 预定人数的统计量
    """
    t0 = time()
    
    st_date,en_date = get_label_st_date(en_date, days) 
   
    df_air_reserve  = df_air_reserves.loc[((df_air_reserves['visit_datetime']   < en_date)
                                           & (df_air_reserves['visit_datetime']   >= st_date)
                                           & (df_air_reserves['reserve_datetime'] < st_date))].copy()
    df_hpg_reserve  = df_hpg_reserves.loc[((df_hpg_reserves['visit_datetime']   < en_date)
                                           & (df_hpg_reserves['visit_datetime']   >= st_date)
                                           & (df_hpg_reserves['reserve_datetime'] < st_date))].copy()
    
    
    df_features       = pd.DataFrame()
    df_features[key]  = df_air_reserve[key].unique()
    
    df_air_reserve['time_diff']   = df_air_reserve['visit_datetime'] - df_air_reserve['reserve_datetime']
    df_air_reserve['time_diff']   = df_air_reserve['time_diff'].apply(lambda x: x.days)
    
    df_hpg_reserve['time_diff']   = df_hpg_reserve['visit_datetime'] - df_hpg_reserve['reserve_datetime']
    df_hpg_reserve['time_diff']   = df_hpg_reserve['time_diff'].apply(lambda x: x.days)
    
    try:
        df_air_timediff_fea = df_air_reserve.groupby(key, as_index=False)['time_diff'].agg({
            'store2_air_timediff_mean': 'mean',
            'store2_air_timediff_median': 'median',
            'store2_air_timediff_max': 'max',
            'store2_air_timediff_count': 'count',
            'store2_air_timediff_std': 'std',
        }) 

        df_air_reserve_fea = df_air_reserve.groupby(key, as_index=False)['reserve_visitors'].agg({
            'store2_air_reserve_visitors_sum': 'sum',
            'store2_air_reserve_visitors_median': 'median',
            'store2_air_reserve_visitors_mean': 'mean',
            'store2_air_reserve_visitors_max': 'max',
            'store2_air_reserve_visitors_count': 'count',
            'store2_air_reserve_visitors_std': 'std',
        }) 
        df_features = df_features.merge(df_air_timediff_fea, on=key, how='left')
        df_features = df_features.merge(df_air_reserve_fea, on=key, how='left')
        
    except:
        pass
    
    try:
        df_hpg_timediff_fea = df_hpg_reserve.groupby(key, as_index = False)['time_diff'].agg({
            'store2_hpg_timediff_mean': 'mean',
            'store2_hpg_timediff_median': 'median',
            'store2_hpg_timediff_max': 'max',
            'store2_hpg_timediff_count': 'count',
            'store2_hpg_timediff_skew': 'skew',
            'store2_hpg_timediff_std': 'std',
        }) 

        df_hpg_reserve_fea = df_hpg_reserve.groupby(key, as_index = False)['reserve_visitors'].agg({
            'store2_hpg_reserve_visitors_sum': 'sum',
            'store2_hpg_reserve_visitors_mean': 'mean',
            'store2_hpg_reserve_visitors_median': 'median',
            'store2_hpg_reserve_visitors_max': 'max',
            'store2_hpg_reserve_visitors_count': 'count',
            'store2_hpg_reserve_visitors_std': 'std',
        }) 
    
        df_features = df_features.merge(df_hpg_timediff_fea, on=key, how='left')
        df_features = df_features.merge(df_hpg_reserve_fea,  on=key, how='left')
    except:
        pass
    
    print('For store id reserve visitors features, we spend {} seconds.'.format(time() - t0)) 
    
    return df_features

#### visit_date + timediff & reserve_visitors特征
每个时间段的统计信息,包括预定人数以及时间差的统计信息

In [1]:
def get_date_reserve_features(df_air_reserves, df_hpg_reserves, en_date, days, key='visit_date'):
    """
    时间窗口，预定信息相关特征，分为air系统和hpg系统
    groupby('visit_date')
    1. 时间窗口
    2. 预定入店日期与预定日期相差的天数，相差天数的统计量
    3. 预定人数的统计量
    """
    t0 = time()
    st_date,en_date = get_label_st_date(en_date, days) 

    df_air_reserve  = df_air_reserves.loc[((df_air_reserves['visit_datetime'] < en_date)
                                           & (df_air_reserves['visit_datetime'] >= st_date)
                                           & (df_air_reserves['reserve_datetime'] < st_date))].copy()
    
    df_hpg_reserve  = df_hpg_reserves.loc[((df_hpg_reserves['visit_datetime'] < en_date)
                                           & (df_hpg_reserves['visit_datetime'] >= st_date)
                                           & (df_hpg_reserves['reserve_datetime'] < st_date))].copy()
    
    
    df_features       = pd.DataFrame()
    df_features[key]  = df_air_reserve[key].unique()
    
    df_air_reserve['time_diff'] = df_air_reserve['visit_datetime'] - df_air_reserve['reserve_datetime']
    df_air_reserve['time_diff'] = df_air_reserve['time_diff'].apply(lambda x: x.days)
    
    df_hpg_reserve['time_diff'] = df_hpg_reserve['visit_datetime'] - df_hpg_reserve['reserve_datetime']
    df_hpg_reserve['time_diff'] = df_hpg_reserve['time_diff'].apply(lambda x: x.days)
    try:
        df_air_timediff_fea = df_air_reserve.groupby(key, as_index=False)['time_diff'].agg({
            'date_air_timediff_mean': 'mean',
            'date_air_timediff_median': 'median',
            'date_air_timediff_max': 'max',
            'date_air_timediff_count': 'count',
            'date_air_timediff_std': 'std',
        }) 

        df_air_reserve_fea = df_air_reserve.groupby(key, as_index=False)['reserve_visitors'].agg({
            'date_air_reserve_visitors_sum': 'sum',
            'date_air_reserve_visitors_median': 'median',
            'date_air_reserve_visitors_max': 'max',
            'date_air_reserve_visitors_count': 'count',
            'date_air_reserve_visitors_std': 'std',
        }) 
        df_features = df_features.merge(df_air_timediff_fea, on=key, how='left')
        df_features = df_features.merge(df_air_reserve_fea,  on=key, how='left')
        
    except:
        pass
    try:
        df_hpg_timediff_fea = df_hpg_reserve.groupby(key, as_index=False)['time_diff'].agg({
            'date_hpg_timediff_mean': 'mean',
            'date_hpg_timediff_max': 'max',
        }) 

        df_hpg_reserve_fea = df_hpg_reserve.groupby(key, as_index=False)['reserve_visitors'].agg({
            'date_hpg_reserve_visitors_sum': 'sum',
            'date_hpg_reserve_visitors_median': 'median',
            'date_hpg_reserve_visitors_max': 'max',
            'date_hpg_reserve_visitors_count': 'count',
        }) 
    
        df_features = df_features.merge(df_hpg_timediff_fea, on=key, how='left')
        df_features = df_features.merge(df_hpg_reserve_fea,  on=key, how='left')
    except:
        pass
    
    print('For store id reserve visitors features, we spend {} seconds.'.format(time() - t0)) 
    
    return df_features

### 特征工程3：基于其他属性的特征，例如air_genre_name
#### air_genre_name + visitors特征
最近N(N=14,28,56,1000)天air_genre_name的统计特征

In [36]:
def get_genre_visitors_features(df, en_date, key='air_genre_name', date_col='visit_date', days_list=[1000, 56, 28]): #, 14,
    """
    时间窗口，groupby('air_genre_name')
    1. 时间窗口，st_date <= visit_date < en_date
    2. groupby('air_genre_name')，visitors的统计量
    """
    t0 = time()
    df_features       = pd.DataFrame()
    df_features[key]  = df[key].unique()
    
    df_tmp            = df.copy()
    for days in days_list:
        st_date           = get_st_date(en_date, days) 
        df_tmp            = df_tmp.loc[((df[date_col] >= st_date) & (df[date_col] < en_date))].copy()
        df_features_tmp   = df_tmp.groupby([key], as_index=False)['visitors'].agg({
            '{}_visitors_min{}'.format(key,days): 'min',
            '{}_visitors_mean{}'.format(key,days): 'mean',
            '{}_visitors_median{}'.format(key,days): 'median',
            '{}_visitors_max{}'.format(key,days): 'max',
            '{}_visitors_sum{}'.format(key,days): 'sum',
            '{}_visitors_std{}'.format(key,days): 'std',
            '{}_visitors_quantile{}'.format(key,days): 'quantile',
            '{}_visitors_skew{}'.format(key,days): 'skew'
        }) 
        df_features  = df_features.merge(df_features_tmp, on=key, how='left')
    print('For area sts features, we spend {} seconds.'.format(time() - t0)) 
    
    return df_features

#### air_genre_name + weekday + visitors特征
过去每段时间的工作日的统计特征天air_genre_name对应的统计特征

In [37]:
def get_genre_weekday_visitors_sts_features(df, end_date, key='air_genre_name_weekday', date_col='visit_date', val_name='visitors'): 
    """
    时间窗口，groupby('air_genre_name_weekday')
    1. 时间窗口，end_date的前4、8、12、100周
    2. groupby('air_genre_name_weekday')的统计量
    """
    df_features      = pd.DataFrame() 
    df_features[key] = df[key].unique()
    t0 = time()
    ind1 = df[date_col] < end_date  
    
    for i in [4, 8, 12, 100]:  #2,6
        t_st_date    = end_date  - timedelta(7 * i) 
        ind2         = df[date_col]  >= t_st_date 
        df_grp       = df.loc[(ind1 & ind2)].groupby(key) 
        
        dic_mean     = df_grp[val_name].mean().to_dict()
        dic_std      = df_grp[val_name].std().to_dict()
        dic_max      = df_grp[val_name].max().to_dict()
        dic_min      = df_grp[val_name].min().to_dict()
        dic_count    = df_grp[val_name].count().to_dict()
        dic_sum      = df_grp[val_name].sum().to_dict()   
        dic_quantile = df_grp[val_name].quantile().to_dict()   
        
        df_features['{}_mean_{}_{}'.format(key, i + 1, val_name)]  = df_features[key].map(dic_mean).values 
        df_features['{}_quantile_{}_{}'.format(key, i + 1, val_name)]  = df_features[key].map(dic_quantile).values 
        df_features['{}_std_{}_{}'.format(key, i + 1, val_name)]   = df_features[key].map(dic_std).values 
        df_features['{}_max_{}_{}'.format(key, i + 1, val_name)]   = df_features[key].map(dic_max).values 
        df_features['{}_min_{}_{}'.format(key, i + 1, val_name)]   = df_features[key].map(dic_min).values 
        df_features['{}_gap_{}_{}'.format(key, i + 1, val_name)]   = (df_features['{}_max_{}_{}'.format(key, i + 1, val_name)]
                                                                      - df_features['{}_min_{}_{}'.format(key, i+1, val_name)])
        df_features['{}_count_{}_{}'.format(key, i + 1, val_name)] = df_features[key].map(dic_count).values 
        df_features['{}_sum_{}_{}'.format(key, i + 1, val_name)]   = df_features[key].map(dic_sum).values  
    print('For area name sts features, we spend {} seconds.'.format(time() - t0)) 
    return df_features

#### air_area_name + weekday + holiday + visitors特征
最近N(N=14,28,56,1000)天air_area_name+weekday以及holiday的统计特征

In [38]:
def get_genre_weekday_holiday_visitors_sts_features(df, end_date, key='air_genre_name_weekday_holiday', date_col='visit_date', val_name='visitors'):
    """
    时间窗口，groupby('air_genre_name_weekday_holiday')
    1. 时间窗口，end_date的前4、8、12、100周
    2. groupby('air_genre_name_weekday')的统计量
    """
    df_features      = pd.DataFrame() 
    df_features[key] = df[key].unique()
    t0 = time()
    ind1    = df[date_col] < end_date  
    for i in [4, 8, 12, 100]:  #2,6
        t_st_date    = end_date  - timedelta(7 * i) 
        ind2         = df[date_col]  >= t_st_date 
        df_grp       = df.loc[(ind1 & ind2)].groupby(key) 
        
        dic_mean     = df_grp[val_name].mean().to_dict()
        dic_std      = df_grp[val_name].std().to_dict()
        dic_max      = df_grp[val_name].max().to_dict()
        dic_min      = df_grp[val_name].min().to_dict()
        dic_count    = df_grp[val_name].count().to_dict()
        dic_sum      = df_grp[val_name].sum().to_dict()   
        dic_quantile = df_grp[val_name].quantile().to_dict()   
          
        df_features['{}_mean_{}_{}'.format(key, i + 1, val_name)]  = df_features[key].map(dic_mean).values 
        df_features['{}_quantile_{}_{}'.format(key, i + 1, val_name)]  = df_features[key].map(dic_quantile).values 
        df_features['{}_std_{}_{}'.format(key, i + 1, val_name)]   = df_features[key].map(dic_std).values 
        df_features['{}_max_{}_{}'.format(key, i + 1, val_name)]   = df_features[key].map(dic_max).values 
        df_features['{}_min_{}_{}'.format(key, i + 1, val_name)]   = df_features[key].map(dic_min).values 
        df_features['{}_gap_{}_{}'.format(key, i + 1, val_name)]   = (df_features['{}_max_{}_{}'.format(key, i+1, val_name)]
                                                                    - df_features['{}_min_{}_{}'.format(key, i+1, val_name)])
        df_features['{}_count_{}_{}'.format(key, i + 1, val_name)] = df_features[key].map(dic_count).values 
        df_features['{}_sum_{}_{}'.format(key, i + 1, val_name)]   = df_features[key].map(dic_sum).values  
    print('For area name sts features, we spend {} seconds.'.format(time() - t0)) 
    return df_features

## 提取特征

In [39]:
def get_features(df_visitors, air_reserve, hpg_air_reserve, label_st_date, days=39, is_te=False):
     
    df_label = get_df_label(df_visitors, label_st_date, days, is_te) 
    df_label['visit_date']                   = pd.to_datetime(df_label['visit_date'])
    df_label['weekday']                      = df_label['visit_date'].dt.weekday  
    df_label['air_store_id_weekday']         = df_label['air_store_id'].astype(str) + '_' + df_label['weekday'].astype(str) 
    df_label['air_store_id_weekday_holiday'] = df_label['air_store_id_holiday'] + '_' + df_label['weekday'].astype(str) 
    
    df_label['air_store_id_visit_date']  = df_label['air_store_id']  + '_' + df_label['visit_date'].astype(str)
    
    ####################
    # 以下特征只是groupby对象不同
    df_genre_visitors_fea             = get_genre_visitors_features(df=df_visitors,
                                                                    key='air_genre_name',
                                                                    en_date=label_st_date,
                                                                    date_col='visit_date')
    df_genre_dow_visitors_fea         = get_genre_weekday_visitors_sts_features(df=df_visitors,
                                                                                key='air_genre_name_weekday',
                                                                                end_date=label_st_date,
                                                                                date_col='visit_date',
                                                                                val_name='visitors')
    df_genre_dow_holiday_visitors_fea = get_genre_weekday_holiday_visitors_sts_features(df=df_visitors,
                                                                                        key='air_genre_name_weekday_holiday',
                                                                                        end_date=label_st_date,
                                                                                        date_col='visit_date',
                                                                                        val_name='visitors')
    
    ####################
    # 以下特征只是groupby对象不同
    df_air_hpg_features  = get_store_date_reserve_features(df_air_reserves=air_reserve,
                                                           df_hpg_reserves=hpg_air_reserve,
                                                           en_date=label_st_date,
                                                           days=days) 
    df_air_hpg_features2 = get_store_reserve_features(df_air_reserves=air_reserve,
                                                      df_hpg_reserves=hpg_air_reserve,
                                                      en_date=label_st_date,
                                                      days=days) 
    df_air_hpg_features3 = get_date_reserve_features(df_air_reserves=air_reserve,
                                                     df_hpg_reserves=hpg_air_reserve,
                                                     en_date=label_st_date,
                                                     days=days) 
    
    ##################
    # 店铺在时间窗口内每天的visitors的统计量
    df_storeid_visitors_fea         = get_store_id_visitors_sts_features(df=df_visitors,
                                                                         key='air_store_id',
                                                                         en_date=label_st_date)
    
    # 店铺在时间窗口内的time_diff的统计量
    df_storeid_timediff_fea         = get_store_id_time_diff_sts_features(df=df_visitors,
                                                                          key='air_store_id',
                                                                          en_date=label_st_date)
    
    # (当天的visitors - 昨天的visitors)的统计量
    df_storeid_visitorsdiff_fea     = get_store_id_visitors_diff_sts_features(df=df_visitors,
                                                                              key='air_store_id',
                                                                              en_date=label_st_date)
    # 加权平均每天的visitors
    df_storeid_visitorsexp_fea      = get_store_id_visitors_exp_features(df=df_visitors,
                                                                         key='air_store_id',
                                                                         en_date=label_st_date)
    
    # 店铺在时间窗口内每周几的visitors的统计量
    df_storeid_dow_visitors_fea     = get_weekday_visitors_sts_features(df=df_visitors,
                                                                        key='air_store_id_weekday',
                                                                        end_date=label_st_date,
                                                                        date_col='visit_date',
                                                                        val_name='visitors')
    # (当天的visitors - 一周前那天的visitors)的统计量
    df_storeid_dow_visitorsdiff_fea = get_weekday_visitors_diff_sts_features(df=df_visitors,
                                                                             key='air_store_id_weekday',
                                                                             end_date=label_st_date,
                                                                             date_col='visit_date',
                                                                             val_name='visitors')
    # day_lags / 7，加权平均每天的visitors
    df_storeid_dow_visitorsexp_fea  = get_weekday_visitors_exp_sts_features(df=df_visitors,
                                                                            key='air_store_id_weekday',
                                                                            en_date=label_st_date,
                                                                            date_col='visit_date')
     
    # groupby('air_store_id_holiday') visitors的统计量
    df_storeid_holiday_visitors_fea = get_holiday_visitors_sts_features(df=df_visitors,
                                                                        key='air_store_id_holiday',
                                                                        end_date=label_st_date,
                                                                        date_col='visit_date',
                                                                        val_name='visitors')
    
    # groupby('air_store_id_weekday_holiday') visitors的统计量
    df_storeid_dow_holiday_visitorsdiff_fea = get_weekday_holiday_visitors_sts_features(df=df_visitors,
                                                                                        key='air_store_id_weekday_holiday',
                                                                                        end_date=label_st_date,
                                                                                        date_col='visit_date',
                                                                                        val_name='visitors')
     
    df_features             = df_label.merge(df_storeid_visitors_fea,            on=['air_store_id'], how='left')
    df_features             = df_features.merge(df_storeid_timediff_fea,         on=['air_store_id'], how='left')   
    df_features             = df_features.merge(df_storeid_visitorsdiff_fea,     on=['air_store_id'], how='left')  
    df_features             = df_features.merge(df_storeid_visitorsexp_fea,      on=['air_store_id'], how='left')  
    
    df_features             = df_features.merge(df_storeid_dow_visitors_fea,     on=['air_store_id_weekday'], how='left')  
    df_features             = df_features.merge(df_storeid_dow_visitorsdiff_fea, on=['air_store_id_weekday'], how='left')  
    df_features             = df_features.merge(df_storeid_dow_visitorsexp_fea,  on=['air_store_id_weekday'], how='left')   
     
    
    df_features             = df_features.merge(df_air_hpg_features,  on=['air_store_id_visit_date'], how='left')
    df_features             = df_features.merge(df_air_hpg_features2, on=['air_store_id'], how='left') 
    df_features['visit_date']          = df_features['visit_date'].astype(str)
    df_air_hpg_features3['visit_date'] = df_air_hpg_features3['visit_date'].astype(str)  
    df_features             = df_features.merge(df_air_hpg_features3, on=['visit_date'], how='left') 
    
    df_features             = df_features.merge(df_storeid_holiday_visitors_fea, on=['air_store_id_holiday'], how='left')  
    df_features             = df_features.merge(df_storeid_dow_holiday_visitorsdiff_fea, on=['air_store_id_weekday_holiday'], how='left')  
    
    df_features             = df_features.merge(df_genre_visitors_fea, on=['air_genre_name'], how='left')   
    df_features             = df_features.merge(df_genre_dow_visitors_fea, on=['air_genre_name_weekday'], how='left')  
    df_features             = df_features.merge(df_genre_dow_holiday_visitors_fea, on=['air_genre_name_weekday_holiday'], how='left')
    
    del df_features['air_genre_name'] 
    return df_features

## 模型验证
### 模型验证的框架

2020-07-11

In [40]:
def lgb_model_test_weight_timebased(df_tr, df_val, features, ws = [1.01], test_df = None): 
    params = {
        'num_leaves': 256,
        'min_child_samples': 79,
        'objective': 'rmse',
        'max_depth': 13,
        'learning_rate': 0.03,
        'boosting_type': 'gbdt',
        'subsample_freq': 3,
        'subsample': 0.9,
        'bagging_seed': 11,
        'metric': 'rmse',
        'verbosity': -1,
        'reg_alpha': 0.3,
        'reg_lambda': 0.3,
        'colsample_bytree': 0.9, 
    } 

    MAX_ROUNDS = 700
    val_pred = []
    test_pred = []
    cate_vars = []
    feature_importance = None
    models = []  
     
    train_df, train_label = df_tr[features].copy(), df_tr['visitors'].values
    val_df, val_label     = df_val[features].copy(), df_val['visitors'].values
    
    for i, w in enumerate(ws):
    
        train_weight = []
        tmp = df_tr['day_to_now'].values
        for d in tmp:
            if d > 365:
                train_weight.append(1) 
            else:
                train_weight.append(w ** ((365 - d) // (60))) 
        
        dtrain = lgb.Dataset(train_df, label = train_label, weight=train_weight)
        dval   = lgb.Dataset(val_df,   label = val_label, reference=dtrain)  
        
        bst = lgb.train(params, dtrain, num_boost_round=MAX_ROUNDS,
                        valid_sets=[dtrain,dval], early_stopping_rounds=1500,
                        verbose_eval=100)
        models.append(bst)
        f_importance = pd.DataFrame()
        f_importance['fea'] = features
        f_importance['imp'] = bst.feature_importance("gain") 
        f_importance['fold'] = i
        if feature_importance is None:
            feature_importance = f_importance
        else:
            feature_importance = pd.concat([feature_importance, f_importance],
                                           axis=0,
                                           ignore_index=True)
  
        if test_df is not None:
            test_pred.append(bst.predict(test_df[features],
                                         num_iteration=bst.best_iteration or MAX_ROUNDS))
    if test_df is None:
        return models, feature_importance
    else:
        return models, feature_importance, test_pred

### 模型验证
#### 加入store date相关的reserve特征

In [41]:
df_tr_feas = pd.DataFrame()
tr_fea_list = []
for i in tqdm(range(58)): 
    try:
        tr_label_date      = parse('2017-04-23') - timedelta(7 * 6 + 7 * i)   #*2
        tr_fea_date        = parse('2017-04-23') - timedelta(7 * 6 + 7 * i + 1) #**2

        df_tr_tmp          = get_features(df_meta, air_reserve, hpg_air_reserve, label_st_date=tr_label_date)    
        df_tr_feas         = pd.concat([df_tr_feas, df_tr_tmp], axis=0, ignore_index=True) 
        tr_fea_list.append(df_tr_tmp)
    except:
        print('Problems happen at step {}'.format(i))

  0%|                                                                                           | 0/58 [00:00<?, ?it/s]

label date,  2017-03-12 00:00:00 2017-04-20 00:00:00
For area sts features, we spend 0.27629852294921875 seconds.
For area name sts features, we spend 0.1984727382659912 seconds.
For area name sts features, we spend 0.2064061164855957 seconds.
For store id reserve visitors features, we spend 0.0469202995300293 seconds.
For store id reserve visitors features, we spend 0.07256531715393066 seconds.
For store id reserve visitors features, we spend 0.048873186111450195 seconds.
For store id sts features, we spend 0.7879033088684082 seconds.
For store id diff sts features, we spend 3.523646354675293 seconds.
For store id diff sts features, we spend 3.1218926906585693 seconds.
For store id exp features, we spend 14.643365621566772 seconds.
For store id exp features, we spend 11.172038555145264 seconds.


  2%|█▍                                                                                 | 1/58 [00:39<37:25, 39.39s/it]

label date,  2017-03-05 00:00:00 2017-04-13 00:00:00
For area sts features, we spend 0.245375394821167 seconds.
For area name sts features, we spend 0.1944746971130371 seconds.
For area name sts features, we spend 0.20363354682922363 seconds.
For store id reserve visitors features, we spend 0.03192901611328125 seconds.
For store id reserve visitors features, we spend 0.06682157516479492 seconds.
For store id reserve visitors features, we spend 0.04487872123718262 seconds.
For store id sts features, we spend 0.7818658351898193 seconds.
For store id diff sts features, we spend 3.426281213760376 seconds.
For store id diff sts features, we spend 3.0157310962677 seconds.
For store id exp features, we spend 14.197028875350952 seconds.
For store id exp features, we spend 10.981896162033081 seconds.


  3%|██▊                                                                                | 2/58 [01:17<36:31, 39.13s/it]

label date,  2017-02-26 00:00:00 2017-04-06 00:00:00
For area sts features, we spend 0.24440240859985352 seconds.
For area name sts features, we spend 0.19565844535827637 seconds.
For area name sts features, we spend 0.2082068920135498 seconds.
For store id reserve visitors features, we spend 0.02995920181274414 seconds.
For store id reserve visitors features, we spend 0.06482744216918945 seconds.
For store id reserve visitors features, we spend 0.04388022422790527 seconds.
For store id sts features, we spend 0.7802150249481201 seconds.
For store id diff sts features, we spend 3.3900699615478516 seconds.
For store id diff sts features, we spend 2.9964394569396973 seconds.
For store id exp features, we spend 14.477341175079346 seconds.
For store id exp features, we spend 10.713850021362305 seconds.


  5%|████▎                                                                              | 3/58 [01:56<35:44, 38.98s/it]

label date,  2017-02-19 00:00:00 2017-03-30 00:00:00
For area sts features, we spend 0.24334311485290527 seconds.
For area name sts features, we spend 0.1974506378173828 seconds.
For area name sts features, we spend 0.2044227123260498 seconds.
For store id reserve visitors features, we spend 0.03293156623840332 seconds.
For store id reserve visitors features, we spend 0.06781673431396484 seconds.
For store id reserve visitors features, we spend 0.046875 seconds.
For store id sts features, we spend 0.7885739803314209 seconds.
For store id diff sts features, we spend 3.4110066890716553 seconds.
For store id diff sts features, we spend 3.0029780864715576 seconds.
For store id exp features, we spend 13.681862115859985 seconds.
For store id exp features, we spend 10.488358974456787 seconds.


  7%|█████▋                                                                             | 4/58 [02:34<34:43, 38.58s/it]

label date,  2017-02-12 00:00:00 2017-03-23 00:00:00
For area sts features, we spend 0.2425212860107422 seconds.
For area name sts features, we spend 0.19249963760375977 seconds.
For area name sts features, we spend 0.211472749710083 seconds.
For store id reserve visitors features, we spend 0.03687644004821777 seconds.
For store id reserve visitors features, we spend 0.06885600090026855 seconds.
For store id reserve visitors features, we spend 0.047868967056274414 seconds.
For store id sts features, we spend 0.7709441184997559 seconds.
For store id diff sts features, we spend 3.2874913215637207 seconds.
For store id diff sts features, we spend 2.893800735473633 seconds.
For store id exp features, we spend 13.736128091812134 seconds.
For store id exp features, we spend 10.371964454650879 seconds.


  9%|███████▏                                                                           | 5/58 [03:11<33:46, 38.23s/it]

label date,  2017-02-05 00:00:00 2017-03-16 00:00:00
For area sts features, we spend 0.23537826538085938 seconds.
For area name sts features, we spend 0.18848252296447754 seconds.
For area name sts features, we spend 0.1984875202178955 seconds.
For store id reserve visitors features, we spend 0.03393721580505371 seconds.
For store id reserve visitors features, we spend 0.06981420516967773 seconds.
For store id reserve visitors features, we spend 0.04886794090270996 seconds.
For store id sts features, we spend 0.7697651386260986 seconds.
For store id diff sts features, we spend 3.322855234146118 seconds.
For store id diff sts features, we spend 2.89680814743042 seconds.
For store id exp features, we spend 13.469204187393188 seconds.
For store id exp features, we spend 10.265994310379028 seconds.


 10%|████████▌                                                                          | 6/58 [03:49<33:00, 38.09s/it]

label date,  2017-01-29 00:00:00 2017-03-09 00:00:00
For area sts features, we spend 0.24632883071899414 seconds.
For area name sts features, we spend 0.19858288764953613 seconds.
For area name sts features, we spend 0.21242713928222656 seconds.
For store id reserve visitors features, we spend 0.034923553466796875 seconds.
For store id reserve visitors features, we spend 0.07081723213195801 seconds.
For store id reserve visitors features, we spend 0.05128598213195801 seconds.
For store id sts features, we spend 0.786168098449707 seconds.
For store id diff sts features, we spend 3.9409008026123047 seconds.
For store id diff sts features, we spend 3.3145041465759277 seconds.
For store id exp features, we spend 17.1669020652771 seconds.
For store id exp features, we spend 12.479012727737427 seconds.


 12%|██████████                                                                         | 7/58 [04:34<34:16, 40.32s/it]

label date,  2017-01-22 00:00:00 2017-03-02 00:00:00
For area sts features, we spend 0.25730037689208984 seconds.
For area name sts features, we spend 0.20348691940307617 seconds.
For area name sts features, we spend 0.22239470481872559 seconds.
For store id reserve visitors features, we spend 0.04288530349731445 seconds.
For store id reserve visitors features, we spend 0.08381342887878418 seconds.
For store id reserve visitors features, we spend 0.05681133270263672 seconds.
For store id sts features, we spend 0.8527216911315918 seconds.
For store id diff sts features, we spend 3.4154980182647705 seconds.
For store id diff sts features, we spend 3.0357630252838135 seconds.
For store id exp features, we spend 14.959917068481445 seconds.
For store id exp features, we spend 9.83201789855957 seconds.


 14%|███████████▍                                                                       | 8/58 [05:15<33:39, 40.40s/it]

label date,  2017-01-15 00:00:00 2017-02-23 00:00:00
For area sts features, we spend 0.2502937316894531 seconds.
For area name sts features, we spend 0.27127575874328613 seconds.
For area name sts features, we spend 0.3361356258392334 seconds.
For store id reserve visitors features, we spend 0.06183433532714844 seconds.
For store id reserve visitors features, we spend 0.11768746376037598 seconds.
For store id reserve visitors features, we spend 0.07579565048217773 seconds.
For store id sts features, we spend 0.8527195453643799 seconds.
For store id diff sts features, we spend 3.43707275390625 seconds.
For store id diff sts features, we spend 2.797011375427246 seconds.
For store id exp features, we spend 12.976984977722168 seconds.
For store id exp features, we spend 9.63834524154663 seconds.


 16%|████████████▉                                                                      | 9/58 [05:52<32:11, 39.42s/it]

label date,  2017-01-08 00:00:00 2017-02-16 00:00:00
For area sts features, we spend 0.2334153652191162 seconds.
For area name sts features, we spend 0.18466496467590332 seconds.
For area name sts features, we spend 0.20245695114135742 seconds.
For store id reserve visitors features, we spend 0.03790020942687988 seconds.
For store id reserve visitors features, we spend 0.07507157325744629 seconds.
For store id reserve visitors features, we spend 0.051868438720703125 seconds.
For store id sts features, we spend 0.8527140617370605 seconds.
For store id diff sts features, we spend 3.9350714683532715 seconds.
For store id diff sts features, we spend 2.7557365894317627 seconds.
For store id exp features, we spend 12.431140899658203 seconds.
For store id exp features, we spend 9.366389989852905 seconds.


 17%|██████████████▏                                                                   | 10/58 [06:29<30:52, 38.60s/it]

label date,  2017-01-01 00:00:00 2017-02-09 00:00:00
For area sts features, we spend 0.23640823364257812 seconds.
For area name sts features, we spend 0.17948102951049805 seconds.
For area name sts features, we spend 0.20944476127624512 seconds.
For store id reserve visitors features, we spend 0.0339658260345459 seconds.
For store id reserve visitors features, we spend 0.06881022453308105 seconds.
For store id reserve visitors features, we spend 0.047873735427856445 seconds.
For store id sts features, we spend 0.7759747505187988 seconds.
For store id diff sts features, we spend 3.2162182331085205 seconds.
For store id diff sts features, we spend 2.7504656314849854 seconds.
For store id exp features, we spend 12.334747552871704 seconds.
For store id exp features, we spend 9.531390190124512 seconds.


 19%|███████████████▌                                                                  | 11/58 [07:05<29:34, 37.76s/it]

label date,  2016-12-25 00:00:00 2017-02-02 00:00:00
For area sts features, we spend 0.2383263111114502 seconds.
For area name sts features, we spend 0.19547176361083984 seconds.
For area name sts features, we spend 0.19148707389831543 seconds.
For store id reserve visitors features, we spend 0.048821449279785156 seconds.
For store id reserve visitors features, we spend 0.08781075477600098 seconds.
For store id reserve visitors features, we spend 0.06278562545776367 seconds.
For store id sts features, we spend 0.7888884544372559 seconds.
For store id diff sts features, we spend 3.1133313179016113 seconds.
For store id diff sts features, we spend 2.6628928184509277 seconds.
For store id exp features, we spend 11.537407875061035 seconds.
For store id exp features, we spend 8.64224123954773 seconds.


 21%|████████████████▉                                                                 | 12/58 [07:39<28:05, 36.64s/it]

label date,  2016-12-18 00:00:00 2017-01-26 00:00:00
For area sts features, we spend 0.23035812377929688 seconds.
For area name sts features, we spend 0.18546533584594727 seconds.
For area name sts features, we spend 0.19349169731140137 seconds.
For store id reserve visitors features, we spend 0.0558621883392334 seconds.
For store id reserve visitors features, we spend 0.09970307350158691 seconds.
For store id reserve visitors features, we spend 0.09470677375793457 seconds.
For store id sts features, we spend 0.8357641696929932 seconds.
For store id diff sts features, we spend 3.3783562183380127 seconds.
For store id diff sts features, we spend 2.640892505645752 seconds.
For store id exp features, we spend 11.32620644569397 seconds.
For store id exp features, we spend 8.466153621673584 seconds.


 22%|██████████████████▍                                                               | 13/58 [08:13<27:02, 36.06s/it]

label date,  2016-12-11 00:00:00 2017-01-19 00:00:00
For area sts features, we spend 0.26030397415161133 seconds.
For area name sts features, we spend 0.2274322509765625 seconds.
For area name sts features, we spend 0.22040963172912598 seconds.
For store id reserve visitors features, we spend 0.07679343223571777 seconds.
For store id reserve visitors features, we spend 0.16455936431884766 seconds.
For store id reserve visitors features, we spend 0.09175610542297363 seconds.
For store id sts features, we spend 1.1335132122039795 seconds.
For store id diff sts features, we spend 3.5078186988830566 seconds.
For store id diff sts features, we spend 2.849364995956421 seconds.
For store id exp features, we spend 11.46633243560791 seconds.
For store id exp features, we spend 9.010498762130737 seconds.


 24%|███████████████████▊                                                              | 14/58 [08:49<26:25, 36.04s/it]

label date,  2016-12-04 00:00:00 2017-01-12 00:00:00
For area sts features, we spend 0.23237943649291992 seconds.
For area name sts features, we spend 0.1874558925628662 seconds.
For area name sts features, we spend 0.19443655014038086 seconds.
For store id reserve visitors features, we spend 0.05489921569824219 seconds.
For store id reserve visitors features, we spend 0.09773731231689453 seconds.
For store id reserve visitors features, we spend 0.07076692581176758 seconds.
For store id sts features, we spend 0.8471946716308594 seconds.
For store id diff sts features, we spend 3.112044095993042 seconds.
For store id diff sts features, we spend 2.561392307281494 seconds.
For store id exp features, we spend 10.192061185836792 seconds.
For store id exp features, we spend 7.5383148193359375 seconds.


 26%|█████████████████████▏                                                            | 15/58 [09:21<24:48, 34.63s/it]

label date,  2016-11-27 00:00:00 2017-01-05 00:00:00
For area sts features, we spend 0.18491578102111816 seconds.
For area name sts features, we spend 0.15819501876831055 seconds.
For area name sts features, we spend 0.1597127914428711 seconds.
For store id reserve visitors features, we spend 0.031241655349731445 seconds.
For store id reserve visitors features, we spend 0.07810854911804199 seconds.
For store id reserve visitors features, we spend 0.04686307907104492 seconds.
For store id sts features, we spend 0.7120773792266846 seconds.
For store id diff sts features, we spend 2.7287437915802 seconds.
For store id diff sts features, we spend 2.415954351425171 seconds.
For store id exp features, we spend 9.705127000808716 seconds.
For store id exp features, we spend 7.492562294006348 seconds.


 28%|██████████████████████▌                                                           | 16/58 [09:51<23:17, 33.27s/it]

label date,  2016-11-20 00:00:00 2016-12-29 00:00:00
For area sts features, we spend 0.2014174461364746 seconds.
For area name sts features, we spend 0.157728910446167 seconds.
For area name sts features, we spend 0.16167688369750977 seconds.
For store id reserve visitors features, we spend 0.04686689376831055 seconds.
For store id reserve visitors features, we spend 0.06246829032897949 seconds.
For store id reserve visitors features, we spend 0.042232513427734375 seconds.
For store id sts features, we spend 0.7169806957244873 seconds.
For store id diff sts features, we spend 2.7650396823883057 seconds.
For store id diff sts features, we spend 2.3126320838928223 seconds.
For store id exp features, we spend 9.357306241989136 seconds.
For store id exp features, we spend 7.0541534423828125 seconds.


 29%|████████████████████████                                                          | 17/58 [10:20<21:57, 32.14s/it]

label date,  2016-11-13 00:00:00 2016-12-22 00:00:00
For area sts features, we spend 0.19060564041137695 seconds.
For area name sts features, we spend 0.1405951976776123 seconds.
For area name sts features, we spend 0.16207051277160645 seconds.
For store id reserve visitors features, we spend 0.031240224838256836 seconds.
For store id reserve visitors features, we spend 0.09373188018798828 seconds.
For store id reserve visitors features, we spend 0.046864986419677734 seconds.
For store id sts features, we spend 0.6931092739105225 seconds.
For store id diff sts features, we spend 2.627532958984375 seconds.
For store id diff sts features, we spend 2.239640474319458 seconds.
For store id exp features, we spend 9.13254189491272 seconds.
For store id exp features, we spend 6.919930458068848 seconds.


 31%|█████████████████████████▍                                                        | 18/58 [10:50<20:52, 31.31s/it]

label date,  2016-11-06 00:00:00 2016-12-15 00:00:00
For area sts features, we spend 0.2008669376373291 seconds.
For area name sts features, we spend 0.15960311889648438 seconds.
For area name sts features, we spend 0.16558241844177246 seconds.
For store id reserve visitors features, we spend 0.031966447830200195 seconds.
For store id reserve visitors features, we spend 0.07059049606323242 seconds.
For store id reserve visitors features, we spend 0.04587435722351074 seconds.
For store id sts features, we spend 0.7069859504699707 seconds.
For store id diff sts features, we spend 2.714900016784668 seconds.
For store id diff sts features, we spend 2.2248382568359375 seconds.
For store id exp features, we spend 8.763129711151123 seconds.
For store id exp features, we spend 6.631699323654175 seconds.


 33%|██████████████████████████▊                                                       | 19/58 [11:18<19:49, 30.49s/it]

label date,  2016-10-30 00:00:00 2016-12-08 00:00:00
For area sts features, we spend 0.1785290241241455 seconds.
For area name sts features, we spend 0.1405959129333496 seconds.
For area name sts features, we spend 0.1714038848876953 seconds.
For store id reserve visitors features, we spend 0.031240224838256836 seconds.
For store id reserve visitors features, we spend 0.062482357025146484 seconds.
For store id reserve visitors features, we spend 0.03241133689880371 seconds.
For store id sts features, we spend 0.7116992473602295 seconds.
For store id diff sts features, we spend 2.5378198623657227 seconds.
For store id diff sts features, we spend 2.1799912452697754 seconds.
For store id exp features, we spend 8.488570213317871 seconds.
For store id exp features, we spend 6.442018508911133 seconds.


 34%|████████████████████████████▎                                                     | 20/58 [11:46<18:49, 29.73s/it]

label date,  2016-10-23 00:00:00 2016-12-01 00:00:00
For area sts features, we spend 0.1865541934967041 seconds.
For area name sts features, we spend 0.14960217475891113 seconds.
For area name sts features, we spend 0.16057538986206055 seconds.
For store id reserve visitors features, we spend 0.02498626708984375 seconds.
For store id reserve visitors features, we spend 0.0568394660949707 seconds.
For store id reserve visitors features, we spend 0.03789782524108887 seconds.
For store id sts features, we spend 0.6809902191162109 seconds.
For store id diff sts features, we spend 2.5709874629974365 seconds.
For store id diff sts features, we spend 2.0868847370147705 seconds.
For store id exp features, we spend 8.31007981300354 seconds.
For store id exp features, we spend 6.068579196929932 seconds.


 36%|█████████████████████████████▋                                                    | 21/58 [12:14<17:53, 29.01s/it]

label date,  2016-10-16 00:00:00 2016-11-24 00:00:00
For area sts features, we spend 0.18239998817443848 seconds.
For area name sts features, we spend 0.1405963897705078 seconds.
For area name sts features, we spend 0.1702561378479004 seconds.
For store id reserve visitors features, we spend 0.020960092544555664 seconds.
For store id reserve visitors features, we spend 0.05285978317260742 seconds.
For store id reserve visitors features, we spend 0.03486776351928711 seconds.
For store id sts features, we spend 0.6949660778045654 seconds.
For store id diff sts features, we spend 2.5435903072357178 seconds.
For store id diff sts features, we spend 2.097756862640381 seconds.
For store id exp features, we spend 7.768974304199219 seconds.
For store id exp features, we spend 5.861369371414185 seconds.


 38%|███████████████████████████████                                                   | 22/58 [12:40<16:58, 28.30s/it]

label date,  2016-10-09 00:00:00 2016-11-17 00:00:00
For area sts features, we spend 0.1825249195098877 seconds.
For area name sts features, we spend 0.14662528038024902 seconds.
For area name sts features, we spend 0.1815333366394043 seconds.
For store id reserve visitors features, we spend 0.018977642059326172 seconds.
For store id reserve visitors features, we spend 0.05086255073547363 seconds.
For store id reserve visitors features, we spend 0.03291177749633789 seconds.
For store id sts features, we spend 0.6861474514007568 seconds.
For store id diff sts features, we spend 2.449674129486084 seconds.
For store id diff sts features, we spend 2.024864435195923 seconds.
For store id exp features, we spend 7.434938430786133 seconds.
For store id exp features, we spend 5.6686437129974365 seconds.


 40%|████████████████████████████████▌                                                 | 23/58 [13:06<16:06, 27.61s/it]

label date,  2016-10-02 00:00:00 2016-11-10 00:00:00
For area sts features, we spend 0.17757964134216309 seconds.
For area name sts features, we spend 0.14460420608520508 seconds.
For area name sts features, we spend 0.15557098388671875 seconds.
For store id reserve visitors features, we spend 0.014971494674682617 seconds.
For store id reserve visitors features, we spend 0.04487800598144531 seconds.
For store id reserve visitors features, we spend 0.02992391586303711 seconds.
For store id sts features, we spend 0.683701753616333 seconds.
For store id diff sts features, we spend 2.3796744346618652 seconds.
For store id diff sts features, we spend 2.0980687141418457 seconds.
For store id exp features, we spend 7.09605860710144 seconds.
For store id exp features, we spend 5.417093753814697 seconds.


 41%|█████████████████████████████████▉                                                | 24/58 [13:32<15:18, 27.03s/it]

label date,  2016-09-25 00:00:00 2016-11-03 00:00:00
For area sts features, we spend 0.17481660842895508 seconds.
For area name sts features, we spend 0.14361166954040527 seconds.
For area name sts features, we spend 0.1575760841369629 seconds.
For store id reserve visitors features, we spend 0.01596379280090332 seconds.
For store id reserve visitors features, we spend 0.045879364013671875 seconds.
For store id reserve visitors features, we spend 0.029918909072875977 seconds.
For store id sts features, we spend 0.6775078773498535 seconds.
For store id diff sts features, we spend 2.4212849140167236 seconds.
For store id diff sts features, we spend 2.000904083251953 seconds.
For store id exp features, we spend 6.745864152908325 seconds.
For store id exp features, we spend 5.172078847885132 seconds.


 43%|███████████████████████████████████▎                                              | 25/58 [13:57<14:30, 26.37s/it]

label date,  2016-09-18 00:00:00 2016-10-27 00:00:00
For area sts features, we spend 0.1865401268005371 seconds.
For area name sts features, we spend 0.15392184257507324 seconds.
For area name sts features, we spend 0.14860773086547852 seconds.
For store id reserve visitors features, we spend 0.010989665985107422 seconds.
For store id reserve visitors features, we spend 0.03490924835205078 seconds.
For store id reserve visitors features, we spend 0.016953468322753906 seconds.
For store id sts features, we spend 0.6940903663635254 seconds.
For store id diff sts features, we spend 2.32000994682312 seconds.
For store id diff sts features, we spend 1.8706681728363037 seconds.
For store id exp features, we spend 6.528420925140381 seconds.
For store id exp features, we spend 4.955225229263306 seconds.


 45%|████████████████████████████████████▊                                             | 26/58 [14:21<13:44, 25.75s/it]

label date,  2016-09-11 00:00:00 2016-10-20 00:00:00
For area sts features, we spend 0.16957545280456543 seconds.
For area name sts features, we spend 0.1376345157623291 seconds.
For area name sts features, we spend 0.14745354652404785 seconds.
For store id reserve visitors features, we spend 0.014989614486694336 seconds.
For store id reserve visitors features, we spend 0.04587721824645996 seconds.
For store id reserve visitors features, we spend 0.026931047439575195 seconds.
For store id sts features, we spend 0.6832988262176514 seconds.
For store id diff sts features, we spend 2.2760910987854004 seconds.
For store id diff sts features, we spend 1.8445098400115967 seconds.
For store id exp features, we spend 6.1721367835998535 seconds.
For store id exp features, we spend 4.75717568397522 seconds.


 47%|██████████████████████████████████████▏                                           | 27/58 [14:45<12:59, 25.15s/it]

label date,  2016-09-04 00:00:00 2016-10-13 00:00:00
For area sts features, we spend 0.15433812141418457 seconds.
For area name sts features, we spend 0.12940359115600586 seconds.
For area name sts features, we spend 0.14625048637390137 seconds.
For store id reserve visitors features, we spend 0.015615224838256836 seconds.
For store id reserve visitors features, we spend 0.04687356948852539 seconds.
For store id reserve visitors features, we spend 0.031238079071044922 seconds.
For store id sts features, we spend 0.6706438064575195 seconds.
For store id diff sts features, we spend 2.2182345390319824 seconds.
For store id diff sts features, we spend 1.806088924407959 seconds.
For store id exp features, we spend 5.996017217636108 seconds.
For store id exp features, we spend 4.467182874679565 seconds.


 48%|███████████████████████████████████████▌                                          | 28/58 [15:08<12:17, 24.57s/it]

label date,  2016-08-28 00:00:00 2016-10-06 00:00:00
For area sts features, we spend 0.16357088088989258 seconds.
For area name sts features, we spend 0.1316690444946289 seconds.
For area name sts features, we spend 0.1475510597229004 seconds.
For store id reserve visitors features, we spend 0.015998125076293945 seconds.
For store id reserve visitors features, we spend 0.04488635063171387 seconds.
For store id reserve visitors features, we spend 0.027923583984375 seconds.
For store id sts features, we spend 0.6712288856506348 seconds.
For store id diff sts features, we spend 2.2857425212860107 seconds.
For store id diff sts features, we spend 1.7819507122039795 seconds.
For store id exp features, we spend 5.720131158828735 seconds.
For store id exp features, we spend 4.318446636199951 seconds.


 50%|█████████████████████████████████████████                                         | 29/58 [15:31<11:37, 24.05s/it]

label date,  2016-08-21 00:00:00 2016-09-29 00:00:00
For area sts features, we spend 0.16079354286193848 seconds.
For area name sts features, we spend 0.12618350982666016 seconds.
For area name sts features, we spend 0.13073229789733887 seconds.
For store id reserve visitors features, we spend 0.01562190055847168 seconds.
For store id reserve visitors features, we spend 0.03123927116394043 seconds.
For store id reserve visitors features, we spend 0.03124380111694336 seconds.
For store id sts features, we spend 0.6795947551727295 seconds.
For store id diff sts features, we spend 2.4833195209503174 seconds.
For store id diff sts features, we spend 1.8670971393585205 seconds.
For store id exp features, we spend 5.294267416000366 seconds.
For store id exp features, we spend 3.9481277465820312 seconds.


 52%|██████████████████████████████████████████▍                                       | 30/58 [15:53<10:59, 23.57s/it]

label date,  2016-08-14 00:00:00 2016-09-22 00:00:00
For area sts features, we spend 0.15561985969543457 seconds.
For area name sts features, we spend 0.12367939949035645 seconds.
For area name sts features, we spend 0.13264679908752441 seconds.
For store id reserve visitors features, we spend 0.014988899230957031 seconds.
For store id reserve visitors features, we spend 0.046875715255737305 seconds.
For store id reserve visitors features, we spend 0.027925729751586914 seconds.
For store id sts features, we spend 0.6552479267120361 seconds.
For store id diff sts features, we spend 2.0729427337646484 seconds.
For store id diff sts features, we spend 1.689157247543335 seconds.
For store id exp features, we spend 4.909710884094238 seconds.
For store id exp features, we spend 3.8814756870269775 seconds.


 53%|███████████████████████████████████████████▊                                      | 31/58 [16:15<10:18, 22.92s/it]

label date,  2016-08-07 00:00:00 2016-09-15 00:00:00
For area sts features, we spend 0.14860844612121582 seconds.
For area name sts features, we spend 0.12369298934936523 seconds.
For area name sts features, we spend 0.12671375274658203 seconds.
For store id reserve visitors features, we spend 0.015958547592163086 seconds.
For store id reserve visitors features, we spend 0.04687929153442383 seconds.
For store id reserve visitors features, we spend 0.027919769287109375 seconds.
For store id sts features, we spend 0.6692113876342773 seconds.
For store id diff sts features, we spend 2.038917303085327 seconds.
For store id diff sts features, we spend 1.56614351272583 seconds.
For store id exp features, we spend 4.656211853027344 seconds.
For store id exp features, we spend 3.5735552310943604 seconds.


 55%|█████████████████████████████████████████████▏                                    | 32/58 [16:35<09:38, 22.25s/it]

label date,  2016-07-31 00:00:00 2016-09-08 00:00:00
For area sts features, we spend 0.14362144470214844 seconds.
For area name sts features, we spend 0.14561200141906738 seconds.
For area name sts features, we spend 0.12268853187561035 seconds.
For store id reserve visitors features, we spend 0.011972427368164062 seconds.
For store id reserve visitors features, we spend 0.0339205265045166 seconds.
For store id reserve visitors features, we spend 0.016942501068115234 seconds.
For store id sts features, we spend 0.6712703704833984 seconds.
For store id diff sts features, we spend 1.9201037883758545 seconds.
For store id diff sts features, we spend 1.551978588104248 seconds.
For store id exp features, we spend 4.433219909667969 seconds.
For store id exp features, we spend 3.2894976139068604 seconds.


 57%|██████████████████████████████████████████████▋                                   | 33/58 [16:55<08:59, 21.60s/it]

label date,  2016-07-24 00:00:00 2016-09-01 00:00:00
For area sts features, we spend 0.12918591499328613 seconds.
For area name sts features, we spend 0.10935044288635254 seconds.
For area name sts features, we spend 0.1149747371673584 seconds.
For store id reserve visitors features, we spend 0.015618324279785156 seconds.
For store id reserve visitors features, we spend 0.04681229591369629 seconds.
For store id reserve visitors features, we spend 0.03130459785461426 seconds.
For store id sts features, we spend 0.6404702663421631 seconds.
For store id diff sts features, we spend 1.8671185970306396 seconds.
For store id diff sts features, we spend 1.3950273990631104 seconds.
For store id exp features, we spend 4.0662829875946045 seconds.
For store id exp features, we spend 3.1304874420166016 seconds.


 59%|████████████████████████████████████████████████                                  | 34/58 [17:15<08:21, 20.92s/it]

label date,  2016-07-17 00:00:00 2016-08-25 00:00:00
For area sts features, we spend 0.132704496383667 seconds.
For area name sts features, we spend 0.11031651496887207 seconds.
For area name sts features, we spend 0.11170315742492676 seconds.
For store id reserve visitors features, we spend 0.016959190368652344 seconds.
For store id reserve visitors features, we spend 0.05086231231689453 seconds.
For store id reserve visitors features, we spend 0.03091740608215332 seconds.
For store id sts features, we spend 0.64813232421875 seconds.
For store id diff sts features, we spend 1.7371139526367188 seconds.
For store id diff sts features, we spend 1.321169376373291 seconds.
For store id exp features, we spend 3.8817801475524902 seconds.
For store id exp features, we spend 2.8746256828308105 seconds.


 60%|█████████████████████████████████████████████████▍                                | 35/58 [17:33<07:45, 20.24s/it]

label date,  2016-07-10 00:00:00 2016-08-18 00:00:00
For area sts features, we spend 0.12771248817443848 seconds.
For area name sts features, we spend 0.1085367202758789 seconds.
For area name sts features, we spend 0.11070442199707031 seconds.
For store id reserve visitors features, we spend 0.017950057983398438 seconds.
For store id reserve visitors features, we spend 0.053894758224487305 seconds.
For store id reserve visitors features, we spend 0.03390812873840332 seconds.
For store id sts features, we spend 0.650930643081665 seconds.
For store id diff sts features, we spend 1.6583259105682373 seconds.
For store id diff sts features, we spend 1.2282869815826416 seconds.
For store id exp features, we spend 3.468172550201416 seconds.
For store id exp features, we spend 2.637803792953491 seconds.


 62%|██████████████████████████████████████████████████▉                               | 36/58 [17:51<07:09, 19.51s/it]

label date,  2016-07-03 00:00:00 2016-08-11 00:00:00
For area sts features, we spend 0.12440276145935059 seconds.
For area name sts features, we spend 0.09174942970275879 seconds.
For area name sts features, we spend 0.10073995590209961 seconds.
For store id reserve visitors features, we spend 0.016959667205810547 seconds.
For store id reserve visitors features, we spend 0.049864768981933594 seconds.
For store id reserve visitors features, we spend 0.0318608283996582 seconds.
For store id sts features, we spend 0.630789041519165 seconds.
For store id diff sts features, we spend 1.5577950477600098 seconds.
For store id diff sts features, we spend 1.1347887516021729 seconds.
For store id exp features, we spend 3.1704976558685303 seconds.
For store id exp features, we spend 2.4446046352386475 seconds.


 64%|████████████████████████████████████████████████████▎                             | 37/58 [18:08<06:34, 18.76s/it]

label date,  2016-06-26 00:00:00 2016-08-04 00:00:00
For area sts features, we spend 0.136749267578125 seconds.
For area name sts features, we spend 0.09372711181640625 seconds.
For area name sts features, we spend 0.10042738914489746 seconds.
For store id reserve visitors features, we spend 0.010091781616210938 seconds.
For store id reserve visitors features, we spend 0.04691958427429199 seconds.
For store id reserve visitors features, we spend 0.031244993209838867 seconds.
For store id sts features, we spend 0.33636927604675293 seconds.
For store id diff sts features, we spend 1.1108872890472412 seconds.
For store id diff sts features, we spend 0.8806850910186768 seconds.
For store id exp features, we spend 3.058558225631714 seconds.
For store id exp features, we spend 2.26994252204895 seconds.


 66%|█████████████████████████████████████████████████████▋                            | 38/58 [18:24<05:56, 17.82s/it]

label date,  2016-06-19 00:00:00 2016-07-28 00:00:00
For area sts features, we spend 0.1237032413482666 seconds.
For area name sts features, we spend 0.08440327644348145 seconds.
For area name sts features, we spend 0.09963726997375488 seconds.
For store id reserve visitors features, we spend 0.01694631576538086 seconds.
For store id reserve visitors features, we spend 0.03677654266357422 seconds.
For store id reserve visitors features, we spend 0.03130030632019043 seconds.
For store id sts features, we spend 0.34340715408325195 seconds.
For store id diff sts features, we spend 1.1144845485687256 seconds.
For store id diff sts features, we spend 0.879366397857666 seconds.
For store id exp features, we spend 2.859858751296997 seconds.
For store id exp features, we spend 2.273834705352783 seconds.


 67%|███████████████████████████████████████████████████████▏                          | 39/58 [18:39<05:25, 17.14s/it]

label date,  2016-06-12 00:00:00 2016-07-21 00:00:00
For area sts features, we spend 0.13360857963562012 seconds.
For area name sts features, we spend 0.09574484825134277 seconds.
For area name sts features, we spend 0.10277605056762695 seconds.
For store id reserve visitors features, we spend 0.01795339584350586 seconds.
For store id reserve visitors features, we spend 0.05385780334472656 seconds.
For store id reserve visitors features, we spend 0.033907175064086914 seconds.
For store id sts features, we spend 0.3227567672729492 seconds.
For store id diff sts features, we spend 1.1080148220062256 seconds.
For store id diff sts features, we spend 0.8546898365020752 seconds.
For store id exp features, we spend 2.8030202388763428 seconds.
For store id exp features, we spend 2.068869113922119 seconds.


 69%|████████████████████████████████████████████████████████▌                         | 40/58 [18:55<04:57, 16.54s/it]

label date,  2016-06-05 00:00:00 2016-07-14 00:00:00
For area sts features, we spend 0.12305307388305664 seconds.
For area name sts features, we spend 0.0937340259552002 seconds.
For area name sts features, we spend 0.09534168243408203 seconds.
For store id reserve visitors features, we spend 0.01695990562438965 seconds.
For store id reserve visitors features, we spend 0.04230499267578125 seconds.
For store id reserve visitors features, we spend 0.03124690055847168 seconds.
For store id sts features, we spend 0.3319823741912842 seconds.
For store id diff sts features, we spend 1.0813217163085938 seconds.
For store id diff sts features, we spend 0.8492603302001953 seconds.
For store id exp features, we spend 2.6535604000091553 seconds.
For store id exp features, we spend 1.9895520210266113 seconds.


 71%|█████████████████████████████████████████████████████████▉                        | 41/58 [19:09<04:32, 16.05s/it]

label date,  2016-05-29 00:00:00 2016-07-07 00:00:00
For area sts features, we spend 0.11773324012756348 seconds.
For area name sts features, we spend 0.0868062973022461 seconds.
For area name sts features, we spend 0.0977470874786377 seconds.
For store id reserve visitors features, we spend 0.015983104705810547 seconds.
For store id reserve visitors features, we spend 0.04985761642456055 seconds.
For store id reserve visitors features, we spend 0.030904293060302734 seconds.
For store id sts features, we spend 0.319561243057251 seconds.
For store id diff sts features, we spend 1.039029598236084 seconds.
For store id diff sts features, we spend 0.8121087551116943 seconds.
For store id exp features, we spend 2.5127811431884766 seconds.
For store id exp features, we spend 1.9493491649627686 seconds.


 72%|███████████████████████████████████████████████████████████▍                      | 42/58 [19:24<04:09, 15.60s/it]

label date,  2016-05-22 00:00:00 2016-06-30 00:00:00
For area sts features, we spend 0.12024617195129395 seconds.
For area name sts features, we spend 0.0804746150970459 seconds.
For area name sts features, we spend 0.1032099723815918 seconds.
For store id reserve visitors features, we spend 0.01699542999267578 seconds.
For store id reserve visitors features, we spend 0.04691958427429199 seconds.
For store id reserve visitors features, we spend 0.031242847442626953 seconds.
For store id sts features, we spend 0.3286778926849365 seconds.
For store id diff sts features, we spend 1.034055233001709 seconds.
For store id diff sts features, we spend 0.7801558971405029 seconds.
For store id exp features, we spend 2.358875036239624 seconds.
For store id exp features, we spend 1.8563482761383057 seconds.


 74%|████████████████████████████████████████████████████████████▊                     | 43/58 [19:38<03:47, 15.19s/it]

label date,  2016-05-15 00:00:00 2016-06-23 00:00:00
For area sts features, we spend 0.12644267082214355 seconds.
For area name sts features, we spend 0.07653474807739258 seconds.
For area name sts features, we spend 0.1023869514465332 seconds.
For store id reserve visitors features, we spend 0.017004966735839844 seconds.
For store id reserve visitors features, we spend 0.043519020080566406 seconds.
For store id reserve visitors features, we spend 0.03128314018249512 seconds.
For store id sts features, we spend 0.3317070007324219 seconds.
For store id diff sts features, we spend 1.0415003299713135 seconds.
For store id diff sts features, we spend 0.7743160724639893 seconds.
For store id exp features, we spend 2.404412269592285 seconds.
For store id exp features, we spend 1.9219672679901123 seconds.


 76%|██████████████████████████████████████████████████████████████▏                   | 44/58 [19:53<03:32, 15.20s/it]

label date,  2016-05-08 00:00:00 2016-06-16 00:00:00
For area sts features, we spend 0.1226339340209961 seconds.
For area name sts features, we spend 0.09178900718688965 seconds.
For area name sts features, we spend 0.10373091697692871 seconds.
For store id reserve visitors features, we spend 0.018947601318359375 seconds.
For store id reserve visitors features, we spend 0.05489826202392578 seconds.
For store id reserve visitors features, we spend 0.03291678428649902 seconds.
For store id sts features, we spend 0.36499595642089844 seconds.
For store id diff sts features, we spend 1.087148666381836 seconds.
For store id diff sts features, we spend 0.8784842491149902 seconds.
For store id exp features, we spend 2.6456265449523926 seconds.
For store id exp features, we spend 1.624523639678955 seconds.


 78%|███████████████████████████████████████████████████████████████▌                  | 45/58 [20:08<03:15, 15.05s/it]

label date,  2016-05-01 00:00:00 2016-06-09 00:00:00
For area sts features, we spend 0.11365437507629395 seconds.
For area name sts features, we spend 0.08477163314819336 seconds.
For area name sts features, we spend 0.1197357177734375 seconds.
For store id reserve visitors features, we spend 0.017936229705810547 seconds.
For store id reserve visitors features, we spend 0.05286049842834473 seconds.
For store id reserve visitors features, we spend 0.03390955924987793 seconds.
For store id sts features, we spend 0.32910799980163574 seconds.
For store id diff sts features, we spend 0.9882543087005615 seconds.
For store id diff sts features, we spend 0.7677180767059326 seconds.
For store id exp features, we spend 2.043654441833496 seconds.
For store id exp features, we spend 1.5666508674621582 seconds.


 79%|█████████████████████████████████████████████████████████████████                 | 46/58 [20:22<02:55, 14.65s/it]

label date,  2016-04-24 00:00:00 2016-06-02 00:00:00
For area sts features, we spend 0.10868239402770996 seconds.
For area name sts features, we spend 0.0827798843383789 seconds.
For area name sts features, we spend 0.0887908935546875 seconds.
For store id reserve visitors features, we spend 0.019900083541870117 seconds.
For store id reserve visitors features, we spend 0.05191326141357422 seconds.
For store id reserve visitors features, we spend 0.034906864166259766 seconds.
For store id sts features, we spend 0.34801745414733887 seconds.
For store id diff sts features, we spend 0.9906506538391113 seconds.
For store id diff sts features, we spend 0.7362551689147949 seconds.
For store id exp features, we spend 1.9513707160949707 seconds.
For store id exp features, we spend 1.491698980331421 seconds.


 81%|██████████████████████████████████████████████████████████████████▍               | 47/58 [20:35<02:37, 14.31s/it]

label date,  2016-04-17 00:00:00 2016-05-26 00:00:00
For area sts features, we spend 0.10935258865356445 seconds.
For area name sts features, we spend 0.07386207580566406 seconds.
For area name sts features, we spend 0.08893394470214844 seconds.
For store id reserve visitors features, we spend 0.018954038619995117 seconds.
For store id reserve visitors features, we spend 0.04363560676574707 seconds.
For store id reserve visitors features, we spend 0.0312497615814209 seconds.
For store id sts features, we spend 0.32544922828674316 seconds.
For store id diff sts features, we spend 0.9701931476593018 seconds.
For store id diff sts features, we spend 0.7388050556182861 seconds.
For store id exp features, we spend 1.8369145393371582 seconds.
For store id exp features, we spend 1.391049861907959 seconds.


 83%|███████████████████████████████████████████████████████████████████▊              | 48/58 [20:49<02:20, 14.00s/it]

label date,  2016-04-10 00:00:00 2016-05-19 00:00:00
For area sts features, we spend 0.10932517051696777 seconds.
For area name sts features, we spend 0.08587980270385742 seconds.
For area name sts features, we spend 0.07814216613769531 seconds.
For store id reserve visitors features, we spend 0.031245708465576172 seconds.
For store id reserve visitors features, we spend 0.04686260223388672 seconds.
For store id reserve visitors features, we spend 0.046126604080200195 seconds.
For store id sts features, we spend 0.32214879989624023 seconds.
For store id diff sts features, we spend 0.9395842552185059 seconds.
For store id diff sts features, we spend 0.7491462230682373 seconds.
For store id exp features, we spend 1.685614824295044 seconds.
For store id exp features, we spend 1.311722755432129 seconds.


 84%|█████████████████████████████████████████████████████████████████████▎            | 49/58 [21:02<02:03, 13.70s/it]

label date,  2016-04-03 00:00:00 2016-05-12 00:00:00
For area sts features, we spend 0.11255741119384766 seconds.
For area name sts features, we spend 0.07850193977355957 seconds.
For area name sts features, we spend 0.10393404960632324 seconds.
For store id reserve visitors features, we spend 0.018955707550048828 seconds.
For store id reserve visitors features, we spend 0.046723127365112305 seconds.
For store id reserve visitors features, we spend 0.040888071060180664 seconds.
For store id sts features, we spend 0.3179028034210205 seconds.
For store id diff sts features, we spend 0.9581284523010254 seconds.
For store id diff sts features, we spend 0.7059383392333984 seconds.
For store id exp features, we spend 1.6634573936462402 seconds.
For store id exp features, we spend 1.195340633392334 seconds.


 86%|██████████████████████████████████████████████████████████████████████▋           | 50/58 [21:15<01:47, 13.47s/it]

label date,  2016-03-27 00:00:00 2016-05-05 00:00:00
For area sts features, we spend 0.10970735549926758 seconds.
For area name sts features, we spend 0.07978558540344238 seconds.
For area name sts features, we spend 0.08781266212463379 seconds.
For store id reserve visitors features, we spend 0.01995682716369629 seconds.
For store id reserve visitors features, we spend 0.053858041763305664 seconds.
For store id reserve visitors features, we spend 0.03490447998046875 seconds.
For store id sts features, we spend 0.30643796920776367 seconds.
For store id diff sts features, we spend 0.9114043712615967 seconds.
For store id diff sts features, we spend 0.6852202415466309 seconds.
For store id exp features, we spend 1.4840610027313232 seconds.
For store id exp features, we spend 1.14418625831604 seconds.


 88%|████████████████████████████████████████████████████████████████████████          | 51/58 [21:27<01:32, 13.23s/it]

label date,  2016-03-20 00:00:00 2016-04-28 00:00:00
For area sts features, we spend 0.10871028900146484 seconds.
For area name sts features, we spend 0.08078265190124512 seconds.
For area name sts features, we spend 0.0857689380645752 seconds.
For store id reserve visitors features, we spend 0.021943092346191406 seconds.
For store id reserve visitors features, we spend 0.0618746280670166 seconds.
For store id reserve visitors features, we spend 0.039895057678222656 seconds.
For store id sts features, we spend 0.3258647918701172 seconds.
For store id diff sts features, we spend 0.8678996562957764 seconds.
For store id diff sts features, we spend 0.694629430770874 seconds.
For store id exp features, we spend 1.3404169082641602 seconds.
For store id exp features, we spend 1.0370523929595947 seconds.


 90%|█████████████████████████████████████████████████████████████████████████▌        | 52/58 [21:40<01:17, 12.99s/it]

label date,  2016-03-13 00:00:00 2016-04-21 00:00:00
For area sts features, we spend 0.10471916198730469 seconds.
For area name sts features, we spend 0.07680368423461914 seconds.
For area name sts features, we spend 0.08177757263183594 seconds.
For store id reserve visitors features, we spend 0.021971464157104492 seconds.
For store id reserve visitors features, we spend 0.058588266372680664 seconds.
For store id reserve visitors features, we spend 0.03991961479187012 seconds.
For store id sts features, we spend 0.3149268627166748 seconds.
For store id diff sts features, we spend 0.8936851024627686 seconds.
For store id diff sts features, we spend 0.6370184421539307 seconds.
For store id exp features, we spend 1.225912094116211 seconds.
For store id exp features, we spend 0.9787020683288574 seconds.


 91%|██████████████████████████████████████████████████████████████████████████▉       | 53/58 [21:52<01:04, 12.83s/it]

label date,  2016-03-06 00:00:00 2016-04-14 00:00:00
For area sts features, we spend 0.1691277027130127 seconds.
For area name sts features, we spend 0.12497496604919434 seconds.
For area name sts features, we spend 0.09049272537231445 seconds.
For store id reserve visitors features, we spend 0.01565718650817871 seconds.
For store id reserve visitors features, we spend 0.07927608489990234 seconds.
For store id reserve visitors features, we spend 0.04686713218688965 seconds.
For store id sts features, we spend 0.3502366542816162 seconds.
For store id diff sts features, we spend 0.8716831207275391 seconds.
For store id diff sts features, we spend 0.6190669536590576 seconds.
For store id exp features, we spend 1.1291053295135498 seconds.
For store id exp features, we spend 0.8430182933807373 seconds.


 93%|████████████████████████████████████████████████████████████████████████████▎     | 54/58 [22:04<00:50, 12.65s/it]

label date,  2016-02-28 00:00:00 2016-04-07 00:00:00
For area sts features, we spend 0.0937352180480957 seconds.
For area name sts features, we spend 0.06923985481262207 seconds.
For area name sts features, we spend 0.07808113098144531 seconds.
For store id reserve visitors features, we spend 0.031244516372680664 seconds.
For store id reserve visitors features, we spend 0.04690408706665039 seconds.
For store id reserve visitors features, we spend 0.031239032745361328 seconds.
For store id sts features, we spend 0.3178846836090088 seconds.
For store id diff sts features, we spend 0.8298208713531494 seconds.
For store id diff sts features, we spend 0.6412744522094727 seconds.
For store id exp features, we spend 1.021803379058838 seconds.
For store id exp features, we spend 0.7862379550933838 seconds.


 95%|█████████████████████████████████████████████████████████████████████████████▊    | 55/58 [22:16<00:37, 12.36s/it]

label date,  2016-02-21 00:00:00 2016-03-31 00:00:00
For area sts features, we spend 0.0865945816040039 seconds.
For area name sts features, we spend 0.0545499324798584 seconds.
For area name sts features, we spend 0.08331704139709473 seconds.
For store id reserve visitors features, we spend 0.015620946884155273 seconds.
For store id reserve visitors features, we spend 0.0624842643737793 seconds.
For store id reserve visitors features, we spend 0.04191946983337402 seconds.
For store id sts features, we spend 0.31394195556640625 seconds.
For store id diff sts features, we spend 0.8395860195159912 seconds.
For store id diff sts features, we spend 0.5466623306274414 seconds.
For store id exp features, we spend 0.8837182521820068 seconds.
For store id exp features, we spend 0.6993393898010254 seconds.


 97%|███████████████████████████████████████████████████████████████████████████████▏  | 56/58 [22:28<00:24, 12.09s/it]

label date,  2016-02-14 00:00:00 2016-03-24 00:00:00
For area sts features, we spend 0.10270071029663086 seconds.
For area name sts features, we spend 0.06881570816040039 seconds.
For area name sts features, we spend 0.0759575366973877 seconds.
For store id reserve visitors features, we spend 0.022934913635253906 seconds.
For store id reserve visitors features, we spend 0.05386471748352051 seconds.
For store id reserve visitors features, we spend 0.03685903549194336 seconds.
For store id sts features, we spend 0.3054957389831543 seconds.
For store id diff sts features, we spend 0.7711505889892578 seconds.
For store id diff sts features, we spend 0.5735859870910645 seconds.
For store id exp features, we spend 0.7718179225921631 seconds.
For store id exp features, we spend 0.6054859161376953 seconds.


 98%|████████████████████████████████████████████████████████████████████████████████▌ | 57/58 [22:39<00:11, 11.87s/it]

label date,  2016-02-07 00:00:00 2016-03-17 00:00:00
For area sts features, we spend 0.10369634628295898 seconds.
For area name sts features, we spend 0.06685042381286621 seconds.
For area name sts features, we spend 0.062486886978149414 seconds.
For store id reserve visitors features, we spend 0.03124380111694336 seconds.
For store id reserve visitors features, we spend 0.04686307907104492 seconds.
For store id reserve visitors features, we spend 0.032396793365478516 seconds.
For store id sts features, we spend 0.2933840751647949 seconds.
For store id diff sts features, we spend 0.7631344795227051 seconds.
For store id diff sts features, we spend 0.5026836395263672 seconds.
For store id exp features, we spend 0.6685564517974854 seconds.
For store id exp features, we spend 0.5014147758483887 seconds.


100%|██████████████████████████████████████████████████████████████████████████████████| 58/58 [22:50<00:00, 23.64s/it]


In [42]:
for i in tqdm(range(1, 6)): 
    tr_label_date      = parse('2017-04-23') - timedelta(7 * 6 - 7 * i) 
    df_tr_tmp          = get_features(df_meta, air_reserve, hpg_air_reserve, days=42 - 7 * i, label_st_date=tr_label_date)    
    df_tr_feas         = pd.concat([df_tr_feas, df_tr_tmp], axis=0,ignore_index=True) 
    tr_fea_list.append(df_tr_tmp)

val_label_date    = parse('2017-04-23')  - timedelta(7*6) 
te_fea_date       = parse('2017-04-23')
 
df_val = get_features(df_meta, air_reserve, hpg_air_reserve, label_st_date=val_label_date)    
df_te  = get_features(df_meta, air_reserve, hpg_air_reserve, label_st_date=te_fea_date, is_te=True)      

## df_date['visit_date'] = pd.to_datetime(df_date['visit_date'])
df_date['visit_date'] = df_date['visit_date'].astype(str)
df_tr                 = df_tr_feas.merge(df_date, on=['visit_date'], how='left')
df_tr                 = df_tr.merge(df_storeid,   on=['air_store_id'], how='left')  
df_tr['visit_date']   = pd.to_datetime(df_tr['visit_date'])
df_tr['day_to_now']   = df_tr['visit_date'].apply(lambda x: (parse('2017-04-22') - (x)).days) 
 
df_val = df_val.merge(df_date,       on=['visit_date'], how='left')
df_val = df_val.merge(df_storeid,    on=['air_store_id'], how='left')  
 
df_te  = df_te.merge(df_date,       on=['visit_date'], how='left') 
df_te  = df_te.merge(df_storeid,    on=['air_store_id'], how='left')

  0%|                                                                                            | 0/5 [00:00<?, ?it/s]

label date,  2017-03-19 00:00:00 2017-04-23 00:00:00
For area sts features, we spend 0.3222973346710205 seconds.
For area name sts features, we spend 0.21712517738342285 seconds.
For area name sts features, we spend 0.21384453773498535 seconds.
For store id reserve visitors features, we spend 0.04041171073913574 seconds.
For store id reserve visitors features, we spend 0.0781104564666748 seconds.
For store id reserve visitors features, we spend 0.056084632873535156 seconds.
For store id sts features, we spend 0.7814249992370605 seconds.
For store id diff sts features, we spend 3.429210901260376 seconds.
For store id diff sts features, we spend 3.0542123317718506 seconds.
For store id exp features, we spend 14.691834688186646 seconds.
For store id exp features, we spend 11.932658195495605 seconds.


 20%|████████████████▊                                                                   | 1/5 [00:47<03:11, 47.96s/it]

label date,  2017-03-26 00:00:00 2017-04-23 00:00:00
For area sts features, we spend 0.2662825584411621 seconds.
For area name sts features, we spend 0.2343745231628418 seconds.
For area name sts features, we spend 0.2092733383178711 seconds.
For store id reserve visitors features, we spend 0.031969547271728516 seconds.
For store id reserve visitors features, we spend 0.06582236289978027 seconds.
For store id reserve visitors features, we spend 0.04288601875305176 seconds.
For store id sts features, we spend 0.7626745700836182 seconds.
For store id diff sts features, we spend 3.453118324279785 seconds.
For store id diff sts features, we spend 3.137575387954712 seconds.
For store id exp features, we spend 15.348093509674072 seconds.
For store id exp features, we spend 11.249058485031128 seconds.


 40%|█████████████████████████████████▌                                                  | 2/5 [01:34<02:22, 47.64s/it]

label date,  2017-04-02 00:00:00 2017-04-23 00:00:00
For area sts features, we spend 0.26803040504455566 seconds.
For area name sts features, we spend 0.2054598331451416 seconds.
For area name sts features, we spend 0.213423490524292 seconds.
For store id reserve visitors features, we spend 0.03047013282775879 seconds.
For store id reserve visitors features, we spend 0.059883832931518555 seconds.
For store id reserve visitors features, we spend 0.039880990982055664 seconds.
For store id sts features, we spend 0.7940318584442139 seconds.
For store id diff sts features, we spend 3.4644792079925537 seconds.
For store id diff sts features, we spend 3.10251522064209 seconds.
For store id exp features, we spend 15.454034328460693 seconds.
For store id exp features, we spend 12.171231985092163 seconds.


 60%|██████████████████████████████████████████████████▍                                 | 3/5 [02:23<01:35, 47.80s/it]

label date,  2017-04-09 00:00:00 2017-04-23 00:00:00
For area sts features, we spend 0.2622971534729004 seconds.
For area name sts features, we spend 0.2074596881866455 seconds.
For area name sts features, we spend 0.20934510231018066 seconds.
For store id reserve visitors features, we spend 0.012047052383422852 seconds.
For store id reserve visitors features, we spend 0.06254124641418457 seconds.
For store id reserve visitors features, we spend 0.04686379432678223 seconds.
For store id sts features, we spend 0.75888991355896 seconds.
For store id diff sts features, we spend 3.5566561222076416 seconds.
For store id diff sts features, we spend 3.1573891639709473 seconds.
For store id exp features, we spend 15.59132981300354 seconds.
For store id exp features, we spend 11.714913606643677 seconds.


 80%|███████████████████████████████████████████████████████████████████▏                | 4/5 [03:10<00:47, 47.75s/it]

label date,  2017-04-16 00:00:00 2017-04-23 00:00:00
For area sts features, we spend 0.26612162590026855 seconds.
For area name sts features, we spend 0.20562219619750977 seconds.
For area name sts features, we spend 0.2125861644744873 seconds.
For store id reserve visitors features, we spend 0.02498006820678711 seconds.
For store id reserve visitors features, we spend 0.06253457069396973 seconds.
For store id reserve visitors features, we spend 0.031235694885253906 seconds.
For store id sts features, we spend 0.7756028175354004 seconds.
For store id diff sts features, we spend 3.6182496547698975 seconds.
For store id diff sts features, we spend 3.2113964557647705 seconds.
For store id exp features, we spend 15.792046785354614 seconds.
For store id exp features, we spend 12.107356786727905 seconds.


100%|████████████████████████████████████████████████████████████████████████████████████| 5/5 [03:59<00:00, 47.87s/it]


label date,  2017-03-12 00:00:00 2017-04-20 00:00:00
For area sts features, we spend 0.2505829334259033 seconds.
For area name sts features, we spend 0.1954958438873291 seconds.
For area name sts features, we spend 0.20346927642822266 seconds.
For store id reserve visitors features, we spend 0.04492449760437012 seconds.
For store id reserve visitors features, we spend 0.07081985473632812 seconds.
For store id reserve visitors features, we spend 0.0488734245300293 seconds.
For store id sts features, we spend 0.756033182144165 seconds.
For store id diff sts features, we spend 3.4510409832000732 seconds.
For store id diff sts features, we spend 3.134182929992676 seconds.
For store id exp features, we spend 14.393874883651733 seconds.
For store id exp features, we spend 10.774326801300049 seconds.
label date,  2017-04-23 00:00:00 2017-06-01 00:00:00
For area sts features, we spend 0.25017881393432617 seconds.
For area name sts features, we spend 0.22463035583496094 seconds.
For area name s

In [43]:
other_cols   = [col for col in df_tr_feas.columns if df_tr_feas[col].dtypes!='object' and col not in ['air_store_id','visit_date','air_store_id_weekday','air_store_id_holiday','air_store_id_weekday_holiday','visitors','weekday',\
                                                                'air_store_id_visit_date','air_area_name','air_area_name_weekday']]
date_cols    = [col for col in df_date.columns if col not in ['visit_date']]
storeid_cols = [col for col in df_storeid.columns if col not in ['air_store_id']]

le = LabelEncoder()
le.fit(list(set(df_tr['air_store_id'].values)) + list(set(df_val['air_store_id'].values)) + list(set(df_te['air_store_id'].values)))   

df_tr['air_store_id_lb']  = le.transform(df_tr['air_store_id'].values)
df_val['air_store_id_lb'] = le.transform(df_val['air_store_id'].values)
df_te['air_store_id_lb']  = le.transform(df_te['air_store_id'].values)

In [44]:
label_col    = 'visitors' 
feature_cols = date_cols + storeid_cols + other_cols + ['air_store_id_lb']  
feature_cols = [col for col in feature_cols if col not in ['day_to_last_holiday', 'day_to_next_holiday']]

models1, feature_importance1, test_pred1 = lgb_model_test_weight_timebased(df_tr=df_tr,
                                                                           df_val=df_val,
                                                                           features=feature_cols,
                                                                           ws=[1.0],
                                                                           test_df=df_te)

submit = df_te[['air_store_id', 'visit_date']].copy()
submit['id'] = submit['air_store_id'] + '_' + submit['visit_date'].astype(str) 
submit['visitors'] =  np.expm1(test_pred1[0])
submit['visitors'] = submit['visitors'] #.astype(int)
submit.loc[submit['visitors'] == 0, 'visitors'] = 1
submit[['id','visitors']].to_csv('../sub/sub.csv', index = None)

Training until validation scores don't improve for 1500 rounds
[100]	training's rmse: 0.512099	valid_1's rmse: 0.499047
[200]	training's rmse: 0.490999	valid_1's rmse: 0.480723
[300]	training's rmse: 0.477733	valid_1's rmse: 0.467757
[400]	training's rmse: 0.467846	valid_1's rmse: 0.457847
[500]	training's rmse: 0.459713	valid_1's rmse: 0.449177
[600]	training's rmse: 0.452696	valid_1's rmse: 0.441954
[700]	training's rmse: 0.446322	valid_1's rmse: 0.435499
Did not meet early stopping. Best iteration is:
[700]	training's rmse: 0.446322	valid_1's rmse: 0.435499
