In [1]:
import sys
from datetime import datetime
from os.path import join
from warnings import warn

import numpy as np
import pandas as pd
import scipy as sp

from utils import *

In [2]:
def resolve_dataset_dir(argv):
    """Return the dataset directory selected by the command-line arguments.

    Parameters
    ----------
    argv : sequence of str
        Argument vector in ``sys.argv`` layout (program name first).

    Returns
    -------
    str
        ``../dataset/<name>`` where ``<name>`` is ``argv[1]``. When ``argv[1]``
        is ``'-f'`` or is missing altogether, the default dataset ``'11'``
        is used.
    """
    # A missing argument is handled like '-f' instead of raising IndexError.
    # NOTE(review): '-f' is presumably what a Jupyter kernel receives as its
    # first argument (connection-file flag) — confirm.
    dataset_name = argv[1] if len(argv) > 1 else '-f'
    if dataset_name == '-f':
        dataset_name = '11'
    return join('..', 'dataset', dataset_name)


# `dir_arg` keeps its original meaning (the raw first argument); it falls back
# to '-f' when the process was started without any argument.
dir_arg = sys.argv[1] if len(sys.argv) > 1 else '-f'
file_dir = resolve_dataset_dir(sys.argv)

In [3]:
# Load the two input frames from the selected dataset directory.
# NOTE(review): the 'base_feauture.pkl' spelling presumably matches the file
# on disk — do not "fix" it without renaming the file.
# NOTE(review): pickle files can execute code on load; only read trusted files.
train_df = pd.read_pickle(join(file_dir, 'base_feauture.pkl'))
sample = pd.read_pickle(join(file_dir, 'basicroomid.pkl'))

# Latest order date in the data; only the commented-out span feature uses it.
now_date = train_df['orderdate'].max().date()
print(datetime.now(), now_date)

# print_shape is a star-imported helper; judging by the saved output it prints
# the unique count per key and returns one value per requested column.
(uid_shape,
 hotelid_shape,
 basicroomid_shape,
 roomid_shape) = print_shape(train_df, ['uid', 'hotelid', 'basicroomid', 'roomid'])

2017-07-31 07:14:19.866835 2017-06-11
2017-07-31 07:14:19.867063
--------------------
uid uniuqe shape 3552
hotelid uniuqe shape 3194
basicroomid uniuqe shape 19542
roomid uniuqe shape 121802
--------------------


In [4]:
# Destination pickle for the `sample` frame built by this notebook
# (written by the final cell).
feature_path = join(file_dir, 'basic_room_feature.pkl')
# Timestamped progress marker.
print(datetime.now(), 'begin', feature_path)

2017-07-31 07:14:19.892234 begin ../dataset/11/basic_room_feature.pkl


## 基本分类计数特征

In [5]:
# Negative areas are treated as missing: overwrite them with NaN in place.
for area_col in ('basic_minarea', 'basic_maxarea'):
    train_df.loc[train_df[area_col] < 0, area_col] = np.nan

In [6]:
# Carry the min/max area columns over to `sample`, keyed by basicroomid.
# add_column is a star-imported helper; its exact merge logic is not visible here.
for area_name in ('basic_minarea', 'basic_maxarea'):
    sample = add_column(train_df, sample, 'basicroomid', area_name)

In [7]:
# Basic-room ratio columns that are copied onto `sample` in the next cell.
basic_cols = [
    'basic_week_ordernum_ratio',
    'basic_recent3_ordernum_ratio',
    'basic_comment_ratio',
    'basic_30days_ordnumratio',
    'basic_30days_realratio',
]

In [8]:
# Copy every ratio column listed in `basic_cols` onto `sample`, one at a time,
# keyed by basicroomid.
for ratio_col in basic_cols:
    sample = add_column(train_df, sample, 'basicroomid', ratio_col)

In [9]:
# Count features for roomservice_2 .. roomservice_8.
# NOTE(review): roomservice_1 is not included (the roomtag loop does start
# at 1) — confirm the skip is intentional.
for service_no in range(2, 9):
    sample = extract_feature_count(
        'basicroomid', 'roomservice_%d' % service_no, train_df, sample)

In [10]:
# Count features for roomtag_1 .. roomtag_4.
for tag_no in range(1, 5):
    sample = extract_feature_count(
        'basicroomid', 'roomtag_%d' % tag_no, train_df, sample)

In [11]:
# Same count helper as above, applied to the roomid column per basicroomid.
# (extract_feature_count is star-imported; its exact output is not visible here.)
sample = extract_feature_count('basicroomid', 'roomid', train_df, sample)

In [12]:
# get_corr(train_df, sample, 'basicroomid')

## 数值统计特征

### 价格

In [13]:
# Statistics requested for the price_deduct describe features.
use_describe = [
    'max',
    'min',
    '75%',
    'mean',
    'std',
]

In [14]:
# Derived column: price_deduct plus returnvalue, stored on train_df in place.
train_df['price_real'] = train_df['price_deduct'].add(train_df['returnvalue'])

In [15]:
# Per-basicroomid describe statistics for the three price columns; each
# column gets its own list of statistics, processed in the original order.
for price_col, price_stats in (
        ('price_deduct', use_describe),
        ('price_real', ['max', 'mean', 'min', '75%']),
        ('returnvalue', ['max', 'min', '75%']),
):
    sample = extract_value_describe_feature(
        'basicroomid', price_col, train_df, sample, price_stats)

## 子房型的统计特征 

In [16]:
# Sub-room ratio columns summarised per basicroomid in the following cells.
room_cols = [
    'room_30days_ordnumratio',
    'room_30days_realratio',
]

In [17]:
# Describe statistics of room_30days_ordnumratio per basicroomid.
# NOTE(review): the saved output shows a warning during this call ("may be
# error when meet percent") — check whether the percentile statistics
# requested here are reliable for this column.
sample = extract_value_describe_feature('basicroomid', 'room_30days_ordnumratio', train_df, sample, [
    'max', 'min', '25%', '50%', '75%', 'std', 'mean'
])

  warn('column  {} is  may be error when meet percent max:{}'.format(c, c_max))


In [18]:
# Describe statistics of room_30days_realratio per basicroomid
# (no '50%' or 'std' here, unlike the ordnumratio call).
sample = extract_value_describe_feature('basicroomid', 'room_30days_realratio',
                                        train_df, sample,
                                        ['min', '25%', '75%', 'mean', 'max'])

In [19]:
# get_corr(train_df, sample, 'basicroomid').tail(10)

## 历史价格统计特征

In [20]:
# Statistics requested for the price-difference features.
price_use_describe = [
    'max',
    'std',
    'mean',
    'min',
]

In [21]:
# Template for the difference columns: 'basicroomid_diff_<source column>'.
# The doubled braces leave one '{}' placeholder after the first format pass.
name_fmt = '{}_diff_{{}}'.format('basicroomid')

# One difference-column name per "last order" price column.
price_diff_name, hotel_minprice_diff_name, basic_minprice_diff_name = (
    name_fmt.format(lastord_col)
    for lastord_col in ('price_last_lastord',
                        'hotel_minprice_lastord',
                        'basic_minprice_lastord')
)

In [22]:
# Difference between the current price_deduct and each "last order" price,
# written onto train_df in place.
for diff_col, lastord_price_col in (
        (price_diff_name, 'price_last_lastord'),
        (hotel_minprice_diff_name, 'hotel_minprice_lastord'),
        (basic_minprice_diff_name, 'basic_minprice_lastord'),
):
    train_df[diff_col] = train_df['price_deduct'] - train_df[lastord_price_col]

In [23]:
# Describe statistics (price_use_describe) of the price_deduct minus
# price_last_lastord difference, per basicroomid.
sample = extract_value_describe_feature('basicroomid', price_diff_name, train_df, sample, price_use_describe)

In [24]:
# Same statistics for the two min-price differences (hotel first, then basic).
for minprice_diff_col in (hotel_minprice_diff_name, basic_minprice_diff_name):
    sample = extract_value_describe_feature(
        'basicroomid', minprice_diff_col, train_df, sample, price_use_describe)

In [25]:
# get_corr(train_df, sample, 'basicroomid').tail(20)

Unnamed: 0,orderlabel,basicroomid_basic_minarea,basicroomid_basic_maxarea,basicroomid_basic_week_ordernum_ratio,basicroomid_basic_recent3_ordernum_ratio,basicroomid_basic_comment_ratio,basicroomid_basic_30days_ordnumratio,basicroomid_basic_30days_realratio,basicroomid__roomservice_2_count,basicroomid__roomservice_3_count,...,basicroomid__room_30days_realratio_mean,basicroomid__basicroomid_diff_price_last_lastord_max,basicroomid__basicroomid_diff_price_last_lastord_std,basicroomid__basicroomid_diff_price_last_lastord_mean,basicroomid__basicroomid_diff_hotel_minprice_lastord_max,basicroomid__basicroomid_diff_hotel_minprice_lastord_std,basicroomid__basicroomid_diff_hotel_minprice_lastord_mean,basicroomid__basicroomid_diff_basic_minprice_lastord_max,basicroomid__basicroomid_diff_basic_minprice_lastord_std,basicroomid__basicroomid_diff_basic_minprice_lastord_mean
basicroomid__room_30days_ordnumratio_max,0.157762,-0.237389,-0.238099,0.26057,0.27763,0.020643,0.284655,-0.118055,-0.076432,-0.139054,...,-0.27236,-0.20188,-0.170366,-0.190139,-0.217255,-0.164067,-0.218263,-0.203743,-0.170074,-0.193277
basicroomid__room_30days_ordnumratio_min,0.115639,-0.156012,-0.158047,0.300966,0.27186,0.026919,0.388086,-0.041593,-0.127787,-0.281253,...,-0.1147,-0.151586,-0.174529,-0.115056,-0.166249,-0.175277,-0.136853,-0.15395,-0.173268,-0.119028
basicroomid__room_30days_ordnumratio_25,0.130208,-0.180617,-0.182434,0.293685,0.280553,0.027065,0.368902,-0.058279,-0.136768,-0.30709,...,-0.150721,-0.179945,-0.201166,-0.140391,-0.196542,-0.202142,-0.165384,-0.182472,-0.199702,-0.144724
basicroomid__room_30days_ordnumratio_50,0.142295,-0.206641,-0.208218,0.275615,0.280001,0.026259,0.339576,-0.077515,-0.146833,-0.301376,...,-0.191492,-0.204983,-0.216848,-0.16679,-0.222701,-0.218433,-0.193848,-0.207657,-0.215683,-0.171295
basicroomid__room_30days_ordnumratio_75,0.153079,-0.231321,-0.232345,0.296747,0.300219,0.025396,0.330843,-0.098411,-0.132048,-0.240549,...,-0.234286,-0.215936,-0.205667,-0.187211,-0.233236,-0.205808,-0.215653,-0.218222,-0.204972,-0.191065
basicroomid__room_30days_ordnumratio_std,0.120312,-0.184889,-0.183739,0.107014,0.153577,0.00345,0.070364,-0.104611,-0.036916,-0.042336,...,-0.238654,-0.155899,-0.115909,-0.154004,-0.163353,-0.108251,-0.171102,-0.156716,-0.11563,-0.15563
basicroomid__room_30days_ordnumratio_mean,0.154943,-0.225582,-0.226971,0.308952,0.308532,0.02739,0.366929,-0.089042,-0.137739,-0.278417,...,-0.216769,-0.213397,-0.217104,-0.178749,-0.231515,-0.216486,-0.207714,-0.215973,-0.215987,-0.183123
basicroomid__room_30days_realratio_min,-0.064571,0.493293,0.487357,-0.03259,-0.04063,-0.007193,-0.010834,0.730975,0.041864,-0.017424,...,0.849956,0.33463,0.17265,0.383294,0.351401,0.177825,0.416416,0.335902,0.17326,0.385368
basicroomid__room_30days_realratio_25,-0.070246,0.5094,0.50386,-0.036168,-0.045436,-0.00775,-0.012845,0.727003,0.064112,0.010077,...,0.864099,0.359505,0.195768,0.401417,0.3786,0.200487,0.438123,0.361068,0.196108,0.403988
basicroomid__room_30days_realratio_75,-0.079505,0.524146,0.518877,-0.044413,-0.056616,-0.010064,-0.019363,0.700591,0.102073,0.101807,...,0.865685,0.402507,0.258439,0.423873,0.422548,0.262674,0.462297,0.40449,0.257474,0.427464


## 历史时间间隔统计特征

In [32]:
# Key column and name of the time-span feature; both are referenced only by
# the commented-out span cell that follows.
t = 'basicroomid'
span_name = '{}_span'.format('basicroomid')

In [33]:
# train_df[span_name] = (now_date - train_df.orderdate_lastord).dt.days

# sample = extract_value_describe_feature(t, span_name, train_df, sample, ['max', 'min', 'mean'])

In [34]:
# get_corr(train_df, sample, 'basicroomid')

## 上次订购的特征 

In [35]:
# Rebind basic_cols to every train_df column that starts with 'basic' and
# does not end with 'area'. This replaces the hand-written list defined
# earlier, and by this point it also picks up the diff columns added above.
basic_cols = [
    column_name
    for column_name in train_df.columns
    if column_name.startswith('basic') and not column_name.endswith('area')
]
basic_cols

['basicroomid',
 'basic_week_ordernum_ratio',
 'basic_recent3_ordernum_ratio',
 'basic_comment_ratio',
 'basic_30days_ordnumratio',
 'basic_30days_realratio',
 'basicroomid_lastord',
 'basic_minprice_lastord',
 'basicroomid_diff_price_last_lastord',
 'basicroomid_diff_hotel_minprice_lastord',
 'basicroomid_diff_basic_minprice_lastord']

In [36]:
# Columns kept when building the de-duplicated frame for the last-order features.
use_cols = [
    'uid',
    'orderdate_lastord',
    'hotelid',
    'basicroomid_lastord',
    'basicroomid',
]

In [37]:
# Distinct combinations of the `use_cols` columns.
basic_feature = train_df.loc[:, use_cols].drop_duplicates()

In [38]:
# Sanity check: basic_feature should have as many rows as there are distinct
# (uid, basicroomid) pairs. A mismatch only warns; it does not stop the run.
if len(train_df.drop_duplicates(subset=['uid', 'basicroomid'])) != len(basic_feature):
    warn('[uid, basicroomid].shape[0] != basic_feature.shape[0]')

In [39]:
# Every train_df column whose name ends with 'lastord'.
cols = [
    column_name
    for column_name in train_df.columns
    if column_name.endswith('lastord')
]

In [40]:
# Inspect return_lastord on rows that have no last-order date. The saved
# output shows a single value (200.0) for all such rows.
# One .loc call selects rows and column together; 'return_lastord' is one of
# the '*lastord' columns the original sub-selected through `cols`.
train_df.loc[train_df['orderdate_lastord'].isnull(), 'return_lastord'].value_counts()

200.0    11080
Name: return_lastord, dtype: int64

In [41]:
# Star-imported helper; presumably flags basic rooms whose
# basicroomid_lastord is missing — TODO confirm against utils.
sample = extract_lastord_is_nan(basic_feature, sample, 'basicroomid', 'basicroomid_lastord')

In [42]:
# Star-imported helper; presumably flags whether basicroomid equals the
# basicroomid of the last order — TODO confirm against utils.
sample = extract_is_lastord(basic_feature, sample, 'basicroomid', 'basicroomid_lastord')

In [43]:
# Persist the assembled feature frame to the path announced at the start.
sample.to_pickle(feature_path)

# Timestamped completion marker (pairs with the 'begin' message above).
print(datetime.now(), 'save to', feature_path)

2017-07-31 07:22:14.797761 save to ../dataset/11/basic_room_feature.pkl
