In [12]:
import sys
from datetime import datetime
from os.path import join
from warnings import warn

import numpy as np
import pandas as pd
import scipy as sp

from utils import *

In [13]:
# sys.argv[1] = 'test'

In [14]:
dir_arg = sys.argv[1]
if dir_arg == '-f':
    file_dir = join('..', 'dataset', 'train')
else:
    file_dir = join('..', 'dataset',  dir_arg)

In [15]:
train_df = pd.read_pickle(join(file_dir, 'base_feauture.pkl'))

sample = pd.read_pickle(join(file_dir, 'basicroomid.pkl'))

now_date = train_df.orderdate.max().date()
print(datetime.now(), now_date)

uid_shape, hotelid_shape, basicroomid_shape, roomid_shape = print_shape(
    train_df, ['uid', 'hotelid', 'basicroomid', 'roomid'])

2017-08-05 23:00:10.445098 2017-06-12
2017-08-05 23:00:10.445274
--------------------
uid uniuqe shape 34632
hotelid uniuqe shape 34632
basicroomid uniuqe shape 192816
roomid uniuqe shape 974533
--------------------


In [4]:
feature_path = join(file_dir, 'basic_room_feature.pkl')
print(datetime.now(), 'begin', feature_path)

2017-08-05 22:57:56.486219 begin ../dataset/train/basic_room_feature.pkl


## 基本分类计数特征

In [5]:
train_df.loc[train_df.basic_minarea<0, 'basic_minarea'] = np.nan
train_df.loc[train_df.basic_maxarea<0, 'basic_maxarea'] = np.nan

In [6]:
sample = add_column(train_df, sample, 'basicroomid', 'basic_minarea')
sample = add_column(train_df, sample, 'basicroomid', 'basic_maxarea')

In [7]:
basic_cols = [
    'basic_week_ordernum_ratio', 'basic_recent3_ordernum_ratio',
    'basic_comment_ratio', 'basic_30days_ordnumratio', 'basic_30days_realratio'
]

In [10]:
# for col in basic_cols:
#     sample = add_column(train_df, sample, 'basicroomid', col)

In [11]:
for i in range(1, 8):
    f = 'roomservice_%d' % (i+1)
    sample = extract_feature_count('basicroomid', f, train_df, sample)

In [12]:
for i in range(4):
    f = 'roomtag_%d' % (i+1)
    sample = extract_feature_count('basicroomid', f, train_df, sample)

In [13]:
sample = extract_feature_count('basicroomid', 'roomid', train_df, sample)

In [14]:
# get_corr(train_df, sample, 'basicroomid')

## 数值统计特征

### 价格

In [15]:
use_describe = ['max', 'min', 'median', 'mean', 'std', 'nunique']

In [16]:
train_df['price_real'] = train_df['price_deduct'] + train_df['returnvalue']

In [17]:
sample = extract_value_describe_feature('basicroomid', 'price_deduct', train_df, sample, use_describe)

sample = extract_value_describe_feature('basicroomid', 'price_real', train_df, sample, ['max', 'mean', 'min', 'median'])

sample = extract_value_describe_feature('basicroomid', 'returnvalue', train_df, sample,['max', 'min', 'median'] )

### 价格排序

In [18]:
def df_min_orderid(df):
    add = pd.DataFrame(df.groupby(["orderid"]).price_deduct.min()).reset_index()
    add.columns = ["orderid", "orderid_price_deduct_min"]
    df = df.merge(add, on=["orderid"], how="left")
    df = press_date(df, ['orderid_price_deduct_min'])
    return df

In [19]:
def df_rank_mean(df):
    add = pd.DataFrame(df.groupby(["basicroomid"]).orderid_price_deduct_min_rank.mean()).reset_index()
    add.columns = ["basicroomid","orderid_price_deduct_min_rank_mean"]
    df = df.merge(add, on=["basicroomid"], how="left")
    df = press_date(df, ['orderid_price_deduct_min_rank_mean'])
    return df

In [20]:
# train_df = df_min_orderid(df)

# train_df["orderid_price_deduct_min_rank"] = train_df['orderid_price_deduct_min'].groupby(train_df['orderid']).rank()

# train_df["orderid_price_deduct_min_rank"]

# train_df = df_rank_mean(train_df)

In [21]:
# sample['basicroomid__price_deduct_min_rank'] = sample.basicroomid__price_deduct_min.rank()

## 子房型rank统计特征

In [22]:
sample = extract_value_describe_feature('basicroomid', 'rank',
                                        train_df, sample,
                                        ['max', 'min', 'median', 'mean', 'std', 'nunique'])

In [23]:
# get_corr(train_df, sample, 'basicroomid')

## 物理房型统计特征 

In [16]:
tdf = train_df[[
    'basicroomid', 'orderdate', 'basic_week_ordernum_ratio',
    'basic_recent3_ordernum_ratio', 'basic_comment_ratio',
    'basic_30days_ordnumratio', 'basic_30days_realratio'
]]

tdf.orderdate = tdf.orderdate.dt.weekday

ntdf = tdf[[
    'basicroomid', 'orderdate', 'basic_week_ordernum_ratio',
    'basic_recent3_ordernum_ratio', 'basic_comment_ratio',
    'basic_30days_ordnumratio', 'basic_30days_realratio'
]].groupby(['basicroomid', 'orderdate']).mean().reset_index()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self[name] = value


In [15]:
stat_cols = [
    'basic_week_ordernum_ratio', 'basic_recent3_ordernum_ratio',
    'basic_comment_ratio', 'basic_30days_ordnumratio', 'basic_30days_realratio'
]

In [24]:
use_describe = ['max', 'mean', 'mad', 'var', 'median', 'sum']

In [26]:
# sample = extract_value_describe_feature('basicroomid', 'basic_week_ordernum_ratio_var', ntdf, sample,
#                                         ['max', 'mean', 'median', 'sum'])

In [18]:
# for c in stat_cols:
#     sample = extract_value_describe_feature(
#         'basicroomid', c, ntdf, sample, use_describe)

  warn('column  {} is  may be error when meet percent max:{}'.format(c, c_max))
  warn('column  {} is  may be error when meet percent max:{}'.format(c, c_max))
  warn('column  {} is  may be error when meet percent max:{}'.format(c, c_max))
  warn('column  {} is  may be error when meet percent max:{}'.format(c, c_max))
  warn('column  {} is  may be error when meet percent max:{}'.format(c, c_max))
  warn('column  {} is  may be error when meet percent max:{}'.format(c, c_max))
  warn('column  {} is  may be error when meet percent max:{}'.format(c, c_max))
  warn('column  {} is  may be error when meet percent max:{}'.format(c, c_max))


In [20]:
# get_corr(train_df, sample, 'basicroomid')

Unnamed: 0,orderlabel,basicroomid_basic_minarea,basicroomid_basic_maxarea,basicroomid__basic_week_ordernum_ratio_max,basicroomid__basic_week_ordernum_ratio_mean,basicroomid__basic_week_ordernum_ratio_mad,basicroomid__basic_week_ordernum_ratio_var,basicroomid__basic_week_ordernum_ratio_median,basicroomid__basic_week_ordernum_ratio_sum,basicroomid__basic_recent3_ordernum_ratio_max,...,basicroomid__basic_30days_ordnumratio_mad,basicroomid__basic_30days_ordnumratio_var,basicroomid__basic_30days_ordnumratio_median,basicroomid__basic_30days_ordnumratio_sum,basicroomid__basic_30days_realratio_max,basicroomid__basic_30days_realratio_mean,basicroomid__basic_30days_realratio_mad,basicroomid__basic_30days_realratio_var,basicroomid__basic_30days_realratio_median,basicroomid__basic_30days_realratio_sum
orderlabel,1.0,-0.041291,-0.041386,0.14684,0.14739,0.041587,0.041432,0.147288,0.080671,0.139198,...,0.04048,0.02635,0.145714,0.073987,-0.06491,-0.064577,-0.028636,-0.007956,-0.064495,-0.062173
basicroomid_basic_minarea,-0.041291,1.0,0.993763,-0.184709,-0.184224,-0.068428,-0.055494,-0.184153,-0.101406,-0.176231,...,-0.069553,-0.043805,-0.194442,-0.09624,0.43184,0.431633,0.153403,0.06569,0.43078,0.326468
basicroomid_basic_maxarea,-0.041386,0.993763,1.0,-0.183305,-0.182938,-0.067191,-0.055843,-0.18288,-0.098844,-0.174385,...,-0.067925,-0.043913,-0.193694,-0.093686,0.429874,0.429648,0.152315,0.063645,0.428775,0.329741
basicroomid__basic_week_ordernum_ratio_max,0.14684,-0.184709,-0.183305,1.0,0.996961,0.327565,0.182038,0.996434,0.789158,0.943304,...,0.254914,0.087136,0.928147,0.749898,-0.334706,-0.33318,-0.147007,-0.045374,-0.332811,-0.176516
basicroomid__basic_week_ordernum_ratio_mean,0.14739,-0.184224,-0.182938,0.996961,1.0,0.256124,0.127844,0.999768,0.777908,0.937686,...,0.21484,0.075852,0.930245,0.737612,-0.329718,-0.327983,-0.149673,-0.044388,-0.327627,-0.186708
basicroomid__basic_week_ordernum_ratio_mad,0.041587,-0.068428,-0.067191,0.327565,0.256124,1.0,0.729585,0.255661,0.311841,0.341698,...,0.569156,0.181786,0.223738,0.288625,-0.150571,-0.152437,-0.008938,-0.027786,-0.15222,0.020982
basicroomid__basic_week_ordernum_ratio_var,0.041432,-0.055494,-0.055843,0.182038,0.127844,0.729585,1.0,0.127083,0.06581,0.178715,...,0.292011,0.163568,0.124197,0.057897,-0.071373,-0.071297,-0.025489,-0.009113,-0.071169,-0.084021
basicroomid__basic_week_ordernum_ratio_median,0.147288,-0.184153,-0.18288,0.996434,0.999768,0.255661,0.127083,1.0,0.777823,0.937373,...,0.214597,0.075706,0.92973,0.737167,-0.329638,-0.327905,-0.149707,-0.044462,-0.327555,-0.186531
basicroomid__basic_week_ordernum_ratio_sum,0.080671,-0.101406,-0.098844,0.789158,0.777908,0.311841,0.06581,0.777823,1.0,0.767507,...,0.262902,0.029732,0.751218,0.984794,-0.243424,-0.242849,-0.099286,-0.037141,-0.242596,0.068248
basicroomid__basic_recent3_ordernum_ratio_max,0.139198,-0.176231,-0.174385,0.943304,0.937686,0.341698,0.178715,0.937373,0.767507,1.0,...,0.28246,0.093795,0.878543,0.730598,-0.325596,-0.324461,-0.135799,-0.045565,-0.324095,-0.15598


## 子房型的统计特征 

In [24]:
room_cols = ['room_30days_ordnumratio', 'room_30days_realratio']

In [25]:
sample = extract_value_describe_feature(
    'basicroomid', 'room_30days_ordnumratio', train_df, sample,
    ['max', 'min', 'median', 'mean', 'std', 'nunique', 'var', 'mad', 'sum'])

  warn('column  {} is  may be error when meet percent max:{}'.format(c, c_max))


In [26]:
sample = extract_value_describe_feature('basicroomid', 'room_30days_realratio',
                                        train_df, sample,
                                        ['max', 'min', 'median', 'mean', 'std', 'nunique', 'count',  'var', 'mad', 'sum'])

In [27]:
# get_corr(train_df, sample, 'basicroomid').tail(10)

## 历史价格统计特征

In [28]:
price_use_describe = ['max', 'std', 'mean', 'min']

In [29]:
name_fmt = '{}_diff_{}'.format('basicroomid', '{}')

price_diff_name = name_fmt.format('price_last_lastord')
hotel_minprice_diff_name = name_fmt.format('hotel_minprice_lastord')
basic_minprice_diff_name = name_fmt.format('basic_minprice_lastord')

In [30]:
train_df[price_diff_name] = train_df['price_deduct'] - train_df['price_last_lastord']
train_df[hotel_minprice_diff_name] = train_df['price_deduct'] - train_df['hotel_minprice_lastord']
train_df[basic_minprice_diff_name] = train_df['price_deduct'] - train_df['basic_minprice_lastord']

In [31]:
sample = extract_value_describe_feature('basicroomid', price_diff_name, train_df, sample, price_use_describe)

In [32]:
sample = extract_value_describe_feature('basicroomid', hotel_minprice_diff_name, train_df, sample, price_use_describe)
sample = extract_value_describe_feature('basicroomid', basic_minprice_diff_name, train_df, sample, price_use_describe)

In [33]:
# get_corr(train_df, sample, 'basicroomid').tail(20)

## 历史时间间隔统计特征

In [34]:
span_name, t = '{}_span'.format('basicroomid'), 'basicroomid'

In [35]:
# train_df[span_name] = (now_date - train_df.orderdate_lastord).dt.days

# sample = extract_value_describe_feature(t, span_name, train_df, sample, ['max', 'min', 'mean'])

In [36]:
# get_corr(train_df, sample, 'basicroomid')

## 上次订购的特征 

In [37]:
# basic_cols = [c for c in train_df.columns if c.startswith('basic') and not c.endswith('area')]
# basic_cols

In [79]:
# use_cols = ['uid', 'orderdate_lastord', 'hotelid', 'basicroomid_lastord', 'basicroomid'] 

In [80]:
# basic_feature = train_df[use_cols].drop_duplicates()

In [81]:
# if train_df.drop_duplicates(['uid', 'basicroomid']).shape[0] != basic_feature.shape[0]:
#     warn('[uid, basicroomid].shape[0] != basic_feature.shape[0]')

  from ipykernel import kernelapp as app


In [82]:
# cols = [x for x in train_df.columns if x.endswith('lastord')]

In [88]:
# train_df.loc[train_df.orderdate_lastord.isnull()][cols].return_lastord.value_counts()

In [89]:
# sample = extract_lastord_is_nan(basic_feature, sample, 'basicroomid', 'basicroomid_lastord')

In [85]:
# sample = extract_is_lastord(basic_feature, sample, 'basicroomid', 'basicroomid_lastord')

In [38]:
sample.to_pickle(feature_path)

print(datetime.now(), 'save to', feature_path)

2017-08-05 08:48:01.176270 save to ../dataset/train/basic_room_feature.pkl
