In [1]:
import pandas as pd
import numpy as np
# import ray.dataframe as pd2
import time
# Show every column when a DataFrame is displayed.
pd.set_option('display.max_columns', None)
start_time = time.time()
print('start_time:', start_time)

# NOTE(review): machine-specific absolute paths; adjust before running elsewhere.
input_dir = 'G:\\bigdata\\badou\\00-data//'
out_dir = input_dir + 'out//'

# Data overview (translated from the original author's notes):
# - Pandas on Ray was considered as a drop-in replacement; that import is
#   commented out and plain pandas is used.
# - priors: the users' historical purchase records.
# - order_products__train: described as the user's purchases on the
#   second-to-last day.
#   NOTE(review): the notes below call it the "most recent day" instead;
#   confirm which one is meant.
# - Recall labelling: every product a user has ever bought is a recall
#   candidate, and a candidate is a hit (label 1) when it was bought again in
#   the train order. The stated intent is for the model to favour recent
#   purchase needs over interests from long ago.

# No index_col is passed to read_csv, so each frame gets a default integer
# index and the first CSV column (order_id) stays a regular column.
# Explicit narrow dtypes are requested for every column (presumably to limit
# memory on these large tables); uint16 is an unsigned 16-bit integer.
order_products_dtypes = {
    'order_id': np.int32,
    'product_id': np.uint16,
    'add_to_cart_order': np.int16,
    'reordered': np.int8,
}

# Both order-line files share the same four columns, so they share one mapping.
priors = pd.read_csv(input_dir + 'order_products__prior.csv',
                     dtype=order_products_dtypes)
train = pd.read_csv(input_dir + 'order_products__train.csv',
                    dtype=order_products_dtypes)

orders = pd.read_csv(input_dir + 'orders.csv',
                     dtype={
                         'order_id': np.int32,
                         'user_id': np.int32,
                         'eval_set': 'object',
                         'order_number': np.int16,
                         'order_dow': np.int8,
                         'order_hour_of_day': np.int8,
                         # float because the column contains missing values
                         'days_since_prior_order': np.float32
                     })

# Only the three id columns are loaded; the dtype mapping lists exactly those
# columns (the original also named 'order_id', which is not loaded here).
products = pd.read_csv(input_dir + 'products.csv',
                       usecols=['product_id', 'aisle_id', 'department_id'],
                       dtype={
                           'product_id': np.uint16,
                           'aisle_id': np.uint8,
                           'department_id': np.uint8
                       })

# Report the shape and column names of the loaded frames.
print('prior {}:{}'.format(priors.shape, ','.join(priors.columns)))
print('orders {}: {}'.format(orders.shape, ', '.join(orders.columns)))
print('train {}: {}'.format(train.shape, ', '.join(train.columns)))

start_time: 1597473493.620673
prior (32434489, 4):order_id,product_id,add_to_cart_order,reordered
orders (3421083, 7): order_id, user_id, eval_set, order_number, order_dow, order_hour_of_day, days_since_prior_order
train (1384617, 4): order_id, product_id, add_to_cart_order, reordered


In [2]:
# Preview the first five prior order lines.
priors.head(n=5)

Unnamed: 0,order_id,product_id,add_to_cart_order,reordered
0,2,33120,1,1
1,2,28985,2,1
2,2,9327,3,0
3,2,45918,4,1
4,2,30035,5,0


In [3]:
# Show the dtype of each priors column to confirm the dtypes requested in read_csv were applied.
priors.dtypes

order_id              int32
product_id           uint16
add_to_cart_order     int16
reordered              int8
dtype: object

In [4]:
# Per-product features derived from the prior order lines, indexed by product_id.
reordered_by_product = priors.groupby('product_id')['reordered']

prod_feat_df = pd.DataFrame({
    # Product sales volume: number of prior order lines containing the product.
    'orders': reordered_by_product.size().astype(np.int32),
    # Number of those lines flagged as a reorder.
    'reorders': reordered_by_product.sum().astype(np.float32),
})

# Share of the product's purchases that were reorders.
reorder_share = prod_feat_df['reorders'] / prod_feat_df['orders']
prod_feat_df['reorder_rate'] = reorder_share.astype(np.float32)

In [5]:
# Preview the first five rows of the per-product feature table.
prod_feat_df.head(n=5)

Unnamed: 0_level_0,orders,reorders,reorder_rate
product_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,1852,1136.0,0.613391
2,90,12.0,0.133333
3,277,203.0,0.732852
4,329,147.0,0.446809
5,15,9.0,0.6


In [6]:
# Attach the per-product features to the product table, then index the result
# by product_id. drop=False keeps product_id available as a regular column too.
products = (
    products
    .join(prod_feat_df, on='product_id')
    .set_index('product_id', drop=False)
)

In [7]:
# Preview the first five orders.
orders.head(n=5)

Unnamed: 0,order_id,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order
0,2539329,1,prior,1,2,8,
1,2398795,1,prior,2,3,7,15.0
2,473747,1,prior,3,3,12,21.0
3,2254736,1,prior,4,4,7,29.0
4,431534,1,prior,5,4,15,28.0


In [14]:
# Attach the columns of the matching order to every prior order line.
# DataFrame.join(on=...) matches the left column against the *index* of the
# right frame, so orders is indexed by order_id right here rather than relying
# on a different cell having set that index beforehand (with a default integer
# index the match would be against row positions, not order ids).
# drop=False keeps order_id as a column in orders; since priors already has
# that column, the copy from orders is added as 'order_id_' via rsuffix.
priors = priors.join(orders.set_index('order_id', drop=False),
                     on='order_id', rsuffix='_')

In [15]:
# Preview the first five rows of priors after the join with orders.
priors.head(n=5)

Unnamed: 0,order_id,product_id,add_to_cart_order,reordered,order_id_,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order
0,2,33120,1,1,2,202279,prior,3,5,9,8.0
1,2,28985,2,1,2,202279,prior,3,5,9,8.0
2,2,9327,3,0,2,202279,prior,3,5,9,8.0
3,2,45918,4,1,2,202279,prior,3,5,9,8.0
4,2,30035,5,0,2,202279,prior,3,5,9,8.0


In [10]:
# Index orders by order_id so index-based lookups and joins match on the order
# id; drop=False keeps order_id as a regular column as well.
orders = orders.set_index('order_id', drop=False)

In [13]:
# Read order_products__prior.csv into `priors` again with explicit dtypes.
# NOTE(review): this replaces whatever `priors` held before, so any columns
# added by an earlier join are gone afterwards -- presumably a reset so the
# join can be re-run; confirm the intended cell order.
prior_csv_path = input_dir + 'order_products__prior.csv'
prior_column_types = dict(
    order_id=np.int32,
    product_id=np.uint16,  # uint16: unsigned 16-bit integer
    add_to_cart_order=np.int16,
    reordered=np.int8,
)
priors = pd.read_csv(prior_csv_path, dtype=prior_column_types)