In [1]:
import numpy as np
import pandas as pd
import gc
from dateutil.parser import parse
from datetime import date, timedelta
from tqdm.notebook import tqdm
import warnings
warnings.filterwarnings('ignore')
warnings.simplefilter(action='ignore', category=FutureWarning)

In [2]:
user_order = pd.read_feather('../01-data/user_order.feather')
orders_train = pd.read_feather('../01-data/orders_train.feather')
orders_poi_session = pd.read_feather('../01-data/orders_poi_session.feather')
user_order = pd.read_feather('../01-data/user_order.feather')
pois = pd.read_feather('../01-data/pois.feather')
spus = pd.read_feather('../01-data/spus.feather')

pois.rename(columns={'aor_id': 'poi_aor_id'}, inplace=True)

orders_train = orders_train.merge(pois, on='wm_poi_id', how='left')
orders_train = orders_train.merge(spus, on='wm_food_spu_id', how='left')

In [3]:
orders_train.head()

Unnamed: 0,user_id,wm_order_id,wm_poi_id,aor_id,order_price_interval,order_timestamp,ord_period_name,order_scene_name,aoi_id,takedlvr_aoi_type_name,...,poi_aor_id,poi_score,delivery_comment_avg_score,food_comment_avg_score,price,category,ingredients,taste,stand_food_id,price_interval
0,178557,0,2334,6,1.0,1623061539,3,0.0,,,...,6.0,4.04,5.0,4.16,34.0,6.0,1.0,,959.0,2.0
1,175118,1,3315,0,1.0,1623032193,1,1.0,0.0,0.0,...,1.0,4.63,4.88,4.63,3.99,42.0,,,,1.0
2,175118,1,3315,0,1.0,1623032193,1,1.0,0.0,0.0,...,1.0,4.63,4.88,4.63,35.9,49.0,26.0,1.0,253.0,2.0
3,175118,1,3315,0,1.0,1623032193,1,1.0,0.0,0.0,...,1.0,4.63,4.88,4.63,0.0,33.0,,,,1.0
4,36208,2,2168,0,1.0,1623036350,1,0.0,1.0,0.0,...,0.0,4.75,4.96,4.74,23.0,17.0,159.0,,386.0,1.0


In [4]:
def list_col(df, user_order):
    wm_order_id_list = []
    clicks_list = []
    dt_list = []
    for row in df.values:
        wm_order_id = row[0]
        clicks = row[1]
        dt = row[2]
        wm_order_id_list.append(wm_order_id)
        dt_list.append(dt)
        if pd.isna(clicks):
            clicks_list.append([])
        else:
            clicks_list.append(list(set([int(click) for click in clicks.split('#')])))
    data = pd.DataFrame()
    data['wm_order_id'] = wm_order_id_list
    data['wm_poi_id'] = clicks_list
    data['dt'] = dt_list
    
    data = data.merge(user_order, on='wm_order_id', how='left')
    
    return data


orders_click_poi = list_col(orders_poi_session, user_order)

In [5]:
def explode(data, user_order):
    data = data[data['clicks'].notnull()]
    order_list = []
    poi_list = []
    dt_list = []
    for row in data[['wm_order_id', 'clicks', 'dt']].values:
        order = row[0]
        clicks = row[1]
        dt = row[2]
        for poi in clicks.split('#'):
            order_list.append(order)
            poi_list.append(int(poi))
            dt_list.append(dt)
    
    df = pd.DataFrame()
    df['wm_order_id'] = order_list
    df['wm_poi_id'] = poi_list
    df['dt'] = dt_list
    
    df = df.merge(user_order, on='wm_order_id', how='left')
    return df


orders_click_poi_explode = explode(orders_poi_session, user_order)

In [6]:
def date_sub_days(end_date, days):
    start_date = parse(end_date[:10]) - timedelta(days=days)
    start_date = start_date.strftime('%Y-%m-%d')
    return start_date

In [7]:
def explode_df(train_data):
    order_list = []
    user_list = []
    poi_list = []
    dt_list = []
    for row in train_data[['wm_order_id', 'user_id', 'recall_list', 'dt']].values:
        order = row[0]
        user = row[1]
        recall_list = row[2]
        dt = row[3]
        for recall in recall_list:
            order_list.append(order)
            user_list.append(user)
            poi_list.append(recall)
            dt_list.append(dt)

    df = pd.DataFrame()
    df['wm_order_id'] = order_list
    df['user_id'] = user_list
    df['wm_poi_id'] = poi_list
    df['dt'] = dt_list
    return df

In [8]:
def isNan_2(a):
    return a != a

In [9]:
# 该时间段的点击, 全量召回
def recall_by_click(orders_click_poi, user_order, click_start_date, click_end_date, recall_name='click'):
    orders_click_poi = orders_click_poi[orders_click_poi['user_id'].notnull()]
    orders_click_poi['user_id'] = orders_click_poi['user_id'].astype(int)
    df_recall = orders_click_poi[(orders_click_poi['dt'] >= click_start_date) & (orders_click_poi['dt'] <= click_end_date)]
    df_recall.rename(columns={'wm_poi_id': recall_name}, inplace=True)
    return df_recall

In [10]:
# wm_usewr_id下, wm_poi_id下单率召回
def recall_by_user_buy_rate(orders_click_poi_explode, orders_train, start_date, end_date, topN=15, recall_name='topN_user_buy_rate'):
    # wm_usewr_id下, wm_poi_id的下单次数
    orders_train = orders_train[(orders_train['dt'] >= start_date) & (orders_train['dt'] <= end_date)]
    df_top_buy = orders_train.groupby(['user_id', 'wm_poi_id'], as_index=False)['dt'].agg({
        'buy_counts': 'count'
    })
    
    # wm_usewr_id下, wm_poi_id的点击次数
    orders_click_poi_explode = orders_click_poi_explode[(orders_click_poi_explode['dt'] >= start_date) & (orders_click_poi_explode['dt'] < end_date)]
    df_top_click = orders_click_poi_explode.groupby(['user_id', 'wm_poi_id'], as_index=False)['dt'].agg({
        'click_counts': 'count'
    })
    
    # wm_usewr_id下, wm_poi_id的下单率
    df_top_buy = df_top_buy.merge(df_top_click, on=['user_id', 'wm_poi_id'], how='left')
    df_top_buy['buy_rate'] = df_top_buy['buy_counts'] / df_top_buy['click_counts']
    df_top_buy['buy_rate_rank'] = df_top_buy.groupby('user_id')['buy_rate'].rank(method='dense', ascending=False)
    
    df_recall = df_top_buy[df_top_buy['buy_rate_rank'] <= topN]
    df_recall = df_recall.groupby('user_id')['wm_poi_id'].agg(lambda x: list(x)).reset_index().rename(columns={'wm_poi_id': recall_name})
    
    return df_recall

In [11]:
# wm_usewr_id下, wm_poi_id购买次数召回
def recall_by_user_buys(orders_click_poi_explode, orders_train, start_date, end_date, topN=15, recall_name='topN_user_buy_counts'):
    orders_train = orders_train[(orders_train['dt'] >= start_date) & (orders_train['dt'] <= end_date)]
    df_top_buy = orders_train.groupby(['user_id', 'wm_poi_id'], as_index=False)['dt'].agg({
        'buy_counts': 'count'
    })
    
    df_top_buy['buy_counts_rank'] = df_top_buy.groupby('user_id')['buy_counts'].rank(method='dense', ascending=False)
    
    df_recall = df_top_buy[df_top_buy['buy_counts_rank'] <= topN]
    df_recall = df_recall.groupby('user_id')['wm_poi_id'].agg(lambda x: list(x)).reset_index().rename(columns={'wm_poi_id': recall_name})
    
    return df_recall

In [12]:
# wm_user_id下, wm_poi_id的点击次数
def recall_by_user_clicks(orders_click_poi_explode, orders_train, start_date, end_date, topN=15, recall_name='history_topN_user_clicks'):
    orders_click_poi_explode = orders_click_poi_explode[(orders_click_poi_explode['dt'] >= start_date) & (orders_click_poi_explode['dt'] < end_date)]
    df_top_click = orders_click_poi_explode.groupby(['user_id', 'wm_poi_id'], as_index=False)['dt'].agg({
        'click_counts': 'count'
    })
    
    df_top_click['click_counts_rank'] = df_top_click.groupby('user_id')['click_counts'].rank(method='dense', ascending=False)

    df_recall = df_top_click[df_top_click['click_counts_rank'] <= topN]
    df_recall = df_recall.groupby('user_id')['wm_poi_id'].agg(lambda x: list(x)).reset_index().rename(columns={'wm_poi_id': recall_name})
    
    return df_recall

In [13]:
# wm_usewr_id+col下, wm_poi_id购买次数召回
def recall_by_user_col_buy_counts(col, orders_click_poi_explode, orders_train, start_date, end_date, topN=15, recall_name='topN_user_buy_counts'):
    orders_train = orders_train[(orders_train['dt'] >= start_date) & (orders_train['dt'] <= end_date)]
    df_top_buy = orders_train.groupby(['user_id', col, 'wm_poi_id'], as_index=False)['dt'].agg({
        'buy_counts': 'count'
    })
    
    df_top_buy['buy_counts_rank'] = df_top_buy.groupby(['user_id', col])['buy_counts'].rank(method='dense', ascending=False)
    
    df_recall = df_top_buy[df_top_buy['buy_counts_rank'] <= topN]
    df_recall = df_recall.groupby(['user_id', col])['wm_poi_id'].agg(lambda x: list(x)).reset_index().rename(columns={'wm_poi_id': recall_name})
    
    return df_recall

In [14]:
# wm_user_id下, wm_poi_id的点击次数
def recall_by_poi_score(score, orders_click_poi_explode, orders_train, start_date, end_date, topN=15, recall_name='history_topN_poi_score'):
    orders_train = orders_train[(orders_train['dt'] >= start_date) & (orders_train['dt'] < end_date)]
    df_top_click = orders_train.groupby(['user_id', 'wm_poi_id'], as_index=False)[score].agg({
        '{}_mean'.format(score): 'mean'
    })
    
    df_top_click['click_counts_rank'] = df_top_click.groupby('user_id')['{}_mean'.format(score)].rank(method='dense', ascending=False)

    df_recall = df_top_click[df_top_click['click_counts_rank'] <= topN]
    df_recall = df_recall.groupby('user_id')['wm_poi_id'].agg(lambda x: list(x)).reset_index().rename(columns={'wm_poi_id': recall_name})
    
    return df_recall

In [15]:
def recall_by_history(orders_click_poi, orders_click_poi_explode, orders_train, user_order, click_start_date, click_end_date, start_date, end_date, topN=15):
    # 该时间段的点击, 全量召回
    df_recall_1 = recall_by_click(orders_click_poi, user_order, click_start_date, click_end_date, recall_name='click')
    df_recall_2 = recall_by_user_buy_rate(orders_click_poi_explode, orders_train, start_date, end_date, topN, recall_name='topN_user_buy_rate')
    df_recall_3 = recall_by_user_buys(orders_click_poi_explode, orders_train, start_date, end_date, topN, recall_name='topN_user_buys')
    df_recall_4 = recall_by_user_clicks(orders_click_poi_explode, orders_train, start_date, end_date, topN, recall_name='history_topN_user_clicks')
    df_recall_5 = recall_by_user_clicks(orders_click_poi_explode, orders_train, click_start_date, click_end_date, topN, recall_name='current_topN_user_clicks')
    
    df_recall = df_recall_1.merge(df_recall_2, on='user_id', how='left')
    df_recall = df_recall.merge(df_recall_3, on='user_id', how='left')
    df_recall = df_recall.merge(df_recall_4, on='user_id', how='left')
    df_recall = df_recall.merge(df_recall_5, on='user_id', how='left')
    
    df_recall['topN_user_buy_rate'] = df_recall['topN_user_buy_rate'].apply(lambda x: [] if isNan_2(x) else x)
    df_recall['topN_user_buys'] = df_recall['topN_user_buys'].apply(lambda x: [] if isNan_2(x) else x)
    df_recall['history_topN_user_clicks'] = df_recall['history_topN_user_clicks'].apply(lambda x: [] if isNan_2(x) else x)
    df_recall['current_topN_user_clicks'] = df_recall['current_topN_user_clicks'].apply(lambda x: [] if isNan_2(x) else x)
    
    df_recall['recall_list'] = (df_recall['click'] + df_recall['topN_user_buy_rate'] +
                                df_recall['topN_user_buys'] + df_recall['history_topN_user_clicks'] +
                                df_recall['current_topN_user_clicks']).apply(set).apply(list)
    
    scores = ['poi_score', 'delivery_comment_avg_score', 'food_comment_avg_score']
    for score in tqdm(scores):
        recall_name = 'history_topN_{}'.format(score)
        df_recall_7 = recall_by_poi_score(score, orders_click_poi_explode, orders_train, start_date, end_date, topN=15, recall_name=recall_name)
        df_recall = df_recall.merge(df_recall_7, on=['user_id'], how='left')
        df_recall[recall_name] = df_recall[recall_name].apply(lambda x: [] if isNan_2(x) else x)
        df_recall['recall_list'] = (df_recall['recall_list'] + df_recall[recall_name]).apply(set).apply(list)
    
    
    cols = ['aor_id', 'ord_period_name', 'takedlvr_aoi_type_name']
    for col in tqdm(cols):
        recall_name = 'topN_user_{}_buys'.format(col)
        df_recall_6 = recall_by_user_col_buy_counts(col, orders_click_poi_explode, orders_train, start_date, end_date, topN, recall_name=recall_name)
        df_recall = df_recall.merge(df_recall_6, on=['user_id', col], how='left')
        df_recall[recall_name] = df_recall[recall_name].apply(lambda x: [] if isNan_2(x) else x)
        df_recall['recall_list'] = (df_recall['recall_list'] + df_recall[recall_name]).apply(set).apply(list)
    
    return df_recall

In [16]:
def recall_by_hot_data(orders_click_poi_explode, orders_train, start_date, end_date, topN=15):
    # 下单次数
    orders_train = orders_train[(orders_train['dt'] >= start_date) & (orders_train['dt'] <= end_date)]
    df_top_buy = orders_train.groupby('wm_poi_id', as_index=False)['dt'].agg({
        'buy_counts': 'count'
    }).sort_values('buy_counts', ascending=False)
    
    # 点击次数
    orders_click_poi_explode = orders_click_poi_explode[(orders_click_poi_explode['dt'] >= start_date) & (orders_click_poi_explode['dt'] < end_date)]
    df_top_click = orders_click_poi_explode.groupby('wm_poi_id', as_index=False)['dt'].agg({
        'click_counts': 'count'
    }).sort_values('click_counts', ascending=False)
    
    # 下单率
    df_top_buy_rate = df_top_buy.merge(df_top_click, on='wm_poi_id', how='left')
    df_top_buy_rate['buy_rate'] = df_top_buy_rate['buy_counts'] / df_top_buy_rate['click_counts']
    df_top_buy_rate.sort_values('buy_rate', ascending=False)
    
    hot_recall = (list(df_top_buy['wm_poi_id'].values[:topN]) +
                  list(df_top_click['wm_poi_id'].values[:topN]) +
                  list(df_top_buy_rate['wm_poi_id'].values[:topN]))
    hot_recall = list(set(hot_recall))
    return hot_recall

In [17]:
def recall_by_hot(df_recall, orders_click_poi_explode, orders_train, start_date, end_date, topN):
    hot_poi_recall = recall_by_hot_data(orders_click_poi_explode, orders_train, start_date, end_date, topN)
    df_recall['recall_list'] = df_recall['recall_list'].apply(lambda x: list(set(x + hot_poi_recall)))
    return df_recall

In [18]:
def recall(orders_click_poi, orders_click_poi_explode, orders_train, user_order, end_date, days, topN=15):
    # end_date: 历史找回的end_date, 也即click的第一天的前一天
    start_date = date_sub_days(end_date, days)
    click_start_date = date_sub_days(end_date, -1)
    click_end_date = date_sub_days(end_date, -5)
    print(start_date, end_date, click_start_date, click_end_date)
    df_recall = recall_by_history(orders_click_poi, orders_click_poi_explode, orders_train, user_order, click_start_date, click_end_date, start_date, end_date, topN)
    
    data_recall_list = recall_by_hot(df_recall, orders_click_poi_explode, orders_train, start_date, end_date, topN)
    data = explode_df(data_recall_list)
    
    orders_train = orders_train[(orders_train['dt'] >= click_start_date) & (orders_train['dt'] <= click_end_date)]
    orders_train['label'] = 1
    
    # 可能没召回到, 所以用outer join
    data = data.merge(orders_train[['wm_order_id', 'user_id', 'wm_poi_id', 'dt', 'label']], on=['wm_order_id', 'user_id', 'wm_poi_id', 'dt'], how='left')
    data['label'].fillna(0, inplace=True)
    
    return data

In [19]:
data_train = recall(orders_click_poi, orders_click_poi_explode, orders_train, user_order, '2021-06-13', 6, 15).drop_duplicates().reset_index(drop=True)

2021-06-07 2021-06-13 2021-06-14 2021-06-18


  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

In [20]:
data_train.head()

Unnamed: 0,wm_order_id,user_id,wm_poi_id,dt,label
0,379429,18799,2438,2021-06-14,0.0
1,379429,18799,2183,2021-06-14,0.0
2,379429,18799,8,2021-06-14,0.0
3,379429,18799,3209,2021-06-14,0.0
4,379429,18799,15,2021-06-14,0.0


In [21]:
data_valid = recall(orders_click_poi, orders_click_poi_explode, orders_train, user_order, '2021-06-20', 6, 15).drop_duplicates().reset_index(drop=True)

2021-06-14 2021-06-20 2021-06-21 2021-06-25


  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

In [22]:
data_test = recall(orders_click_poi, orders_click_poi_explode, orders_train, user_order, '2021-06-27', 6, 15).drop_duplicates().reset_index(drop=True)

2021-06-21 2021-06-27 2021-06-28 2021-07-02


  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

In [23]:
data_train['label'].mean(), data_valid['label'].mean()

(0.010100172824067688, 0.010243116495531397)

In [24]:
data_train.shape, data_valid.shape, data_test.shape

((10194182, 5), (10800131, 5), (6792408, 5))

In [25]:
data_train.to_feather('../01-data/train_label.feather')
data_valid.to_feather('../01-data/valid_label.feather')
data_test.to_feather('../01-data/test_label.feather')

In [1]:
import pandas as pd
def calc_recall_percent(df_recall):
    df_temp = df_recall.groupby(['wm_order_id'])['label'].sum().reset_index()
    recall_pct = df_temp[df_temp['label'] == 1].shape[0] / df_temp.shape[0]
    return recall_pct


data_train = pd.read_feather('../01-data/train_label.feather')
data_valid = pd.read_feather('../01-data/valid_label.feather')

calc_recall_percent(data_train), calc_recall_percent(data_valid)

(0.40849903988065955, 0.41106482909301695)