In [1]:
import os, pickle, gc

import numpy as np
import pandas as pd

from tqdm import tqdm

from collections import defaultdict, Counter

import cudf, itertools

In [2]:
train = cudf.read_parquet('../data/local_validation/train.parquet')
test = pd.read_parquet('../data/local_validation/test.parquet')

data = cudf.concat([train])

In [3]:
type_labels = {'clicks':0, 'carts':1, 'orders':2}
type_weight = {0:1, 1:6, 2:3}

### Create co-visitation matrix on GPU using CuDF

In [4]:
version = 1
data_copy = data.copy()
data_copy = data_copy.set_index('session')
sessions = data_copy.index.unique()

# print len of sessions
print(len(sessions))

chunk_size = 100_000

tmp = list()
for i in tqdm(range(0, sessions.shape[0], chunk_size)):
    df = data_copy.loc[sessions[i]:sessions[min(sessions.shape[0]-1, i+chunk_size-1)]].reset_index()
    df = df.sort_values(['session','ts'],ascending=[True, False])

    # USE TAIL OF SESSION
    df = df.reset_index(drop=True)
    df['n'] = df.groupby('session').cumcount()
    df = df.loc[df.n<30].drop('n', axis=1)

    # CREATE PAIRS
    df = df.merge(df,on='session')
    df = df.loc[ ((df.ts_x - df.ts_y).abs()< 24 * 60 * 60) & (df.aid_x != df.aid_y) ]

    # ASSIGN WEIGHTS
    df = df[['session', 'aid_x', 'aid_y','type_y']].drop_duplicates(['session', 'aid_x', 'aid_y'])
    df['wgt'] = df.type_y.map(type_weight)
    df = df[['aid_x','aid_y','wgt']]
    df.wgt = df.wgt.astype('float32')
    df = df.groupby(['aid_x','aid_y']).wgt.sum()

    tmp.append(df.reset_index())

    del df
    gc.collect()

11098528


100%|██████████| 111/111 [00:47<00:00,  2.33it/s]


### Offload the final processing on CPU using Pandas

In [5]:
tmp = list(map(lambda x: x.to_pandas(), tmp))
tmp = pd.concat(tmp)

# CONVERT MATRIX TO DICTIONARY
tmp = tmp.sort_values(['aid_x','wgt'],ascending=[True,False])
# SAVE TOP 40
tmp = tmp.reset_index(drop=True)
tmp['n'] = tmp.groupby('aid_x').aid_y.cumcount()
tmp = tmp.loc[tmp.n<15].drop('n',axis=1)
# SAVE TO DISK
df = tmp.groupby('aid_x').aid_y.apply(list)
print(df.shape)
with open(f'../data/local_validation/top_15_carts_orders_v{version}.pkl', 'wb') as f:
    pickle.dump(df.to_dict(), f)

(1788152,)


In [6]:
%%time
version = 1
data_copy = data.copy()
data_copy = data_copy.set_index('session')
sessions = data_copy.index.unique()

chunk_size = 100_000

tmp = list()
for i in tqdm(range(0, sessions.shape[0], chunk_size)):
    df = data_copy.loc[sessions[i]:sessions[min(sessions.shape[0]-1, i+chunk_size-1)]].reset_index()
    df = df.loc[df.type.isin([1, 2])]

    df = df.sort_values(['session','ts'], ascending=[True, False])

    # USE TAIL OF SESSION
    df = df.reset_index(drop=True)
    df['n'] = df.groupby('session').cumcount()
    df = df.loc[df.n<30].drop('n',axis=1)

    # CREATE PAIRS
    df = df.merge(df, on='session')
    df = df.loc[ ((df.ts_x - df.ts_y).abs()< 14 * 24 * 60 * 60) & (df.aid_x != df.aid_y) ]

    # ASSIGN WEIGHTS
    df = df[['session', 'aid_x', 'aid_y','type_y']].drop_duplicates(['session', 'aid_x', 'aid_y'])
    df['wgt'] = 1
    df = df[['aid_x','aid_y','wgt']]
    df.wgt = df.wgt.astype('float32')
    df = df.groupby(['aid_x','aid_y']).wgt.sum()

    tmp.append(df.reset_index())
    
    del df
    gc.collect()

tmp = list(map(lambda x: x.to_pandas(), tmp))
tmp = pd.concat(tmp)
# CONVERT MATRIX TO DICTIONARY
tmp = tmp.sort_values(['aid_x','wgt'],ascending=[True,False])
# SAVE TOP 15
tmp = tmp.reset_index(drop=True)
tmp['n'] = tmp.groupby('aid_x').aid_y.cumcount()
tmp = tmp.loc[tmp.n<15].drop('n',axis=1)
# SAVE TO DISK
df = tmp.groupby('aid_x').aid_y.apply(list)
with open(f'../data/local_validation/top_15_buy2buy_v{version}.pkl', 'wb') as f:
    pickle.dump(df.to_dict(), f)

100%|██████████| 111/111 [00:33<00:00,  3.29it/s]


CPU times: user 47 s, sys: 3.01 s, total: 50 s
Wall time: 55.5 s


In [7]:
%%time
version = 1
data_copy = data.copy()
data_copy = data_copy.set_index('session')
sessions = data_copy.index.unique()

chunk_size = 100_000

tmp = list()
for i in tqdm(range(0, sessions.shape[0], chunk_size)):
    df = data_copy.loc[sessions[i]:sessions[min(sessions.shape[0]-1, i+chunk_size-1)]].reset_index()
    df = df.sort_values(['session','ts'], ascending=[True, False])

    # USE TAIL OF SESSION
    df = df.reset_index(drop=True)
    df['n'] = df.groupby('session').cumcount()
    df = df.loc[df.n<30].drop('n',axis=1)

    # CREATE PAIRS
    df = df.merge(df,on='session')
    df = df.loc[ ((df.ts_x - df.ts_y).abs()< 14 * 24 * 60 * 60) & (df.aid_x != df.aid_y) ]

    # ASSIGN WEIGHTS
    df = df[['session', 'aid_x', 'aid_y','ts_x']].drop_duplicates(['session', 'aid_x', 'aid_y'])
    df['wgt'] = 1 + 3*(df.ts_x - 1659304800)/(1662328791-1659304800)
    df = df[['aid_x','aid_y','wgt']]
    df.wgt = df.wgt.astype('float32')
    df = df.groupby(['aid_x','aid_y']).wgt.sum()

    tmp.append(df.reset_index())

    del df
    gc.collect()

tmp = list(map(lambda x: x.to_pandas(), tmp))
tmp = pd.concat(tmp)
# CONVERT MATRIX TO DICTIONARY
tmp = tmp.sort_values(['aid_x','wgt'],ascending=[True,False])
# SAVE TOP 40
tmp = tmp.reset_index(drop=True)
tmp['n'] = tmp.groupby('aid_x').aid_y.cumcount()
tmp = tmp.loc[tmp.n<20].drop('n',axis=1)
# SAVE TO DISK
df = tmp.groupby('aid_x').aid_y.apply(list)
with open(f'../data/local_validation/top_20_clicks_v{version}.pkl', 'wb') as f:
    pickle.dump(df.to_dict(), f)

100%|██████████| 111/111 [00:51<00:00,  2.16it/s]


CPU times: user 4min 15s, sys: 23 s, total: 4min 38s
Wall time: 4min 46s


In [8]:
%%time
# LOAD THREE CO-VISITATION MATRICES

VER = 1

top_20_clicks = pd.read_pickle(f'../data/local_validation/top_20_clicks_v{VER}.pkl')
top_15_buys = pd.read_pickle(f'../data/local_validation/top_15_carts_orders_v{VER}.pkl')
top_15_buy2buy = pd.read_pickle(f'../data/local_validation/top_15_buy2buy_v{VER}.pkl')

# TOP CLICKS AND ORDERS IN TEST
top_clicks = test.loc[test['type']==0,'aid'].value_counts().index.values[:20]
top_orders = test.loc[test['type']==2,'aid'].value_counts().index.values[:20]

# print shape of each matrix
print(f'clicks: {len(top_20_clicks)}')
print(f'carts: {len(top_15_buys)}')
print(f'buy2buy: {len(top_15_buy2buy)}')

clicks: 1796684
carts: 1788152
buy2buy: 1033926
CPU times: user 7.46 s, sys: 610 ms, total: 8.07 s
Wall time: 8.89 s


In [9]:
def suggest_clicks(df, top_20_clicks, top_clicks):
    products = df.aid.tolist()
    types = df.type.tolist()
    unique_products = list(set(products))

    if len(unique_products) >= 20:
        weights = np.logspace(0.1, 1, len(products), base=2, endpoint=True) - 1
        products_tmp = Counter()

        for product, weight, type in zip(products, weights, types):
            products_tmp[product] += weight * type_weight[type]
        
        sorted_products = [product for product, _ in products_tmp.most_common(20)]
        return sorted_products
    else:
        products = list(itertools.chain(*[top_20_clicks.get(product, []) \
                        for product in unique_products if product in top_20_clicks]))
        top_products = [product for product, _ in Counter(products).most_common(20) \
                        if product not in unique_products]
        result = unique_products + top_products[:20 - len(unique_products)]
        return result + list(top_clicks[:20 - len(result)])

def suggest_buys(df, top_15_buy2buy, top_15_buys, top_orders):
    products = df.aid.tolist()
    types = df.type.tolist()
    # filter df for type 1 and 2
    df = df.loc[df.type.isin([1,2])]
    unique_products = list(set(products))

    if len(unique_products) >= 20:
        weights = np.logspace(0.1, 1, len(products), base=2, endpoint=True) - 1
        products_tmp = Counter()

        for product, weight, type in zip(products, weights, types):
            products_tmp[product] += weight * type_weight[type]

        products_1 = list(itertools.chain(*[top_15_buy2buy.get(product, []) \
                        for product in unique_products if product in top_15_buy2buy]))

        for product in products_1: products_tmp[product] += 0.1
        sorted_products = [product for product, _ in products_tmp.most_common(20)]
        return sorted_products
    else:
        products_1 = list(itertools.chain(*[top_15_buys.get(product, []) \
                          for product in unique_products if product in top_15_buys]))
        products_2 = list(itertools.chain(*[top_15_buy2buy.get(product, []) \
                          for product in unique_products if product in top_15_buy2buy]))
        top_products = [product for product, _ in Counter(products_1 + products_2).most_common(20) \
                        if product not in unique_products]
        result = unique_products + top_products[:20 - len(unique_products)]
        return result + list(top_orders[:20 - len(result)])


In [10]:
%%time
pred_df_clicks = test.sort_values(["session", "ts"]).groupby(["session"]).apply(
    lambda x: suggest_clicks(x, top_20_clicks, top_clicks)
)

pred_df_buys = test.sort_values(["session", "ts"]).groupby(["session"]).apply(
    lambda x: suggest_buys(x, top_15_buy2buy, top_15_buys, top_orders)
)

CPU times: user 9min 7s, sys: 1.24 s, total: 9min 9s
Wall time: 9min 8s


In [11]:
clicks_pred_df = pd.DataFrame(pred_df_clicks.add_suffix("_clicks"), columns=["labels"]).reset_index()
orders_pred_df = pd.DataFrame(pred_df_buys.add_suffix("_orders"), columns=["labels"]).reset_index()
carts_pred_df = pd.DataFrame(pred_df_buys.add_suffix("_carts"), columns=["labels"]).reset_index()

In [12]:
pred_df = pd.concat([clicks_pred_df, orders_pred_df, carts_pred_df])
pred_df.columns = ["session_type", "labels"]
pred_df["labels"] = pred_df.labels.apply(lambda x: " ".join(map(str,x)))
pred_df.to_csv("submission.csv", index=False)
pred_df.head()

Unnamed: 0,session_type,labels
0,11098528_clicks,11830 588923 884502 1732105 485256 1460571 108...
1,11098529_clicks,1105029 1086704 1715900 1504796 217742 1666485...
2,11098530_clicks,409236 264500 1603001 583026 254154 485256 146...
3,11098531_clicks,1309633 1365569 624163 1557766 396199 1449555 ...
4,11098532_clicks,7651 876469 108125 1159379 1202618 77906 97028...


In [14]:
%%time
# COMPUTE METRIC
score = 0
weights = {'clicks': 0.10, 'carts': 0.30, 'orders': 0.60}
for t in ['clicks','carts','orders']:
    sub = pred_df.loc[pred_df.session_type.str.contains(t)].copy()
    sub['session'] = sub.session_type.apply(lambda x: int(x.split('_')[0]))
    sub.labels = sub.labels.apply(lambda x: [int(i) for i in x.split(' ')[:20]])
    test_labels = pd.read_parquet('../data/local_validation/test_labels.parquet')
    test_labels = test_labels.loc[test_labels['type']==t]
    test_labels = test_labels.merge(sub, how='left', on=['session'])
    test_labels['hits'] = test_labels.apply(lambda df: len(set(df.ground_truth).intersection(set(df.labels))), axis=1)
    test_labels['gt_count'] = test_labels.ground_truth.str.len().clip(0,20)
    recall = test_labels['hits'].sum() / test_labels['gt_count'].sum()
    score += weights[t]*recall
    print(f'{t} recall =',recall)
    
print('=============')
print('Overall Recall =',score)
print('=============')

clicks recall = 0.4437054480289188
carts recall = 0.37876984884176784
orders recall = 0.6318452105469785
Overall Recall = 0.5371086257836093
CPU times: user 41.2 s, sys: 1.09 s, total: 42.3 s
Wall time: 42.1 s
