In [17]:
import os, pickle, gc

import numpy as np
import pandas as pd

from tqdm import tqdm

from collections import defaultdict, Counter
import cudf, itertools

In [18]:
class config:
    # --- run mode ---------------------------------------------------------
    # True: read the local validation split and score predictions locally.
    local_validation = True
    # True: work on a 10% row sample of the training events for quick runs.
    debug = True

    # --- locations --------------------------------------------------------
    data_path = '../data/'
    validation_path = '../data/local_validation/'

    # --- file names (looked up under one of the two paths above) ----------
    train_file = 'train.parquet'
    test_file = 'test.parquet'
    test_labels_file = 'test_labels.parquet'

    # --- tuning knobs -----------------------------------------------------
    # NOTE(review): neither value is referenced by the cells visible in this
    # notebook; the cut-offs used below are hard-coded instead.
    n_session_samples = 100
    n_most_common = 50

In [19]:
# Load the event logs.
#   * `train` / `data` live on the GPU (cuDF) because the co-visitation
#     matrices are built with cuDF.
#   * `test` must be a pandas DataFrame in BOTH branches: the prediction cells
#     run `groupby(...).apply(<python function returning a list>)` on it, which
#     is a pandas-only code path.
if config.local_validation:
    train = cudf.read_parquet(config.validation_path + config.train_file)
    test = pd.read_parquet(config.validation_path + config.test_file)
    test_labels = cudf.read_parquet(config.validation_path + config.test_labels_file)
    # Only the training events feed the co-visitation matrices here.
    data = cudf.concat([train])
else:
    train = cudf.read_parquet(config.data_path + config.train_file)
    test_gpu = cudf.read_parquet(config.data_path + config.test_file)
    # Train and test events together feed the co-visitation matrices.
    data = cudf.concat([train, test_gpu])
    # Hand the host (pandas) copy to the prediction code and free the GPU copy.
    test = test_gpu.to_pandas()
    del test_gpu

In [20]:
# Debug runs keep a reproducible 10% sample of the event ROWS (not of whole
# sessions), so sessions are thinned out and the row order is shuffled.
# Re-running this cell samples the already-sampled frame again.
data = data.sample(frac=0.1, random_state=42) if config.debug else data

In [21]:
# Integer code used in the `type` column for each event kind.
type_labels = dict(clicks=0, carts=1, orders=2)

# Multiplier applied to an event, keyed by its integer type code.
type_weight = {
    type_labels['clicks']: 1,
    type_labels['carts']: 6,
    type_labels['orders']: 3,
}

### Create co-visitation matrix on GPU using CuDF

### Carts Orders Co-visitation Matrix

In [22]:
version = 1
# Index the events by session and SORT the index: the chunking below slices by
# session label (`.loc[first:last]`), which only selects complete, contiguous
# session ranges when the index is ordered. `data` is not guaranteed to be
# ordered (the debug sample shuffles its rows).
data_copy = data.set_index('session').sort_index()
sessions = data_copy.index.unique().sort_values()

# print len of sessions
print(len(sessions))

chunk_size = 100_000

tmp = list()
for i in tqdm(range(0, sessions.shape[0], chunk_size)):
    # All events of the next `chunk_size` sessions (both slice ends inclusive).
    df = data_copy.loc[sessions[i]:sessions[min(sessions.shape[0]-1, i+chunk_size-1)]].reset_index()
    df = df.sort_values(['session','ts'],ascending=[True, False])

    # USE TAIL OF SESSION
    df = df.reset_index(drop=True)
    df['n'] = df.groupby('session').cumcount()
    # rows are newest-first, so this keeps the 100 most recent events per session
    df = df.loc[df.n<100].drop('n', axis=1)

    # CREATE PAIRS
    df = df.merge(df,on='session')
    # keep pairs of different aids less than 24*60*60 ts units apart
    # (one day, assuming `ts` is in seconds -- TODO confirm)
    df = df.loc[ ((df.ts_x - df.ts_y).abs() < 24 * 60 * 60) & (df.aid_x != df.aid_y) ]

    # ASSIGN WEIGHTS
    # one row per (session, aid_x, aid_y); weight depends on the type of aid_y's event
    df = df[['session', 'aid_x', 'aid_y','type_y']].drop_duplicates(['session', 'aid_x', 'aid_y'])
    df['wgt'] = df.type_y.map(type_weight)
    df = df[['aid_x','aid_y','wgt']]
    df.wgt = df.wgt.astype('float32')
    df = df.groupby(['aid_x','aid_y']).wgt.sum()

    tmp.append(df.reset_index())

    del df
    gc.collect()

tmp = pd.concat([chunk.to_pandas() for chunk in tmp])
# The same (aid_x, aid_y) pair occurs in many chunks with a partial weight in
# each. Sum them into a single row per pair; otherwise a pair is ranked by its
# partial weights and can show up several times in the saved lists.
tmp = tmp.groupby(['aid_x','aid_y']).wgt.sum().reset_index()

# CONVERT MATRIX TO DICTIONARY
tmp = tmp.sort_values(['aid_x','wgt'],ascending=[True,False])
# KEEP THE 50 HEAVIEST aid_y PER aid_x
# (the file name still says "top_15"; it is kept because the loading cell uses it)
tmp = tmp.reset_index(drop=True)
tmp['n'] = tmp.groupby('aid_x').aid_y.cumcount()
tmp = tmp.loc[tmp.n<50].drop('n',axis=1)
# SAVE TO DISK
df = tmp.groupby('aid_x').aid_y.apply(list)
# f-string: without the `f` prefix the file was literally named "...v{version}.pkl"
with open(config.data_path + f'top_15_carts_orders_v{version}.pkl', 'wb') as f:
    pickle.dump(df.to_dict(), f)

5580740


100%|██████████| 56/56 [00:30<00:00,  1.86it/s]


### Buy2Buy Co-visitation matrix

In [None]:
%%time
version = 1
data_copy = data.copy()
data_copy = data_copy.set_index('session')
sessions = data_copy.index.unique()

chunk_size = 100_000

tmp = list()
for i in tqdm(range(0, sessions.shape[0], chunk_size)):
    df = data_copy.loc[sessions[i]:sessions[min(sessions.shape[0]-1, i+chunk_size-1)]].reset_index()
    df = df.loc[df.type.isin([1, 2])]

    df = df.sort_values(['session','ts'], ascending=[True, False])

    # USE TAIL OF SESSION
    df = df.reset_index(drop=True)
    df['n'] = df.groupby('session').cumcount()
    df = df.loc[df.n<30].drop('n',axis=1)

    # CREATE PAIRS
    df = df.merge(df, on='session')
    df = df.loc[ ((df.ts_x - df.ts_y).abs() < 14 * 24 * 60 * 60) & (df.aid_x != df.aid_y) ]

    # ASSIGN WEIGHTS
    df = df[['session', 'aid_x', 'aid_y','type_y']].drop_duplicates(['session', 'aid_x', 'aid_y'])
    df['wgt'] = 1
    df = df[['aid_x','aid_y','wgt']]
    df.wgt = df.wgt.astype('float32')
    df = df.groupby(['aid_x','aid_y']).wgt.sum()

    tmp.append(df.reset_index())
    
    del df
    gc.collect()

tmp = list(map(lambda x: x.to_pandas(), tmp))
tmp = pd.concat(tmp)
# CONVERT MATRIX TO DICTIONARY
tmp = tmp.sort_values(['aid_x','wgt'],ascending=[True,False])
# SAVE TOP 15
tmp = tmp.reset_index(drop=True)
tmp['n'] = tmp.groupby('aid_x').aid_y.cumcount()
tmp = tmp.loc[tmp.n<15].drop('n',axis=1)
# SAVE TO DISK
df = tmp.groupby('aid_x').aid_y.apply(list)
with open(config.data_path + 'top_15_buy2buy_v{version}.pkl', 'wb') as f:
    pickle.dump(df.to_dict(), f)

100%|██████████| 111/111 [00:36<00:00,  3.05it/s]


CPU times: user 49.8 s, sys: 2.88 s, total: 52.6 s
Wall time: 58.3 s


### Clicks Co-visitation matrix

In [None]:
%%time
version = 1
data_copy = data.copy()
data_copy = data_copy.set_index('session')
sessions = data_copy.index.unique()

chunk_size = 100_000

tmp = list()
for i in tqdm(range(0, sessions.shape[0], chunk_size)):
    df = data_copy.loc[sessions[i]:sessions[min(sessions.shape[0]-1, i+chunk_size-1)]].reset_index()
    df = df.sort_values(['session','ts'], ascending=[True, False])

    # USE TAIL OF SESSION
    df = df.reset_index(drop=True)
    df['n'] = df.groupby('session').cumcount()
    df = df.loc[df.n<100].drop('n',axis=1)

    # CREATE PAIRS
    df = df.merge(df,on='session')
    df = df.loc[ ((df.ts_x - df.ts_y).abs()< 24 * 60 * 60) & (df.aid_x != df.aid_y) ]

    # ASSIGN WEIGHTS
    df = df[['session', 'aid_x', 'aid_y','ts_x']].drop_duplicates(['session', 'aid_x', 'aid_y'])
    df['wgt'] = 1 + 3*(df.ts_x - 1659304800)/(1662328791-1659304800)
    df = df[['aid_x','aid_y','wgt']]
    df.wgt = df.wgt.astype('float32')
    df = df.groupby(['aid_x','aid_y']).wgt.sum()

    tmp.append(df.reset_index())

    del df
    gc.collect()

tmp = list(map(lambda x: x.to_pandas(), tmp))
tmp = pd.concat(tmp)
# CONVERT MATRIX TO DICTIONARY
tmp = tmp.sort_values(['aid_x','wgt'],ascending=[True,False])
# SAVE TOP 40
tmp = tmp.reset_index(drop=True)
tmp['n'] = tmp.groupby('aid_x').aid_y.cumcount()
tmp = tmp.loc[tmp.n<50].drop('n',axis=1)
# SAVE TO DISK
df = tmp.groupby('aid_x').aid_y.apply(list)
with open(config.data_path + 'top_20_clicks_v{version}.pkl', 'wb') as f:
    pickle.dump(df.to_dict(), f)

100%|██████████| 111/111 [01:11<00:00,  1.55it/s]


CPU times: user 6min 39s, sys: 1min 36s, total: 8min 16s
Wall time: 14min 32s


In [None]:
%%time
# LOAD THREE CO-VISITATION MATRICES

VER = 1

top_20_clicks = pd.read_pickle(cofig.data_path + 'top_20_clicks_v{VER}.pkl')
top_15_buys = pd.read_pickle(config.data_path + 'top_15_carts_orders_v{VER}.pkl')
top_15_buy2buy = pd.read_pickle(config.data_path + 'top_15_buy2buy_v{VER}.pkl')

# TOP CLICKS AND ORDERS IN TEST
top_clicks = test.loc[test['type']==type_labels['clicks'],'aid'].value_counts().index.values[:20]
top_orders = test.loc[test['type']==type_labels['orders'],'aid'].value_counts().index.values[:20]

# print shape of each matrix
print(f'clicks: {len(top_20_clicks)}')
print(f'carts: {len(top_15_buys)}')
print(f'buy2buy: {len(top_15_buy2buy)}')

clicks: 1815630
carts: 1788152
buy2buy: 1033926
CPU times: user 12.5 s, sys: 1.09 s, total: 13.6 s
Wall time: 15.1 s


In [None]:
def suggest_clicks(df, top_20_clicks, top_clicks):
    """Build the click candidate list for one session.

    Args:
        df: rows of a single session with `aid` and `type` columns. Row order
            matters: later rows count as more recent and get larger weights.
        top_20_clicks: mapping aid -> list of co-visited aids.
        top_clicks: sequence of fallback aids used to pad short results.

    Returns:
        A list of aids.
        * Session with 20 or more distinct aids: up to 50 of the session's own
          aids, ranked by position weight times `type_weight` of the event.
        * Otherwise: the session's distinct aids (most recent first), then the
          most frequent co-visited aids not already present, then entries of
          `top_clicks` -- at most 20 items in total.
    """
    session_aids = df.aid.tolist()
    session_types = df.type.tolist()
    # dict.fromkeys keeps the first occurrence, so after reversing this is
    # "distinct aids, most recent first".
    recent_unique = list(dict.fromkeys(reversed(session_aids)))

    if len(recent_unique) < 20:
        # Short session: extend it with co-visitation neighbours.
        neighbours = []
        for aid in recent_unique:
            if aid in top_20_clicks:
                neighbours.extend(top_20_clicks[aid])
        ranked_neighbours = Counter(neighbours).most_common(50)
        new_candidates = [aid for aid, _ in ranked_neighbours if aid not in recent_unique]

        result = recent_unique + new_candidates[:20 - len(recent_unique)]
        # Pad with the globally popular clicks if there is still room.
        return result + list(top_clicks[:20 - len(result)])

    # Long session: re-rank its own aids. Weights rise from 2**0.1 - 1 for the
    # first row to 2**1 - 1 for the last row.
    position_weights = np.logspace(0.1, 1, len(session_aids), base=2, endpoint=True) - 1
    scores = Counter()
    for aid, weight, event_type in zip(session_aids, position_weights, session_types):
        scores[aid] += weight * type_weight[event_type]

    return [aid for aid, _ in scores.most_common(50)]

def suggest_buys(df, top_15_buy2buy, top_15_buys, top_orders):
    """Build the cart/order candidate list for one session.

    Args:
        df: rows of a single session with `aid` and `type` columns. Row order
            matters: later rows count as more recent and get larger weights.
        top_15_buy2buy: mapping aid -> list of aids, looked up only for the
            session's cart/order aids (type 1 or 2).
        top_15_buys: mapping aid -> list of aids, looked up for every aid of
            the session.
        top_orders: sequence of fallback aids used to pad short results.

    Returns:
        A list of aids.
        * Session with 20 or more distinct aids: up to 50 aids ranked by
          position weight times `type_weight`, where every buy2buy neighbour
          occurrence adds 0.1 to that neighbour's score.
        * Otherwise: the session's distinct aids (most recent first), then the
          most frequent neighbours from both mappings not already present,
          then entries of `top_orders` -- at most 20 items in total.
    """
    session_aids = df.aid.tolist()
    session_types = df.type.tolist()
    # Distinct aids of the whole session, most recent first.
    recent_unique = list(dict.fromkeys(reversed(session_aids)))

    # Distinct aids that were carted or ordered (type 1 / 2), most recent first.
    buy_rows = df.loc[df['type'].isin([1, 2])]
    recent_buys = list(dict.fromkeys(reversed(buy_rows.aid.tolist())))

    # Both branches need the buy2buy neighbours of the session's buys.
    buy2buy_neighbours = []
    for aid in recent_buys:
        if aid in top_15_buy2buy:
            buy2buy_neighbours.extend(top_15_buy2buy[aid])

    if len(recent_unique) < 20:
        # Short session: extend it with neighbours from both matrices.
        buys_neighbours = []
        for aid in recent_unique:
            if aid in top_15_buys:
                buys_neighbours.extend(top_15_buys[aid])
        ranked = Counter(buys_neighbours + buy2buy_neighbours).most_common(50)
        new_candidates = [aid for aid, _ in ranked if aid not in recent_unique]

        result = recent_unique + new_candidates[:20 - len(recent_unique)]
        # Pad with the globally popular orders if there is still room.
        return result + list(top_orders[:20 - len(result)])

    # Long session: re-rank its own aids. Weights rise from 2**0.5 - 1 for the
    # first row to 2**1 - 1 for the last row.
    position_weights = np.logspace(0.5, 1, len(session_aids), base=2, endpoint=True) - 1
    scores = Counter()
    for aid, weight, event_type in zip(session_aids, position_weights, session_types):
        scores[aid] += weight * type_weight[event_type]
    # Small bonus per occurrence for buy2buy neighbours.
    for aid in buy2buy_neighbours:
        scores[aid] += 0.1

    return [aid for aid, _ in scores.most_common(50)]


In [None]:
%%time
pred_df_clicks = test.sort_values(["session", "ts"]).groupby(["session"]).apply(
    lambda x: suggest_clicks(x, top_20_clicks, top_clicks)
)

pred_df_buys = test.sort_values(["session", "ts"]).groupby(["session"]).apply(
    lambda x: suggest_buys(x, top_15_buy2buy, top_15_buys, top_orders)
)

CPU times: user 11min 58s, sys: 0 ns, total: 11min 58s
Wall time: 11min 58s


In [None]:
def _labelled_frame(predictions, suffix):
    """Return a frame whose index labels are `<session><suffix>` and whose
    single `labels` column holds the predicted aid lists, with the index moved
    into a regular column."""
    return pd.DataFrame(predictions.add_suffix(suffix), columns=["labels"]).reset_index()


clicks_pred_df = _labelled_frame(pred_df_clicks, "_clicks")
# Orders and carts share the same "buys" predictions.
orders_pred_df = _labelled_frame(pred_df_buys, "_orders")
carts_pred_df = _labelled_frame(pred_df_buys, "_carts")

In [None]:
# Stack the per-type predictions (clicks, then orders, then carts) into the
# two-column submission layout.
submission_parts = [clicks_pred_df, orders_pred_df, carts_pred_df]
pred_df = pd.concat(submission_parts)
pred_df.columns = ["session_type", "labels"]

# Each prediction list becomes one space-separated string of aids.
# NOTE(review): sessions with 20+ distinct aids can carry up to 50 aids here
# (the suggest_* functions use most_common(50)); the local metric only reads
# the first 20.
pred_df["labels"] = [" ".join(str(aid) for aid in aids) for aids in pred_df["labels"]]

pred_df.to_csv(config.data_path + 'submission.csv', index=False)
pred_df.head()

Unnamed: 0,session_type,labels
0,11098528_clicks,11830 588923 1732105 884502 1157882 876129 118...
1,11098529_clicks,1105029 459126 1715900 1647167 1339838 18819 1...
2,11098530_clicks,409236 264500 1603001 583026 963957 254154 877...
3,11098531_clicks,396199 1271998 452188 1728212 1365569 624163 1...
4,11098532_clicks,876469 7651 108125 1159379 1202618 77906 75819...


In [None]:
%%time
# COMPUTE METRIC

if config.local_validation:
    score = 0
    weights = {'clicks': 0.10, 'carts': 0.30, 'orders': 0.60}
    for t in ['clicks','carts','orders']:
        sub = pred_df.loc[pred_df.session_type.str.contains(t)].copy()
        sub['session'] = sub.session_type.apply(lambda x: int(x.split('_')[0]))
        sub.labels = sub.labels.apply(lambda x: [int(i) for i in x.split(' ')[:20]])
        test_labels = pd.read_parquet(config.validation_path + 'test_labels.parquet')
        test_labels = test_labels.loc[test_labels['type']==t]
        test_labels = test_labels.merge(sub, how='left', on=['session'])
        test_labels['hits'] = test_labels.apply(lambda df: len(set(df.ground_truth).intersection(set(df.labels))), axis=1)
        test_labels['gt_count'] = test_labels.ground_truth.str.len().clip(0,20)
        recall = test_labels['hits'].sum() / test_labels['gt_count'].sum()
        score += weights[t]*recall
        print(f'{t} recall =',recall)
        
    print('=============')
    print('Overall Recall =',score)
    print('=============')

clicks recall = 0.4746288024042827
carts recall = 0.37670560399110464
orders recall = 0.6320111840614357
Overall Recall = 0.5396812718746211
CPU times: user 47.3 s, sys: 471 ms, total: 47.8 s
Wall time: 47.6 s


: 

### Clicks
- 0.461
- 0.474 (current parameters - clicks)
### Carts
- 0.376
### Orders
- 0.632