In [1]:
import os, pickle, gc

import numpy as np
import pandas as pd

from tqdm import tqdm

from collections import defaultdict, Counter
import cudf, itertools

from pandarallel import pandarallel
pandarallel.initialize()

INFO: Pandarallel will run on 12 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


In [2]:
class config:
    data_path = '../data/'
    local_validation = False
    validation_path = '../data/local_validation/'
    train_file = 'train.parquet'
    test_file = 'test.parquet'
    test_labels_file = 'test_labels.parquet'
    n_session_samples = 100
    n_most_common = 50
    debug = True

In [3]:
if config.local_validation:
    train = cudf.read_parquet(config.validation_path + config.train_file)
    test = pd.read_parquet(config.validation_path + config.test_file)
    test_labels = cudf.read_parquet(config.validation_path + config.test_labels_file)
    data = cudf.concat([train])
else:
    train = cudf.read_parquet(config.data_path + config.train_file)
    test = cudf.read_parquet(config.data_path + config.test_file)
    data = cudf.concat([train, test])    

In [4]:
if config.debug:
    data = data.sample(frac=0.02, random_state=42)

In [5]:
type_labels = {'clicks':0, 'carts':1, 'orders':2}
type_weight = {0:1, 1:6, 2:3}

### Create co-visitation matrix on GPU using CuDF

### Carts Orders Co-visitation Matrix

In [6]:
version = 1
data_copy = data.copy()
data_copy = data_copy.set_index('session')
sessions = data_copy.index.unique()

# print len of sessions
print(len(sessions))

chunk_size = 100_000

tmp = list()
for i in tqdm(range(0, sessions.shape[0], chunk_size)):
    df = data_copy.loc[sessions[i]:sessions[min(sessions.shape[0]-1, i+chunk_size-1)]].reset_index()
    df = df.sort_values(['session','ts'],ascending=[True, False])

    # USE TAIL OF SESSION
    df = df.reset_index(drop=True)
    df['n'] = df.groupby('session').cumcount()
    # take only the first 30 rows (tail) of each session
    df = df.loc[df.n<30].drop('n', axis=1)

    # CREATE PAIRS
    df = df.merge(df,on='session')
    df = df.loc[ ((df.ts_x - df.ts_y).abs() < 24 * 60 * 60) & (df.aid_x != df.aid_y) ]

    # ASSIGN WEIGHTS
    df = df[['session', 'aid_x', 'aid_y','type_y']].drop_duplicates(['session', 'aid_x', 'aid_y'])
    df['wgt'] = df.type_y.map(type_weight)
    df = df[['aid_x','aid_y','wgt']]
    df.wgt = df.wgt.astype('float32')
    df = df.groupby(['aid_x','aid_y']).wgt.sum()

    tmp.append(df.reset_index())

    del df
    gc.collect()

tmp = list(map(lambda x: x.to_pandas(), tmp))
tmp = pd.concat(tmp)

# CONVERT MATRIX TO DICTIONARY
tmp = tmp.sort_values(['aid_x','wgt'],ascending=[True,False])
# SAVE TOP 40
tmp = tmp.reset_index(drop=True)
tmp['n'] = tmp.groupby('aid_x').aid_y.cumcount()
# we only select 15 products for each aid_x
tmp = tmp.loc[tmp.n<50].drop('n',axis=1)
# SAVE TO DISK
df = tmp.groupby('aid_x').aid_y.parallel_apply(list)
with open(config.data_path + f'top_15_carts_orders_v{version}.pkl', 'wb') as f:
    pickle.dump(df.to_dict(), f)

14571582


100%|██████████| 146/146 [01:09<00:00,  2.10it/s]


### Buy2Buy Co-visitation matrix

In [7]:
%%time
# data_copy = data.copy()
# data_copy = data_copy.set_index('session')
# sessions = data_copy.index.unique()

# chunk_size = 100_000

# tmp = list()
# for i in tqdm(range(0, sessions.shape[0], chunk_size)):
#     df = data_copy.loc[sessions[i]:sessions[min(sessions.shape[0]-1, i+chunk_size-1)]].reset_index()
#     df = df.loc[df.type.isin([1, 2])]

#     df = df.sort_values(['session','ts'], ascending=[True, False])

#     # USE TAIL OF SESSION
#     df = df.reset_index(drop=True)
#     df['n'] = df.groupby('session').cumcount()
#     df = df.loc[df.n<30].drop('n',axis=1)

#     # CREATE PAIRS
#     df = df.merge(df, on='session')
#     df = df.loc[ ((df.ts_x - df.ts_y).abs() < 14 * 24 * 60 * 60) & (df.aid_x != df.aid_y) ]

#     # ASSIGN WEIGHTS
#     df = df[['session', 'aid_x', 'aid_y','type_y']].drop_duplicates(['session', 'aid_x', 'aid_y'])
#     df['wgt'] = 1
#     df = df[['aid_x','aid_y','wgt']]
#     df.wgt = df.wgt.astype('float32')
#     df = df.groupby(['aid_x','aid_y']).wgt.sum()

#     tmp.append(df.reset_index())
    
#     del df
#     gc.collect()

# tmp = list(map(lambda x: x.to_pandas(), tmp))
# tmp = pd.concat(tmp)
# # CONVERT MATRIX TO DICTIONARY
# tmp = tmp.sort_values(['aid_x','wgt'],ascending=[True,False])
# # SAVE TOP 15
# tmp = tmp.reset_index(drop=True)
# tmp['n'] = tmp.groupby('aid_x').aid_y.cumcount()
# tmp = tmp.loc[tmp.n<50].drop('n',axis=1)
# # SAVE TO DISK
# df = tmp.groupby('aid_x').aid_y.parallel_apply(list)
with open(config.data_path + f'top_15_buy2buy_v{version}.pkl', 'wb') as f:
    pickle.dump(df.to_dict(), f)

100%|██████████| 146/146 [00:52<00:00,  2.79it/s]


CPU times: user 1min 7s, sys: 4.08 s, total: 1min 11s
Wall time: 1min 20s


### Clicks Co-visitation matrix

In [8]:
%%time
# data_copy = data.copy()
# data_copy = data_copy.set_index('session')
# sessions = data_copy.index.unique()

# chunk_size = 100_000

# tmp = list()
# for i in tqdm(range(0, sessions.shape[0], chunk_size)):
#     df = data_copy.loc[sessions[i]:sessions[min(sessions.shape[0]-1, i+chunk_size-1)]].reset_index()
#     df = df.sort_values(['session','ts'], ascending=[True, False])

#     # USE TAIL OF SESSION
#     df = df.reset_index(drop=True)
#     df['n'] = df.groupby('session').cumcount()
#     df = df.loc[df.n<30].drop('n',axis=1)

#     # CREATE PAIRS
#     df = df.merge(df,on='session')
#     df = df.loc[ ((df.ts_x - df.ts_y).abs()< 24 * 60 * 60) & (df.aid_x != df.aid_y) ]

#     # ASSIGN WEIGHTS
#     df = df[['session', 'aid_x', 'aid_y','ts_x']].drop_duplicates(['session', 'aid_x', 'aid_y'])
#     df['wgt'] = 1 + 3*(df.ts_x - 1659304800)/(1662328791-1659304800)
#     df = df[['aid_x','aid_y','wgt']]
#     df.wgt = df.wgt.astype('float32')
#     df = df.groupby(['aid_x','aid_y']).wgt.sum()

#     tmp.append(df.reset_index())

#     del df
#     gc.collect()

# tmp = list(map(lambda x: x.to_pandas(), tmp))
# tmp = pd.concat(tmp)
# # CONVERT MATRIX TO DICTIONARY
# tmp = tmp.sort_values(['aid_x','wgt'],ascending=[True,False])
# # SAVE TOP 40
# tmp = tmp.reset_index(drop=True)
# tmp['n'] = tmp.groupby('aid_x').aid_y.cumcount()
# tmp = tmp.loc[tmp.n<50].drop('n',axis=1)
# # SAVE TO DISK
# df = tmp.groupby('aid_x').aid_y.parallel_apply(list)
with open(config.data_path + f'top_20_clicks_v{version}.pkl', 'wb') as f:
    pickle.dump(df.to_dict(), f)

100%|██████████| 146/146 [01:07<00:00,  2.16it/s]


CPU times: user 4min 8s, sys: 19.8 s, total: 4min 27s
Wall time: 4min 39s


In [9]:
%%time
# LOAD THREE CO-VISITATION MATRICES
top_20_clicks = pd.read_pickle(config.data_path + f'top_20_clicks_v{version}.pkl')
top_15_buys = pd.read_pickle(config.data_path + f'top_15_carts_orders_v{version}.pkl')
top_15_buy2buy = pd.read_pickle(config.data_path + f'top_15_buy2buy_v{version}.pkl')

# TOP CLICKS AND ORDERS IN TEST
top_clicks = test.loc[test['type']==type_labels['clicks'],'aid'].value_counts().index.values[:20]
top_orders = test.loc[test['type']==type_labels['orders'],'aid'].value_counts().index.values[:20]

# print shape of each matrix
print(f'clicks: {len(top_20_clicks)}')
print(f'carts: {len(top_15_buys)}')
print(f'buy2buy: {len(top_15_buy2buy)}')

clicks: 1837166
carts: 1837166
buy2buy: 1168768
CPU times: user 13.3 s, sys: 1.54 s, total: 14.8 s
Wall time: 16.6 s


In [10]:
def suggest_clicks(df, top_20_clicks, top_clicks):
    products = df.aid.tolist()
    types = df.type.tolist()
    unique_products = list(dict.fromkeys(products[::-1] ))

    if len(unique_products) >= 20:
        weights = np.logspace(0.1, 1, len(products), base=2, endpoint=True) - 1
        products_tmp = Counter()

        for product, weight, _type in zip(products, weights, types):
            products_tmp[product] += weight * type_weight[_type]
        
        sorted_products = [product for product, _ in products_tmp.most_common(50)]
        return sorted_products
    else:
        products_1 = list(itertools.chain(*[top_20_clicks[product] \
                        for product in unique_products if product in top_20_clicks]))
        top_products_1 = [product for product, _ in Counter(products_1).most_common(50) \
                        if product not in unique_products]
        result = unique_products + top_products_1[:20 - len(unique_products)]
        return result + list(top_clicks[:20 - len(result)])

def suggest_buys(df, top_15_buy2buy, top_15_buys, top_orders):
    products = df.aid.tolist()
    types = df.type.tolist()
    # filter df for type 1 and 2
    unique_products = list(dict.fromkeys(products[::-1] ))
    df = df.loc[(df['type']==1)|(df['type']==2)]
    unique_buys = list(dict.fromkeys(df.aid.tolist()[::-1]))

    if len(unique_products) >= 20:
        weights = np.logspace(0.5, 1, len(products), base=2, endpoint=True) - 1
        products_tmp = Counter()
        for product, weight, _type in zip(products, weights, types):
            products_tmp[product] += weight * type_weight[_type]
        products_1 = list(itertools.chain(*[top_15_buy2buy.get(product, []) \
                        for product in unique_buys if product in top_15_buy2buy]))
        for product in products_1: products_tmp[product] += 0.1
        sorted_products = [product for product, _ in products_tmp.most_common(50)]
        return sorted_products
    else:
        products_1 = list(itertools.chain(*[top_15_buys.get(product, []) \
                          for product in unique_products if product in top_15_buys]))
        products_2 = list(itertools.chain(*[top_15_buy2buy.get(product, []) \
                          for product in unique_buys if product in top_15_buy2buy]))
        top_products = [product for product, _ in Counter(products_1 + products_2).most_common(50) \
                        if product not in unique_products]
        result = unique_products + top_products[:20 - len(unique_products)]
        return result + list(top_orders[:20 - len(result)])

In [11]:
%%time
if not config.local_validation:
    test = test.to_pandas()

pred_df_clicks = test.sort_values(["session", "ts"]).groupby(["session"]).parallel_apply(
    lambda x: suggest_clicks(x, top_20_clicks, top_clicks)
)

pred_df_buys = test.sort_values(["session", "ts"]).groupby(["session"]).parallel_apply(
    lambda x: suggest_buys(x, top_15_buy2buy, top_15_buys, top_orders)
)

CPU times: user 12min 10s, sys: 1.42 s, total: 12min 12s
Wall time: 12min 11s


In [12]:
clicks_pred_df = pd.DataFrame(pred_df_clicks.add_suffix("_clicks"), columns=["labels"]).reset_index()
orders_pred_df = pd.DataFrame(pred_df_buys.add_suffix("_orders"), columns=["labels"]).reset_index()
carts_pred_df = pd.DataFrame(pred_df_buys.add_suffix("_carts"), columns=["labels"]).reset_index()

In [13]:
pred_df = pd.concat([clicks_pred_df, orders_pred_df, carts_pred_df])
pred_df.columns = ["session_type", "labels"]
pred_df["labels"] = pred_df.labels.parallel_apply(lambda x: " ".join(map(str,x)))
pred_df.to_csv(config.data_path + 'submission.csv', index=False)
pred_df.head()

Unnamed: 0,session_type,labels
0,12899779_clicks,59625 1253524 737445 1660529 94230 742709 6205...
1,12899780_clicks,1142000 736515 973453 582732 487136 1502122 17...
2,12899781_clicks,918667 199008 194067 57315 141736 1460571 1681...
3,12899782_clicks,834354 595994 740494 889671 987399 779477 1344...
4,12899783_clicks,1817895 607638 1754419 1216820 1729553 300127 ...


In [14]:
%%time
# COMPUTE METRIC

if config.local_validation:
    score = 0
    weights = {'clicks': 0.10, 'carts': 0.30, 'orders': 0.60}
    for t in ['clicks','carts','orders']:
        sub = pred_df.loc[pred_df.session_type.str.contains(t)].copy()
        sub['session'] = sub.session_type.parallel_apply(lambda x: int(x.split('_')[0]))
        sub.labels = sub.labels.parallel_apply(lambda x: [int(i) for i in x.split(' ')[:20]])
        test_labels = pd.read_parquet(config.validation_path + 'test_labels.parquet')
        test_labels = test_labels.loc[test_labels['type']==t]
        test_labels = test_labels.merge(sub, how='left', on=['session'])
        test_labels['hits'] = test_labels.parallel_apply(lambda df: len(set(df.ground_truth).intersection(set(df.labels))), axis=1)
        test_labels['gt_count'] = test_labels.ground_truth.str.len().clip(0,20)
        recall = test_labels['hits'].sum() / test_labels['gt_count'].sum()
        score += weights[t]*recall
        print(f'{t} recall =',recall)
        
    print('=============')
    print('Overall Recall =',score)
    print('=============')

CPU times: user 3 µs, sys: 0 ns, total: 3 µs
Wall time: 5.25 µs


## Clicks
- 0.461
- 0.474 (current parameters - clicks)
### Carts
- 0.376
### Orders
- 0.632