In [1]:
import os, pickle, gc

import numpy as np
import pandas as pd

from tqdm import tqdm

from collections import defaultdict, Counter

import cudf, itertools

In [12]:
# train = cudf.read_parquet('../data/train.parquet')
test = pd.read_parquet('../data/test.parquet')

# data = cudf.concat([train, test])

In [7]:
type_labels = {'clicks':0, 'carts':1, 'orders':2}
type_weight = {0:1, 1:6, 2:3}

### Create co-visitation matrix on GPU using CuDF

In [7]:
version = 1
data_copy = data.copy()
data_copy = data_copy.set_index('session')
sessions = data_copy.index.unique()

# print len of sessions
print(len(sessions))

chunk_size = 100_000

tmp = list()
for i in tqdm(range(0, sessions.shape[0], chunk_size)):
    df = data_copy.loc[sessions[i]:sessions[min(sessions.shape[0]-1, i+chunk_size-1)]].reset_index()
    df = df.sort_values(['session','ts'],ascending=[True, False])

    # USE TAIL OF SESSION
    df = df.reset_index(drop=True)
    df['n'] = df.groupby('session').cumcount()
    df = df.loc[df.n<30].drop('n', axis=1)

    # CREATE PAIRS
    df = df.merge(df,on='session')
    df = df.loc[ ((df.ts_x - df.ts_y).abs()< 24 * 60 * 60) & (df.aid_x != df.aid_y) ]

    # ASSIGN WEIGHTS
    df = df[['session', 'aid_x', 'aid_y','type_y']].drop_duplicates(['session', 'aid_x', 'aid_y'])
    df['wgt'] = df.type_y.map(type_weight)
    df = df[['aid_x','aid_y','wgt']]
    df.wgt = df.wgt.astype('float32')
    df = df.groupby(['aid_x','aid_y']).wgt.sum()

    tmp.append(df.reset_index())

    del df
    gc.collect()

14571582


100%|██████████| 146/146 [01:16<00:00,  1.91it/s]


### Offload the final processing on CPU using Pandas

In [8]:
tmp = list(map(lambda x: x.to_pandas(), tmp))
tmp = pd.concat(tmp)

# CONVERT MATRIX TO DICTIONARY
tmp = tmp.sort_values(['aid_x','wgt'],ascending=[True,False])
# SAVE TOP 40
tmp = tmp.reset_index(drop=True)
tmp['n'] = tmp.groupby('aid_x').aid_y.cumcount()
tmp = tmp.loc[tmp.n<15].drop('n',axis=1)
# SAVE TO DISK
df = tmp.groupby('aid_x').aid_y.apply(list)
print(df.shape)
with open(f'../data/top_15_carts_orders_v{version}.pkl', 'wb') as f:
    pickle.dump(df.to_dict(), f)

(1837166,)


In [12]:
%%time
version = 1
data_copy = data.copy()
data_copy = data_copy.set_index('session')
sessions = data_copy.index.unique()

chunk_size = 100_000

tmp = list()
for i in tqdm(range(0, sessions.shape[0], chunk_size)):
    df = data_copy.loc[sessions[i]:sessions[min(sessions.shape[0]-1, i+chunk_size-1)]].reset_index()
    df = df.loc[df.type.isin([1, 2])]

    df = df.sort_values(['session','ts'], ascending=[True, False])

    # USE TAIL OF SESSION
    df = df.reset_index(drop=True)
    df['n'] = df.groupby('session').cumcount()
    df = df.loc[df.n<30].drop('n',axis=1)

    # CREATE PAIRS
    df = df.merge(df, on='session')
    df = df.loc[ ((df.ts_x - df.ts_y).abs()< 14 * 24 * 60 * 60) & (df.aid_x != df.aid_y) ]

    # ASSIGN WEIGHTS
    df = df[['session', 'aid_x', 'aid_y','type_y']].drop_duplicates(['session', 'aid_x', 'aid_y'])
    df['wgt'] = 1
    df = df[['aid_x','aid_y','wgt']]
    df.wgt = df.wgt.astype('float32')
    df = df.groupby(['aid_x','aid_y']).wgt.sum()

    tmp.append(df.reset_index())
    
    del df
    gc.collect()

tmp = list(map(lambda x: x.to_pandas(), tmp))
tmp = pd.concat(tmp)
# CONVERT MATRIX TO DICTIONARY
tmp = tmp.sort_values(['aid_x','wgt'],ascending=[True,False])
# SAVE TOP 15
tmp = tmp.reset_index(drop=True)
tmp['n'] = tmp.groupby('aid_x').aid_y.cumcount()
tmp = tmp.loc[tmp.n<15].drop('n',axis=1)
# SAVE TO DISK
df = tmp.groupby('aid_x').aid_y.apply(list)
with open(f'../data/top_15_buy2buy_v{version}.pkl', 'wb') as f:
    pickle.dump(df.to_dict(), f)

100%|██████████| 146/146 [00:57<00:00,  2.54it/s]


CPU times: user 1min 11s, sys: 3.72 s, total: 1min 14s
Wall time: 1min 23s


In [4]:
%%time
version = 1
data_copy = data.copy()
data_copy = data_copy.set_index('session')
sessions = data_copy.index.unique()

chunk_size = 100_000

tmp = list()
for i in tqdm(range(0, sessions.shape[0], chunk_size)):
    df = data_copy.loc[sessions[i]:sessions[min(sessions.shape[0]-1, i+chunk_size-1)]].reset_index()
    df = df.sort_values(['session','ts'], ascending=[True, False])

    # USE TAIL OF SESSION
    df = df.reset_index(drop=True)
    df['n'] = df.groupby('session').cumcount()
    df = df.loc[df.n<30].drop('n',axis=1)

    # CREATE PAIRS
    df = df.merge(df,on='session')
    df = df.loc[ ((df.ts_x - df.ts_y).abs()< 14 * 24 * 60 * 60) & (df.aid_x != df.aid_y) ]

    # ASSIGN WEIGHTS
    df = df[['session', 'aid_x', 'aid_y','ts_x']].drop_duplicates(['session', 'aid_x', 'aid_y'])
    df['wgt'] = 1 + 3*(df.ts_x - 1659304800)/(1662328791-1659304800)
    df = df[['aid_x','aid_y','wgt']]
    df.wgt = df.wgt.astype('float32')
    df = df.groupby(['aid_x','aid_y']).wgt.sum()

    tmp.append(df.reset_index())

    del df
    gc.collect()

tmp = list(map(lambda x: x.to_pandas(), tmp))
tmp = pd.concat(tmp)
# CONVERT MATRIX TO DICTIONARY
tmp = tmp.sort_values(['aid_x','wgt'],ascending=[True,False])
# SAVE TOP 40
tmp = tmp.reset_index(drop=True)
tmp['n'] = tmp.groupby('aid_x').aid_y.cumcount()
tmp = tmp.loc[tmp.n<20].drop('n',axis=1)
# SAVE TO DISK
df = tmp.groupby('aid_x').aid_y.apply(list)
with open(f'../data/top_20_clicks_v{version}.pkl', 'wb') as f:
    pickle.dump(df.to_dict(), f)

100%|██████████| 146/146 [01:48<00:00,  1.35it/s]


In [15]:
%%time
# LOAD THREE CO-VISITATION MATRICES

VER = 1

top_20_clicks = pd.read_pickle(f'../data/top_20_clicks_v{VER}.pkl')
top_15_buys = pd.read_pickle(f'../data/top_15_carts_orders_v{VER}.pkl')
top_15_buy2buy = pd.read_pickle(f'../data/top_15_buy2buy_v{VER}.pkl')

# TOP CLICKS AND ORDERS IN TEST
top_clicks = test.loc[test['type']==0,'aid'].value_counts().index.values[:20]
top_orders = test.loc[test['type']==2,'aid'].value_counts().index.values[:20]

# print shape of each matrix
print(f'clicks: {len(top_20_clicks)}')
print(f'carts: {len(top_15_buys)}')
print(f'buy2buy: {len(top_15_buy2buy)}')

clicks: 1842149
carts: 1837166
buy2buy: 1168768
CPU times: user 2.82 s, sys: 722 ms, total: 3.54 s
Wall time: 4.34 s


In [16]:
def suggest_clicks(df, top_20_clicks, top_clicks):
    products = df.aid.tolist()
    types = df.type.tolist()
    unique_products = list(set(products))

    if len(unique_products) >= 20:
        weights = np.logspace(0.1, 1, len(products), base=2, endpoint=True) - 1
        products_tmp = Counter()

        for product, weight, type in zip(products, weights, types):
            products_tmp[product] += weight * type_weight[type]
        
        sorted_products = [product for product, _ in products_tmp.most_common(20)]
        return sorted_products
    else:
        products = list(itertools.chain(*[top_20_clicks.get(product, []) \
                        for product in unique_products if product in top_20_clicks]))
        top_products = [product for product, _ in Counter(products).most_common(20) \
                        if product not in unique_products]
        result = unique_products + top_products[:20 - len(unique_products)]
        return result + list(top_clicks[:20 - len(result)])

def suggest_buys(df, top_15_buy2buy, top_15_buys, top_orders):
    products = df.aid.tolist()
    types = df.type.tolist()
    # filter df for type 1 and 2
    df = df.loc[df.type.isin([1,2])]
    unique_products = list(set(products))

    if len(unique_products) >= 20:
        weights = np.logspace(0.1, 1, len(products), base=2, endpoint=True) - 1
        products_tmp = Counter()

        for product, weight, type in zip(products, weights, types):
            products_tmp[product] += weight * type_weight[type]

        products_1 = list(itertools.chain(*[top_15_buy2buy.get(product, []) \
                        for product in unique_products if product in top_15_buy2buy]))

        for product in products_1: products_tmp[product] += 0.1
        sorted_products = [product for product, _ in products_tmp.most_common(20)]
        return sorted_products
    else:
        products_1 = list(itertools.chain(*[top_15_buys.get(product, []) \
                          for product in unique_products if product in top_15_buys]))
        products_2 = list(itertools.chain(*[top_15_buy2buy.get(product, []) \
                          for product in unique_products if product in top_15_buy2buy]))
        top_products = [product for product, _ in Counter(products_1 + products_2).most_common(20) \
                        if product not in unique_products]
        result = unique_products + top_products[:20 - len(unique_products)]
        return result + list(top_orders[:20 - len(result)])


In [17]:
%%time
pred_df_clicks = test.sort_values(["session", "ts"]).groupby(["session"]).apply(
    lambda x: suggest_clicks(x, top_20_clicks, top_clicks)
)

pred_df_buys = test.sort_values(["session", "ts"]).groupby(["session"]).apply(
    lambda x: suggest_buys(x, top_15_buy2buy, top_15_buys, top_orders)
)

CPU times: user 8min 24s, sys: 978 ms, total: 8min 25s
Wall time: 8min 25s


In [18]:
clicks_pred_df = pd.DataFrame(pred_df_clicks.add_suffix("_clicks"), columns=["labels"]).reset_index()
orders_pred_df = pd.DataFrame(pred_df_buys.add_suffix("_orders"), columns=["labels"]).reset_index()
carts_pred_df = pd.DataFrame(pred_df_buys.add_suffix("_carts"), columns=["labels"]).reset_index()

In [19]:
pred_df = pd.concat([clicks_pred_df, orders_pred_df, carts_pred_df])
pred_df.columns = ["session_type", "labels"]
pred_df["labels"] = pred_df.labels.apply(lambda x: " ".join(map(str,x)))
pred_df.to_csv("submission.csv", index=False)
pred_df.head()

Unnamed: 0,session_type,labels
0,12899779_clicks,59625 1253524 666350 1740050 1124932 94230 369...
1,12899780_clicks,1142000 736515 582732 973453 1502122 487136 17...
2,12899781_clicks,199008 57315 141736 918667 194067 1460571 1681...
3,12899782_clicks,834354 595994 740494 889671 987399 779477 1344...
4,12899783_clicks,255297 1114789 1817895 198385 1729553 1754419 ...


In [20]:
pred_df.shape

(5015409, 2)