In [1]:
import numpy as np
import pandas as pd
import os, glob
from pathlib import Path
from tqdm import tqdm
from collections import defaultdict
import math
from operator import itemgetter
from functions.itemCF import itemCFTrain, ItemMatrix_fn, ItemSimilarityMatrix_fn, recommend

In [2]:
# Integer code for each event type; load_test() maps the string `type`
# column of every parquet chunk through this dict.
type_labels = {'clicks':0, 'carts':1, 'orders':2}

In [22]:
# Selects which input data set is loaded below (1, 2 or 3) and the
# `set_{SET}` sub-directory in which the CF matrices are stored.
SET = 3
# predType = 'orders'

In [23]:
def load_test(path):
    """Load every parquet file matching ``path`` into a single DataFrame.

    Parameters
    ----------
    path : str
        Glob pattern of the parquet chunks to read.

    Returns
    -------
    pandas.DataFrame
        All chunks concatenated in sorted file-name order with a fresh
        0..n-1 index. ``ts`` is divided by 1000 and stored as ``int32``;
        ``type`` is mapped through the module-level ``type_labels`` dict
        and stored as ``int8``.

    Raises
    ------
    ValueError
        If no file matches ``path``.
    """
    # glob.glob returns matches in arbitrary order; sort them so the row
    # order of the result is reproducible across machines and runs.
    chunk_files = sorted(glob.glob(path))
    if not chunk_files:
        # pd.concat([]) would fail with an opaque "No objects to concatenate".
        raise ValueError(f'No parquet files match pattern: {path!r}')

    dfs = []
    for chunk_file in chunk_files:
        chunk = pd.read_parquet(chunk_file)
        chunk['ts'] = (chunk['ts'] / 1000).astype('int32')
        chunk['type'] = chunk['type'].map(type_labels).astype('int8')
        dfs.append(chunk)
    return pd.concat(dfs).reset_index(drop=True)

In [24]:
# Load the event log selected by SET.
if SET == 1:
    test_df = load_test('../input/split_2-1_pqt/test_parquets/*')
elif SET == 2:
    # NOTE(review): this file is read directly, without load_test()'s ts/type
    # conversion -- confirm it is already stored in the converted form.
    test_df = pd.read_parquet('../input/splited/test.parquet')
elif SET == 3:
    test_df = load_test('../input/parquets/test_parquets/*')
else:
    # Fail loudly instead of silently reusing a stale `test_df` from an
    # earlier run (or hitting a NameError on a fresh kernel).
    raise ValueError(f'Unsupported SET value: {SET!r} (expected 1, 2 or 3)')
test_df

Unnamed: 0,session,aid,ts,type
0,12899779,59625,1661724000,0
1,12899780,1142000,1661724000,0
2,12899780,582732,1661724058,0
3,12899780,973453,1661724109,0
4,12899780,736515,1661724136,0
...,...,...,...,...
6928118,13099776,1159407,1661844072,0
6928119,13099776,546448,1661844142,0
6928120,13099777,468584,1661795832,0
6928121,13099778,926609,1661795832,0


In [25]:
# Disabled experiment (kept for reference, not executed): restrict the
# loaded events to the event types selected by `predType` before training.
# if predType == 'carts': 
#     test_df = test_df[test_df.type.isin([1, 2])].reset_index(drop=True)
# elif predType == 'orders':
#     test_df = test_df[test_df.type == 2].reset_index(drop=True)

In [26]:
# Build the item-based CF artefacts from the loaded events with the helpers
# imported from functions.itemCF.
#   uidict        -- its keys are used later as the session ids to recommend for.
#   N             -- mapping whose values are used later to pick the 20 top-ranked
#                    items. NOTE(review): presumably per-item occurrence counts --
#                    confirm in functions/itemCF.
#   itemSimMatrix -- passed to recommend() together with uidict.
uidict = itemCFTrain(test_df)
itemMatrix, N = ItemMatrix_fn(uidict)
itemSimMatrix = ItemSimilarityMatrix_fn(itemMatrix, N)

100%|██████████| 65480/65480 [00:01<00:00, 57088.38it/s]
100%|██████████| 65480/65480 [00:00<00:00, 3216450.11it/s]
100%|██████████| 35849/35849 [00:00<00:00, 587731.12it/s]
100%|██████████| 42709/42709 [00:00<00:00, 635641.32it/s]
100%|██████████| 42709/42709 [00:00<00:00, 927034.22it/s]


In [27]:
# Persist the CF artefacts so the recommendation step can be run without
# retraining.
save_path = Path(f'../output/newSplited/cf_matrix/set_{SET}')
# exist_ok=True keeps the cell re-runnable when the directory already exists,
# while any other failure (e.g. missing permissions) raises here instead of
# being printed and then resurfacing as a confusing np.save error.
save_path.mkdir(parents=True, exist_ok=True)

# Non-array Python objects (e.g. dicts) are stored by np.save as pickled
# object arrays.
np.save(save_path / 'uidict.npy', uidict)
np.save(save_path / 'itemMatrix.npy', itemMatrix)
np.save(save_path / 'N.npy', N)
np.save(save_path / 'itemSimMatrix.npy', itemSimMatrix)

## Recommend

In [5]:
# Reload the CF artefacts written by the training section.
save_path = Path(f'../output/newSplited/cf_matrix/set_{SET}')
# allow_pickle must be a real boolean: the former string 'TRUE' only worked
# because any non-empty string is truthy (so would 'FALSE').
# SECURITY: unpickling can execute arbitrary code -- only load files that this
# notebook produced itself.
# .item() unwraps the 0-d object array back into the stored Python object.
uidict = np.load(save_path / 'uidict.npy', allow_pickle=True).item()
itemMatrix = np.load(save_path / 'itemMatrix.npy', allow_pickle=True).item()
N = np.load(save_path / 'N.npy', allow_pickle=True).item()
itemSimMatrix = np.load(save_path / 'itemSimMatrix.npy', allow_pickle=True).item()

In [23]:
# Rank the entries of N by value (highest first) and keep the keys of the
# first 20 as the `pop` list handed to recommend().
ranked_items = sorted(N.items(), key=itemgetter(1), reverse=True)
pop = [item for item, _value in ranked_items[:20]]
# Every key of uidict gets a recommendation list.
users = [*uidict]
re_items = list()

In [24]:
# Build the whole list in this cell so that re-running it cannot append a
# second copy of every result to a list created in another cell; `re_items`
# always ends up aligned one-to-one with `users`.
re_items = [
    list(recommend(uidict, itemSimMatrix, user, pop))
    for user in tqdm(users)
]
# NOTE(review): np.save converts the list of lists to an array, which assumes
# every recommendation list has the same length -- confirm recommend() always
# returns a fixed number of items.
np.save(save_path / 're_items.npy', re_items)

100%|██████████| 1801251/1801251 [12:33<00:00, 2390.50it/s]


In [45]:
# One row per session id, with that session's recommended items in `labels`.
submission_columns = {'session': users, 'labels': re_items}
sub = pd.DataFrame(submission_columns)
sub

Unnamed: 0,session,labels
0,11098528,"[11830, 588923, 1732105, 630636, 828829, 87612..."
1,11098529,"[1105029, 528290, 333991, 1384035, 952682, 139..."
2,11098530,"[409236, 264500, 639815, 983539, 364155, 58302..."
3,11098531,"[1271998, 1365569, 1728212, 1557766, 452188, 3..."
4,11098532,"[7651, 876469, 1273333, 1402537, 358039, 97028..."
...,...,...
1801246,12899774,"[33035, 356112, 433555, 1228185, 100048, 74521..."
1801247,12899775,"[1743151, 1414967, 1163166, 1022572, 955514, 1..."
1801248,12899776,"[548599, 487078, 1599137, 1259052, 1045568, 26..."
1801249,12899777,"[384045, 1308634, 479563, 1838401, 1494551, 51..."


In [47]:
# Recall@TOP_K per event type:
#   sum(hits) / sum(min(len(ground_truth), TOP_K))
# computed over the labelled sessions for which `sub` holds a prediction.
TOP_K = 20
predTypes = ['clicks', 'carts', 'orders']

# Loop-invariant: read the ground truth once instead of once per type.
test_labels = pd.read_parquet('../input/splited/test_labels.parquet')
# Merge against a two-column view so `sub` itself is never mutated.
predictions = sub[['session', 'labels']]

recall20 = {}
for predType in predTypes:
    # Deliberately NOT named `type_labels`: that name is the type->code dict
    # read by load_test(), and rebinding it here would break any later call.
    labels_of_type = test_labels[test_labels['type'] == predType]
    # Rows are already restricted to one type, so joining on `session` alone
    # is equivalent to joining on (session, type).
    # NOTE(review): dropna() removes labelled sessions without a prediction,
    # so their ground truth does not count in the denominator -- confirm that
    # this is the intended metric.
    labels_of_type = labels_of_type.merge(predictions, how='left', on='session').dropna()
    hits = [
        len(set(ground_truth).intersection(labels[:TOP_K]))
        for ground_truth, labels in zip(labels_of_type['ground_truth'],
                                        labels_of_type['labels'])
    ]
    gt_count = labels_of_type['ground_truth'].str.len().clip(0, TOP_K)
    recall20[predType] = sum(hits) / gt_count.sum()
recall20

{'clicks': 0.4109450457809419,
 'carts': 0.3526597534701864,
 'orders': 0.6019412517594789}

In [48]:
# Weighted sum of the per-type recalls: clicks 0.1, carts 0.3, orders 0.6.
clicks_part = recall20['clicks'] * 0.1
carts_part = recall20['carts'] * 0.3
orders_part = recall20['orders'] * 0.6
score = clicks_part + carts_part + orders_part
score

0.5080571816748374