In [24]:
import gc
import json
import os
import pickle


from implicit.nearest_neighbours import ItemItemRecommender, BM25Recommender, TFIDFRecommender, bm25_weight
import numpy as np
import pandas as pd
import scipy.sparse as sp
from tqdm.auto import tqdm

# Directory with the raw inputs (this notebook reads test_users.csv from it).
RAW_DATA_PATH = './raw_data/'
# Directory with the intermediate step0/step1/step3 artifacts read below and
# the step4 pair files written at the end of this notebook.
DATA_PATH = './data'
# NOTE(review): RANDOM_STATE and TOP_N are not referenced anywhere in the
# visible part of this notebook — confirm whether they are still needed.
RANDOM_STATE = 42
TOP_N = 100

# Data

In [2]:
# Load the step-1 training interaction tables (clicks, likes, shares) from HDF5.
# NOTE(review): only train_clicks is used in the visible part of this notebook.
# NOTE(review): `index=None` is not a documented `pd.read_hdf` parameter — confirm it is needed.
train_clicks = pd.read_hdf(os.path.join(DATA_PATH, 'step1_enriched_train_clicks.h5'), index=None, key='step1')
train_likes = pd.read_hdf(os.path.join(DATA_PATH, 'step1_enriched_train_likes.h5'), index=None, key='step1')
train_shares = pd.read_hdf(os.path.join(DATA_PATH, 'step1_enriched_train_shares.h5'), index=None, key='step1')

In [3]:
# Validation clicks; further down they get a constant `y = 1` column and serve
# as the positive labels for the candidate pairs.
val_clicks = pd.read_hdf(os.path.join(DATA_PATH, 'step1_val_clicks.h5'), index=None, key='step1')

In [4]:
# Users to produce test candidates for; only the `user_id` column is read below.
test_users = pd.read_csv(os.path.join(RAW_DATA_PATH, 'test_users.csv'))

In [5]:
# Each JSON file stores its user ids under the 'users' key; they are loaded
# into sets for fast membership tests.
with open(os.path.join(DATA_PATH, 'val1_users.json'), 'r') as f:
    val1_users = set(json.load(f)['users'])
    
with open(os.path.join(DATA_PATH, 'val2_users.json'), 'r') as f:
    val2_users = set(json.load(f)['users'])
    
# Union of both validation user groups.
val_users = val1_users | val2_users

In [6]:
# Id mapping tables: column `old` holds the original id, column `new` the
# remapped id used to index the ALS model.
user_mapping = pd.read_csv(os.path.join(DATA_PATH, 'step0_user_mapping.csv'))
picture_mapping = pd.read_csv(os.path.join(DATA_PATH, 'step0_picture_mapping.csv'))

# old id -> new id lookups (the first `new` value wins if an `old` id repeats).
user_mapping_dict = user_mapping.groupby('old')['new'].first().to_dict()
picture_mapping_dict = picture_mapping.groupby('old')['new'].first().to_dict()

# new id -> old id, used to translate model output back to original picture ids.
inv_picture_mapping_dict = {
    new_id: old_id for old_id, new_id in picture_mapping_dict.items()
}

# Candidates

In [7]:
def combine_candidates(val_users, candidates1, candidates2):
    """Merge two per-user candidate dicts into one de-duplicated dict.

    Parameters
    ----------
    val_users : iterable
        User ids to produce an entry for.
    candidates1, candidates2 : dict
        Map user id -> list of candidate items; a missing user counts as
        having no candidates in that source.

    Returns
    -------
    dict
        Map user id -> list holding the union of both candidate lists
        (no guaranteed order). Users with no candidates at all are omitted.
    """
    merged = {}
    for uid in tqdm(val_users):
        pooled = set(candidates1.get(uid, [])) | set(candidates2.get(uid, []))
        if not pooled:
            # Nothing from either source: leave the user out entirely.
            continue
        merged[uid] = list(pooled)
    return merged

In [8]:
# als candidates
def get_als_candidates(
    val_users,
    train_clicks,
    als_model,
    already_candidates,
    N
):
    """Recommend up to ``N`` pictures per user with an ALS model and merge
    them with previously generated candidates.

    Reads the notebook-level lookups ``user_mapping_dict``,
    ``picture_mapping_dict`` (old id -> new id) and
    ``inv_picture_mapping_dict`` (new id -> old id).

    Parameters
    ----------
    val_users : iterable
        Original user ids to generate candidates for.
    train_clicks : pandas.DataFrame
        Training clicks with ``user_id`` and ``picture_id`` columns (original
        ids). Used to build the interaction matrix so that already-clicked
        pictures are filtered out of the recommendations.
    als_model : object
        Fitted model exposing ``user_factors`` and ``recommend``.
    already_candidates : dict
        Map user id -> list of candidates from earlier generators.
    N : int
        Number of recommendations requested per user.

    Returns
    -------
    dict
        Map user id -> list of original picture ids (union of
        ``already_candidates`` and the ALS recommendations).

    Raises
    ------
    ValueError
        If ``train_clicks`` contains ids absent from the mapping dicts.
    KeyError
        If a user in ``val_users`` is absent from ``user_mapping_dict``.
    """
    user_index_col = train_clicks['user_id'].map(user_mapping_dict)
    picture_index_col = train_clicks['picture_id'].map(picture_mapping_dict)
    if user_index_col.isna().any() or picture_index_col.isna().any():
        # Unmapped ids would otherwise reach the sparse constructor as NaN.
        raise ValueError('train_clicks contains user or picture ids missing from the id mappings')

    # picture x user click-count matrix (repeated clicks are summed).
    # `np.float` was removed from NumPy; `np.float64` is the same dtype.
    picture_user_matrix = sp.csr_matrix(
        (
            np.ones(len(train_clicks)),
            (picture_index_col.values, user_index_col.values),
        ),
        shape=(len(picture_mapping_dict) + 1, len(user_mapping_dict) + 1),
        dtype=np.float64,
    )
    # Transpose once, outside the loop, and keep it in CSR form so that the
    # per-user row lookup inside `recommend` is cheap.
    user_picture_matrix = picture_user_matrix.T.tocsr()

    # Prediction
    val_pred_als = dict()

    for user_id in tqdm(list(val_users)):
        user_index = user_mapping_dict[user_id]

        # A first latent factor of exactly 0 is treated as "no trained vector"
        # and the user is skipped.
        # NOTE(review): presumably these are users unseen during training — confirm.
        if als_model.user_factors[user_index][0] == 0:
            continue

        recommendations = als_model.recommend(
            user_index,
            user_picture_matrix,
            N,
            filter_already_liked_items=True,
        )
        # NOTE(review): assumes `recommend` yields (item_index, score) pairs —
        # confirm against the installed `implicit` version.
        val_pred_als[user_id] = [inv_picture_mapping_dict[rec[0]] for rec in recommendations]

    return combine_candidates(
        val_users=val_users,
        candidates1=already_candidates,
        candidates2=val_pred_als
    )

In [9]:
# Load the ALS model stored as 'step3_clicks_als.pkl'.
# SECURITY: pickle.load executes arbitrary code embedded in the file — only
# load artifacts produced by this pipeline, never files from untrusted sources.
with open(os.path.join(DATA_PATH, 'step3_clicks_als.pkl'), 'rb') as f:
    als_click_model = pickle.load(f)

HBox(children=(IntProgress(value=0, max=8275), HTML(value='')))




HBox(children=(IntProgress(value=0, max=8275), HTML(value='')))




HBox(children=(IntProgress(value=0, max=8275), HTML(value='')))




HBox(children=(IntProgress(value=0, max=8275), HTML(value='')))




HBox(children=(IntProgress(value=0, max=1380), HTML(value='')))




HBox(children=(IntProgress(value=0, max=1380), HTML(value='')))




# Pairs

In [13]:
def get_combine_pairs(candidates):
    """Flatten a ``user id -> candidate list`` dict into a long pair table.

    Parameters
    ----------
    candidates : dict
        Map user id -> list of picture ids.

    Returns
    -------
    pandas.DataFrame
        One row per (user, candidate picture) with columns ``user_id`` and
        ``picture_id``, in the dict's iteration order.
    """
    user_column, picture_column = [], []
    for user_id in tqdm(candidates):
        user_pictures = candidates[user_id]
        # Repeat the user id once per candidate so both columns stay aligned.
        user_column.extend(np.repeat([user_id], repeats=len(user_pictures)).tolist())
        picture_column.extend(user_pictures)

    return pd.DataFrame({'user_id': user_column, 'picture_id': picture_column})

HBox(children=(IntProgress(value=0, max=8011), HTML(value='')))




HBox(children=(IntProgress(value=0, max=8028), HTML(value='')))




HBox(children=(IntProgress(value=0, max=1380), HTML(value='')))




In [2]:
def generate_als_feature(pairs, model, add_factors, mode):
    """Attach ALS-based features to a table of (user, picture) pairs.

    Reads the notebook-level ``user_mapping`` and ``picture_mapping`` tables
    (columns ``old`` -> ``new``) to translate original ids into model indices.

    Parameters
    ----------
    pairs : pandas.DataFrame
        Must contain ``user_id`` and ``picture_id`` columns (original ids).
        The score column is written into this frame in place.
    model : object
        Fitted model exposing ``user_factors`` and ``item_factors`` arrays.
    add_factors : bool
        If true, also append the raw user factors, the raw picture factors and
        their element-wise products as separate columns.
    mode : str
        Prefix of the score column, which is named ``'<mode>_als_score'``.

    Returns
    -------
    pandas.DataFrame
        ``pairs`` itself when ``add_factors`` is false; otherwise a new frame
        consisting of ``pairs`` plus the factor columns.
    """
    user_factors = model.user_factors
    picture_factors = model.item_factors

    user_ids = pairs['user_id'].map(user_mapping.set_index('old')['new']).values
    picture_ids = pairs['picture_id'].map(picture_mapping.set_index('old')['new']).values

    pair_user_factors = user_factors[user_ids]
    pair_picture_factors = picture_factors[picture_ids]
    # Element-wise products in the factors' own float dtype. Forcing an
    # integer dtype here (the previous np.int16) is rejected by NumPy's
    # casting rules for float inputs, and would discard the fractional part
    # anyway.
    factor_products = pair_user_factors * pair_picture_factors

    # The ALS score of a pair is the dot product of its two factor vectors.
    pairs['{}_als_score'.format(mode)] = factor_products.sum(axis=1)

    if add_factors:
        def _factor_frame(values, prefix):
            # Reuse the index of `pairs` so the column-wise concat below lines
            # rows up by position even when `pairs` has a non-default index.
            return pd.DataFrame(
                values,
                index=pairs.index,
                columns=['{}_{}'.format(prefix, i) for i in range(values.shape[1])],
            )

        pairs = pd.concat(
            [
                pairs,
                _factor_frame(pair_user_factors, 'user_factor'),
                _factor_frame(pair_picture_factors, 'picture_factor'),
                _factor_frame(factor_products, 'user_picture_factor'),
            ],
            axis=1,
        )

    return pairs

In [32]:
# del pairs1
# del pairs2
# del pairs_test
# gc.collect()

79

# Save

In [None]:
# Every validation click is a positive example.
val_clicks['y'] = 1

# Split the positives by validation user group.
target1 = val_clicks[val_clicks['user_id'].isin(val1_users)]
target2 = val_clicks[val_clicks['user_id'].isin(val2_users)]

In [None]:
# val1: ALS candidates -> (user, picture) pairs -> ALS score -> click label.
candidates1 = get_als_candidates(
    list(val1_users),
    train_clicks,
    als_click_model,
    already_candidates={},
    N=2000
)
pairs1 = get_combine_pairs(candidates=candidates1)
pairs1 = generate_als_feature(pairs1, als_click_model, add_factors=False, mode='clicks')

cols = ['user_id', 'picture_id', 'y']
# De-duplicate the positives before joining: a (user, picture) click that
# occurs more than once would otherwise duplicate the matching candidate row
# in the left join. Pairs without a matching click get y = 0.
pairs1 = pairs1.merge(
    target1[cols].drop_duplicates(),
    on=['user_id', 'picture_id'],
    how='left',
).fillna(0)

pairs1.to_hdf(os.path.join(DATA_PATH, 'step4_val1_pairs.h5'), key='step4', mode='w')

# del pairs1
# gc.collect()

In [None]:
# val2: ALS candidates -> (user, picture) pairs -> ALS score -> click label.
candidates2 = get_als_candidates(
    list(val2_users),
    train_clicks,
    als_click_model,
    already_candidates={},
    N=2000
)
pairs2 = get_combine_pairs(candidates=candidates2)
pairs2 = generate_als_feature(pairs2, als_click_model, add_factors=False, mode='clicks')

cols = ['user_id', 'picture_id', 'y']
# De-duplicate the positives before joining: a (user, picture) click that
# occurs more than once would otherwise duplicate the matching candidate row
# in the left join. Pairs without a matching click get y = 0.
pairs2 = pairs2.merge(
    target2[cols].drop_duplicates(),
    on=['user_id', 'picture_id'],
    how='left',
).fillna(0)

pairs2.to_hdf(os.path.join(DATA_PATH, 'step4_val2_pairs.h5'), key='step4', mode='w')

In [None]:
# Test users: same candidate -> pairs -> ALS score pipeline as the validation
# cells, but with no label column because no target is merged in.
candidates_test = get_als_candidates(
    list(set(test_users['user_id'])), 
    train_clicks,
    als_click_model, 
    already_candidates={},
    N=2000
)
pairs_test = get_combine_pairs(candidates=candidates_test)
pairs_test = generate_als_feature(pairs_test, als_click_model, add_factors=False, mode='clicks')

# Persist under key 'step4'; mode='w' overwrites any existing file.
pairs_test.to_hdf(os.path.join(DATA_PATH, 'step4_test_pairs.h5'), key='step4', mode='w')

In [38]:
# Ratio of positive (y == 1) to negative (y == 0) pairs in each validation set.
print((pairs1['y'] == 1).sum() / (pairs1['y'] == 0).sum())
print((pairs2['y'] == 1).sum() / (pairs2['y'] == 0).sum())

5.3990635090280146e-05
5.2443705388204566e-05


In [42]:
# List the step4 files with their sizes.
# NOTE(review): the directory name `data` is hard-coded here instead of using DATA_PATH.
!ls -alh data | grep step4

-rwxrwxrwx 1 root root  74M Jun  1 22:40 step4_test_pairs.h5
-rwxrwxrwx 1 root root 551M Jun  1 22:39 step4_val1_pairs.h5
-rwxrwxrwx 1 root root 552M Jun  1 22:40 step4_val2_pairs.h5
