In [1]:
import cudf
import lightgbm as lgb
import os
import numpy as np
import pandas as pd
from tqdm import tqdm

pd.set_option('display.max_columns', None)

In [None]:
%%time 
transactions = cudf.read_csv('storage/transactions_train.csv')
articles = cudf.read_csv('storage/articles.csv')
customers = cudf.read_csv('storage/customers.csv')

In [4]:
transactions['t_dat'] = cudf.to_datetime(transactions['t_dat'])
customers['FN'].fillna(0.,inplace=True)
customers['Active'].fillna(0.,inplace=True)
customers['club_member_status'].fillna('None',inplace=True)
customers['age'] = customers['age'] / 10
customers['age'] = customers['age'].astype(int)
customers['fashion_news_frequency'] = customers['fashion_news_frequency'].str.lower().fillna('none')

In [27]:
def construct_candidate_dict(transactions,key='customer_id'):
    purchase_dict = {}
    for i,x in enumerate(zip(transactions[key], transactions['article_id'])):
        key_id, art_id = x
        if key_id not in purchase_dict:
            purchase_dict[key_id] = {}
        if art_id not in purchase_dict[key_id]:
            purchase_dict[key_id][art_id] = 0
        purchase_dict[key_id][art_id] += 1
    dummy_list = list((transactions['article_id'].value_counts()).index)[:50]
    return purchase_dict,dummy_list

def construct_recent_purchase_df(
        test_df,transactions,
        weights=[1.,1.,0.,0.],
        nweek=3,
    ):
    
    time_decay_weight,popular_weight_1w,_,_ = weights
    
    recent_transactions = {}
    purchase_dict = {}
    for i in range(1,nweek+1):
        recent_transactions[i] = transactions[(transactions.t_dat>transactions.t_dat.max()-i*pd.Timedelta(7,unit='day'))&(transactions.t_dat<=transactions.t_dat.max()-(i-1)*pd.Timedelta(7,unit='day'))]
        purchase_dict[i],_ = construct_candidate_dict(recent_transactions[i])
        
    most_popular_items_1w = list((recent_transactions[1]['article_id'].value_counts()).index)[:12]

    pred_df = test_df[['customer_id']]
    article_id_list,recent_purchase_score_list = [],[]
    
    for i, cust_id in enumerate(
        test_df['customer_id'].values.reshape((-1,))
    ):
        s = []
        total_purchase_dict = {}
        
        for i,purchase_dict_week in purchase_dict.items():
            if cust_id in purchase_dict_week:
                l = sorted((purchase_dict_week[cust_id]).items(), key=lambda x: x[1], reverse=True)
                l = [y[0] for y in l][:12]
                for aid in l:
                    if aid not in total_purchase_dict:
                        total_purchase_dict[aid] = time_decay_weight**(i-1)
                    else:
                        total_purchase_dict[aid] += time_decay_weight**(i-1)
                        
        for aid in most_popular_items_1w[:12]:
            if aid not in total_purchase_dict:
                total_purchase_dict[aid] = popular_weight_1w
            else:
                total_purchase_dict[aid] += popular_weight_1w

        total_purchase_dict = {k: v for k, v in sorted(total_purchase_dict.items(), key=lambda item: item[1], reverse=True)}
        article_ids = list(total_purchase_dict.keys())[:12]
        recent_purchase_scores = list(total_purchase_dict.values())[:12]
        
        article_id_list.append(article_ids)
        recent_purchase_score_list.append(recent_purchase_scores)
        
    pred_df['recent_purchase_article_id'] = article_id_list
    pred_df['recent_purchase_score'] = recent_purchase_score_list
    
    return pred_df

def make_prediction(model,test_df,features,label,k=100,group_name='customer_id'):
    test_x = test_df[features]
    test_pred = model.predict(test_x)
    test_x[group_name] = test_df[group_name]
    test_x['past_purchase_article_id'] = test_df['article_id']
    test_x['past_purchase_score'] = test_pred
    #test_x = test_x.sort_values(['customer_id','past_purchase_score'],ascending=False)
    pred_df = test_x.groupby('customer_id') \
                .apply(lambda x: x.sort_values('past_purchase_score',ascending=False)['past_purchase_article_id'].tolist()) \
                .reset_index()
    pred_df.columns = ['customer_id','past_purchase_article_id']
    past_purchase_score = test_x.groupby('customer_id') \
            .apply(lambda x: x.sort_values('past_purchase_score',ascending=False)['past_purchase_score'].tolist()).reset_index()
    past_purchase_score.columns = ['customer_id','past_purchase_score']
    pred_df = pred_df.merge(past_purchase_score,on='customer_id')
    return pred_df[['customer_id','past_purchase_article_id','past_purchase_score']]
    
def construct_past_purchase_df(test_df_path,model_path,label='label'):
    test_df = pd.read_csv(test_df_path)
    features = [c for c in test_df.columns if c not in ['article_id','customer_id','label','index','group_size']]
    bst = lgb.Booster(model_file=model_path)
    pred_df = make_prediction(bst,test_df,features,label)

    return pred_df

def rerank(x,weights=[1.,1.]):
    past_purchase_weight,recent_purchase_weight = weights
    
    past_purchase_article_ids = x['past_purchase_article_id']
    past_purchase_scores = x['past_purchase_score']
    recent_purchase_article_ids = x['recent_purchase_article_id']
    recent_purchase_scores = x['recent_purchase_score']
    d = {k:v*past_purchase_weight for k,v in zip(past_purchase_article_ids,past_purchase_scores)}
    for k,v in zip(recent_purchase_article_ids,recent_purchase_scores):
        if k not in d:
            d[k] = v*recent_purchase_weight
        else:
            d[k] += v*recent_purchase_weight
    d = {k: v for k, v in sorted(d.items(), key=lambda item: item[1], reverse=True)}
    s = list(d.keys())[:12]
    return s

def construct_gt_df(test_transactions):
    gt_df = test_transactions.to_pandas().groupby('customer_id')['article_id'].agg(lambda x: x.tolist()).reset_index()
    gt_df.columns = ['customer_id','ground_truth']
    return gt_df

def evaluate_score(pred_df,gt_df,k=12,verbose=True,group_name='customer_id',prediction_name='prediction'):
    from metric import mapk
    eval_df = gt_df.merge(pred_df,on=group_name,how='left')
    score = mapk(eval_df['ground_truth'].tolist(),eval_df[prediction_name].tolist())
    if verbose: print('map@'+str(k),score)
    return score

In [28]:
for i,(trn_start_time,trn_end_time,test_start_time,test_end_time,model_path,test_df_path) in enumerate([

        (
            '2020-01-01','2020-09-01','2020-09-01','2020-09-07',
            'storage/output/220325_lightgbm_training/220325_dataset_2020-05-17_2020-07-17_2020-09-01_2020-09-07_score:0.022701_objective:lambdarank_metric:map@12_boosting:dart_seed:0_learning_rate:0.03_num_threads:8_num_iterations:15_early_stopping_round:None.bin',
            'storage/output/220325_dataset_2020-05-17_2020-07-17_2020-09-01_2020-09-07/test_df.csv',
        ),
        (
            '2020-01-01','2020-09-07','2020-09-07','2020-09-15',
            'storage/output/220325_lightgbm_training/220325_dataset_2020-05-24_2020-07-24_2020-09-07_2020-09-15_score:0.020357_objective:lambdarank_metric:map@12_boosting:dart_seed:0_learning_rate:0.03_num_threads:8_num_iterations:15_early_stopping_round:None.bin',
            'storage/output/220325_dataset_2020-05-24_2020-07-24_2020-09-07_2020-09-15/test_df.csv',
        ),
        (
            '2020-01-01','2020-09-15','2020-09-15','2020-09-22',
            'storage/output/220325_lightgbm_training/220325_dataset_2020-06-01_2020-08-01_2020-09-15_2020-09-22_score:0.0230037_objective:lambdarank_metric:map@12_boosting:dart_seed:0_learning_rate:0.03_num_threads:8_num_iterations:15_early_stopping_round:None.bin',
            'storage/output/220325_dataset_2020-06-01_2020-08-01_2020-09-15_2020-09-22/test_df.csv',
        ),
    ]):
    trn_transactions = transactions[(transactions.t_dat > cudf.to_datetime(trn_start_time)) & (transactions.t_dat <= cudf.to_datetime(trn_end_time))]
    test_transactions = transactions[(transactions.t_dat > cudf.to_datetime(test_start_time)) & (transactions.t_dat <= cudf.to_datetime(test_end_time))]
    gt_df = construct_gt_df(test_transactions)
    test_df = test_transactions[['customer_id']].drop_duplicates()
    recent_purchase_df = construct_recent_purchase_df(test_df.to_pandas(),trn_transactions.to_pandas())
    past_purchase_df = construct_past_purchase_df(test_df_path,model_path)
    for _ in range(1):
        print('-'*100)
        weights = [70.,1.]
        print(weights)
        pred_df = past_purchase_df.merge(recent_purchase_df,on='customer_id',how='outer')
        pred_df['prediction'] = pred_df.apply(lambda x: rerank(x,weights),axis=1)
        evaluate_score(recent_purchase_df,gt_df,prediction_name='recent_purchase_article_id')
        evaluate_score(pred_df,gt_df,prediction_name='past_purchase_article_id')
        evaluate_score(pred_df,gt_df,prediction_name='prediction')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_x[group_name] = test_df[group_name]


----------------------------------------------------------------------------------------------------
[70.0, 1.0]
map@12 0.021383701357811636
map@12 0.02270099903743698
map@12 0.02323246554084142
----------------------------------------------------------------------------------------------------
[70.0, 1.0]
map@12 0.01997875651931142
map@12 0.02035695498062064
map@12 0.020887647735950643
----------------------------------------------------------------------------------------------------
[70.0, 1.0]
map@12 0.023248185006228106
map@12 0.02300368158544079
map@12 0.0236313220086023


In [38]:
print('-'*100)
weights = [100000.,1.]
print(weights)
pred_df = past_purchase_df.merge(recent_purchase_df,on='customer_id',how='outer')
pred_df['prediction'] = pred_df.apply(lambda x: rerank(x,weights),axis=1)
evaluate_score(recent_purchase_df,gt_df,prediction_name='recent_purchase_article_id')
evaluate_score(pred_df,gt_df,prediction_name='past_purchase_article_id')
evaluate_score(pred_df,gt_df,prediction_name='prediction')

----------------------------------------------------------------------------------------------------
[100000.0, 1.0]
map@12 0.023248185006228106
map@12 0.02300368158544079
map@12 0.023034120872675912


0.023034120872675912

In [32]:
pred_df = pred_df.merge(gt_df,on='customer_id',how='outer')

In [33]:
pred_df['past_purchase_recent_purchase_intersect'] = pred_df.apply(lambda x: set(x['past_purchase_article_id'][:12]).intersection(x['recent_purchase_article_id']),axis=1)
pred_df['ground_truth_recent_purchase_intersect'] = pred_df.apply(lambda x: set(x['recent_purchase_article_id'][:12]).intersection(x['ground_truth']),axis=1)
pred_df['ground_truth_past_purchase_intersect'] = pred_df.apply(lambda x: set(x['past_purchase_article_id'][:12]).intersection(x['ground_truth']),axis=1)
pred_df['ground_truth_prediction_intersect'] = pred_df.apply(lambda x: set(x['prediction']).intersection(x['ground_truth']),axis=1)
pred_df['n_ground_truth_past_purchase_intersect'] = pred_df['ground_truth_past_purchase_intersect'].apply(len)
pred_df['n_ground_truth_recent_purchase_intersect'] = pred_df['ground_truth_recent_purchase_intersect'].apply(len)
pred_df['n_ground_truth_prediction_intersect'] = pred_df['ground_truth_prediction_intersect'].apply(len)

In [44]:
pred_df

Unnamed: 0,customer_id,past_purchase_article_id,past_purchase_score,recent_purchase_article_id,recent_purchase_score,prediction
0,00039306476aaf41a07fed942884f16b30abfa83a2a8be...,"[448509014, 850917001, 852584001, 863595006, 9...","[0.10463362702894108, 0.059113335209599195, 0....","[909370001, 865799006, 918522001, 924243001, 4...","[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, ...","[448509014, 850917001, 852584001, 863595006, 9..."
1,0003e867a930d0d6842f923d6ba7c9b77aba33fe2a0fbf...,"[889036004, 880017001, 640021012, 621381012, 9...","[0.2322831520431229, 0.22992586874318396, 0.22...","[880017001, 640021012, 621381012, 889036004, 9...","[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, ...","[889036004, 880017001, 640021012, 621381012, 9..."
2,000493dd9fc463df1acc2081450c9e75ef8e87d5dd17ed...,"[556255001, 816588001, 399136061, 732842015, 7...","[0.1846151834151848, 0.17839812591437085, 0.17...","[399136061, 732842014, 556255001, 852219003, 7...","[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, ...","[556255001, 816588001, 399136061, 732842015, 7..."
3,000525e3fe01600d717da8423643a8303390a055c578ed...,"[448509014, 850917001, 863595006, 918522001, 7...","[0.08536174574269602, 0.059113335209599195, 0....","[909370001, 865799006, 918522001, 924243001, 4...","[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, ...","[448509014, 850917001, 863595006, 918522001, 7..."
4,00077dbd5c4a4991e092e63893ccf29294a9d5c46e8501...,"[456163060, 933989002, 929744001, 865799005, 4...","[0.36705824355478534, 0.36705824355478534, 0.3...","[929744001, 865799005, 456163060, 933989002, 9...","[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, ...","[456163060, 933989002, 929744001, 865799005, 4..."
...,...,...,...,...,...,...
68979,fffa67737587e52ff1afa9c7c6490b5eb7acbc439fe82b...,"[448509014, 850917001, 852584001, 863595006, 9...","[0.10463362702894108, 0.059113335209599195, 0....","[909370001, 865799006, 918522001, 924243001, 4...","[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, ...","[448509014, 850917001, 852584001, 863595006, 9..."
68980,fffa7d7799eb390a76308454cbdd76e473d65b1497fbe4...,"[751471001, 896152002, 751471043, 850917001, 8...","[0.046478898288815386, 0.046478898288815386, 0...","[909370001, 865799006, 918522001, 924243001, 4...","[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, ...","[751471001, 896152002, 751471043, 850917001, 8..."
68981,fffae8eb3a282d8c43c77dd2ca0621703b71e90904dfde...,"[914441004, 448509014, 881244001, 863583002, 7...","[0.24147657947645362, 0.04883112335725286, 0.0...","[914441004, 909370001, 865799006, 918522001, 9...","[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, ...","[914441004, 448509014, 881244001, 863583002, 7..."
68982,fffd870c6324ad3bda24e4d6aeae221c199479086bfdfd...,"[724906019, 867969003, 448509014, 863595006, 9...","[0.06235784912211472, 0.04672266612642703, 0.0...","[909370001, 865799006, 918522001, 924243001, 4...","[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, ...","[724906019, 867969003, 448509014, 863595006, 9..."


In [66]:
from pprint import pprint

tmp = pred_df.iloc[68908].to_dict()
tmp['past_purchase_article_id'] = tmp['past_purchase_article_id'][:12]
tmp['past_purchase_score'] = tmp['past_purchase_score'][:12]
pprint(tmp)

{'customer_id': 'ffbb641ba0e3264ab8b25f5e78630738ab59add1975086f35325d59aafd61069',
 'ground_truth': [896152002],
 'ground_truth_past_purchase_intersect': {896152002},
 'ground_truth_recent_purchase_intersect': set(),
 'intersect': {909370001, 918292001},
 'n': 2,
 'n_ground_truth_past_purchase_intersect': 1,
 'n_ground_truth_recent_purchase_intersect': 0,
 'past_purchase_article_id': [863583001,
                              909370001,
                              885951001,
                              865929003,
                              852584001,
                              918292001,
                              918292004,
                              762846006,
                              898694001,
                              915529003,
                              896152002,
                              915529001],
 'past_purchase_recent_purchase_intersect': {909370001, 918292001},
 'past_purchase_score': [0.011112864210053806,
                         0.009172