In [1]:
import os 
import numpy as np
import pandas as pd
from datetime import date, timedelta

import matplotlib.pyplot as plt
pd.set_option('display.float_format', lambda x: '%.4f' % x)

## MAP@12 Function

In [2]:
# https://www.kaggle.com/c/h-and-m-personalized-fashion-recommendations/discussion/306007
# https://github.com/benhamner/Metrics/blob/master/Python/ml_metrics/average_precision.py

def apk(actual, predicted, k=10):
    """
    Computes the average precision at k.
    This function computes the average prescision at k between two lists of
    items.
    Parameters
    ----------
    actual : list
             A list of elements that are to be predicted (order doesn't matter)
    predicted : list
                A list of predicted elements (order does matter)
    k : int, optional
        The maximum number of predicted elements
    Returns
    -------
    score : double
            The average precision at k over the input lists
    """
    if len(predicted)>k:
        predicted = predicted[:k]

    score = 0.0
    num_hits = 0.0

    for i,p in enumerate(predicted):
#         print('items 1: ')
        if p in actual and p not in predicted[:i]:
            num_hits += 1.0
            score += num_hits / (i+1.0)
#             print('num_hits: ',num_hits)
#             print('score: ',score)
#             print('final score:', score / min(len(actual), k))
#             print('='*50)
    # remove this case in advance
    # if not actual:
    #     return 0.0

    return score / min(len(actual), k)


def mapk(actual, predicted, k=10):
    """
    Computes the mean average precision at k.
    This function computes the mean average prescision at k between two lists
    of lists of items.
    Parameters
    ----------
    actual : list
             A list of lists of elements that are to be predicted 
             (order doesn't matter in the lists)
    predicted : list
                A list of lists of predicted elements
                (order matters in the lists)
    k : int, optional
        The maximum number of predicted elements
    Returns
    -------
    score : double
            The mean average precision at k over the input lists
    """
    return np.mean([apk(a,p,k) for a,p in zip(actual, predicted)]) 

## Load Data

In [3]:
path = '../data/processed'
target = pd.read_pickle(os.path.join(path,'weekly_target.pkl'))
target.head()

Unnamed: 0,customer_id,week,year,total_articles,total_amount,weekly_purchased_products
0,0,12,2020,5,0.0936,"[795440001, 841260003, 887593002, 859416011, 8..."
1,0,18,2019,1,0.0102,[697138006]
2,0,21,2019,2,0.1017,[568601006]
3,0,30,2019,2,0.0339,"[607642008, 745232001]"
4,0,36,2020,1,0.0508,[568601043]


In [5]:
trans = pd.read_pickle(os.path.join(path,'transactions.pkl'))
trans['week'] = trans['t_dat'].dt.isocalendar().week 
trans['year'] = trans['t_dat'].dt.year
trans['month'] = trans['t_dat'].dt.month
trans.head()

Unnamed: 0,t_dat,customer_id,article_id,price,sales_channel_id,week,year,month
0,2018-09-20,2,663713001,0.0508,2,38,2018,9
1,2018-09-20,2,541518023,0.0305,2,38,2018,9
2,2018-09-20,7,505221004,0.0152,2,38,2018,9
3,2018-09-20,7,685687003,0.0169,2,38,2018,9
4,2018-09-20,7,685687004,0.0169,2,38,2018,9


In [6]:
customers = pd.read_pickle(os.path.join(path,'customers.pkl'))
customers.head()

Unnamed: 0,customer_id,FN,Active,club_member_status,fashion_news_frequency,age,postal_code
0,0,0.0,0.0,ACTIVE,NONE,49.0,52043ee2162cf5aa7ee79974281641c6f11a68d276429a...
1,1,0.0,0.0,ACTIVE,NONE,25.0,2973abc54daa8a5f8ccfe9362140c63247c5eee03f1d93...
2,2,0.0,0.0,ACTIVE,NONE,24.0,64f17e6a330a85798e4998f62d0930d14db8db1c054af6...
3,3,0.0,0.0,ACTIVE,NONE,54.0,5d36574f52495e81f019b680c843c443bd343d5ca5b1c2...
4,4,1.0,1.0,ACTIVE,Regularly,52.0,25fa5ddee9aac01b35208d01736e57942317d756b32ddd...


### 2020 Target

In [7]:
monthly_purchased = pd.DataFrame(trans.groupby(['customer_id','year','month'])['article_id'].apply(lambda x: list(set(x))))\
                        .reset_index()\
                        .rename(columns={'article_id':'monthly_purchased_products'})

monthly_purchased.head()

Unnamed: 0,customer_id,year,month,monthly_purchased_products
0,0,2018,12,"[625548001, 627759010, 176209023]"
1,0,2019,5,"[568601006, 697138006]"
2,0,2019,7,"[607642008, 745232001]"
3,0,2019,9,"[797065001, 656719005]"
4,0,2019,11,"[785710001, 812683013, 694736004, 785186005]"


## Test Evaluation function

### Mock-up prediction (Top 12 Products)

In [8]:
# top_product = trans[trans.year == 2019].groupby(['article_id']).agg({'customer_id':'nunique'}).sort_values(by='customer_id',ascending=False)
top_product = trans[trans.year <= 2020].groupby(['article_id']).agg({'customer_id':'nunique'}).sort_values(by='customer_id',ascending=False)
top_product.head(12)

Unnamed: 0_level_0,customer_id
article_id,Unnamed: 1_level_1
706016001,32251
372860001,25559
706016002,25485
610776002,22571
759871002,21613
372860002,20038
464297007,18554
720125001,17611
673396002,17147
610776001,16854


In [19]:
top_product_list = ' '.join(list(top_product.index[:12].astype(str)))
top_product_list

'706016001 372860001 706016002 610776002 759871002 372860002 464297007 720125001 673396002 610776001 673677002 706016003'

### Assign Basic Prediction

In [33]:
test_data = customers[['customer_id']].drop_duplicates(subset=['customer_id'])
test_data['pred'] = top_product_list
test_data['pred'] = test_data['pred'].apply(lambda x: [int(i) for i in x.split()])
test_data.sort_values(by='customer_id',inplace=True)
test_data.reset_index(drop=True,inplace=True)
test_data.head()

Unnamed: 0,customer_id,pred
0,0,"[706016001, 372860001, 706016002, 610776002, 7..."
1,1,"[706016001, 372860001, 706016002, 610776002, 7..."
2,2,"[706016001, 372860001, 706016002, 610776002, 7..."
3,3,"[706016001, 372860001, 706016002, 610776002, 7..."
4,4,"[706016001, 372860001, 706016002, 610776002, 7..."


### Test MAP@12

In [37]:
for i in range(1,10):
    actual_test = monthly_purchased[(monthly_purchased.year==2020)&(monthly_purchased.month==i)].sort_values(by='customer_id').reset_index(drop=True)
    actual = actual_test['monthly_purchased_products']
    predicted = test_data['pred']
    error, list_map = mapk(actual, predicted, k=12)
    print(f'MAP@12 for month {i}: ',error)

MAP@12 for month 1:  0.005573113619576095
MAP@12 for month 2:  0.00476986883278556
MAP@12 for month 3:  0.006395688307666284
MAP@12 for month 4:  0.0056609607595126095
MAP@12 for month 5:  0.004290574074507777
MAP@12 for month 6:  0.0025392513674413275
MAP@12 for month 7:  0.004108774161796482
MAP@12 for month 8:  0.0036810661722133406
MAP@12 for month 9:  0.0037347697962448385


In [23]:
actual = target[(target.week==38)&(target.year==2020)]['weekly_purchased_products'].reset_index(drop=True)
predicted = sample_submit['prediction']
mapk(actual, predicted, k=12)

NameError: name 'sample_submit' is not defined

In [24]:
actual = ['0706016001', '0656719005','0673677002']
predicted = top_product_list.split()
print(actual)
print(predicted)
apk(actual,predicted,k=12)

['0706016001', '0656719005', '0673677002']
['706016001', '372860001', '706016002', '610776002', '759871002', '372860002', '464297007', '720125001', '673396002', '610776001', '673677002', '706016003']


0.0

## Sample Submission

In [22]:
import pickle
# mapping index
path = '../data/processed'

infile = open(os.path.join(path,'index_to_cusId.pkl'),'rb')
index_to_id_dict = pickle.load(infile)
infile.close()


In [23]:
sample_submit = test_data.copy()
sample_submit['prediction'] = sample_submit['pred'].apply(lambda x: ' '.join(x))

sample_submit["customer_id"] = sample_submit["customer_id"].map(index_to_id_dict)
sample_submit.drop(columns=['pred'],inplace=True)
sample_submit

Unnamed: 0,customer_id,prediction
0,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,0706016001 0372860001 0706016002 0610776002 07...
1,0000423b00ade91418cceaf3b26c6af3dd342b51fd051e...,0706016001 0372860001 0706016002 0610776002 07...
2,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,0706016001 0372860001 0706016002 0610776002 07...
3,00005ca1c9ed5f5146b52ac8639a40ca9d57aeff4d1bd2...,0706016001 0372860001 0706016002 0610776002 07...
4,00006413d8573cd20ed7128e53b7b13819fe5cfc2d801f...,0706016001 0372860001 0706016002 0610776002 07...
...,...,...
1371975,ffffbbf78b6eaac697a8a5dfbfd2bfa8113ee5b403e474...,0706016001 0372860001 0706016002 0610776002 07...
1371976,ffffcd5046a6143d29a04fb8c424ce494a76e5cdf4fab5...,0706016001 0372860001 0706016002 0610776002 07...
1371977,ffffcf35913a0bee60e8741cb2b4e78b8a98ee5ff2e6a1...,0706016001 0372860001 0706016002 0610776002 07...
1371978,ffffd7744cebcf3aca44ae7049d2a94b87074c3d4ffe38...,0706016001 0372860001 0706016002 0610776002 07...


In [148]:
sample_submit.to_csv('submission.csv',index=False)

In [None]:
top_product.drop_