In [9]:
import os 
import pickle
import datetime
import numpy as np
import pandas as pd
from datetime import date, timedelta

import matplotlib.pyplot as plt
pd.set_option('display.float_format', lambda x: '%.4f' % x)

## MAP@K Function

In [10]:
# https://www.kaggle.com/c/h-and-m-personalized-fashion-recommendations/discussion/306007
# https://github.com/benhamner/Metrics/blob/master/Python/ml_metrics/average_precision.py

def apk(actual, predicted, k=10):
    """
    Computes the average precision at k.
    This function computes the average prescision at k between two lists of
    items.
    Parameters
    ----------
    actual : list
             A list of elements that are to be predicted (order doesn't matter)
    predicted : list
                A list of predicted elements (order does matter)
    k : int, optional
        The maximum number of predicted elements
    Returns
    -------
    score : double
            The average precision at k over the input lists
    """
    if len(predicted)>k:
        predicted = predicted[:k]

    score = 0.0
    num_hits = 0.0

    for i,p in enumerate(predicted):
#         print('items 1: ')
        if p in actual and p not in predicted[:i]:
            num_hits += 1.0
            score += num_hits / (i+1.0)
#             print('num_hits: ',num_hits)
#             print('score: ',score)
#             print('final score:', score / min(len(actual), k))
#             print('='*50)

    # remove this case in advance
    if not actual:
        return 0.0

    return score / min(len(actual), k)


def mapk(actual, predicted, k=10):
    """
    Computes the mean average precision at k.
    This function computes the mean average prescision at k between two lists
    of lists of items.
    Parameters
    ----------
    actual : list
             A list of lists of elements that are to be predicted 
             (order doesn't matter in the lists)
    predicted : list
                A list of lists of predicted elements
                (order matters in the lists)
    k : int, optional
        The maximum number of predicted elements
    Returns
    -------
    score : double
            The mean average precision at k over the input lists
    """
    return np.mean([apk(a,p,k) for a,p in zip(actual, predicted)]) 

## Load Data

### Submission file

In [11]:
submit = pd.read_csv('submissions.csv',dtype=str)
print('total customers: ' ,len(submit))
submit.head()

total customers:  1371980


Unnamed: 0,customer_id,prediction
0,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,0568601043 0858856005 0779781015 0399256001 08...
1,0000423b00ade91418cceaf3b26c6af3dd342b51fd051e...,0112679048 0111609001 0111593001 0111586001 01...
2,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,0794321007 0794321011 0805000007 0706016062 06...
3,00005ca1c9ed5f5146b52ac8639a40ca9d57aeff4d1bd2...,0112679048 0111609001 0111593001 0111586001 01...
4,00006413d8573cd20ed7128e53b7b13819fe5cfc2d801f...,0112679048 0111609001 0111593001 0111586001 01...


### Index to Customer_id 

In [12]:
# mapping index
path = '../data/processed'
infile = open(os.path.join(path,'index_to_cusId.pkl'),'rb')
index_to_id_dict = pickle.load(infile)
infile.close()

### Transaction file

In [13]:
path = '../data/processed'
trans = pd.read_pickle(os.path.join(path,'transactions.pkl'))
trans["customer_id"] = trans["customer_id"].map(index_to_id_dict)
trans.head()

Unnamed: 0,t_dat,customer_id,article_id,price,sales_channel_id
0,2018-09-20,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,663713001,0.0508,2
1,2018-09-20,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,541518023,0.0305,2
2,2018-09-20,00007d2de826758b65a93dd24ce629ed66842531df6699...,505221004,0.0152,2
3,2018-09-20,00007d2de826758b65a93dd24ce629ed66842531df6699...,685687003,0.0169,2
4,2018-09-20,00007d2de826758b65a93dd24ce629ed66842531df6699...,685687004,0.0169,2


## 7-day target

In [14]:
start_dt =  datetime.datetime(2020,9,15)
end_dt = start_dt + timedelta(7)

trans = trans[(trans.t_dat > start_dt) & (trans.t_dat <= end_dt)]
print('Min date: ', trans.t_dat.min())
print('Max date: ', trans.t_dat.max())
print(f'Total Customers: {trans.customer_id.nunique()}')

target = pd.DataFrame(trans.groupby(['customer_id'])['article_id'].apply(lambda x: list(set(x))))\
                        .reset_index()\
                        .rename(columns={'article_id':'actual'})
# weekly_purchased['weekly_purchased_products'] = weekly_purchased['weekly_purchased_products'].apply(lambda x: list(set(x)))
target.head()

Min date:  2020-09-16 00:00:00
Max date:  2020-09-22 00:00:00
Total Customers: 68984


Unnamed: 0,customer_id,actual
0,00039306476aaf41a07fed942884f16b30abfa83a2a8be...,[0624486001]
1,0003e867a930d0d6842f923d6ba7c9b77aba33fe2a0fbf...,[0827487003]
2,000493dd9fc463df1acc2081450c9e75ef8e87d5dd17ed...,"[0640021019, 0757926001, 0788575004]"
3,000525e3fe01600d717da8423643a8303390a055c578ed...,[0874110016]
4,00077dbd5c4a4991e092e63893ccf29294a9d5c46e8501...,"[0907149001, 0936622001, 0879189005, 091817100..."


In [15]:
path = '../data/processed'
target[['customer_id']][:1000].to_csv(os.path.join(path,'7day_target.csv'),index=False)

## Evaluation

In [17]:
test = submit.merge(target, on = 'customer_id',how='left')

# if prediction is the string use this.
test['prediction'] = test['prediction'].apply(lambda x: x.split())
# test['actual'] = test['actual'].fillna("").apply(list)

test.head()

Unnamed: 0,customer_id,prediction,actual
0,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,"[0568601043, 0858856005, 0779781015, 039925600...",
1,0000423b00ade91418cceaf3b26c6af3dd342b51fd051e...,"[0112679048, 0111609001, 0111593001, 011158600...",
2,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,"[0794321007, 0794321011, 0805000007, 070601606...",
3,00005ca1c9ed5f5146b52ac8639a40ca9d57aeff4d1bd2...,"[0112679048, 0111609001, 0111593001, 011158600...",
4,00006413d8573cd20ed7128e53b7b13819fe5cfc2d801f...,"[0112679048, 0111609001, 0111593001, 011158600...",


In [18]:
# Calculate MAP@12 
apk_result = []
for i,(act,pred) in enumerate(zip(test['actual'],test['prediction'])):
    result = apk(act,pred,k=12)
#     print(f'customer {i}')
#     print(f'apk {result}')
    apk_result.append(result)
    
print('MAP@12: ',np.mean(apk_result))
mapk(test['actual'],test['prediction'],k=12)

TypeError: argument of type 'float' is not iterable