In [5]:
import os,path
import pickle
import datetime
import numpy as np
import pandas as pd
from datetime import date, timedelta


import matplotlib.pyplot as plt
pd.set_option('display.float_format', lambda x: '%.4f' % x)

## MAP@K Function

In [6]:
# https://www.kaggle.com/c/h-and-m-personalized-fashion-recommendations/discussion/306007
# https://github.com/benhamner/Metrics/blob/master/Python/ml_metrics/average_precision.py

def apk(actual, predicted, k=10):
    """
    Computes the average precision at k.
    This function computes the average precision at k between two lists of
    items.
    Parameters
    ----------
    actual : list
             A list of elements that are to be predicted (order doesn't matter)
    predicted : list
                A list of predicted elements (order does matter)
    k : int, optional
        The maximum number of predicted elements
    Returns
    -------
    score : double
            The average precision at k over the input lists.
            0.0 is returned when ``min(len(actual), k)`` is zero, i.e. when
            there is nothing that could be hit.
    """
    if len(predicted) > k:
        predicted = predicted[:k]

    # Guard the division explicitly instead of wrapping it in a bare
    # ``except``, which would also hide unrelated errors.
    denominator = min(len(actual), k)
    if denominator == 0:
        return 0.0

    score = 0.0
    num_hits = 0.0

    for i, p in enumerate(predicted):
        # A repeated prediction only counts at its first position.
        if p in actual and p not in predicted[:i]:
            num_hits += 1.0
            score += num_hits / (i + 1.0)

    return score / denominator


def mapk(actual, predicted, k=10):
    """
    Mean average precision at k.

    Pairs up ``actual`` and ``predicted`` element by element, scores every
    pair with ``apk`` and averages the scores with ``np.mean``.

    Parameters
    ----------
    actual : list
             One collection of relevant elements per query
             (order inside each collection doesn't matter)
    predicted : list
                One ranked list of predicted elements per query
                (order inside each list matters)
    k : int, optional
        The maximum number of predicted elements considered per query
    Returns
    -------
    score : double
            The mean of the per-query average precision at k
    """
    per_query_scores = [
        apk(relevant, ranked, k)
        for relevant, ranked in zip(actual, predicted)
    ]
    return np.mean(per_query_scores)

## Load Data

In [None]:
# Read the "bird" submission, rename its columns, drop last_date and the
# renamed last_article column, then prefix every recommended id with '0'.
path = '../data/processed'
bird = (
    pd.read_csv(os.path.join(path, 'submission_bird.csv'))
    .rename(columns={'last_article': 'recent_purchase', 'recomends': 'prediction'})
    .drop(columns=['last_date', 'recent_purchase'])
)
bird['prediction'] = bird['prediction'].apply(
    lambda ids: ' '.join('0' + article for article in ids.split())
)
bird.head()

### Submission files

In [4]:
path = '../data/processed'


def _read_processed(filename):
    """Read one CSV file from the directory currently stored in ``path``."""
    return pd.read_csv(os.path.join(path, filename))


# submit = pd.read_csv('submissions.csv',dtype=str)
mew = _read_processed('submissions (2).csv')
# v3: {'factors': 500, 'iterations': 3, 'regularization': 0.01}
mew_v3 = _read_processed('submissions_v3.csv')
# v4: {'factors': 500, 'iterations': 3, 'regularization': 0.01} & filter_already_liked_items
mew_v4 = _read_processed('submissions_v4.csv')
# Disabled variants:
# mew_v5 = pd.read_csv(os.path.join(path,'submissions_v5.csv')) #  past 1 year {'factors': 50, 'iterations': 15, 'regularization': 0.01}
# mew_v6 = pd.read_csv(os.path.join(path,'submissions_v6.csv')) #  all-time data {'factors': 50, 'iterations': 15, 'regularization': 0.01}
top_l1m = _read_processed('top_l1m.csv')
got = _read_processed('recom_data_got2.csv')
non = _read_processed('submission_full_v1_NON.csv')
# The bird file uses different column names; align them and drop the
# last_date and (renamed) last_article columns.
bird = (
    _read_processed('submission_bird.csv')
    .rename(columns={'last_article': 'recent_purchase', 'recomends': 'prediction'})
    .drop(columns=['last_date', 'recent_purchase'])
)



In [5]:

# Left-pad every recommended article id with zeros.
# zfill(10) is used instead of blindly prepending '0' so that re-running this
# cell (without re-reading the CSV) does not keep adding zeros.
# NOTE(review): assumes article ids are 10 characters wide, as in the
# displayed prediction/actual values -- confirm against the data.
bird['prediction'] = bird['prediction'].apply(
    lambda ids: ' '.join(article.zfill(10) for article in ids.split())
)
bird.head()

Unnamed: 0,customer_id,prediction
0,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,0176209023 0307952006 0389236006 0410024001 04...
1,0000423b00ade91418cceaf3b26c6af3dd342b51fd051e...,0599580083 0607030002 0607031002 0684209025 06...
2,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,0541491008 0557247004 0578630009 0593829007 05...
3,00005ca1c9ed5f5146b52ac8639a40ca9d57aeff4d1bd2...,0466381012 0508323001 0527358001 0529035001 05...
4,00006413d8573cd20ed7128e53b7b13819fe5cfc2d801f...,0110065001 0153115019 0253448001 0278811002 02...


### Index to Customer_id 

In [6]:
# mapping index
path = '../data/processed'
# NOTE(review): pickle.load can execute arbitrary code -- only load files
# that were produced by a trusted pipeline.
# The context manager closes the file even if unpickling raises.
with open(os.path.join(path, 'index_to_cusId.pkl'), 'rb') as infile:
    index_to_id_dict = pickle.load(infile)

### Transaction file

In [7]:
path = '../data/processed'
transactions_file = os.path.join(path, 'transactions.pkl')
trans = pd.read_pickle(transactions_file)
# Translate the stored customer_id values through index_to_id_dict.
trans["customer_id"] = trans["customer_id"].map(index_to_id_dict)
trans.head()

Unnamed: 0,t_dat,customer_id,article_id,price,sales_channel_id
0,2018-09-20,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,663713001,0.0508,2
1,2018-09-20,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,541518023,0.0305,2
2,2018-09-20,00007d2de826758b65a93dd24ce629ed66842531df6699...,505221004,0.0152,2
3,2018-09-20,00007d2de826758b65a93dd24ce629ed66842531df6699...,685687003,0.0169,2
4,2018-09-20,00007d2de826758b65a93dd24ce629ed66842531df6699...,685687004,0.0169,2


## 7-day target

In [8]:
# Evaluation window: strictly after start_dt, up to and including start_dt + 7 days.
start_dt = datetime.datetime(2020, 9, 15)
end_dt = start_dt + timedelta(days=7)

in_window = (trans.t_dat > start_dt) & (trans.t_dat <= end_dt)
trans = trans[in_window]
print('Min date: ', trans.t_dat.min())
print('Max date: ', trans.t_dat.max())
print(f'Total Customers: {trans.customer_id.nunique()}')

# One row per customer: the distinct article ids bought inside the window.
target = (
    trans.groupby(['customer_id'])['article_id']
    .apply(lambda articles: list(set(articles)))
    .to_frame()
    .reset_index()
    .rename(columns={'article_id': 'actual'})
)
# Store the ids as a single space-separated string.
target['actual'] = target['actual'].apply(lambda articles: ' '.join(articles))
target.head()

Min date:  2020-09-16 00:00:00
Max date:  2020-09-22 00:00:00
Total Customers: 68984


Unnamed: 0,customer_id,actual
0,00039306476aaf41a07fed942884f16b30abfa83a2a8be...,0624486001
1,0003e867a930d0d6842f923d6ba7c9b77aba33fe2a0fbf...,0827487003
2,000493dd9fc463df1acc2081450c9e75ef8e87d5dd17ed...,0757926001 0640021019 0788575004
3,000525e3fe01600d717da8423643a8303390a055c578ed...,0874110016
4,00077dbd5c4a4991e092e63893ccf29294a9d5c46e8501...,0903762001 0158340001 0879189005 0907149001 09...


## Evaluation

### Map Target

In [9]:
join_ = 'left'


def _with_target(submission):
    """Attach the purchased articles in ``target`` to a submission frame.

    Merges ``submission`` with ``target`` on ``customer_id`` using the join
    type in ``join_``, replaces missing values with '' and splits the
    space-separated ``actual`` and ``prediction`` strings into lists
    (an empty string becomes an empty list).
    """
    merged = submission.merge(target, on='customer_id', how=join_).fillna('')
    for column in ('actual', 'prediction'):
        merged[column] = merged[column].apply(lambda ids: ids.split())
    return merged


# NOTE(review): with a left join, customers that do not appear in `target`
# end up with actual == [] and are kept in the frame, so they take part in
# any mean computed over it -- confirm that this is intended.
new_top_l1m = _with_target(top_l1m)
new_mew = _with_target(mew)
new_mew_v3 = _with_target(mew_v3)
new_mew_v4 = _with_target(mew_v4)
# new_mew_v5 = _with_target(mew_v5)
# new_mew_v6 = _with_target(mew_v6)
new_got = _with_target(got)
new_non = _with_target(non)
new_bird = _with_target(bird)

new_top_l1m.head()

Unnamed: 0,customer_id,prediction,actual
0,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,"[0751471001, 0918292001, 0915526001, 070601600...",[]
1,0000423b00ade91418cceaf3b26c6af3dd342b51fd051e...,"[0751471001, 0918292001, 0915526001, 070601600...",[]
2,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,"[0751471001, 0918292001, 0915526001, 070601600...",[]
3,00005ca1c9ed5f5146b52ac8639a40ca9d57aeff4d1bd2...,"[0751471001, 0918292001, 0915526001, 070601600...",[]
4,00006413d8573cd20ed7128e53b7b13819fe5cfc2d801f...,"[0751471001, 0918292001, 0915526001, 070601600...",[]


### MAP@12

In [12]:
def _map_at_12(frame):
    """Return mapk over the 'actual' and 'prediction' columns of ``frame`` with k=12."""
    return mapk(frame['actual'], frame['prediction'], k=12)


top_l1m_result = _map_at_12(new_top_l1m)
mew_result = _map_at_12(new_mew)
mew_v3_result = _map_at_12(new_mew_v3)
mew_v4_result = _map_at_12(new_mew_v4)
# mew_v5_result = _map_at_12(new_mew_v5)
# mew_v6_result = _map_at_12(new_mew_v6)
got_result = _map_at_12(new_got)
non_result = _map_at_12(new_non)
bird_result = _map_at_12(new_bird)

print('mAP@12')
# Each entry pairs the report line template with the score it formats.
for template, value in [
    ('top_l1m_result: {:.4%}', top_l1m_result),
    ('mew_v2_result: {:.4%}', mew_result),
    ('mew_v3_result: {:.4%}', mew_v3_result),
    ('mew_v4_result: {:.4%}', mew_v4_result),
    # ('mew_v5_result: {:.4%}', mew_v5_result),
    # ('mew_v6_result: {:.4%}', mew_v6_result),
    ('got_result: {:.4%}', got_result),
    ('non_result: {:.4%}', non_result),
    ('Pbird_result: {:.4%}', bird_result),
]:
    print(template.format(value))

mAP@12
top_l1m_result: 0.0284%
mew_v2_result: 2.3726%
mew_v3_result: 2.0516%
mew_v4_result: 0.0036%
got_result: 0.0279%
non_result: 0.0861%
Pbird_result: 0.0033%


In [11]:
# Total number of recommendation slots: 12 per row of new_top_l1m.
total_recommended = len(new_top_l1m) * 12
numbers = "{:,}".format(total_recommended)
print(f'Total recommended items: {numbers}')
print('Approx. puchased items:')
# Every line reports total_recommended * <MAP@12 score>, rounded to a whole
# number. round() takes no ndigits here (a stray `0` used to be passed to
# str.format instead, where it was silently ignored); int() keeps the
# thousands-separated output free of a trailing ".0".
for template, result in [
    ('\t - top_l1m: {:,}', top_l1m_result),
    ('\t - mew_v2_result: {:,}', mew_result),
    ('\t - mew_v3_result: {:,}', mew_v3_result),
    ('\t - mew_v4_result: {:,}', mew_v4_result),
    # ('\t - mew_v5_result: {:,}', mew_v5_result),
    # ('\t - mew_v6_result: {:,}', mew_v6_result),
    ('\t - got_result: {:,}', got_result),
    ('\t - non_result: {:,}', non_result),
    ('\t - bird_result: {:,}', bird_result),
]:
    print(template.format(int(round(total_recommended * result))))

Total recommended items: 16,463,760
Approx. puchased items:
	 - top_l1m: 4,668
	 - mew_v2_result: 390,624
	 - mew_v3_result: 337,763
	 - mew_v4_result: 590
	 - got_result: 4,595
	 - non_result: 14,174
	 - bird_result: 539


In [1]:
# Scratch check of the percentage format used in the MAP@12 report.
print(f"Pbird_result: {0.05:.4%}")

Pbird_result: 5.0000%
