In [1]:
from collections import OrderedDict
from collections import Counter
import numpy as np
import pandas
import heapq

In [2]:
visit_popularity = Counter()
purchase_popularity = Counter()

with open('train.txt', 'r') as f:
    for line in f.xreadlines():
        visits, purchases = line.strip().split(';')
        visit_popularity.update(visits.split(','))
        purchase_popularity.update(purchases.split(','))

In [3]:
def recommend_by_purchase(items, max_count):
    return heapq.nlargest(max_count, OrderedDict.fromkeys(items), key=lambda x: purchase_popularity.get(x, 0))

def recommend_by_visit(items, max_count):
    return heapq.nlargest(max_count, OrderedDict.fromkeys(items), key=lambda x: visit_popularity.get(x, 0))

In [4]:
def calculate_metrics(recommend, max_count, sessions_file):
    with open(sessions_file, 'r') as f:
        avg_recall = np.zeros(max_count)
        avg_precision = np.zeros(max_count)
        sessions_count = 0
        for line in f.xreadlines():
            visits, purchases = line.strip().split(';')
            if purchases != '':
                visits = visits.split(',')
                purchases = purchases.split(',')
                rec = recommend(visits, max_count)
                sessions_count += 1
                p_len = len(purchases)
                for i in xrange(max_count):
                    intersect = set(rec[:i + 1]) & set(purchases[: i + 1])
                    len_intersect = float(len(intersect))
                    avg_precision[i] += len_intersect/(i + 1)
                    avg_recall[i] += len_intersect/p_len
                    
    return pandas.DataFrame({
            'k': np.arange(max_count) + 1,
            'avg_recall@k': [round(x, 2) for x in avg_recall / sessions_count],
            'avg_precision@k': [round(x, 2) for x in avg_precision / sessions_count]
    }).set_index('k')


In [5]:
calculate_metrics(recommend_by_purchase, 5, 'train.txt')

Unnamed: 0_level_0,avg_precision@k,avg_recall@k
k,Unnamed: 1_level_1,Unnamed: 2_level_1
1,0.68,0.64
2,0.49,0.82
3,0.38,0.88
4,0.3,0.91
5,0.25,0.92


In [6]:
calculate_metrics(recommend_by_purchase, 5, 'test.txt')

Unnamed: 0_level_0,avg_precision@k,avg_recall@k
k,Unnamed: 1_level_1,Unnamed: 2_level_1
1,0.46,0.44
2,0.36,0.63
3,0.29,0.73
4,0.24,0.79
5,0.21,0.82


In [7]:
calculate_metrics(recommend_by_visit, 5, 'train.txt')

Unnamed: 0_level_0,avg_precision@k,avg_recall@k
k,Unnamed: 1_level_1,Unnamed: 2_level_1
1,0.44,0.41
2,0.36,0.62
3,0.29,0.72
4,0.25,0.79
5,0.21,0.82


In [8]:
calculate_metrics(recommend_by_visit, 5, 'test.txt')

Unnamed: 0_level_0,avg_precision@k,avg_recall@k
k,Unnamed: 1_level_1,Unnamed: 2_level_1
1,0.42,0.39
2,0.34,0.6
3,0.28,0.7
4,0.24,0.76
5,0.2,0.8
