# MIPT Data Mining in Action
## Hometask 0 (Industry)

In [89]:
from collections import Counter
from collections import OrderedDict
import numpy as np
import pandas as pd
import heapq

In [90]:
# High-performance counter-like data structure (based on dict())
visit_counter = Counter()
purchase_counter = Counter()

# Extracting data from datasets
with open('train.txt', 'r') as f:
    for line in f:
        visits, purchases = line.strip().split(';')
        visits = visits.split(',')
        
        if purchases != '':
            purchases = purchases.split(',')
        
        visit_counter.update(visits)
        purchase_counter.update(purchases)

In [91]:
# Recommendation system based on top frequent purchased items
def recommend_by_purchase(items, max_count):
    return heapq.nlargest(max_count, OrderedDict.fromkeys(items), 
                          key = lambda x: visit_counter.get(x, 0))

# Recommendation system based on top frequent visited items     
def recommend_by_visit(items, max_count):
    return heapq.nlargest(max_count, OrderedDict.fromkeys(items), 
                          key = lambda x: purchase_counter.get(x, 0))

In [92]:
def calculate_metrics(recommend, max_count, session_file):
    with open(session_file, 'r') as f:
        # Keeping track of recall and precision sum
        recalls = np.zeros(max_count)
        precisions = np.zeros(max_count)
        session_count = 0;
        
        for line in f:
            visits, purchases = line.strip().split(';')
        
            if purchases == '':
                continue
                
            visits = visits.split(',')
            purchases = purchases.split(',')
        
            # Top max_count items
            for cur_count in range(1, max_count + 1):
                recommended = recommend(visits, cur_count)
                purchased_from_recommended = np.array([])
                
                for item in recommended:
                    if item in purchases:
                        purchased_from_recommended = np.append(purchased_from_recommended, item)
                        
                recalls[cur_count - 1] += len(purchased_from_recommended) * 1.0 / len(purchases)
                precisions[cur_count - 1] += len(purchased_from_recommended) * 1.0 / cur_count
        
            # Storing recalls and precisions
            # recalls = np.append(recalls, len(purchased_from_recommended) * 1.0 / len(purchases))
            # precisions = np.append(precisions, len(purchased_from_recommended) * 1.0 / (index + 1))
            session_count += 1
        
        #print recalls
        #print precisions
        
        # Calculating average recall and precision
        return pd.DataFrame({
                'k': np.arange(max_count) + 1,
                 'average_recall@k': [round(x, 2) for x in recalls / session_count],
                 'average_precision@k' : [round(x, 2) for x in precisions / session_count]
            }).set_index('k')

In [93]:
calculate_metrics(recommend_by_visit, 1, 'train.txt')

Unnamed: 0_level_0,average_precision@k,average_recall@k
k,Unnamed: 1_level_1,Unnamed: 2_level_1
1,0.8,0.69


In [94]:
calculate_metrics(recommend_by_visit, 5, 'train.txt')

Unnamed: 0_level_0,average_precision@k,average_recall@k
k,Unnamed: 1_level_1,Unnamed: 2_level_1
1,0.8,0.69
2,0.53,0.84
3,0.39,0.89
4,0.31,0.91
5,0.25,0.93


In [95]:
calculate_metrics(recommend_by_visit, 1, 'test.txt')

Unnamed: 0_level_0,average_precision@k,average_recall@k
k,Unnamed: 1_level_1,Unnamed: 2_level_1
1,0.53,0.46


In [96]:
calculate_metrics(recommend_by_visit, 5, 'test.txt')

Unnamed: 0_level_0,average_precision@k,average_recall@k
k,Unnamed: 1_level_1,Unnamed: 2_level_1
1,0.53,0.46
2,0.38,0.64
3,0.3,0.73
4,0.25,0.79
5,0.21,0.82


In [97]:
calculate_metrics(recommend_by_purchase, 1, 'train.txt')

Unnamed: 0_level_0,average_precision@k,average_recall@k
k,Unnamed: 1_level_1,Unnamed: 2_level_1
1,0.51,0.44


In [98]:
calculate_metrics(recommend_by_purchase, 5, 'train.txt')

Unnamed: 0_level_0,average_precision@k,average_recall@k
k,Unnamed: 1_level_1,Unnamed: 2_level_1
1,0.51,0.44
2,0.38,0.63
3,0.3,0.73
4,0.25,0.79
5,0.21,0.82


In [99]:
calculate_metrics(recommend_by_purchase, 1, 'test.txt')

Unnamed: 0_level_0,average_precision@k,average_recall@k
k,Unnamed: 1_level_1,Unnamed: 2_level_1
1,0.48,0.42


In [100]:
calculate_metrics(recommend_by_purchase, 5, 'test.txt')

Unnamed: 0_level_0,average_precision@k,average_recall@k
k,Unnamed: 1_level_1,Unnamed: 2_level_1
1,0.48,0.42
2,0.36,0.6
3,0.29,0.7
4,0.24,0.76
5,0.2,0.8
