In [2]:
import pandas as pd

Loading the data and exploring
---

In [3]:
# Read the order-level table from disk, then preview its first five rows.
orders = pd.read_csv(filepath_or_buffer='orders.csv')
orders.head(n=5)

Unnamed: 0,order_id,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order
0,2539329,1,prior,1,2,8,
1,2398795,1,prior,2,3,7,15.0
2,473747,1,prior,3,3,12,21.0
3,2254736,1,prior,4,4,7,29.0
4,431534,1,prior,5,4,15,28.0


In [4]:
# Load the product line items of the "prior" orders and of the "train" orders
# (in that order), then preview the "train" table.
order_products_prior, order_products = (
    pd.read_csv(csv_path)
    for csv_path in ('order_products__prior.csv', 'order_products__train.csv')
)
order_products.head(n=5)

Unnamed: 0,order_id,product_id,add_to_cart_order,reordered
0,1,49302,1,1
1,1,11109,2,1
2,1,10246,3,0
3,1,49683,4,0
4,1,43633,5,1


In [5]:
# Read the product catalogue from disk, then preview its first five rows.
products = pd.read_csv(filepath_or_buffer='products.csv')
products.head(n=5)

Unnamed: 0,product_id,product_name,aisle_id,department_id
0,1,Chocolate Sandwich Cookies,61,19
1,2,All-Seasons Salt,104,13
2,3,Robust Golden Unsweetened Oolong Tea,94,7
3,4,Smart Ones Classic Favorites Mini Rigatoni Wit...,38,1
4,5,Green Chile Anytime Sauce,5,13


In [6]:
# Catalogue size: distinct products and distinct aisles.
# print(...) with a single argument behaves identically on Python 2 and Python 3,
# whereas the bare `print "..."` statement is a SyntaxError on Python 3.
print("There are {} products in {} aisles".format(products['product_id'].nunique(), products['aisle_id'].nunique()))

There are 49688 products in 134 aisles


In [7]:
# Dataset size: distinct orders and distinct users.
# print(...) with a single argument behaves identically on Python 2 and Python 3,
# whereas the bare `print "..."` statement is a SyntaxError on Python 3.
print("There are {} orders from {} users".format(orders['order_id'].nunique(), orders['user_id'].nunique()))

There are 3421083 orders from 206209 users


In [8]:
# Number of orders per user, most active users first (value_counts sorts descending).
orders.user_id.value_counts().head(n=5)

53684     100
201623    100
50817     100
176951    100
88911     100
Name: user_id, dtype: int64

Naive implementation
---

We fix the number of recommended products per user at 10 and recommend the products that each user has ordered most frequently.
The recommendations are built from the `prior` orders and evaluated (recall) against the products actually bought in the `train` orders.

In [118]:
# Attach the order-level columns to every product line item (inner join on order_id).
train_data = pd.merge(order_products_prior, orders, on='order_id')
test_data = pd.merge(order_products, orders, on='order_id')

# (order_id, user_id) pairs of the orders whose eval_set is 'train'.
test_orders = orders.loc[orders['eval_set'] == 'train', ['order_id', 'user_id']]

In [115]:
from tqdm import tqdm

class Recommender:
    def __init__(self):
        self.recommendations = pd.DataFrame()

    def _most_popular_products(self, data):
        nb_products_to_reccomend = 10
        # user_ids = data['user_id'].unique()[:1000]

        # for user_id in tqdm(user_ids):
        #    self.recommendations[user_id] = data[data['user_id']==user_id]['product_id'].value_counts()[:nb_products_to_reccomend].index.values.tolist()

        most_ordered_items = train_data.groupby(['user_id', 'product_id'])['order_id'].count().reset_index(name="count")
        self.recommendations = most_ordered_items.groupby('user_id').head(nb_products_to_reccomend)
        
    def train(self, data):
        self._most_popular_products(data)
    
    def predict_order(self, user_id):
        return self.recommendations[self.recommendations['user_id']==user_id]['product_id'].as_matrix()
    
    def evaluate(self, test_data, metric='recall'):
        user_ids_in_recommendations = self.recommendations['user_id'].as_matrix()
        
        tp = 0
        fn = 0
        for order_id, user_id in tqdm(test_data):
            if user_id in user_ids_in_recommendations:
                recommended_products = self.predict_order(user_id)
                bought_products = order_products[order_products['order_id']==order_id]['product_id'].as_matrix()

                missed_products = list(set(bought_products)-set(recommended_products))

                tp += len(bought_products)-len(missed_products)
                fn += len(missed_products)

        if metric=='recall':
            recall = 1.0*tp/(tp+fn)
            print recall

In [116]:
# Instantiate the recommender and fit it on the merged prior-order lines.
rec = Recommender()
rec.train(data=train_data)

In [119]:
# `.values` yields the same (order_id, user_id) array as `as_matrix()`, which
# pandas deprecated in 0.23 and removed in 1.0.
rec.evaluate(test_orders.values)

100%|██████████| 131209/131209 [15:37<00:00, 139.93it/s]

0.113263812303



