In [1]:
import numpy as np
import pandas as pd
import itertools
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt

%matplotlib inline

In [2]:
## item-similarity
class collab_filter():
    """Item-based collaborative filter using Jaccard item-item similarity.

    Usage: call ``create`` with an interaction table, then ``recommend`` for
    each user of interest. The item-item similarity table does not depend on
    the user, so it is built once on the first ``recommend`` call and reused
    until ``create`` is called again (or ``train_data`` / the column names are
    re-assigned). Mutating ``train_data`` in place is not detected; call
    ``create`` again after doing so.
    """

    def __init__(self):
        self.train_data = None
        self.user_id = None
        self.item_id = None
        # Cached, user-independent lookups plus the inputs they were built from.
        self._cache_source = None
        self._cache_key = None
        self._user_items = None
        self._co_occurrence = None

    ## retrieve all unique items for all users
    def get_user_items(self):
        """Return ``{user: sorted list of the unique items in that user's rows}``."""
        grouped = self.train_data.groupby(self.user_id)[self.item_id]
        return grouped.apply(lambda g: np.unique(g.values).tolist()).to_dict()

    ## retrieve all unique users for all items
    def get_item_users(self):
        """Return ``{item: sorted list of the unique users in that item's rows}``."""
        grouped = self.train_data.groupby(self.item_id)[self.user_id]
        return grouped.apply(lambda g: np.unique(g.values).tolist()).to_dict()

    ## filter user_items co-occurence for each unique item
    def filter_items(self, user_items, item_users, keys):
        """For each item in ``keys``, collect the item lists of all its users.

        Returns ``{item: [items_of_user_1, items_of_user_2, ...]}``.
        """
        return {k: [user_items[u] for u in item_users[k]] for k in keys}

    ## flatten item-user_items co-occurence
    def flat_clear(self, filtered):
        """Flatten ``filtered`` into ``{item: [distinct co-occurring items]}``.

        The item itself is removed from its own list. De-duplication keeps
        first-seen order (instead of ``set`` order) so results are identical
        from run to run regardless of string hash randomisation.
        """
        cleared = {}
        for item, item_lists in filtered.items():
            distinct = dict.fromkeys(itertools.chain.from_iterable(item_lists))
            ## remove matching keys and items from co-occurrence
            cleared[item] = [other for other in distinct if other != item]
        return cleared

    ## co-occurrence matrix
    def co_occurrence_mat(self, item_users, cleared):
        """Return ``{item: [(other_item, jaccard_similarity), ...]}``.

        The similarity of two items is the number of users they share divided
        by the number of users that have either one (intersection over union).
        """
        users_of = {k: set(v) for k, v in item_users.items()}
        return {
            item: [
                (other, len(users_of[item] & users_of[other]) / len(users_of[item] | users_of[other]))
                for other in neighbours
            ]
            for item, neighbours in cleared.items()
        }

    ## compute similarity score
    def compute_sim_score(self, user_items, user, co_occurrence):
        """Score every item by its average similarity to the user's own items.

        For each candidate item only the similarities towards items found in
        ``user_items[user]`` are summed, and the sum is divided by the number
        of items the user has. (Summing over *all* neighbours, as this method
        previously did, yields the same ranking for every user.)

        Raises:
            KeyError: if ``user`` is not a key of ``user_items``.
        """
        own_items = user_items[user]
        n_items = len(own_items)
        if n_items == 0:
            # Nothing to compare against; avoid dividing by zero.
            return {k: 0.0 for k in co_occurrence}
        owned = set(own_items)
        return {
            k: sum(sim for other, sim in neighbours if other in owned) / n_items
            for k, neighbours in co_occurrence.items()
        }

    ## use co-occurrence matrix
    def generate_recommendations(self, user, co_occurr_mat, user_items, top_n=25):
        """Build the one-row recommendation frame for ``user``.

        Args:
            user: key into ``user_items``.
            co_occurr_mat: mapping as returned by ``co_occurrence_mat``.
            user_items: mapping as returned by ``get_user_items``.
            top_n: maximum number of items to keep (default 25); ``None``
                keeps every scored item.

        Returns:
            DataFrame with columns ``user_id`` and ``item_recommendations``;
            the latter is the kept item ids, best first, joined by spaces.

        Raises:
            KeyError: if ``user`` is not a key of ``user_items``.
            ValueError: if ``top_n`` is negative.
        """
        if top_n is not None and top_n < 0:
            raise ValueError('top_n must be non-negative or None')

        ## calculate weighted average of score for each user_items
        sim_score = self.compute_sim_score(user_items, user, co_occurr_mat)

        ## sort in descending order of sim_score (stable: ties keep dict order)
        ranked = sorted(sim_score, key=sim_score.get, reverse=True)[:top_n]

        # str() lets non-string item ids (e.g. integers) be joined as well.
        return pd.DataFrame({
            'user_id': [user],
            'item_recommendations': [' '.join(str(item) for item in ranked)],
        })

    ## create model
    def create(self, train_data, user_id, item_id):
        """Attach the interaction table and the names of its user/item columns."""
        self.train_data = train_data
        self.user_id = user_id
        self.item_id = item_id
        # Force the similarity tables to be rebuilt for the new data.
        self._cache_source = None
        self._cache_key = None
        self._user_items = None
        self._co_occurrence = None

    def _similarity_tables(self):
        """Return ``(user_items, co_occurrence)``, building them at most once per dataset."""
        key = (self.user_id, self.item_id)
        if self._cache_source is not self.train_data or self._cache_key != key:
            user_items = self.get_user_items()
            item_users = self.get_item_users()
            filtered = self.filter_items(user_items, item_users, list(item_users.keys()))
            cleared = self.flat_clear(filtered)
            self._co_occurrence = self.co_occurrence_mat(item_users, cleared)
            self._user_items = user_items
            self._cache_source = self.train_data
            self._cache_key = key
        return self._user_items, self._co_occurrence

    def recommend(self, user, top_n=25):
        """Return the top ``top_n`` item recommendations for ``user``.

        See ``generate_recommendations`` for the result layout. Raises
        ``KeyError`` when ``user`` does not occur in the training data.
        """
        user_items, co_occurr_matrix = self._similarity_tables()
        return self.generate_recommendations(user, co_occurr_matrix, user_items, top_n)

In [None]:
#train_data = pd.read_csv('../input/traindata/train.csv', delimiter=',')

In [3]:
## load the test CSV (comma separated) into a DataFrame
test_data = pd.read_csv('../input/testdata/test.csv', sep=',')

In [4]:
## keep only the rows whose action_type is "clickout item"
mask = test_data["action_type"].eq("clickout item")
test_click = test_data.loc[mask]

In [None]:
## filter train data by action type
#mask2 = train_data["action_type"] == "clickout item"
#train_click = train_data[mask2]

In [None]:
#df_train = train_click.copy()
#df_train = df_train.iloc[:, [0,5]]
#df_train = df_train.dropna()

In [5]:
## keep the columns at positions 0 and 5, then drop rows containing NaN
df_test = test_click.iloc[:, [0, 5]].dropna()

In [6]:
## independent copy of df_test with the 'reference' column renamed to 'item_id'
df_user = df_test.rename(columns={'reference': 'item_id'})

In [7]:
## Generate recommendation for the test set
def generate_recommendations(df_user, start=50000, out_path='submission_collab_filter.csv'):
    """Recommend items for the users in ``df_user`` and append them to a CSV.

    A ``collab_filter`` model is created from ``df_user`` (columns
    ``user_id`` and ``item_id``) and queried for every unique user from
    position ``start`` onwards in the sorted list of unique user ids.

    Args:
        df_user: DataFrame with ``user_id`` and ``item_id`` columns.
        start: index into the sorted unique users at which to begin
            (default 50000, the value previously hard-coded).
        out_path: CSV file the rows are appended to.

    Returns:
        DataFrame with columns ``user_id`` and ``item_recommendations``, one
        row per processed user; empty when no user was processed.
    """
    unique_users = list(np.unique(df_user['user_id'].values))

    sim_model = collab_filter()
    sim_model.create(df_user, 'user_id', 'item_id')

    ## generate item recommendations for each user_id
    recommends = []
    with open(out_path, 'a') as f:
        # The file is opened for appending so successive runs can extend it.
        # An append-mode stream starts at the end of the file, so position 0
        # means the file is new/empty and the header has not been written yet;
        # writing it unconditionally would insert header rows mid-file.
        if f.tell() == 0:
            f.write('user_id' + ',' + 'item_recommendations\n')
        for user in unique_users[start:]:
            rec = sim_model.recommend(user)
            if rec.empty:
                # Nothing to write for this user; skip instead of indexing row 0.
                continue
            f.write(str(rec['user_id'].values[0]) + ',' + str(rec['item_recommendations'].values[0]) + '\n')
            recommends.append(rec)

    if not recommends:
        # pd.concat raises ValueError on an empty list (e.g. fewer users than `start`).
        return pd.DataFrame(columns=['user_id', 'item_recommendations'])

    return pd.concat(recommends, sort=False)

In [None]:
## run the recommender over df_user; `pred` holds the combined per-user result
## (the call also appends rows to submission_collab_filter.csv)
pred = generate_recommendations(df_user)