In [1]:
import numpy as np
import pandas as pd
import math
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt

%matplotlib inline

In [2]:
## item-similarity
class collab_filter():
    """Item-based collaborative filter over click data.

    Two items are similar when overlapping sets of users clicked them (Jaccard
    index of their user sets). Candidate items for a user are scored by their
    mean similarity to the items that user already clicked. When no candidate
    has a positive score, the most-clicked items for the user's platform and
    device are recommended instead.

    Call create() before any other method. Every recommendation frame returned
    by this class has the columns 'user_id' and 'item_recommendations', the
    latter being a space-separated string of item ids, best first.
    """

    # Column names of every recommendation frame returned by this class.
    _OUT_COLUMNS = ['user_id', 'item_recommendations']

    def __init__(self):
        # Interaction frame and the names of its relevant columns; all of
        # these are populated by create().
        self.train_data = None
        self.user_id = None
        self.item_id = None
        self.platform = None
        self.device = None
        self.co_matrix = None  # not assigned anywhere else in this class

    def _columns(self):
        """Return the (user, item, platform, device) column names.

        A name that was never configured falls back to the literal column name
        the original implementation hard-coded.
        """
        def pick(configured, default):
            return default if configured is None else configured

        return (pick(self.user_id, 'user_id'),
                pick(self.item_id, 'item_id'),
                pick(self.platform, 'platform'),
                pick(self.device, 'device'))

    @staticmethod
    def _single_row(user, items):
        """Build a one-row recommendation frame for `user`."""
        return pd.DataFrame({
            'user_id': [user],
            'item_recommendations': [' '.join(str(item) for item in items)],
        })

    ## retrieve unique items for a user
    def get_unique_items(self, user):
        """Return the distinct items clicked by `user`, in first-seen order.

        A missing item id is kept (once) if the user has rows without an item.
        """
        user_col, item_col, _, _ = self._columns()
        user_data = self.train_data[self.train_data[user_col] == user]
        # drop_duplicates keeps first-appearance order, so results (and the
        # tie-breaking that depends on them) are reproducible across runs,
        # unlike iterating over a set of strings.
        return user_data[item_col].drop_duplicates().tolist()

    ## retrieve unique users for an item
    def get_unique_users(self, item):
        """Return the distinct users who clicked `item`, in first-seen order."""
        user_col, item_col, _, _ = self._columns()
        item_data = self.train_data[self.train_data[item_col] == item]
        return item_data[user_col].drop_duplicates().tolist()

    ## retrieve all unique items in training data
    def get_all_items(self):
        """Return every distinct item in the training data, in first-seen order."""
        _, item_col, _, _ = self._columns()
        return self.train_data[item_col].drop_duplicates().tolist()

    ## Doing popularity based recommendation by platform and device
    def popularity_by_geodev(self, loc, dev_typ, user=None, top_k=25):
        """Recommend by click popularity within one platform/device segment.

        Rows with a missing user, item, platform or device are ignored.

        Parameters
        ----------
        loc : value matched against the platform column.
        dev_typ : value matched against the device column.
        user : optional user to build the recommendation for. When given, the
            result is a single row for that user listing the segment's
            most-clicked items (all users pooled) that the user has not
            clicked yet. When omitted, the historical behavior is kept: the
            `top_k` most-clicked (user, item) pairs of the segment are
            returned, grouped into one row per user that appears in them.
        top_k : maximum number of items (with `user`) or pairs (without).

        Returns
        -------
        pandas.DataFrame with columns 'user_id' and 'item_recommendations'.
        With `user` given there is always exactly one row; its recommendation
        string is empty when the segment has no clicks.
        """
        user_col, item_col, plat_col, dev_col = self._columns()
        clicks = self.train_data.dropna(subset=[user_col, item_col, plat_col, dev_col])
        clicks = clicks[(clicks[plat_col] == loc) & (clicks[dev_col] == dev_typ)]

        if user is not None:
            if clicks.empty:
                return self._single_row(user, [])
            # size() counts rows per item without going through
            # DataFrameGroupBy.apply; mergesort keeps ties in a stable order.
            totals = clicks.groupby(item_col).size()\
                           .sort_values(ascending=False, kind='mergesort')
            seen = set(self.get_unique_items(user))
            picks = [item for item in totals.index.tolist() if item not in seen]
            return self._single_row(user, picks[:top_k])

        if clicks.empty:
            return pd.DataFrame(columns=self._OUT_COLUMNS)

        ## count number of clicks per (user, item) pair and keep the top pairs
        pair_counts = clicks.groupby([user_col, item_col]).size()\
                            .reset_index(name='n_clicks')\
                            .sort_values(by=['n_clicks'], ascending=False, kind='mergesort')\
                            .iloc[:top_k, :]

        ## one space-separated item string per user
        joined = pair_counts[item_col].astype(str)\
                                      .groupby(pair_counts[user_col])\
                                      .agg(' '.join)
        return pd.DataFrame({
            'user_id': joined.index.tolist(),
            'item_recommendations': joined.tolist(),
        })

    ## co-occurrence matrix
    def co_occurrence_mat(self, user_items, all_items):
        """Jaccard similarity between the user's items and all items.

        Entry [j, i] is |U_j & U_i| / |U_j | U_i| where U_x is the set of users
        who clicked item x; it is 0 when the two items share no user. Rows of
        the training data with a missing user or item do not contribute.

        Returns
        -------
        numpy.matrix of shape (len(user_items), len(all_items)).
        """
        user_col, item_col, _, _ = self._columns()

        # One pass over the data builds item -> users, instead of filtering
        # the whole frame once per item.
        known = self.train_data[[item_col, user_col]].dropna()
        users_by_item = {}
        for item, usr in zip(known[item_col].tolist(), known[user_col].tolist()):
            users_by_item.setdefault(item, set()).add(usr)

        no_users = frozenset()
        owned_users = [users_by_item.get(item, no_users) for item in user_items]

        scores = np.zeros((len(user_items), len(all_items)), dtype=float)
        for col, candidate in enumerate(all_items):
            cand_users = users_by_item.get(candidate, no_users)
            if not cand_users:
                continue  # nobody clicked it: the whole column stays 0
            for row, own_users in enumerate(owned_users):
                shared = len(cand_users & own_users)
                if shared:
                    union_size = len(cand_users) + len(own_users) - shared
                    scores[row, col] = shared / float(union_size)

        # Returned as np.matrix to keep the original return type for callers.
        return np.matrix(scores)

    ## use co-occurrence matrix
    def generate_recommendations(self, user, co_occurr_mat, all_items, user_items, top_k=25):
        """Turn a co-occurrence matrix into a one-row recommendation frame.

        Each candidate's score is the mean of its column in `co_occurr_mat`
        (one row per item in `user_items`, one column per item in `all_items`).
        Items the user already clicked and items whose score is not strictly
        positive (including NaN) are never recommended; ties are broken in
        favor of the item that appears later in `all_items`.

        When nothing qualifies, the result comes from popularity_by_geodev()
        for the platform and device of the user's first training row. A user
        with no training rows gets an empty recommendation string.

        Returns
        -------
        pandas.DataFrame with one row and the columns 'user_id' and
        'item_recommendations'.
        """
        sims = np.asarray(co_occurr_mat, dtype=float)
        if sims.shape[0] == 0:
            # No known items for this user: avoid the 0/0 division.
            mean_scores = []
        else:
            mean_scores = (sims.sum(axis=0) / float(sims.shape[0])).tolist()

        owned = set(user_items)
        candidates = [(score, idx) for idx, score in enumerate(mean_scores)
                      if score > 0 and all_items[idx] not in owned]
        candidates.sort(reverse=True)
        picks = [all_items[idx] for _, idx in candidates[:top_k]]

        if picks:
            return self._single_row(user, picks)

        ## no collaborative signal: recommend by popularity for the user's
        ## platform and device
        user_col, _, plat_col, dev_col = self._columns()
        history = self.train_data.loc[self.train_data[user_col] == user]
        if history.empty:
            return self._single_row(user, [])
        return self.popularity_by_geodev(history[plat_col].iloc[0],
                                         history[dev_col].iloc[0],
                                         user=user, top_k=top_k)

    ## create model
    def create(self, train_data, user_id, item_id, platform, device):
        """Attach the interaction frame and the names of its columns.

        `user_id`, `item_id`, `platform` and `device` are column names of
        `train_data`, not values. The frame is stored by reference, not copied.
        """
        self.train_data = train_data
        self.user_id = user_id
        self.item_id = item_id
        self.platform = platform
        self.device = device

    def recommend(self, user):
        """Return the one-row recommendation frame for `user`."""
        user_items = self.get_unique_items(user)
        all_items = self.get_all_items()

        ## co-occurrence matrix
        co_occurr_matrix = self.co_occurrence_mat(user_items, all_items)

        df_recommendations = self.generate_recommendations(user, co_occurr_matrix, all_items, user_items)

        return df_recommendations

In [3]:
# Load the raw comma-separated test events.
test_data = pd.read_csv('../input/testdata/test.csv', sep=',')

In [4]:
## keep only the rows whose action_type is "clickout item"
mask = test_data["action_type"].eq("clickout item")
test_click = test_data.loc[mask]

In [5]:
# Independent copy of the click-out rows, reduced to the first four columns
# (selected by position).
df_test = test_click.copy().iloc[:, [0, 1, 2, 3]]

In [6]:
## select columns 0, 5, 6 and 8 by position and expose 'reference' as 'item_id'
df2_test = (
    test_click.copy()
    .iloc[:, [0, 5, 6, 8]]
    .rename(columns={'reference': 'item_id'})
)

In [7]:
## independent copy used as model input; rows with missing values are kept
## (no dropna is applied here)
df_user = df2_test.copy(deep=True)

In [8]:
# user_id of every row whose item_id is missing (one entry per such row)
item_users = df2_test.loc[df2_test['item_id'].isna(), 'user_id'].values

In [13]:
## generate recommendations
def compute_recommendations(df_user, item_users, output_path='output.csv'):
    """Recommend items for each user and append the results to a TSV file.

    Parameters
    ----------
    df_user : pandas.DataFrame passed to collab_filter.create() with the
        column names 'user_id', 'item_id', 'platform' and 'device'.
    item_users : iterable of user ids to generate recommendations for.
    output_path : file that receives one '<user>\\t<items>' line per user.
        It is opened in append mode, so lines from earlier runs are kept.

    Returns
    -------
    pandas.DataFrame concatenating every non-empty recommendation frame; an
    empty frame with the columns 'user_id' and 'item_recommendations' when
    there is none.
    """
    sim_model = collab_filter()
    sim_model.create(df_user, 'user_id', 'item_id', 'platform', 'device')

    ## generate item recommendations for each user_id
    recommends = []
    # Open the file once for the whole run instead of once per user.
    with open(output_path, 'a') as f:
        for user in item_users:
            df_rec = sim_model.recommend(user)
            if df_rec.empty:
                # Nothing to recommend: still emit a line so every requested
                # user is present in the output.
                items = ''
            else:
                items = str(df_rec['item_recommendations'].values[0])
                recommends.append(df_rec)
            # Key the line by the requested user, not by whatever user id the
            # returned frame happens to carry in its first row.
            f.write(str(user) + '\t' + items + '\n')
            # Flush per user so partial progress survives an interrupted run.
            f.flush()

    if not recommends:
        # pd.concat raises on an empty list.
        return pd.DataFrame(columns=['user_id', 'item_recommendations'])

    df = pd.concat(recommends, sort=False, ignore_index=True)

    return df

In [None]:
# Run the recommender for every user collected in item_users.
pred = compute_recommendations(df_user=df_user, item_users=item_users)