# Neighborhood Based Brute Force Collaborative Filtering

In [18]:
import numpy as np
import math 
import pandas as pd

In [15]:
class Similarity:
    """Similarity measures between two equally long rating vectors.

    ``nonmissing_mean`` treats an entry equal to 0 as a missing rating.
    Every ratio in this class returns 0.0 instead of dividing by zero, so
    all-zero or constant vectors yield a similarity of 0.0 rather than
    NaN/inf plus a NumPy RuntimeWarning.
    """

    def nonmissing_mean(self, u):
        """Return the mean of the nonzero entries of ``u``.

        Args:
            u: 1-D sequence of numbers (list, ndarray or pandas Series).

        Returns:
            Sum of ``u`` divided by its number of nonzero entries, or 0.0
            when ``u`` has no nonzero entry.
        """
        nonzero = np.count_nonzero(u)
        if nonzero == 0:
            # No observed rating: avoid the 0/0 that would produce NaN.
            return 0.0
        return np.sum(u) / nonzero

    def inner_product(self, u, v):
        """Return the dot product of ``u`` and ``v``."""
        return np.dot(u, v)

    def cosine_similarity(self, u, v):
        """Return the cosine of the angle between ``u`` and ``v``.

        Returns 0.0 when either vector has zero norm, because the cosine
        is undefined there.
        """
        norm_product = math.sqrt(np.dot(u, u)) * math.sqrt(np.dot(v, v))
        if norm_product == 0:
            return 0.0
        return np.dot(u, v) / norm_product

    def pearson_similarity(self, u, v):
        """Return the Pearson-style similarity of ``u`` and ``v``.

        Each vector is shifted by its own ``nonmissing_mean`` and the
        cosine similarity of the shifted vectors is returned. The shift is
        applied to every entry, including the zero (missing) ones.

        Returns 0.0 when either shifted vector has zero norm.
        """
        # np.asarray makes the subtraction explicit for plain Python lists.
        u_centered = np.asarray(u) - self.nonmissing_mean(u)
        v_centered = np.asarray(v) - self.nonmissing_mean(v)
        return self.cosine_similarity(u_centered, v_centered)
    

In [16]:
"""Usage Example"""
# Example: compare two equal-length rating vectors with each similarity measure.
sim = Similarity() 
# Similarity.nonmissing_mean counts only the nonzero entries of these vectors.
u = [0, 0, 2, 1, 4]
v = [0, 0, 1, 2, 3]

# Bare tuple expression (Pearson, inner product, cosine) closing the cell.
sim.pearson_similarity(u, v), sim.inner_product(u, v), sim.cosine_similarity(u, v)

(0.9086882225022429, 16, 0.933138949631687)

In [115]:
class Neighbor_Similarities:
    """Brute-force user-user Pearson similarity matrices.

    Every row of the input is treated as one user's rating vector and is
    compared against every row with ``Similarity.pearson_similarity``.
    """

    def _pairwise_pearson(self, rows, zero_diagonal=False):
        """Return the ``len(rows) x len(rows)`` similarity matrix as an ndarray."""
        sim = Similarity()
        count = len(rows)
        scores = np.empty((count, count))
        for i, row_i in enumerate(rows):
            # The Pearson formula is symmetric in its two arguments, so each
            # unordered pair is scored once and mirrored across the diagonal.
            for j in range(i, count):
                score = sim.pearson_similarity(row_i, rows[j])
                scores[i, j] = score
                scores[j, i] = score
        if zero_diagonal:
            np.fill_diagonal(scores, 0.0)
        return scores

    def user_based_np(self, matrix, zero_diagonal=False):
        """Return the user-user similarity matrix for a 2-D array.

        Args:
            matrix: 2-D array-like with one row per user.
            zero_diagonal: When True, self-similarities on the diagonal are
                replaced with 0. Defaults to False, which keeps them.

        Returns:
            ndarray of shape (users, users).
        """
        return self._pairwise_pearson(list(matrix), zero_diagonal)

    def user_based_pd(self, matrix, zero_diagonal=False):
        """Return the user-user similarity matrix for a DataFrame.

        Rows are addressed by position, so the frame's index may hold any
        labels (it does not have to be 0..n-1).

        Args:
            matrix: DataFrame with one row per user.
            zero_diagonal: When True, self-similarities on the diagonal are
                replaced with 0. Defaults to False, which keeps them.

        Returns:
            DataFrame whose index (named 'indices') and columns both carry
            the labels of ``matrix.index``.
        """
        rows = [row for _, row in matrix.iterrows()]
        scores = self._pairwise_pearson(rows, zero_diagonal)
        return pd.DataFrame(
            scores,
            index=pd.Index(matrix.index, name='indices'),
            columns=matrix.index,
        )


In [117]:
"""Usage Example"""

# Stack the example vectors u and v as the two rows of a 2 x 5 array.
matrix = np.array([u, v])

NS = Neighbor_Similarities() 
# Pairwise Pearson similarities between the rows of the NumPy array.
print(NS.user_based_np(matrix))

# The same data wrapped in a DataFrame with labelled columns.
matrix_pd = pd.DataFrame(matrix)
matrix_pd.columns = ['a', 'b', 'c', 'd', 'e']
NS.user_based_pd(matrix_pd)


[[1.         0.90868822]
 [0.90868822 1.        ]]


Unnamed: 0_level_0,0,1
indices,Unnamed: 1_level_1,Unnamed: 2_level_1
0,1.0,0.908688
1,0.908688,1.0


In [112]:
class UserGamePredictor:
    """Fill in missing user-game values with user-based neighborhood
    collaborative filtering."""

    # Similarity-matrix builder used by predict().
    NS = Neighbor_Similarities()

    def predict(self, user_item_dataframe, k):
        """Predict every missing entry of a user x game table.

        An entry is treated as observed when it is greater than 0. For a
        user ``u`` and an unobserved game ``j`` the prediction is the
        mean-centered weighted average over the ``k`` most similar users
        that have an observed value for ``j``::

            mean(u) + sum(sim(u, v) * (value[v, j] - mean(v))) / sum(|sim(u, v)|)

        NOTE(review): the previous body was unfinished (it did not parse);
        this is the standard neighborhood formula its variable names
        pointed at - confirm it is the intended one.

        Args:
            user_item_dataframe: DataFrame with one row per user and one
                numeric column per game.
            k: Maximum number of neighbors used per prediction. Values
                below 1 yield no predictions.

        Returns:
            Float ndarray with the shape of ``user_item_dataframe``.
            Observed entries are copied unchanged; an unobserved entry
            keeps its original value when no neighbor with a finite,
            nonzero total similarity weight is available.
        """
        sim = Similarity()
        user_similarity_matrix = self.NS.user_based_pd(user_item_dataframe)
        #might need to run this only on set with time played
        print(user_similarity_matrix)

        # Work positionally so the frame's index/column labels do not matter.
        similarities = np.asarray(user_similarity_matrix, dtype=float)
        ratings = np.asarray(user_item_dataframe, dtype=float)
        # Float copy: an integer result array would truncate the predictions.
        result = ratings.copy()
        observed = ratings > 0
        user_means = np.array(
            [sim.nonmissing_mean(row) for row in ratings], dtype=float
        )
        neighbor_limit = max(int(k), 0)

        for user, mean in enumerate(user_means):
            for game in np.flatnonzero(~observed[user]):
                # Only users with an observed value for this game can vote;
                # `user` itself is excluded because its entry is unobserved.
                neighbors = np.flatnonzero(observed[:, game])
                weights = similarities[user, neighbors]

                # Drop undefined (NaN/inf) similarities before ranking.
                usable = np.isfinite(weights)
                neighbors = neighbors[usable]
                weights = weights[usable]

                # Keep the k most similar neighbors (stable for ties).
                top = np.argsort(-weights, kind="stable")[:neighbor_limit]
                neighbors = neighbors[top]
                weights = weights[top]

                weight_total = np.abs(weights).sum()
                if weight_total == 0:
                    # No informative neighbor: leave the entry as it was.
                    continue

                offsets = ratings[neighbors, game] - user_means[neighbors]
                result[user, game] = mean + np.dot(weights, offsets) / weight_total

        return result
                
                
                
                
                
                
                
                
                
        
        
        
        

In [121]:
"""Usage Example"""
# Load the processed user x game table and predict its missing entries.
processed_directory = "../../data/processed"
# `sep` is passed by keyword: pandas 2.0 made every read_csv argument after
# the path keyword-only, so the positional ',' no longer works there.
one_hot_collapsed = pd.read_csv(processed_directory+"/one_hot_collapsed.csv", sep=',')
one_hot_collapsed.describe()

# Predict with at most 3 neighbors per missing entry.
predictor = UserGamePredictor()
predictor.predict(one_hot_collapsed, 3)

              0         1         2         3         4         5         6    \
indices                                                                         
0        1.000000  0.995226  0.960815  0.806150  1.000000  0.995225  1.000000   
1        0.995226  1.000000  0.983282  0.860053  0.995229  1.000000  0.995229   
2        0.960815  0.983282  1.000000  0.938578  0.960825  0.983283  0.960825   
3        0.806150  0.860053  0.938578  1.000000  0.806172  0.860057  0.806172   
4        1.000000  0.995229  0.960825  0.806172  1.000000  0.995229  1.000000   
5        0.995225  1.000000  0.983283  0.860057  0.995229  1.000000  0.995229   
6        1.000000  0.995229  0.960825  0.806172  1.000000  0.995229  1.000000   
7        1.000000  0.995229  0.960825  0.806172  1.000000  0.995229  1.000000   
8        0.934475  0.964762  0.996545  0.963994  0.934488  0.964764  0.934488   
9        1.000000  0.995229  0.960825  0.806172  1.000000  0.995229  1.000000   
10       0.981658  0.995579 