# 0. Import standard packages

In [71]:
import pandas as pd 
import numpy as np
import scipy as sp
import matplotlib.pyplot as plt
import seaborn as sns

# 1. Set-Up of Matrices for Collaborative filtering

First we want to implement the ratings matrix from Table 2.1 featuring 5 users and 6 items. We want to employ a simple user-based and item-based model for imputing ratings. For that we will use the cosine and the pearson-correlation-coefficient.

In [72]:
# create the tables 2.1 and 2.2

table_2_1 = np.array([[7, 6, 7, 4, 5 , 4], [6,7,np.nan, 4,3,4], [np.nan, 3, 3, 1, 1, np.nan], [1, 2, 2 , 3 ,3 , 4], [1, np.nan, 1, 2, 3, 3]])

In [73]:
table_2_1

array([[ 7.,  6.,  7.,  4.,  5.,  4.],
       [ 6.,  7., nan,  4.,  3.,  4.],
       [nan,  3.,  3.,  1.,  1., nan],
       [ 1.,  2.,  2.,  3.,  3.,  4.],
       [ 1., nan,  1.,  2.,  3.,  3.]])

Next, we want to compute the pearson-correlation-coefficient. The formula for this is:

$ \mu_u = \frac{\sum_{k \in I_{u}} r_{uk}}{|I_{u}|} $

$  Sim(u,v) = Pearson(u,v) = \frac{\sum_{k \in I_{u} \cup I_{v}} (r_{uk}- \mu_{u}) \cdot (r_{uk}- \mu_{u}) }{\sqrt{\sum_{k \in I_{u}\cup I_{v}} (r_{uk}- \mu_{u})^2} \cdot \sqrt{\sum_{k \in I_{u}\cup I_{v}} (r_{vk}- \mu_{v})^2}} $

In [74]:
def pearson_corr(table, x: int, y: int):
    # x and y are the rows of table on which to perform computation
    # get I_x and I_y
    I_x = np.where(np.isnan(table[x]) == False)[0]
    I_y = np.where(np.isnan(table[y]) == False)[0]
    # get the common indices
    I = np.intersect1d(I_x, I_y)
    # get the mean of x and y
    x_mean = np.mean(table[x][I])
    y_mean = np.mean(table[y][I])
    # get the numerator
    numerator = np.sum((table[x][I] - x_mean)*(table[y][I] - y_mean))
    # get the denominator
    denominator = np.sqrt(np.sum((table[x][I] - x_mean)**2)*np.sum((table[y][I] - y_mean)**2))
    # get the correlation
    corr = numerator/denominator
    return corr

Let's compute the correlation coefficient for rows 0 and 2 (1 and 3 in the book). In the book, it is given as 0.894.

In [75]:
pearson_corr(table_2_1, 0, 2)

np.float64(0.8944271909999159)

While we're at it, we can also compute the entire correlation matrix:

In [76]:
# generate a correlation matrix for the table
corr_matrix = np.zeros((5,5))
for i in range(5):
    for j in range(5):
        corr_matrix[i,j] = pearson_corr(table_2_1, i, j)

corr_matrix

array([[ 1.        ,  0.72347804,  0.89442719, -0.8992288 , -0.82422559],
       [ 0.72347804,  1.        ,  0.97072534, -0.72057669, -0.8992288 ],
       [ 0.89442719,  0.97072534,  1.        , -1.        , -0.8660254 ],
       [-0.8992288 , -0.72057669, -1.        ,  1.        ,  0.87705802],
       [-0.82422559, -0.8992288 , -0.8660254 ,  0.87705802,  1.        ]])

Let's write a function that does this:

In [77]:
def corr_matrix_func(table, dimension):
    if dimension == 0:
        number = table.shape[dimension]
        corr_matrix = np.zeros((number, number))
        for i in range(number):
            for j in range(number):
                corr_matrix[i,j] = pearson_corr(table, i, j)
    elif dimension == 1:
        number = table.shape[dimension]
        corr_matrix = np.zeros((number, number))
        table = table.T
        for i in range(number):
            for j in range(number):
                corr_matrix[i,j] = pearson_corr(table, i, j)
    else:
        print("Invalid dimension")
        return None
    return corr_matrix

In [78]:
corr_matrix_func(table = table_2_1, dimension = 0)

array([[ 1.        ,  0.72347804,  0.89442719, -0.8992288 , -0.82422559],
       [ 0.72347804,  1.        ,  0.97072534, -0.72057669, -0.8992288 ],
       [ 0.89442719,  0.97072534,  1.        , -1.        , -0.8660254 ],
       [-0.8992288 , -0.72057669, -1.        ,  1.        ,  0.87705802],
       [-0.82422559, -0.8992288 , -0.8660254 ,  0.87705802,  1.        ]])

In [79]:
corr_matrix_func(table_2_1, 0), corr_matrix_func(table_2_1, 1)

  corr = numerator/denominator


(array([[ 1.        ,  0.72347804,  0.89442719, -0.8992288 , -0.82422559],
        [ 0.72347804,  1.        ,  0.97072534, -0.72057669, -0.8992288 ],
        [ 0.89442719,  0.97072534,  1.        , -1.        , -0.8660254 ],
        [-0.8992288 , -0.72057669, -1.        ,  1.        ,  0.87705802],
        [-0.82422559, -0.8992288 , -0.8660254 ,  0.87705802,  1.        ]]),
 array([[1.        , 0.94063416, 0.98782916, 0.89714996, 0.67675297,
         0.57263713],
        [0.94063416, 1.        , 0.99862543, 0.69310328, 0.51449576,
                nan],
        [0.98782916, 0.99862543, 1.        , 0.6381449 , 0.62092042,
         0.62861856],
        [0.89714996, 0.69310328, 0.6381449 , 1.        , 0.81348922,
         0.87038828],
        [0.67675297, 0.51449576, 0.62092042, 0.81348922, 1.        ,
         0.33333333],
        [0.57263713,        nan, 0.62861856, 0.87038828, 0.33333333,
         1.        ]]))

This way we can compute the correlation coefficient both for rows or columns.

Next, we can demean the rating in every line.

In [80]:
def demean_func(table):
    # demean the table
    new_table = np.zeros(table.shape)
    for i in range(table.shape[0]):
        I = np.where(np.isnan(table[i]) == False)[0]
        new_table[i][I] = table[i][I] - np.mean(table[i][I])
    return new_table

In [81]:
table_2_2 = demean_func(table_2_1)
table_2_2

array([[ 1.5,  0.5,  1.5, -1.5, -0.5, -1.5],
       [ 1.2,  2.2,  0. , -0.8, -1.8, -0.8],
       [ 0. ,  1. ,  1. , -1. , -1. ,  0. ],
       [-1.5, -0.5, -0.5,  0.5,  0.5,  1.5],
       [-1. ,  0. , -1. ,  0. ,  1. ,  1. ]])

We also want to store the means somewhere. Let us write a function for that.

In [120]:
def mean_compute(table, dimension = 0):
    # compute the mean of the table
    if dimension == 1:
        table_copy = table.T
    else:
        table_copy = table
    mean = np.zeros(table_copy.shape[0])
    for i in range(table_copy.shape[0]):
        I = np.where(np.isnan(table_copy[i]) == False)[0]
        mean[i] = np.mean(table_copy[i][I])
    return mean

user_mean = mean_compute(table_2_1, 0)
user_mean

array([5.5, 4.8, 2. , 2.5, 2. ])

In [121]:
mean_compute(table_2_1, 1)

array([3.75, 4.5 , 3.25, 2.8 , 3.  , 3.75])

We can use the demeaned table to better predict a rating. Suppose that we want to predict for user in 3 (row-index 2) their ratings for movies 1 and 6 (indices 0 and 5, respectively). We can first compute the set of relevant neighbours as those whose correlation exceeds 0.7 (as an arbitrary benchmark).

In [87]:
corr_matrix_table_2_2 = corr_matrix_func(table_2_1, 0)
corr_matrix_table_2_2

array([[ 1.        ,  0.72347804,  0.89442719, -0.8992288 , -0.82422559],
       [ 0.72347804,  1.        ,  0.97072534, -0.72057669, -0.8992288 ],
       [ 0.89442719,  0.97072534,  1.        , -1.        , -0.8660254 ],
       [-0.8992288 , -0.72057669, -1.        ,  1.        ,  0.87705802],
       [-0.82422559, -0.8992288 , -0.8660254 ,  0.87705802,  1.        ]])

Aside from themselves, this is the case for users 1 and 2 (indices 0 and 1):

In [88]:
k = np.where((corr_matrix_table_2_2[2] > 0.7))
k = k[0][:-1]
k

array([0, 1])

In [90]:
# create a deep copy of table_2_2
table_2_1_copy = table_2_1.copy()
table_2_1_copy[2,0] = user_mean[2] + np.sum(corr_matrix_table_2_2[2][k]*table_2_2[k,0])/np.sum(np.abs(corr_matrix_table_2_2[2][k]))
table_2_1_copy

array([[7.        , 6.        , 7.        , 4.        , 5.        ,
        4.        ],
       [6.        , 7.        ,        nan, 4.        , 3.        ,
        4.        ],
       [3.34386392, 3.        , 3.        , 1.        , 1.        ,
               nan],
       [1.        , 2.        , 2.        , 3.        , 3.        ,
        4.        ],
       [1.        ,        nan, 1.        , 2.        , 3.        ,
        3.        ]])

Let us encapsulate this in a function. We will have to specify a correlation-threshold to determine k, and then within the function we create a deep-copy that fills in the NaN-values.

In [122]:
def imputation_function(table, threshold, dimension):
    table_copy = table.copy()
    # compute correlation_matrix
    corr_matrix = corr_matrix_func(table, dimension)
    # compute means
    user_mean = mean_compute(table, dimension)
    # demean the table
    table_copy_dm = demean_func(table_copy)
    # for each row in correlation matrix compute the k based on threshold
    k = [[] for _ in range(corr_matrix.shape[0])]
    for i in range(corr_matrix.shape[0]):
        k[i] = np.where((corr_matrix[i] > threshold))[0]
    # for each value of k, remove the same number if it is in k, e.g. remove 2 from k[2]
    for i in range(corr_matrix.shape[0]):
        k[i] = np.setdiff1d(k[i], i)
    # for each row and column in the table, compute the imputation
    for i in range(table.shape[0]):
        for j in range(table.shape[1]):
            if np.isnan(table[i,j]):
                table_copy[i,j] = user_mean[i] + np.sum(corr_matrix[i][k[i]]*table_copy_dm[k[i],j])/np.sum(np.abs(corr_matrix[i][k[i]]))
    return table_copy, corr_matrix, user_mean, k

In [123]:
table_2_1_imputed, corr_matrix_comp, user_mean_comp, k_comp = imputation_function(table_2_1, 0.7, 0)

In [124]:
table_2_1_imputed

array([[7.        , 6.        , 7.        , 4.        , 5.        ,
        4.        ],
       [6.        , 7.        , 6.0135157 , 4.        , 3.        ,
        4.        ],
       [3.34386392, 3.        , 3.        , 1.        , 1.        ,
        0.86431752],
       [1.        , 2.        , 2.        , 3.        , 3.        ,
        4.        ],
       [1.        , 1.5       , 1.        , 2.        , 3.        ,
        3.        ]])

In [125]:
corr_matrix_comp

array([[ 1.        ,  0.72347804,  0.89442719, -0.8992288 , -0.82422559],
       [ 0.72347804,  1.        ,  0.97072534, -0.72057669, -0.8992288 ],
       [ 0.89442719,  0.97072534,  1.        , -1.        , -0.8660254 ],
       [-0.8992288 , -0.72057669, -1.        ,  1.        ,  0.87705802],
       [-0.82422559, -0.8992288 , -0.8660254 ,  0.87705802,  1.        ]])

In [126]:
user_mean_comp

array([5.5, 4.8, 2. , 2.5, 2. ])

In [127]:
k_comp

[array([1, 2]), array([0, 2]), array([0, 1]), array([4]), array([3])]

# Interestingly enough, we can do the entire thing now as well for items!

In [128]:
table_2_1_imputed_cols, corr_matrix_comp_cols, user_mean_comp_cols, k_comp_cols = imputation_function(table_2_1, 0.7, 1)
table_2_1_imputed_cols

  corr = numerator/denominator


array([[7.        , 6.        , 7.        , 4.        , 5.        ,
        4.        ],
       [6.        , 7.        , 5.74252405, 4.        , 3.        ,
        4.        ],
       [4.59918476, 3.        , 3.        , 1.        , 1.        ,
        2.10190223],
       [1.        , 2.        , 2.        , 3.        , 3.        ,
        4.        ],
       [1.        , 2.5       , 1.        , 2.        , 3.        ,
        3.        ]])

In [129]:
corr_matrix_comp_cols

array([[1.        , 0.94063416, 0.98782916, 0.89714996, 0.67675297,
        0.57263713],
       [0.94063416, 1.        , 0.99862543, 0.69310328, 0.51449576,
               nan],
       [0.98782916, 0.99862543, 1.        , 0.6381449 , 0.62092042,
        0.62861856],
       [0.89714996, 0.69310328, 0.6381449 , 1.        , 0.81348922,
        0.87038828],
       [0.67675297, 0.51449576, 0.62092042, 0.81348922, 1.        ,
        0.33333333],
       [0.57263713,        nan, 0.62861856, 0.87038828, 0.33333333,
        1.        ]])

In [130]:
k_comp_cols

[array([1, 2, 3]),
 array([0, 2]),
 array([0, 1]),
 array([0, 4, 5]),
 array([3]),
 array([3])]

# 2. We have now computed using the pearson-correlation coefficient. Next up, let's turn towards using the cosine-distance as well as z-scores.