# Similarity Measure

In [1]:
import numpy as np
import pandas as pd

In [2]:
# Toy user-by-movie utility matrix: one row per user, one column per movie,
# with NaN wherever the user has not rated that movie.
movie_titles = ["x-men", "antman", "frozen", "cinderalla", "annabelle"]
ratings_by_user = {
    "alice": [np.nan, np.nan, 5, 5, 2],
    "bob": [4, 5, np.nan, 1, np.nan],
}
utility_matrix = pd.DataFrame(
    list(ratings_by_user.values()),
    index=list(ratings_by_user),
    columns=movie_titles,
)
utility_matrix

Unnamed: 0,x-men,antman,frozen,cinderalla,annabelle
alice,,,5.0,5,2.0
bob,4.0,5.0,,1,


## Euclidean Distance

In [3]:
# Pull out alice's row as a Series indexed by movie title (NaN = not rated).
alice_rating = utility_matrix.loc["alice", :]
alice_rating

x-men         NaN
antman        NaN
frozen        5.0
cinderalla    5.0
annabelle     2.0
Name: alice, dtype: float64

In [4]:
# Pull out bob's row as a Series indexed by movie title (NaN = not rated).
bob_rating = utility_matrix.loc["bob", :]
bob_rating

x-men         4.0
antman        5.0
frozen        NaN
cinderalla    1.0
annabelle     NaN
Name: bob, dtype: float64

In [5]:
# Euclidean distance restricted to the movies both users rated: the difference
# is NaN wherever either rating is missing, and nansum treats those NaNs as zero.
rating_gap = alice_rating - bob_rating
np.nansum(rating_gap ** 2) ** 0.5

4.0

## Cosine Similarity

In [6]:
# Cosine similarity needs complete vectors, so every missing rating (NaN)
# is replaced with 0; the result is a plain NumPy array.
alice_rating_filled_zero = np.nan_to_num(alice_rating.values)
alice_rating_filled_zero

array([0., 0., 5., 5., 2.])

In [7]:
# Same zero-filling for bob: NaN (not rated) becomes 0 in a plain NumPy array.
bob_rating_filled_zero = np.nan_to_num(bob_rating.values)
bob_rating_filled_zero

array([4., 5., 0., 1., 0.])

In [8]:
# Magnitude computed by hand: square root of the sum of squared ratings.
alice_sum_of_squares = np.sum(alice_rating_filled_zero ** 2)
print(alice_sum_of_squares ** 0.5)

# np.linalg.norm() gives the same magnitude directly.
magnitude_alice = np.linalg.norm(alice_rating_filled_zero)
magnitude_bob = np.linalg.norm(bob_rating_filled_zero)
print(magnitude_alice)
print(magnitude_bob)

7.3484692283495345
7.3484692283495345
6.48074069840786


In [9]:
# Cosine similarity = dot product of the two vectors / product of their magnitudes.
dot_product = alice_rating_filled_zero.dot(bob_rating_filled_zero)
dot_product / (magnitude_alice * magnitude_bob)

0.10499013139145201

## Pearson's Correlation Coefficient

See https://stats.stackexchange.com/questions/262925/is-there-a-serious-problem-with-dropping-observations-with-missing-values-when-c for the reason why we should normalize each user's ratings by the mean of all of that user's ratings (rather than just the mean over the commonly rated items).

In [10]:
# Mean over the movies alice actually rated; Series.mean() skips NaN by default.
alice_rating_mean = alice_rating.mean()
alice_rating_mean

4.0

In [11]:
# Mean over the movies bob actually rated; Series.mean() skips NaN by default.
bob_rating_mean = bob_rating.mean()
bob_rating_mean

3.3333333333333335

In [12]:
# Centre alice's ratings on her own mean; movies she has not rated stay NaN.
alice_normalized_rating = alice_rating.sub(alice_rating_mean)
alice_normalized_rating

x-men         NaN
antman        NaN
frozen        1.0
cinderalla    1.0
annabelle    -2.0
Name: alice, dtype: float64

In [13]:
# Boolean mask: True for every movie alice has rated.
alice_normalized_rating.notna()

x-men         False
antman        False
frozen         True
cinderalla     True
annabelle      True
Name: alice, dtype: bool

In [14]:
# Centre bob's ratings on his own mean; movies he has not rated stay NaN.
bob_normalized_rating = bob_rating.sub(bob_rating_mean)
bob_normalized_rating

x-men         0.666667
antman        1.666667
frozen             NaN
cinderalla   -2.333333
annabelle          NaN
Name: bob, dtype: float64

In [15]:
# Boolean mask: True for every movie bob has rated.
bob_normalized_rating.notna()

x-men          True
antman         True
frozen        False
cinderalla     True
annabelle     False
Name: bob, dtype: bool

In [16]:
# Movies rated by both users: element-wise AND of the two "has a rating" masks.
alice_has_rating = alice_normalized_rating.notna()
bob_has_rating = bob_normalized_rating.notna()
common_rated_items = alice_has_rating & bob_has_rating
common_rated_items

x-men         False
antman        False
frozen        False
cinderalla     True
annabelle     False
dtype: bool

In [17]:
# Keep only alice's mean-centred ratings for the co-rated movies, as a NumPy array.
alice_normalized_common_rating = alice_normalized_rating[common_rated_items].values
alice_normalized_common_rating

array([1.])

In [18]:
# Magnitude of alice's mean-centred ratings over the co-rated movies
# (np.linalg.norm on a 1-D array, the same call used for the cosine magnitudes).
magnitude_alice_common = np.linalg.norm(alice_normalized_common_rating)
magnitude_alice_common

1.0

In [19]:
# Keep only bob's mean-centred ratings for the co-rated movies, as a NumPy array.
bob_normalized_common_rating = bob_normalized_rating[common_rated_items].values
bob_normalized_common_rating

array([-2.33333333])

In [20]:
# Magnitude of bob's mean-centred ratings over the co-rated movies
# (np.linalg.norm on a 1-D array, the same call used for the cosine magnitudes).
magnitude_bob_common = np.linalg.norm(bob_normalized_common_rating)
magnitude_bob_common

2.3333333333333335

In [21]:
# Numerator of the correlation: dot product of the two mean-centred vectors.
alice_normalized_common_rating.dot(bob_normalized_common_rating)

-2.3333333333333335

In [22]:
# Pearson correlation = dot product of the mean-centred co-rated ratings
# divided by the product of their magnitudes.
# NOTE: with a single co-rated movie (as here) each vector has one element,
# so the ratio reduces to the sign of their product.
centered_dot_product = np.dot(alice_normalized_common_rating, bob_normalized_common_rating)
magnitude_product = magnitude_alice_common * magnitude_bob_common
centered_dot_product / magnitude_product

-1.0