# Numpy : Distance Measure

In [1]:
import pandas as pd
import numpy as np
import scipy as sp

In [2]:
import os, sys

In [3]:
print( os.curdir )
print( os.getcwd() )
print( os.path.abspath( os.getcwd() ) )

.
/Users/nururrahman/myRecSystem
/Users/nururrahman/myRecSystem


## Data

In [4]:
m = np.asarray([
    [4,5,np.nan,5,1,np.nan,3,2],
    [np.nan,3,4,3,1,2,1,np.nan],
    [2,np.nan,1,3,np.nan,4,5,3]
])

In [5]:
print(m.shape)
print(m)

(3, 8)
[[ 4.  5. nan  5.  1. nan  3.  2.]
 [nan  3.  4.  3.  1.  2.  1. nan]
 [ 2. nan  1.  3. nan  4.  5.  3.]]


## Case 1 : Treat Data Matrix as Boolean True and False

In [6]:
m1 = np.where( np.isnan(m), False, True )
print(m1)

[[ True  True False  True  True False  True  True]
 [False  True  True  True  True  True  True False]
 [ True False  True  True False  True  True  True]]


#### Jaccard Distance

In [7]:
intersection = np.logical_and(m1[0,:], m1[1,:])
union = np.logical_or(m1[0,:], m1[1,:])
jd_ab = intersection.sum() / float(union.sum())

intersection = np.logical_and(m1[0,:], m1[2,:])
union = np.logical_or(m1[0,:], m1[2,:])
jd_ac = intersection.sum() / float(union.sum())

intersection = np.logical_and(m1[1,:], m1[2,:])
union = np.logical_or(m1[1,:], m1[2,:])
jd_bc = intersection.sum() / float(union.sum())

print( 'Jaccard Distances AB, AC, BC :', round(jd_ab,4), round(jd_ac,4), round(jd_bc,4) ) 

Jaccard Distances AB, AC, BC : 0.5 0.5 0.5


#### Cosine Distance

In [8]:
cos_ab = np.dot(m1[0,:], m1[1,:]) / (np.linalg.norm(m1[0,:]) * np.linalg.norm(m1[1,:]))
cos_ac = np.dot(m1[0,:], m1[2,:]) / (np.linalg.norm(m1[0,:]) * np.linalg.norm(m1[2,:]))
cos_bc = np.dot(m1[1,:], m1[2,:]) / (np.linalg.norm(m1[1,:]) * np.linalg.norm(m1[2,:]))

print('Cosine Distances AB, AC, BC :', round(cos_ab,4), round(cos_ac,4), round(cos_bc,4) ) 

Cosine Distances AB, AC, BC : 0.1667 0.1667 0.1667


## Case 2 : Treat Data Matrix as Binary 0 and 1

In [9]:
m2 = np.where( np.isnan(m), 0, m)
m2 = np.where( np.isin(m2, np.array([0,1,2])), 0, 1)
print(m2)

[[1 1 0 1 0 0 1 0]
 [0 1 1 1 0 0 0 0]
 [0 0 0 1 0 1 1 1]]


#### Jaccard Distance

In [10]:
intersection = np.logical_and(m2[0,:], m2[1,:])
union = np.logical_or(m2[0,:], m2[1,:])
jd_ab = intersection.sum() / float(union.sum())

intersection = np.logical_and(m2[0,:], m2[2,:])
union = np.logical_or(m2[0,:], m2[2,:])
jd_ac = intersection.sum() / float(union.sum())

intersection = np.logical_and(m2[1,:], m2[2,:])
union = np.logical_or(m2[1,:], m2[2,:])
jd_bc = intersection.sum() / float(union.sum())

print( 'Jaccard Distances AB, AC, BC :', round(jd_ab,4), round(jd_ac,4), round(jd_bc,4) ) 

Jaccard Distances AB, AC, BC : 0.4 0.3333 0.1667


#### Cosine Distance

In [11]:
cos_ab = np.dot(m2[0,:], m2[1,:]) / (np.linalg.norm(m2[0,:]) * np.linalg.norm(m2[1,:]))
cos_ac = np.dot(m2[0,:], m2[2,:]) / (np.linalg.norm(m2[0,:]) * np.linalg.norm(m2[2,:]))
cos_bc = np.dot(m2[1,:], m2[2,:]) / (np.linalg.norm(m2[1,:]) * np.linalg.norm(m2[2,:]))

print( 'Cosine Distances AB, AC, BC :', round(cos_ab,4), round(cos_ac,4), round(cos_bc,4) ) 

Cosine Distances AB, AC, BC : 0.5774 0.5 0.2887


In [12]:
np.isin(m2, np.array([1,2]))

array([[ True,  True, False,  True, False, False,  True, False],
       [False,  True,  True,  True, False, False, False, False],
       [False, False, False,  True, False,  True,  True,  True]])

## Case 3 : Treat Data Matrix as Numeric

In [13]:
mean_val = np.nanmean(m, axis=1).reshape( m.shape[0], 1)
m3 = m - mean_val
m3 = np.where( np.isnan(m3), 0 , m3) 
print(m3)

[[ 0.66666667  1.66666667  0.          1.66666667 -2.33333333  0.
  -0.33333333 -1.33333333]
 [ 0.          0.66666667  1.66666667  0.66666667 -1.33333333 -0.33333333
  -1.33333333  0.        ]
 [-1.          0.         -2.          0.          0.          1.
   2.          0.        ]]


#### Jaccard Distance

In [14]:
intersection = np.logical_and(m3[0,:], m3[1,:])
union = np.logical_or(m3[0,:], m3[1,:])
jd_ab = intersection.sum() / float(union.sum())

intersection = np.logical_and(m3[0,:], m3[2,:])
union = np.logical_or(m3[0,:], m3[2,:])
jd_ac = intersection.sum() / float(union.sum())

intersection = np.logical_and(m3[1,:], m3[2,:])
union = np.logical_or(m3[1,:], m3[2,:])
jd_bc = intersection.sum() / float(union.sum())

print( 'Jaccard Distances AB, AC, BC :', round(jd_ab,4), round(jd_ac,4), round(jd_bc,4) ) 

Jaccard Distances AB, AC, BC : 0.5 0.25 0.4286


#### Cosine Distance

In [15]:
cos_ab = np.dot(m3[0,:], m3[1,:]) / (np.linalg.norm(m3[0,:]) * np.linalg.norm(m3[1,:]))
cos_ac = np.dot(m3[0,:], m3[2,:]) / (np.linalg.norm(m3[0,:]) * np.linalg.norm(m3[2,:]))
cos_bc = np.dot(m3[1,:], m3[2,:]) / (np.linalg.norm(m3[1,:]) * np.linalg.norm(m3[2,:]))

print( 'Cosine Distances AB, AC, BC :', round(cos_ab,4), round(cos_ac,4), round(cos_bc,4) ) 

Cosine Distances AB, AC, BC : 0.5843 -0.1155 -0.7396


In [16]:
print( 'Cosine Angles AB, AC, BC :', 
      round(np.arccos(cos_ab),4), 
      round(np.arccos(cos_ac),4), 
      round(np.arccos(cos_bc),4) ) 

Cosine Angles AB, AC, BC : 0.9468 1.6865 2.4032


#### Pearson  Correlation

In [17]:
cor_ab = np.corrcoef(m3[0,:], m3[1,:])[0,1]
cor_ac = np.corrcoef(m3[0,:], m3[2,:])[0,1]
cor_bc = np.corrcoef(m3[1,:], m3[2,:])[0,1]

print( 'Pearson Correlation AB, AC, BC :', round(cor_ab,4), round(cor_ac,4), round(cor_bc,4) ) 

Pearson Correlation AB, AC, BC : 0.5843 -0.1155 -0.7396


#### Set Appraoch

In [18]:
intersection = set(m3[0,:]).intersection( set(m3[1,:]) )
union = set(m3[0,:]).union( set(m3[1,:]) )
ab = len(intersection)/len(union)

intersection = set(m3[0,:]).intersection( set(m3[2,:]) )
union = set(m3[0,:]).union( set(m3[1,:]) )
ac = len(intersection)/len(union)

intersection = set(m3[1,:]).intersection( set(m3[2,:]) )
union = set(m3[1,:]).union( set(m3[1,:]) )
bc = len(intersection)/len(union)

print( 'AB, AC, BC :', round(ab,4), round(ac,4), round(bc,4) ) 

AB, AC, BC : 0.8333 0.1667 0.2
