# Efficient Pairwise Cosine Similarity

In [1]:
# Switch the kernel's working directory to the author's local project folder,
# presumably so that `effcossim` can be imported below.
# NOTE(review): this path only exists on the author's machine — adjust before running.
%cd ~/drive/gdrive/projects/effcossim

/home/ngshya/drive/gdrive/projects/effcossim


In [2]:
from numpy import array
from time import time
from scipy.sparse import random
from effcossim.pcs import pairwise_cosine_similarity, pp_pcs

## Sample code: efficient vs non-efficient

In [3]:
# Toy inputs for the side-by-side example: A has 3 rows, B has 4 rows,
# and both have 3 columns.
A = array([(1, 2, 3), (0, 1, 2), (5, 1, 1)])
B = array([(1, 1, 2), (0, 1, 2), (5, 0, 1), (0, 0, 4)])

In [4]:
# Baseline on the toy inputs: efficient=False, result requested as a dense array.
M1 = pairwise_cosine_similarity(A=A, B=B, efficient=False, dense_output=True)
print(M1)
del M1  # drop the result once it has been printed

[[0.98198051 0.95618289 0.41931393 0.80178373]
 [0.91287093 1.         0.1754116  0.89442719]
 [0.62853936 0.25819889 0.98130676 0.19245009]]


In [5]:
# Same toy inputs through the efficient=True path (n_top=4, lower_bound=0.0,
# one job). The recorded output is identical to the baseline matrix above.
M2 = pairwise_cosine_similarity(
    A=A, B=B,
    efficient=True, n_top=4, lower_bound=0.0,
    n_jobs=1, dense_output=True,
)
print(M2)
del M2  # drop the result once it has been printed

[[0.98198051 0.95618289 0.41931393 0.80178373]
 [0.91287093 1.         0.1754116  0.89442719]
 [0.62853936 0.25819889 0.98130676 0.19245009]]


## Testing efficient vs non-efficient

In [6]:
%%time

# Benchmark inputs: two 10000 x 5000 sparse matrices at 30% density, built
# directly in CSR format with a fixed seed.
A = random(
    m=10000,       
    n=5000, 
    density=0.3, 
    format='csr', 
    random_state=1102
)
# NOTE(review): B is created with exactly the same arguments and the same
# random_state as A, so it should come out as an identical copy of A rather
# than an independent sample — confirm this is intended. The 80000-byte
# results recorded further down equal one float64 per row of A, which would
# fit an outcome where each row only matches itself.
B = random(
    m=10000, 
    n=5000, 
    density=0.3, 
    format='csr', 
    random_state=1102
)

CPU times: user 6.37 s, sys: 236 ms, total: 6.6 s
Wall time: 6.59 s


In [7]:
%%time

# Baseline on the large inputs: efficient=False, output left sparse.
M1 = pairwise_cosine_similarity(A=A, B=B, efficient=False, dense_output=False)
print("Output size:", M1.data.nbytes)  # bytes used by the stored values
del M1  # release the result before the next timed run

Output size: 800000000
CPU times: user 2min 27s, sys: 208 ms, total: 2min 27s
Wall time: 2min 27s


In [8]:
%%time

# Efficient path on the large inputs, n_jobs=1.
M2 = pairwise_cosine_similarity(
    A=A, B=B,
    efficient=True, n_top=10, lower_bound=0.5,
    n_jobs=1, dense_output=False,
)
print("Output size:", M2.data.nbytes)  # bytes used by the stored values
del M2  # release the result before the next timed run

Output size: 80000
CPU times: user 1min 58s, sys: 173 ms, total: 1min 58s
Wall time: 1min 58s


In [9]:
%%time

# Efficient path on the large inputs, n_jobs=2.
M2 = pairwise_cosine_similarity(
    A=A, B=B,
    efficient=True, n_top=10, lower_bound=0.5,
    n_jobs=2, dense_output=False,
)
print("Output size:", M2.data.nbytes)  # bytes used by the stored values
del M2  # release the result before the next timed run

Output size: 80000
CPU times: user 2min 43s, sys: 184 ms, total: 2min 43s
Wall time: 1min 22s


In [10]:
%%time

# Efficient path on the large inputs, n_jobs=4.
M2 = pairwise_cosine_similarity(
    A=A, B=B,
    efficient=True, n_top=10, lower_bound=0.5,
    n_jobs=4, dense_output=False,
)
print("Output size:", M2.data.nbytes)  # bytes used by the stored values
del M2  # release the result before the next timed run

Output size: 80000
CPU times: user 3min 47s, sys: 238 ms, total: 3min 48s
Wall time: 59 s


In [11]:
%%time

# Efficient path on the large inputs, n_jobs=6.
M2 = pairwise_cosine_similarity(
    A=A, B=B,
    efficient=True, n_top=10, lower_bound=0.5,
    n_jobs=6, dense_output=False,
)
print("Output size:", M2.data.nbytes)  # bytes used by the stored values
del M2  # release the result before the next timed run

Output size: 80000
CPU times: user 5min 40s, sys: 430 ms, total: 5min 41s
Wall time: 1min 8s


In [12]:
%%time

# Efficient path on the large inputs, n_jobs=8.
M2 = pairwise_cosine_similarity(
    A=A, B=B,
    efficient=True, n_top=10, lower_bound=0.5,
    n_jobs=8, dense_output=False,
)
print("Output size:", M2.data.nbytes)  # bytes used by the stored values
del M2  # release the result before the next timed run

Output size: 80000
CPU times: user 6min 12s, sys: 593 ms, total: 6min 13s
Wall time: 52.7 s


In [13]:
%%time

# Efficient path on the large inputs, n_jobs=10.
M2 = pairwise_cosine_similarity(
    A=A, B=B,
    efficient=True, n_top=10, lower_bound=0.5,
    n_jobs=10, dense_output=False,
)
print("Output size:", M2.data.nbytes)  # bytes used by the stored values
del M2  # release the result now that its size has been reported

Output size: 80000
CPU times: user 6min 16s, sys: 767 ms, total: 6min 16s
Wall time: 1min 2s


## Parallel run for lists of matrices

In [14]:
# Inputs for the parallel run: two lists of six 10000 x 1000 sparse matrices
# at 30% density. Every matrix is drawn with its own fixed seed, so the cell
# gives the same data on a fresh kernel and no two matrices are identical.
l1 = [
    random(m=10000, n=1000, density=0.3, random_state=1102 + i)
    for i in range(6)
]
l2 = [
    random(m=10000, n=1000, density=0.3, random_state=2204 + i)
    for i in range(6)
]

In [15]:
# Run pp_pcs over the two lists with 2 workers, using the efficient settings
# (n_top=10, lower_bound=0.3, n_jobs=2) and sparse output.
# Presumably l1[i] is compared with l2[i]; the result displayed below holds
# six 10000x10000 matrices — verify against the pp_pcs documentation.
L = pp_pcs(
    l1=l1, l2=l2, n_workers=2,
    efficient=True, n_top=10, lower_bound=0.3,
    n_jobs=2, dense_output=False,
)

In [16]:
# Display the list returned by pp_pcs.
L

[<10000x10000 sparse matrix of type '<class 'numpy.float64'>'
 	with 93943 stored elements in Compressed Sparse Row format>,
 <10000x10000 sparse matrix of type '<class 'numpy.float64'>'
 	with 93890 stored elements in Compressed Sparse Row format>,
 <10000x10000 sparse matrix of type '<class 'numpy.float64'>'
 	with 93541 stored elements in Compressed Sparse Row format>,
 <10000x10000 sparse matrix of type '<class 'numpy.float64'>'
 	with 93726 stored elements in Compressed Sparse Row format>,
 <10000x10000 sparse matrix of type '<class 'numpy.float64'>'
 	with 93642 stored elements in Compressed Sparse Row format>,
 <10000x10000 sparse matrix of type '<class 'numpy.float64'>'
 	with 93921 stored elements in Compressed Sparse Row format>]