# Efficient Pairwise Cosine Similarity

In [1]:
# Switch the working directory to the project checkout before the imports run.
# NOTE(review): this is a hard-coded, machine-specific path (it looks like a mounted
# Google Drive folder, to be confirmed); adjust it before running the notebook elsewhere.
%cd ~/drive/gdrive/projects/effcossim

In [2]:
from numpy import array
from time import time
from scipy.sparse import random
from effcossim.pcs import pairwise_cosine_similarity, pp_pcs

## Sample code: efficient vs non-efficient

In [3]:
# Small dense fixtures for the sample run: A holds 3 row vectors and B holds 4,
# each with 3 components.
A = array([[1, 2, 3], [0, 1, 2], [5, 1, 1]])
B = array([[1, 1, 2], [0, 1, 2], [5, 0, 1], [0, 0, 4]])

In [4]:
# Reference run on the small fixtures: efficient=False, with the result returned
# densely so the whole (rows of A) x (rows of B) matrix can be printed.
M1 = pairwise_cosine_similarity(A=A, B=B, efficient=False, dense_output=True)
print(M1)
del M1  # only the printed copy is needed

[[0.98198051 0.95618289 0.41931393 0.80178373]
 [0.91287093 1.         0.1754116  0.89442719]
 [0.62853936 0.25819889 0.98130676 0.19245009]]


In [5]:
# Same fixtures through the efficient=True path. With n_top=4 and lower_bound=0.0
# the printed matrix matches the efficient=False result entry for entry.
M2 = pairwise_cosine_similarity(
    A=A, B=B, efficient=True, dense_output=True,
    n_top=4, lower_bound=0.0, n_jobs=1,
)
print(M2)
del M2  # only the printed copy is needed

[[0.98198051 0.95618289 0.41931393 0.80178373]
 [0.91287093 1.         0.1754116  0.89442719]
 [0.62853936 0.25819889 0.98130676 0.19245009]]


## Testing efficient vs non-efficient

In [6]:
%%time

# Benchmark inputs: two 10000 x 5000 sparse matrices in CSR format with density 0.3,
# built with scipy.sparse.random (imported above as `random`).
# NOTE(review): A and B are created with identical arguments, including
# random_state=1102, so B is presumably an exact copy of A and the timing cells
# below effectively compare A with itself. Confirm this is intended; if two
# independent matrices are wanted, give B a different seed.
A = random(
    m=10000,       
    n=5000, 
    density=0.3, 
    format='csr', 
    random_state=1102
)
B = random(
    m=10000, 
    n=5000, 
    density=0.3, 
    format='csr', 
    random_state=1102
)

CPU times: user 7.62 s, sys: 313 ms, total: 7.93 s
Wall time: 7.92 s


In [7]:
%%time

# Baseline timing on the large sparse inputs: efficient=False, non-dense output.
M1 = pairwise_cosine_similarity(A=A, B=B, efficient=False, dense_output=False)

# Report the byte size of the result's `.data` array, then drop the result
# before the next timing cell.
print("Output size:", M1.data.nbytes)
del M1

Output size: 800000000
CPU times: user 2min 59s, sys: 253 ms, total: 2min 59s
Wall time: 3min


In [8]:
%%time

# Efficient path, n_jobs=1. The timing cells that follow repeat this exact call
# and change only n_jobs.
M2 = pairwise_cosine_similarity(
    A=A, B=B, efficient=True, dense_output=False,
    n_top=10, lower_bound=0.5, n_jobs=1,
)

# Report the byte size of the result's `.data` array, then drop the result.
print("Output size:", M2.data.nbytes)
del M2

Output size: 80000
CPU times: user 2min 29s, sys: 261 ms, total: 2min 30s
Wall time: 2min 29s


In [9]:
%%time

# Efficient path again, identical settings except n_jobs=2.
M2 = pairwise_cosine_similarity(
    A=A, B=B, efficient=True, dense_output=False,
    n_top=10, lower_bound=0.5, n_jobs=2,
)

# Report the byte size of the result's `.data` array, then drop the result.
print("Output size:", M2.data.nbytes)
del M2

Output size: 80000
CPU times: user 2min 16s, sys: 132 ms, total: 2min 16s
Wall time: 1min 8s


In [10]:
%%time

# Efficient path again, identical settings except n_jobs=4.
M2 = pairwise_cosine_similarity(
    A=A, B=B, efficient=True, dense_output=False,
    n_top=10, lower_bound=0.5, n_jobs=4,
)

# Report the byte size of the result's `.data` array, then drop the result.
print("Output size:", M2.data.nbytes)
del M2

Output size: 80000
CPU times: user 3min 38s, sys: 196 ms, total: 3min 38s
Wall time: 55.5 s


In [11]:
%%time

# Efficient path again, identical settings except n_jobs=6.
M2 = pairwise_cosine_similarity(
    A=A, B=B, efficient=True, dense_output=False,
    n_top=10, lower_bound=0.5, n_jobs=6,
)

# Report the byte size of the result's `.data` array, then drop the result.
print("Output size:", M2.data.nbytes)
del M2

Output size: 80000
CPU times: user 5min 29s, sys: 302 ms, total: 5min 29s
Wall time: 57.9 s


In [12]:
%%time

# Efficient path again, identical settings except n_jobs=8.
M2 = pairwise_cosine_similarity(
    A=A, B=B, efficient=True, dense_output=False,
    n_top=10, lower_bound=0.5, n_jobs=8,
)

# Report the byte size of the result's `.data` array, then drop the result.
print("Output size:", M2.data.nbytes)
del M2

Output size: 80000
CPU times: user 6min 54s, sys: 670 ms, total: 6min 55s
Wall time: 58.1 s


In [13]:
%%time

# Efficient path again, identical settings except n_jobs=10.
M2 = pairwise_cosine_similarity(
    A=A, B=B, efficient=True, dense_output=False,
    n_top=10, lower_bound=0.5, n_jobs=10,
)

# Report the byte size of the result's `.data` array, then drop the result.
print("Output size:", M2.data.nbytes)
del M2

Output size: 80000
CPU times: user 6min 56s, sys: 820 ms, total: 6min 57s
Wall time: 59.4 s


## Parallel run for lists of matrices

In [14]:
# Two lists of six 10000 x 1000 sparse random matrices (density 0.3) used as the
# inputs of the parallel run below.
# Every matrix gets its own explicit seed so that a fresh "Restart & Run All"
# regenerates exactly the same data (the original calls were unseeded). The seed
# ranges of the two lists do not overlap, so no matrix in l1 is a copy of one in l2.
l1 = [random(m=10000, n=1000, density=0.3, random_state=seed) for seed in range(6)]
l2 = [random(m=10000, n=1000, density=0.3, random_state=seed) for seed in range(6, 12)]

In [15]:
# Run pp_pcs over the two matrix lists with the efficient path enabled and
# non-dense output; n_workers and n_jobs are both set to 2.
L = pp_pcs(
    l1=l1, l2=l2, n_workers=2,
    efficient=True, dense_output=False,
    n_top=10, lower_bound=0.3, n_jobs=2,
)

In [16]:
# Display the list returned by pp_pcs: six 10000x10000 sparse results in the saved
# output (presumably one per (l1[i], l2[i]) pair; confirm against pp_pcs).
L

[<10000x10000 sparse matrix of type '<class 'numpy.float64'>'
 	with 93661 stored elements in Compressed Sparse Row format>,
 <10000x10000 sparse matrix of type '<class 'numpy.float64'>'
 	with 93402 stored elements in Compressed Sparse Row format>,
 <10000x10000 sparse matrix of type '<class 'numpy.float64'>'
 	with 93859 stored elements in Compressed Sparse Row format>,
 <10000x10000 sparse matrix of type '<class 'numpy.float64'>'
 	with 93908 stored elements in Compressed Sparse Row format>,
 <10000x10000 sparse matrix of type '<class 'numpy.float64'>'
 	with 93621 stored elements in Compressed Sparse Row format>,
 <10000x10000 sparse matrix of type '<class 'numpy.float64'>'
 	with 93906 stored elements in Compressed Sparse Row format>]