In [119]:
import numpy as np
import math

In [120]:
def angular_similarity(a, b, normalize=True):
    """Return the angular similarity between two sparse abundance vectors.

    ``a`` and ``b`` map a key to a numeric abundance; a key that is missing
    from ``b`` counts as 0. The cosine of the angle between the vectors is
    converted to ``1 - 2 * acos(cosine) / pi``, so vectors pointing the same
    way score 1.0 and orthogonal vectors score 0.0.

    Args:
        a: mapping of key -> abundance.
        b: mapping of key -> abundance.
        normalize: when True (the default) both vectors are scaled to unit
            length before the dot product is taken; when False the raw dot
            product is used as the cosine (and clamped to [-1, 1]).

    Returns:
        A float in [-1.0, 1.0]. 0.0 is returned when ``normalize`` is True
        and either vector has zero length.
    """
    if normalize:
        norm_a = math.sqrt(sum(x * x for x in a.values()))
        norm_b = math.sqrt(sum(x * x for x in b.values()))

        # A zero-length vector has no direction: report "no similarity".
        if norm_a == 0.0 or norm_b == 0.0:
            return 0.0
    else:
        norm_a = 1.0
        norm_b = 1.0

    prod = 0.
    for k, abundance in a.items():
        prod += (float(abundance) / norm_a) * (b.get(k, 0) / norm_b)

    # math.acos only accepts [-1, 1]. Rounding error, or an unnormalized dot
    # product, can land outside that range on either side, so clamp both
    # ends instead of only the upper one.
    prod = max(-1.0, min(1.0, prod))

    distance = 2 * math.acos(prod) / math.pi
    return 1.0 - distance

def angular_similarity_ng(a, b, normalize=True):
    """Angular similarity of ``b`` against ``a`` with ``a`` simulated "as ng".

    ``a`` is consulted only for whether it holds a truthy value under each
    key of ``b``; its abundances are never used. For the keys that pass that
    check, the dot product is the sum of ``b``'s squared abundances and the
    squared length attributed to ``a`` is the number of such keys.

    Args:
        a: mapping of key -> abundance (used for membership only).
        b: mapping of key -> abundance.
        normalize: when True (the default) the dot product is divided by the
            two vector lengths; when False both lengths are taken as 1.0.

    Returns:
        ``1 - 2 * acos(cosine) / pi`` with the cosine capped at 1.0, or 0.0
        when ``normalize`` is True and either length is zero.
    """
    sq_len_b = sum(v * v for v in b.values())

    # Abundances of b for the keys that a also holds with a truthy value.
    shared = [count for key, count in b.items() if a.get(key)]

    # Accumulate the dot product in floating point, one key at a time.
    dot = 0.
    for count in shared:
        dot += count * count

    if not normalize:
        len_a = 1.0
        len_b = 1.0
    else:
        # a is counted as one unit per shared key.
        len_a = math.sqrt(len(shared))
        len_b = math.sqrt(sq_len_b)
        if len_a == 0.0 or len_b == 0.0:
            return 0.0

    cosine = min(1.0, dot / (len_a * len_b))
    return 1.0 - 2 * math.acos(cosine) / math.pi

In [121]:
# Two small key -> abundance maps used as a worked example: they share
# keys 40 and 50, while 99 appears only in a and 100 only in b.
a = {40: 10, 50: 5, 99: 1}
b = {40: 2, 50: 2, 100: 10}

In [122]:
# Similarity computed from the abundances of both a and b.
angular_similarity(a, b)

0.16558154864938834

In [123]:
# Same comparison, but with every abundance in b replaced by 1.
angular_similarity(a, {h:1 for h in b})

0.5610031968200676

In [124]:
# The "ng" variant on the same a and b, for comparison with the values above.
angular_similarity_ng(a, b)

0.3664329116346875

## Checking if it works: the ng similarity should never fall below the exact one

In [151]:
from hypothesis import given, strategies as st, settings, example

# Slack added to the right-hand side of the inequality asserted in test(),
# presumably to absorb floating-point noise in the two similarity values.
EPSILON = 1e-3

In [154]:
@settings(max_examples=50000)
@given(
    st.dictionaries(st.integers(min_value=1, max_value=10000), st.integers(min_value=1)),
    st.dictionaries(st.integers(min_value=1, max_value=10000), st.integers(min_value=1)),
)
@example({1: 1, 2: 2}, {1: 1, 2: 2})
@example({1: 1, 2: 1}, {1: 1, 2: 1, 3: 1})
@example({1: 1, 2: 1}, {1: 1})
def test(a, b):
    """Check that angular_similarity never exceeds angular_similarity_ng by more than EPSILON.

    Both arguments are generated dictionaries with keys in 1..10000 and
    values >= 1; three hand-picked cases are always included as well.
    """
    exact = angular_similarity(a, b)
    simulated = angular_similarity_ng(a, b)
    assert exact <= simulated + EPSILON, (exact, simulated)

In [155]:
# Run the hypothesis-driven check of angular_similarity_ng defined in the previous cell.
test()

## Fixing the `a_sq` calculation

In [156]:
def angular_similarity_ng_mk2(a, b, normalize=True):
    """Second take on the "a as ng" angular similarity.

    ``a`` is consulted only for whether it holds a truthy value under each
    key of ``b``. For the keys that pass that check, ``b``'s squared
    abundance is added both to the dot product and to the squared length
    attributed to ``a``.

    Args:
        a: mapping of key -> abundance (used for membership only).
        b: mapping of key -> abundance.
        normalize: when True (the default) the dot product is divided by the
            two vector lengths; when False both lengths are taken as 1.0.

    Returns:
        ``1 - 2 * acos(cosine) / pi`` with the cosine capped at 1.0, or 0.0
        when ``normalize`` is True and either length is zero.
    """
    sq_len_b = sum(v * v for v in b.values())

    sq_len_a = 0
    dot = 0.
    for key in b:
        if not a.get(key):
            # Key absent from a (or held with a falsy value): contributes nothing.
            continue
        squared = b[key] * b[key]
        sq_len_a += squared
        dot += squared

    if normalize:
        len_a, len_b = math.sqrt(sq_len_a), math.sqrt(sq_len_b)
        if len_a == 0.0 or len_b == 0.0:
            return 0.0
    else:
        len_a, len_b = 1.0, 1.0

    cosine = min(1.0, dot / (len_a * len_b))
    angle_fraction = 2 * math.acos(cosine) / math.pi
    return 1.0 - angle_fraction

In [157]:
@settings(max_examples=50000)
@given(
    st.dictionaries(st.integers(min_value=1, max_value=10000), st.integers(min_value=1)),
    st.dictionaries(st.integers(min_value=1, max_value=10000), st.integers(min_value=1)),
)
@example({1: 1, 2: 2}, {1: 1, 2: 2})
@example({1: 1, 2: 1}, {1: 1, 2: 1, 3: 1})
@example({1: 1, 2: 1}, {1: 1})
def test(a, b):
    """Check that angular_similarity never exceeds angular_similarity_ng_mk2 by more than EPSILON.

    Both arguments are generated dictionaries with keys in 1..10000 and
    values >= 1; three hand-picked cases are always included as well.
    """
    exact = angular_similarity(a, b)
    simulated = angular_similarity_ng_mk2(a, b)
    assert exact <= simulated + EPSILON, (exact, simulated)

In [158]:
# Run the hypothesis-driven check of angular_similarity_ng_mk2 defined in the previous cell.
test()