In [101]:
import numpy as np
import math
from collections import Counter

In [102]:
# Toy corpus: every document is a NumPy array of whitespace-separated tokens.
doc1, doc2 = (
    np.array(text.split())
    for text in (
        "hello this is a cat cat is not dog",
        "hello this is a good car this is a fast car",
    )
)
collection = [doc1, doc2]

# Flat token stream over the whole collection (doc1's tokens, then doc2's).
words = np.concatenate(collection)

print("=== WORDS ===\n", words)

=== WORDS ===
 ['hello' 'this' 'is' 'a' 'cat' 'cat' 'is' 'not' 'dog' 'hello' 'this' 'is'
 'a' 'good' 'car' 'this' 'is' 'a' 'fast' 'car']


In [103]:
# Collection-wide occurrence count of every distinct term in `words`.
# Keys are inserted in first-seen order.
terms_dict = Counter()
terms_dict.update(words)

print("=== TERMS ===\n", terms_dict)

=== TERMS ===
 Counter({'is': 4, 'this': 3, 'a': 3, 'hello': 2, 'cat': 2, 'car': 2, 'not': 1, 'dog': 1, 'good': 1, 'fast': 1})


In [104]:
def calc_tf(term, document):
    """Return the raw term frequency: how many tokens of `document` equal `term`.

    Parameters
    ----------
    term : str
        The token to count.
    document : array-like of str
        Tokens of one document. A NumPy array, list or tuple is accepted;
        it is converted with ``np.asarray`` so the comparison below is always
        element-wise (comparing a plain list to a string yields a single bool).

    Returns
    -------
    int
        Number of occurrences of `term` in `document`.
    """
    tokens = np.asarray(document)
    # Vectorised count instead of iterating the boolean mask with builtin sum().
    return int(np.count_nonzero(tokens == term))

In [105]:
# Term frequency of 'cat' in doc1; doc1 contains the token twice.
calc_tf('cat', doc1)

2

In [106]:
def calc_df(term):
    """Return the document frequency of `term`.

    Counts how many documents of the module-level `collection` contain
    `term` at least once.
    """
    return sum(1 for doc in collection if term in doc)


In [107]:
def calc_idf(term):
    """Return the inverse document frequency ``log(N / df)`` of `term`.

    N is the number of documents in the module-level `collection` and df is
    the value of ``calc_df(term)``. A term that occurs in no document
    (df == 0) gets 0 instead of a division by zero.
    """
    doc_freq = calc_df(term)
    if not doc_freq:
        return 0

    total_docs = len(collection)
    return math.log(total_docs / doc_freq)

In [108]:
# 'dog' occurs in one of the two documents, so idf = log(2 / 1).
calc_idf('dog')

0.6931471805599453

In [109]:
def calc_weight(term, document):
    """Return the TF-IDF weight of `term` in `document`.

    The weight is the product of ``calc_tf(term, document)`` and
    ``calc_idf(term)``.
    """
    term_frequency = calc_tf(term, document)
    inverse_doc_frequency = calc_idf(term)
    return term_frequency * inverse_doc_frequency

In [110]:
# TF-IDF weight of 'cat' in doc1: tf = 2, idf = log(2 / 1).
calc_weight("cat", doc1)

1.3862943611198906

In [131]:
def calc_score(query, document):
    """Score `document` against `query` by summing TF-IDF weights.

    `query` is split on whitespace and ``calc_weight(term, document)`` is
    added up over every resulting term; a term repeated in the query is
    counted once per repetition.
    """
    query_terms = query.split()
    return sum(calc_weight(term, document) for term in query_terms)

In [132]:
# 'vs' occurs in no document, so its idf (and weight) is 0; the score is the
# weight of 'cat' plus the weight of 'dog' in doc1.
calc_score("cat vs dog", doc1)

2.0794415416798357

In [113]:
def generate_vector(document) -> np.ndarray:
    """Return the TF-IDF vector of `document` over the collection vocabulary.

    Component i is ``calc_weight(term_i, document)``, where ``term_i`` is the
    i-th key of the module-level `terms_dict` in its iteration order. Tokens of
    `document` that are not keys of `terms_dict` do not contribute.

    Returns
    -------
    np.ndarray
        1-D float array of length ``len(terms_dict)``.
    """
    # dtype=float keeps the result a float array even when terms_dict is empty
    # or every weight is an integer 0.
    return np.array(
        [calc_weight(term, document) for term in terms_dict],
        dtype=float,
    )


In [117]:
# TF-IDF vector of doc1; components follow the iteration order of terms_dict.
generate_vector(doc1)

array([0.        , 0.        , 0.        , 0.        , 1.38629436,
       0.69314718, 0.69314718, 0.        , 0.        , 0.        ])

In [120]:
def similatrity(document1, document2):
    """Return the dot product of the TF-IDF vectors of two documents.

    The vectors come from ``generate_vector`` and are NOT length-normalised,
    so the result is an unnormalised similarity, not a cosine.

    The function name is misspelled; it is kept so existing calls continue to
    work, and the correctly spelled alias `similarity` is defined below.
    """
    first_vector = generate_vector(document1)
    second_vector = generate_vector(document2)
    return first_vector.dot(second_vector)


# Correctly spelled alias; prefer this name in new code.
similarity = similatrity

In [121]:
# With only two documents every term is either shared by both (idf = log(2/2) = 0)
# or missing from one of them (tf = 0), so the dot product is 0.
similatrity(doc1, doc2)

0.0

In [133]:
def normalize(vector: np.ndarray) -> np.ndarray:
    """Scale `vector` to unit Euclidean length.

    Parameters
    ----------
    vector : np.ndarray
        Vector to normalise (any array-like is accepted).

    Returns
    -------
    np.ndarray
        ``vector / ||vector||``. A vector whose norm is 0 has no direction, so
        a float zero vector of the same shape is returned instead of dividing
        by zero (which would yield an all-NaN array and a RuntimeWarning).
    """
    vector = np.asarray(vector)
    length = np.linalg.norm(vector)
    if length == 0:
        return np.zeros_like(vector, dtype=float)
    return vector / length

In [127]:
# Unit-length version of doc1's TF-IDF vector.
normalize(generate_vector(doc1))

array([0.        , 0.        , 0.        , 0.        , 0.81649658,
       0.40824829, 0.40824829, 0.        , 0.        , 0.        ])

In [130]:
# Sanity check: the normalised vector's Euclidean norm should be 1 up to
# floating-point rounding.
np.linalg.norm(normalize(generate_vector(doc1)))

0.9999999999999999

In [140]:
def calc_score2(query, document):
    """Return the cosine similarity between `query` and `document`.

    `query` is split on whitespace and treated as a small document; both it
    and `document` are turned into TF-IDF vectors with ``generate_vector``,
    normalised, and combined with a dot product.

    Returns 0.0 when either vector is all zeros (for example a query made only
    of terms whose weight is 0), because the cosine is undefined there and the
    division by a zero norm would otherwise produce NaN.
    """
    # dtype=str keeps an empty query a string array rather than a float array.
    query_terms = np.array(query.split(), dtype=str)
    query_vector = generate_vector(query_terms)
    document_vector = generate_vector(document)

    if not query_vector.any() or not document_vector.any():
        return 0.0

    return normalize(query_vector).dot(normalize(document_vector))

In [143]:
# 'vs' is not a key of terms_dict, so it does not contribute to the query vector;
# the score is the cosine between the {cat, dog} query vector and doc1's vector.
calc_score2("cat vs dog", doc1)

0.8660254037844388