# Imports

In [1]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from itertools import chain
import numpy as np
from collections import Counter
import pandas as pd

from IPython.core.interactiveshell import InteractiveShell
# Display the value of every bare top-level expression in a cell, not only the
# last one; several cells in this notebook end with two or three such expressions.
InteractiveShell.ast_node_interactivity = "all"

# Utils

In [2]:
# sklearn TfidfVectorizer uses base e.
BASE = 'e'  # or 10 (int) for log10

def log(x, base=BASE):
    """Return the logarithm of ``x`` in the requested base.

    Parameters
    ----------
    x : scalar or array-like
        Value(s) handed directly to numpy.
    base : 'e' or 10, optional
        ``10`` selects ``np.log10``; every other value, including the
        default ``BASE``, selects the natural logarithm ``np.log``.

    Returns
    -------
    numpy scalar or ndarray
        Whatever the selected numpy function returns for ``x``.
    """
    # NOTE: `base` used to be declared as `base:BASE`, which made BASE an
    # annotation rather than a default, so the argument was mandatory.
    return np.log10(x) if base == 10 else np.log(x)

# Dataset

In [3]:
# documents = [
#     "cat sat on mat",
#     "cat cat sat sat sat on mat",
#     "dog sat on log",
#     "cat and dog are friends",
#     "mat mat mat mat cat",
# ]
# query = "cat sat mat"

# Toy corpus: each document is a string of single-space separated tokens.
documents = [
    "sweet sweet nurse love",
    "sweet sorrow",
    "how sweet is love",
    "nurse"
]
query = "sweet love"

# Distinct corpus tokens in alphabetical order.
vocabulary = sorted({token for doc in documents for token in doc.split(' ')})
# Token -> position lookup, following the order of `vocabulary`.
w2ix = {token: position for position, token in enumerate(vocabulary)}

vocabulary
w2ix

['how', 'is', 'love', 'nurse', 'sorrow', 'sweet']

{'how': 0, 'is': 1, 'love': 2, 'nurse': 3, 'sorrow': 4, 'sweet': 5}

# Bag of counts

In [4]:
# Raw term counts per document (bag of words).
vectorizer = CountVectorizer(
    lowercase=True,
    vocabulary=None,
    max_features=None,
)

# Learn the vocabulary from the corpus and count its terms in one pass ...
documents_transformed = vectorizer.fit_transform(documents).todense()
# ... then count the query's terms against that same fitted vectorizer.
query_transformed = vectorizer.transform([query]).todense()

documents_transformed
query_transformed

matrix([[0, 0, 1, 1, 0, 2],
        [0, 0, 0, 0, 1, 1],
        [1, 1, 1, 0, 0, 1],
        [0, 0, 0, 1, 0, 0]])

matrix([[0, 0, 1, 0, 0, 1]])

# TF-IDF

The basic idea is that documents containing many of the query's words are more likely to be relevant to that query.

Words that appear in many documents say little about any one of them, so the IDF factor discounts their weight.

It does not require stopwords cleanup.

# Sklearn computation

## Vectorise

In [5]:
# Parameter reference for the two options that shape the weights:
#
# --- smooth_idf param
# if smooth_idf = True: idf(t) = log( (1 + n) / (1 + df(t)) ) + 1
# if smooth_idf = False: idf(t) = log( n / df(t) ) + 1
#
# --- sublinear_tf param
# if sublinear_tf = True: tf = (1. + log(cnt)) if cnt > 0 else 0.
# if sublinear_tf = False: use raw counts
vectorizer = TfidfVectorizer(
    lowercase=True,
    vocabulary=None,
    max_features=None,
    norm=None,
    use_idf=True,
    smooth_idf=False,
    sublinear_tf=True,
)

# Fitting learns the vocabulary and the idf weights;
# the tf part is computed on the fly by every .transform call.
# (Assigned to `_` so the fitted estimator is not echoed as cell output.)
_ = vectorizer.fit(documents)

# sklearn adds 1 to every idf so that a term present in all documents
# does not end up with idf == 0 (i.e. treated as if it did not exist).
# Remove that offset to work with the plain log(n / df(t)) weights.
vectorizer.idf_ = vectorizer.idf_ - 1

# Dense [doc, words] tf-idf matrix for the corpus.
documents_transformed_sklearn = vectorizer.transform(documents).toarray()

# Flat [words] tf-idf vector for the single query.
query_transformed_sklearn = vectorizer.transform([query]).toarray().ravel()

# pprint
pd.DataFrame(documents_transformed_sklearn.T, index=vocabulary)  # [words, doc]
pd.DataFrame(query_transformed_sklearn, index=vocabulary)  # [words, doc (1)]

Unnamed: 0,0,1,2,3
how,0.0,0.0,1.386294,0.0
is,0.0,0.0,1.386294,0.0
love,0.693147,0.0,0.693147,0.0
nurse,0.693147,0.0,0.0,0.693147
sorrow,0.0,1.386294,0.0,0.0
sweet,0.487088,0.287682,0.287682,0.0


Unnamed: 0,0
how,0.0
is,0.0
love,0.693147
nurse,0.0
sorrow,0.0
sweet,0.287682


## Compute similarity

After vectorizing, document query relevance is computed using cosine similarity.

In [6]:
# Rescale the query vector to unit Euclidean length.
query_transformed_sklearn_unit = np.divide(
    query_transformed_sklearn,
    np.linalg.norm(query_transformed_sklearn),  # 2-norm is the default for a 1-D input
)
query_transformed_sklearn_unit

array([0.        , 0.        , 0.92361025, 0.        , 0.        ,
       0.38333289])

In [7]:
# Rescale every document row to unit Euclidean length; keepdims leaves the
# norms as a column so they broadcast across each row's terms.
documents_transformed_sklearn_unit = np.divide(
    documents_transformed_sklearn,
    np.linalg.norm(documents_transformed_sklearn, ord=2, axis=1, keepdims=True),
)
documents_transformed_sklearn_unit

array([[0.        , 0.        , 0.63323936, 0.63323936, 0.        ,
        0.44498969],
       [0.        , 0.        , 0.        , 0.        , 0.97913937,
        0.20318978],
       [0.66037695, 0.66037695, 0.33018848, 0.        , 0.        ,
        0.1370406 ],
       [0.        , 0.        , 0.        , 1.        , 0.        ,
        0.        ]])

In [8]:
# Dot product of unit vectors == cosine similarity: one score per document.
document_query_relevance_sklearn = np.matmul(
    documents_transformed_sklearn_unit,
    query_transformed_sklearn_unit,
)
document_query_relevance_sklearn

array([0.75544555, 0.07788932, 0.35749763, 0.        ])

## Manual computation

### Document vectorisation

In [9]:
# Helper lookup addressed as doc_word_counts[doc_index][word] -> raw count.
doc_word_counts = {
    doc_ix: Counter(text.split(' '))
    for doc_ix, text in enumerate(documents)
}
doc_word_counts

{0: Counter({'sweet': 2, 'nurse': 1, 'love': 1}),
 1: Counter({'sweet': 1, 'sorrow': 1}),
 2: Counter({'how': 1, 'sweet': 1, 'is': 1, 'love': 1}),
 3: Counter({'nurse': 1})}

In [10]:
def compute_smooth_term_frequency(cnt: int, base: "int | str" = BASE) -> float:
    """Sublinearly scaled term frequency of a raw count.

    This is equivalent to TfidfVectorizer(sublinear_tf=True).

    Parameters
    ----------
    cnt : int
        Number of occurrences of a term.
    base : int or str, optional
        Logarithm base forwarded to ``log``; defaults to ``BASE``.
        (The annotation previously said ``int`` although ``BASE`` is the
        string ``'e'``.)

    Returns
    -------
    float
        ``1 + log(cnt)`` when ``cnt`` is positive, otherwise ``0.``.
    """
    if cnt > 0:
        return 1. + log(cnt, base)
    # A term that does not occur contributes nothing (and log(0) is avoided).
    return 0.

# Raw term frequencies, laid out as [words, documents].
# A Counter answers 0 for words a document does not contain.
tf_documents = np.array(
    [
        [doc_word_counts[doc_ix][word] for doc_ix in range(len(documents))]
        for word in vocabulary
    ],
    dtype=float,
)

# Same layout, with the sublinear scaling applied to every count.
tf_documents_smoothed = np.array(
    [
        [
            compute_smooth_term_frequency(cnt=doc_word_counts[doc_ix][word])
            for doc_ix in range(len(documents))
        ]
        for word in vocabulary
    ],
    dtype=float,
)

# pprint
vocabulary
pd.DataFrame(tf_documents, index=vocabulary)
pd.DataFrame(tf_documents_smoothed, index=vocabulary)

['how', 'is', 'love', 'nurse', 'sorrow', 'sweet']

Unnamed: 0,0,1,2,3
how,0.0,0.0,1.0,0.0
is,0.0,0.0,1.0,0.0
love,1.0,0.0,1.0,0.0
nurse,1.0,0.0,0.0,1.0
sorrow,0.0,1.0,0.0,0.0
sweet,2.0,1.0,1.0,0.0


Unnamed: 0,0,1,2,3
how,0.0,0.0,1.0,0.0
is,0.0,0.0,1.0,0.0
love,1.0,0.0,1.0,0.0
nurse,1.0,0.0,0.0,1.0
sorrow,0.0,1.0,0.0,0.0
sweet,1.693147,1.0,1.0,0.0


In [11]:
n_documents = len(documents)

# Document frequency: in how many documents each word occurs at least once.
# It is never 0, because a word with no occurrence would not be in the vocabulary.
df = (tf_documents >= 1).sum(axis=1)

# Plain idf = log(n / df), kept as a column vector.
idf = log(n_documents / df, base=BASE).reshape(-1, 1)  # [words x 1]

pd.DataFrame(idf, index=vocabulary, columns=['IDF'])

Unnamed: 0,IDF
how,1.386294
is,1.386294
love,0.693147
nurse,0.693147
sorrow,1.386294
sweet,0.287682


In [12]:
# tf-idf for the corpus: the [words x 1] idf column broadcasts over the
# document columns of the sublinear tf matrix, giving [words, documents].
documents_transformed = np.multiply(tf_documents_smoothed, idf)

pd.DataFrame(documents_transformed, index=vocabulary)

Unnamed: 0,0,1,2,3
how,0.0,0.0,1.386294,0.0
is,0.0,0.0,1.386294,0.0
love,0.693147,0.0,0.693147,0.0
nurse,0.693147,0.0,0.0,0.693147
sorrow,0.0,1.386294,0.0,0.0
sweet,0.487088,0.287682,0.287682,0.0


### Query vectorisation

In [13]:
# Tokenise the query the same way as the documents (single-space split).
query_words = query.split(' ')
# Distinct query tokens; not referenced by the other cells shown in this notebook.
query_vocabulary = list(set(query_words))
query_word_counts = Counter(query_words)

# Query tf expressed over the *document* vocabulary: vocabulary words missing
# from the query count as 0, and query words outside the vocabulary are ignored.
# (The former `tf_query = [0] * len(vocabulary)` pre-allocation was overwritten
# immediately and has been dropped, together with the unused enumerate index.)
tf_query = np.array(
    [compute_smooth_term_frequency(cnt=query_word_counts[w]) for w in vocabulary]
)
tf_query

array([0., 0., 1., 0., 0., 1.])

In [14]:
# Query tf-idf: flatten the [words x 1] idf column so it lines up
# element-wise with the 1-D query tf vector.
query_transformed = np.multiply(tf_query, idf[:, 0])
query_transformed

array([0.        , 0.        , 0.69314718, 0.        , 0.        ,
       0.28768207])

In [15]:
# Rescale the manual query vector to unit Euclidean length.
query_transformed_unit = np.divide(
    query_transformed,
    np.linalg.norm(query_transformed),  # 2-norm is the default for a 1-D input
)
query_transformed_unit

array([0.        , 0.        , 0.92361025, 0.        , 0.        ,
       0.38333289])

In [16]:
# Documents are columns here ([words, documents]), so the norm is taken
# down each column (axis=0) and broadcast back across the rows.
documents_transformed_unit = np.divide(
    documents_transformed,
    np.linalg.norm(documents_transformed, ord=2, axis=0, keepdims=True),
)
documents_transformed_unit

array([[0.        , 0.        , 0.66037695, 0.        ],
       [0.        , 0.        , 0.66037695, 0.        ],
       [0.63323936, 0.        , 0.33018848, 0.        ],
       [0.63323936, 0.        , 0.        , 1.        ],
       [0.        , 0.97913937, 0.        , 0.        ],
       [0.44498969, 0.20318978, 0.1370406 , 0.        ]])

In [17]:
# Transpose to [documents, words] and project onto the unit query vector:
# one cosine-similarity score per document.
document_query_relevance = np.matmul(
    documents_transformed_unit.T,
    query_transformed_unit,
)
document_query_relevance

array([0.75544555, 0.07788932, 0.35749763, 0.        ])

## Check manual == sklearn

In [18]:
# Show both relevance vectors side by side for a visual comparison ...
document_query_relevance_sklearn
document_query_relevance

# ... and actually verify the claim of this section: the cell now fails
# if the manual computation drifts from the sklearn one.
np.testing.assert_allclose(document_query_relevance, document_query_relevance_sklearn)

array([0.75544555, 0.07788932, 0.35749763, 0.        ])

array([0.75544555, 0.07788932, 0.35749763, 0.        ])