In [1]:
import math
from collections import defaultdict

from fts import FullTextSearch, DOCUMENTS

In [None]:
# Build the search index from scratch: every document is stored, tokenized,
# measured, and registered in the inverted index (term -> [(doc_id, count), ...]).
fts = FullTextSearch()
total_length = 0  # running sum of token counts across all documents

for doc in DOCUMENTS:
    doc_id = doc["id"]
    fts.documents[doc_id] = doc

    # Title and content are indexed together as one token stream
    tokens = fts.tokenize(f"{doc['title']} {doc['content']}")

    # A document's length is its token count
    n_tokens = len(tokens)
    fts.doc_lengths[doc_id] = n_tokens
    total_length += n_tokens

    # Tally how many times each term occurs in this document
    term_counts = defaultdict(int)
    for token in tokens:
        term_counts[token] += 1

    # Record one posting per distinct term of this document
    for term, count in term_counts.items():
        fts.inverted_index[term].append((doc_id, count))

fts.total_doc_count = len(DOCUMENTS)

# Show the finished index (uncomment a line below to inspect a single term)
fts.inverted_index
# print(fts.inverted_index['intelligence'])
# print(fts.inverted_index['python'])

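Background reading on TF-IDF (term frequency–inverse document frequency), the scoring scheme used in the next cell: https://en.wikipedia.org/wiki/Tf%E2%80%93idf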

In [5]:
# query: intelligence
# tfidf(word, doc) = tf(word, doc) * idf(word)
#   tf  = occurrences of the word in the doc / number of tokens in the doc
#   idf = log((N + 1) / (df + 1)) + 1, where N is the total number of documents
#         and df is the number of documents containing the word

query = "intelligence"
query_words = fts.tokenize(query)

# Accumulated TF-IDF score per document, and the query words that hit each document
doc_scores = defaultdict(float)
matched_words = defaultdict(set)

for word in query_words:
    # Membership test first, so unknown words are skipped without touching the index
    if word not in fts.inverted_index:
        continue

    # Posting list for this word: one (doc_id, freq) pair per matching document
    docs_with_word = fts.inverted_index[word]
    docs_count = len(docs_with_word)

    # Calculate Inverse Document Frequency (IDF) once per word: it does not
    # depend on the individual document, so it is hoisted out of the posting loop.
    # The +1 in numerator and denominator avoids division by zero; the trailing
    # +1 keeps a word that occurs in every document from scoring zero.
    idf = math.log((fts.total_doc_count + 1) / (docs_count + 1)) + 1

    for doc_id, freq in docs_with_word:
        # Calculate Term Frequency (TF), normalized by document length so that
        # long documents are not favoured just for being long
        doc_len = fts.doc_lengths[doc_id]
        tf = freq / doc_len if doc_len > 0 else 0

        # Calculate TF-IDF score
        tfidf_score = tf * idf

        # Add to document score; a document matching several query words sums them
        doc_scores[doc_id] += tfidf_score
        matched_words[doc_id].add(word)

doc_scores
# matched_words

defaultdict(float,
            {'1': 0.01639344262295082,
             '2': 0.013333333333333334,
             '3': 0.013793103448275862,
             '4': 0.07547169811320754,
             '5': 0.109375,
             '6': 0.1276595744680851,
             '7': 0.07142857142857142,
             '8': 0.05,
             '9': 0.024390243902439025,
             '10': 0.03571428571428571})

In [6]:
# Turn the raw scores into a ranked list of result records, best match first
ranked = sorted(doc_scores.items(), key=lambda pair: pair[1], reverse=True)

results = []
for doc_id, score in ranked:
    doc = fts.documents[doc_id]
    content = doc["content"]

    # Bodies longer than 100 characters are cut to their first 100 characters
    # and marked with an ellipsis; shorter ones are shown whole
    if len(content) > 100:
        snippet = content[:100] + "..."
    else:
        snippet = content

    results.append(
        {
            "id": doc_id,
            "title": doc["title"],
            "snippet": snippet,
            "score": round(score, 3),
        }
    )

results

[{'id': '6',
  'title': 'Artificial Intelligence Applications',
  'snippet': 'Artificial intelligence is revolutionizing multiple industries. Healthcare uses AI for diagnosis and...',
  'score': 0.128},
 {'id': '5',
  'title': 'Advanced Computing Systems',
  'snippet': 'Artificial intelligence is transforming how systems learn and make decisions. Artificial intelligenc...',
  'score': 0.109},
 {'id': '4',
  'title': 'Artificial Intelligence Revolution in Search',
  'snippet': 'Modern computing systems are becoming increasingly sophisticated. Machine learning models can recogn...',
  'score': 0.075},
 {'id': '7',
  'title': 'Using AI for Text Search',
  'snippet': 'Artificial intelligence techniques significantly improve full text search capabilities. Modern searc...',
  'score': 0.071},
 {'id': '8',
  'title': 'Intelligent Information Retrieval',
  'snippet': 'Advanced search systems now incorporate intelligence for better results. Some elements of full text ...',
  'score': 0.05},
 {'