In [2]:
import pandas as pd
import numpy as np
import json
from sklearn.feature_extraction.text import TfidfVectorizer

def load(filename):
    """Read a JSON document from *filename* and return the parsed object.

    The file is decoded as UTF-8 explicitly rather than with the platform's
    default encoding, so non-ASCII text loads the same way on every system.

    Raises whatever ``open`` raises for an unreadable path and
    ``json.JSONDecodeError`` for malformed content.
    """
    with open(filename, 'r', encoding='utf-8') as f:
        return json.load(f)

def search_tfidf(tfidf_data, article_id, query_word):
    """Look up the stored TF-IDF weight of *query_word* for one article.

    *tfidf_data* is iterated as a sequence of mappings, each carrying a
    ``'title'`` and a ``'tfidf'`` mapping of word -> weight. The first entry
    whose title equals *article_id* is used.

    Returns 0.0 when either the article or the word is absent.
    """
    entry = next(
        (candidate for candidate in tfidf_data if candidate['title'] == article_id),
        None,
    )
    if entry is None:
        # No TF-IDF record exists for this article.
        return 0.0
    # Words missing from the article's record weigh nothing.
    return entry['tfidf'].get(query_word, 0.0)

def create_dataframe(articles):
    """Build and return a pandas DataFrame from the *articles* records."""
    return pd.DataFrame(articles)

def bm25_score(query_tfidf, doc_tfidf, k1=1.2, b=0.75):
    """Score every document against a query with a BM25-style formula.

    Args:
        query_tfidf: 1-D sequence of query term weights, one per vocabulary
            term. Only terms with a weight > 0 contribute; the weight itself
            is used as the term's IDF factor.
        doc_tfidf: 2-D array-like (documents x vocabulary) of document term
            weights, used as the term-frequency component.
        k1: Term-frequency saturation parameter.
        b: Length-normalisation strength (0 disables it, 1 applies it fully).

    Returns:
        A list with one float score per document, in input order.

    Only weighted vectors are available here, so a document's "length" is
    taken to be the sum of its term weights and the average length is the
    mean of those sums over all documents. Each query term contributes
    ``idf * (k1 + 1) * tf / (tf + k1 * (1 - b + b * dl / avgdl))``.
    """
    docs = np.asarray(doc_tfidf, dtype=float)
    if docs.size == 0:
        # No documents, or documents over an empty vocabulary: nothing to score.
        return [0.0] * len(docs)

    query = np.asarray(query_tfidf, dtype=float)
    active_terms = np.flatnonzero(query > 0)
    idf = query[active_terms]
    tf = docs[:, active_terms]

    doc_len = docs.sum(axis=1)
    avg_len = doc_len.mean()
    if avg_len > 0:
        length_ratio = doc_len / avg_len
    else:
        # Every document is empty; use a neutral ratio to avoid 0/0.
        length_ratio = np.ones_like(doc_len)
    length_norm = k1 * (1 - b + b * length_ratio)

    numerator = idf * (k1 + 1) * tf
    denominator = tf + length_norm[:, np.newaxis]
    # A zero denominator only occurs when tf is zero too; such terms add 0.
    contributions = np.divide(
        numerator,
        denominator,
        out=np.zeros_like(numerator),
        where=denominator != 0,
    )
    return contributions.sum(axis=1).tolist()

def search(index, tfidf_data, query, search_type, articles_df=None):
    """Run a boolean or BM25 search and print the ranked results.

    Args:
        index: Mapping of word -> collection of article ids containing it.
        tfidf_data: Per-article TF-IDF records, as consumed by
            ``search_tfidf``.
        query: The raw query string. For ``'boolean'`` it is either a single
            word or ``<word> AND|OR|NOT <word>``; words after the third are
            ignored.
        search_type: ``'boolean'`` or ``'bm25'``.
        articles_df: DataFrame with ``'content'`` and ``'title'`` columns used
            for BM25 ranking. When omitted, the module-level ``df`` is used.

    Boolean results are printed as ``<article id> (<tf-idf>)``, ordered by the
    TF-IDF weight of the first query word, highest first. BM25 results are
    printed as ``<rank>. <title> (<score>)`` for every article, best first.
    Malformed boolean queries, unsupported operators and unknown search types
    print a short message instead of failing.
    """
    words = query.split()

    if search_type == 'boolean':
        if len(words) == 1:
            # Single word query
            matches = index.get(words[0], [])
        elif len(words) >= 3:
            # Boolean query: <left> <operator> <right>
            left = set(index.get(words[0], ()))
            right = set(index.get(words[2], ()))
            operator = words[1]
            if operator == 'AND':
                matches = left & right
            elif operator == 'OR':
                matches = left | right
            elif operator == 'NOT':
                matches = left - right
            else:
                print(f"Unsupported boolean operator: {operator!r} (use AND, OR or NOT)")
                return
        else:
            # Zero or two words cannot be interpreted as a boolean query.
            print("Invalid boolean query: expected '<word>' or '<word> AND|OR|NOT <word>'")
            return

        # Rank the matches by how strongly they feature the first query word.
        scored = [
            (article_id, search_tfidf(tfidf_data, article_id, words[0]))
            for article_id in matches
        ]
        scored.sort(key=lambda pair: pair[1], reverse=True)
        for article_id, tfidf_value in scored:
            print(f"{article_id} ({tfidf_value})")
    elif search_type == 'bm25':
        frame = df if articles_df is None else articles_df

        # The vectorizer is fitted on the article contents for each search.
        vectorizer = TfidfVectorizer()
        doc_matrix = vectorizer.fit_transform(frame['content'])

        query_tfidf = vectorizer.transform([query]).toarray()[0]
        bm25_scores = bm25_score(query_tfidf, doc_matrix.toarray())
        ranked = sorted(enumerate(bm25_scores), key=lambda pair: pair[1], reverse=True)
        for rank, (article_idx, score) in enumerate(ranked, start=1):
            print(f"{rank}. {frame.iloc[article_idx]['title']} ({score})")
    else:
        print(f"Unknown search type: {search_type!r} (expected 'boolean' or 'bm25')")

if __name__ == "__main__":
    # Load the prebuilt inverted index, the TF-IDF records and the articles.
    index = load("index.json")
    tfidf_data = load("tf-idf.json")
    articles = load("articles.json")
    # `df` must remain a module-level name: search() reads it for BM25 ranking.
    df = create_dataframe(articles)
    print(df.columns)
    # Normalise the choice so stray whitespace or capitals still select a mode.
    search_type = input("Choose search type (boolean/bm25): ").strip().lower()
    query = input("Enter your query: ")
    search(index, tfidf_data, query, search_type)

Index(['title', 'content'], dtype='object')


Choose search type (boolean/bm25):  boolean
Enter your query:  born


Joseph Gilman (guard) (0.09429915322785601)
Robert Mitwerandu (0.07749882182874171)
Robin Roussel (0.04990485502692381)
Garnet Mackley (0.042033528855095204)
Artem Bessalov (0.0225437915661147)
Chester railway station (0.002913677369347455)
