In [35]:
import pandas as pd
import numpy as np
import json
from sklearn.feature_extraction.text import TfidfVectorizer
import random
import sys

def load(filename):
    with open(filename, 'r') as f:
        return json.load(f)

def search_tfidf(tfidf_data, article_id, query_word):
    tfidf_values = []
    for article in tfidf_data:
        if article['title'] == article_id:
            tfidf_value = article['tfidf'].get(query_word, 0.0)
            tfidf_values.append(tfidf_value)
    return tfidf_values

def create_dataframe(articles):
    df = pd.DataFrame(articles)
    return df


def bm25_score(query_tfidf, doc_tfidf, k1=1.2, b=0.75):
    scores = []
    for doc_idx, doc_vec in enumerate(doc_tfidf):
        score = 0
        for term_idx, tfidf in enumerate(query_tfidf):
            if tfidf > 0: 
                tf = doc_vec[term_idx]
                idf = query_tfidf[term_idx]
                score += idf * ((k1 + 1) * tf) / (k1 * (1 - b) + b * len(doc_vec) / np.mean(len(doc_tfidf)) + tf)
            if score > 0.01:
                scores.append(score)
                break
    return scores

def split_and_search(df, index, tfidf_data, query, search_type, num_splits):
    df_list = np.array_split(df, num_splits)
    all_results = []
    for sub_df in df_list:
        sub_results = search(index, tfidf_data, query, search_type, df=sub_df)
        if sub_results:
            all_results.extend(sub_results)
            
    sorted_results = sorted(all_results, key=lambda x: x[1], reverse=True)
    for result in all_results:
        print(result)
            

def search(index, tfidf_data, query, search_type, df=None):
    if df is None:
        df = create_dataframe(articles)
    words = query.split()
    result = set()

    if search_type == 'boolean':
        if len(words) == 1:
            # Single word query
            result.update(index.get(words[0], set()))
        else:
            # Boolean query
            set1 = set(index.get(words[0], set()))
            set2 = set(index.get(words[2], set()))
            operator = words[1]
            if operator == 'AND':
                result = set1 & set2
            elif operator == 'OR':
                result = set1 | set2
            elif operator == 'NOT':
                result = set1 - set2
    elif search_type == 'bm25':
        # Initialize the vectorizer here
        vectorizer = TfidfVectorizer()
        X = vectorizer.fit_transform(df['content'])

        query_tfidf = vectorizer.transform([query]).toarray()[0]
        bm25_scores = bm25_score(query_tfidf, X.toarray())
        result = sorted(enumerate(bm25_scores), key=lambda x: x[1], reverse=True)

    if search_type == 'boolean':
        results_with_tfidf = [(article_id, search_tfidf(tfidf_data, article_id, words[0])) for article_id in result]
        sorted_results = sorted(results_with_tfidf, key=lambda x: max(x[1]), reverse=True)

        for article_id, tfidf_values in sorted_results:
            max_tfidf = max(tfidf_values)
            print(f"{article_id} ({max_tfidf})")
    elif search_type == 'bm25':
        return [(df.iloc[article_idx]['title'], score) for rank, (article_idx, score) in enumerate(result)]

if __name__ == "__main__":
    index = load("index.json")
    tfidf_data = load("tf-idf.json")
    articles = load("articles.json")
    df = create_dataframe(articles)
    search_type = input("Choose search type (boolean/bm25): ")
    query = input("Enter your query: ")
    if search_type == 'boolean':
        search(index, tfidf_data, query, search_type, df)
        sys.exit(1)
    split_and_search(df, index, tfidf_data, query, search_type, 5)

Choose search type (boolean/bm25):  boolean
Enter your query:  line


rec.sport.hockey
rec.sport.hockey
rec.sport.hockey
rec.sport.hockey
rec.sport.hockey
rec.sport.hockey
rec.sport.hockey
rec.sport.hockey
rec.sport.hockey
rec.sport.hockey
rec.sport.hockey
rec.sport.hockey
rec.sport.hockey
rec.sport.hockey
rec.sport.hockey
rec.sport.hockey
rec.sport.hockey
rec.sport.hockey
rec.sport.hockey
rec.sport.hockey
rec.sport.hockey
rec.sport.hockey
rec.sport.hockey
rec.sport.hockey
rec.sport.hockey
rec.sport.hockey
rec.sport.hockey
rec.sport.hockey
rec.sport.hockey
rec.sport.hockey
rec.sport.hockey
rec.sport.hockey
rec.sport.hockey
rec.sport.hockey
rec.sport.hockey
rec.sport.hockey
rec.sport.hockey
rec.sport.hockey
rec.sport.hockey
rec.sport.hockey
rec.sport.hockey
rec.sport.hockey
rec.sport.hockey
rec.sport.hockey
rec.sport.hockey
rec.sport.hockey
rec.sport.hockey
rec.sport.hockey
rec.sport.hockey
rec.sport.hockey
rec.sport.hockey
rec.sport.hockey
rec.sport.hockey
rec.sport.hockey
rec.sport.hockey
rec.sport.hockey
rec.sport.hockey
rec.sport.hockey
rec.sport.hock

SystemExit: 1