In [1]:
import pandas as pd
import numpy as np
import json
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import random

def load(filename):
    """Read a JSON file and return the decoded Python object.

    Args:
        filename: Path of the JSON file to read.

    Returns:
        Whatever ``json.load`` produces for the file's contents.
    """
    # Decode as UTF-8 explicitly so the result does not depend on the
    # platform's default locale encoding.
    with open(filename, 'r', encoding='utf-8') as f:
        return json.load(f)

def search_tfidf(tfidf_data, article_id, query_word):
    """Collect the TF-IDF weight of ``query_word`` for every matching entry.

    Args:
        tfidf_data: Iterable of dicts, each with a ``'title'`` key and a
            ``'tfidf'`` mapping of word -> weight.
        article_id: Value compared against each entry's ``'title'``.
        query_word: Word to look up in the matching entries' ``'tfidf'`` map.

    Returns:
        A list with one weight per matching entry, in input order. A matching
        entry that lacks the word contributes ``0.0``; no match gives ``[]``.
    """
    return [
        entry['tfidf'].get(query_word, 0.0)
        for entry in tfidf_data
        if entry['title'] == article_id
    ]

def create_dataframe(articles):
    """Wrap ``articles`` in a pandas DataFrame and return it.

    Args:
        articles: Any input accepted by the ``pd.DataFrame`` constructor.

    Returns:
        The newly constructed DataFrame.
    """
    return pd.DataFrame(articles)


def bm25_score(query_tfidf, doc_tfidf, k1=1.2, b=0.75):
    """Score each document row against a query with the BM25 saturation formula.

    For every term weight ``w`` in a document of length ``dl`` (the row sum),
    the contribution is::

        q * (k1 + 1) * w / (w + k1 * (1 - b + b * dl / avgdl))

    where ``q`` is the query's weight for that term and ``avgdl`` is the mean
    row sum. The contributions are summed per document.

    Args:
        query_tfidf: Query weights, broadcastable against the rows of
            ``doc_tfidf`` (one weight per vocabulary term).
        doc_tfidf: 2-D array with one row per document and one column per term.
        k1: Term-weight saturation parameter.
        b: Strength of document-length normalisation (0 disables it).

    Returns:
        1-D array with one score per document row. All zeros when the
        matrix has no rows or every document has zero length.
    """
    doc_tfidf = np.asarray(doc_tfidf, dtype=float)
    query_tfidf = np.asarray(query_tfidf, dtype=float)

    doc_lengths = np.sum(doc_tfidf, axis=1)
    if doc_lengths.size == 0:
        return np.zeros(0)

    avg_doc_length = np.mean(doc_lengths)
    if avg_doc_length == 0:
        # Every document is empty: nothing can match, and dividing by the
        # average length would only produce NaNs.
        return np.zeros(doc_lengths.shape[0])

    # k1 must scale the whole length-normalisation term, not just (1 - b).
    length_norm = k1 * (1 - b + b * doc_lengths[:, np.newaxis] / avg_doc_length)
    denom = doc_tfidf + length_norm
    scores = np.sum(query_tfidf * (k1 + 1) * doc_tfidf / denom, axis=1)

    return scores

def vsm_search(query, tfidf_matrix, vectorizer, df):
    """Rank every row of ``df`` by cosine similarity between it and ``query``.

    Args:
        query: Query text, transformed with ``vectorizer``.
        tfidf_matrix: Document matrix compared against the query vector.
        vectorizer: Fitted object exposing ``transform``.
        df: DataFrame whose ``'title'`` column labels the rows of
            ``tfidf_matrix`` by position.

    Returns:
        A list of ``(title, similarity)`` tuples for all rows, highest
        similarity first.
    """
    # Project the query into the vectorizer's term space as a single row.
    query_row = vectorizer.transform([query]).toarray().reshape(1, -1)
    similarities = cosine_similarity(query_row, tfidf_matrix).flatten()

    # Order row positions by descending similarity.
    ranked_positions = sorted(
        range(len(similarities)),
        key=lambda pos: similarities[pos],
        reverse=True,
    )

    return [(df.iloc[pos]['title'], similarities[pos]) for pos in ranked_positions]

def split_and_search(df, index, tfidf_data, query, search_type, num_splits):
    """Search length-sorted chunks of ``df`` separately and print merged results.

    The documents are ordered by the character length of their ``'content'``,
    divided into at most ``num_splits`` contiguous chunks of near-equal size,
    and ``search`` is run on each chunk. All returned ``(title, score)``
    tuples are then merged, sorted by descending score, and printed.

    Args:
        df: DataFrame with a ``'content'`` column. It is not modified.
        index: Passed through to ``search``.
        tfidf_data: Passed through to ``search``.
        query: Query string passed through to ``search``.
        search_type: Search mode passed through to ``search``.
        num_splits: Maximum number of chunks; must be at least 1.

    Raises:
        ValueError: If ``num_splits`` is smaller than 1.
    """
    if num_splits < 1:
        raise ValueError(f"num_splits must be at least 1, got {num_splits}")

    # assign() returns a new frame, so the caller's DataFrame does not
    # silently gain a 'doc_len' column.
    by_length = df.assign(doc_len=df['content'].str.len()).sort_values(by='doc_len')

    # array_split copes with row counts that do not divide evenly and with
    # fewer rows than splits, where integer division would give a zero
    # chunk size and make range() raise.
    position_groups = np.array_split(np.arange(len(by_length)), num_splits)

    # Perform search on each non-empty chunk
    all_results = []
    for positions in position_groups:
        if positions.size == 0:
            continue
        sub_df = by_length.iloc[positions]
        sub_results = search(index, tfidf_data, query, search_type, df=sub_df)
        if sub_results:
            all_results.extend(sub_results)

    # Sort results by score
    sorted_results = sorted(all_results, key=lambda x: x[1], reverse=True)

    # Print results
    for result in sorted_results:
        print(result)

def search(index, tfidf_data, query, search_type, df=None):
    """Run a boolean, BM25 or vector-space search for ``query``.

    Args:
        index: Mapping of word -> collection of article identifiers, used by
            the ``'boolean'`` mode.
        tfidf_data: Per-article TF-IDF records passed to ``search_tfidf`` to
            rank boolean matches.
        query: Query text. For ``'boolean'`` it is either a single word or
            ``"<word> <AND|OR|NOT> <word>"``; words after the third are ignored.
        search_type: ``'boolean'``, ``'bm25'`` or ``'vsm'``.
        df: DataFrame with ``'content'`` and ``'title'`` columns for the ranked
            modes. When omitted, it is built from the module-level
            ``articles`` variable, which must exist.

    Returns:
        ``None`` for ``'boolean'`` (matches are printed as ``title (score)``)
        and for unrecognised search types. For ``'bm25'``, a list of
        ``(title, score)`` tuples with positive scores, best first. For
        ``'vsm'``, whatever ``vsm_search`` returns.

    Raises:
        ValueError: If a boolean query is empty or has exactly two words.
    """
    words = query.split()

    if search_type == 'boolean':
        if len(words) == 1:
            # Single word query
            result = set(index.get(words[0], ()))
        elif len(words) >= 3:
            # Boolean query: <term> <operator> <term>
            set1 = set(index.get(words[0], ()))
            set2 = set(index.get(words[2], ()))
            operator = words[1]
            if operator == 'AND':
                result = set1 & set2
            elif operator == 'OR':
                result = set1 | set2
            elif operator == 'NOT':
                result = set1 - set2
            else:
                # Unknown operators match nothing.
                result = set()
        else:
            raise ValueError(
                "Boolean query must be a single word or '<word> AND|OR|NOT <word>'"
            )

        # NOTE: matches are ranked by the first query word only.
        # default=0.0 covers articles that are indexed but have no record
        # in tfidf_data, where max() of an empty list would raise.
        scored = [
            (article_id,
             max(search_tfidf(tfidf_data, article_id, words[0]), default=0.0))
            for article_id in result
        ]
        scored.sort(key=lambda pair: pair[1], reverse=True)
        for article_id, max_tfidf in scored:
            print(f"{article_id} ({max_tfidf})")
        return None

    if search_type not in ('bm25', 'vsm'):
        return None

    # Only the ranked modes need the TF-IDF matrix, so the costly fit is
    # skipped for boolean queries.
    if df is None:
        df = create_dataframe(articles)
    vectorizer = TfidfVectorizer()
    X = vectorizer.fit_transform(df['content'])

    if search_type == 'bm25':
        query_tfidf = vectorizer.transform([query]).toarray()[0]
        bm25_scores = bm25_score(query_tfidf, X.toarray())
        ranked = sorted(enumerate(bm25_scores), key=lambda x: x[1], reverse=True)
        return [(df.iloc[article_idx]['title'], score)
                for article_idx, score in ranked if score > 0.0]

    return vsm_search(query, X, vectorizer, df)

if __name__ == "__main__":
    # These names stay at module level: search() falls back to the global
    # `articles` when it is called without a DataFrame.
    index = load("index.json")
    tfidf_data = load("tf-idf.json")
    articles = load("articles.json")
    df = create_dataframe(articles)

    # Normalise the mode so stray whitespace or capitals still select it.
    # The query keeps its case because boolean operators are upper-case.
    search_type = input("Choose search type (boolean/bm25/vsm): ").strip().lower()
    query = input("Enter your query: ").strip()

    if search_type == 'boolean':
        search(index, tfidf_data, query, search_type, df)
    elif search_type in ('bm25', 'vsm'):
        split_and_search(df, index, tfidf_data, query, search_type, 5)
    else:
        # Report the problem instead of doing the work and printing nothing.
        print(f"Unknown search type {search_type!r}; expected boolean, bm25 or vsm.")

Choose search type (boolean/bm25/vsm):  vsm
Enter your query:  help


('comp.os.ms-windows.misc', 0.2798364226476149)
('rec.sport.hockey', 0.23446593455756115)
('comp.os.ms-windows.misc', 0.22200732059666867)
('comp.sys.ibm.pc.hardware', 0.21711728764965144)
('comp.os.ms-windows.misc', 0.2118167460599391)
('talk.politics.mideast', 0.19518348461780466)
('comp.graphics', 0.1922691684931675)
('comp.os.ms-windows.misc', 0.1917881901825437)
('comp.sys.mac.hardware', 0.18727521897210697)
('sci.space', 0.1856322876395236)
('comp.sys.ibm.pc.hardware', 0.18053612920751685)
('sci.med', 0.177803973686949)
('comp.windows.x', 0.16675686197645861)
('comp.os.ms-windows.misc', 0.16259012301532524)
('comp.sys.ibm.pc.hardware', 0.1587492063611156)
('comp.os.ms-windows.misc', 0.1571749098755522)
('comp.sys.ibm.pc.hardware', 0.15545331159378795)
('sci.med', 0.1536094072632977)
('alt.atheism', 0.15264212540292554)
('comp.windows.x', 0.15102232055698783)
('sci.electronics', 0.14697347870729358)
('sci.electronics', 0.14677434439258075)
('comp.graphics', 0.14638128456328586)
('