In [13]:
import pandas as pd
import numpy as np
import json
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import random

def load(filename):
    with open(filename, 'r') as f:
        return json.load(f)

def search_tfidf(tfidf_data, article_id, query_word):
    tfidf_values = []
    for article in tfidf_data:
        if article['title'] == article_id:
            tfidf_value = article['tfidf'].get(query_word, 0.0)
            tfidf_values.append(tfidf_value)
    return tfidf_values

def create_dataframe(articles):
    df = pd.DataFrame(articles)
    return df


def bm25_score(query_tfidf, doc_tfidf, k1=1.2, b=0.75):
    doc_lengths = np.sum(doc_tfidf, axis=1)
    avg_doc_length = np.mean(doc_lengths)

    # Vectorized BM25 calculation
    denom = k1 * (1 - b) + b * np.expand_dims(doc_lengths, axis=1) / avg_doc_length + doc_tfidf
    scores = np.sum(query_tfidf * (k1 + 1) * doc_tfidf / denom, axis=1)

    return scores

def vsm_search(query, tfidf_matrix, vectorizer, df):
  # Transform the query using the existing vectorizer
  query_vector = vectorizer.transform([query]).toarray().squeeze()

  # Calculate cosine similarity
  cosine_similarities = cosine_similarity(query_vector.reshape(1, -1), tfidf_matrix).flatten()

  # Sort results
  results = sorted(enumerate(cosine_similarities), key=lambda x: x[1], reverse=True)

  return [(df.iloc[idx]['title'], score) for idx, score in results]

def split_and_search(df, index, tfidf_data, query, search_type, num_splits):
    df_list = np.array_split(df, num_splits)
    all_results = []
    for sub_df in df_list:
        sub_results = search(index, tfidf_data, query, search_type, df=sub_df)
        if sub_results:
            all_results.extend(sub_results)
            
    sorted_results = sorted(all_results, key=lambda x: x[1], reverse=True)
    for result in all_results:
        print(result)



def search(index, tfidf_data, query, search_type, df=None):
    if df is None:
        df = create_dataframe(articles)
    words = query.split()
    result = set()
    vectorizer = TfidfVectorizer()
    X = vectorizer.fit_transform(df['content'])
    
    if search_type == 'boolean':
        if len(words) == 1:
            # Single word query
            result.update(index.get(words[0], set()))
        else:
            # Boolean query
            set1 = set(index.get(words[0], set()))
            set2 = set(index.get(words[2], set()))
            operator = words[1]
            if operator == 'AND':
                result = set1 & set2
            elif operator == 'OR':
                result = set1 | set2
            elif operator == 'NOT':
                result = set1 - set2
        results_with_tfidf = [(article_id, search_tfidf(tfidf_data, article_id, words[0])) for article_id in result]
        sorted_results = sorted(results_with_tfidf, key=lambda x: max(x[1]), reverse=True)
        for article_id, tfidf_values in sorted_results:
            max_tfidf = max(tfidf_values)
            print(f"{article_id} ({max_tfidf})")
    elif search_type == 'bm25':
        query_tfidf = vectorizer.transform([query]).toarray()[0]
        bm25_scores = bm25_score(query_tfidf, X.toarray())
        result = sorted(enumerate(bm25_scores), key=lambda x: x[1], reverse=True)
        return [(df.iloc[article_idx]['title'], score) for rank, (article_idx, score) in enumerate(result)]
    elif search_type == 'vsm':
        return vsm_search(query, X, vectorizer, df)

if __name__ == "__main__":
    index = load("index.json")
    tfidf_data = load("tf-idf.json")
    articles = load("articles.json")
    df = create_dataframe(articles)
    search_type = input("Choose search type (boolean/bm25/vsm): ")
    query = input("Enter your query: ")
    if search_type == 'boolean':
        search(index, tfidf_data, query, search_type, df)
    else:
       split_and_search(df, index, tfidf_data, query, search_type, 5)

Choose search type (boolean/bm25):  vsm
Enter your query:  help


  return bound(*args, **kwds)


('comp.os.ms-windows.misc', 0.20576227268305242)
('comp.graphics', 0.19087685227922943)
('sci.space', 0.1750680992266774)
('sci.med', 0.16511941184839984)
('comp.graphics', 0.143621797938888)
('sci.med', 0.1401325494205319)
('comp.sys.ibm.pc.hardware', 0.13991470020589494)
('sci.electronics', 0.1320748456369014)
('comp.graphics', 0.12971479107332623)
('sci.electronics', 0.12068648962138641)
('sci.electronics', 0.11904818309718329)
('sci.med', 0.1176759888995751)
('comp.windows.x', 0.11466272867251705)
('comp.sys.mac.hardware', 0.11281778998409081)
('soc.religion.christian', 0.11010319314947276)
('sci.crypt', 0.10873192582935905)
('alt.atheism', 0.10839194580544857)
('sci.electronics', 0.10731090774071207)
('comp.os.ms-windows.misc', 0.10515487412309477)
('sci.electronics', 0.10336119465651034)
('comp.sys.ibm.pc.hardware', 0.10176584558907042)
('sci.electronics', 0.09924322934094199)
('rec.motorcycles', 0.0961731357654503)
('comp.sys.ibm.pc.hardware', 0.09446900520442184)
('sci.med', 0.