In [3]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

def load_cisi_data(doc_file, query_file, rel_file):
    documents = []
    with open(doc_file, 'r') as f:
        doc_id = None
        doc_text = ""
        for line in f:
            line = line.strip()
            if line.startswith(".I"):
                if doc_id:
                    documents.append({"id": doc_id, "text": doc_text})
                doc_id = line.split()[1]
                doc_text = ""
            elif line.startswith(".W"):
                doc_text += line[3:] + " "
        if doc_id:
            documents.append({"id": doc_id, "text": doc_text})

    queries = []
    with open(query_file, 'r') as f:
        query_id = None
        query_text = ""
        for line in f:
            line = line.strip()
            if line.startswith(".I"):
                if query_id:
                    queries.append({"id": query_id, "text": query_text})
                query_id = line.split()[1]
                query_text = ""
            elif line.startswith(".W"):
                query_text += line[3:] + " "
        if query_id:
            queries.append({"id": query_id, "text": query_text})

    relevant_docs = {}
    with open(rel_file, 'r') as f:
        for line in f:
            parts = line.strip().split()
            if len(parts) >= 2:
                query_id = parts[0]
                doc_id = parts[1] 
                if query_id not in relevant_docs:
                    relevant_docs[query_id] = []
                relevant_docs[query_id].append(doc_id)

    return documents, queries, relevant_docs

def create_dataframe(documents):
    df = pd.DataFrame(documents)
    return df

def search_cisi(df, query, search_type):

    vectorizer = TfidfVectorizer()
    X = vectorizer.fit_transform(df['text'])
    if search_type == 'vsm':
        query_vector = vectorizer.transform([query]).toarray().squeeze()
        cosine_similarities = cosine_similarity(query_vector.reshape(1, -1), X).flatten()
        results = sorted(enumerate(cosine_similarities), key=lambda x: x[1], reverse=True)
        return [(df.iloc[idx]['id'], score) for idx, score in results]

def calculate_precision(retrieved_docs, relevant_docs):
    if len(retrieved_docs) == 0:
        return 0.0
    true_positives = len(set(retrieved_docs) & set(relevant_docs))
    return true_positives / len(retrieved_docs)

def calculate_recall(retrieved_docs, relevant_docs):
    if len(relevant_docs) == 0:
        return 0.0
    true_positives = len(set(retrieved_docs) & set(relevant_docs))
    return true_positives / len(relevant_docs)

def calculate_f1_score(precision, recall):
    if precision == 0 and recall == 0:
        return 0.0
    return 2 * (precision * recall) / (precision + recall)

def calculate_map(retrieved_docs, relevant_docs):
    average_precisions = []
    num_relevant = len(relevant_docs)
    
    for i, doc_id in enumerate(retrieved_docs):
        if doc_id in relevant_docs:
            precision = (i + 1) / (i + 1)
            average_precisions.append(precision)
    
    if num_relevant == 0:
        return 0.0
    
    return sum(average_precisions) / num_relevant

def evaluate_cisi(documents, queries, relevant_docs, search_type):
    df = create_dataframe(documents)
    results = []
    for query in queries:
        retrieved_docs = search_cisi(df, query['text'], search_type)
        retrieved_doc_ids = [doc[0] for doc in retrieved_docs]
        
        precision = calculate_precision(retrieved_doc_ids, relevant_docs[query['id']])
        recall = calculate_recall(retrieved_doc_ids, relevant_docs[query['id']])
        f1_score = calculate_f1_score(precision, recall)
        map_score = calculate_map(retrieved_doc_ids, relevant_docs[query['id']])
        
        results.append((precision, recall, f1_score, map_score))
    
    avg_precision = np.mean([result[0] for result in results])
    avg_recall = np.mean([result[1] for result in results])
    avg_f1_score = np.mean([result[2] for result in results])
    avg_map = np.mean([result[3] for result in results])
    
    print(f"Average Precision: {avg_precision}")
    print(f"Average Recall: {avg_recall}")
    print(f"Average F1-Score: {avg_f1_score}")
    print(f"Mean Average Precision: {avg_map}")

if __name__ == "__main__":
    doc_file = "cisi.all"
    query_file = "cisi.qry"
    rel_file = "cisi.rel"
    search_type = 'vsm'

    documents, queries, relevant_docs = load_cisi_data(doc_file, query_file, rel_file)
    evaluate_cisi(documents, queries, relevant_docs, search_type)

ValueError: empty vocabulary; perhaps the documents only contain stop words