In [None]:
!pip install google-search-results

import json
import requests
import re
from serpapi import GoogleSearch
from collections import defaultdict


In [6]:
# SerpAPI key. The SERPAPI_KEY environment variable takes precedence so the
# secret can be kept out of the notebook.
# SECURITY NOTE(review): the literal fallback below is a credential committed
# in source. It is kept only so existing runs keep working; it should be
# rotated and the fallback removed once the environment variable is in place.
import os

SERPAPI_KEY = os.environ.get(
    "SERPAPI_KEY",
    "9feec7ed5ff4bc5f3290ccb2908f203877b018ee77854f646abecece77adba0b",
)

# Load CACM queries and relevance judgments
def load_cacm_relevance(qrels_file="/content/qrels.text", min_relevance=1):
    """Read a whitespace-separated qrels file into a nested mapping.

    Every non-blank line must have at least four columns: column 1 is parsed
    as the integer query ID, column 2 is kept as a string document ID, and
    column 4 is parsed as the integer relevance score. Blank lines are
    skipped.

    Args:
        qrels_file: Path of the judgments file to read.
        min_relevance: Smallest score that is stored. The default of 1 keeps
            only scores greater than zero.

    Returns:
        A ``defaultdict(dict)`` mapping ``query_id -> {doc_id: relevance}``.
        Because it is a defaultdict, indexing a missing query ID inserts an
        empty dict; use ``.get()`` to avoid that.

    Raises:
        ValueError: If a non-blank line has fewer than four columns, or if a
            query ID / relevance column is not an integer.

    NOTE(review): the widely distributed CACM ``qrels.text`` appears to carry
    0 in its trailing columns for every listed (query, document) pair. If
    that is the file being read, the default filter discards every judgment
    and all downstream metrics become 0 -- confirm the file format and pass
    ``min_relevance=0`` if a listed pair is itself the relevance signal.
    """
    relevance_judgments = defaultdict(dict)
    with open(qrels_file, "r") as f:
        for line_no, line in enumerate(f, start=1):
            parts = line.split()
            if not parts:
                continue  # Blank line (e.g. trailing newline at end of file)
            if len(parts) < 4:
                raise ValueError(
                    f"{qrels_file}:{line_no}: expected at least 4 columns, "
                    f"got {len(parts)}: {line.rstrip()!r}"
                )

            query_id = int(parts[0])   # "01" -> 1
            doc_id = parts[1]          # Kept as a string for matching
            relevance = int(parts[3])  # Fourth column

            if relevance >= min_relevance:
                relevance_judgments[query_id][doc_id] = relevance

    return relevance_judgments

# Get Google search results
def get_google_results(query, num_results=10):
    """Run *query* through SerpAPI's Google engine and return result URLs.

    Args:
        query: Search string sent as the ``q`` parameter.
        num_results: Value sent as the ``num`` parameter.

    Returns:
        A list with the ``link`` of each entry in the response's
        ``organic_results``, in response order. Entries without a ``link``
        are skipped. An empty list is returned when the response has no
        ``organic_results`` key.

    If the response carries an ``error`` field, its text is emitted as a
    ``RuntimeWarning`` so that a bad key or exhausted quota is not mistaken
    for a query that simply has no hits.
    """
    import warnings  # Local import keeps this block self-contained

    params = {
        "q": query,
        "num": num_results,
        "api_key": SERPAPI_KEY,
    }
    results = GoogleSearch(params).get_dict()

    if "error" in results:
        warnings.warn(
            f"SerpAPI returned an error for query {query!r}: {results['error']}",
            RuntimeWarning,
            stacklevel=2,
        )

    organic = results.get("organic_results")
    if not organic:
        return []

    return [entry["link"] for entry in organic if "link" in entry]

# Improved URL-to-CACM ID mapping
def map_url_to_cacm_id(url):
    """Pull a numeric identifier out of *url*.

    The identifier is the first run of four or more digits that immediately
    follows a "/" in the URL. The full digit run is returned as a string, or
    ``None`` when the URL contains no such run.

    NOTE(review): nothing here checks that the digits are a CACM document
    number; any 4+ digit path segment (e.g. part of a DOI) is returned.
    Verify against the URLs actually being evaluated.
    """
    found = re.search(r'/(\d{4,})', url)
    return found.group(1) if found is not None else None

# Evaluate retrieved results against CACM ground truth
def evaluate_results(retrieved_docs, relevant_docs):
    """Score one ranked result list against a set of relevant document IDs.

    Each URL is converted with ``map_url_to_cacm_id``; URLs that yield no ID
    are dropped, and repeated IDs are collapsed to their first (highest
    ranked) occurrence. Ranks are then counted over the remaining IDs.

    Args:
        retrieved_docs: Iterable of result URLs in rank order.
        relevant_docs: Iterable of relevant document IDs (strings).

    Returns:
        A ``(precision, recall, average_precision)`` tuple of floats.
        All three are 0.0 when no URL maps to an ID; recall and average
        precision are 0.0 when ``relevant_docs`` is empty.

    The third value is the average precision of this single query (sum of
    precision at each relevant rank, divided by the number of relevant
    documents). Averaging it over several queries gives MAP.

    NOTE(review): because unmapped URLs are removed before ranking, they do
    not count as retrieved non-relevant results. Confirm this is intended.
    """
    mapped_ids = (map_url_to_cacm_id(url) for url in retrieved_docs)
    # dict.fromkeys removes duplicates while keeping first-seen order, so a
    # document returned twice cannot be counted as two relevant hits (which
    # previously allowed average precision to exceed 1.0).
    ranked_ids = list(
        dict.fromkeys(doc_id for doc_id in mapped_ids if doc_id is not None)
    )
    relevant_set = set(relevant_docs)

    if not ranked_ids:
        return 0.0, 0.0, 0.0  # No valid retrieved results

    hits = 0
    precision_sum = 0.0
    for rank, doc_id in enumerate(ranked_ids, start=1):
        if doc_id in relevant_set:
            hits += 1
            precision_sum += hits / rank  # Precision at this relevant rank

    precision = hits / len(ranked_ids)
    recall = hits / len(relevant_set) if relevant_set else 0.0
    average_precision = precision_sum / len(relevant_set) if relevant_set else 0.0

    return precision, recall, average_precision

# Run evaluation: load judgments, fetch results for one query, score, report
relevance_data = load_cacm_relevance()

query_id = 1  # Change query ID as needed
query_text = "computer networks CACM site:dl.acm.org OR site:researchgate.net"

# Relevant document IDs (strings) for the chosen query; empty set if the
# query ID has no stored judgments.
relevant_docs = set(relevance_data.get(query_id, {}))
retrieved_docs = get_google_results(query_text, num_results=10)

precision, recall, map_score = evaluate_results(retrieved_docs, relevant_docs)

# Print evaluation results
report_lines = [
    f"Query: {query_text}",
    f"Precision: {precision:.4f}",
    f"Recall: {recall:.4f}",
    f"MAP: {map_score:.4f}",
]
print("\n".join(report_lines))


Query: computer networks CACM site:dl.acm.org OR site:researchgate.net
Precision: 0.0000
Recall: 0.0000
MAP: 0.0000
