In [None]:
import os
import tarfile
import requests
import csv
import xml.etree.ElementTree as ET
from tqdm import tqdm

# --- CONFIGURATION ---
# Solr base URL and the name of the core used by the Solr requests in this notebook.
SOLR_URL = "http://localhost:8983/solr"
INDEX_NAME = "trec-covid-index"

# Local inputs all sit under one folder; paths are plain '/'-joined strings.
_DATA_DIR = "data"
_CORD19_RELEASE = "2020-07-16"
LOCAL_TAR_PATH = f"{_DATA_DIR}/cord-19_{_CORD19_RELEASE}.tar.gz"  # Your local file
METADATA_PATH = f"{_DATA_DIR}/{_CORD19_RELEASE}/metadata.csv"
TOPICS_PATH = f"{_DATA_DIR}/topics-rnd5.xml"
DOCIDS_PATH = f"{_DATA_DIR}/docids-rnd5.txt"

# 1. Extract Metadata from Local File
# Only the metadata.csv member is pulled out of the archive, into the "data"
# folder; nothing is extracted when the CSV is already on disk.
if os.path.exists(METADATA_PATH):
    print("‚úÖ Metadata already exists.")
elif os.path.exists(LOCAL_TAR_PATH):
    print(f"Extracting metadata from {LOCAL_TAR_PATH}...")
    with tarfile.open(LOCAL_TAR_PATH, "r:gz") as tar:
        tar.extract(tar.getmember("2020-07-16/metadata.csv"), path="data")
    print("‚úÖ Metadata extracted.")
else:
    # Stop here instead of only printing: the document-loading step opens
    # METADATA_PATH unconditionally and would otherwise fail later with a
    # less helpful traceback.
    raise FileNotFoundError(
        f"‚ùå Error: Could not find {LOCAL_TAR_PATH}. Please ensure it is in the 'data' folder."
    )

# 2. Download Helper Files (Topics & DocIDs)
# Maps local destination path -> source URL.
files = {
    DOCIDS_PATH: "https://ir.nist.gov/trec-covid/data/docids-rnd5.txt",
    TOPICS_PATH: "https://ir.nist.gov/trec-covid/data/topics-rnd5.xml"
}
for path, url in files.items():
    if os.path.exists(path):
        continue  # already downloaded on an earlier run
    download = requests.get(url, timeout=60)
    # Check the status BEFORE writing: otherwise an HTTP error page would be
    # saved under `path`, and the exists() guard above would then treat that
    # bad file as a valid cached copy on every later run.
    download.raise_for_status()
    os.makedirs(os.path.dirname(path) or ".", exist_ok=True)
    with open(path, 'wb') as f:
        f.write(download.content)

# 3. Load Valid DocIDs (Set for speed)
# Every line of the file becomes one entry, with surrounding whitespace removed.
with open(DOCIDS_PATH, "r", encoding="utf-8") as docid_file:
    valid_docids = {raw_line.strip() for raw_line in docid_file}

# 4. Load Documents into Memory
trec_covid_documents = []
seen_ids = set()
print("Loading documents...")
# newline="" leaves line-ending handling to the csv module, as its
# documentation requires; without it, line breaks embedded inside quoted
# fields can be altered by Python's universal-newline translation.
with open(METADATA_PATH, "r", encoding="utf-8", newline="") as csv_file:
    reader = csv.reader(csv_file)
    # First row is the header; an empty file yields no columns and no rows.
    columns = next(reader, [])
    for doc in tqdm(reader, desc="Parsing CSV"):
        d = dict(zip(columns, doc))
        docid = d.get("cord_uid")
        # Keep only ids present in valid_docids, and only the first row seen
        # for each id (later rows with the same cord_uid are ignored).
        if docid not in valid_docids or docid in seen_ids:
            continue
        seen_ids.add(docid)
        trec_covid_documents.append({
            "cord_uid": docid,
            "title": d.get("title", ""),
            "abstract": d.get("abstract", ""),
            # We only need year for boosting: first four characters of publish_time
            "publication_year": d.get("publish_time", "")[:4]
        })
print(f"‚úÖ Loaded {len(trec_covid_documents)} docs.")

# 5. Load & Expand Topics (Query + Question)
# Each <topic> element becomes {"id": <its "number" attribute>, "query": ...},
# where the query text is the <query> and <question> texts joined by a space
# (combined for better recall).
topics_root = ET.parse(TOPICS_PATH).getroot()
trec_topics = [
    {
        "id": node.get("number"),
        "query": f"{node.find('query').text} {node.find('question').text}",
    }
    for node in topics_root.findall("topic")
]
print(f"‚úÖ Loaded {len(trec_topics)} topics.")

‚úÖ Metadata already exists.
Loading documents...


Parsing CSV: 192509it [00:02, 77702.81it/s]

‚úÖ Loaded 191175 docs.
‚úÖ Loaded 50 topics.





In [None]:
import requests

# 1. Hard Reset (Wipe Core)
print("üßπ Wiping Solr Core...")
cores_api = f"{SOLR_URL}/admin/cores"

# UNLOAD is deliberately best-effort: its response is not checked, so the cell
# still works when there is no existing core to remove
# (NOTE(review): presumably Solr answers with an error status in that case).
requests.get(
    cores_api,
    params={
        "action": "UNLOAD",
        "core": INDEX_NAME,
        "deleteIndex": "true",
        "deleteInstanceDir": "true",
    },
    timeout=60,
)

# CREATE must succeed, otherwise every later request targets a core that
# does not exist; surface the failure here instead of ignoring it.
create_resp = requests.get(
    cores_api,
    params={
        "action": "CREATE",
        "name": INDEX_NAME,
        "instanceDir": INDEX_NAME,
        "configSet": "_default",
    },
    timeout=60,
)
create_resp.raise_for_status()

# 2. Define Schema with Porter Stemmer
# Token filters for the custom text type, applied in list order.
_text_tuned_filters = [
    {"class": "solr.LowerCaseFilterFactory"},
    {"class": "solr.StopFilterFactory", "words": "stopwords.txt", "ignoreCase": "true"},
    {"class": "solr.EnglishPossessiveFilterFactory"},
    {"class": "solr.PorterStemFilterFactory"},  # Aggressive Stemmer
]

# (field name, field type) pairs; every field is both stored and indexed.
_field_specs = (
    ("cord_uid", "string"),
    ("title", "text_tuned"),
    ("abstract", "text_tuned"),
    ("publication_year", "pint"),
)

schema_payload = {
    "add-field-type": {
        "name": "text_tuned",
        "class": "solr.TextField",
        "analyzer": {
            "tokenizer": {"class": "solr.StandardTokenizerFactory"},
            "filters": _text_tuned_filters,
        },
    },
    "add-field": [
        {"name": field_name, "type": field_type, "stored": True, "indexed": True}
        for field_name, field_type in _field_specs
    ],
}

print("Applying Schema...")
schema_resp = requests.post(f"{SOLR_URL}/{INDEX_NAME}/schema", json=schema_payload, timeout=60)

# Only report success when Solr actually accepted the schema change.
# NOTE(review): some Solr versions may report schema problems in an "errors"
# entry of the JSON body rather than via the HTTP status - checked as well.
try:
    schema_errors = schema_resp.json().get("errors")
except (ValueError, AttributeError):
    schema_errors = None  # body was not a JSON object; rely on the status code

if schema_resp.status_code != 200 or schema_errors:
    raise RuntimeError(
        f"Schema update failed (HTTP {schema_resp.status_code}): {schema_resp.text}"
    )
print("‚úÖ Schema Defined (Porter Stemmer).")

üßπ Wiping Solr Core...
Applying Schema...
‚úÖ Schema Defined (Porter Stemmer).


: 

In [None]:
print(f"üöÄ Indexing {len(trec_covid_documents)} documents...")

# The whole document list is sent as one JSON body in a single POST, with
# commit=true passed as a query parameter.
update_url = f"{SOLR_URL}/{INDEX_NAME}/update"
resp = requests.post(
    update_url,
    params={"commit": "true"},
    headers={"Content-Type": "application/json"},
    json=trec_covid_documents,
)

# Anything other than HTTP 200 is reported together with the response body.
indexing_ok = resp.status_code == 200
print("‚úÖ Indexing Complete." if indexing_ok else f"‚ùå Indexing Failed: {resp.text}")

üöÄ Indexing 191175 documents...
‚úÖ Indexing Complete.


: 

In [None]:
import numpy as np
from sklearn.linear_model import LinearRegression
from tqdm import tqdm

# 1. Load Ground Truth
# qrel_dict maps (topic id, doc id) -> integer relevance label.
qrels_resp = requests.get(
    "https://ir.nist.gov/trec-covid/data/qrels-covid_d5_j0.5-5.txt", timeout=60
)
# Fail on an HTTP error instead of trying to parse an error page as qrels.
qrels_resp.raise_for_status()
qrels = qrels_resp.text
qrel_dict = {}
for line in qrels.splitlines():
    if not line.strip():
        continue  # tolerate blank lines rather than crashing on the unpack
    # Four whitespace-separated columns; the second one is not used.
    qid, _, docid, rel = line.split()
    qrel_dict[(qid, docid)] = int(rel)

# 2. Extract Features
print("Running ML Feature Extraction (This takes a minute)...")
X = [] # Features: [title score, abstract score, phrase score] per judged doc
y = [] # Labels: relevance value from qrel_dict for the same (topic, doc)

# Words dropped from the query text before it is sent to Solr.
_FEATURE_STOPWORDS = {"what", "is", "the", "of", "in"}

# One edismax variant per feature column, in column order:
# title-only match, abstract-only match, title match with phrase boost.
_FEATURE_QUERIES = (
    {"qf": "title"},
    {"qf": "abstract"},
    {"qf": "title", "pf": "title", "ps": "5"},
)


def _solr_scores(query_text, field_params):
    """Run one edismax query and return {cord_uid: score} for the top 50 hits.

    Raises requests.RequestException on transport/HTTP errors, ValueError if
    the body is not JSON, and KeyError if a hit lacks cord_uid or score.
    """
    params = {
        "q": query_text,
        "defType": "edismax",
        **field_params,
        "fl": "cord_uid,score",
        "rows": 50,
    }
    response = requests.get(f"{SOLR_URL}/{INDEX_NAME}/select", params=params, timeout=60)
    # An HTTP error must not be mistaken for "no hits" (all-zero features).
    response.raise_for_status()
    hits = response.json().get('response', {}).get('docs', [])
    return {hit['cord_uid']: hit['score'] for hit in hits}


# We use a subset of topics to train quickly
for topic in tqdm(trec_topics[:25], desc="Training"):
    qid = topic['id']
    # Remove stopwords for cleaner feature matching
    q_clean = " ".join(
        w for w in topic['query'].split() if w.lower() not in _FEATURE_STOPWORDS
    )

    # Get scores for Title, Abstract, and Phrase separately
    try:
        scores_t, scores_a, scores_p = (
            _solr_scores(q_clean, field_params) for field_params in _FEATURE_QUERIES
        )
    except (requests.RequestException, ValueError, KeyError) as err:
        # Best-effort: skip this topic, but say so instead of hiding the failure.
        print(f"Skipping topic {qid}: {err!r}")
        continue

    all_docs = set(scores_t) | set(scores_a) | set(scores_p)

    # Only documents that have a judgment for this topic become training rows;
    # a document missing from one result list gets 0 for that feature.
    for doc in all_docs:
        label = qrel_dict.get((qid, doc))
        if label is None:
            continue
        X.append([scores_t.get(doc, 0), scores_a.get(doc, 0), scores_p.get(doc, 0)])
        y.append(label)

# 3. Train & Print
# Fits a linear regression of relevance label on the three score features and
# prints the abstract/phrase coefficients relative to the title coefficient.
if not X:
    print("‚ùå ML failed to find training data.")
else:
    model = LinearRegression().fit(X, y)
    # One coefficient per feature column: title, abstract, phrase.
    w = model.coef_

    # Normalise by the title coefficient, unless it is not above 0.001,
    # in which case the raw coefficients are reported (divide by 1.0).
    base = 1.0
    if w[0] > 0.001:
        base = w[0]

    # NOTE(review): abs() discards the sign, so a negative coefficient is
    # printed as a positive boost - confirm this is intended.
    abstract_ratio = abs(w[1] / base)
    phrase_ratio = abs(w[2] / base)

    rule = "=" * 40
    for report_line in (
        "\n" + rule,
        "ü§ñ ML RECOMMENDED WEIGHTS",
        rule,
        "Title Boost:    1.0",
        f"Abstract Boost: {abstract_ratio:.2f}",
        f"Phrase Boost:   {phrase_ratio:.2f}",
        rule,
        "(I have pre-filled the next cell with robust defaults, but you can update them with these numbers!)",
    ):
        print(report_line)

Running ML Feature Extraction (This takes a minute)...


Training: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 25/25 [00:01<00:00, 23.32it/s]


ü§ñ ML RECOMMENDED WEIGHTS
Title Boost:    1.0
Abstract Boost: 1.39
Phrase Boost:   1.00
(I have pre-filled the next cell with robust defaults, but you can update them with these numbers!)





: 

In [None]:
# Output file for the run, and the run tag written in the last column of each line.
RUN_FILENAME = "run_high_recall.txt"
RUN_NAME = "high_recall_run"

# STOPWORDS - Keep these to reduce noise slightly, but we will be looser elsewhere
# (compared against lower-cased query words).
STOPWORDS = set(
    "what is the of in on to and a an for with are do does how".split()
)

print("Running HIGH RECALL Search...")

# 2. High Recall Params
# Everything except "q" is the same for every topic, so it is built once.
base_params = {
    "defType": "edismax",

    # WIDE NET: Search Title AND Abstract
    # Reduced boosting gap to allow abstract matches to surface
    "qf": "title^2 abstract^1",

    # PHRASE BOOST: Keep this high to ensure top results are good (Precision)
    "pf": "title^10 abstract^5",
    "ps": 15,       # Very loose phrase slop (words can be far apart)

    # NO 'mm' parameter! (Allow any match)
    # This maximizes the number of docs we retrieve

    "rows": 1000,

    # RERANKING: Keep the recent paper boost, it's a solid heuristic
    "rq": '{!rerank reRankQuery="publication_year:2020^10 OR publication_year:2021^10" reRankDocs=1000 reRankWeight=2.0}',

    "fl": "cord_uid,score"
}

failed_topics = []  # topic ids that produced no run lines because of an error

with open(RUN_FILENAME, 'w') as f:
    for topic in tqdm(trec_topics, desc="Searching"):
        topic_id = topic['id']

        # 1. Clean Query: drop stopwords (case-insensitive), keep word order
        clean_terms = [word for word in topic['query'].split() if word.lower() not in STOPWORDS]
        clean_query = " ".join(clean_terms)
        params = {"q": clean_query, **base_params}

        try:
            resp = requests.get(f"{SOLR_URL}/{INDEX_NAME}/select", params=params, timeout=120)
            resp.raise_for_status()
            docs = resp.json().get('response', {}).get('docs', [])
            # Format every line before writing anything, so a malformed hit
            # cannot leave a half-written topic in the run file.
            run_lines = [
                f"{topic_id} Q0 {doc['cord_uid']} {rank} {doc['score']} {RUN_NAME}\n"
                for rank, doc in enumerate(docs, start=1)
            ]
        except (requests.RequestException, ValueError, KeyError) as err:
            # Still best-effort (the other topics are processed), but a topic
            # missing from the run file is reported instead of silently dropped.
            failed_topics.append(topic_id)
            print(f"Topic {topic_id} skipped: {err!r}")
            continue

        f.writelines(run_lines)

if failed_topics:
    print(f"Warning: {len(failed_topics)} topic(s) missing from the run file: {failed_topics}")
print(f"‚úÖ Run file created: {RUN_FILENAME}")

Running HIGH RECALL Search...


Searching: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 50/50 [00:02<00:00, 24.70it/s]

‚úÖ Run file created: run_high_recall.txt





: 

In [None]:
import pytrec_eval

# Load QRELs
# qrel maps topic id -> {doc id: integer relevance}, the nested-dict form
# handed to pytrec_eval.RelevanceEvaluator below.
qrels_resp = requests.get(
    "https://ir.nist.gov/trec-covid/data/qrels-covid_d5_j0.5-5.txt", timeout=60
)
# Fail on an HTTP error instead of trying to parse an error page as qrels.
qrels_resp.raise_for_status()
qrels = qrels_resp.text
qrel = {}
for line in qrels.splitlines():
    if not line.strip():
        continue  # tolerate blank lines rather than crashing on the unpack
    # Four whitespace-separated columns; the second one is not used.
    qid, _, docid, rel = line.split()
    qrel.setdefault(qid, {})[docid] = int(rel)

# Load Run
# run maps topic id -> {doc id: score}, read back from the run file.
run = {}
with open(RUN_FILENAME, 'r') as run_file:
    for run_line in run_file:
        # Six whitespace-separated columns; only the topic id, doc id and
        # score columns are kept.
        qid, _, docid, _, score, _ = run_line.split()
        run.setdefault(qid, {})[docid] = float(score)

# Score
evaluator = pytrec_eval.RelevanceEvaluator(qrel, {'map', 'P_100', 'recall_100'})
res = evaluator.evaluate(run)

# Each reported number is the plain mean of a measure over the entries of res.
rule = "=" * 40
print("\n" + rule)
print("FINAL RESULTS")
print(rule)
for label, measure in (
    ("MAP:        ", "map"),
    ("Prec@100:   ", "P_100"),
    ("Recall@100: ", "recall_100"),
):
    mean_score = sum(per_topic[measure] for per_topic in res.values()) / len(res)
    print(f"{label}{mean_score:.4f}")
print(rule)


üèÜ FINAL RESULTS
MAP:        0.2222
Prec@100:   0.5470
Recall@100: 0.1221


: 

In [None]:
# Parameters used for the high-recall run above

# "defType": "edismax",

# "qf": "title^2 abstract^1", 

# "pf": "title^10 abstract^5",
# "ps": 15,     

# "rows": 1000,

# "rq": f'{{!rerank reRankQuery="publication_year:2020^10 OR publication_year:2021^10" reRankDocs=1000 reRankWeight=2.0}}', # push relevancy for recent papers

# "fl": "cord_uid,score"

: 