In [8]:
!pip install biopython



In [None]:
import pandas as pd
import time
import json
from Bio import Entrez
from collections import defaultdict
from tqdm.notebook import tqdm
from concurrent.futures import ThreadPoolExecutor, as_completed

# Set NCBI email (required)
Entrez.email = "your_email@example.com"  # Replace with your actual email

# Load the dataset from PathoPlexus
url = "https://lapis.pathoplexus.org/cchf/sample/details?downloadAsFile=true&downloadFileBasename=cchf_metadata_2025-04-03T2223&dataFormat=tsv&fields=accessionVersion%2CdataUseTerms%2CdataUseTermsUrl%2CearliestReleaseDate%2CgeoLocAdmin1%2CgeoLocAdmin2%2CgeoLocCity%2CgeoLocCountry%2ChostNameScientific%2Clength_L%2Clength_M%2Clength_S%2CsampleCollectionDate%2CinsdcAccessionFull_L%2CinsdcAccessionFull_S%2CinsdcAccessionFull_M%2CspecimenCollectorSampleId%2Cauthors%2CauthorAffiliations&versionStatus=LATEST_VERSION&isRevocation=false"
df = pd.read_csv(url, sep="\t")

# Gather all unique INSDC accessions across the three segment columns.
# A cell may hold several comma-separated accessions.
accession_cols = ["insdcAccessionFull_L", "insdcAccessionFull_M", "insdcAccessionFull_S"]
all_insdc_ids = set()
for col in accession_cols:
    # astype(str) keeps the .str accessor usable even when a column has no
    # values at all (pandas then parses it as float, on which .str raises).
    ids = df[col].dropna().astype(str).str.split(",").explode().str.strip()
    # Drop blank tokens (e.g. from a trailing or doubled comma) so that no
    # empty ID is later sent to NCBI.
    all_insdc_ids.update(ids[ids != ""])

print(f"🔎 Found {len(all_insdc_ids)} unique INSDC accessions.")

# Function to fetch strain name from NCBI
def fetch_strain(acc):
    """Look up the ``/strain`` qualifier of one accession via ``Entrez.efetch``.

    The record is requested from the ``nucleotide`` database as GenBank text
    and scanned line by line.

    Parameters
    ----------
    acc : str
        Accession passed as the ``id`` of the efetch request.

    Returns
    -------
    tuple
        ``(acc, strain)``, where ``strain`` is the text after ``=`` on the
        first line containing ``/strain=`` (surrounding whitespace and double
        quotes removed), or ``None`` when no such line exists or anything
        went wrong.

    Notes
    -----
    Deliberately best-effort: every exception is swallowed and reported as
    ``None`` so that a single bad accession does not abort a bulk run.
    """
    try:
        handle = Entrez.efetch(db="nucleotide", id=acc, rettype="gb", retmode="text")
        try:
            record = handle.read()
        finally:
            # Release the handle even when reading fails part-way.
            handle.close()
        for line in record.splitlines():
            if "/strain=" in line:
                return acc, line.split("=", 1)[1].strip().strip('"')
    except Exception:
        return acc, None
    return acc, None

# Use ThreadPoolExecutor to run up to 5 fetches concurrently
print("⚡ Fetching strain names using NCBI efetch...")
accession_to_strain = {}
with ThreadPoolExecutor(max_workers=5) as executor:
    # Submit in sorted order so repeated runs queue the requests identically.
    futures = [executor.submit(fetch_strain, acc) for acc in sorted(all_insdc_ids)]
    # No sleep in this loop: all tasks are already queued, so pausing the
    # consumer would not slow the worker threads' requests, only the
    # collection of finished results.
    for future in tqdm(as_completed(futures), total=len(futures), desc="Fetching"):
        acc, strain = future.result()
        if strain:
            accession_to_strain[acc] = strain

# fetch_strain reports failures and missing qualifiers alike as None; surface
# how many accessions were dropped instead of losing them silently.
missing_count = len(all_insdc_ids) - len(accession_to_strain)
if missing_count:
    print(f"⚠️ {missing_count} accessions yielded no strain (no /strain qualifier or failed request).")

# Map strain → list of accessions (sorted so the saved file is reproducible
# regardless of the order in which the threads finished)
strain_to_accessions = defaultdict(list)
for acc in sorted(accession_to_strain):
    strain_to_accessions[accession_to_strain[acc]].append(acc)

# Save result to JSON
with open("strain_accession_map.json", "w") as f:
    json.dump(strain_to_accessions, f, indent=2, sort_keys=True)

print("✅ All done! Saved as 'strain_accession_map.json'")


🔎 Found 4800 unique INSDC accessions.
⚡ Fetching strain names using NCBI efetch...


Fetching:   0%|          | 0/4800 [00:00<?, ?it/s]

✅ All done! Saved as 'strain_accession_map.json'


In [11]:
import pandas as pd
import json
import time
from collections import defaultdict

# ----- Step 1: Load strain→accession map -----
with open("strain_accession_map.json", "r") as map_file:
    strain_accession_map = json.load(map_file)

# Invert the mapping: each accession points back to the strain that lists it.
# If an accession appears under several strains, the last one read wins.
accession_to_strain = {
    accession: strain_name
    for strain_name, accession_group in strain_accession_map.items()
    for accession in accession_group
}

# ----- Step 2: Download input dataset -----
# Columns requested from the LAPIS endpoint, in the order they appear in the query.
metadata_fields = [
    "accessionVersion",
    "dataUseTerms",
    "dataUseTermsUrl",
    "earliestReleaseDate",
    "geoLocAdmin1",
    "geoLocAdmin2",
    "geoLocCity",
    "geoLocCountry",
    "hostNameScientific",
    "length_L",
    "length_M",
    "length_S",
    "sampleCollectionDate",
    "insdcAccessionFull_L",
    "insdcAccessionFull_S",
    "insdcAccessionFull_M",
    "specimenCollectorSampleId",
    "authors",
    "authorAffiliations",
]
# "%2C" is the URL-encoded comma separating the field names.
url = (
    "https://lapis.pathoplexus.org/cchf/sample/details?"
    "downloadAsFile=true&downloadFileBasename=cchf_metadata_2025-04-03T2223"
    "&dataFormat=tsv&fields=" + "%2C".join(metadata_fields)
    + "&versionStatus=LATEST_VERSION&isRevocation=false"
)
df = pd.read_csv(url, sep="\t")

# ----- Step 3: Segment presence flags -----
# A segment counts as present when its recorded length is positive; a missing
# length (NaN) compares as False. The vectorized comparison replaces a
# per-element lambda repeated once per segment.
for segment in ("L", "M", "S"):
    df[f"has_{segment}"] = df[f"length_{segment}"].gt(0)

# ----- Step 4: Assign strain per row -----
def determine_strain(row):
    """Return the strain of the first accession in ``row`` that has a known strain.

    The L, M and S accession columns are checked in that order; each cell is
    treated as a comma-separated list and blank entries are ignored. Lookups
    go through the module-level ``accession_to_strain`` mapping.

    Returns ``None`` when no accession in the row is present in the mapping.
    """
    segment_columns = (
        "insdcAccessionFull_L",
        "insdcAccessionFull_M",
        "insdcAccessionFull_S",
    )
    for column in segment_columns:
        cell = row[column]
        if pd.isnull(cell):
            continue
        for token in cell.split(","):
            candidate = token.strip()
            if candidate and candidate in accession_to_strain:
                return accession_to_strain[candidate]
    return None

# Tag each row with its strain, then keep only the rows for which one was found.
strain_per_row = df.apply(determine_strain, axis=1)
df["strain"] = strain_per_row
has_strain = df["strain"].notna()
df = df.loc[has_strain].copy()
print(f"✅ Rows with matched strain info: {len(df)}")

# ----- Step 5: Group by strain -----
def _join_present(values):
    """Join the non-null entries of one group's column as a ', '-separated string."""
    return ", ".join(values.dropna().astype(str))


# One aggregation rule per output column; the has_* sums count how many rows
# in the group carry that segment.
aggregation_plan = {
    "length_L": "max",
    "length_M": "max",
    "length_S": "max",
    "accessionVersion": list,
    "insdcAccessionFull_L": _join_present,
    "insdcAccessionFull_M": _join_present,
    "insdcAccessionFull_S": _join_present,
    "sampleCollectionDate": _join_present,
    "geoLocCountry": _join_present,
    "hostNameScientific": _join_present,
    "authors": list,
    "earliestReleaseDate": _join_present,
    "has_L": "sum",
    "has_M": "sum",
    "has_S": "sum",
}
grouped_by_strain = df.groupby("strain").agg(aggregation_plan).reset_index()

# ----- Step 6: Filter groups -----
# Keep strains built from more than one record in which every segment
# (L, M, S) is present in at most one of those records.
multiple_records = grouped_by_strain["accessionVersion"].map(len) > 1
segments_at_most_once = grouped_by_strain[["has_L", "has_M", "has_S"]].le(1).all(axis=1)
filtered = grouped_by_strain[multiple_records & segments_at_most_once]

# ----- Step 7: Generate JSON output -----
def extract_insdc_list(row):
    """Collect every INSDC accession named in a row's L, M and S columns.

    Each non-null cell is split on commas; entries are stripped and blank
    ones dropped. Order follows the columns (L, M, S) and then the position
    inside each cell. Duplicates are kept.
    """
    collected = []
    for column in ("insdcAccessionFull_L", "insdcAccessionFull_M", "insdcAccessionFull_S"):
        cell = row[column]
        if pd.isnull(cell):
            continue
        for piece in cell.split(","):
            accession = piece.strip()
            if accession:
                collected.append(accession)
    return collected

# Build one JSON entry per retained strain group: the key is the group's
# accessions joined with "_", the value is the list of those accessions.
strain_json = {}
for _, row in filtered.iterrows():
    insdc_ids = extract_insdc_list(row)
    # dict.fromkeys drops repeated accessions while keeping first-seen order.
    unique_ids = list(dict.fromkeys(insdc_ids))
    key = "_".join(unique_ids)
    # Store the de-duplicated list so the value agrees with the key built from it.
    strain_json[key] = unique_ids

# ----- Step 8: Save outputs -----
filtered.to_csv("grouped_by_strain.tsv", sep="\t", index=False)
with open("strain_groupings.json", "w") as f:
    json.dump(strain_json, f, indent=2)

print("✅ Saved:")
print("- grouped_by_strain.tsv")
print("- strain_groupings.json")


✅ Rows with matched strain info: 391
✅ Saved:
- grouped_by_strain.tsv
- strain_groupings.json
