In [1]:
# Import needed libraries.
from collections import deque
from pathlib import Path

import pandas as pd

from impc_api import batch_solr_request, solr_request

In [2]:
def fetch_children(mp_id, max_rows=100):
    """Return the direct child MP term IDs stored for ``mp_id`` in the ``mp`` core.

    Parameters
    ----------
    mp_id : str
        Identifier matched against the ``mp_id`` field, e.g. ``"MP:0009806"``.
    max_rows : int, optional
        Row limit sent to Solr, and the threshold above which the result is
        considered truncated. Defaults to 100.

    Returns
    -------
    list
        Unique, non-null ``child_mp_id`` values in order of first appearance.
        Empty when nothing matches or no returned document has the field.

    Raises
    ------
    ValueError
        If Solr reports more matching documents than ``max_rows``.
    """
    num_found, df = solr_request(
        core='mp',
        params={
            'q': f'mp_id:"{mp_id}"',
            'fl': 'child_mp_id',
            'rows': max_rows
        },
        silent=True
    )

    # Check for truncation before anything else: an over-limit result must
    # abort even when the returned page happens to lack the child column.
    if num_found > max_rows:
        raise ValueError(f"Too many results found: {num_found}. Aborting.")

    if num_found == 0 or 'child_mp_id' not in df.columns:
        return []

    # child_mp_id may hold a list per document; flatten to one ID per row.
    children = df['child_mp_id'].explode().dropna()
    return children.unique().tolist()

def fetch_all_descendants(mp_id):
    """Collect ``mp_id`` and every term reachable from it via ``fetch_children``.

    The hierarchy is walked breadth-first. A term is recorded as seen at the
    moment it is queued, so a term that has several parents appears only once
    in the result and is only looked up once.

    Parameters
    ----------
    mp_id : str
        Root term to start from.

    Returns
    -------
    list
        ``mp_id`` followed by its descendants in breadth-first discovery
        order, without duplicates.
    """
    seen = {mp_id}
    all_terms = [mp_id]
    queue = deque([mp_id])

    while queue:
        current = queue.popleft()
        for child in fetch_children(current):
            if child in seen:
                # Already queued through another parent (or it is the root).
                continue
            seen.add(child)
            all_terms.append(child)
            queue.append(child)

    return all_terms

def filter_impc_mp_ids(mp_ids):
    """Keep the MP term IDs that are present in both Solr cores.

    A term is kept when the ``mp`` core reports exactly one document for it
    and the ``statistical-result`` core reports at least one document whose
    ``mp_term_id_options`` matches it.

    Parameters
    ----------
    mp_ids : iterable of str
        Candidate MP term IDs; each one is checked independently.

    Returns
    -------
    list
        The accepted IDs, in input order.
    """
    kept = []

    for term_id in mp_ids:
        # Step 1: the term must resolve to a single document in the MP core.
        mp_hits, _ = solr_request(
            core='mp',
            params={
                'q': f'mp_id:"{term_id}"',
                'fl': 'mp_id, mp_term',
                'rows': 100
            },
            silent=True
        )
        if mp_hits != 1:
            continue

        # Step 2: only the hit count matters here, so one row is requested.
        stat_hits, _ = solr_request(
            core='statistical-result',
            params={
                'q': f'mp_term_id_options:"{term_id}"',
                'fl': 'mp_term_id_options',
                'rows': 1
            },
            silent=True
        )
        if stat_hits > 0:
            kept.append(term_id)

    return kept

def fetch_statistical_data(mp_id, fields):
    """Fetch statistical results for an MP term and all of its descendants.

    The descendant terms of ``mp_id`` are collected, narrowed down with
    ``filter_impc_mp_ids``, and combined into a single OR query on
    ``mp_term_id_options`` against the ``statistical-result`` core.

    Parameters
    ----------
    mp_id : str
        Root MP term ID.
    fields : str
        Value passed through unchanged as the Solr ``fl`` parameter.

    Returns
    -------
    pandas.DataFrame
        Whatever ``batch_solr_request`` returns for ``download=False``
        (used as a DataFrame by the callers in this notebook), or an empty
        DataFrame when no term survives filtering.
    """
    # Drop repeated IDs (order preserved) so no term is validated or queried twice.
    all_descendants = list(dict.fromkeys(fetch_all_descendants(mp_id)))
    mp_term_list = filter_impc_mp_ids(all_descendants)

    if not mp_term_list:
        # Joining nothing would give q='', which is not a meaningful query;
        # skip the request and hand back an empty result instead.
        return pd.DataFrame()

    query = " OR ".join(f'mp_term_id_options:"{term}"' for term in mp_term_list)
    params = {
        'q': query,
        'fl': fields
    }

    return batch_solr_request(core='statistical-result', params=params, download=False)


def summarise_colony_significance(df, term_name):
    """Write the de-duplicated results to CSV and print a significance tally.

    The ``parameter_stable_id`` column is dropped (if present), list-valued
    columns are converted to strings, and duplicate rows are removed. The
    result is saved to ``output/<term_name>_deduplicated.csv``. Rows are then
    grouped by colony, allele, marker and pipeline; a group counts as
    significant when any of its rows has a truthy ``significant`` value.

    Parameters
    ----------
    df : pandas.DataFrame or None
        Statistical results; ``None`` or an empty frame prints a
        "No data found" message and nothing else happens.
    term_name : str
        Label used in the printed report and, lower-cased with spaces
        replaced by underscores, in the CSV file name.

    Returns
    -------
    None
    """
    if df is None or df.empty:
        print(f"- {term_name.upper()}: No data found.\n")
        return

    # drop() returns a new frame, so the assignments below do not touch
    # the caller's DataFrame.
    records = df.drop(columns=['parameter_stable_id'], errors='ignore')

    # Stringify any column containing lists — presumably because list cells
    # are unhashable and drop_duplicates() could not compare them.
    for column in records.columns:
        holds_lists = records[column].apply(lambda value: isinstance(value, list)).any()
        if holds_lists:
            records[column] = records[column].apply(str)

    records = records.drop_duplicates()

    # Persist the de-duplicated rows before summarising.
    target_dir = Path("output")
    target_dir.mkdir(exist_ok=True)
    csv_path = target_dir / f"{term_name.lower().replace(' ', '_')}_deduplicated.csv"
    records.to_csv(csv_path, index=False)
    print(f"Saved deduplicated data to: {csv_path}")

    # One row per line: True when any of its results is significant.
    line_keys = ['colony_id', 'allele_symbol', 'marker_symbol', 'pipeline_stable_id']
    per_line = records.groupby(line_keys)['significant'].any().reset_index()

    n_significant = per_line['significant'].sum()
    n_not_significant = len(per_line) - n_significant

    print(f"- {term_name.upper()}")
    print(f"Significant lines: {n_significant}")
    print(f"Non-significant lines: {n_not_significant}")
    print(f"Total lines: {n_significant + n_not_significant}\n")

# Comma-separated field list handed to fetch_statistical_data, which passes it
# to Solr as the 'fl' parameter.
requested_fields = ','.join((
    'allele_symbol',
    'colony_id',
    'marker_symbol',
    'mp_term_id_options',
    'parameter_stable_id',
    'pipeline_stable_id',
    'significant',
))

## Number of lines assessed for otic vesicle morphology

In [3]:
# Fetch statistical results for MP:0009806 and its descendant terms, then
# save the de-duplicated rows and print the per-line significance counts.
df = fetch_statistical_data("MP:0009806", requested_fields)
summarise_colony_significance(df, "Otic vesicle morphology")

Number of found documents: 1116


5000it [00:00, 54841.98it/s]                                                                                                                                    

Saved deduplicated data to: output/otic_vesicle_morphology_deduplicated.csv
- OTIC VESICLE MORPHOLOGY
Significant lines: 7
Non-significant lines: 812
Total lines: 819






## Number of lines assessed for auditory brainstem response (ABR)

In [4]:
# Fetch statistical results for MP:0004738 and its descendant terms, then
# save the de-duplicated rows and print the per-line significance counts.
# Note: `df` is rebound here, replacing the previous section's results.
df = fetch_statistical_data('MP:0004738', requested_fields)
summarise_colony_significance(df, "Auditory brainstem response")

Number of found documents: 43941


45000it [00:02, 18178.79it/s]                                                                                                                                   

Saved deduplicated data to: output/auditory_brainstem_response_deduplicated.csv
- AUDITORY BRAINSTEM RESPONSE
Significant lines: 335
Non-significant lines: 6835
Total lines: 7170






## Number of lines examined for retinal abnormalities

In [5]:
# Fetch statistical results for MP:0001325 and its descendant terms, then
# save the de-duplicated rows and print the per-line significance counts.
# Note: `df` is rebound here, replacing the previous section's results.
df = fetch_statistical_data('MP:0001325', requested_fields)
summarise_colony_significance(df, "Abnormal retinal phenotype")

Number of found documents: 54872


55000it [00:02, 18690.91it/s]                                                                                                                                   

Saved deduplicated data to: output/abnormal_retinal_phenotype_deduplicated.csv
- ABNORMAL RETINAL PHENOTYPE
Significant lines: 817
Non-significant lines: 8501
Total lines: 9318




