In [1]:
# Import needed libraries.
import contextlib
import io
from pathlib import Path

import pandas as pd

from impc_api import batch_solr_request, solr_request

In [2]:
def download_per_term(mp_term, requested_fields, silent=True):
    """Fetch the statistical-result documents annotated with one MP term.

    A small probe request is issued first to learn how many documents match;
    the full batch download only runs when that count is positive.

    Args:
        mp_term: Value matched against the ``mp_term_id_options`` field.
        requested_fields: Value passed to Solr as the ``fl`` parameter.
        silent: When true, the probe is called with ``silent=True`` and
            anything the batch download writes to stdout is discarded.

    Returns:
        Whatever ``batch_solr_request`` returns for the query, or ``None``
        when the probe reports no matching documents.
    """
    core = 'statistical-result'
    query = f'mp_term_id_options:"{mp_term}"'

    # Cheap probe: only the hit count is used, the returned rows are ignored.
    hit_count, _ = solr_request(
        core=core,
        params={'q': query, 'rows': 3, 'fl': requested_fields},
        silent=silent,
    )
    if not hit_count > 0:
        return None

    # Swap stdout for a throwaway buffer only when silence was requested.
    # NOTE(review): the recorded notebook output still shows progress bars in
    # silent mode, so they are presumably not written to stdout -- confirm.
    stdout_guard = (
        contextlib.redirect_stdout(io.StringIO())
        if silent
        else contextlib.nullcontext()
    )
    with stdout_guard:
        # A fresh params dict is built for each request rather than shared.
        return batch_solr_request(
            core=core,
            params={'q': query, 'fl': requested_fields},
            download=False,
        )


def calculate_colony(df, term_name):
    """Save the de-duplicated results for one term and print per-line counts.

    The frame is stripped of ``parameter_stable_id``, de-duplicated, written
    to ``output/<term>_deduplicated.csv`` and then grouped by colony, allele,
    marker and pipeline. A group counts as significant when any of its rows
    has a truthy ``significant`` value.

    Args:
        df: Result frame for one term, or ``None``. The caller's frame is not
            modified; all changes are made on a derived copy.
        term_name: Human-readable term name, used in the printed report and
            to build the CSV filename.

    Returns:
        None. The summary is printed and the CSV is written as side effects.
    """
    if df is None or df.empty:
        print(f"- {term_name.upper()}: No data found.\n")
        return

    df = df.drop(columns=['parameter_stable_id'], errors='ignore')

    # Convert list-valued columns to strings (presumably so that
    # drop_duplicates can compare them -- lists are not hashable).
    for col in df.columns:
        if df[col].apply(lambda x: isinstance(x, list)).any():
            df[col] = df[col].apply(str)

    df = df.drop_duplicates()

    # Create output directory using pathlib.
    output_dir = Path("output")
    output_dir.mkdir(exist_ok=True)

    # Build the filename from the term name. Path separators are replaced as
    # well as spaces: a name such as "a/b" would otherwise point into a
    # sub-directory that does not exist and the write would fail.
    slug = term_name.lower().replace(' ', '_')
    for separator in ('/', '\\'):
        slug = slug.replace(separator, '_')
    filepath = output_dir / f"{slug}_deduplicated.csv"
    df.to_csv(filepath, index=False)
    print(f"Saved deduplicated data to: {filepath}")

    # NOTE(review): with pandas' default groupby settings, rows whose key
    # columns contain missing values are left out of the groups -- confirm
    # that excluding them from the counts is intended.
    grouped = df.groupby(
        ['colony_id', 'allele_symbol', 'marker_symbol', 'pipeline_stable_id']
    )['significant']

    # One row per group: True when at least one result was significant.
    significance_per_group = grouped.any().reset_index()

    count_true = significance_per_group['significant'].sum()
    count_false = len(significance_per_group) - count_true

    print(f'- {term_name.upper()}')
    print(f"Number of lines with significant phenotype: {count_true}")
    print(f"Number of lines without significant result: {count_false}")
    print(f"Total number of lines: {count_true + count_false}\n")

# Comma-separated field list handed to every statistical-result query.
requested_fields = ','.join((
    'allele_symbol',
    'colony_id',
    'marker_symbol',
    'mp_term_id_options',
    'parameter_stable_id',
    'pipeline_stable_id',
    'significant',
))

## Number of lines assessed for otic vesicle morphology

In [3]:
# Download the results for MP:0009806 and report them under the
# "Otic vesicle morphology" label.
df = download_per_term(mp_term='MP:0009806', requested_fields=requested_fields)
calculate_colony(df=df, term_name="Otic vesicle morphology")

5000it [00:00, 39346.56it/s]                                                                                                                                               

Saved deduplicated data to: output/otic_vesicle_morphology_deduplicated.csv
- OTIC VESICLE MORPHOLOGY
Number of lines with significant phenotype: 7
Number of lines without significant result: 812
Total number of lines: 819






## Number of lines assessed for auditory brainstem response (ABR)

In [4]:
# Download the results for MP:0004738 and report them under the
# "Auditory brainstem response" label.
df = download_per_term(mp_term='MP:0004738', requested_fields=requested_fields)
calculate_colony(df=df, term_name="Auditory brainstem response")

45000it [00:02, 16092.63it/s]                                                                                                                                              

Saved deduplicated data to: output/auditory_brainstem_response_deduplicated.csv
- AUDITORY BRAINSTEM RESPONSE
Number of lines with significant phenotype: 335
Number of lines without significant result: 6835
Total number of lines: 7170






## Number of lines examined for retinal abnormalities

In [5]:
# Look up every term in the 'mp' core whose name matches *retina*, then run
# the same download-and-summarise steps for each of them.
retina_terms = batch_solr_request(
    core='mp',
    params={'q': 'mp_term:*retina*', 'fl': 'mp_id, mp_term'},
    download=False,
)
for retina_id, retina_name in zip(retina_terms["mp_id"], retina_terms["mp_term"]):
    df = download_per_term(retina_id, requested_fields)
    calculate_colony(df, retina_name)

Number of found documents: 14


5000it [00:00, 64688.98it/s]                                                                                                                                               
15000it [00:00, 21131.41it/s]                                                                                                                                              


Saved deduplicated data to: output/abnormal_retina_vasculature_morphology_deduplicated.csv
- ABNORMAL RETINA VASCULATURE MORPHOLOGY
Number of lines with significant phenotype: 201
Number of lines without significant result: 9117
Total number of lines: 9318

- ABNORMAL RETINA NEURONAL LAYER MORPHOLOGY: No data found.



5000it [00:00, 19336.84it/s]                                                                                                                                               
  0%|                                                                                                                                             | 0/1643 [00:00<?, ?it/s]

Saved deduplicated data to: output/increased_total_retina_thickness_deduplicated.csv
- INCREASED TOTAL RETINA THICKNESS
Number of lines with significant phenotype: 200
Number of lines without significant result: 1325
Total number of lines: 1525



5000it [00:00, 28499.92it/s]                                                                                                                                               
  0%|                                                                                                                                              | 0/831 [00:00<?, ?it/s]

Saved deduplicated data to: output/abnormal_retina_pigmentation_deduplicated.csv
- ABNORMAL RETINA PIGMENTATION
Number of lines with significant phenotype: 14
Number of lines without significant result: 739
Total number of lines: 753



5000it [00:00, 39562.19it/s]                                                                                                                                               
  0%|                                                                                                                                             | 0/3626 [00:00<?, ?it/s]

Saved deduplicated data to: output/retina_degeneration_deduplicated.csv
- RETINA DEGENERATION
Number of lines with significant phenotype: 0
Number of lines without significant result: 380
Total number of lines: 380



5000it [00:00, 18911.32it/s]                                                                                                                                               
  0%|                                                                                                                                            | 0/12538 [00:00<?, ?it/s]

Saved deduplicated data to: output/abnormal_total_retina_thickness_deduplicated.csv
- ABNORMAL TOTAL RETINA THICKNESS
Number of lines with significant phenotype: 200
Number of lines without significant result: 1325
Total number of lines: 1525



15000it [00:00, 20317.74it/s]                                                                                                                                              
  0%|                                                                                                                                             | 0/1643 [00:00<?, ?it/s]

Saved deduplicated data to: output/abnormal_retina_morphology_deduplicated.csv
- ABNORMAL RETINA MORPHOLOGY
Number of lines with significant phenotype: 441
Number of lines without significant result: 8574
Total number of lines: 9015



5000it [00:00, 22096.12it/s]                                                                                                                                               


Saved deduplicated data to: output/abnormal_retina_blood_vessel_pattern_deduplicated.csv
- ABNORMAL RETINA BLOOD VESSEL PATTERN
Number of lines with significant phenotype: 2
Number of lines without significant result: 751
Total number of lines: 753

- ABNORMAL RETINA LAYER MORPHOLOGY: No data found.

- ABNORMAL RETINA PIGMENT EPITHELIUM MORPHOLOGY: No data found.



20000it [00:01, 18295.29it/s]                                                                                                                                              
  0%|                                                                                                                                             | 0/2454 [00:00<?, ?it/s]

Saved deduplicated data to: output/abnormal_retina_blood_vessel_morphology_deduplicated.csv
- ABNORMAL RETINA BLOOD VESSEL MORPHOLOGY
Number of lines with significant phenotype: 228
Number of lines without significant result: 8337
Total number of lines: 8565



5000it [00:00, 25816.98it/s]                                                                                                                                               
  0%|                                                                                                                                             | 0/2454 [00:00<?, ?it/s]

Saved deduplicated data to: output/abnormal_retina_outer_nuclear_layer_morphology_deduplicated.csv
- ABNORMAL RETINA OUTER NUCLEAR LAYER MORPHOLOGY
Number of lines with significant phenotype: 81
Number of lines without significant result: 916
Total number of lines: 997



5000it [00:00, 26162.79it/s]                                                                                                                                               
  0%|                                                                                                                                             | 0/3626 [00:00<?, ?it/s]

Saved deduplicated data to: output/abnormal_retina_inner_nuclear_layer_morphology_deduplicated.csv
- ABNORMAL RETINA INNER NUCLEAR LAYER MORPHOLOGY
Number of lines with significant phenotype: 69
Number of lines without significant result: 928
Total number of lines: 997



5000it [00:00, 20462.02it/s]                                                                                                                                               

Saved deduplicated data to: output/decreased_total_retina_thickness_deduplicated.csv
- DECREASED TOTAL RETINA THICKNESS
Number of lines with significant phenotype: 200
Number of lines without significant result: 1325
Total number of lines: 1525




