# How to Get Raw-QCed Cardiovascular Data?

To learn more about Solr requests, see the workshop [repository](https://github.com/mpi2/impc-data-api-workshop/tree/main), which contains all the materials.

### To download the data

### Set up helper functions
1. `solr_request` — Performs a single Solr request
2. `batch_request` — Calls `solr_request` multiple times with `params` to retrieve the results in chunks of `batch_size` rows at a time.
3. `facet_request` — Performs a faceting Solr request
4. `entity_iterator` — Generator function that fetches results from the Solr server in chunks using pagination.
5. `iterator_solr_request` — Fetches results in batches from the Solr API and writes them to a file.

In [1]:
import csv
import json
import os
from urllib.parse import unquote

import numpy as np
import pandas as pd
import requests
from IPython.display import display
from tqdm import tqdm

# Display the whole dataframe <15
# pd.set_option('display.max_rows', 15)
# pd.set_option('display.max_columns', None)

# Create helper function
def solr_request(core, params, silent=False):
    """Send one query to an IMPC Solr core and return the matching rows.

    Args:
        core: Name of the Solr core; it is inserted into the request URL.
        params: Query parameters forwarded unchanged to the ``/select`` endpoint.
        silent: When true, do not print the request URL or the hit count and
            do not display the dataframe (useful for batch requests).

    Returns:
        On HTTP 200, a ``(num_found, df)`` tuple: ``num_found`` is the total
        number of documents matching the query and ``df`` is a Pandas
        dataframe with the documents contained in this response (the portion
        selected by the `start` and `rows` parameters). For any other status
        code the error is printed and ``None`` is returned.
    """
    verbose = not silent
    endpoint = "https://www.ebi.ac.uk/mi/impc/solr/" + core + "/select"

    response = requests.get(endpoint, params=params)
    if verbose:
        print(f"\nYour request:\n{response.request.url}\n")

    # Guard clause: report anything other than a plain success and stop here.
    if response.status_code != 200:
        print("Error:", response.status_code, response.text)
        return None

    payload = response.json()["response"]
    num_found = payload["numFound"]
    if verbose:
        print(f'Number of found documents: {num_found}\n')

    # One dataframe row per returned document.
    df = pd.DataFrame(list(payload["docs"]))
    if verbose:
        display(df)
    return num_found, df

def batch_request(core, params, batch_size):
    """Retrieve every row matching `params` by calling `solr_request` in chunks.

    Args:
        core: Name of the Solr core to query.
        params: Solr query parameters. Any `start`/`rows` entries are
            overridden, because paging is driven by `batch_size`.
        batch_size: Number of rows to request per call; must be positive.

    Returns:
        A single dataframe with all chunks concatenated (index reset), or an
        empty dataframe when the query matches nothing.

    Raises:
        ValueError: If `batch_size` is not positive (the paging loop could
            never terminate).
    """
    if batch_size <= 0:
        raise ValueError("batch_size must be a positive integer")
    # Membership must be tested on the dict itself, not on the literal "params".
    if "rows" in params:
        print("WARN: You have specified the `params` -> `rows` value. It will be ignored, because the data is retrieved `batch_size` rows at a time.")
    # Determine the total number of rows. Note that we do not request any data (rows = 0).
    num_results, _ = solr_request(core=core, params={**params, "start": 0, "rows": 0}, silent=True)
    chunks = []
    # Request chunks until we have complete data.
    with tqdm(total=num_results) as pbar:  # Initialize tqdm progress bar.
        for start in range(0, num_results, batch_size):
            # We don't need num_results anymore because it does not change.
            _, df_chunk = solr_request(core=core, params={**params, "start": start, "rows": batch_size}, silent=True)
            chunks.append(df_chunk)
            # Advance by the rows actually received so the bar never overshoots its total.
            pbar.update(len(df_chunk))
    # pd.concat raises on an empty list, so handle "no matches" explicitly.
    if not chunks:
        return pd.DataFrame()
    return pd.concat(chunks, ignore_index=True)

def facet_request(core, params, silent=False):
    """Performs a faceting Solr request.

    Args:
        core: Name of the Solr core; it is inserted into the request URL.
        params: Query parameters forwarded to the ``/select`` endpoint. Must
            contain ``"facet.field"``, which names the facet read from the
            response.
        silent: Suppress displaying the df and number of results (useful for batch requests).

    Returns:
        On HTTP 200, a ``(num_found, df)`` tuple: ``num_found`` is how many
        rows in total the request matched and ``df`` has one row per facet
        value, with the columns ``<facet.field>`` and ``count_per_category``.
        For any other status code the error is printed and ``None`` is returned.
    """
    base_url = "https://www.ebi.ac.uk/mi/impc/solr/"
    solr_url = base_url + core + "/select"

    response = requests.get(solr_url, params=params)
    if not silent:
        print(f"\nYour request:\n{unquote(response.request.url)}\n")

    # Anything other than a plain success is reported and ends the call.
    if response.status_code != 200:
        print("Error:", response.status_code, response.text)
        return None

    data = response.json()
    num_found = data["response"]["numFound"]
    if not silent:
        print(f'Number of found documents: {num_found}\n')

    facet_field = params["facet.field"]
    # The response lists facets as a flat [label, count, label, count, ...] sequence.
    facet_counts = data["facet_counts"]["facet_fields"][facet_field]
    # Pair every even-positioned label with the count that follows it.
    faceting_dict = dict(zip(facet_counts[::2], facet_counts[1::2]))

    # Build the two-column dataframe directly from the (label, count) pairs.
    df = pd.DataFrame(
        list(faceting_dict.items()),
        columns=[facet_field, 'count_per_category'],
    )
    if not silent:
        display(df)
    return num_found, df

# Helper function to fetch results. This function is used by the 'iterator_solr_request' function.
def entity_iterator(base_url, params):
    """Generator function to fetch results from the SOLR server in chunks using pagination

    Args:
        base_url (str): The base URL of the Solr server to fetch documents from.
        params (dict): A dictionary of parameters to include in the GET request. Must include
                       'start' and 'rows' keys, which represent the index of the first document
                       to fetch and the number of documents to fetch per request, respectively.
                       The dictionary is copied; the caller's object is not modified.

    Yields:
        dict: The next document in the response from the Solr server.

    Raises:
        requests.HTTPError: If the server answers a request with an error status.
    """
    # Page through a private copy so advancing 'start' does not leak into the caller's dict.
    query = dict(params)

    # Initialise variable to check the first request
    first_request = True

    # Call the API in chunks and yield the documents in each chunk
    while True:
        response = requests.get(base_url, params=query)
        # Surface HTTP errors directly instead of failing later while parsing the body.
        response.raise_for_status()
        data = response.json()
        docs = data["response"]["docs"]

        # Print the first request only
        if first_request:
            print(f'Your first request: {response.url}')
            first_request = False

        # Yield the documents in the current chunk
        yield from docs

        # Check if there are more results to fetch
        start = query["start"] + query["rows"]
        num_found = data["response"]["numFound"]
        if start >= num_found:
            break

        # Update the start parameter for the next request
        query["start"] = start

    # Print last request and total number of documents retrieved
    print(f'Your last request: {response.url}')
    print(f'Number of found documents: {data["response"]["numFound"]}\n')

# Function to iterate over field list and write results to a file.
def iterator_solr_request(core, params, filename='iteration_solr_request', format='json'):
    """Function to fetch results in batches from the Solr API and write them to a file
        Defaults to fetching 5000 rows at a time.

    Args:
        core (str): The name of the Solr core to fetch results from.
        params (dict): A dictionary of parameters to use in the filter query. Must include
                       'field_list' and 'field_type' keys, which represent the list of field items (i.e., list of MGI model identifiers)
                        to fetch and the type of the field (i.e., model_id) to filter on, respectively.
                        The dictionary is copied, so it can be reused for another call.
        filename (str): The name of the file to write the results to. Defaults to 'iteration_solr_request'.
        format (str): The format of the output file. Can be 'csv' or 'json'. Defaults to 'json'.

    Raises:
        ValueError: If `format` is neither 'json' nor 'csv'.
        KeyError: If `params` lacks 'field_list' or 'field_type'.
        Exception: If downloading or writing fails; the original error is chained as the cause.
    """

    # Validate format
    if format not in ['json','csv']:
        raise ValueError("Invalid format. Please use 'json' or 'csv'")

    # Base URL
    base_url = "https://www.ebi.ac.uk/mi/impc/solr/"
    solr_url = base_url + core + "/select"

    # Work on a copy: popping keys from the caller's dict would break a second call with it.
    query = dict(params)
    field_list = query.pop("field_list")
    field_type = query.pop("field_type")

    # Construct the filter query with grouped model IDs
    quoted_values = " OR ".join(f'"{value}"' for value in field_list)
    fq = f"{field_type}:({quoted_values})"

    # Show users the field and field values they passed to the function
    print("Queried field:",fq)
    # Set internal params the users should not change
    query["fq"] = fq
    query["wt"] = 'json'
    query["start"] = 0  # Start at the first result
    query["rows"] = 5000  # Fetch results in chunks of 5000

    # Creating the generator performs no request yet; downloading happens lazily
    # while the file is being written, so errors surface in the block below.
    results_generator = entity_iterator(solr_url, query)

    # Append extension to the filename
    filename = f"{filename}.{format}"

    try:
        # Explicit UTF-8 so the output does not depend on the platform's default encoding.
        with open(filename, "w", newline="", encoding="utf-8") as f:
            if format == 'csv':
                writer = None
                for item in results_generator:
                    # Initialize the CSV writer with the keys of the first item as the field names.
                    # NOTE(review): a later document carrying a key the first one lacks makes
                    # DictWriter raise ValueError - confirm all documents share the same fields.
                    if writer is None:
                        writer = csv.DictWriter(f, fieldnames=item.keys())
                        writer.writeheader()
                    # Write the item to the CSV file
                    writer.writerow(item)
            elif format == 'json':
                # Stream a JSON array item by item without loading everything into memory.
                f.write('[')
                for i, item in enumerate(results_generator):
                    if i != 0:
                        f.write(',')
                    json.dump(item, f)
                f.write(']')
    except Exception as e:
        raise Exception("An error occurred while writing the file: " + str(e)) from e

    print(f"File {filename} was created.")

def parse_metadata(metadata_list):
    """Convert a list of ``"key = value"`` strings into a dictionary.

    Args:
        metadata_list: Iterable of strings of the form ``"key = value"``.
            ``None`` or a float (how pandas represents a missing cell, NaN)
            is treated as "no metadata".

    Returns:
        dict: Maps each key to its value. Only the first ``" = "`` in an
        entry separates key from value, so values may themselves contain
        ``" = "``. Entries without the separator are skipped.
    """
    # Rows without metadata arrive as None/NaN rather than as a list.
    if metadata_list is None or isinstance(metadata_list, float):
        return {}

    metadata_dict = {}
    for item in metadata_list:
        key, separator, value = item.partition(' = ')
        if not separator:
            # Not a "key = value" entry; nothing sensible to record.
            continue
        metadata_dict[key] = value
    return metadata_dict

In [2]:
# Columns requested for the ECG, HWT, ECH and OWT downloads.
procedure_fields = 'experiment_id,specimen_id,observation_id,biological_sample_group,pipeline_stable_id,procedure_stable_id,phenotyping_center,production_center,external_sample_id,strain_name,sex,zygosity,date_of_birth,date_of_experiment,age_in_weeks,parameter_stable_id,parameter_name,data_point,unit,observation_type,metadata_group,weight,weight_date,weight_days_old,weight_parameter_stable_id,metadata,life_stage_name,gene_symbol'

# Columns requested for the body weight (BWT) download: no unit and no weight_* fields.
body_weight_fields = 'experiment_id,specimen_id,observation_id,biological_sample_group,pipeline_stable_id,procedure_stable_id,phenotyping_center,production_center,external_sample_id,strain_name,sex,zygosity,date_of_birth,date_of_experiment,age_in_weeks,parameter_stable_id,parameter_name,data_point,observation_type,metadata_group,metadata,life_stage_name,gene_symbol'

# One entry per download, in execution order:
# (output file, Solr query, rows per request, requested columns).
downloads = [
    # ECG
    ("ECG_raw.json", 'procedure_name:"Electrocardiogram (ECG)"', 50000, procedure_fields),
    # HWT
    ("HWT_raw.json", 'procedure_name:"Heart Weight"', 50000, procedure_fields),
    # ECH
    ("ECH_raw.json", 'procedure_name:"Echo"', 10000, procedure_fields),
    # OWT
    ("OWT_raw.json", 'procedure_name:"Organ Weight"', 50000, procedure_fields),
    # Body weight (BWT)
    (
        "BWT_raw.json",
        'parameter_stable_id:*_CAL_001_001 OR parameter_stable_id:*_DXA_001_001 OR parameter_stable_id:*_BWT_001_001',
        50000,
        body_weight_fields,
    ),
]

# Request each dataset from the 'experiment' core and save it as JSON records.
for output_file, query, rows_per_request, field_list in downloads:
    df = batch_request(
        core='experiment',
        params={'q': query, 'fl': field_list},
        batch_size=rows_per_request,
    )
    df.to_json(output_file, orient="records")

# 1. Download Electrocardiogram (ECG) data
We use the `procedure_name` field to select the Electrocardiogram data.

In [4]:
# Load the raw ECG records that the download cell saved to ECG_raw.json.
df_ecg = pd.read_json('ECG_raw.json')

In [5]:
# Rename data_point -> value
df_ecg = df_ecg.rename(columns={'data_point': 'value'})

# Turn each row's list of "key = value" metadata strings into a dictionary
metadata_dicts = df_ecg['metadata'].apply(parse_metadata)

# One column per metadata key. Reusing df_ecg's index keeps the column-wise
# concat below row-aligned even if df_ecg does not carry a default index.
metadata_df = pd.DataFrame(metadata_dicts.tolist(), index=df_ecg.index)

# Merge the new DataFrame with the original one
result_df = pd.concat([df_ecg.drop(columns=['metadata']), metadata_df], axis=1)

# Procedure IDs whose parameters are looked up for their units
procID_list1 = [str(i) for i in range(1415, 1433)]
procID_list2 = [str(i) for i in range(647, 654)]
procID_list3 = ['108','126','932', '1134', '1156', '1000126']
procID_list = procID_list1 + procID_list2 + procID_list3

impress_api = 'https://api.mousephenotype.org/impress/'

# Get dictionary of ID and units. Fail loudly if it cannot be fetched:
# otherwise unit_dict would be undefined (or left over from an earlier cell).
response = requests.get(impress_api + 'unit/list')
response.raise_for_status()
unit_dict = response.json()

# Get parameter_unit list of dictionaries, replacing each unit ID with the
# value from unit_dict when the ID is known
parameter_list = []
for procID in procID_list:
    response = requests.get(impress_api + 'parameter/belongingtoprocedure/full/' + procID)
    if response.status_code != 200:
        # Keep going (best effort), but make the gap visible.
        print(f"WARN: could not fetch parameters for procedure {procID} (HTTP {response.status_code}); units for its parameters may be missing.")
        continue
    for d in response.json():
        parameter_list.append({
            'parameterKey': d['parameterKey'],
            'unit': unit_dict.get(str(d['unit']), d['unit']),
        })

# Convert parameter_unit list of dicts to a dictionary for easier lookup
parameter_unit_dict = {item['parameterKey']: item['unit'] for item in parameter_list}

# Add 'unit' column based on parameter_stable_id
result_df['unit'] = result_df['parameter_stable_id'].map(parameter_unit_dict)

# Specify list of columns in the final dataframe
ecg_columns = [
    'experiment_id', 'specimen_id', 'observation_id', 'biological_sample_group',
    'pipeline_stable_id', 'procedure_stable_id', 'phenotyping_center', 'production_center',
    'external_sample_id', 'strain_name', 'sex', 'zygosity',
    'date_of_birth', 'date_of_experiment', 'age_in_weeks',
    'parameter_stable_id', 'parameter_name', 'value', 'unit',
    'observation_type', 'metadata_group',
    'weight', 'weight_date', 'weight_days_old', 'weight_parameter_stable_id',
    'Analysis Software', 'Anesthetic', 'Equipment Manufacturer', 'Equipment Model',
    'gene_symbol', 'life_stage_name',
]
# Tab-separated form of the column list (the name stays defined for later cells).
ecg_csv = "\t".join(ecg_columns)
result_df_slice = result_df.loc[:, ecg_columns]

# Split the DataFrame by life stage; the column is dropped once it has served as the filter
ea_ecg_df = result_df_slice[result_df_slice['life_stage_name'] == 'Early adult'].drop(columns=['life_stage_name'])
la_ecg_df = result_df_slice[result_df_slice['life_stage_name'] == 'Late adult'].drop(columns=['life_stage_name'])

# to_csv does not create missing directories
os.makedirs("output", exist_ok=True)
ea_ecg_df.to_csv("output/EA_ECG.csv", index=False)
la_ecg_df.to_csv("output/LA_ECG.csv", index=False)

# 2. Download Heart Weight (HWT) data

In [6]:
# Load the raw Heart Weight records that the download cell saved to HWT_raw.json.
df_hwt = pd.read_json('HWT_raw.json')

In [7]:
# Rename data_point -> value
df_hwt = df_hwt.rename(columns={'data_point': 'value'})

# Turn each row's list of "key = value" metadata strings into a dictionary
metadata_dicts = df_hwt['metadata'].apply(parse_metadata)

# One column per metadata key. Reusing df_hwt's index keeps the column-wise
# concat below row-aligned even if df_hwt does not carry a default index.
metadata_df = pd.DataFrame(metadata_dicts.tolist(), index=df_hwt.index)

# Merge the new DataFrame with the original one
result_df = pd.concat([df_hwt.drop(columns=['metadata']), metadata_df], axis=1)

# Procedure IDs whose parameters are looked up for their units
# procID was used for api from HWT page: https://www.mousephenotype.org/impress/ProcedureInfo?action=list&procID=601&pipeID=7
procID_list1 = [str(i) for i in range(600, 607)]
procID_list2 = ['986', '1038','1060','1072', '1091', '1108', '1141', '1163', '1292', '1332', '1333', '100141']
procID_list3 = ['100']
procID_list = procID_list1 + procID_list2 + procID_list3

impress_api = 'https://api.mousephenotype.org/impress/'

# Get dictionary of ID and units. Fail loudly if it cannot be fetched:
# otherwise unit_dict would be undefined (or left over from an earlier cell).
response = requests.get(impress_api + 'unit/list')
response.raise_for_status()
unit_dict = response.json()

# Get parameter_unit list of dictionaries, replacing each unit ID with the
# value from unit_dict when the ID is known
parameter_list = []
for procID in procID_list:
    response = requests.get(impress_api + 'parameter/belongingtoprocedure/full/' + procID)
    if response.status_code != 200:
        # Keep going (best effort), but make the gap visible.
        print(f"WARN: could not fetch parameters for procedure {procID} (HTTP {response.status_code}); units for its parameters may be missing.")
        continue
    for d in response.json():
        parameter_list.append({
            'parameterKey': d['parameterKey'],
            'unit': unit_dict.get(str(d['unit']), d['unit']),
        })

# Convert parameter_unit list of dicts to a dictionary for easier lookup
parameter_unit_dict = {item['parameterKey']: item['unit'] for item in parameter_list}

# Add 'unit' column based on parameter_stable_id
result_df['unit'] = result_df['parameter_stable_id'].map(parameter_unit_dict)

# Specify list of columns in the final dataframe
hwt_columns = [
    'experiment_id', 'specimen_id', 'observation_id', 'biological_sample_group',
    'pipeline_stable_id', 'procedure_stable_id', 'phenotyping_center', 'production_center',
    'external_sample_id', 'strain_name', 'sex', 'zygosity',
    'date_of_birth', 'date_of_experiment', 'age_in_weeks',
    'parameter_stable_id', 'parameter_name', 'value', 'unit',
    'observation_type', 'metadata_group',
    'weight', 'weight_date', 'weight_days_old', 'weight_parameter_stable_id',
    'Equipment manufacturer', 'Equipment model',
    'gene_symbol', 'life_stage_name',
]
# Tab-separated form of the column list (the name stays defined for later cells).
hwt_csv = "\t".join(hwt_columns)
result_df_slice = result_df.loc[:, hwt_columns]

# Split the DataFrame by life stage; the column is dropped once it has served as the filter
ea_hwt_df = result_df_slice[result_df_slice['life_stage_name'] == 'Early adult'].drop(columns=['life_stage_name'])
la_hwt_df = result_df_slice[result_df_slice['life_stage_name'] == 'Late adult'].drop(columns=['life_stage_name'])

# to_csv does not create missing directories
os.makedirs("output", exist_ok=True)
ea_hwt_df.to_csv("output/EA_HWT.csv", index=False)
la_hwt_df.to_csv("output/LA_HWT.csv", index=False)

# 3. Download Echo (ECH) data.

In [8]:
# Load the raw Echo records that the download cell saved to ECH_raw.json.
df_ech = pd.read_json('ECH_raw.json')

In [9]:
def get_impressTime(row):
    """Look up the schedule time of a procedure within a pipeline via the IMPReSS API.

    Args:
        row: Mapping (e.g. a dataframe row) providing 'pipeline_stable_id'
            and 'procedure_stable_id'.

    Returns:
        The 'time' value of the first schedule of the pipeline whose
        'scheduleId' is also one of the procedure's schedule IDs, or None
        when no schedule matches.

    Raises:
        requests.HTTPError: If either API request is answered with an error
            status (previously this surfaced as an unbound-variable error).
    """
    pipeline_stable_id = row['pipeline_stable_id']
    procedure_stable_id = row['procedure_stable_id']
    impress_api = 'https://api.mousephenotype.org/impress/'

    # Get scheduleId for procedure_stable_id
    response = requests.get(impress_api + 'procedure/bykey/' + procedure_stable_id)
    response.raise_for_status()
    schedule_ids = {entry['scheduleId'] for entry in response.json()}

    # Based on scheduleId and pipeline_stable_id get time
    response = requests.get(impress_api + 'schedule/belongingtopipeline/full/' + pipeline_stable_id)
    response.raise_for_status()
    for pipeline_schedule in response.json():
        if pipeline_schedule['scheduleId'] in schedule_ids:
            return pipeline_schedule['time']

    # No schedule of this pipeline belongs to the procedure.
    return None

# Rename data_point -> value and weight_date -> body_weight_doe
df_ech = df_ech.rename(columns={'data_point': 'value', 'weight_date': 'body_weight_doe'})

# Turn each row's list of "key = value" metadata strings into a dictionary
metadata_dicts = df_ech['metadata'].apply(parse_metadata)

# One column per metadata key. Reusing df_ech's index keeps the column-wise
# concat below row-aligned even if df_ech does not carry a default index.
metadata_df = pd.DataFrame(metadata_dicts.tolist(), index=df_ech.index)

# Merge the new DataFrame with the original one
result_df = pd.concat([df_ech.drop(columns=['metadata']), metadata_df], axis=1)

# Procedure IDs whose parameters are looked up for their units
# procID was used for api from ECH page: https://www.mousephenotype.org/impress/ProcedureInfo?action=list&procID=654&pipeID=7
procID_list1 = [str(i) for i in range(654, 659)]
procID_list2 = ['109', '450', '1046', '1065', '1082', '1115']
procID_list3 = ['1256']
procID_list = procID_list1 + procID_list2 + procID_list3

impress_api = 'https://api.mousephenotype.org/impress/'

# Get dictionary of ID and units. Fail loudly if it cannot be fetched:
# otherwise unit_dict would be undefined (or left over from an earlier cell).
response = requests.get(impress_api + 'unit/list')
response.raise_for_status()
unit_dict = response.json()

# Get parameter_unit list of dictionaries, replacing each unit ID with the
# value from unit_dict when the ID is known
parameter_list = []
for procID in procID_list:
    response = requests.get(impress_api + 'parameter/belongingtoprocedure/full/' + procID)
    if response.status_code != 200:
        # Keep going (best effort), but make the gap visible.
        print(f"WARN: could not fetch parameters for procedure {procID} (HTTP {response.status_code}); units for its parameters may be missing.")
        continue
    for d in response.json():
        parameter_list.append({
            'parameterKey': d['parameterKey'],
            'unit': unit_dict.get(str(d['unit']), d['unit']),
        })

# Convert parameter_unit list of dicts to a dictionary for easier lookup
parameter_unit_dict = {item['parameterKey']: item['unit'] for item in parameter_list}

# Add 'unit' column based on parameter_stable_id
result_df['unit'] = result_df['parameter_stable_id'].map(parameter_unit_dict)

# Add 'impressTimepoint': one API lookup per distinct (pipeline, procedure)
# pair rather than one per row
pipeline_procedure = result_df.groupby(['pipeline_stable_id', 'procedure_stable_id']).size().reset_index(name='count')
pipeline_procedure['impress_time'] = pipeline_procedure.apply(get_impressTime, axis=1)
# NOTE: this cast raises if get_impressTime found no schedule for some pair
pipeline_procedure['impress_time'] = pipeline_procedure['impress_time'].astype(int)

# Merge the dataframes on 'pipeline_stable_id' and 'procedure_stable_id'
merged_df = pd.merge(result_df, pipeline_procedure[['pipeline_stable_id', 'procedure_stable_id', 'impress_time']],
                     on=['pipeline_stable_id', 'procedure_stable_id'], how='left')

# Rename the 'impress_time' column to 'impressTimepoint'
merged_df = merged_df.rename(columns={'impress_time': 'impressTimepoint'})

# Add column with age in weeks for body_weight (whole weeks; missing stays NaN)
merged_df['body_weight_age_in_weeks'] = merged_df['weight_days_old'].apply(lambda x: x // 7 if pd.notnull(x) else np.nan)

# Absolute difference between the body-weight age in weeks and the timepoint
merged_df['(age_in_weeks-timepoint)'] = (merged_df['body_weight_age_in_weeks'] - merged_df['impressTimepoint']).abs()

# Specify list of columns in the final dataframe
ech_columns = [
    'experiment_id', 'specimen_id', 'observation_id', 'biological_sample_group',
    'pipeline_stable_id', 'procedure_stable_id', 'phenotyping_center', 'production_center',
    'external_sample_id', 'strain_name', 'sex', 'zygosity',
    'date_of_birth', 'date_of_experiment', 'age_in_weeks',
    'parameter_stable_id', 'parameter_name', 'value', 'unit',
    'observation_type', 'metadata_group',
    'weight', 'weight_days_old', 'weight_parameter_stable_id',
    'Anesthetic', 'Equipment Manufacturer', 'Equipment Model',
    'impressTimepoint', '(age_in_weeks-timepoint)',
    'gene_symbol', 'life_stage_name',
]
# Tab-separated form of the column list (the name stays defined for later cells).
ech_csv = "\t".join(ech_columns)
result_df_slice = merged_df.loc[:, ech_columns]

# Split the DataFrame by life stage; the column is dropped once it has served as the filter
ea_ech_df = result_df_slice[result_df_slice['life_stage_name'] == 'Early adult'].drop(columns=['life_stage_name'])
la_ech_df = result_df_slice[result_df_slice['life_stage_name'] == 'Late adult'].drop(columns=['life_stage_name'])

# to_csv does not create missing directories
os.makedirs("output", exist_ok=True)
ea_ech_df.to_csv("output/EA_ECH.csv", index=False)
la_ech_df.to_csv("output/LA_ECH.csv", index=False)

# 4. Download Organ Weight (OWT) data.

In [10]:
# Load the raw Organ Weight records that the download cell saved to OWT_raw.json.
df_owt = pd.read_json('OWT_raw.json')

In [11]:
# Rename data_point -> value
df_owt = df_owt.rename(columns={'data_point': 'value'})

# Turn each row's list of "key = value" metadata strings into a dictionary
metadata_dicts = df_owt['metadata'].apply(parse_metadata)

# One column per metadata key. Reusing df_owt's index keeps the column-wise
# concat below row-aligned even if df_owt does not carry a default index.
metadata_df = pd.DataFrame(metadata_dicts.tolist(), index=df_owt.index)

# Merge the new DataFrame with the original one
result_df = pd.concat([df_owt.drop(columns=['metadata']), metadata_df], axis=1)

# Procedure IDs whose parameters are looked up for their units
procID_list1 = [str(i) for i in range(939, 944)]
procID_list2 = ['106', '247', '275','1059', '1125', '1229', '1263', '1291', '1366', '1412']
procID_list3 = ['1000247', '1000939', '1000940', '1000941', '1000942', '1000943', '1001125', '1000106']
procID_list4 = [str(i) for i in range(600, 607)]
procID_list5 = ['986', '1038','1060','1072', '1091', '1108', '1141', '1163', '1292', '1332', '1333', '100141']
procID_list6 = ['100']
procID_list = procID_list1 + procID_list2 + procID_list3 + procID_list4 + procID_list5 + procID_list6

impress_api = 'https://api.mousephenotype.org/impress/'

# Get dictionary of ID and units. Fail loudly if it cannot be fetched:
# otherwise unit_dict would be undefined (or left over from an earlier cell).
response = requests.get(impress_api + 'unit/list')
response.raise_for_status()
unit_dict = response.json()

# Get parameter_unit list of dictionaries, replacing each unit ID with the
# value from unit_dict when the ID is known
parameter_list = []
for procID in procID_list:
    response = requests.get(impress_api + 'parameter/belongingtoprocedure/full/' + procID)
    if response.status_code != 200:
        # Keep going (best effort), but make the gap visible.
        print(f"WARN: could not fetch parameters for procedure {procID} (HTTP {response.status_code}); units for its parameters may be missing.")
        continue
    for d in response.json():
        parameter_list.append({
            'parameterKey': d['parameterKey'],
            'unit': unit_dict.get(str(d['unit']), d['unit']),
        })

# Convert parameter_unit list of dicts to a dictionary for easier lookup
parameter_unit_dict = {item['parameterKey']: item['unit'] for item in parameter_list}

# Add 'unit' column based on parameter_stable_id
result_df['unit'] = result_df['parameter_stable_id'].map(parameter_unit_dict)

# Specify list of columns in the final dataframe
owt_columns = [
    'experiment_id', 'specimen_id', 'observation_id', 'biological_sample_group',
    'pipeline_stable_id', 'procedure_stable_id', 'phenotyping_center', 'production_center',
    'external_sample_id', 'strain_name', 'sex', 'zygosity',
    'date_of_birth', 'date_of_experiment', 'age_in_weeks',
    'parameter_stable_id', 'parameter_name', 'value', 'unit',
    'observation_type', 'metadata_group',
    'weight', 'weight_date', 'weight_days_old', 'weight_parameter_stable_id',
    'Equipment manufacturer', 'Equipment model',
    'gene_symbol', 'life_stage_name',
]
# Tab-separated form of the column list (the name stays defined for later cells).
owt_csv = "\t".join(owt_columns)
result_df_slice = result_df.loc[:, owt_columns]

# Split the DataFrame by life stage; the column is dropped once it has served as the filter
ea_owt_df = result_df_slice[result_df_slice['life_stage_name'] == 'Early adult'].drop(columns=['life_stage_name'])
la_owt_df = result_df_slice[result_df_slice['life_stage_name'] == 'Late adult'].drop(columns=['life_stage_name'])

# to_csv does not create missing directories
os.makedirs("output", exist_ok=True)
ea_owt_df.to_csv("output/EA_OWT.csv", index=False)
la_owt_df.to_csv("output/LA_OWT.csv", index=False)

# 5. Download Body Weight (BWT) data

In [12]:
# Load the raw body weight records that the download cell saved to BWT_raw.json.
df_bwt = pd.read_json('BWT_raw.json')

In [13]:
# Rename data_point -> value
df_bwt = df_bwt.rename(columns={'data_point': 'value'})

# Turn each row's list of "key = value" metadata strings into a dictionary
metadata_dicts = df_bwt['metadata'].apply(parse_metadata)

# One column per metadata key. Reusing df_bwt's index keeps the column-wise
# concat below row-aligned even if df_bwt does not carry a default index.
metadata_df = pd.DataFrame(metadata_dicts.tolist(), index=df_bwt.index)

# Merge the new DataFrame with the original one
result_df = pd.concat([df_bwt.drop(columns=['metadata']), metadata_df], axis=1)

# Procedure IDs whose parameters are looked up for their units
procID_list1 = [str(i) for i in range(623, 632)]
procID_list2 = ['90', '103', '135', '137', '329', '331', '336', '346', '388', '369', '402', '423', '431', '432', '454', '462', '467', '470', '471', '477', '1076', '1186', '1094', '1102', '1120', '1137','1151', '1158', '1159', '1165', '1166', '1259', '1384' ]
procID_list3 = ['1314', '1330', '992', '1000135', '1000942', '1000103', '1000329', '1000331', '1000336', '1000338', '1000431', '1000432', '1000454']
procID_list4 = [str(i) for i in range(548, 556)]
procID_list5 = ['86', '121', '153', '240', '977', '1045', '1057', '1081', '1110', '1191', '1254']
procID_list6 = [str(i) for i in range(524, 530)]
procID_list7 = [str(i) for i in range(696, 703)]
procID_list8 = [str(i) for i in range(852, 860)]
procID_list = procID_list1 + procID_list2 + procID_list3 + procID_list4 + procID_list5 + procID_list6 + procID_list7 + procID_list8

impress_api = 'https://api.mousephenotype.org/impress/'

# Get dictionary of ID and units. Fail loudly if it cannot be fetched:
# otherwise unit_dict would be undefined (or left over from an earlier cell).
response = requests.get(impress_api + 'unit/list')
response.raise_for_status()
unit_dict = response.json()

# Get parameter_unit list of dictionaries, replacing each unit ID with the
# value from unit_dict when the ID is known
parameter_list = []
for procID in procID_list:
    response = requests.get(impress_api + 'parameter/belongingtoprocedure/full/' + procID)
    if response.status_code != 200:
        # Keep going (best effort), but make the gap visible.
        print(f"WARN: could not fetch parameters for procedure {procID} (HTTP {response.status_code}); units for its parameters may be missing.")
        continue
    for d in response.json():
        parameter_list.append({
            'parameterKey': d['parameterKey'],
            'unit': unit_dict.get(str(d['unit']), d['unit']),
        })

# Convert parameter_unit list of dicts to a dictionary for easier lookup
parameter_unit_dict = {item['parameterKey']: item['unit'] for item in parameter_list}

# Add 'unit' column based on parameter_stable_id
result_df['unit'] = result_df['parameter_stable_id'].map(parameter_unit_dict)

# Specify list of columns in the final dataframe
bwt_columns = [
    'experiment_id', 'specimen_id', 'observation_id', 'biological_sample_group',
    'pipeline_stable_id', 'procedure_stable_id', 'phenotyping_center', 'production_center',
    'external_sample_id', 'strain_name', 'sex', 'zygosity',
    'date_of_birth', 'date_of_experiment', 'age_in_weeks',
    'parameter_stable_id', 'parameter_name', 'value', 'unit',
    'observation_type', 'metadata_group',
    'Equipment manufacturer', 'Equipment model',
    'gene_symbol', 'life_stage_name',
]
# Tab-separated form of the column list (the name stays defined for later cells).
bwt_csv = "\t".join(bwt_columns)
result_df_slice = result_df.loc[:, bwt_columns]

# Split the DataFrame by life stage; the column is dropped once it has served as the filter
ea_bwt_df = result_df_slice[result_df_slice['life_stage_name'] == 'Early adult'].drop(columns=['life_stage_name'])
la_bwt_df = result_df_slice[result_df_slice['life_stage_name'] == 'Late adult'].drop(columns=['life_stage_name'])

# to_csv does not create missing directories
os.makedirs("output", exist_ok=True)
ea_bwt_df.to_csv("output/EA_BWT.csv", index=False)
la_bwt_df.to_csv("output/LA_BWT.csv", index=False)