In [1]:
import os
import json
import glob
import pandas as pd

# Directory containing the phenopacket JSON files
directory = '../supplemental_data/phenopackets'  # Change this to your directory path


def collect_phenotypic_feature_ids(phenopacket_dir):
    """Gather the phenotypic feature IDs from every ``*.json`` file in a directory.

    Parameters
    ----------
    phenopacket_dir : str
        Directory searched (non-recursively) for files matching ``*.json``.

    Returns
    -------
    tuple
        ``(feature_ids, n_files)`` where ``feature_ids`` is a list holding the
        ``type.id`` of every entry of each file's ``phenotypicFeatures`` list
        (duplicates kept, files visited in sorted path order) and ``n_files``
        is the number of JSON files that were read.

    Raises
    ------
    KeyError
        If a phenotypic feature has no ``type`` or ``type.id`` entry.
    json.JSONDecodeError
        If a matched file does not contain valid JSON.

    Notes
    -----
    Every listed feature is counted as-is; no filtering is applied.
    NOTE(review): if the files mark unobserved phenotypes (e.g. via an
    ``excluded`` flag), confirm that counting them is intended.
    """
    feature_ids = []
    n_files = 0
    # sorted() makes the traversal order (and thus the ID order) reproducible;
    # glob itself returns paths in arbitrary, OS-dependent order.
    for filepath in sorted(glob.glob(os.path.join(phenopacket_dir, '*.json'))):
        # JSON is UTF-8 by specification; do not rely on the platform default.
        with open(filepath, 'r', encoding='utf-8') as file:
            data = json.load(file)
        # A file without 'phenotypicFeatures' still counts as a case with 0 features.
        for feature in data.get('phenotypicFeatures', []):
            feature_ids.append(feature['type']['id'])
        n_files += 1
    return feature_ids, n_files


# all_ids: every feature ID occurrence; total_cases: number of phenopacket files
all_ids, total_cases = collect_phenotypic_feature_ids(directory)

# One row per feature occurrence (duplicates included)
df = pd.DataFrame(all_ids, columns=['PhenotypicFeatureID'])

# Count the total number of unique IDs
unique_ids_count = df['PhenotypicFeatureID'].nunique()

# Average number of phenotypic features per case (0 when no files were found)
average_features_per_case = len(all_ids) / total_cases if total_cases > 0 else 0

# Display the results
print(f"Total number of unique phenotypic feature IDs: {unique_ids_count}")
print(f"Average number of phenotypic features per case: {average_features_per_case:.2f}")

# One row per distinct phenotypic feature ID, kept for later inspection
# (e.g. `unique_ids.head()`); `df` itself still contains the duplicates.
unique_ids = df.drop_duplicates().reset_index(drop=True)

Total number of unique phenotypic feature IDs: 2975
Average number of phenotypic features per case: 15.99


In [3]:
# Count the number of unique genes in the phenopackets.
# -r prints raw (unquoted) IDs; select(. != null) drops interpretations that have no
# geneContext.valueId so a literal "null" line is not counted as a gene.
# `{} +` passes many files to each jq invocation instead of spawning one jq per file.
!find ../supplemental_data/phenopackets -name "*.json" -exec jq -r '.interpretations[].diagnosis.genomicInterpretations[].variantInterpretation.variationDescriptor.geneContext.valueId | select(. != null)' {} + | sort -u | wc -l

     336


In [5]:
# Count the number of unique diagnosis disease IDs in the phenopackets
!find ../supplemental_data/phenopackets -name "*.json" -exec jq -r '.interpretations[].diagnosis.disease.id' {} + | sort -u | wc -l

     378


In [4]:
# Number of distinct publication IDs listed under metaData.externalReferences
!find ../supplemental_data/phenopackets -name "*.json" -exec jq -r '.metaData.externalReferences[].id' {} \; | sort -u | wc -l

     726
