# Phenopacket store statistics

This notebook performs a quality assessment of a phenopacket-store release and calculates descriptive statistics for it.

The input file is the zip file that is or will be added to each release.

In [1]:
from ppktstore.stats import PPKtStoreStats
import pandas as pd
# Lift pandas' display limits (cell width, row count, column count) so that
# frames shown in this notebook are rendered without truncation.
for _option in ('display.max_colwidth', 'display.max_rows', 'display.max_columns'):
    pd.set_option(_option, None)

In [2]:
# Build the statistics helper from the release zip archive.
# NOTE: the path is relative; if the archive is not found there the constructor
# raises `FileNotFoundError: Not a file: all_phenopackets.zip`.
stats = PPKtStoreStats(input_zipfile="all_phenopackets.zip")

FileNotFoundError: Not a file: all_phenopackets.zip

In [None]:
# Fetch the frame exposed by `get_df()` and preview its first two rows.
# NOTE(review): presumably one row per phenopacket — confirm against ppktstore.
df = stats.get_df()
df.head(2)

In [None]:
# Turn the descriptive-statistics mapping into a two-column table
# ("item", "value") and show up to the first 30 entries.
stats_d = stats.get_descriptive_stats()
items = [{"item": name, "value": value} for name, value in stats_d.items()]
df = pd.DataFrame(items)
df.head(30)

# Display distribution of counts of phenopackets per disease

In [None]:
# Per-disease tally of phenopackets; preview the first two rows.
# The "count" column of this frame is what gets plotted in the histogram cell.
counts_per_disease = stats.get_counts_per_disease_df()
counts_per_disease.head(2)

In [None]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

def plot_with_max(
        disease_counts: pd.Series,
        max_val: int,
        ax=None,
        ):
    """
    Draw a bar chart showing how many diseases have a given number of cases.

    Case counts from 1 to ``max_val - 1`` each get their own bar; every count
    of ``max_val`` or more is pooled into one final ``≥max_val`` bar.

    :param disease_counts: a series with one case count per disease.
    :param max_val: positive `int`, the case count at which the pooled
        overflow bar starts.
    :param ax: the Matplotlib axes to draw on. If `None`, a new
        7x4 inch figure at 300 dpi is created.
    """
    assert isinstance(max_val, int) and max_val > 0, '`max_val` must be a positive `int`'

    if ax is None:
        _, ax = plt.subplots(figsize=(7, 4), dpi=300)

    # The closing bin edge must lie strictly above `max_val`. Using the raw
    # maximum alone breaks `pd.cut` (non-increasing edges) whenever no disease
    # reaches `max_val` cases, or when the series is empty (maximum is NaN).
    largest = disease_counts.max()
    if pd.isna(largest) or largest < max_val:
        largest = max_val

    bins = list(range(1, max_val + 1)) + [largest + 1]
    labels = list(map(str, bins[:-2])) + [f'  ≥{max_val}']
    # Left-closed bins: [1, 2), [2, 3), ..., [max_val, largest + 1).
    cats = pd.cut(
        x=disease_counts,
        bins=bins,
        labels=labels,
        right=False,
    )
    # `sort_index` restores the category order so bars run from '1' upwards.
    cat_count = cats.value_counts().sort_index()
    ax.bar(x=cat_count.index, height=cat_count.values)
    ax.set(
        xlabel='Number of cases',
        ylabel='Disease count',
    )
    ax.grid(axis='y')
    # Set ticks on the axes we drew on, not on pyplot's "current" axes, so the
    # function also behaves when the caller passes in its own `ax`.
    ax.set_xticks(np.arange(0, max_val, step=1))


In [None]:
# Histogram of phenopackets per disease, pooling diseases with 25 or more
# cases into the last bar, then write the current figure to a PNG file.
plot_with_max(disease_counts=counts_per_disease["count"], max_val=25)
plt.savefig('counts_per_disease.png')

# Check all disease identifiers
They should all be CURIEs with the prefixes OMIM or MONDO. There should be no whitespace between the colon and the suffix.

In [None]:
# Run the disease-identifier check. The return value is bound to `df`, so it is
# not echoed as cell output.
# NOTE(review): presumably the offending identifiers are returned or printed by
# the method — confirm against ppktstore.
df = stats.check_disease_id()

# Check for duplicates
In some cases, duplicate phenopackets were inadvertently added to some of the initial cohorts at earlier stages of the project. This function lists phenopacket identifiers grouped by variant, which makes this kind of error easier to spot.

In [None]:
# List possible duplicate phenopackets for ERI1. The return value is bound to
# `df`, so it is not echoed as cell output.
# `stats` was already built from the release archive in an earlier cell, so no
# file path is needed here.
df = stats.show_possible_duplicates_by_variant("ERI1")

# Quality control
The following commands perform several consistency checks.

In [None]:
# Consistency check: look for phenopackets that carry no variants. The return
# value is bound to `df`, so it is not echoed as cell output.
df = stats.find_phenopackets_with_no_variants()

In [None]:
# Same duplicate listing as for other genes, this time for ARPC5; the return
# value (if any) is left as the cell's output.
stats.show_possible_duplicates_by_variant("ARPC5")

In [None]:
# Consistency check: look for phenopackets that carry no disease; the return
# value (if any) is left as the cell's output.
stats.find_phenopackets_with_no_disease()