# Filter initial dataset for plots


The dataset containing the information about the $estimated\;\beta$ needs to be filtered to obtain only the binary traits that are in one of the following therapeutic areas (listed in `therapeutic_area_classes` below).


### Load the datasets


In [6]:
import polars as pl


In [7]:
# Names of the 0/1 indicator columns, one per therapeutic area, used throughout
# the notebook. The order is kept fixed because it drives column/struct-field order.
therapeutic_area_classes = [
    "Haematology",
    "Metabolic",
    "Congenital",
    "Signs/symptoms",
    "Neurology",
    "Immune",
    "Psychiatry",
    "Dermatology",
    "Ophthalmology",
    "Cardiovascular",
    "Respiratory",
    "Digestive",
    "Endocrine",
    "Musculoskeletal",
    "Infection",
    "Oncology",
    "Other",
]

# Table with the rescaled statistics (see the `rescaledStatistics` column).
initial_dataset = pl.read_parquet("../../data/rescaled-betas.parquet")

# Study-level therapeutic-area annotations, keyed by `studyId`.
# The glob pattern loads every parquet file found in the directory.
therapeutic_areas = pl.read_parquet("../../data/therapeutic_areas/*.parquet")


In [8]:
# List the columns available in the loaded dataset before transforming it.
initial_dataset.columns


['variantId',
 'studyId',
 'studyLocusId',
 'beta',
 'zScore',
 'pValueMantissa',
 'pValueExponent',
 'standardError',
 'finemappingMethod',
 'studyType',
 'credibleSetSize',
 'isTransQtl',
 'posteriorProbability',
 'nSamples',
 'nControls',
 'nCases',
 'geneId',
 'traitFromSourceMappedIds',
 'majorPopulation',
 'allelefrequencies',
 'vepEffect',
 'majorPopulationAF',
 'majorPopulationMAF',
 'leadVariantStats',
 'rescaledStatistics']

In [10]:
def therapeutic_area_studies(
    df: pl.DataFrame, therapeutic_areas: pl.DataFrame, therapeutic_area_cols: list[str]
) -> pl.DataFrame:
    """Annotate every row of ``df`` with the therapeutic areas of its study.

    ``therapeutic_areas`` is left-joined onto ``df`` by ``studyId``, so rows whose
    study has no annotation are kept. Every column of ``therapeutic_areas`` other
    than the indicator columns listed in ``therapeutic_area_cols`` is carried
    through to the result unchanged.

    Args:
        df: Dataset to annotate; must contain a ``studyId`` column.
        therapeutic_areas: Annotation table with a ``studyId`` column and one
            indicator column per entry of ``therapeutic_area_cols``.
        therapeutic_area_cols: Names of the indicator columns; a value of ``1``
            marks the study as belonging to that area.

    Returns:
        The joined frame with the individual indicator columns removed and
        three columns added:

        * ``inTherapeuticArea`` - ``True`` when at least one indicator equals 1,
          otherwise ``False`` (never null).
        * ``therapeuticAreas`` - struct holding all indicator columns.
        * ``nTherapeuticAreasPerStudy`` - horizontal sum of the indicators.
    """
    in_any_area = (
        pl.any_horizontal(*[pl.col(area) == 1 for area in therapeutic_area_cols])
        # Missing indicators (e.g. studies without a match in the left join)
        # must count as "not in any area" rather than propagate as null.
        .fill_null(False)
        .alias("inTherapeuticArea")
    )
    return (
        df.join(therapeutic_areas, on="studyId", how="left")
        .select(
            "*",
            in_any_area,
            pl.struct(*[pl.col(x) for x in therapeutic_area_cols]).alias("therapeuticAreas"),
            pl.sum_horizontal(*therapeutic_area_cols).alias("nTherapeuticAreasPerStudy"),
        )
        # Drop the columns named by the parameter (not a notebook global), now
        # that they are summarised by the three derived columns above.
        .drop(*therapeutic_area_cols)
    )


def filter_for_plotting(
    df: pl.DataFrame,
    prev_threshold: float = 0.01,
    n_samples_threshold: int = 100_000,
    max_abs_beta: float = 3.0,
) -> pl.DataFrame:
    """Keep only the rows that are suitable for plotting.

    A row is kept when all of the following hold:

    * ``rescaledStatistics.prev`` is neither null nor NaN;
    * ``nCases`` is strictly smaller than ``nControls``;
    * ``|rescaledStatistics.estimatedBeta| <= max_abs_beta``;
    * ``rescaledStatistics.prev >= prev_threshold``;
    * ``nSamples >= n_samples_threshold``.

    NOTE(review): this function does not look at ``inTherapeuticArea`` or at the
    trait class. If only binary traits assigned to a therapeutic area are wanted,
    that has to be enforced separately - confirm this is intended.

    Args:
        df: Dataset with ``rescaledStatistics`` (struct with ``prev`` and
            ``estimatedBeta`` fields), ``nCases``, ``nControls`` and ``nSamples``.
        prev_threshold: Minimum accepted value of the ``prev`` field.
        n_samples_threshold: Minimum accepted ``nSamples``.
        max_abs_beta: Maximum accepted absolute ``estimatedBeta``.

    Returns:
        The subset of ``df`` whose rows satisfy every criterion above.
    """
    stats = pl.col("rescaledStatistics")
    prev = stats.struct.field("prev")
    est_beta = stats.struct.field("estimatedBeta")
    return df.filter(
        prev.is_not_null()
        & prev.is_not_nan()
        & (pl.col("nCases") < pl.col("nControls"))
        & (est_beta.abs() <= max_abs_beta)
        & (prev >= prev_threshold)
        & (pl.col("nSamples") >= n_samples_threshold)
    )


### Check how unique the therapeutic area classes are


In [11]:
# How many studies are flagged for 0, 1, 2, ... therapeutic areas at once?
(
    therapeutic_areas
    .select(
        pl.sum_horizontal(*therapeutic_area_classes).alias("nTherapeuticAreasPerStudy"),
        "studyId",
    )
    .group_by("nTherapeuticAreasPerStudy")
    .len()
    .sort("len")
)


nTherapeuticAreasPerStudy,len
i32,u32
9,1
8,2
7,3
6,40
5,53
4,244
3,687
2,3369
1,10990
0,81015


In [12]:
# Inspect the study flagged for nine areas, the largest count in the table above.
(
    therapeutic_areas
    .select(
        *therapeutic_area_classes,
        pl.sum_horizontal(*therapeutic_area_classes).alias("nTherapeuticAreasPerStudy"),
        "studyId",
    )
    .filter(pl.col("nTherapeuticAreasPerStudy") == 9)
)


Haematology,Metabolic,Congenital,Signs/symptoms,Neurology,Immune,Psychiatry,Dermatology,Ophthalmology,Cardiovascular,Respiratory,Digestive,Endocrine,Musculoskeletal,Infection,Oncology,Other,nTherapeuticAreasPerStudy,studyId
i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,str
0,1,1,0,1,1,0,1,1,0,0,1,1,1,0,0,0,9,"""GCST005320"""


### Studies that are treated as quantitative, even though they are binary


In [13]:
# Attach the therapeutic-area annotations to the initial dataset.
dataset = therapeutic_area_studies(
    df=initial_dataset,
    therapeutic_areas=therapeutic_areas,
    therapeutic_area_cols=therapeutic_area_classes,
)

# Row counts per (number of therapeutic areas, trait class) combination.
(
    dataset
    .select(
        pl.col("rescaledStatistics").struct.field("traitClass"),
        "nTherapeuticAreasPerStudy",
        "studyId",
    )
    .group_by("nTherapeuticAreasPerStudy", "traitClass")
    .len()
    .sort("traitClass", "nTherapeuticAreasPerStudy")
)


nTherapeuticAreasPerStudy,traitClass,len
i32,str,u32
0,"""binary""",11496
1,"""binary""",33807
2,"""binary""",19305
3,"""binary""",4436
4,"""binary""",945
…,…,…
6,"""binary""",109
7,"""binary""",3
0,"""quantitative""",2550657
1,"""quantitative""",28


In [14]:
# Widen string rendering so the long `initialSampleSize` descriptions are shown in full.
pl.Config.set_fmt_str_lengths(1000)

# One row per study that is in a therapeutic area but classified as quantitative.
(
    dataset
    .filter(
        pl.col("inTherapeuticArea")
        & (pl.col("rescaledStatistics").struct.field("traitClass") == "quantitative")
    )
    .unique("studyId")
    .select("studyId", "nSamples", "nControls", "nCases", "initialSampleSize")
)


studyId,nSamples,nControls,nCases,initialSampleSize
str,i32,i32,i32,str
"""GCST001953""",204498,65840,0,"""93,015 European ancestry overweight individuals, 32,858 European ancestry class I obese individuals, 9,889 European ancestry class II obese individuals, 2,896 European ancestry class III obese individuals, up to 65,840 European ancestry controls"""
"""GCST004413""",14933,12953,0,"""1,980 European ancestry mothers of spontaneous dizygotic twins, 12,953 European ancestry controls"""
"""GCST001637""",2153,828,0,"""1,325 African ancestry, 828 African ancestry controls"""


### Prepare the dataset for plotting


In [15]:
# Apply the plotting filters with their default thresholds.
plot_ready_dataset = dataset.pipe(filter_for_plotting)


In [16]:
# Number of rows that survive the filtering.
plot_ready_dataset.height


49606

We have ~50k lead variants (credible sets) that pass the prevalence and beta filtering and are assigned to therapeutic areas.


In [17]:
# Number of distinct studies among the retained rows.
plot_ready_dataset.unique("studyId").height


1937

These variants come from ~2k studies.


In [18]:
# Persist the filtered dataset as parquet.
plot_ready_dataset.write_parquet("../../data/binary-therapeutic-lead-variants.parquet")
