# Example Downstream Usage: 1000g Figures


Get statistics from 1000 Genomes (1000g) samples, including:
  - the total number of samples
  - the percentage of samples matching each knowledgebase (kb) individually and metakb as a whole
  - study ids and descriptions for each variant hit in metakb

(to be run in a tmp/ directory)

In [None]:
%%capture
# %%capture suppresses this cell's output (the pip install log)
%pip install --upgrade --no-cache-dir terra-notebook-utils
# REPO_DIR is read as $REPO_DIR by the shell commands in the following cells
%env REPO_DIR=/home/jupyter/vrs-python-testing

In [None]:
# completes setup for Terra: runs terra/setup.sh from the repository root
# ($REPO_DIR is set with %env in the first cell)
!cd $REPO_DIR && bash terra/setup.sh

In [None]:
# creates the cohort allele frequency dict by running scripts/1000g-processing.py
# NOTE(review): the original comment ended with a dangling "make sure" —
# presumably a prerequisite to check before running this cell; confirm with the author
!python $REPO_DIR/scripts/1000g-processing.py

In [None]:
import io
import json
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
import pathlib
import pysam
import seaborn as sns
import subprocess
import yaml

from collections import defaultdict
from google.cloud import storage
from glob import glob
from glom import glom
from firecloud import api as fapi
from re import match
from vrs_anvil import query_metakb
from vrs_anvil.annotator import MATCHES, TOTAL, VRS_OBJECT

In [None]:
# constants to load and save files
figure_dir = "figures"
TIMESTAMP = "20240329_013953"
# Derive the working directory from the REPO_DIR environment variable (set with
# %env in the first cell) so the checkout location is defined in one place;
# fall back to the original hard-coded path when REPO_DIR is not set.
BASE_DIR = os.path.join(
    os.environ.get("REPO_DIR", "/home/jupyter/vrs-python-testing"), "tmp"
)

# seaborn styling
sns.set_theme()
sns.set_style("whitegrid")

In [None]:
# load the Cohort Allele Frequency (CAF) objects saved under this TIMESTAMP
file_name = f"caf_objects_{TIMESTAMP}.json"
caf_dir = os.path.expanduser(f"{BASE_DIR}/state")
# creates the state directory if it is missing; the JSON file itself must already exist
os.makedirs(caf_dir, exist_ok=True)

with open(f"{caf_dir}/{file_name}", "r") as file:
    caf_dicts = json.loads(file.read())

In [None]:
# example formatting of dictionary: the id of the first CAF entry and the
# evidence it records for one sample
first_caf = caf_dicts[0]
print(first_caf["id"])
print(first_caf["ancillaryResults"]["sample_dict"]["HG00382"])

In [None]:
# CAF entries ordered from most to fewest patient matches to the local metakb
sorted_caf_dicts = list(caf_dicts)
sorted_caf_dicts.sort(
    key=lambda caf: caf["ancillaryResults"]["patient_matches"],
    reverse=True,
)

# one line per entry: "<id> : <patient matches>"
for caf_dict in sorted_caf_dicts:
    num_matches = caf_dict["ancillaryResults"]["patient_matches"]
    print(caf_dict["id"], ":", num_matches)

In [None]:
# create function to merge each sample dict from each caf into one dict
def merge_sample_dict(sample_evidence_dict, allele_id, sample_dict):
    """Merge one allele's per-sample evidence into the combined evidence dict.

    Args:
        sample_evidence_dict: mapping of sample name to a dict holding the
            parallel lists "study_ids" and "variant_types", plus "allele_ids"
            (the alleles already merged for that sample). Updated in place.
        allele_id: id of the allele whose evidence is being merged.
        sample_dict: mapping of sample name to a dict with "study_ids" and
            "variant_types" iterables for this allele.

    Returns:
        None. If the allele was already merged, a message is printed and
        nothing is changed.
    """
    # The combined dict is keyed by sample name, so a repeated allele has to be
    # detected through the per-sample "allele_ids" record; checking the
    # top-level keys for the allele id would never match.
    already_merged = any(
        allele_id in sample_evidence_dict[sample].get("allele_ids", ())
        for sample in sample_dict
        if sample in sample_evidence_dict
    )
    if already_merged:
        print(allele_id, "exists, skipping")
        return

    for sample, sample_info in sample_dict.items():
        # read both inputs before mutating so a missing key leaves the entry untouched
        study_ids = sample_info["study_ids"]
        variant_types = sample_info["variant_types"]

        # setdefault with an explicit value never invokes a defaultdict factory
        entry = sample_evidence_dict.setdefault(sample, {})
        entry.setdefault("study_ids", []).extend(study_ids)
        entry.setdefault("variant_types", []).extend(variant_types)
        entry.setdefault("allele_ids", []).append(allele_id)

# combined evidence dict, keyed by sample name, filled from every CAF entry
# NOTE(review): merge_sample_dict stores a dict per sample and never relies on
# the list factory — confirm whether defaultdict(dict) or a plain dict was intended
sample_evidence_dict = defaultdict(list)

for caf_dict in sorted_caf_dicts:
    allele_samples = caf_dict["ancillaryResults"]["sample_dict"]
    merge_sample_dict(sample_evidence_dict, caf_dict["id"], allele_samples)

In [None]:
# bar chart of patient matches per VRS id, in the sorted (descending) order
ids = [caf["id"] for caf in sorted_caf_dicts]
num_patients_list = [
    caf["ancillaryResults"]["patient_matches"] for caf in sorted_caf_dicts
]
print(len(ids))

plt.bar(x=ids, height=num_patients_list)
# ids are long, so rotate the tick labels and shrink the font
plt.xticks(fontsize=8, rotation=90)
plt.show()

In [None]:
# example therapeutic evidence for the two CAF entries with the most patient matches

# fields printed for every study, in order (dotted names are nested glom paths)
study_fields = ["id", "type", "direction", "predicate", "therapeutic", "tumorType.label", "strength.label"]

for d in sorted_caf_dicts[:2]:
    print(f'### {d["id"]} ###')
    for study in d["ancillaryResults"]["metakb_dict"]["studies"]:
        for key in study_fields:
            v = glom(study, key)
            if key != "therapeutic":
                print(v)
            elif "substitutes" in v:
                print([a["label"] for a in v["substitutes"]])
            elif "components" in v:
                print([a["label"] for a in v["components"]])
            else:
                # neither a substitute group nor a combination: print the
                # therapeutic's label (or the raw value) instead of silently
                # dropping the field from the output
                print(v.get("label", v) if isinstance(v, dict) else v)

        print("~~~~")
    print()

In [None]:
# setup
knowledgebases = ["MOAlmanac", "CIVIC", "All Knowledgebases"]
kb_keywords = ["moa", "civic", ""]  # "" represents match for any knowledgebase

KB, PCT, VAR = range(3)
cols = ["knowledgebase", "percent", "variant_type"]
variant_types = ["germline", "somatic"]
variant_types_set = set(variant_types)
dtypes = [str, float, str]

# NOTE: this rebinds the TOTAL imported from vrs_anvil.annotator in the imports cell
TOTAL = "all"
num_samples = 3202
variants = sorted(variant_types_set) + [TOTAL]


def _count_matching_samples(evidence, keyword, variant_type):
    """Count samples with at least one study id containing keyword.

    evidence maps a sample to a dict with the parallel lists "study_ids" and
    "variant_types". When variant_type is TOTAL every study id is considered;
    otherwise only the study ids whose paired variant type equals variant_type.
    """
    count = 0
    for id_lists in evidence.values():
        if variant_type == TOTAL:
            study_ids = id_lists["study_ids"]
        else:
            study_ids = [
                study_id
                for study_id, v_type in zip(id_lists["study_ids"], id_lists["variant_types"])
                if v_type == variant_type
            ]
        if any(keyword in study_id for study_id in study_ids):
            count += 1
    return count


# get percentage of patients with variant match by category
# rows are kept as native Python values so percentages never pass through a
# string-typed numpy array
data = []
matching_counts = []
for variant_type in variants:
    for knowledgebase, keyword in zip(knowledgebases, kb_keywords):
        num_matching_samples = _count_matching_samples(sample_evidence_dict, keyword, variant_type)
        percent = num_matching_samples * 100.0 / num_samples
        data.append([knowledgebase, percent, variant_type])
        matching_counts.append(num_matching_samples)

expected_num_rows = len(knowledgebases) * len(variants)
assert len(data) == expected_num_rows, f"expected {expected_num_rows} rows, got {len(data)}"

df_pct = pd.DataFrame(data, columns=cols).astype(dtype=dict(zip(cols, dtypes)))
print("Percent of Patients with a Single Variant Match")
# use the exact counts: recomputing them from the float percentage and
# truncating with astype(int) may come out one too low
df_pct["matching_samples"] = matching_counts

df_display = df_pct.copy()
df_display["percent"] = df_display["percent"].round(2)
# sort_values returns a new frame, so the result has to be assigned to take effect
df_display = df_display.sort_values(by=["knowledgebase", "variant_type"], ascending=[False, True])
df_display[["knowledgebase", "variant_type", "percent", "matching_samples"]].head()

In [None]:
# helper function to add fractional labels to bars on plots
def add_labels(num_bars, ax, vertical_nudge, num_samples, fractional=False, fontsize=8):
    """Write a sample-count label above the bars of ax.

    The first patch (index 0) is always skipped, and labelling stops after the
    patch at index num_bars. Each label is the bar height multiplied by
    num_samples, rounded to an integer; with fractional=True it is written as
    "<count>/<num_samples>". The text is centred horizontally on the bar and
    placed vertical_nudge above its top.
    """
    for index, patch in enumerate(ax.patches):
        if index == 0:
            continue
        if index > num_bars:
            break

        bar_height = patch.get_height()
        count = int(round(bar_height * num_samples))
        text = f"{count}/{num_samples}" if fractional else f"{count}"

        center_x = patch.get_x() + patch.get_width() / 2
        label_y = patch.get_y() + bar_height + vertical_nudge
        ax.text(center_x, label_y, text, ha="center", va="center", fontsize=fontsize)

In [None]:
# distribution of the number of matched variants per patient, one histogram per variant type

# number of CAF entries that carry a metakb_dict; passed to add_labels as the
# index of the last bar to label
num_variant_hits = sum("metakb_dict" in d["ancillaryResults"] for d in sorted_caf_dicts)
jitter = [0.008, .008]


NUM_VARIANTS = "num_variants"

for i, variant_type in enumerate(variant_types):
    # per-sample count of recorded variant types equal to variant_type
    # (every entry counts when variant_type is TOTAL)
    num_variants_per_patient = [
        sum(1 for v in values["variant_types"] if v == variant_type or variant_type == TOTAL)
        for values in sample_evidence_dict.values()
    ]

    # pad with zeros for the samples that have no entry in sample_evidence_dict
    num_variants_per_patient.extend([0] * (num_samples - len(sample_evidence_dict)))
    assert len(num_variants_per_patient) == num_samples, \
    f"only {len(num_variants_per_patient)} samples, expected {num_samples}"

    df = pd.DataFrame({NUM_VARIANTS: num_variants_per_patient})
    # map each row's count to the share of patients with that count; assigning
    # value_counts() directly would align on the row index, not on the count
    pct_by_count = df[NUM_VARIANTS].value_counts(normalize=True) * 100
    df["percentage"] = df[NUM_VARIANTS].map(pct_by_count)

    plt.figure(dpi=200)
    ax = sns.histplot(data=df, x=NUM_VARIANTS, stat="density", discrete=True)
    plt.xlabel("Number of Variants")
    plt.ylabel("Percentage of All Patients (%)")
    plt.title(f"Number of {variant_type.capitalize()} Variants Associated with Each Patient")

    add_labels(num_variant_hits, ax, jitter[i], num_samples)

    # the y axis holds densities; show them as whole-number percentages
    plt.gca().yaxis.set_major_formatter(lambda x, _: f"{(x*100):.0f}")
    plt.grid(False)

    # start the x axis at 0.5 so the bar for 0 variants is outside the view
    max_tick = int(df[NUM_VARIANTS].max()) + 1
    x_lims = (0.5, max_tick)
    plt.xlim(*x_lims)
    plt.xticks(range(1, max_tick))
    plt.ylim(0, 0.5)

    plt.show()