In [117]:
import malariagen_data
import allel
from tqdm.auto import tqdm
from tqdm.dask import TqdmCallback
import numpy as np
import bokeh.io as bkio
bkio.reset_output()
bkio.output_notebook(hide_banner=True)
import bokeh.plotting as bkplt
import bokeh.models as bkmod

In [3]:
# Create the Ag3 API client; later cells and the helper functions below use
# this module-level `ag3` object for all data access.
# NOTE(review): pre=True presumably enables access to pre-release data --
# confirm against the malariagen_data documentation.
ag3 = malariagen_data.Ag3(pre=True)

In [5]:
# Show the client summary (storage URL, available releases, software version)
# so the data provenance is recorded alongside the analysis.
ag3

MalariaGEN Ag3 API client,MalariaGEN Ag3 API client
"Please note that data are subject to terms of use,  for more information see the MalariaGEN website or contact data@malariagen.net.  See also the Ag3 API docs.","Please note that data are subject to terms of use,  for more information see the MalariaGEN website or contact data@malariagen.net.  See also the Ag3 API docs..1"
Storage URL,gs://vo_agam_release/
Data releases available,"3.0, 3.1, 3.2, 3.3, 3.4, 3.5, 3.6"
Results cache,
Cohorts analysis,20220608
Species analysis,aim_20220528
Site filters analysis,dt_20200416
Software version,malariagen_data 6.0.1
Client location,"England, GB"


In [6]:
# Data release analysed in this notebook; it is passed as `sample_sets` to the
# H12 calibration calls further down.
release = "3.6"

In [7]:
# List the sample sets that belong to the chosen release.
ag3.sample_sets(release=release)

Unnamed: 0,sample_set,sample_count,release
0,1273-VO-ZM-MULEBA-VMF00176,201,3.6
1,1279-VO-CI-KOFFI-VMF00173,379,3.6
2,1280-VO-ZA-MUNHENGA-VMF00165,178,3.6
3,1280-VO-ZA-MUNHENGA-VMF00178,163,3.6
4,1288-VO-UG-DONNELLY-VMF00168,483,3.6


In [12]:
# Analysis parameters shared by the calibration cells below.
query_zambia = "country == 'Zambia'"  # sample metadata query selecting the cohort
contig = "3L"  # contig passed as the `region` to scan
analysis = "gamb_colu"  # haplotype analysis identifier passed to ag3.haplotypes

In [9]:
# Tabulate how many samples match the Zambia query, to check the cohort is
# large enough before running the calibration.
ag3.count_samples(sample_query=query_zambia)

Load sample metadata:   0%|          | 0/54 [00:00<?, ?it/s]

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,taxon,gambiae
country,admin1_iso,admin1_name,admin2_name,year,Unnamed: 5_level_1
Zambia,ZW-04,Luapula,Nchelenge,2018,12
Zambia,ZW-04,Luapula,Nchelenge,2020,17
Zambia,ZW-04,Luapula,Nchelenge,2021,56
Zambia,ZW-08,Copperbelt,Haut-Katanga,2021,20
Zambia,ZW-08,Copperbelt,Ndola,2020,96


In [34]:
def h12_calibration(contig, 
                    analysis, 
                    sample_query,
                    sample_sets, 
                    cohort_size=20,
                    window_sizes=(100, 200, 500, 1000, 2000, 5000, 10000, 20000),
                    random_seed=42):
    """Compute H12 along a contig for each of several window sizes.

    Haplotypes are loaded through the module-level ``ag3`` client, fully
    materialised in memory, and then scanned once per window size with
    ``allel.moving_garud_h``.

    Parameters
    ----------
    contig : str
        Passed to ``ag3.haplotypes`` as ``region``.
    analysis : str
        Haplotype analysis identifier, forwarded to ``ag3.haplotypes``.
    sample_query : str
        Sample metadata query, forwarded to ``ag3.haplotypes``.
    sample_sets : str or sequence
        Sample sets (or release) to draw from, forwarded to ``ag3.haplotypes``.
    cohort_size : int, optional
        Forwarded to ``ag3.haplotypes``.
        NOTE(review): presumably the number of samples to downsample to --
        confirm against the malariagen_data docs.
    window_sizes : iterable of int, optional
        Window sizes handed one at a time to ``allel.moving_garud_h`` as
        ``size``.
    random_seed : int, optional
        Forwarded to ``ag3.haplotypes``.

    Returns
    -------
    list
        One H12 array per entry of ``window_sizes``, in the same order.
    """
    # fetch the haplotype dataset for the requested cohort
    haplotype_dataset = ag3.haplotypes(
        region=contig,
        analysis=analysis,
        sample_sets=sample_sets,
        sample_query=sample_query,
        cohort_size=cohort_size,
        random_seed=random_seed,
    )

    # TODO - cache haplotype data

    # wrap the dask-backed genotype calls, then load them as a haplotype array
    genotypes = allel.GenotypeDaskArray(haplotype_dataset['call_genotype'].data)
    haplotypes = genotypes.to_haplotypes().compute()

    # TODO - cache calibration runs

    # moving_garud_h yields (h1, h12, h123, h2_h1); only h12 (index 1) is kept
    return [
        allel.moving_garud_h(haplotypes, size=window_size)[1]
        for window_size in tqdm(window_sizes, desc="Compute H12")
    ]

In [48]:
%%time
calibration_runs = h12_calibration(contig=contig, analysis=analysis, sample_query=query_zambia, 
                    sample_sets=release, cohort_size=20)

Compute H12:   0%|          | 0/8 [00:00<?, ?it/s]

CPU times: user 2min 19s, sys: 5.81 s, total: 2min 25s
Wall time: 1min 41s


In [119]:
# Prototype of the calibration plot, built from `calibration_runs` computed
# above. The x/y/quantile series are derived here so that the cell runs on a
# fresh kernel instead of relying on variables left over from deleted cells.
# (bokeh.plotting / bokeh.models are already imported in the first cell.)

# window sizes must match those used to produce `calibration_runs`
# (the defaults of h12_calibration)
x = [100, 200, 500, 1000, 2000, 5000, 10000, 20000]
assert len(x) == len(calibration_runs), "x must list one window size per calibration run"

# per-window-size summaries of the H12 distribution
y = [np.median(h12) for h12 in calibration_runs]
q05 = [np.percentile(h12, 5) for h12 in calibration_runs]
q25 = [np.percentile(h12, 25) for h12 in calibration_runs]
q75 = [np.percentile(h12, 75) for h12 in calibration_runs]
q95 = [np.percentile(h12, 95) for h12 in calibration_runs]

fig = bkplt.figure(plot_width=700, plot_height=400, x_axis_type="log")

# each band is a closed polygon: upper quantile left-to-right, then the lower
# quantile right-to-left
fig.patch(x + x[::-1], q75 + q25[::-1], alpha=0.75, line_width=2, legend_label="25-75%")
fig.patch(x + x[::-1], q95 + q05[::-1], alpha=0.5, line_width=2, legend_label="5-95%")
fig.line(x, y, line_color='black', line_width=4, legend_label="median")
fig.circle(x, y, color='black', fill_color='black', size=8)

fig.xaxis.ticker = x
# span every window size so the largest one is not clipped
fig.x_range = bkmod.Range1d(min(x), max(x))
fig.title = "test"
bkplt.show(fig)

In [120]:
def plot_h12_calibration(contig, 
                         analysis, 
                         sample_query,
                         sample_sets,
                         window_sizes=(100, 200, 500, 1000, 2000, 5000, 10000, 20000), 
                         cohort_size=20, 
                         random_seed=42, 
                         title=None):
    """Plot how the H12 distribution changes with window size.

    Runs ``h12_calibration`` with the given arguments, summarises the H12
    values of each window size by their 5th, 25th, 50th, 75th and 95th
    percentiles, and shows a Bokeh figure with the median as a line and the
    25-75% and 5-95% ranges as shaded bands over a log-scaled x axis.

    Parameters
    ----------
    contig, analysis, sample_query, sample_sets, cohort_size, random_seed
        Forwarded unchanged to ``h12_calibration``.
    window_sizes : iterable of int, optional
        Window sizes to evaluate; also used as the x coordinates and ticks.
        Expected in ascending order so the bands are drawn left to right.
    title : str, optional
        Figure title. Defaults to ``sample_query``.

    Returns
    -------
    None
        The figure is displayed with ``bokeh.plotting.show``.

    Raises
    ------
    ValueError
        If ``window_sizes`` is empty.
    """
    # Normalise to a list: the band outlines are built by list concatenation,
    # which would silently become element-wise addition for a numpy array.
    window_sizes = list(window_sizes)
    if not window_sizes:
        raise ValueError("window_sizes must contain at least one window size")

    # get H12 values
    calibration_runs = h12_calibration(
        contig=contig, 
        analysis=analysis, 
        sample_query=sample_query, 
        sample_sets=sample_sets, 
        window_sizes=window_sizes,
        cohort_size=cohort_size, 
        random_seed=random_seed
    )

    # compute summaries: one value per window size for each percentile
    def _percentile(q):
        return [np.percentile(h12, q) for h12 in calibration_runs]

    q05 = _percentile(5)
    q25 = _percentile(25)
    q50 = _percentile(50)
    q75 = _percentile(75)
    q95 = _percentile(95)

    # make plot
    fig = bkplt.figure(plot_width=700, plot_height=400, x_axis_type="log")

    # each band is a closed polygon: upper quantile left-to-right, then the
    # lower quantile right-to-left
    band_x = window_sizes + window_sizes[::-1]
    fig.patch(band_x, q75 + q25[::-1], alpha=0.75, line_width=2, legend_label="25-75%")
    fig.patch(band_x, q95 + q05[::-1], alpha=0.5, line_width=2, legend_label="5-95%")
    fig.line(window_sizes, q50, line_color='black', line_width=4, legend_label="median")
    fig.circle(window_sizes, q50, color='black', fill_color='black', size=8)

    fig.xaxis.ticker = window_sizes
    fig.xaxis.axis_label = "Window size (number of variants)"
    fig.yaxis.axis_label = "H12"

    # Fit the x range to the requested window sizes so none are clipped; with
    # a single window size the range would be degenerate, so keep Bokeh's
    # automatic range in that case.
    smallest, largest = min(window_sizes), max(window_sizes)
    if smallest < largest:
        fig.x_range = bkmod.Range1d(smallest, largest)

    if title is None:
        title = sample_query
    fig.title = title
    bkplt.show(fig)

In [121]:
%%time
plot_h12_calibration(contig=contig, analysis=analysis, sample_query=query_zambia, 
                    sample_sets=release, cohort_size=20)

Compute H12:   0%|          | 0/8 [00:00<?, ?it/s]

CPU times: user 1min 50s, sys: 3.69 s, total: 1min 54s
Wall time: 1min 17s
