In [1]:
import allel
import bokeh.models as bkmod
import bokeh.plotting as bkplt
import malariagen_data
import numpy as np
from tqdm.auto import tqdm
# from tqdm.dask import TqdmCallback
# import bokeh.io as bkio
# bkio.reset_output()
# bkio.output_notebook(hide_banner=True)

In [2]:
# Create the Ag3 API client that the cells below use for all data access.
ag3 = malariagen_data.Ag3()

In [3]:
# Show the client's rich repr as the cell output.
ag3

MalariaGEN Ag3 API client,MalariaGEN Ag3 API client
"Please note that data are subject to terms of use,  for more information see the MalariaGEN website or contact data@malariagen.net.  See also the Ag3 API docs.","Please note that data are subject to terms of use,  for more information see the MalariaGEN website or contact data@malariagen.net.  See also the Ag3 API docs..1"
Storage URL,gs://vo_agam_release/
Data releases available,3.0
Results cache,
Cohorts analysis,20220608
Species analysis,aim_20220528
Site filters analysis,dt_20200416
Software version,malariagen_data 6.0.1
Client location,"England, GB"


In [4]:
# Data release identifier; passed as `release=` / `sample_sets=` in later cells.
release = "3.0"

In [None]:
# Look up the sample sets for the selected release; the result is displayed as the cell output.
ag3.sample_sets(release=release)

In [5]:
# Shared inputs for the H12 calibration cells below.
query_ghana = "country == 'Ghana'"  # sample metadata query string
contig = "3L"                       # contig passed to the H12 functions
analysis = "gamb_colu"              # haplotype analysis identifier

In [None]:
# Count samples using the Ghana query; the result is displayed as the cell output.
ag3.count_samples(sample_query=query_ghana)

In [None]:
def h12_calibration(contig,
                    analysis,
                    sample_query,
                    sample_sets,
                    cohort_size=20,
                    window_sizes=(100, 200, 500, 1000, 2000, 5000, 10000, 20000),
                    random_seed=42):
    """Compute windowed H12 over a contig for a series of window sizes.

    Haplotypes are loaded once through the module-level ``ag3`` client and
    then scanned with ``allel.moving_garud_h`` once per window size.

    Parameters
    ----------
    contig
        Passed to ``ag3.haplotypes`` as ``region``.
    analysis, sample_query, sample_sets
        Forwarded unchanged to ``ag3.haplotypes``.
    cohort_size, random_seed
        Forwarded unchanged to ``ag3.haplotypes``
        (presumably control random down-sampling of the cohort - confirm
        against the Ag3 API docs).
    window_sizes
        Iterable of window sizes; each is passed to
        ``allel.moving_garud_h`` as ``size``.

    Returns
    -------
    list
        One H12 result per entry of ``window_sizes``, in the same order.
    """
    # access haplotypes
    haplotype_ds = ag3.haplotypes(
        region=contig,
        analysis=analysis,
        sample_sets=sample_sets,
        sample_query=sample_query,
        cohort_size=cohort_size,
        random_seed=random_seed,
    )

    # TODO - cache haplotype data

    # materialise the lazy genotype calls as an in-memory haplotype array
    genotypes = allel.GenotypeDaskArray(haplotype_ds["call_genotype"].data)
    haplotypes = genotypes.to_haplotypes().compute()

    def _h12_for(size):
        # moving_garud_h yields (h1, h12, h123, h2_h1); only h12 is kept
        _, h12, _, _ = allel.moving_garud_h(haplotypes, size=size)
        return h12

    # TODO - cache calibration runs

    return [_h12_for(size) for size in tqdm(window_sizes, desc="Compute H12")]

In [None]:
%%time
# Runs the client's own h12_calibration method, not the notebook-level
# h12_calibration function defined in an earlier cell.
calibration_runs = ag3.h12_calibration(
    contig=contig,
    analysis=analysis,
    sample_query=query_ghana,
    sample_sets=release,
    cohort_size=20,
)

In [None]:
# Inspect the type of the first calibration run.
type(calibration_runs[0])

In [None]:
# Number of calibration runs returned.
len(calibration_runs)

In [None]:
import bokeh.plotting as bkplt
import bokeh.models as bkmod

# Prototype of the calibration plot drawn from notebook-level variables.
# NOTE(review): x, y, q05, q25, q75 and q95 are not defined in any cell
# visible in this notebook, so this cell fails on a fresh kernel unless they
# are created elsewhere - confirm or remove.
fig = bkplt.figure(plot_width=700, plot_height=400, x_axis_type="log")

# Each band is one closed polygon: forwards along the upper percentile, then
# back along the lower one (assumes x and the q* values are lists, so `+`
# concatenates - TODO confirm).
fig.patch(x+x[::-1],q75+q25[::-1], alpha=0.75, line_width=2, legend_label="25-75%")
fig.patch(x+x[::-1],q95+q05[::-1], alpha=0.5, line_width=2, legend_label="5-95%")
# Line with point markers drawn on top of the bands.
fig.line(x,y, line_color='black', line_width=4, legend_label="median")
fig.circle(x, y, color='black',fill_color='black', size=8)


# Put a tick at every x value; the visible x range is fixed to 100-10000.
fig.xaxis.ticker = x
fig.x_range = bkmod.Range1d(100, 10000)
fig.title = "test"
bkplt.show(fig)

In [None]:
def plot_h12_calibration(contig, 
                         analysis, 
                         sample_query,
                         sample_sets,
                         window_sizes=(100, 200, 500, 1000, 2000, 5000, 10000, 20000), 
                         cohort_size=20, 
                         random_seed=42, 
                         title=None):
    """Plot the distribution of windowed H12 against window size.

    For every window size the H12 values returned by ``h12_calibration`` are
    summarised by their 5th, 25th, 50th, 75th and 95th percentiles. The plot
    shows the median as a line with markers and the 25-75% and 5-95% ranges
    as shaded bands, on a log-scaled x axis.

    Parameters
    ----------
    contig, analysis, sample_query, sample_sets, cohort_size, random_seed
        Forwarded unchanged to ``h12_calibration``.
    window_sizes
        Sequence of window sizes to evaluate. Must not be empty.
    title
        Plot title; defaults to ``sample_query`` when None.

    Raises
    ------
    ValueError
        If ``window_sizes`` is empty.
    """
    # Work on a plain list so that `+` below always concatenates (a numpy
    # array would be added element-wise) and so the sequence can be reused.
    window_sizes = list(window_sizes)
    if not window_sizes:
        # fail before the expensive haplotype load rather than after it
        raise ValueError("window_sizes must contain at least one window size")

    # get H12 values
    calibration_runs = h12_calibration(
        contig=contig, 
        analysis=analysis, 
        sample_query=sample_query, 
        sample_sets=sample_sets, 
        window_sizes=window_sizes,
        cohort_size=cohort_size, 
        random_seed=random_seed
    )

    # compute summaries: one row per window size, one column per percentile
    summaries = np.array(
        [np.percentile(h12, [5, 25, 50, 75, 95]) for h12 in calibration_runs]
    )
    q05, q25, q50, q75, q95 = summaries.T.tolist()

    # make plot 
    fig = bkplt.figure(plot_width=700, plot_height=400, x_axis_type="log")

    # each band is a closed polygon: forwards along the upper percentile,
    # then back along the lower one
    band_x = window_sizes + window_sizes[::-1]
    fig.patch(band_x, q75 + q25[::-1], alpha=0.75, line_width=2, legend_label="25-75%")
    fig.patch(band_x, q95 + q05[::-1], alpha=0.5, line_width=2, legend_label="5-95%")
    fig.line(window_sizes, q50, line_color='black', line_width=4, legend_label="median")
    fig.circle(window_sizes, q50, color='black',fill_color='black', size=8)

    fig.xaxis.ticker = window_sizes

    # Span the window sizes actually plotted instead of a fixed 100-10000,
    # which cut off the default 20000 window. A single window size has no
    # extent, so the figure's default range is kept in that case.
    smallest, largest = min(window_sizes), max(window_sizes)
    if smallest < largest:
        fig.x_range = bkmod.Range1d(smallest, largest)

    if title is None:
        title = sample_query
    fig.title = title
    bkplt.show(fig)

In [6]:
%%time
# Runs the client's own plot_h12_calibration method, not the notebook-level
# plot_h12_calibration function defined in an earlier cell.
ag3.plot_h12_calibration(
    contig=contig,
    analysis=analysis,
    sample_query=query_ghana,
    sample_sets=release,
    cohort_size=20,
)

Load sample metadata:   0%|          | 0/28 [00:00<?, ?it/s]

Compute H12:   0%|          | 0/8 [00:00<?, ?it/s]

CPU times: user 1min 7s, sys: 2.78 s, total: 1min 9s
Wall time: 1min 13s


In [None]:
def h12_gwss(contig, analysis, sample_query, window_size, 
             sample_sets, downsample=30, seed=42):
    """Run a windowed H12 scan along a contig.

    Parameters
    ----------
    contig, analysis, sample_query, sample_sets, downsample, seed
        Forwarded unchanged to ``load_haplotypes``.
    window_size
        Window size passed as ``size`` to both ``allel.moving_garud_h`` and
        ``allel.moving_statistic``.

    Returns
    -------
    tuple
        ``(x, h12)``: the per-window mean of the positions, and the H12
        output of ``allel.moving_garud_h`` for the same window size.
    """
    # NOTE(review): load_haplotypes is not defined in any cell visible in
    # this notebook - confirm where it comes from before running.
    positions, haplotypes = load_haplotypes(
        contig=contig,
        analysis=analysis,
        sample_query=sample_query,
        sample_sets=sample_sets,
        downsample=downsample,
        seed=seed,
    )

    # moving_garud_h yields (h1, h12, h123, h2_h1); only h12 is returned
    _, h12, _, _ = allel.moving_garud_h(haplotypes, size=window_size)

    # one x coordinate per window: the mean position within that window
    window_midpoints = allel.moving_statistic(
        positions, statistic=np.mean, size=window_size
    )

    return window_midpoints, h12