In [1]:
import numpy as np
import allel
import pandas as pd

In [2]:
import ag3

  import pandas.util.testing as tm


In [3]:
# Handle on the release data; used below to load sample metadata, genotype
# calls, site positions and site masks.
v3 = ag3.release_data()

## Overview

- how many segregating sites?

- biallelic/multiallelic

- how many undiscovered sites?

## Reporting

We want to report the headline number of SNPs: the gamb_colu SNPs found under the gamb_colu mask, plus the arab SNPs found under the arab mask.
Also report how many SNPs are private to arab and how many are private to gamb_colu.
Do this for gamb_colu vs arab, and also for gamb vs colu.

Additionally, for each species group, we want:
`n_seg_sites`, `n_biallelic`, `n_multiallelic`.

## Definitions:
Where the groups have different masks, we cannot always say for certain whether a variant is private or shared.

So, if a variant is seg in gamb_colu, and masked in arab, this does not count as private to gamb_colu.

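For each species, generate 4 arrays: `is_seg`, `is_multi`, `is_bial`, `is_masked`.

Then the number of segregating sites discovered is `n_seg_sites = (is_seg & is_masked).sum()`, where `is_masked` is True at sites that pass the species' mask (i.e. accessible sites).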

For the group comparisons:

- _shared_: segregating in both groups, and accessible in both.
- _private A_: segregating in A but not in B; accessible in both.
- _private B_: segregating in B but not in A; accessible in both.
- _total_: segregating and accessible in _either_ group.

In [4]:
import dask.array as da
from dask_kubernetes import KubeCluster
from dask.distributed import Client, progress
import dask

In [5]:
# kubernetes cluster setup
# Start a Kubernetes-backed dask cluster and request a fixed pool of
# `n_workers` workers up front.
n_workers = 30
cluster = KubeCluster()
# NOTE(review): `scale_up` may be a legacy spelling of `scale` in the
# installed dask_kubernetes/distributed version -- confirm before upgrading.
cluster.scale_up(n_workers)
# Alternative kept for reference: adaptive scaling capped at n_workers.
#cluster.adapt(minimum=1, maximum=n_workers)
# Bare expression so the cluster is displayed as the cell output.
cluster

distributed.scheduler - INFO - Clear task state
distributed.scheduler - INFO -   Scheduler at:   tcp://10.34.4.153:41387
distributed.scheduler - INFO -   dashboard at:                     :8787


VBox(children=(HTML(value='<h2>KubeCluster</h2>'), HBox(children=(HTML(value='\n<div>\n  <style scoped>\n    .…

In [6]:
# dask client setup
# Connect this session to the cluster created above.
client = Client(cluster)
# Bare expression so the client summary is displayed as the cell output.
client

distributed.scheduler - INFO - Receive client connection: Client-77d00c5a-8d7d-11eb-89e7-76b47941c236
distributed.core - INFO - Starting established connection


0,1
Client  Scheduler: tcp://10.34.4.153:41387  Dashboard: /user/nicholasharding/proxy/8787/status,Cluster  Workers: 0  Cores: 0  Memory: 0 B


In [7]:
# Chromosome arms iterated over by the analysis loop (a tuple; the
# parentheses are implicit).
chromosomes = "2R", "2L", "3R", "3L", "X"

In [8]:
# Every sample set in the release; passed to the metadata and genotype loaders.
sample_sets = v3.all_sample_sets

In [9]:
# Metadata table for all sample sets; it is grouped by the
# `species_gambcolu_arabiensis` column further down to define the sub-populations.
meta = v3.load_sample_set_metadata(sample_sets)

distributed.scheduler - INFO - Register tcp://10.32.48.6:33085
distributed.scheduler - INFO - Starting worker compute stream, tcp://10.32.48.6:33085
distributed.core - INFO - Starting established connection
distributed.scheduler - INFO - Register tcp://10.32.151.4:35483
distributed.scheduler - INFO - Starting worker compute stream, tcp://10.32.151.4:35483
distributed.core - INFO - Starting established connection
distributed.scheduler - INFO - Register tcp://10.32.43.6:38595
distributed.scheduler - INFO - Starting worker compute stream, tcp://10.32.43.6:38595
distributed.core - INFO - Starting established connection
distributed.scheduler - INFO - Register tcp://10.35.193.10:43779
distributed.scheduler - INFO - Starting worker compute stream, tcp://10.35.193.10:43779
distributed.core - INFO - Starting established connection
distributed.scheduler - INFO - Register tcp://10.32.113.3:41445
distributed.scheduler - INFO - Starting worker compute stream, tcp://10.32.113.3:41445
distributed.cor

In [10]:
# Sanity check on the size of the metadata table (rows, columns).
meta.shape

(3081, 18)

In [11]:
# Population label -> selection expression on the
# `species_gambcolu_arabiensis` metadata column.
# NOTE(review): the expressions look like pandas query strings, but in the
# visible cells only the keys are used (to build the index of the count table).
# Note the key is "arab" while the column value is 'arabiensis'.
pop_definitions = {
    "gamb_colu": "species_gambcolu_arabiensis == 'gamb_colu'",
    "arab": "species_gambcolu_arabiensis == 'arabiensis'",
    "all": "species_gambcolu_arabiensis != 'NA'"
}

In [12]:
## 1. All samples, gamb_colu_arab (g_c_a) mask

In [13]:
# Table of site counts indexed by (population, chromosome).
# Initialise explicitly with zeros: when no data is given pandas fills the
# frame with NaN, which int64 cannot represent, so the requested integer dtype
# would not be honoured.
count_seg_df = pd.DataFrame(
    0,
    index=pd.MultiIndex.from_product([list(pop_definitions), chromosomes]),
    columns=["segregating", "multiallelic", "total"],
    dtype=np.int64)

In [14]:
def get_allelism(block):
    """Return the per-row allelism of a block of allele counts.

    `block` is wrapped in an `allel.AlleleCountsArray` and the result of its
    `allelism()` method is returned as a column vector of shape (n, 1).
    """
    counts = allel.AlleleCountsArray(block)
    return counts.allelism().reshape((-1, 1))

In [15]:
# Map each value of `species_gambcolu_arabiensis` to the integer row positions
# of its samples in `meta`; passed to `count_alleles_subpops` in the main loop.
subpops = meta.groupby("species_gambcolu_arabiensis").indices

In [16]:
def report_window_values(positions, loc, a, a_labels, path, path_total,
                         window_size=100_000):
    """Sum per-site values over accessible sites, per window and in total.

    Both summaries are written to CSV.

    Parameters
    ----------
    positions : array-like
        Genomic position of each site, aligned with `loc` and the rows of `a`.
        Must be sorted (it is wrapped in `allel.SortedIndex`).
    loc : array-like of bool
        Per-site mask; only sites where it is True are counted.
    a : array-like, shape (n_sites, n_columns)
        Per-site values to sum, one column per entry of `a_labels`.
    a_labels : sequence of str
        Column labels for `a`.
    path : str
        Destination CSV for the per-window sums.
    path_total : str
        Destination CSV for the overall sums plus a "total_accessible_bases"
        row holding `loc.sum()`.
    window_size : int, optional
        Number of accessible sites in each window (default 100,000).

    Returns
    -------
    pandas.DataFrame
        Per-window sums, indexed by (start, stop) genomic position.
    """
    # Restrict values and positions to the accessible sites.
    arr_loc = np.compress(loc, a, axis=0)
    pos_loc = allel.SortedIndex(np.compress(loc, positions, axis=0))

    # Build the windows from the genomic positions of the accessible sites so
    # that the window bounds share a coordinate system with `pos_loc`.
    # `equally_accessible_windows(loc, ...)` would instead treat the *index*
    # into `loc` as the genomic coordinate, which is only correct when `loc`
    # has exactly one entry per base of the contig.
    eqa_windows = np.asarray(
        allel.stats.window.moving_statistic(
            np.asarray(pos_loc), lambda v: [v[0], v[-1]], size=window_size)
    ).reshape(-1, 2)

    # TODO: the trailing partial window (fewer than `window_size` sites) is
    # dropped; add it, together with a column giving the number of sites in
    # each window.

    output = np.zeros((eqa_windows.shape[0], arr_loc.shape[1]), dtype=np.int64)

    for ix in range(arr_loc.shape[1]):

        val, windows, counts = allel.stats.window.windowed_statistic(
            pos_loc, arr_loc[:, ix], np.sum, windows=eqa_windows)

        output[:, ix] = val

    df = pd.DataFrame(
        output,
        index=pd.MultiIndex.from_arrays(
            [eqa_windows[:, 0], eqa_windows[:, 1]], names=["start", "stop"]),
        columns=a_labels)

    df.to_csv(path)

    # now totals:
    tot = pd.Series(
        data=arr_loc.sum(axis=0), index=a_labels, dtype=np.int64, name="count")

    tot.loc["total_accessible_bases"] = loc.sum()

    tot.to_csv(path_total)

    return df

In [17]:
def _lazy_allelism(counts):
    """Return the lazy, block-wise allelism of a dask allele-counts array.

    `get_allelism` is mapped over the blocks of `counts`; nothing is evaluated
    until the result is computed.
    """
    return da.map_blocks(
        get_allelism,
        counts,
        chunks=(counts.chunks[0],),
        drop_axis=1)


# For every chromosome: compute per-site allelism for each species group and
# for the pooled cohort, derive segregating / multiallelic / private / shared
# flags, and write windowed and total counts to CSV under each mask.
for chrom in chromosomes:

    g = v3.load_sample_set_calldata(
        chrom, sample_set=sample_sets, field="GT")

    ac = allel.GenotypeDaskArray(g).count_alleles_subpops(subpops)

    # Per-group allelism, still lazy at this point.
    lazy_allelism = {c: _lazy_allelism(ac[c].values) for c in ac.keys()}

    # leverage sum to compute allelism over entire cohort
    ac_all = da.stack([ac[c].values for c in ac.keys()], axis=2).sum(axis=2)
    lazy_allelism["all"] = _lazy_allelism(ac_all)

    # Evaluate all arrays in ONE dask.compute call so that tasks shared between
    # them (loading genotypes, counting alleles) run once instead of once per
    # array. get_allelism returns column vectors, so each result has shape
    # (n_sites, 1) and np.hstack below lays the flags side by side as columns.
    groups = list(lazy_allelism)
    allelism = dict(zip(
        groups, dask.compute(*(lazy_allelism[k] for k in groups))))

    # first look at *all* samples.
    # arrays that describe:
    # seg in union
    # multi in union
    is_seg_union = allelism["all"] > 1
    is_mta_union = allelism["all"] > 2

    # seg in gamb_col
    # multi in gamb_col
    is_seg_gambcolu = allelism["gamb_colu"] > 1
    is_mta_gambcolu = allelism["gamb_colu"] > 2

    # seg in arab
    # multi in arab
    is_seg_arab = allelism["arabiensis"] > 1
    is_mta_arab = allelism["arabiensis"] > 2

    # these 4 are mutually exclusive.
    # seg in both
    is_seg_both = is_seg_arab & is_seg_gambcolu
    # priv to gamb_colu
    is_priv_gambcolu = is_seg_gambcolu & ~is_seg_arab
    # priv to arab
    is_priv_arab = ~is_seg_gambcolu & is_seg_arab
    # segregating in the pooled cohort but within neither group on its own.
    # NOTE(review): the pooled counts sum over every group in `subpops`, so if
    # there are groups other than gamb_colu/arabiensis this also flags sites
    # that segregate only because of those samples -- confirm this is intended.
    is_fixed_diff = (~is_seg_gambcolu & ~is_seg_arab) & (is_seg_union)

    pos = v3.load_variants(chrom)
    union_mask = v3.load_mask(chrom, "gamb_colu_arab").compute()

    arr_labels = [
        "is_segregating_gambcoluarab", "is_multiallelic_gambcoluarab",
        "is_segregating_both", "is_priv_gambcolu", "is_priv_arab", "is_fixed_diff"]

    arr = np.hstack(
        [is_seg_union, is_mta_union,
         is_seg_both, is_priv_gambcolu,
         is_priv_arab, is_fixed_diff])

    # Cross-group comparison, restricted to the gamb_colu_arab mask.
    _ = report_window_values(
        positions=pos, loc=union_mask, a=arr, a_labels=arr_labels,
        path= f"../content/tables/snp_discovery/gambcoluarab_{chrom}_windows.csv",
        path_total=f"../content/tables/snp_discovery/gambcoluarab_{chrom}_tot.csv")

    # Per-group reports: each group's own flags under that group's own mask.
    arr_labels = ["is_segregating", "is_multiallelic"]
    for label, arr in zip(
        ["gamb_colu", "arab"],
        [np.hstack([is_seg_gambcolu, is_mta_gambcolu]),
         np.hstack([is_seg_arab, is_mta_arab])]):

        mask = v3.load_mask(chrom, label).compute()

        _ = report_window_values(
            positions=pos,
            loc=mask,
            a=arr,
            a_labels=arr_labels,
            path=f"../content/tables/snp_discovery/{label}_{chrom}_windows.csv",
            path_total=f"../content/tables/snp_discovery/{label}_{chrom}_tot.csv")

distributed.utils_perf - INFO - full garbage collection released 22.60 MB from 26 reference cycles (threshold: 10.00 MB)
distributed.core - INFO - Event loop was unresponsive in Scheduler for 12.34s.  This is often caused by long-running GIL-holding functions or moving large chunks of data. This can cause timeouts and instability.
distributed.core - INFO - Event loop was unresponsive in Scheduler for 5.59s.  This is often caused by long-running GIL-holding functions or moving large chunks of data. This can cause timeouts and instability.
distributed.core - INFO - Event loop was unresponsive in Scheduler for 15.21s.  This is often caused by long-running GIL-holding functions or moving large chunks of data. This can cause timeouts and instability.
distributed.core - INFO - Event loop was unresponsive in Scheduler for 5.62s.  This is often caused by long-running GIL-holding functions or moving large chunks of data. This can cause timeouts and instability.
distributed.core - INFO - Event l

In [18]:
# Switch the cluster to adaptive scaling now that the heavy computation is
# done; the scheduler log below shows the idle workers being retired.
cluster.adapt()

<distributed.deploy.adaptive.Adaptive at 0x7fd6d7c2fb10>

distributed.scheduler - INFO - Retire worker names (0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29)
distributed.scheduler - INFO - Retire workers {<Worker 'tcp://10.32.152.4:37659', memory: 0, processing: 0>, <Worker 'tcp://10.32.61.6:43103', memory: 0, processing: 0>, <Worker 'tcp://10.32.50.6:45547', memory: 0, processing: 0>, <Worker 'tcp://10.32.56.6:37683', memory: 0, processing: 0>, <Worker 'tcp://10.32.151.4:35483', memory: 0, processing: 0>, <Worker 'tcp://10.32.55.6:45985', memory: 0, processing: 0>, <Worker 'tcp://10.32.59.6:38797', memory: 0, processing: 0>, <Worker 'tcp://10.32.42.6:35077', memory: 0, processing: 0>, <Worker 'tcp://10.32.70.5:42785', memory: 0, processing: 0>, <Worker 'tcp://10.32.66.5:45791', memory: 0, processing: 0>, <Worker 'tcp://10.32.68.5:43315', memory: 0, processing: 0>, <Worker 'tcp://10.32.43.6:38595', memory: 0, processing: 0>, <Worker 'tcp://10.35.208.8:43639', memory: 0, processing: