In [1]:
import numpy as np
import allel
import pandas as pd

In [2]:
from ag3 import release_data
v3 = release_data()

## Overview

- how many segregating sites?

- biallelic/multiallelic

- how many undiscovered sites?

## Reporting

We want to report the headline number of SNPs. So that's gamb_colu snps with gamb_colu mask, plus arab snps with arab mask. 
Also report how many are private to arab and private to gamb_colu.
Do this between gamb_colu vs arab and also gamb vs colu.

Additionally for each species group, we want:
n_seg sites, n_biallelic, n_multiallelic.

## Definitions:
Where there are different masks, cannot say for certain if private/shared.

So, if a variant is seg in gamb_colu, and masked in arab, this does not count as private to gamb_colu.

For each species generate 4 arrays: is_seg, is_multi, is_bial, is_masked.

then the number of seg sites discovered is: `n_seg_sites = (is_seg & is_masked_).sum()`

For the group comparisons:
_shared_: segregating and accessible in both groups
_private A_: segregating in A, not B. Accessible in both.
_private B_: vv above
_total_: segregating and accessible in _either_ group.

In [3]:
from dask_kubernetes import KubeCluster
from dask.distributed import Client, progress
import dask

In [4]:
# kubernetes cluster setup

n_workers = 50
cluster = KubeCluster()
cluster.scale_up(n_workers)
#cluster.adapt(minimum=1, maximum=n_workers)
cluster

distributed.scheduler - INFO - Clear task state
distributed.scheduler - INFO -   Scheduler at:  tcp://10.35.63.110:37399
distributed.scheduler - INFO -   dashboard at:                     :8787


VBox(children=(HTML(value='<h2>KubeCluster</h2>'), HBox(children=(HTML(value='\n<div>\n  <style scoped>\n    .…

In [5]:
# dask client setup
client = Client(cluster)
client

distributed.scheduler - INFO - Receive client connection: Client-583a8b1a-dc7d-11ea-8468-6e01daaaba59
distributed.core - INFO - Starting established connection


0,1
Client  Scheduler: tcp://10.35.63.110:37399  Dashboard: /user/nicholasharding/proxy/8787/status,Cluster  Workers: 0  Cores: 0  Memory: 0 B


In [6]:
chromosomes = "2R", "2L", "3R", "3L", "X"

In [7]:
meta = v3.load_sample_set_metadata(v3.all_sample_sets)

distributed.scheduler - INFO - Register tcp://10.33.137.36:34441
distributed.scheduler - INFO - Starting worker compute stream, tcp://10.33.137.36:34441
distributed.core - INFO - Starting established connection
distributed.scheduler - INFO - Register tcp://10.34.43.3:39557
distributed.scheduler - INFO - Starting worker compute stream, tcp://10.34.43.3:39557
distributed.core - INFO - Starting established connection
distributed.scheduler - INFO - Register tcp://10.34.41.3:40445
distributed.scheduler - INFO - Starting worker compute stream, tcp://10.34.41.3:40445
distributed.core - INFO - Starting established connection
distributed.scheduler - INFO - Register tcp://10.34.79.3:38461
distributed.scheduler - INFO - Starting worker compute stream, tcp://10.34.79.3:38461
distributed.core - INFO - Starting established connection
distributed.scheduler - INFO - Register tcp://10.34.52.3:46789
distributed.scheduler - INFO - Starting worker compute stream, tcp://10.34.52.3:46789
distributed.core - 

In [8]:
is_gamb_colu = meta.species_gambcolu_arabiensis == "gamb_colu"
is_arab = meta.species_gambcolu_arabiensis == "arabiensis"
is_gamb = meta.species_gambiae_coluzzii == "coluzzii"
is_colu = meta.species_gambiae_coluzzii == "gambiae"

species_groups = {
    "gamb_colu": list(np.where(is_gamb_colu)[0]),
    "arab": list(np.where(is_arab)[0]),
    "gamb": list(np.where(is_gamb)[0]),
    "colu": list(np.where(is_colu)[0])}


In [17]:
snp_counts = {}
intersection_counts = {}

dataframe_frac_non_ref = {}
dataframe_frac_seg = {}

#allele_counts = {}

for chrom in chromosomes:
    gt = allel.GenotypeDaskArray(
        v3.load_sample_set_calldata(chrom, v3.all_sample_sets))
    
    ac = gt.count_alleles_subpops(species_groups)
    
    pos = v3.load_variants(chrom).compute()
    windows = allel.stats.window.position_windows(pos, 100_000, 1, pos[-1], 100_000)
    
    # make DFs
    df_frac_seg = pd.DataFrame(data=windows, columns=["start", "stop"])
    df_frac_non_ref = pd.DataFrame(data=windows, columns=["start", "stop"])
    
    holder = {}
    
    # now to apply masks?
    for species_id in species_groups.keys():
        
        try:
            pass_filter = v3.load_mask(chrom, species_id).compute()
        except ValueError:
            pass_filter = v3.load_mask(chrom, "gamb_colu").compute()
            print(f"for {species_id} using gamb_colu")
        
        allele_counts = ac[species_id].compute()
        
        # boolean arrays
        is_seg = allele_counts.is_segregating()
        is_bial = allele_counts.is_biallelic()
        is_multial = is_seg & ~is_bial
        # pass_filter also a bool array.
        
        # compute stats within windows
        is_seg_win, _, count = allel.windowed_statistic(pos, is_seg & pass_filter, np.sum, windows=windows)
        is_bial_seg_win, _, count = allel.windowed_statistic(pos, is_bial & pass_filter, np.sum, windows=windows)
        is_multial_seg_win, _, count = allel.windowed_statistic(pos, is_multial & pass_filter, np.sum, windows=windows)
        is_fixed_win, _, count = allel.windowed_statistic(pos, ~is_seg & pass_filter, np.sum, windows=windows)
        is_access_win = is_seg_win + is_fixed_win
        
        # count the chromosome wide stats
        n_seg = np.sum(is_seg_win)
        n_bial = np.sum(is_bial_seg_win)
        n_multial = np.sum(is_multial_seg_win)
        n_fixed = np.sum(is_fixed_win)
        n_access = (n_seg + n_fixed)
        
        # calculate the fraction of sites segregating
        frac_seg = is_seg_win / is_access_win
        frac_seg = np.where(is_access_win < 10_000, np.nan, frac_seg) # where <10% accessibility mask value
        df_frac_seg[species_id] = frac_seg
        
        
        # compute fraction of all alleles that are non reference
        tot_ac, _, count = allel.windowed_statistic(pos, allele_counts.sum(axis=1), np.sum, windows=windows)
        tot_non_ref_ac, _, count = allel.windowed_statistic(pos, allele_counts[:, 1:].sum(axis=1), np.sum, windows=windows)
        frac_non_ref = tot_non_ref_ac / tot_ac
        frac_non_ref = np.where(is_access_win < 10_000, np.nan, frac_non_ref) # where <10% accessibility mask value
        df_frac_non_ref[species_id] = frac_non_ref
        
        snp_counts[chrom, species_id] = pd.Series(
            [n_seg, n_fixed, n_bial, n_multial], 
            dtype=np.int64, 
            index=["n_segregating", "n_fixed", "n_biallelic", "n_multiallelic"]) 
        
        holder[species_id] = pass_filter, is_seg    
        
    dataframe_frac_non_ref[chrom] = df_frac_non_ref
    dataframe_frac_seg[chrom] = df_frac_seg
    
    # now look at X-species shared alleles.
    # count nseg sites, num bial sites, num multi sites.
    for pop_a, pop_b in [["gamb_colu", "arab"], ["gamb", "colu"]]:

        pass_filters_a, seg_a = holder[pop_a]

        pass_filters_b, seg_b = holder[pop_b]

        # corresponding to definitions above
        n_total = np.sum((pass_filters_a & seg_a) | (pass_filters_b & seg_b))
        n_priv_a = np.sum((pass_filters_a & seg_a) & (pass_filters_b & ~seg_b))
        n_priv_b = np.sum((pass_filters_a & ~seg_a) & (pass_filters_b & seg_b))
        n_shared = np.sum((pass_filters_a & seg_a & pass_filters_b & seg_b))

        intersection_counts[chrom, f"{pop_a}_vs_{pop_b}"] = pd.Series(
            [n_total, n_shared, n_priv_a, n_priv_b], 
            dtype=np.int64,
            index=["n_total", "n_shared", "n_private_1", "n_private_2"])


distributed.core - INFO - Event loop was unresponsive in Scheduler for 11.25s.  This is often caused by long-running GIL-holding functions or moving large chunks of data. This can cause timeouts and instability.
distributed.core - INFO - Event loop was unresponsive in Scheduler for 6.00s.  This is often caused by long-running GIL-holding functions or moving large chunks of data. This can cause timeouts and instability.


for gamb using gamb_colu


distributed.core - INFO - Event loop was unresponsive in Scheduler for 8.16s.  This is often caused by long-running GIL-holding functions or moving large chunks of data. This can cause timeouts and instability.
distributed.scheduler - INFO - Remove worker tcp://10.34.128.2:44135
distributed.core - INFO - Removing comms to tcp://10.34.128.2:44135


for colu using gamb_colu


distributed.core - INFO - Event loop was unresponsive in Scheduler for 10.52s.  This is often caused by long-running GIL-holding functions or moving large chunks of data. This can cause timeouts and instability.
distributed.core - INFO - Event loop was unresponsive in Scheduler for 9.73s.  This is often caused by long-running GIL-holding functions or moving large chunks of data. This can cause timeouts and instability.
distributed.core - INFO - Event loop was unresponsive in Scheduler for 5.51s.  This is often caused by long-running GIL-holding functions or moving large chunks of data. This can cause timeouts and instability.
distributed.utils_perf - INFO - full garbage collection released 303.44 MB from 26020 reference cycles (threshold: 10.00 MB)


for gamb using gamb_colu


distributed.core - INFO - Event loop was unresponsive in Scheduler for 6.69s.  This is often caused by long-running GIL-holding functions or moving large chunks of data. This can cause timeouts and instability.


for colu using gamb_colu


distributed.core - INFO - Event loop was unresponsive in Scheduler for 8.13s.  This is often caused by long-running GIL-holding functions or moving large chunks of data. This can cause timeouts and instability.
distributed.core - INFO - Event loop was unresponsive in Scheduler for 10.89s.  This is often caused by long-running GIL-holding functions or moving large chunks of data. This can cause timeouts and instability.
distributed.core - INFO - Event loop was unresponsive in Scheduler for 5.20s.  This is often caused by long-running GIL-holding functions or moving large chunks of data. This can cause timeouts and instability.


for gamb using gamb_colu


distributed.core - INFO - Event loop was unresponsive in Scheduler for 7.32s.  This is often caused by long-running GIL-holding functions or moving large chunks of data. This can cause timeouts and instability.


for colu using gamb_colu


distributed.core - INFO - Event loop was unresponsive in Scheduler for 9.77s.  This is often caused by long-running GIL-holding functions or moving large chunks of data. This can cause timeouts and instability.
distributed.core - INFO - Event loop was unresponsive in Scheduler for 5.38s.  This is often caused by long-running GIL-holding functions or moving large chunks of data. This can cause timeouts and instability.
distributed.core - INFO - Event loop was unresponsive in Scheduler for 8.99s.  This is often caused by long-running GIL-holding functions or moving large chunks of data. This can cause timeouts and instability.


for gamb using gamb_colu


distributed.core - INFO - Event loop was unresponsive in Scheduler for 5.08s.  This is often caused by long-running GIL-holding functions or moving large chunks of data. This can cause timeouts and instability.


for colu using gamb_colu


distributed.core - INFO - Event loop was unresponsive in Scheduler for 8.57s.  This is often caused by long-running GIL-holding functions or moving large chunks of data. This can cause timeouts and instability.
distributed.core - INFO - Event loop was unresponsive in Scheduler for 6.57s.  This is often caused by long-running GIL-holding functions or moving large chunks of data. This can cause timeouts and instability.


for gamb using gamb_colu




for colu using gamb_colu


distributed.core - INFO - Event loop was unresponsive in Scheduler for 5.92s.  This is often caused by long-running GIL-holding functions or moving large chunks of data. This can cause timeouts and instability.


In [18]:
intersection_df = pd.concat(
    intersection_counts, names=["chrom", "comparison", "statistic"]).reset_index(name="value")

intersection_df = pd.pivot_table(
    intersection_df, values="value", columns="statistic", index=["comparison", "chrom"])

In [19]:
intersection_df.head()

Unnamed: 0_level_0,statistic,n_private_1,n_private_2,n_shared,n_total
comparison,chrom,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
gamb_colu_vs_arab,2L,17613665,803949,3355685,24749662
gamb_colu_vs_arab,2R,20666329,1167505,3998685,29208415
gamb_colu_vs_arab,3L,13672688,657460,2795117,19564154
gamb_colu_vs_arab,3R,17959082,874629,3829389,26151803
gamb_colu_vs_arab,X,6670271,239194,758691,11506750


In [20]:
snp_totals_df = pd.concat(snp_counts, names=["chrom", "species", "statistic"]).reset_index(name="value")

snp_totals_df = pd.pivot_table(
    snp_totals_df, values="value", columns="statistic", index=["species", "chrom"])

In [21]:
snp_totals_df.head()

Unnamed: 0_level_0,statistic,n_biallelic,n_fixed,n_multiallelic,n_segregating
species,chrom,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
arab,2L,4455134,30512423,251140,4706274
arab,2R,5578527,38360553,316601,5895128
arab,3L,3769016,24401656,230278,3999294
arab,3R,5062574,31047934,333322,5395896
arab,X,1103302,11325534,39866,1143168


In [22]:
## write to csv

snp_totals_df.to_csv(
    "../content/tables/snp_totals.csv", 
    columns=["n_segregating", "n_biallelic", "n_multiallelic", "n_fixed"])

## write to csv
intersection_df.to_csv(
    "../content/tables/snp_intersection_totals.csv", 
    columns=["n_total", "n_shared", "n_private_1", "n_private_2"])

## Handle window - based stats

In [23]:
frac_seg_windows = pd.concat(dataframe_frac_seg, names=["chrom", "ix"]).reset_index().drop("ix", axis=1)
frac_seg_windows.head()

Unnamed: 0,chrom,start,stop,gamb_colu,arab,gamb,colu
0,2R,1,100000,0.423362,0.068882,0.18101,0.313134
1,2R,100001,200000,0.434902,0.069055,0.191402,0.322872
2,2R,200001,300000,0.411589,0.066568,0.178696,0.315792
3,2R,300001,400000,0.489416,0.086687,0.236691,0.381282
4,2R,400001,500000,0.514548,0.093772,0.250614,0.413397


In [24]:
frac_nonref_windows = pd.concat(dataframe_frac_non_ref, names=["chrom", "ix"]).reset_index().drop("ix", axis=1)
frac_nonref_windows.head()

Unnamed: 0,chrom,start,stop,gamb_colu,arab,gamb,colu
0,2R,1,100000,0.004444,0.009476,0.00404,0.00465
1,2R,100001,200000,0.005197,0.009228,0.005204,0.00523
2,2R,200001,300000,0.00695,0.00936,0.007128,0.006872
3,2R,300001,400000,0.009756,0.011938,0.009463,0.009908
4,2R,400001,500000,0.011665,0.012815,0.011478,0.011778




In [27]:
## write to csv

frac_seg_windows.to_csv(
    "../content/tables/fraction_segregating_windows.csv")

## write to csv
frac_nonref_windows.to_csv(
    "../content/tables/fraction_nonref_alleles_windows.csv")

## Headline stats for paper

In [28]:
snp_totals_df.groupby(level=0).agg(sum)

statistic,n_biallelic,n_fixed,n_multiallelic,n_segregating
species,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
arab,19968553,135648100,1171207,21139760
colu,61579247,75351982,25783728,87362975
gamb,49346025,103241758,10127174,59473199
gamb_colu,65900223,57936366,38878368,104778591


In [29]:
intersection_df.groupby(level=0).agg(sum)

statistic,n_private_1,n_private_2,n_shared,n_total
comparison,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
gamb_colu_vs_arab,76582035,3742737,14737567,111180784
gamb_vs_colu,15181111,43070887,44292088,102544086
