In [1]:
import numpy as np
import allel
import pandas as pd

In [2]:
from ag3 import release_data
# Release-data handle; the cells below use it to load sample metadata,
# genotype call data and per-species site masks.
v3 = release_data()

## Overview

- how many segregating sites?

- biallelic/multiallelic

- how many undiscovered sites?

## Reporting

We want to report the headline number of SNPs: gamb_colu SNPs under the gamb_colu mask, plus arab SNPs under the arab mask.
Also report how many are private to arab and how many are private to gamb_colu.
Do this for gamb_colu vs arab and also for gamb vs colu.

Additionally for each species group, we want:
n_seg sites, n_biallelic, n_multiallelic.

## Definitions:
Where the two groups have different masks, we cannot say for certain whether a variant is private or shared.

So, if a variant is seg in gamb_colu, and masked in arab, this does not count as private to gamb_colu.

For each species generate 4 arrays: is_seg, is_multi, is_bial, is_masked (here `is_masked` is True where the site passes that species' mask, i.e. is accessible).

Then the number of seg sites discovered is: `n_seg_sites = (is_seg & is_masked).sum()`

For the group comparisons:

- _shared_: segregating and accessible in both groups.
- _private A_: segregating in A, not segregating in B; accessible in both.
- _private B_: segregating in B, not segregating in A; accessible in both.
- _total_: segregating and accessible in _either_ group.

In [3]:
from dask_kubernetes import KubeCluster
from dask.distributed import Client, progress
import dask

In [4]:
# kubernetes cluster setup: create a KubeCluster and ask it for a fixed
# number of workers

n_workers = 50
cluster = KubeCluster()
cluster.scale_up(n_workers)
# alternative left disabled: adaptive scaling between 1 and n_workers workers
#cluster.adapt(minimum=1, maximum=n_workers)
# bare expression so the notebook displays the cluster object
cluster

distributed.scheduler - INFO - Clear task state
distributed.scheduler - INFO -   Scheduler at:   tcp://10.32.119.2:35359
distributed.scheduler - INFO -   dashboard at:                     :8787


VBox(children=(HTML(value='<h2>KubeCluster</h2>'), HBox(children=(HTML(value='\n<div>\n  <style scoped>\n    .…

In [5]:
# dask client setup: connect a distributed Client to the cluster created above
client = Client(cluster)
# bare expression so the notebook displays the client summary
client

distributed.scheduler - INFO - Receive client connection: Client-eb172cde-d726-11ea-8483-820d5e2d0fd8
distributed.core - INFO - Starting established connection


0,1
Client  Scheduler: tcp://10.32.119.2:35359  Dashboard: /user/nicholasharding/proxy/8787/status,Cluster  Workers: 0  Cores: 0  Memory: 0 B


In [6]:
# Chromosome arms to process, in the order they are iterated below.
chromosomes = ("2R", "2L", "3R", "3L", "X")

In [7]:
# Sample metadata across all sample sets; its species-call columns
# (species_gambcolu_arabiensis, species_gambiae_coluzzii) define the groups below.
meta = v3.load_sample_set_metadata(v3.all_sample_sets)

distributed.scheduler - INFO - Register tcp://10.32.146.4:42713
distributed.scheduler - INFO - Starting worker compute stream, tcp://10.32.146.4:42713
distributed.core - INFO - Starting established connection
distributed.scheduler - INFO - Register tcp://10.32.188.3:43911
distributed.scheduler - INFO - Starting worker compute stream, tcp://10.32.188.3:43911
distributed.core - INFO - Starting established connection
distributed.scheduler - INFO - Register tcp://10.32.159.4:41847
distributed.scheduler - INFO - Starting worker compute stream, tcp://10.32.159.4:41847
distributed.core - INFO - Starting established connection
distributed.scheduler - INFO - Register tcp://10.32.169.4:33531
distributed.scheduler - INFO - Starting worker compute stream, tcp://10.32.169.4:33531
distributed.core - INFO - Starting established connection
distributed.scheduler - INFO - Register tcp://10.32.127.4:43269
distributed.scheduler - INFO - Starting worker compute stream, tcp://10.32.127.4:43269
distributed.c

In [8]:
# Boolean per-sample selectors derived from the species-call columns of `meta`.
is_gamb_colu = meta.species_gambcolu_arabiensis == "gamb_colu"
is_arab = meta.species_gambcolu_arabiensis == "arabiensis"
# Each selector must match its own species label: previously `is_gamb` picked
# "coluzzii" and `is_colu` picked "gambiae", which swapped every gamb/colu count.
is_gamb = meta.species_gambiae_coluzzii == "gambiae"
is_colu = meta.species_gambiae_coluzzii == "coluzzii"

# Group name -> positional sample indices, in the form passed to
# count_alleles_subpops() in the counting loop.
species_groups = {
    "gamb_colu": list(np.where(is_gamb_colu)[0]),
    "arab": list(np.where(is_arab)[0]),
    "gamb": list(np.where(is_gamb)[0]),
    "colu": list(np.where(is_colu)[0]),
}


In [9]:
# Result containers:
#   snp_counts[chrom, species_id]               -> per-group site tallies
#   intersection_counts[chrom, "<a>_vs_<b>"]    -> shared/private tallies
snp_counts = {}
intersection_counts = {}

for chrom in chromosomes:
    gt = allel.GenotypeDaskArray(
        v3.load_sample_set_calldata(chrom, v3.all_sample_sets))

    # Lazy allele counts per species group (sample indices from species_groups).
    ac = gt.count_alleles_subpops(species_groups)

    # species_id -> (pass_filter, is_seg) for this chromosome; reused both by the
    # mask fallback below and by the pairwise comparisons further down.
    holder = {}

    for species_id in species_groups:

        try:
            pass_filter = v3.load_mask(chrom, species_id).compute()
        except ValueError:
            # No dedicated mask for this group: fall back to the gamb_colu mask.
            # Reuse the copy already materialised for this chromosome when there
            # is one, rather than loading and computing it again.
            if "gamb_colu" in holder:
                pass_filter = holder["gamb_colu"][0]
            else:
                pass_filter = v3.load_mask(chrom, "gamb_colu").compute()
            print(f"for {species_id} using gamb_colu")

        # Evaluate both flags in a single dask.compute call so the shared
        # allele-count graph is executed once instead of once per flag.
        group_ac = ac[species_id]
        is_seg, is_bial = dask.compute(
            group_ac.is_segregating(), group_ac.is_biallelic())
        is_multial = is_seg & ~is_bial

        n_seg = (is_seg & pass_filter).sum()
        n_bial = (is_bial & pass_filter).sum()
        n_multial = (is_multial & pass_filter).sum()
        # NOTE(review): "fixed" here is every passing site that is not
        # segregating; presumably that also includes any site with no called
        # alleles in this group - confirm this is the intended definition.
        n_fixed = (~is_seg & pass_filter).sum()

        snp_counts[chrom, species_id] = pd.Series(
            [n_seg, n_fixed, n_bial, n_multial],
            dtype=np.int64,
            index=["n_segregating", "n_fixed", "n_biallelic", "n_multiallelic"])

        holder[species_id] = pass_filter, is_seg

    # Pairwise comparisons, following the definitions in the markdown above:
    # private/shared require the site to pass the filter in BOTH groups,
    # total requires segregating-and-passing in EITHER group.
    for pop_a, pop_b in [["gamb_colu", "arab"], ["gamb", "colu"]]:

        pass_filters_a, seg_a = holder[pop_a]
        pass_filters_b, seg_b = holder[pop_b]

        pass_both = pass_filters_a & pass_filters_b

        n_total = np.sum((pass_filters_a & seg_a) | (pass_filters_b & seg_b))
        n_priv_a = np.sum(pass_both & seg_a & ~seg_b)
        n_priv_b = np.sum(pass_both & ~seg_a & seg_b)
        n_shared = np.sum(pass_both & seg_a & seg_b)

        # n_private_1 / n_private_2 refer to pop_a / pop_b respectively.
        intersection_counts[chrom, f"{pop_a}_vs_{pop_b}"] = pd.Series(
            [n_total, n_shared, n_priv_a, n_priv_b],
            dtype=np.int64,
            index=["n_total", "n_shared", "n_private_1", "n_private_2"])


distributed.scheduler - INFO - Register tcp://10.32.170.4:42945
distributed.scheduler - INFO - Starting worker compute stream, tcp://10.32.170.4:42945
distributed.core - INFO - Starting established connection
distributed.core - INFO - Event loop was unresponsive in Scheduler for 10.30s.  This is often caused by long-running GIL-holding functions or moving large chunks of data. This can cause timeouts and instability.
distributed.scheduler - INFO - Register tcp://10.32.196.2:45769
distributed.scheduler - INFO - Starting worker compute stream, tcp://10.32.196.2:45769
distributed.core - INFO - Starting established connection
distributed.scheduler - INFO - Register tcp://10.32.212.2:40161
distributed.scheduler - INFO - Starting worker compute stream, tcp://10.32.212.2:40161
distributed.core - INFO - Starting established connection
distributed.scheduler - INFO - Register tcp://10.32.209.2:43231
distributed.scheduler - INFO - Starting worker compute stream, tcp://10.32.209.2:43231
distribute

for gamb using gamb_colu


distributed.core - INFO - Event loop was unresponsive in Scheduler for 7.09s.  This is often caused by long-running GIL-holding functions or moving large chunks of data. This can cause timeouts and instability.
distributed.core - INFO - Event loop was unresponsive in Scheduler for 6.94s.  This is often caused by long-running GIL-holding functions or moving large chunks of data. This can cause timeouts and instability.


for colu using gamb_colu


distributed.core - INFO - Event loop was unresponsive in Scheduler for 9.72s.  This is often caused by long-running GIL-holding functions or moving large chunks of data. This can cause timeouts and instability.
distributed.core - INFO - Event loop was unresponsive in Scheduler for 9.90s.  This is often caused by long-running GIL-holding functions or moving large chunks of data. This can cause timeouts and instability.
distributed.core - INFO - Event loop was unresponsive in Scheduler for 5.07s.  This is often caused by long-running GIL-holding functions or moving large chunks of data. This can cause timeouts and instability.
distributed.core - INFO - Event loop was unresponsive in Scheduler for 9.27s.  This is often caused by long-running GIL-holding functions or moving large chunks of data. This can cause timeouts and instability.
distributed.core - INFO - Event loop was unresponsive in Scheduler for 9.96s.  This is often caused by long-running GIL-holding functions or moving large ch

for gamb using gamb_colu


distributed.core - INFO - Event loop was unresponsive in Scheduler for 6.84s.  This is often caused by long-running GIL-holding functions or moving large chunks of data. This can cause timeouts and instability.
distributed.core - INFO - Event loop was unresponsive in Scheduler for 6.23s.  This is often caused by long-running GIL-holding functions or moving large chunks of data. This can cause timeouts and instability.


for colu using gamb_colu


distributed.core - INFO - Event loop was unresponsive in Scheduler for 8.49s.  This is often caused by long-running GIL-holding functions or moving large chunks of data. This can cause timeouts and instability.
distributed.core - INFO - Event loop was unresponsive in Scheduler for 8.54s.  This is often caused by long-running GIL-holding functions or moving large chunks of data. This can cause timeouts and instability.
distributed.core - INFO - Event loop was unresponsive in Scheduler for 10.09s.  This is often caused by long-running GIL-holding functions or moving large chunks of data. This can cause timeouts and instability.
distributed.core - INFO - Event loop was unresponsive in Scheduler for 5.40s.  This is often caused by long-running GIL-holding functions or moving large chunks of data. This can cause timeouts and instability.
distributed.core - INFO - Event loop was unresponsive in Scheduler for 10.16s.  This is often caused by long-running GIL-holding functions or moving large 

for gamb using gamb_colu


distributed.core - INFO - Event loop was unresponsive in Scheduler for 6.75s.  This is often caused by long-running GIL-holding functions or moving large chunks of data. This can cause timeouts and instability.
distributed.core - INFO - Event loop was unresponsive in Scheduler for 6.85s.  This is often caused by long-running GIL-holding functions or moving large chunks of data. This can cause timeouts and instability.
distributed.scheduler - INFO - Remove worker tcp://10.32.214.2:43169
distributed.core - INFO - Removing comms to tcp://10.32.214.2:43169
distributed.scheduler - INFO - Remove worker tcp://10.32.161.4:36829
distributed.core - INFO - Removing comms to tcp://10.32.161.4:36829


for colu using gamb_colu


distributed.core - INFO - Event loop was unresponsive in Scheduler for 9.58s.  This is often caused by long-running GIL-holding functions or moving large chunks of data. This can cause timeouts and instability.
distributed.scheduler - INFO - Remove worker tcp://10.32.210.2:42651
distributed.core - INFO - Removing comms to tcp://10.32.210.2:42651
distributed.scheduler - INFO - Remove worker tcp://10.32.205.2:34583
distributed.core - INFO - Removing comms to tcp://10.32.205.2:34583
distributed.core - INFO - Event loop was unresponsive in Scheduler for 9.33s.  This is often caused by long-running GIL-holding functions or moving large chunks of data. This can cause timeouts and instability.
distributed.scheduler - INFO - Remove worker tcp://10.32.202.2:37565
distributed.core - INFO - Removing comms to tcp://10.32.202.2:37565
distributed.scheduler - INFO - Remove worker tcp://10.32.140.4:33823
distributed.core - INFO - Removing comms to tcp://10.32.140.4:33823
distributed.scheduler - INFO -

for gamb using gamb_colu


distributed.core - INFO - Event loop was unresponsive in Scheduler for 5.05s.  This is often caused by long-running GIL-holding functions or moving large chunks of data. This can cause timeouts and instability.
distributed.core - INFO - Event loop was unresponsive in Scheduler for 5.67s.  This is often caused by long-running GIL-holding functions or moving large chunks of data. This can cause timeouts and instability.


for colu using gamb_colu


distributed.core - INFO - Event loop was unresponsive in Scheduler for 8.19s.  This is often caused by long-running GIL-holding functions or moving large chunks of data. This can cause timeouts and instability.
distributed.core - INFO - Event loop was unresponsive in Scheduler for 5.19s.  This is often caused by long-running GIL-holding functions or moving large chunks of data. This can cause timeouts and instability.
distributed.core - INFO - Event loop was unresponsive in Scheduler for 8.37s.  This is often caused by long-running GIL-holding functions or moving large chunks of data. This can cause timeouts and instability.
distributed.core - INFO - Event loop was unresponsive in Scheduler for 5.09s.  This is often caused by long-running GIL-holding functions or moving large chunks of data. This can cause timeouts and instability.
distributed.core - INFO - Event loop was unresponsive in Scheduler for 6.03s.  This is often caused by long-running GIL-holding functions or moving large ch

for gamb using gamb_colu




for colu using gamb_colu


distributed.core - INFO - Event loop was unresponsive in Scheduler for 5.53s.  This is often caused by long-running GIL-holding functions or moving large chunks of data. This can cause timeouts and instability.
distributed.core - INFO - Event loop was unresponsive in Scheduler for 5.76s.  This is often caused by long-running GIL-holding functions or moving large chunks of data. This can cause timeouts and instability.


In [10]:
# Stack the per-(chrom, comparison) Series into long form, then pivot so each
# statistic becomes a column indexed by (comparison, chrom).
intersection_df = (
    pd.concat(intersection_counts, names=["chrom", "comparison", "statistic"])
    .reset_index(name="value")
    .pivot_table(
        values="value",
        index=["comparison", "chrom"],
        columns="statistic",
    )
)

In [11]:
# Preview the first rows of the pivoted comparison table.
intersection_df.head()

Unnamed: 0_level_0,statistic,n_private_1,n_private_2,n_shared,n_total
comparison,chrom,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
gamb_colu_vs_arab,2L,17613665,803949,3355685,24749662
gamb_colu_vs_arab,2R,20666329,1167505,3998685,29208415
gamb_colu_vs_arab,3L,13672688,657460,2795117,19564154
gamb_colu_vs_arab,3R,17959082,874629,3829389,26151803
gamb_colu_vs_arab,X,6670271,239194,758691,11506750


In [13]:
# Stack the per-(chrom, species) Series into long form, then pivot so each
# statistic becomes a column indexed by (species, chrom).
snp_totals_df = (
    pd.concat(snp_counts, names=["chrom", "species", "statistic"])
    .reset_index(name="value")
    .pivot_table(
        values="value",
        index=["species", "chrom"],
        columns="statistic",
    )
)

In [14]:
# Preview the first rows of the pivoted per-species totals table.
snp_totals_df.head()

Unnamed: 0_level_0,statistic,n_biallelic,n_fixed,n_multiallelic,n_segregating
species,chrom,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
arab,2L,4455134,30512423,251140,4706274
arab,2R,5578527,38360553,316601,5895128
arab,3L,3769016,24401656,230278,3999294
arab,3R,5062574,31047934,333322,5395896
arab,X,1103302,11325534,39866,1143168


In [15]:
## export both summary tables as CSV, with an explicit column order

# per-species site totals
snp_totals_df.to_csv(
    path_or_buf="../content/tables/snp_totals.csv",
    columns=["n_segregating", "n_biallelic", "n_multiallelic", "n_fixed"],
)

# pairwise shared/private totals
intersection_df.to_csv(
    path_or_buf="../content/tables/snp_intersection_totals.csv",
    columns=["n_total", "n_shared", "n_private_1", "n_private_2"],
)