In [1]:
#Load modules
import zarr
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import yaml
from pathlib import Path
import allel

from dask.distributed import Client
import dask
dask.config.set(**{'array.slicing.split_large_chunks': False}) # Silence large chunk warnings
import dask.array as da
from dask import delayed, compute
from dask_gateway import Gateway
import functools
import numcodecs
from fsspec.implementations.zip import ZipFileSystem
from collections.abc import Mapping
import gcsfs
import numba
import psutil
from humanize import naturalsize

import pickle
import platform

import traceback
import logging

from pyprojroot import here
from bokeh.plotting import *
import plotly.express as px
import plotly.graph_objects as go
from plotly.validators.scatter.marker import SymbolValidator

### Connect to Google Cloud Storage (GCS)

In [None]:
# GCS file-system handle; read as a global by is_gcloud, load_filter and load_position.
gcs = gcsfs.GCSFileSystem()

In [None]:
# Sanity check: show the first three entries listed under the release bucket.
gcs.ls('vo_afun_release_master_us_central1')[:3]

### Set up data access

In [4]:
# Genotype call data live under <bucket>/v1.0/snp_genotypes/all/<sample set>.
production_root = Path('vo_afun_release_master_us_central1')
vo_afun_staging = production_root / 'v1.0'
sampleset_staging_dir = vo_afun_staging / 'snp_genotypes' / 'all'

# Two alternative site-filter stores ("dt" = decision tree, "sc" = static — presumably; confirm naming).
genomic_positions_site_filter_dt_data_cloud_zarr_dir = 'vo_afun_release_master_us_central1/v1.0/site_filters/dt_20200416/funestus'
genomic_positions_site_filter_sc_data_cloud_zarr_dir = 'vo_afun_release_master_us_central1/v1.0/site_filters/sc_20220908/funestus'

# The list of sample sets to analyse comes from the repository's analysis config.
repo_clone_path = here()
release_config_path = repo_clone_path / 'analysis' / 'config.yml'

# BaseLoader parses every scalar as a plain string.
with open(release_config_path) as fh:
    config = yaml.load(fh, Loader=yaml.BaseLoader)

samplesets = config["sample_sets"]

In [None]:
# Sample metadata table; the path is relative to the notebook's working directory.
meta = pd.read_csv("../../metadata/supp1_tab2.csv")
meta.columns

### Functions

In [6]:
def load_single_field(zarr_path, internal_path, sset, exclude_males=False, samples=None):
    """Open one array of a zarr group as a dask array with at least two dimensions.

    Parameters
    ----------
    zarr_path : pathlib.Path
        Location of the zarr group; resolved to a store through ``is_gcloud``.
    internal_path : str
        Key of the array inside the group. For call data this has the form
        ``chrom/calldata/field``.
    sset, exclude_males, samples
        Accepted for interface compatibility but not used by this implementation.

    Returns
    -------
    dask.array.Array
        The stored array, except that a 1-D array is returned as a single row
        of shape ``(1, n)``.
    """
    store = is_gcloud(zarr_path)
    root = zarr.group(store, overwrite=False)
    arr = da.from_zarr(root[internal_path])

    # Promote vectors to one row so that callers can always concatenate along axis 1.
    return arr.reshape((1, -1)) if arr.ndim == 1 else arr

In [7]:
def concatenate_along_axis(base_dir, internal_path, req_samplesets):
    """Load the same field from several sample sets and join the pieces along axis 1.

    Parameters
    ----------
    base_dir : pathlib.Path
        Directory that holds one zarr group per sample set.
    internal_path : str
        Key of the array to read inside every sample-set group.
    req_samplesets : iterable of str
        Sample-set names, in the order the pieces should be concatenated.

    Returns
    -------
    dask.array.Array
        The per-sample-set arrays concatenated along axis 1; chunking is
        whatever ``load_single_field`` returns for each piece.
    """
    per_sampleset = []
    for sampleset in req_samplesets:
        piece = load_single_field(base_dir / sampleset, internal_path, sampleset)
        per_sampleset.append(piece)

    return da.concatenate(per_sampleset, axis=1)

In [8]:
def is_gcloud(path):
    """Turn ``path`` into something zarr can open.

    Returns ``gcs.get_mapper(path.as_posix())`` when a global ``gcs`` file
    system exists. If looking it up raises ``NameError`` (no ``gcs`` defined
    in this session), the POSIX string form of ``path`` is returned instead.
    """
    try:
        store = gcs.get_mapper(path.as_posix())
    except NameError:
        # No global `gcs` in this session: fall back to the plain path string.
        store = path.as_posix()
    return store

In [9]:
def load_filter(chrom, filter_dir = genomic_positions_site_filter_sc_data_cloud_zarr_dir):
    """Return the ``variants/filter_pass`` array of one contig as a dask array.

    Parameters
    ----------
    chrom : str
        Contig key inside the site-filter zarr group.
    filter_dir : str
        GCS location of the site-filter zarr group; opened read-only through
        the global ``gcs`` file system.
    """
    filter_root = zarr.Group(gcs.get_mapper(filter_dir), read_only=True)
    pass_flags = filter_root[chrom]['variants/filter_pass']
    return da.from_zarr(pass_flags)

In [10]:
def load_position(chrom, sites_dir='gs://vo_afun_release/v1.0/snp_genotypes/all/sites'):
    """Read the ``variants/POS`` array of one contig fully into memory.

    Parameters
    ----------
    chrom : str
        Contig key inside the sites zarr group.
    sites_dir : str, optional
        GCS location of the sites zarr group. The default is the location this
        function has always used.
        NOTE(review): the default bucket (``vo_afun_release``) differs from the
        ``vo_afun_release_master_us_central1`` bucket used for genotypes and
        site filters elsewhere in this notebook — confirm that is intended.

    Returns
    -------
    The result of slicing the stored array with ``[:]`` (the whole array,
    loaded eagerly rather than wrapped in dask).
    """
    root = zarr.open(gcs.get_mapper(sites_dir), mode='r')
    return root[chrom]['variants/POS'][:]

In [13]:
def read_in_genotypes(
    chrom,
    pos_min=None,
    pos_max=None,
    samples_idx=None,
    samplesets=samplesets,
    filter_dir=genomic_positions_site_filter_sc_data_cloud_zarr_dir,
):
    """Return the genotypes of one contig restricted to filter-passing sites in a window.

    Parameters
    ----------
    chrom : str
        Contig key used for the genotype, site-filter and position stores.
    pos_min, pos_max : int or None
        Inclusive window bounds. ``None`` falls back to the smallest / largest
        position loaded for the contig.
    samples_idx : index-like or None
        If given, passed to ``take(..., axis=1)`` to select samples.
    samplesets : sequence of str
        Sample sets whose genotype calls are concatenated along axis 1.
    filter_dir : str
        Location of the site-filter zarr group handed to ``load_filter``.

    Returns
    -------
    allel.GenotypeDaskArray
        Genotypes compressed along axis 0 to sites that pass the filter and
        fall inside ``[pos_min, pos_max]``.
    """
    calls = concatenate_along_axis(sampleset_staging_dir, f"{chrom}/calldata/GT", samplesets)
    gt = allel.GenotypeDaskArray(calls)

    is_accessible = load_filter(chrom, filter_dir)

    # Positions are loaded eagerly so that open-ended bounds can be resolved.
    pos = load_position(chrom)
    if pos_min is None:
        pos_min = pos.min()
    if pos_max is None:
        pos_max = pos.max()
    in_window = (pos >= pos_min) & (pos <= pos_max)

    if samples_idx is not None:
        gt = gt.take(samples_idx, axis=1)

    keep = is_accessible & in_window
    return gt.compress(keep, axis=0)

In [26]:
def compute_number_variants_reading_frame(
    chrom,
    arm,
    pos_min=None,
    pos_max=None,
    samples_idx=None,
    samplesets=samplesets,
    filter_dir=genomic_positions_site_filter_sc_data_cloud_zarr_dir,
):
    """Count segregating sites and retained sites in one window of a contig.

    The window's genotypes come from ``read_in_genotypes``; ``arm`` is used
    only in the progress message. Allele counts are taken with
    ``max_allele=3``.

    Returns
    -------
    tuple
        ``(n_segregating, n_sites)``: the result of ``count_segregating()`` on
        the allele counts, and the number of rows of the filtered genotype
        array.
    """
    window_gt = read_in_genotypes(
        chrom, pos_min, pos_max, samples_idx, samplesets, filter_dir
    )
    print(f"read in genotypes on {arm}")

    allele_counts = window_gt.count_alleles(max_allele=3)
    n_segregating = allele_counts.count_segregating()
    n_sites = window_gt.shape[0]

    return n_segregating, n_sites
    

In [41]:
def compute_number_multiallelic_reading_frame(
    chrom,
    arm,
    pos_min=None,
    pos_max=None,
    samples_idx=None,
    samplesets=samplesets,
    filter_dir=genomic_positions_site_filter_sc_data_cloud_zarr_dir,
):
    """Count the sites in one window of a contig that carry three or more alleles.

    The window's genotypes come from ``read_in_genotypes``; ``arm`` is used
    only in the progress message. Allele counts are taken with
    ``max_allele=3`` and a site is counted when its allelism is at least 3.

    Returns
    -------
    Number of such sites (a NumPy integer scalar).
    """
    window_gt = read_in_genotypes(
        chrom, pos_min, pos_max, samples_idx, samplesets, filter_dir
    )
    print(f"read in genotypes on {arm}")

    allele_counts = window_gt.count_alleles(max_allele=3)
    n_alleles_per_site = allele_counts.allelism().compute()

    return np.sum(n_alleles_per_site >= 3)

In [24]:
def compute_number_variants(df, df_called, colname, samples_idx=None, samplesets=samplesets,
                            filter_dir=genomic_positions_site_filter_sc_data_cloud_zarr_dir):
    """Accumulate per-arm counts of segregating and retained sites into two tables.

    Each chromosome arm is processed in one or more position windows via
    ``compute_number_variants_reading_frame``; the window results are added to
    ``df.loc[arm, colname]`` (segregating sites) and
    ``df_called.loc[arm, colname]`` (sites kept after filtering).

    Parameters
    ----------
    df, df_called : pandas.DataFrame
        Tables indexed by arm ('2R', '2L', '3R', '3L', 'X') that already
        contain ``colname``. Both are modified in place. Counts are *added* to
        the existing values, so the column should start at zero; calling this
        twice for the same column double-counts.
    colname : str
        Column that receives the counts.
    samples_idx, samplesets, filter_dir
        Forwarded unchanged to ``compute_number_variants_reading_frame``.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(df, df_called)``, the same objects that were passed in.

    Side effects
    ------------
    After each arm both tables are written to ``total_number_of_variants.csv``
    and ``total_number_of_called_sites.csv`` in the working directory, and a
    progress line is printed.
    """
    # (arm label, contig, inclusive (start, end) windows); end=None means "up to the last position".
    arm_windows = (
        ('2R', '2RL', ((1, 20_000_000), (20_000_001, 40_000_000), (40_000_001, 57_335_000))),
        ('2L', '2RL', ((57_335_001, 80_000_000), (80_000_001, None))),
        ('3R', '3RL', ((1, 20_000_000), (20_000_001, 44_700_000))),
        ('3L', '3RL', ((44_700_001, 65_000_000), (65_000_001, None))),
        ('X', 'X', ((1, None),)),
    )

    for arm, contig, windows in arm_windows:
        for start, end in windows:
            seg, called = compute_number_variants_reading_frame(
                contig, arm, pos_min=start, pos_max=end,
                samples_idx=samples_idx, samplesets=samplesets,
                filter_dir=filter_dir)
            df.loc[arm, colname] += seg
            df_called.loc[arm, colname] += called
        # Checkpoint after every arm so partial results survive an interrupted run.
        df.to_csv("total_number_of_variants.csv")
        df_called.to_csv("total_number_of_called_sites.csv")
        print(f"Arm {arm} done")

    return df, df_called
    

In [45]:
def compute_number_multiallelic(df, colname, samples_idx=None, samplesets=samplesets,
                                filter_dir=genomic_positions_site_filter_sc_data_cloud_zarr_dir):
    """Accumulate per-arm counts of multiallelic sites into a table.

    Each chromosome arm is processed in one or more position windows via
    ``compute_number_multiallelic_reading_frame``; the window results are
    added to ``df.loc[arm, colname]``.

    Parameters
    ----------
    df : pandas.DataFrame
        Table indexed by arm ('2R', '2L', '3R', '3L', 'X') that already
        contains ``colname``. Modified in place. Counts are *added* to the
        existing values, so the column should start at zero; calling this
        twice for the same column double-counts.
    colname : str
        Column that receives the counts.
    samples_idx, samplesets, filter_dir
        Forwarded unchanged to ``compute_number_multiallelic_reading_frame``.

    Returns
    -------
    pandas.DataFrame
        ``df``, the same object that was passed in.

    Side effects
    ------------
    After each arm the table is written to
    ``number_of_multiallelic_variants.csv`` in the working directory, and a
    progress line is printed.
    """
    # (arm label, contig, inclusive (start, end) windows); end=None means "up to the last position".
    arm_windows = (
        ('2R', '2RL', ((1, 20_000_000), (20_000_001, 40_000_000), (40_000_001, 57_335_000))),
        ('2L', '2RL', ((57_335_001, 80_000_000), (80_000_001, None))),
        ('3R', '3RL', ((1, 20_000_000), (20_000_001, 44_700_000))),
        ('3L', '3RL', ((44_700_001, 65_000_000), (65_000_001, None))),
        ('X', 'X', ((1, None),)),
    )

    for arm, contig, windows in arm_windows:
        for start, end in windows:
            seg = compute_number_multiallelic_reading_frame(
                contig, arm, pos_min=start, pos_max=end,
                samples_idx=samples_idx, samplesets=samplesets,
                filter_dir=filter_dir)
            df.loc[arm, colname] += seg
        # Checkpoint after every arm so partial results survive an interrupted run.
        df.to_csv("number_of_multiallelic_variants.csv")
        print(f"Arm {arm} done")

    return df

In [None]:
# Reorder the metadata rows to follow the sample order of the concatenated genotype arrays,
# then restore a 0..n-1 index so that row labels double as positions along the sample axis.
sample_order = (
    concatenate_along_axis(sampleset_staging_dir, "samples", samplesets)
    .compute()[0]
    .astype(str)
)
meta = meta.set_index('VBS_sample_id').loc[sample_order].reset_index()
meta.head()

### Set up dask cluster

In [15]:
# Create a Dask Gateway cluster whose conda environment name is derived from CONDA_PREFIX.
gateway = Gateway()
conda_prefix = os.environ["CONDA_PREFIX"]
# NOTE(review): takes the sixth '/'-separated component of CONDA_PREFIX as the environment name;
# this depends on the exact directory depth of the prefix — confirm for the deployment in use.
current_environment = 'global/'+conda_prefix.split('/')[5]
cluster = gateway.new_cluster(
    profile='standard', 
    conda_environment = current_environment,
)
cluster

VBox(children=(HTML(value='<h2>GatewayCluster</h2>'), HBox(children=(HTML(value='\n<div>\n<style scoped>\n    …

In [16]:
# Obtain a client for the cluster created above.
client=cluster.get_client()

In [17]:
# Ask the cluster to scale to 50 workers.
cluster.scale(50)

### Compute

In [20]:
# Zero-initialised accumulator for segregating-site counts:
# one row per chromosome arm, one column per sample-subset / site-filter combination.
df = pd.DataFrame(
    0.0,
    index=['2R', '2L', '3R', '3L', 'X'],
    columns=['all_sc', 'all_dt', 'hicov_sc', 'hicov_dt'],
)
df

Unnamed: 0,all_sc,all_dt,hicov_sc,hicov_dt
2R,0.0,0.0,0.0,0.0
2L,0.0,0.0,0.0,0.0
3R,0.0,0.0,0.0,0.0
3L,0.0,0.0,0.0,0.0
X,0.0,0.0,0.0,0.0


In [22]:
# Zero-initialised accumulator for the number of sites retained after filtering:
# one row per chromosome arm, one column per sample-subset / site-filter combination.
df_called = pd.DataFrame(
    0.0,
    index=['2R', '2L', '3R', '3L', 'X'],
    columns=['all_sc', 'all_dt', 'hicov_sc', 'hicov_dt'],
)
df_called

Unnamed: 0,all_sc,all_dt,hicov_sc,hicov_dt
2R,0.0,0.0,0.0,0.0
2L,0.0,0.0,0.0,0.0
3R,0.0,0.0,0.0,0.0
3L,0.0,0.0,0.0,0.0
X,0.0,0.0,0.0,0.0


In [27]:
# All samples, default (sc) site filter. Counts are added to the 'all_sc' column,
# so re-running this cell without resetting the tables double-counts.
df, df_called = compute_number_variants(df, df_called, 'all_sc')

read in genotypes on 2R
read in genotypes on 2R
read in genotypes on 2R
Arm 2R done
read in genotypes on 2L
read in genotypes on 2L
Arm 2L done
read in genotypes on 3R
read in genotypes on 3R
Arm 3R done
read in genotypes on 3L
read in genotypes on 3L
Arm 3L done
read in genotypes on X
Arm X done


In [31]:
# All samples, dt site filter. Counts are added to the 'all_dt' column,
# so re-running this cell without resetting the tables double-counts.
df, df_called = compute_number_variants(df, df_called, 'all_dt', 
                                       filter_dir = genomic_positions_site_filter_dt_data_cloud_zarr_dir)

read in genotypes on 2R
read in genotypes on 2R
read in genotypes on 2R
Arm 2R done
read in genotypes on 2L
read in genotypes on 2L
Arm 2L done
read in genotypes on 3R
read in genotypes on 3R
Arm 3R done
read in genotypes on 3L
read in genotypes on 3L
Arm 3L done
read in genotypes on X
Arm X done


In [36]:
# Row labels of the samples flagged subset_2 == 'Y'. They are later passed as `samples_idx`,
# i.e. used as positions along the genotype sample axis, which assumes `meta` carries a
# 0..n-1 index in genotype order.
hi_cov_idx = meta[meta['subset_2'] == 'Y'].index
hi_cov_idx

Int64Index([  0,   1,   2,   3,   4,   5,   6,   7,   8,   9,
            ...
            646, 647, 648, 649, 650, 651, 652, 653, 654, 655],
           dtype='int64', length=619)

In [37]:
# hi_cov_idx samples only, default (sc) site filter. Counts are added to the 'hicov_sc' column,
# so re-running this cell without resetting the tables double-counts.
df, df_called = compute_number_variants(df, df_called, 'hicov_sc', samples_idx = hi_cov_idx)

read in genotypes on 2R
read in genotypes on 2R
read in genotypes on 2R
Arm 2R done
read in genotypes on 2L
read in genotypes on 2L
Arm 2L done
read in genotypes on 3R
read in genotypes on 3R
Arm 3R done
read in genotypes on 3L
read in genotypes on 3L
Arm 3L done
read in genotypes on X
Arm X done


In [38]:
# hi_cov_idx samples only, dt site filter. Counts are added to the 'hicov_dt' column,
# so re-running this cell without resetting the tables double-counts.
df, df_called = compute_number_variants(df, df_called, 'hicov_dt',  samples_idx = hi_cov_idx,
                                       filter_dir = genomic_positions_site_filter_dt_data_cloud_zarr_dir)

read in genotypes on 2R
read in genotypes on 2R
read in genotypes on 2R
Arm 2R done
read in genotypes on 2L
read in genotypes on 2L
Arm 2L done
read in genotypes on 3R
read in genotypes on 3R
Arm 3R done
read in genotypes on 3L
read in genotypes on 3L
Arm 3L done
read in genotypes on X
Arm X done


In [40]:
df

Unnamed: 0,all_sc,all_dt,hicov_sc,hicov_dt
2R,21690522.0,14399188.0,19665129.0,12915499.0
2L,20184975.0,13313064.0,18787274.0,12305596.0
3R,16819330.0,11190408.0,15293287.0,10075676.0
3L,13932157.0,9282280.0,12786616.0,8452756.0
X,7019229.0,5069166.0,6478461.0,4652724.0


In [42]:
# Zero-initialised accumulator for multiallelic-site counts:
# one row per chromosome arm, one column per sample-subset / site-filter combination.
df_mult = pd.DataFrame(
    0.0,
    index=['2R', '2L', '3R', '3L', 'X'],
    columns=['all_sc', 'all_dt', 'hicov_sc', 'hicov_dt'],
)

In [43]:
# All samples, default (sc) site filter. Counts are added to the 'all_sc' column,
# so re-running this cell without resetting df_mult double-counts.
df_mult = compute_number_multiallelic(df_mult, 'all_sc')

read in genotypes on 2R
read in genotypes on 2R
read in genotypes on 2R
Arm 2R done
read in genotypes on 2L
read in genotypes on 2L
Arm 2L done
read in genotypes on 3R
read in genotypes on 3R
Arm 3R done
read in genotypes on 3L
read in genotypes on 3L
Arm 3L done
read in genotypes on X
Arm X done


In [48]:
# All samples, dt site filter. Counts are added to the 'all_dt' column,
# so re-running this cell without resetting df_mult double-counts.
df_mult = compute_number_multiallelic(df_mult, 'all_dt', 
                                      filter_dir = genomic_positions_site_filter_dt_data_cloud_zarr_dir)

read in genotypes on 2R
read in genotypes on 2R
read in genotypes on 2R
Arm 2R done
read in genotypes on 2L
read in genotypes on 2L
Arm 2L done
read in genotypes on 3R
read in genotypes on 3R
Arm 3R done
read in genotypes on 3L
read in genotypes on 3L
Arm 3L done
read in genotypes on X
Arm X done


In [49]:
# hi_cov_idx samples only, default (sc) site filter. Counts are added to the 'hicov_sc' column,
# so re-running this cell without resetting df_mult double-counts.
df_mult = compute_number_multiallelic(df_mult, 'hicov_sc', samples_idx = hi_cov_idx)

read in genotypes on 2R
read in genotypes on 2R
read in genotypes on 2R
Arm 2R done
read in genotypes on 2L
read in genotypes on 2L
Arm 2L done
read in genotypes on 3R
read in genotypes on 3R
Arm 3R done
read in genotypes on 3L
read in genotypes on 3L
Arm 3L done
read in genotypes on X
Arm X done


In [50]:
# hi_cov_idx samples only, dt site filter. Counts are added to the 'hicov_dt' column,
# so re-running this cell without resetting df_mult double-counts.
df_mult = compute_number_multiallelic(df_mult, 'hicov_dt', samples_idx = hi_cov_idx,
                                      filter_dir = genomic_positions_site_filter_dt_data_cloud_zarr_dir)

read in genotypes on 2R
read in genotypes on 2R
read in genotypes on 2R
Arm 2R done
read in genotypes on 2L
read in genotypes on 2L
Arm 2L done
read in genotypes on 3R
read in genotypes on 3R
Arm 3R done
read in genotypes on 3L
read in genotypes on 3L
Arm 3L done
read in genotypes on X
Arm X done


In [51]:
# All distributed counts are done; shut the cluster down.
cluster.shutdown()

In [14]:
# New gateway handle, used by the next cell to find clusters that are still running.
gateway = Gateway()

In [15]:
# Shuts down every cluster the gateway lists, not only the one created in this notebook.
for report in gateway.list_clusters():
    gateway.connect(report.name).shutdown()

In [36]:
# Column sums over the arm rows only: an existing 'total' row is excluded first,
# so re-running this cell does not fold the previous total into the new one.
df.loc['total'] = df.drop(index='total', errors='ignore').sum(axis=0)

In [59]:
# Column sums over the arm rows only: an existing 'total' row is excluded first,
# so re-running this cell does not fold the previous total into the new one.
df_mult.loc['total'] = df_mult.drop(index='total', errors='ignore').sum(axis=0)

In [37]:
# Column sums over the arm rows only: an existing 'total' row is excluded first,
# so re-running this cell does not fold the previous total into the new one.
df_called.loc['total'] = df_called.drop(index='total', errors='ignore').sum(axis=0)

In [38]:
df

Unnamed: 0,all_sc,all_dt,hicov_sc,hicov_dt
2R,21690522.0,14399188.0,19665129.0,12915499.0
2L,20184975.0,13313064.0,18787274.0,12305596.0
3R,16819330.0,11190408.0,15293287.0,10075676.0
3L,13932157.0,9282280.0,12786616.0,8452756.0
X,7019229.0,5069166.0,6478461.0,4652724.0
total,79646213.0,53254106.0,73010767.0,48402251.0


In [64]:
df_mult

Unnamed: 0,all_sc,all_dt,hicov_sc,hicov_dt
2R,5257496.0,3178357.0,4453829.0,2653757.0
2L,6406145.0,3898523.0,5670264.0,3422477.0
3R,4075774.0,2449219.0,3463523.0,2053152.0
3L,3629627.0,2176746.0,3125074.0,1853371.0
X,2125512.0,1410230.0,1858582.0,1226107.0
total,21494554.0,13113075.0,18571272.0,11208864.0


In [39]:
df_called

Unnamed: 0,all_sc,all_dt,hicov_sc,hicov_dt
2R,47266449.0,33181056.0,47266449.0,33181056.0
2L,37066332.0,25648309.0,37066332.0,25648309.0
3R,36351913.0,25731711.0,36351913.0,25731711.0
3L,28367188.0,20105966.0,28367188.0,20105966.0
X,13331835.0,10094797.0,13331835.0,10094797.0
total,162383717.0,114761839.0,162383717.0,114761839.0


In [73]:
# Watterson's estimator per site, S / a_n / L: segregating sites divided by the harmonic
# number a_n = sum_{i=1}^{2N-1} 1/i and by the number of called sites.
# 2N = 656*2 — presumably 656 diploid samples; confirm against the sample count.
df.loc['total', 'all_sc'] / np.sum(1/np.arange(1,656*2)) / df_called.loc['total', 'all_sc']

0.0632378176151201

In [74]:
# Watterson's estimator per site for the hi_cov_idx subset (2N = 619*2; hi_cov_idx has length 619).
# Normalised by the matching 'hicov_sc' called-site column rather than 'all_sc'; the two columns
# hold the same values because the called-site count does not depend on the sample selection.
df.loc['total', 'hicov_sc'] / np.sum(1/np.arange(1,619*2))/ df_called.loc['total', 'hicov_sc']

0.058406731673569384

In [75]:
# Watterson's estimator per site under the dt site filter: S / a_n / L with
# a_n = sum_{i=1}^{2N-1} 1/i and 2N = 656*2 (presumably 656 diploid samples — confirm).
df.loc['total', 'all_dt'] / np.sum(1/np.arange(1,656*2)) / df_called.loc['total', 'all_dt']

0.059828735018907855

In [76]:
# Watterson's estimator per site for the hi_cov_idx subset under the dt filter (2N = 619*2).
# Normalised by the matching 'hicov_dt' called-site column rather than 'all_dt'; the two columns
# hold the same values because the called-site count does not depend on the sample selection.
df.loc['total', 'hicov_dt'] / np.sum(1/np.arange(1,619*2))/ df_called.loc['total', 'hicov_dt']

0.05478813693590849

In [79]:
# Fraction of called sites that are multiallelic, per sample-subset / site-filter column.
df_mult.loc['total'] / df_called.loc['total']

all_sc      0.132369
all_dt      0.114263
hicov_sc    0.114367
hicov_dt    0.097671
Name: total, dtype: float64

In [40]:
# Fraction of called sites that are segregating, per sample-subset / site-filter column.
df.loc['total'] / df_called.loc['total']

all_sc      0.490482
all_dt      0.464040
hicov_sc    0.449619
hicov_dt    0.421763
Name: total, dtype: float64

In [34]:
# Reload the checkpoint written by compute_number_variants instead of recomputing.
# NOTE(review): the checkpoint is written inside compute_number_variants, before any 'total' row
# is added in this notebook, so the reloaded table has only the arm rows. Run top to bottom,
# this discards the 'total' row computed earlier; later cells that index 'total' then fail.
df_called = pd.read_csv("total_number_of_called_sites.csv", index_col=0)

In [35]:
# Reload the checkpoint written by compute_number_variants instead of recomputing.
# NOTE(review): the checkpoint is written inside compute_number_variants, before any 'total' row
# is added in this notebook, so the reloaded table has only the arm rows. Run top to bottom,
# this discards the 'total' row computed earlier; later cells that index 'total' then fail.
df = pd.read_csv("total_number_of_variants.csv", index_col=0)

In [30]:
# Reload the checkpoint written by compute_number_multiallelic instead of recomputing.
# NOTE(review): the checkpoint is written inside compute_number_multiallelic, before any 'total'
# row is added in this notebook, so the reloaded table has only the arm rows.
df_mult = pd.read_csv("number_of_multiallelic_variants.csv", index_col=0)

In [13]:
# NOTE(review): `px` is never assigned in this notebook; the only binding is the plotly.express
# alias from the import cell, which has no `.shape`. The recorded output implies it held a 1-D
# array in the original session (presumably the X positions) — restore that definition.
px.shape

(22263624,)

In [15]:
# NOTE(review): `p3` is never assigned in this notebook; the recorded output implies a 1-D array
# from the original session (presumably the 3RL positions) — restore that definition.
p3.shape

(84634641,)

In [17]:
# NOTE(review): `p2` is never assigned in this notebook; the recorded output implies a 1-D array
# from the original session (presumably the 2RL positions) — restore that definition.
p2.shape

(102882611,)

In [19]:
# Total number of reference positions across the three contigs.
# NOTE(review): the original expression used `px`, `p2` and `p3`, which are never assigned in this
# notebook (`px` is the plotly.express alias). They are assumed to have been the per-contig
# position arrays, so their lengths are taken from load_position directly — confirm.
callab = sum(load_position(contig).shape[0] for contig in ('X', '2RL', '3RL'))

In [42]:
# Percentage of all reference positions that are not retained (hi-cov samples, dt filter).
# The called-site total is summed from the arm rows so this also works on a freshly reloaded
# table that has no 'total' row.
(callab - df_called['hicov_dt'].drop('total', errors='ignore').sum())/callab*100

45.29442283385259

In [41]:
# Fraction of called sites that are segregating (hi-cov samples, dt filter).
# Both totals are summed from the arm rows so this also works on freshly reloaded tables
# that have no 'total' row.
df['hicov_dt'].drop('total', errors='ignore').sum() / df_called['hicov_dt'].drop('total', errors='ignore').sum()

0.4217625947942504

In [33]:
df

Unnamed: 0,all_sc,all_dt,hicov_sc,hicov_dt
2R,21690522.0,14399188.0,19665129.0,12915499.0
2L,20184975.0,13313064.0,18787274.0,12305596.0
3R,16819330.0,11190408.0,15293287.0,10075676.0
3L,13932157.0,9282280.0,12786616.0,8452756.0
X,7019229.0,5069166.0,6478461.0,4652724.0
total,79646213.0,53254106.0,73010767.0,48402251.0
