Get cumulative numbers of variants, targets and genes for hicov geographic cohorts 
and hicov all 

## Imports

In [1]:
import pandas as pd
import allel
import collections
import numpy as np
import malariagen_data
import gcsfs
import zarr
import dask
import dask.array as da
import cython
import functools
import os

In [2]:
from dask.distributed import Client
import dask
dask.config.set(**{'array.slicing.split_large_chunks': False}) # Silence large chunk warnings
import dask.array as da
from dask import delayed, compute
from dask_gateway import Gateway
import functools
import numcodecs
from fsspec.implementations.zip import ZipFileSystem
from collections.abc import Mapping
import gcsfs
import numba
import psutil
from humanize import naturalsize


In [None]:
gcs = gcsfs.GCSFileSystem()

## Read ref genome, accessibility and annotation

### Ref genome

In [4]:
#Open the Ag3 data resource (release 3.0); it is used below to read the reference genome
#and the SNP calls
#Sequence will be read for chromosomes separately
ag3 = malariagen_data.Ag3(release='3.0')

### Annotation

In [5]:
#Uploaded local copy of annotation
#Because the release contains a preliminary version
gff_fn = 'VectorBase-65_AgambiaePEST.gff'
features = allel.FeatureTable.from_gff3(gff_fn, attributes=['ID', 'Parent'])
features

Unnamed: 0,seqid,source,type,start,end,score,strand,phase,ID,Parent,Unnamed: 11
0,AgamP4_X,VEuPathDB,protein_coding_gene,582,22703,-1.0,-,-1,AGAP000002,.,
1,AgamP4_X,VEuPathDB,mRNA,582,22533,-1.0,-,-1,AGAP000002-RA,AGAP000002,
2,AgamP4_X,VEuPathDB,mRNA,582,22703,-1.0,-,-1,AGAP000002-RB,AGAP000002,
...,...,...,...,...,...,...,...,...,...,...,...
205565,AgamP4_3L,VEuPathDB,five_prime_UTR,9463141,9463142,-1.0,+,-1,utr_AGAP029992.R846_3,AGAP029992.R846,
205566,AgamP4_3L,VEuPathDB,three_prime_UTR,9767129,9768776,-1.0,+,-1,utr_AGAP029992.R845_4,AGAP029992.R845,
205567,AgamP4_3L,VEuPathDB,three_prime_UTR,9767129,9768776,-1.0,+,-1,utr_AGAP029992.R846_4,AGAP029992.R846,


In [6]:
#Eyeball the features present
collections.Counter(features.type).most_common()

[('exon', 75811),
 ('CDS', 68943),
 ('five_prime_UTR', 18226),
 ('mRNA', 15328),
 ('protein_coding_gene', 13107),
 ('three_prime_UTR', 12677),
 ('ncRNA_gene', 729),
 ('tRNA', 362),
 ('rRNA', 242),
 ('pre_miRNA', 77),
 ('snRNA', 35),
 ('pseudogene', 9),
 ('pseudogenic_transcript', 9),
 ('ncRNA', 4),
 ('SRP_RNA', 3),
 ('snoRNA', 2),
 ('lnc_RNA', 2),
 ('RNase_P_RNA', 1),
 ('RNase_MRP_RNA', 1)]

In [7]:
# map each feature ID to the ID of its parent feature (e.g. transcript -> gene)
idx_feature_parent = dict(features[['ID', 'Parent']])
idx_feature_parent['AGAP000002-RA']

'AGAP000002'

In [8]:
#Check scaffold names
collections.Counter(features.seqid).most_common()[:10]

[('AgamP4_2R', 57832),
 ('AgamP4_2L', 48548),
 ('AgamP4_3R', 43003),
 ('AgamP4_3L', 32236),
 ('AgamP4_X', 19213),
 ('AgamP4_UNKN', 4593),
 ('AgamP4_Mt', 124),
 ('AgamP4_Y_unplaced', 19)]

In [9]:
# Keep only the features annotated on the five chromosome arms; everything on
# other seqids (e.g. AgamP4_UNKN, AgamP4_Mt, AgamP4_Y_unplaced) is dropped.
chrom_seqids = ['AgamP4_2R', 'AgamP4_2L', 'AgamP4_3R', 'AgamP4_3L', 'AgamP4_X']
loc_features_chroms = np.isin(features.seqid, chrom_seqids)
features_chroms = features[loc_features_chroms]
features_chroms

Unnamed: 0,seqid,source,type,start,end,score,strand,phase,ID,Parent,Unnamed: 11
0,AgamP4_X,VEuPathDB,protein_coding_gene,582,22703,-1.0,-,-1,AGAP000002,.,
1,AgamP4_X,VEuPathDB,mRNA,582,22533,-1.0,-,-1,AGAP000002-RA,AGAP000002,
2,AgamP4_X,VEuPathDB,mRNA,582,22703,-1.0,-,-1,AGAP000002-RB,AGAP000002,
...,...,...,...,...,...,...,...,...,...,...,...
200829,AgamP4_3L,VEuPathDB,five_prime_UTR,9463141,9463142,-1.0,+,-1,utr_AGAP029992.R846_3,AGAP029992.R846,
200830,AgamP4_3L,VEuPathDB,three_prime_UTR,9767129,9768776,-1.0,+,-1,utr_AGAP029992.R845_4,AGAP029992.R845,
200831,AgamP4_3L,VEuPathDB,three_prime_UTR,9767129,9768776,-1.0,+,-1,utr_AGAP029992.R846_4,AGAP029992.R846,


In [10]:
#Check that it worked
np.unique(features_chroms.seqid)

array(['AgamP4_2L', 'AgamP4_2R', 'AgamP4_3L', 'AgamP4_3R', 'AgamP4_X'],
      dtype=object)

In [11]:
#subset to coding sequences on the chromosomes
cdss_chroms = features_chroms[features_chroms.type == 'CDS']
cdss_chroms

Unnamed: 0,seqid,source,type,start,end,score,strand,phase,ID,Parent,Unnamed: 11
0,AgamP4_X,VEuPathDB,CDS,582,865,-1.0,-,2,AGAP000002-PA-CDS5,AGAP000002-RA,
1,AgamP4_X,VEuPathDB,CDS,582,865,-1.0,-,2,AGAP000002-PB-CDS5,AGAP000002-RB,
2,AgamP4_X,VEuPathDB,CDS,950,3120,-1.0,-,1,AGAP000002-PA-CDS4,AGAP000002-RA,
...,...,...,...,...,...,...,...,...,...,...,...
67668,AgamP4_3L,VEuPathDB,CDS,9766368,9766462,-1.0,+,2,AGAP029992.P846-CDS22,AGAP029992.R846,
67669,AgamP4_3L,VEuPathDB,CDS,9766577,9767128,-1.0,+,0,AGAP029992.P845-CDS22,AGAP029992.R845,
67670,AgamP4_3L,VEuPathDB,CDS,9766577,9767128,-1.0,+,0,AGAP029992.P846-CDS23,AGAP029992.R846,


In [12]:
#subset to protein coding genes on the chromosomes
genes_coding_chroms = features_chroms[features_chroms.type =='protein_coding_gene']
genes_coding_chroms

Unnamed: 0,seqid,source,type,start,end,score,strand,phase,ID,Parent,Unnamed: 11
0,AgamP4_X,VEuPathDB,protein_coding_gene,582,22703,-1.0,-,-1,AGAP000002,.,
1,AgamP4_X,VEuPathDB,protein_coding_gene,32382,38843,-1.0,-,-1,AGAP000005,.,
2,AgamP4_X,VEuPathDB,protein_coding_gene,83817,88773,-1.0,-,-1,AGAP000007,.,
...,...,...,...,...,...,...,...,...,...,...,...
12611,AgamP4_3R,VEuPathDB,protein_coding_gene,35431150,35594823,-1.0,-,-1,AGAP029990,.,
12612,AgamP4_2R,VEuPathDB,protein_coding_gene,36940814,37029742,-1.0,-,-1,AGAP029991,.,
12613,AgamP4_3L,VEuPathDB,protein_coding_gene,9456051,9768776,-1.0,+,-1,AGAP029992,.,


In [13]:
# double-check that you get the same set of protein coding genes via cds:
# each CDS names its transcript in `Parent`, and idx_feature_parent maps that
# transcript to its gene; the resulting gene IDs must equal the gene feature IDs
uq_genes_coding_chroms = np.unique([idx_feature_parent[t] for t in cdss_chroms.Parent])
assert set(genes_coding_chroms.ID) == set(uq_genes_coding_chroms)
len(uq_genes_coding_chroms)

12614

# Functions

In [14]:
%load_ext cython

Don't know how to safely use comments in cython - so I'll include them here

`%%cython` instructs Jupyter to compile this cell with Cython. The optional `-a` flag (not used in the cell below) additionally shows an annotated view of the compiled code.  
Next define the nucleotides as integers corresponding to their ascii value.  
`@cython.boundscheck(False)` tells cython to not perform a boundscheck -- so we have to guarantee the bounds.  
`opt_is_cas9_target()` takes as arguments a 'view' of seq (which we explicitly say cannot be None), a 'view' of is_variant and a boolean specifying whether to check reverse complements. A 'view' here is a Cython typed memoryview: a typed window onto the memory of an existing array (e.g. a NumPy array) that does not copy the data.  
Declare the `out` array, a counter `i` and booleans `fwd` and `rev`, then fill `out` with as many zeroes as `seq` is long. The dtype `'u1'` is a 1-byte (i.e. 8-bit) unsigned integer, so it matches the `uint8` type that `out` was declared with.  
Then release the GIL (it's something of a global lock on python code, and it has to be released to allow for multithreading).  
Loop through the positions in seq, taking into account the length of the target site. Check that of 21 consecutive positions, none are `N` (except the first of the PAM, which can be anything) and that the window ends in `GG`. `fwd` is `True` if this check is satisfied. In that case, and if `is_variant` was given, additionally check that `is_variant` is zero at all positions except the third last (so if we give a boolean array of where bases are segregating within the population, it gets targets that are non-segregating; but if we give a boolean array of where bases are non-accessible, it gets targets that are accessible). Repeat for the reverse complement (a window starting with `CC`) if `revcomp = True`.  
After finishing the loop, return a view of an array of booleans specifying, for each target starting position, whether it is a valid target either forwards or in reverse. 

In [15]:
%%cython

import numpy as np
cimport numpy as cnp
cimport cython

# ASCII codes of the upper-case nucleotide letters, typed as uint8 so they can be
# compared directly with the elements of a uint8 sequence view.
# Only C, G and N are referenced by the functions in this cell.
cdef:
    cnp.uint8_t A = ord('A')
    cnp.uint8_t C = ord('C')
    cnp.uint8_t G = ord('G')
    cnp.uint8_t T = ord('T')
    cnp.uint8_t N = ord('N')
    
@cython.boundscheck(False)
def opt_is_cas9_target(cnp.uint8_t[:] seq not None, cnp.uint8_t[:] is_variant, bint revcomp=True):
    """Locate Cas9 target sequences.

    Every start position i is tested against the 21-base window seq[i] .. seq[i+20].

    Forward strand: seq[i] .. seq[i+17] must not be 'N', seq[i+18] is
    unconstrained, and seq[i+19] and seq[i+20] must both be 'G'.

    Reverse strand (only when `revcomp` is true): seq[i] and seq[i+1] must both
    be 'C', seq[i+2] is unconstrained, and seq[i+3] .. seq[i+20] must not be 'N'.

    When `is_variant` is given, a window that passes the sequence test is kept
    only if `is_variant` is 0 at every window position except the unconstrained
    one (i+18 on the forward strand, i+2 on the reverse strand).

    Parameters
    ----------
    seq : uint8 memoryview, must not be None
        Sequence as byte values. Elements are compared with the codes of the
        upper-case letters 'C', 'G' and 'N' only, so lower-case input would
        neither match 'G'/'C' nor be recognised as 'N'.
    is_variant : uint8 memoryview or None
        Per-position mask; non-zero entries disqualify a window. It is indexed
        up to i+20 with bounds checking disabled, so it must be at least as
        long as `seq`.
    revcomp : bool
        Whether the reverse-strand pattern is tested as well.

    Returns
    -------
    Boolean NumPy array with one entry per position of `seq`; entry i is True
    when a target starts at i on either tested strand. The last 20 entries are
    never written and therefore stay False.
    """
    
    cdef:
        cnp.uint8_t[:] out
        Py_ssize_t i
        bint fwd, rev
        
    # one uint8 flag per sequence position, initialised to 0
    out = np.zeros(seq.shape[0], dtype='u1')
    
    with nogil:
        # stop 20 short of the end so that seq[i+20] is always a valid index
        for i in range(seq.shape[0] - 20):
            # forward strand: 18 non-N bases followed by NGG
            fwd = ((seq[i] != N) and
                   (seq[i+1] != N) and
                   (seq[i+2] != N) and
                   (seq[i+3] != N) and
                   (seq[i+4] != N) and
                   (seq[i+5] != N) and
                   (seq[i+6] != N) and
                   (seq[i+7] != N) and
                   (seq[i+8] != N) and
                   (seq[i+9] != N) and
                   (seq[i+10] != N) and
                   (seq[i+11] != N) and
                   (seq[i+12] != N) and
                   (seq[i+13] != N) and
                   (seq[i+14] != N) and
                   (seq[i+15] != N) and
                   (seq[i+16] != N) and
                   (seq[i+17] != N) and
                   # seq[i+18] may be anything
                   (seq[i+19] == G) and 
                   (seq[i+20] == G))
            # only consult the mask when the sequence test passed and a mask was given
            if fwd and is_variant is not None:
                fwd = (fwd and 
                       (is_variant[i] == 0) and
                       (is_variant[i+1] == 0) and
                       (is_variant[i+2] == 0) and
                       (is_variant[i+3] == 0) and
                       (is_variant[i+4] == 0) and
                       (is_variant[i+5] == 0) and
                       (is_variant[i+6] == 0) and
                       (is_variant[i+7] == 0) and
                       (is_variant[i+8] == 0) and
                       (is_variant[i+9] == 0) and
                       (is_variant[i+10] == 0) and
                       (is_variant[i+11] == 0) and
                       (is_variant[i+12] == 0) and
                       (is_variant[i+13] == 0) and
                       (is_variant[i+14] == 0) and
                       (is_variant[i+15] == 0) and
                       (is_variant[i+16] == 0) and
                       (is_variant[i+17] == 0) and
                       # is_variant[i+18] may be anything
                       (is_variant[i+19] == 0) and
                       (is_variant[i+20] == 0))
            if revcomp:
                # reverse strand: NGG read on the opposite strand appears here as CCN,
                # followed by 18 non-N bases
                rev = ((seq[i] == C) and
                       (seq[i+1] == C) and
                       # seq[i+2] may be anything
                       (seq[i+3] != N) and
                       (seq[i+4] != N) and
                       (seq[i+5] != N) and
                       (seq[i+6] != N) and
                       (seq[i+7] != N) and
                       (seq[i+8] != N) and
                       (seq[i+9] != N) and
                       (seq[i+10] != N) and
                       (seq[i+11] != N) and
                       (seq[i+12] != N) and
                       (seq[i+13] != N) and
                       (seq[i+14] != N) and
                       (seq[i+15] != N) and
                       (seq[i+16] != N) and
                       (seq[i+17] != N) and
                       (seq[i+18] != N) and
                       (seq[i+19] != N) and 
                       (seq[i+20] != N))
                if rev and is_variant is not None:
                    rev = (rev and 
                           (is_variant[i] == 0) and
                           (is_variant[i+1] == 0) and
                           # is_variant[i+2] may be anything
                           (is_variant[i+3] == 0) and
                           (is_variant[i+4] == 0) and
                           (is_variant[i+5] == 0) and
                           (is_variant[i+6] == 0) and
                           (is_variant[i+7] == 0) and
                           (is_variant[i+8] == 0) and
                           (is_variant[i+9] == 0) and
                           (is_variant[i+10] == 0) and
                           (is_variant[i+11] == 0) and
                           (is_variant[i+12] == 0) and
                           (is_variant[i+13] == 0) and
                           (is_variant[i+14] == 0) and
                           (is_variant[i+15] == 0) and
                           (is_variant[i+16] == 0) and
                           (is_variant[i+17] == 0) and
                           (is_variant[i+18] == 0) and
                           (is_variant[i+19] == 0) and
                           (is_variant[i+20] == 0))
            else:
                rev = False
            # a position is a target start if either strand matched
            out[i] = fwd or rev
    
    # reinterpret the uint8 flags as booleans
    return np.asarray(out).view(bool)

@cython.boundscheck(False)
def opt_all_subsequent(cython.integral[:] t, Py_ssize_t n):
    """Locate contiguous regions with the same non-zero value.

    For every start position i, the window t[i] .. t[i+n-1] is inspected. If
    t[i] is positive and all `n` entries of the window equal t[i], out[i] is
    set to that value; otherwise out[i] stays 0.

    Parameters
    ----------
    t : integer memoryview
        Labels to scan; only positive values are treated as labels.
    n : int
        Window length (the start entry plus the n-1 entries after it).

    Returns
    -------
    NumPy array shaped like `t`, holding the window's label at every position
    that starts a uniform window and 0 elsewhere. If `n` exceeds the length of
    `t`, no window fits and the result is all zeros.
    """

    cdef:
        cython.integral[:] out
        Py_ssize_t i, j, n_starts
        cython.integral x
        bint subs

    out = np.zeros_like(t)

    # The last window that still fits starts at t.shape[0] - n, so there are
    # t.shape[0] - n + 1 candidate starts (the previous bound of t.shape[0] - n
    # skipped the final one). Clamp to the array length so that n <= 0 cannot
    # push the unchecked reads of t[i] past the end of the array.
    n_starts = t.shape[0] - n + 1
    if n_starts > t.shape[0]:
        n_starts = t.shape[0]

    with nogil:
        for i in range(n_starts):
            x = t[i]
            if x > 0:
                subs = True
                for j in range(i+1, i+n):
                    if t[j] != x:
                        # one mismatch settles it; no need to scan the rest
                        subs = False
                        break
                if subs:
                    out[i] = x

    return np.asarray(out)
  
@cython.boundscheck(False)
def opt_zero_subsequent(cython.integral[:] t, Py_ssize_t n):
    """Locate non-overlapping targets.

    Scans `t` from left to right. Each positive entry that is reached is copied
    to the output and the scan then jumps `n` positions ahead, so the n-1
    entries after a kept entry are zeroed in the output.

    Parameters
    ----------
    t : integer memoryview
        Values to thin out; only positive entries are kept.
    n : int
        Minimum distance between two kept entries. Must be at least 1.

    Returns
    -------
    NumPy array shaped like `t` containing the kept entries and 0 elsewhere.

    Raises
    ------
    ValueError
        If `n` is smaller than 1. A step of 0 would never advance past a
        positive entry, and a negative step would move the unchecked index
        backwards.
    """

    cdef:
        cython.integral[:] out
        Py_ssize_t i
        cython.integral x

    if n < 1:
        raise ValueError('n must be at least 1')

    out = np.zeros_like(t)
    i = 0

    with nogil:
        while i < t.shape[0]:
            x = t[i]
            if x > 0:
                out[i] = x
                # skip the positions covered by the entry just kept
                i += n
            else:
                i += 1

    return np.asarray(out)

`opt_all_subsequent()` scans an array of integer values and reports, for each entry, whether it and the entries after it form a window of `n` entries that all share the same positive value (used to check e.g. whether a whole target lies in the same exon). If the requirement is met, the corresponding entry in `out` is set to its value in `t`. Returns the NumPy array `out`, with zeroes where the condition is not met.  
`opt_zero_subsequent()` scans an array of integer values and returns an array of integers in which the retained positive values are spaced apart by at least `n` positions.

### Python functions

In [16]:
@functools.lru_cache(maxsize=None)
def get_reference_sequence(chrom):
    """Return the upper-cased reference sequence of `chrom`.

    The sequence is read through `ag3.genome_sequence` and materialised with
    `.compute()`. Results are memoised per chromosome, so repeated calls hand
    back the same array object; callers should treat it as read-only.
    """
    raw_seq = ag3.genome_sequence(chrom).compute()
    return np.char.upper(raw_seq)

In [17]:
# Not memoised like the neighbouring helpers, but the scan is pretty quick
def get_is_cas9_target(chrom, revcomp=True, is_variant=None):
    """Flag the start positions of Cas9 targets on `chrom`.

    The cached reference sequence and, when given, the `is_variant` mask are
    reinterpreted as unsigned bytes and handed to `opt_is_cas9_target`, whose
    boolean result is returned unchanged.
    """
    seq_bytes = get_reference_sequence(chrom).view('u1')
    mask_bytes = None if is_variant is None else is_variant.view('u1')
    return opt_is_cas9_target(seq_bytes, revcomp=revcomp, is_variant=mask_bytes)

In [18]:
@functools.lru_cache(maxsize=None)
def get_cds_idx(chrom):
    """Label every reference position of `chrom` with the CDS covering it.

    Returns an integer array as long as the reference sequence. Positions
    outside any CDS hold 0; positions inside a CDS hold that CDS's row number
    in `cdss_chroms` plus one. Rows are applied in table order, so where CDS
    features overlap the later row overwrites the earlier one.
    """
    wanted_seqid = f'AgamP4_{chrom}'
    cds_labels = np.zeros_like(get_reference_sequence(chrom), dtype=int)
    rows = cdss_chroms[['seqid', 'start', 'end']]
    # labels start at 1 so that 0 can mean "not in a CDS"
    for label, (seqid, start, end) in enumerate(rows, start=1):
        if seqid != wanted_seqid:
            continue
        # start/end are treated as 1-based inclusive coordinates
        cds_labels[start-1:end] = label
    return cds_labels
    

This gives each CDS a unique index -- however, if there are overlapping CDS, the last listed one counts.

In [19]:
@functools.lru_cache(maxsize=None)
def get_gene_idx(chrom):
    """Label every reference position of `chrom` with the protein-coding gene covering it.

    Returns an integer array as long as the reference sequence. Positions
    outside any gene hold 0; positions inside a gene hold that gene's row
    number in `genes_coding_chroms` plus one. Rows are applied in table order,
    so where genes overlap the later row overwrites the earlier one.
    """
    wanted_seqid = f'AgamP4_{chrom}'
    gene_labels = np.zeros_like(get_reference_sequence(chrom), dtype=int)
    rows = genes_coding_chroms[['seqid', 'start', 'end']]
    # labels start at 1 so that 0 can mean "not in a gene"
    for label, (seqid, start, end) in enumerate(rows, start=1):
        if seqid != wanted_seqid:
            continue
        # start/end are treated as 1-based inclusive coordinates
        gene_labels[start-1:end] = label
    return gene_labels

In [20]:
@functools.lru_cache(maxsize=None)
def get_target_in_cds(chrom):
    """Flag start positions whose whole target window lies in one CDS.

    The per-position CDS labels are passed to `opt_all_subsequent` with a
    window of 21, the same window length that `opt_is_cas9_target` scans
    (positions i .. i+20). The result is True wherever that call reports a
    positive label.
    """
    cds_labels = get_cds_idx(chrom)
    window_labels = opt_all_subsequent(cds_labels, 21)
    return window_labels > 0

In [21]:
@functools.lru_cache(maxsize=1)
def _get_variant_positions(chrom):
    """Return the positions of all SNP call sites on `chrom`.

    Only the most recently requested chromosome is kept, so the per-sample
    loop does not re-read the same position array for every sample while the
    arrays of chromosomes that are already finished are released.
    """
    return ag3.snp_calls(chrom, sample_sets='3.0').variant_position.values


def get_is_variant_individual(chrom, sample_idx):
    """Locate variant sites for a given sample.

    Parameters
    ----------
    chrom : str
        Chromosome name as accepted by `ag3.snp_calls`.
    sample_idx : int-like
        Index of the sample, passed to `ag3.snp_calls` as `sample_indices`
        together with the hard-coded list of sample sets below, so it has to
        be an index relative to exactly that list of sample sets.

    Returns
    -------
    Boolean array as long as the reference sequence of `chrom`, True at every
    site where any genotype call of the sample is greater than 0. Calls equal
    to 0 or negative do not mark a site. The result itself is not memoised;
    every call returns a fresh array.
    """
    seq = get_reference_sequence(chrom)
    out = np.zeros_like(seq, dtype=bool)
    pos = _get_variant_positions(chrom)
    g = ag3.snp_calls(
        chrom,
        sample_sets=['AG1000G-AO', 'AG1000G-GW',
                     'AG1000G-BF-A', 'AG1000G-GN-A',
                     'AG1000G-CM-A', 'AG1000G-GA-A', 'AG1000G-UG', 'AG1000G-KE'],
        sample_indices=[int(sample_idx)],
    ).call_genotype
    # reduce over the two trailing axes of the genotype array so that one flag
    # per site remains
    loc = np.any(g > 0, axis=(1, 2))
    loc = loc.compute().values
    # positions are shifted by one to index into the 0-based sequence array
    out[pos[loc] - 1] = True

    return out

# Set up cluster

In [22]:
# Shut down every cluster the gateway currently lists before a new one is
# requested in the next cell.
# NOTE(review): this stops all listed clusters, not only ones started by this
# notebook — confirm nothing else of yours is still using them.
gateway = Gateway()
for cl in gateway.list_clusters():
    gateway.connect(cl.name).shutdown()

In [23]:
# Request a new cluster whose workers use the same conda environment as this kernel.
gateway = Gateway()
conda_prefix = os.environ["CONDA_PREFIX"]
# The environment name is taken as the 6th '/'-separated component of CONDA_PREFIX
# and prefixed with 'global/'.
# NOTE(review): the fixed index 5 depends on the directory layout of the platform
# this was run on — confirm before reusing elsewhere.
current_environment = 'global/'+conda_prefix.split('/')[5]
cluster = gateway.new_cluster(
    profile='standard', 
    conda_environment = current_environment,
)
cluster

VBox(children=(HTML(value='<h2>GatewayCluster</h2>'), HBox(children=(HTML(value='\n<div>\n<style scoped>\n    …

In [24]:
client=cluster.get_client()

In [25]:
cluster.scale(60)

# Run analysis

In [26]:
@functools.lru_cache(maxsize=None)
def identify_targets_population_cumulative(chrom, sample_idx, seed=None):
    """Track how targets and targeted genes shrink as samples are added.

    Samples are visited in random order. After each sample its variant sites
    are added to a running mask and the Cas9 targets inside CDS that avoid all
    masked sites are recounted.

    Parameters
    ----------
    chrom : str
        Chromosome to analyse.
    sample_idx : tuple of int
        Sample indices as understood by `get_is_variant_individual`. Must be
        hashable (e.g. a tuple) because the function is memoised.
    seed : int or None, optional
        Seed for the random sample order. The default None keeps the previous
        behaviour of an unseeded, non-reproducible order. Because results are
        memoised, calling again with the same arguments returns the stored
        result rather than drawing a new order.

    Returns
    -------
    Integer array of shape (len(sample_idx) + 1, 3). Row 0 describes the state
    before any sample is added and row k the state after k samples. Columns:
    number of variant sites accumulated, number of target start positions,
    number of distinct gene labels among those targets.
    """
    # Target start positions whose window lies within a CDS (no site filter used)
    base_loc = get_target_in_cds(chrom)
    gene_idx = get_gene_idx(chrom)

    def summarise(n_sites, is_variant):
        """Count the targets, and the genes they fall in, that avoid `is_variant`."""
        loc = base_loc & get_is_cas9_target(chrom, revcomp=True, is_variant=is_variant)
        return n_sites, np.count_nonzero(loc), len(np.unique(gene_idx[loc]))

    # initial numbers: no variation taken into account yet
    rows = [summarise(0, None)]

    # running mask of sites that are variant in any sample seen so far
    is_variant = np.zeros(gene_idx.shape[0], dtype=bool)

    # randomise the order of samples
    rng = np.random.default_rng(seed)
    shuffled_idx = rng.permutation(sample_idx)

    for i, sidx in enumerate(shuffled_idx):
        is_variant |= get_is_variant_individual(chrom, sidx)
        rows.append(summarise(np.count_nonzero(is_variant), is_variant))
        if i%10 == 0:
            print(f'Sample {i} done')

    return np.array(rows)
    
    

In [27]:
def run_analysis(chrom, pop, sample_idx):
    """Compute and save the cumulative target table for one cohort and chromosome.

    The result of `identify_targets_population_cumulative` is written to
    'cumulative_results_gam/<pop>/target_info_<chrom>.npy'. If that file
    already exists nothing is computed.

    Parameters
    ----------
    chrom : str
        Chromosome to analyse.
    pop : str
        Cohort name; used as the name of the output sub-directory.
    sample_idx : tuple of int
        Sample indices of the cohort, forwarded unchanged.
    """
    outdir = f'cumulative_results_gam/{pop}/'
    out_fn = f'{outdir}/target_info_{chrom}.npy'
    if os.path.exists(out_fn):
        print(f'{out_fn} already exists, skipping computation')
        return

    # makedirs also creates the parent 'cumulative_results_gam' when it is
    # missing, which a plain `mkdir` of the nested path would not do
    os.makedirs(outdir, exist_ok=True)

    print(f'Computing for {len(sample_idx)} samples in cohort {pop}')
    target_info = identify_targets_population_cumulative(chrom, sample_idx)
    np.save(out_fn, target_info)
    

In [28]:
#The sample sets that contain Ag1 samples
# NOTE(review): the same list is hard-coded inside get_is_variant_individual and
# get_is_variant_all; the sample indices derived below are only meaningful relative
# to this exact list, so keep the copies in sync.
sample_sets = ['AG1000G-AO', 'AG1000G-GW', 'AG1000G-BF-A', 'AG1000G-GN-A', 
               'AG1000G-CM-A', 'AG1000G-GA-A', 'AG1000G-UG', 'AG1000G-KE']
#The sample ids of Ag1 samples
sample_ids = pd.read_csv('ag1_sample_ids.csv')
# The same ids with an 'x' appended; both spellings are matched against the metadata.
# NOTE(review): presumably some of these samples carry an 'x' suffix in the Ag3
# metadata — confirm.
sample_idsx = sample_ids.sample_id + 'x'

In [29]:
# Load the metadata of the selected sample sets and keep the rows whose sample_id
# matches an Ag1 id, either as listed or with the 'x' suffix.
meta_all_sets = ag3.sample_metadata(sample_sets = sample_sets)
is_ag1_sample = (
    meta_all_sets.sample_id.isin(sample_ids.sample_id)
    | meta_all_sets.sample_id.isin(sample_idsx)
)
meta_p1 = meta_all_sets.loc[is_ag1_sample]

                                     

In [30]:
meta_p1.shape

(762, 32)

Three samples missing: 1 from Angola and 2 from Kenya. 

In [31]:
#Overwrite sample_ids to use it for sample index selection
# NOTE(review): from here on sample_ids is the sample_id column of meta_p1, no longer
# the table read from 'ag1_sample_ids.csv'. Re-running the metadata cell without
# re-reading the CSV first would likely fail on `sample_ids.sample_id` — confirm, and
# consider a separate name.
sample_ids = meta_p1.sample_id

In [32]:
# Cohort name -> row labels of the selected metadata rows. They are stored as a tuple
# because they are passed to an lru_cache-decorated function, which needs hashable arguments.
sample_idx_dict = {'phase1': tuple(meta_p1.index.values)}

In [37]:
sample_idx_dict 

{'phase1': (0,
  1,
  2,
  3,
  4,
  5,
  6,
  8,
  10,
  11,
  12,
  13,
  15,
  17,
  18,
  19,
  20,
  21,
  22,
  23,
  24,
  25,
  26,
  27,
  28,
  29,
  30,
  31,
  33,
  34,
  35,
  37,
  38,
  39,
  40,
  41,
  42,
  43,
  44,
  46,
  47,
  49,
  51,
  53,
  54,
  55,
  56,
  57,
  58,
  59,
  69,
  70,
  71,
  72,
  74,
  75,
  77,
  79,
  80,
  81,
  83,
  84,
  85,
  86,
  87,
  88,
  89,
  90,
  92,
  93,
  94,
  95,
  96,
  97,
  98,
  99,
  100,
  101,
  102,
  103,
  104,
  107,
  109,
  111,
  112,
  113,
  114,
  115,
  116,
  117,
  118,
  121,
  122,
  123,
  124,
  125,
  126,
  127,
  128,
  129,
  130,
  132,
  133,
  134,
  135,
  137,
  138,
  139,
  140,
  141,
  142,
  143,
  144,
  146,
  148,
  150,
  151,
  152,
  153,
  154,
  155,
  156,
  157,
  158,
  159,
  160,
  162,
  163,
  164,
  165,
  166,
  167,
  168,
  169,
  170,
  171,
  172,
  173,
  177,
  178,
  179,
  181,
  182,
  183,
  184,
  185,
  186,
  187,
  188,
  189,
  190,
  191,
  192,
  1

In [33]:
# Run every cohort on every chromosome; run_analysis skips combinations whose
# result file already exists.
for chrom in ['X', '3R', '3L', '2R', '2L']:
    for pop, pop_sample_idx in sample_idx_dict.items():
        run_analysis(chrom, pop, sample_idx=pop_sample_idx)
        print(f'Done for cohort {pop} on chrom {chrom}')


cumulative_results_gam/phase1//target_info_X.npy already exists, skipping computation
Done for cohort phase1 on chrom X
cumulative_results_gam/phase1//target_info_3R.npy already exists, skipping computation
Done for cohort phase1 on chrom 3R
cumulative_results_gam/phase1//target_info_3L.npy already exists, skipping computation
Done for cohort phase1 on chrom 3L
Computing for 762 samples in cohort phase1
Sample 0 done                    
Sample 10 done                   
Sample 20 done                   
Sample 30 done                   
Sample 40 done                   
Sample 50 done                   
Sample 60 done                   
Sample 70 done                   
Sample 80 done                   
                                 



Sample 90 done                   
Sample 100 done                  
Sample 110 done                  
Sample 120 done                  
Sample 130 done                  
Sample 140 done                  
Sample 150 done                  
Sample 160 done                  
Sample 170 done                  
Sample 180 done                  
Sample 190 done                  
Sample 200 done                  
Sample 210 done                  
Sample 220 done                  
Sample 230 done                  
Sample 240 done                  
Sample 250 done                  
Sample 260 done                  
Sample 270 done                  
Sample 280 done                  
Sample 290 done                  
Sample 300 done                  
Sample 310 done                  
Sample 320 done                  
Sample 330 done                  
Sample 340 done                  
Sample 350 done                  
Sample 360 done                  
Sample 370 done                  
Sample 380 don

### Sanity check total number of non-variant targets

In [32]:
def get_is_variant_all(chrom, sample_idx):
    """Locate sites that are variant in at least one of the given samples.

    `sample_idx` is passed to `ag3.snp_calls` as `sample_indices` together with
    the hard-coded list of sample sets, so the indices must be relative to that
    list. Returns a boolean array as long as the reference sequence of `chrom`,
    True at every site where any genotype call of the selected samples is
    greater than 0.
    """
    cohort_sample_sets = ['AG1000G-AO', 'AG1000G-GW',
                          'AG1000G-BF-A', 'AG1000G-GN-A',
                          'AG1000G-CM-A', 'AG1000G-GA-A', 'AG1000G-UG', 'AG1000G-KE']
    is_variant = np.zeros_like(get_reference_sequence(chrom), dtype=bool)
    positions = ag3.snp_calls(chrom, sample_sets='3.0').variant_position.values
    genotypes = ag3.snp_calls(chrom, sample_sets = cohort_sample_sets,
                              sample_indices = sample_idx).call_genotype
    # one flag per site: is any call over the two trailing axes greater than 0?
    has_alt = np.any(genotypes > 0, axis=(1,2)).compute().values
    # positions are shifted by one to index into the 0-based sequence array
    is_variant[positions[has_alt] - 1] = True
    return is_variant

In [33]:
def identify_targets_population_total(chrom, sample_idx):
    """Count targets and targeted genes before and after masking all variation.

    Unlike the cumulative analysis, the variant sites of all samples in
    `sample_idx` are obtained in a single step via `get_is_variant_all`.

    Returns an integer array of shape (2, 3). Row 0 is the state without any
    variation and row 1 the state with the variation of all samples masked.
    Columns: number of variant sites, number of target start positions, number
    of distinct gene labels among those targets.
    """
    # Target start positions within a CDS (no site filter used) and gene labels
    in_cds = get_target_in_cds(chrom)
    gene_labels = get_gene_idx(chrom)

    # Row 0: nothing masked yet
    targets_before = in_cds & get_is_cas9_target(chrom, revcomp=True)
    row_before = [
        0,
        np.count_nonzero(targets_before),
        len(np.unique(gene_labels[targets_before])),
    ]

    # Row 1: every site that is variant in any of the samples is masked
    variant_mask = np.zeros(gene_labels.shape[0], dtype=bool)
    variant_mask |= get_is_variant_all(chrom, sample_idx)
    targets_after = in_cds & get_is_cas9_target(chrom, revcomp=True, is_variant=variant_mask)
    row_after = [
        np.count_nonzero(variant_mask),
        np.count_nonzero(targets_after),
        len(np.unique(gene_labels[targets_after])),
    ]

    return np.array([row_before, row_after])

In [34]:
sample_idx = meta_p1.index.to_list()
sample_idx

[0,
 1,
 2,
 3,
 4,
 5,
 6,
 8,
 10,
 11,
 12,
 13,
 15,
 17,
 18,
 19,
 20,
 21,
 22,
 23,
 24,
 25,
 26,
 27,
 28,
 29,
 30,
 31,
 33,
 34,
 35,
 37,
 38,
 39,
 40,
 41,
 42,
 43,
 44,
 46,
 47,
 49,
 51,
 53,
 54,
 55,
 56,
 57,
 58,
 59,
 69,
 70,
 71,
 72,
 74,
 75,
 77,
 79,
 80,
 81,
 83,
 84,
 85,
 86,
 87,
 88,
 89,
 90,
 92,
 93,
 94,
 95,
 96,
 97,
 98,
 99,
 100,
 101,
 102,
 103,
 104,
 107,
 109,
 111,
 112,
 113,
 114,
 115,
 116,
 117,
 118,
 121,
 122,
 123,
 124,
 125,
 126,
 127,
 128,
 129,
 130,
 132,
 133,
 134,
 135,
 137,
 138,
 139,
 140,
 141,
 142,
 143,
 144,
 146,
 148,
 150,
 151,
 152,
 153,
 154,
 155,
 156,
 157,
 158,
 159,
 160,
 162,
 163,
 164,
 165,
 166,
 167,
 168,
 169,
 170,
 171,
 172,
 173,
 177,
 178,
 179,
 181,
 182,
 183,
 184,
 185,
 186,
 187,
 188,
 189,
 190,
 191,
 192,
 193,
 197,
 199,
 200,
 201,
 202,
 203,
 205,
 206,
 207,
 208,
 210,
 212,
 213,
 214,
 215,
 216,
 217,
 218,
 219,
 220,
 221,
 222,
 223,
 224,
 227,
 228,
 229

In [36]:
# Sanity check: compute, per chromosome, the totals for all phase1 samples at once
# and save them; chromosomes whose result file already exists are skipped.
outdir = 'results_gam_no_site_filter/phase1/'
for chrom in ['X', '3R', '3L', '2R', '2L']:
    out_fn = f'{outdir}/target_info_{chrom}.npy'
    if os.path.exists(out_fn):
        print(f'{out_fn} already exists, skipping computation')
        continue

    # makedirs also creates the parent 'results_gam_no_site_filter' when it is
    # missing, which a plain `mkdir` of the nested path would not do
    os.makedirs(outdir, exist_ok=True)

    print(f'Computing for {len(sample_idx)} samples on chrom {chrom}')
    target_info = identify_targets_population_total(chrom, sample_idx)
    np.save(out_fn, target_info)
    # last row, second column: targets left once all variation is masked
    print(f'Done for chrom {chrom}, total {target_info[-1,1]} targets')

results_gam_no_site_filter/phase1//target_info_X.npy already exists, skipping computation
Computing for 762 samples on chrom 3R
Done for chrom 3R, total 5474 targets
Computing for 762 samples on chrom 3L
Done for chrom 3L, total 7462 targets
Computing for 762 samples on chrom 2R
Done for chrom 2R, total 7183 targets
Computing for 762 samples on chrom 2L
Done for chrom 2L, total 6528 targets


In [37]:
np.load('results_gam_no_site_filter/phase1/target_info_X.npy')

array([[       0,   283999,     1059],
       [11200479,     1774,      179]])

In [38]:
1774+5474+7462+7183+6528

28421

In [39]:
cluster.shutdown()