# FWS Calculations 

In [1]:
# NB: this notebook is based on the notebook linked below.
# Open question: are the FWS calculations meant to be run on the exact same set?

http://localhost:8888/notebooks/20200206_recalculate_fws_for_genome_wide_pairs_IBD_analysis/02062020_calculate_fws.ipynb

In [1]:
import logging
import os
import sys
from datetime import datetime

import numpy as np
import pandas as pd
from malariagen_data.pv4 import Pv4

In [2]:
# Output filenames
# Take the date stamp once so every output of a run shares the same prefix,
# even if the cell happens to execute across midnight.
results_dir = 'FWS_results'
run_date = datetime.now().strftime("%Y_%m_%d")

pop_freq_fn = f'{results_dir}/{run_date}_pop_freq.tsv'
pop_het_values_fn = f'{results_dir}/{run_date}_pop_het_values.tsv'
fws_fn = f'{results_dir}/{run_date}_fws.tsv'
logging_fn = f'{results_dir}/{run_date}_fws.log'

# logging.basicConfig opens the log file immediately, so the output directory
# must exist before it is called; create it if it is missing.
os.makedirs(results_dir, exist_ok=True)

# Log to file; replace `filename=` with `stream=sys.stdout` to log into the
# notebook output instead.
logging.basicConfig(format='%(asctime)s | %(levelname)s : %(message)s',
                    level=logging.INFO,
                    filename=logging_fn)

## Load Data

In [3]:
# Open the Pv4 data resource from the staging bucket.
PV4_BUCKET_URL = "gs://pv4_staging/"
pv4 = Pv4(PV4_BUCKET_URL)

In [4]:
# Per-sample metadata table; preview the first five rows.
pv4_metadata = pv4.sample_metadata()
pv4_metadata.head(5)

Unnamed: 0,Sample,Study,Site,First-level administrative division,Country,Lat,Long,Year,ENA,All samples same individual,Population,% callable,QC pass,Exclusion reason,Is returning traveller
0,BBH-1-125,X0009-PV-ET-LO,Jimma,Ethiopia: Oromia,Ethiopia,7.683331,36.851318,2016,ERR2678989,BBH-1-125,AF,88.52,True,Analysis_set,False
1,BBH_1_132,X0009-PV-ET-LO,Jimma,Ethiopia: Oromia,Ethiopia,7.683331,36.851318,2016,ERR2678991,BBH_1_132,AF,90.2,True,Analysis_set,False
2,BBH_1_137,X0009-PV-ET-LO,Jimma,Ethiopia: Oromia,Ethiopia,7.683331,36.851318,2016,ERR2679003,BBH_1_137,AF,87.09,True,Analysis_set,False
3,BBH_1_153,X0009-PV-ET-LO,Jimma,Ethiopia: Oromia,Ethiopia,7.683331,36.851318,2016,ERR2678992,BBH_1_153,AF,90.6,True,Analysis_set,False
4,BBH_1_162,X0009-PV-ET-LO,Jimma,Ethiopia: Oromia,Ethiopia,7.683331,36.851318,2016,ERR2678993,BBH_1_162,AF,91.67,True,Analysis_set,False


In [6]:
# Keep only samples whose '% callable' value is at least 50.
callable_pct = pv4_metadata['% callable']
loc_filtered_samples = (callable_pct >= 50)
pv4_metadata_50_callable = pv4_metadata.loc[loc_filtered_samples]

# Summarise what is left: table shape, then number of samples per population.
population_counts = pv4_metadata_50_callable['Population'].value_counts()
print(pv4_metadata_50_callable.shape)
print(population_counts)

(1245, 15)
ESEA          313
OCE           268
LAM           175
AF            155
WSEA          135
MSEA           85
unassigned     63
WAS            51
Name: Population, dtype: int64


In [7]:
# Distinct population labels among the retained samples.
all_populations = pv4_metadata_50_callable['Population'].unique()

## PASS bi-allelic coding SNPs, with less than 50% missingness

In [8]:
# Fetch the variant-call dataset (requested with extended=True) and display its repr.
# NOTE(review): which extra fields extended=True adds is defined by the Pv4 API — confirm there.
variant_dataset = pv4.variant_calls(extended=True)
variant_dataset

Unnamed: 0,Array,Chunk
Bytes,17.44 MiB,256.00 kiB
Shape,"(4571056,)","(65536,)"
Count,70 Tasks,70 Chunks
Type,int32,numpy.ndarray
"Array Chunk Bytes 17.44 MiB 256.00 kiB Shape (4571056,) (65536,) Count 70 Tasks 70 Chunks Type int32 numpy.ndarray",4571056  1,

Unnamed: 0,Array,Chunk
Bytes,17.44 MiB,256.00 kiB
Shape,"(4571056,)","(65536,)"
Count,70 Tasks,70 Chunks
Type,int32,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,34.87 MiB,512.00 kiB
Shape,"(4571056,)","(65536,)"
Count,70 Tasks,70 Chunks
Type,object,numpy.ndarray
"Array Chunk Bytes 34.87 MiB 512.00 kiB Shape (4571056,) (65536,) Count 70 Tasks 70 Chunks Type object numpy.ndarray",4571056  1,

Unnamed: 0,Array,Chunk
Bytes,34.87 MiB,512.00 kiB
Shape,"(4571056,)","(65536,)"
Count,70 Tasks,70 Chunks
Type,object,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,14.80 kiB,14.80 kiB
Shape,"(1895,)","(1895,)"
Count,1 Tasks,1 Chunks
Type,object,numpy.ndarray
"Array Chunk Bytes 14.80 kiB 14.80 kiB Shape (1895,) (1895,) Count 1 Tasks 1 Chunks Type object numpy.ndarray",1895  1,

Unnamed: 0,Array,Chunk
Bytes,14.80 kiB,14.80 kiB
Shape,"(1895,)","(1895,)"
Count,1 Tasks,1 Chunks
Type,object,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,244.12 MiB,3.00 MiB
Shape,"(4571056, 7)","(65536, 6)"
Count,350 Tasks,140 Chunks
Type,object,numpy.ndarray
"Array Chunk Bytes 244.12 MiB 3.00 MiB Shape (4571056, 7) (65536, 6) Count 350 Tasks 140 Chunks Type object numpy.ndarray",7  4571056,

Unnamed: 0,Array,Chunk
Bytes,244.12 MiB,3.00 MiB
Shape,"(4571056, 7)","(65536, 6)"
Count,350 Tasks,140 Chunks
Type,object,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,4.36 MiB,64.00 kiB
Shape,"(4571056,)","(65536,)"
Count,70 Tasks,70 Chunks
Type,bool,numpy.ndarray
"Array Chunk Bytes 4.36 MiB 64.00 kiB Shape (4571056,) (65536,) Count 70 Tasks 70 Chunks Type bool numpy.ndarray",4571056  1,

Unnamed: 0,Array,Chunk
Bytes,4.36 MiB,64.00 kiB
Shape,"(4571056,)","(65536,)"
Count,70 Tasks,70 Chunks
Type,bool,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,4.36 MiB,64.00 kiB
Shape,"(4571056,)","(65536,)"
Count,70 Tasks,70 Chunks
Type,bool,numpy.ndarray
"Array Chunk Bytes 4.36 MiB 64.00 kiB Shape (4571056,) (65536,) Count 70 Tasks 70 Chunks Type bool numpy.ndarray",4571056  1,

Unnamed: 0,Array,Chunk
Bytes,4.36 MiB,64.00 kiB
Shape,"(4571056,)","(65536,)"
Count,70 Tasks,70 Chunks
Type,bool,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,17.44 MiB,256.00 kiB
Shape,"(4571056,)","(65536,)"
Count,70 Tasks,70 Chunks
Type,int32,numpy.ndarray
"Array Chunk Bytes 17.44 MiB 256.00 kiB Shape (4571056,) (65536,) Count 70 Tasks 70 Chunks Type int32 numpy.ndarray",4571056  1,

Unnamed: 0,Array,Chunk
Bytes,17.44 MiB,256.00 kiB
Shape,"(4571056,)","(65536,)"
Count,70 Tasks,70 Chunks
Type,int32,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,4.36 MiB,64.00 kiB
Shape,"(4571056,)","(65536,)"
Count,70 Tasks,70 Chunks
Type,bool,numpy.ndarray
"Array Chunk Bytes 4.36 MiB 64.00 kiB Shape (4571056,) (65536,) Count 70 Tasks 70 Chunks Type bool numpy.ndarray",4571056  1,

Unnamed: 0,Array,Chunk
Bytes,4.36 MiB,64.00 kiB
Shape,"(4571056,)","(65536,)"
Count,70 Tasks,70 Chunks
Type,bool,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,16.13 GiB,8.00 MiB
Shape,"(4571056, 1895, 2)","(65536, 64, 2)"
Count,2100 Tasks,2100 Chunks
Type,int8,numpy.ndarray
"Array Chunk Bytes 16.13 GiB 8.00 MiB Shape (4571056, 1895, 2) (65536, 64, 2) Count 2100 Tasks 2100 Chunks Type int8 numpy.ndarray",2  1895  4571056,

Unnamed: 0,Array,Chunk
Bytes,16.13 GiB,8.00 MiB
Shape,"(4571056, 1895, 2)","(65536, 64, 2)"
Count,2100 Tasks,2100 Chunks
Type,int8,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,112.94 GiB,56.00 MiB
Shape,"(4571056, 1895, 7)","(65536, 64, 7)"
Count,2100 Tasks,2100 Chunks
Type,int16,numpy.ndarray
"Array Chunk Bytes 112.94 GiB 56.00 MiB Shape (4571056, 1895, 7) (65536, 64, 7) Count 2100 Tasks 2100 Chunks Type int16 numpy.ndarray",7  1895  4571056,

Unnamed: 0,Array,Chunk
Bytes,112.94 GiB,56.00 MiB
Shape,"(4571056, 1895, 7)","(65536, 64, 7)"
Count,2100 Tasks,2100 Chunks
Type,int16,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,16.13 GiB,8.00 MiB
Shape,"(4571056, 1895)","(65536, 64)"
Count,2100 Tasks,2100 Chunks
Type,int16,numpy.ndarray
"Array Chunk Bytes 16.13 GiB 8.00 MiB Shape (4571056, 1895) (65536, 64) Count 2100 Tasks 2100 Chunks Type int16 numpy.ndarray",1895  4571056,

Unnamed: 0,Array,Chunk
Bytes,16.13 GiB,8.00 MiB
Shape,"(4571056, 1895)","(65536, 64)"
Count,2100 Tasks,2100 Chunks
Type,int16,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,8.07 GiB,4.00 MiB
Shape,"(4571056, 1895)","(65536, 64)"
Count,2100 Tasks,2100 Chunks
Type,int8,numpy.ndarray
"Array Chunk Bytes 8.07 GiB 4.00 MiB Shape (4571056, 1895) (65536, 64) Count 2100 Tasks 2100 Chunks Type int8 numpy.ndarray",1895  4571056,

Unnamed: 0,Array,Chunk
Bytes,8.07 GiB,4.00 MiB
Shape,"(4571056, 1895)","(65536, 64)"
Count,2100 Tasks,2100 Chunks
Type,int8,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,64.54 GiB,32.00 MiB
Shape,"(4571056, 1895)","(65536, 64)"
Count,2100 Tasks,2100 Chunks
Type,object,numpy.ndarray
"Array Chunk Bytes 64.54 GiB 32.00 MiB Shape (4571056, 1895) (65536, 64) Count 2100 Tasks 2100 Chunks Type object numpy.ndarray",1895  4571056,

Unnamed: 0,Array,Chunk
Bytes,64.54 GiB,32.00 MiB
Shape,"(4571056, 1895)","(65536, 64)"
Count,2100 Tasks,2100 Chunks
Type,object,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,64.54 GiB,32.00 MiB
Shape,"(4571056, 1895)","(65536, 64)"
Count,2100 Tasks,2100 Chunks
Type,object,numpy.ndarray
"Array Chunk Bytes 64.54 GiB 32.00 MiB Shape (4571056, 1895) (65536, 64) Count 2100 Tasks 2100 Chunks Type object numpy.ndarray",1895  4571056,

Unnamed: 0,Array,Chunk
Bytes,64.54 GiB,32.00 MiB
Shape,"(4571056, 1895)","(65536, 64)"
Count,2100 Tasks,2100 Chunks
Type,object,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,96.81 GiB,48.00 MiB
Shape,"(4571056, 1895, 3)","(65536, 64, 3)"
Count,2100 Tasks,2100 Chunks
Type,int32,numpy.ndarray
"Array Chunk Bytes 96.81 GiB 48.00 MiB Shape (4571056, 1895, 3) (65536, 64, 3) Count 2100 Tasks 2100 Chunks Type int32 numpy.ndarray",3  1895  4571056,

Unnamed: 0,Array,Chunk
Bytes,96.81 GiB,48.00 MiB
Shape,"(4571056, 1895, 3)","(65536, 64, 3)"
Count,2100 Tasks,2100 Chunks
Type,int32,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,104.62 MiB,1.50 MiB
Shape,"(4571056, 6)","(65536, 6)"
Count,70 Tasks,70 Chunks
Type,int32,numpy.ndarray
"Array Chunk Bytes 104.62 MiB 1.50 MiB Shape (4571056, 6) (65536, 6) Count 70 Tasks 70 Chunks Type int32 numpy.ndarray",6  4571056,

Unnamed: 0,Array,Chunk
Bytes,104.62 MiB,1.50 MiB
Shape,"(4571056, 6)","(65536, 6)"
Count,70 Tasks,70 Chunks
Type,int32,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,104.62 MiB,1.50 MiB
Shape,"(4571056, 6)","(65536, 6)"
Count,70 Tasks,70 Chunks
Type,float32,numpy.ndarray
"Array Chunk Bytes 104.62 MiB 1.50 MiB Shape (4571056, 6) (65536, 6) Count 70 Tasks 70 Chunks Type float32 numpy.ndarray",6  4571056,

Unnamed: 0,Array,Chunk
Bytes,104.62 MiB,1.50 MiB
Shape,"(4571056, 6)","(65536, 6)"
Count,70 Tasks,70 Chunks
Type,float32,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,17.44 MiB,256.00 kiB
Shape,"(4571056,)","(65536,)"
Count,70 Tasks,70 Chunks
Type,int32,numpy.ndarray
"Array Chunk Bytes 17.44 MiB 256.00 kiB Shape (4571056,) (65536,) Count 70 Tasks 70 Chunks Type int32 numpy.ndarray",4571056  1,

Unnamed: 0,Array,Chunk
Bytes,17.44 MiB,256.00 kiB
Shape,"(4571056,)","(65536,)"
Count,70 Tasks,70 Chunks
Type,int32,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,17.44 MiB,256.00 kiB
Shape,"(4571056,)","(65536,)"
Count,70 Tasks,70 Chunks
Type,int32,numpy.ndarray
"Array Chunk Bytes 17.44 MiB 256.00 kiB Shape (4571056,) (65536,) Count 70 Tasks 70 Chunks Type int32 numpy.ndarray",4571056  1,

Unnamed: 0,Array,Chunk
Bytes,17.44 MiB,256.00 kiB
Shape,"(4571056,)","(65536,)"
Count,70 Tasks,70 Chunks
Type,int32,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,4.36 MiB,64.00 kiB
Shape,"(4571056,)","(65536,)"
Count,70 Tasks,70 Chunks
Type,bool,numpy.ndarray
"Array Chunk Bytes 4.36 MiB 64.00 kiB Shape (4571056,) (65536,) Count 70 Tasks 70 Chunks Type bool numpy.ndarray",4571056  1,

Unnamed: 0,Array,Chunk
Bytes,4.36 MiB,64.00 kiB
Shape,"(4571056,)","(65536,)"
Count,70 Tasks,70 Chunks
Type,bool,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,4.36 MiB,64.00 kiB
Shape,"(4571056,)","(65536,)"
Count,70 Tasks,70 Chunks
Type,bool,numpy.ndarray
"Array Chunk Bytes 4.36 MiB 64.00 kiB Shape (4571056,) (65536,) Count 70 Tasks 70 Chunks Type bool numpy.ndarray",4571056  1,

Unnamed: 0,Array,Chunk
Bytes,4.36 MiB,64.00 kiB
Shape,"(4571056,)","(65536,)"
Count,70 Tasks,70 Chunks
Type,bool,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,4.36 MiB,64.00 kiB
Shape,"(4571056,)","(65536,)"
Count,70 Tasks,70 Chunks
Type,bool,numpy.ndarray
"Array Chunk Bytes 4.36 MiB 64.00 kiB Shape (4571056,) (65536,) Count 70 Tasks 70 Chunks Type bool numpy.ndarray",4571056  1,

Unnamed: 0,Array,Chunk
Bytes,4.36 MiB,64.00 kiB
Shape,"(4571056,)","(65536,)"
Count,70 Tasks,70 Chunks
Type,bool,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,4.36 MiB,64.00 kiB
Shape,"(4571056,)","(65536,)"
Count,70 Tasks,70 Chunks
Type,bool,numpy.ndarray
"Array Chunk Bytes 4.36 MiB 64.00 kiB Shape (4571056,) (65536,) Count 70 Tasks 70 Chunks Type bool numpy.ndarray",4571056  1,

Unnamed: 0,Array,Chunk
Bytes,4.36 MiB,64.00 kiB
Shape,"(4571056,)","(65536,)"
Count,70 Tasks,70 Chunks
Type,bool,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,4.36 MiB,64.00 kiB
Shape,"(4571056,)","(65536,)"
Count,70 Tasks,70 Chunks
Type,bool,numpy.ndarray
"Array Chunk Bytes 4.36 MiB 64.00 kiB Shape (4571056,) (65536,) Count 70 Tasks 70 Chunks Type bool numpy.ndarray",4571056  1,

Unnamed: 0,Array,Chunk
Bytes,4.36 MiB,64.00 kiB
Shape,"(4571056,)","(65536,)"
Count,70 Tasks,70 Chunks
Type,bool,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,4.36 MiB,64.00 kiB
Shape,"(4571056,)","(65536,)"
Count,70 Tasks,70 Chunks
Type,bool,numpy.ndarray
"Array Chunk Bytes 4.36 MiB 64.00 kiB Shape (4571056,) (65536,) Count 70 Tasks 70 Chunks Type bool numpy.ndarray",4571056  1,

Unnamed: 0,Array,Chunk
Bytes,4.36 MiB,64.00 kiB
Shape,"(4571056,)","(65536,)"
Count,70 Tasks,70 Chunks
Type,bool,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,4.36 MiB,64.00 kiB
Shape,"(4571056,)","(65536,)"
Count,70 Tasks,70 Chunks
Type,bool,numpy.ndarray
"Array Chunk Bytes 4.36 MiB 64.00 kiB Shape (4571056,) (65536,) Count 70 Tasks 70 Chunks Type bool numpy.ndarray",4571056  1,

Unnamed: 0,Array,Chunk
Bytes,4.36 MiB,64.00 kiB
Shape,"(4571056,)","(65536,)"
Count,70 Tasks,70 Chunks
Type,bool,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,17.44 MiB,256.00 kiB
Shape,"(4571056,)","(65536,)"
Count,70 Tasks,70 Chunks
Type,float32,numpy.ndarray
"Array Chunk Bytes 17.44 MiB 256.00 kiB Shape (4571056,) (65536,) Count 70 Tasks 70 Chunks Type float32 numpy.ndarray",4571056  1,

Unnamed: 0,Array,Chunk
Bytes,17.44 MiB,256.00 kiB
Shape,"(4571056,)","(65536,)"
Count,70 Tasks,70 Chunks
Type,float32,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,34.87 MiB,512.00 kiB
Shape,"(4571056,)","(65536,)"
Count,70 Tasks,70 Chunks
Type,object,numpy.ndarray
"Array Chunk Bytes 34.87 MiB 512.00 kiB Shape (4571056,) (65536,) Count 70 Tasks 70 Chunks Type object numpy.ndarray",4571056  1,

Unnamed: 0,Array,Chunk
Bytes,34.87 MiB,512.00 kiB
Shape,"(4571056,)","(65536,)"
Count,70 Tasks,70 Chunks
Type,object,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,17.44 MiB,256.00 kiB
Shape,"(4571056,)","(65536,)"
Count,70 Tasks,70 Chunks
Type,float32,numpy.ndarray
"Array Chunk Bytes 17.44 MiB 256.00 kiB Shape (4571056,) (65536,) Count 70 Tasks 70 Chunks Type float32 numpy.ndarray",4571056  1,

Unnamed: 0,Array,Chunk
Bytes,17.44 MiB,256.00 kiB
Shape,"(4571056,)","(65536,)"
Count,70 Tasks,70 Chunks
Type,float32,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,34.87 MiB,512.00 kiB
Shape,"(4571056,)","(65536,)"
Count,70 Tasks,70 Chunks
Type,object,numpy.ndarray
"Array Chunk Bytes 34.87 MiB 512.00 kiB Shape (4571056,) (65536,) Count 70 Tasks 70 Chunks Type object numpy.ndarray",4571056  1,

Unnamed: 0,Array,Chunk
Bytes,34.87 MiB,512.00 kiB
Shape,"(4571056,)","(65536,)"
Count,70 Tasks,70 Chunks
Type,object,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,17.44 MiB,256.00 kiB
Shape,"(4571056,)","(65536,)"
Count,70 Tasks,70 Chunks
Type,float32,numpy.ndarray
"Array Chunk Bytes 17.44 MiB 256.00 kiB Shape (4571056,) (65536,) Count 70 Tasks 70 Chunks Type float32 numpy.ndarray",4571056  1,

Unnamed: 0,Array,Chunk
Bytes,17.44 MiB,256.00 kiB
Shape,"(4571056,)","(65536,)"
Count,70 Tasks,70 Chunks
Type,float32,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,17.44 MiB,256.00 kiB
Shape,"(4571056,)","(65536,)"
Count,70 Tasks,70 Chunks
Type,float32,numpy.ndarray
"Array Chunk Bytes 17.44 MiB 256.00 kiB Shape (4571056,) (65536,) Count 70 Tasks 70 Chunks Type float32 numpy.ndarray",4571056  1,

Unnamed: 0,Array,Chunk
Bytes,17.44 MiB,256.00 kiB
Shape,"(4571056,)","(65536,)"
Count,70 Tasks,70 Chunks
Type,float32,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,34.87 MiB,512.00 kiB
Shape,"(4571056,)","(65536,)"
Count,70 Tasks,70 Chunks
Type,object,numpy.ndarray
"Array Chunk Bytes 34.87 MiB 512.00 kiB Shape (4571056,) (65536,) Count 70 Tasks 70 Chunks Type object numpy.ndarray",4571056  1,

Unnamed: 0,Array,Chunk
Bytes,34.87 MiB,512.00 kiB
Shape,"(4571056,)","(65536,)"
Count,70 Tasks,70 Chunks
Type,object,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,34.87 MiB,512.00 kiB
Shape,"(4571056,)","(65536,)"
Count,70 Tasks,70 Chunks
Type,object,numpy.ndarray
"Array Chunk Bytes 34.87 MiB 512.00 kiB Shape (4571056,) (65536,) Count 70 Tasks 70 Chunks Type object numpy.ndarray",4571056  1,

Unnamed: 0,Array,Chunk
Bytes,34.87 MiB,512.00 kiB
Shape,"(4571056,)","(65536,)"
Count,70 Tasks,70 Chunks
Type,object,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,34.87 MiB,512.00 kiB
Shape,"(4571056,)","(65536,)"
Count,70 Tasks,70 Chunks
Type,object,numpy.ndarray
"Array Chunk Bytes 34.87 MiB 512.00 kiB Shape (4571056,) (65536,) Count 70 Tasks 70 Chunks Type object numpy.ndarray",4571056  1,

Unnamed: 0,Array,Chunk
Bytes,34.87 MiB,512.00 kiB
Shape,"(4571056,)","(65536,)"
Count,70 Tasks,70 Chunks
Type,object,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,34.87 MiB,512.00 kiB
Shape,"(4571056,)","(65536,)"
Count,70 Tasks,70 Chunks
Type,object,numpy.ndarray
"Array Chunk Bytes 34.87 MiB 512.00 kiB Shape (4571056,) (65536,) Count 70 Tasks 70 Chunks Type object numpy.ndarray",4571056  1,

Unnamed: 0,Array,Chunk
Bytes,34.87 MiB,512.00 kiB
Shape,"(4571056,)","(65536,)"
Count,70 Tasks,70 Chunks
Type,object,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,34.87 MiB,512.00 kiB
Shape,"(4571056,)","(65536,)"
Count,70 Tasks,70 Chunks
Type,object,numpy.ndarray
"Array Chunk Bytes 34.87 MiB 512.00 kiB Shape (4571056,) (65536,) Count 70 Tasks 70 Chunks Type object numpy.ndarray",4571056  1,

Unnamed: 0,Array,Chunk
Bytes,34.87 MiB,512.00 kiB
Shape,"(4571056,)","(65536,)"
Count,70 Tasks,70 Chunks
Type,object,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,34.87 MiB,512.00 kiB
Shape,"(4571056,)","(65536,)"
Count,70 Tasks,70 Chunks
Type,object,numpy.ndarray
"Array Chunk Bytes 34.87 MiB 512.00 kiB Shape (4571056,) (65536,) Count 70 Tasks 70 Chunks Type object numpy.ndarray",4571056  1,

Unnamed: 0,Array,Chunk
Bytes,34.87 MiB,512.00 kiB
Shape,"(4571056,)","(65536,)"
Count,70 Tasks,70 Chunks
Type,object,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,34.87 MiB,512.00 kiB
Shape,"(4571056,)","(65536,)"
Count,70 Tasks,70 Chunks
Type,object,numpy.ndarray
"Array Chunk Bytes 34.87 MiB 512.00 kiB Shape (4571056,) (65536,) Count 70 Tasks 70 Chunks Type object numpy.ndarray",4571056  1,

Unnamed: 0,Array,Chunk
Bytes,34.87 MiB,512.00 kiB
Shape,"(4571056,)","(65536,)"
Count,70 Tasks,70 Chunks
Type,object,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,34.87 MiB,512.00 kiB
Shape,"(4571056,)","(65536,)"
Count,70 Tasks,70 Chunks
Type,object,numpy.ndarray
"Array Chunk Bytes 34.87 MiB 512.00 kiB Shape (4571056,) (65536,) Count 70 Tasks 70 Chunks Type object numpy.ndarray",4571056  1,

Unnamed: 0,Array,Chunk
Bytes,34.87 MiB,512.00 kiB
Shape,"(4571056,)","(65536,)"
Count,70 Tasks,70 Chunks
Type,object,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,34.87 MiB,512.00 kiB
Shape,"(4571056,)","(65536,)"
Count,70 Tasks,70 Chunks
Type,object,numpy.ndarray
"Array Chunk Bytes 34.87 MiB 512.00 kiB Shape (4571056,) (65536,) Count 70 Tasks 70 Chunks Type object numpy.ndarray",4571056  1,

Unnamed: 0,Array,Chunk
Bytes,34.87 MiB,512.00 kiB
Shape,"(4571056,)","(65536,)"
Count,70 Tasks,70 Chunks
Type,object,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,17.44 MiB,256.00 kiB
Shape,"(4571056,)","(65536,)"
Count,70 Tasks,70 Chunks
Type,float32,numpy.ndarray
"Array Chunk Bytes 17.44 MiB 256.00 kiB Shape (4571056,) (65536,) Count 70 Tasks 70 Chunks Type float32 numpy.ndarray",4571056  1,

Unnamed: 0,Array,Chunk
Bytes,17.44 MiB,256.00 kiB
Shape,"(4571056,)","(65536,)"
Count,70 Tasks,70 Chunks
Type,float32,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,17.44 MiB,256.00 kiB
Shape,"(4571056,)","(65536,)"
Count,70 Tasks,70 Chunks
Type,float32,numpy.ndarray
"Array Chunk Bytes 17.44 MiB 256.00 kiB Shape (4571056,) (65536,) Count 70 Tasks 70 Chunks Type float32 numpy.ndarray",4571056  1,

Unnamed: 0,Array,Chunk
Bytes,17.44 MiB,256.00 kiB
Shape,"(4571056,)","(65536,)"
Count,70 Tasks,70 Chunks
Type,float32,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,34.87 MiB,512.00 kiB
Shape,"(4571056,)","(65536,)"
Count,70 Tasks,70 Chunks
Type,object,numpy.ndarray
"Array Chunk Bytes 34.87 MiB 512.00 kiB Shape (4571056,) (65536,) Count 70 Tasks 70 Chunks Type object numpy.ndarray",4571056  1,

Unnamed: 0,Array,Chunk
Bytes,34.87 MiB,512.00 kiB
Shape,"(4571056,)","(65536,)"
Count,70 Tasks,70 Chunks
Type,object,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,104.62 MiB,1.50 MiB
Shape,"(4571056, 6)","(65536, 6)"
Count,70 Tasks,70 Chunks
Type,int32,numpy.ndarray
"Array Chunk Bytes 104.62 MiB 1.50 MiB Shape (4571056, 6) (65536, 6) Count 70 Tasks 70 Chunks Type int32 numpy.ndarray",6  4571056,

Unnamed: 0,Array,Chunk
Bytes,104.62 MiB,1.50 MiB
Shape,"(4571056, 6)","(65536, 6)"
Count,70 Tasks,70 Chunks
Type,int32,numpy.ndarray


In [9]:
# Restrict the variant dataset to the samples kept by the '% callable' filter.
# NOTE(review): the boolean mask is built from the metadata table, so this assumes
# the metadata rows are in the same order as the dataset's `samples` dimension — confirm.
variant_dataset_filtered = variant_dataset.isel(samples=loc_filtered_samples)

In [10]:
# Build one boolean mask per variant-level criterion, then combine them:
# a variant is kept only if it passes the filter flag, is a SNP, and is flagged CDS.
passes_variant_filter = variant_dataset_filtered["variant_filter_pass"].data
is_snp = variant_dataset_filtered["variant_is_snp"].data
is_cds = variant_dataset_filtered["variant_CDS"].data
filters = passes_variant_filter & is_snp & is_cds

# Apply the combined mask along the variants dimension.
variant_dataset_filtered = variant_dataset_filtered.isel(variants=filters)

In [11]:
# Keep only variants with exactly one alternate allele (bi-allelic), then show the result.
variant_numalt = variant_dataset_filtered["variant_numalt"]
biallelic_filter = (variant_numalt == 1).data
variant_dataset_filtered = variant_dataset_filtered.isel(variants=biallelic_filter)
variant_dataset_filtered

Unnamed: 0,Array,Chunk
Bytes,1.68 MiB,62.07 kiB
Shape,"(440222,)","(15889,)"
Count,152 Tasks,41 Chunks
Type,int32,numpy.ndarray
"Array Chunk Bytes 1.68 MiB 62.07 kiB Shape (440222,) (15889,) Count 152 Tasks 41 Chunks Type int32 numpy.ndarray",440222  1,

Unnamed: 0,Array,Chunk
Bytes,1.68 MiB,62.07 kiB
Shape,"(440222,)","(15889,)"
Count,152 Tasks,41 Chunks
Type,int32,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,3.36 MiB,124.13 kiB
Shape,"(440222,)","(15889,)"
Count,152 Tasks,41 Chunks
Type,object,numpy.ndarray
"Array Chunk Bytes 3.36 MiB 124.13 kiB Shape (440222,) (15889,) Count 152 Tasks 41 Chunks Type object numpy.ndarray",440222  1,

Unnamed: 0,Array,Chunk
Bytes,3.36 MiB,124.13 kiB
Shape,"(440222,)","(15889,)"
Count,152 Tasks,41 Chunks
Type,object,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,9.73 kiB,9.73 kiB
Shape,"(1245,)","(1245,)"
Count,2 Tasks,1 Chunks
Type,object,numpy.ndarray
"Array Chunk Bytes 9.73 kiB 9.73 kiB Shape (1245,) (1245,) Count 2 Tasks 1 Chunks Type object numpy.ndarray",1245  1,

Unnamed: 0,Array,Chunk
Bytes,9.73 kiB,9.73 kiB
Shape,"(1245,)","(1245,)"
Count,2 Tasks,1 Chunks
Type,object,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,23.51 MiB,744.80 kiB
Shape,"(440222, 7)","(15889, 6)"
Count,514 Tasks,82 Chunks
Type,object,numpy.ndarray
"Array Chunk Bytes 23.51 MiB 744.80 kiB Shape (440222, 7) (15889, 6) Count 514 Tasks 82 Chunks Type object numpy.ndarray",7  440222,

Unnamed: 0,Array,Chunk
Bytes,23.51 MiB,744.80 kiB
Shape,"(440222, 7)","(15889, 6)"
Count,514 Tasks,82 Chunks
Type,object,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,429.90 kiB,15.52 kiB
Shape,"(440222,)","(15889,)"
Count,152 Tasks,41 Chunks
Type,bool,numpy.ndarray
"Array Chunk Bytes 429.90 kiB 15.52 kiB Shape (440222,) (15889,) Count 152 Tasks 41 Chunks Type bool numpy.ndarray",440222  1,

Unnamed: 0,Array,Chunk
Bytes,429.90 kiB,15.52 kiB
Shape,"(440222,)","(15889,)"
Count,152 Tasks,41 Chunks
Type,bool,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,429.90 kiB,15.52 kiB
Shape,"(440222,)","(15889,)"
Count,152 Tasks,41 Chunks
Type,bool,numpy.ndarray
"Array Chunk Bytes 429.90 kiB 15.52 kiB Shape (440222,) (15889,) Count 152 Tasks 41 Chunks Type bool numpy.ndarray",440222  1,

Unnamed: 0,Array,Chunk
Bytes,429.90 kiB,15.52 kiB
Shape,"(440222,)","(15889,)"
Count,152 Tasks,41 Chunks
Type,bool,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,1.68 MiB,62.07 kiB
Shape,"(440222,)","(15889,)"
Count,152 Tasks,41 Chunks
Type,int32,numpy.ndarray
"Array Chunk Bytes 1.68 MiB 62.07 kiB Shape (440222,) (15889,) Count 152 Tasks 41 Chunks Type int32 numpy.ndarray",440222  1,

Unnamed: 0,Array,Chunk
Bytes,1.68 MiB,62.07 kiB
Shape,"(440222,)","(15889,)"
Count,152 Tasks,41 Chunks
Type,int32,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,429.90 kiB,15.52 kiB
Shape,"(440222,)","(15889,)"
Count,152 Tasks,41 Chunks
Type,bool,numpy.ndarray
"Array Chunk Bytes 429.90 kiB 15.52 kiB Shape (440222,) (15889,) Count 152 Tasks 41 Chunks Type bool numpy.ndarray",440222  1,

Unnamed: 0,Array,Chunk
Bytes,429.90 kiB,15.52 kiB
Shape,"(440222,)","(15889,)"
Count,152 Tasks,41 Chunks
Type,bool,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,1.02 GiB,1.94 MiB
Shape,"(440222, 1245, 2)","(15889, 64, 2)"
Count,6660 Tasks,1230 Chunks
Type,int8,numpy.ndarray
"Array Chunk Bytes 1.02 GiB 1.94 MiB Shape (440222, 1245, 2) (15889, 64, 2) Count 6660 Tasks 1230 Chunks Type int8 numpy.ndarray",2  1245  440222,

Unnamed: 0,Array,Chunk
Bytes,1.02 GiB,1.94 MiB
Shape,"(440222, 1245, 2)","(15889, 64, 2)"
Count,6660 Tasks,1230 Chunks
Type,int8,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,7.15 GiB,13.58 MiB
Shape,"(440222, 1245, 7)","(15889, 64, 7)"
Count,6660 Tasks,1230 Chunks
Type,int16,numpy.ndarray
"Array Chunk Bytes 7.15 GiB 13.58 MiB Shape (440222, 1245, 7) (15889, 64, 7) Count 6660 Tasks 1230 Chunks Type int16 numpy.ndarray",7  1245  440222,

Unnamed: 0,Array,Chunk
Bytes,7.15 GiB,13.58 MiB
Shape,"(440222, 1245, 7)","(15889, 64, 7)"
Count,6660 Tasks,1230 Chunks
Type,int16,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,1.02 GiB,1.94 MiB
Shape,"(440222, 1245)","(15889, 64)"
Count,6660 Tasks,1230 Chunks
Type,int16,numpy.ndarray
"Array Chunk Bytes 1.02 GiB 1.94 MiB Shape (440222, 1245) (15889, 64) Count 6660 Tasks 1230 Chunks Type int16 numpy.ndarray",1245  440222,

Unnamed: 0,Array,Chunk
Bytes,1.02 GiB,1.94 MiB
Shape,"(440222, 1245)","(15889, 64)"
Count,6660 Tasks,1230 Chunks
Type,int16,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,522.69 MiB,0.97 MiB
Shape,"(440222, 1245)","(15889, 64)"
Count,6660 Tasks,1230 Chunks
Type,int8,numpy.ndarray
"Array Chunk Bytes 522.69 MiB 0.97 MiB Shape (440222, 1245) (15889, 64) Count 6660 Tasks 1230 Chunks Type int8 numpy.ndarray",1245  440222,

Unnamed: 0,Array,Chunk
Bytes,522.69 MiB,0.97 MiB
Shape,"(440222, 1245)","(15889, 64)"
Count,6660 Tasks,1230 Chunks
Type,int8,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,4.08 GiB,7.76 MiB
Shape,"(440222, 1245)","(15889, 64)"
Count,6660 Tasks,1230 Chunks
Type,object,numpy.ndarray
"Array Chunk Bytes 4.08 GiB 7.76 MiB Shape (440222, 1245) (15889, 64) Count 6660 Tasks 1230 Chunks Type object numpy.ndarray",1245  440222,

Unnamed: 0,Array,Chunk
Bytes,4.08 GiB,7.76 MiB
Shape,"(440222, 1245)","(15889, 64)"
Count,6660 Tasks,1230 Chunks
Type,object,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,4.08 GiB,7.76 MiB
Shape,"(440222, 1245)","(15889, 64)"
Count,6660 Tasks,1230 Chunks
Type,object,numpy.ndarray
"Array Chunk Bytes 4.08 GiB 7.76 MiB Shape (440222, 1245) (15889, 64) Count 6660 Tasks 1230 Chunks Type object numpy.ndarray",1245  440222,

Unnamed: 0,Array,Chunk
Bytes,4.08 GiB,7.76 MiB
Shape,"(440222, 1245)","(15889, 64)"
Count,6660 Tasks,1230 Chunks
Type,object,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,6.13 GiB,11.64 MiB
Shape,"(440222, 1245, 3)","(15889, 64, 3)"
Count,6660 Tasks,1230 Chunks
Type,int32,numpy.ndarray
"Array Chunk Bytes 6.13 GiB 11.64 MiB Shape (440222, 1245, 3) (15889, 64, 3) Count 6660 Tasks 1230 Chunks Type int32 numpy.ndarray",3  1245  440222,

Unnamed: 0,Array,Chunk
Bytes,6.13 GiB,11.64 MiB
Shape,"(440222, 1245, 3)","(15889, 64, 3)"
Count,6660 Tasks,1230 Chunks
Type,int32,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,10.08 MiB,372.40 kiB
Shape,"(440222, 6)","(15889, 6)"
Count,152 Tasks,41 Chunks
Type,int32,numpy.ndarray
"Array Chunk Bytes 10.08 MiB 372.40 kiB Shape (440222, 6) (15889, 6) Count 152 Tasks 41 Chunks Type int32 numpy.ndarray",6  440222,

Unnamed: 0,Array,Chunk
Bytes,10.08 MiB,372.40 kiB
Shape,"(440222, 6)","(15889, 6)"
Count,152 Tasks,41 Chunks
Type,int32,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,10.08 MiB,372.40 kiB
Shape,"(440222, 6)","(15889, 6)"
Count,152 Tasks,41 Chunks
Type,float32,numpy.ndarray
"Array Chunk Bytes 10.08 MiB 372.40 kiB Shape (440222, 6) (15889, 6) Count 152 Tasks 41 Chunks Type float32 numpy.ndarray",6  440222,

Unnamed: 0,Array,Chunk
Bytes,10.08 MiB,372.40 kiB
Shape,"(440222, 6)","(15889, 6)"
Count,152 Tasks,41 Chunks
Type,float32,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,1.68 MiB,62.07 kiB
Shape,"(440222,)","(15889,)"
Count,152 Tasks,41 Chunks
Type,int32,numpy.ndarray
"Array Chunk Bytes 1.68 MiB 62.07 kiB Shape (440222,) (15889,) Count 152 Tasks 41 Chunks Type int32 numpy.ndarray",440222  1,

Unnamed: 0,Array,Chunk
Bytes,1.68 MiB,62.07 kiB
Shape,"(440222,)","(15889,)"
Count,152 Tasks,41 Chunks
Type,int32,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,1.68 MiB,62.07 kiB
Shape,"(440222,)","(15889,)"
Count,152 Tasks,41 Chunks
Type,int32,numpy.ndarray
"Array Chunk Bytes 1.68 MiB 62.07 kiB Shape (440222,) (15889,) Count 152 Tasks 41 Chunks Type int32 numpy.ndarray",440222  1,

Unnamed: 0,Array,Chunk
Bytes,1.68 MiB,62.07 kiB
Shape,"(440222,)","(15889,)"
Count,152 Tasks,41 Chunks
Type,int32,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,429.90 kiB,15.52 kiB
Shape,"(440222,)","(15889,)"
Count,152 Tasks,41 Chunks
Type,bool,numpy.ndarray
"Array Chunk Bytes 429.90 kiB 15.52 kiB Shape (440222,) (15889,) Count 152 Tasks 41 Chunks Type bool numpy.ndarray",440222  1,

Unnamed: 0,Array,Chunk
Bytes,429.90 kiB,15.52 kiB
Shape,"(440222,)","(15889,)"
Count,152 Tasks,41 Chunks
Type,bool,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,429.90 kiB,15.52 kiB
Shape,"(440222,)","(15889,)"
Count,152 Tasks,41 Chunks
Type,bool,numpy.ndarray
"Array Chunk Bytes 429.90 kiB 15.52 kiB Shape (440222,) (15889,) Count 152 Tasks 41 Chunks Type bool numpy.ndarray",440222  1,

Unnamed: 0,Array,Chunk
Bytes,429.90 kiB,15.52 kiB
Shape,"(440222,)","(15889,)"
Count,152 Tasks,41 Chunks
Type,bool,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,429.90 kiB,15.52 kiB
Shape,"(440222,)","(15889,)"
Count,152 Tasks,41 Chunks
Type,bool,numpy.ndarray
"Array Chunk Bytes 429.90 kiB 15.52 kiB Shape (440222,) (15889,) Count 152 Tasks 41 Chunks Type bool numpy.ndarray",440222  1,

Unnamed: 0,Array,Chunk
Bytes,429.90 kiB,15.52 kiB
Shape,"(440222,)","(15889,)"
Count,152 Tasks,41 Chunks
Type,bool,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,429.90 kiB,15.52 kiB
Shape,"(440222,)","(15889,)"
Count,152 Tasks,41 Chunks
Type,bool,numpy.ndarray
"Array Chunk Bytes 429.90 kiB 15.52 kiB Shape (440222,) (15889,) Count 152 Tasks 41 Chunks Type bool numpy.ndarray",440222  1,

Unnamed: 0,Array,Chunk
Bytes,429.90 kiB,15.52 kiB
Shape,"(440222,)","(15889,)"
Count,152 Tasks,41 Chunks
Type,bool,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,429.90 kiB,15.52 kiB
Shape,"(440222,)","(15889,)"
Count,152 Tasks,41 Chunks
Type,bool,numpy.ndarray
"Array Chunk Bytes 429.90 kiB 15.52 kiB Shape (440222,) (15889,) Count 152 Tasks 41 Chunks Type bool numpy.ndarray",440222  1,

Unnamed: 0,Array,Chunk
Bytes,429.90 kiB,15.52 kiB
Shape,"(440222,)","(15889,)"
Count,152 Tasks,41 Chunks
Type,bool,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,429.90 kiB,15.52 kiB
Shape,"(440222,)","(15889,)"
Count,152 Tasks,41 Chunks
Type,bool,numpy.ndarray
"Array Chunk Bytes 429.90 kiB 15.52 kiB Shape (440222,) (15889,) Count 152 Tasks 41 Chunks Type bool numpy.ndarray",440222  1,

Unnamed: 0,Array,Chunk
Bytes,429.90 kiB,15.52 kiB
Shape,"(440222,)","(15889,)"
Count,152 Tasks,41 Chunks
Type,bool,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,429.90 kiB,15.52 kiB
Shape,"(440222,)","(15889,)"
Count,152 Tasks,41 Chunks
Type,bool,numpy.ndarray
"Array Chunk Bytes 429.90 kiB 15.52 kiB Shape (440222,) (15889,) Count 152 Tasks 41 Chunks Type bool numpy.ndarray",440222  1,

Unnamed: 0,Array,Chunk
Bytes,429.90 kiB,15.52 kiB
Shape,"(440222,)","(15889,)"
Count,152 Tasks,41 Chunks
Type,bool,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,1.68 MiB,62.07 kiB
Shape,"(440222,)","(15889,)"
Count,152 Tasks,41 Chunks
Type,float32,numpy.ndarray
"Array Chunk Bytes 1.68 MiB 62.07 kiB Shape (440222,) (15889,) Count 152 Tasks 41 Chunks Type float32 numpy.ndarray",440222  1,

Unnamed: 0,Array,Chunk
Bytes,1.68 MiB,62.07 kiB
Shape,"(440222,)","(15889,)"
Count,152 Tasks,41 Chunks
Type,float32,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,3.36 MiB,124.13 kiB
Shape,"(440222,)","(15889,)"
Count,152 Tasks,41 Chunks
Type,object,numpy.ndarray
"Array Chunk Bytes 3.36 MiB 124.13 kiB Shape (440222,) (15889,) Count 152 Tasks 41 Chunks Type object numpy.ndarray",440222  1,

Unnamed: 0,Array,Chunk
Bytes,3.36 MiB,124.13 kiB
Shape,"(440222,)","(15889,)"
Count,152 Tasks,41 Chunks
Type,object,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,1.68 MiB,62.07 kiB
Shape,"(440222,)","(15889,)"
Count,152 Tasks,41 Chunks
Type,float32,numpy.ndarray
"Array Chunk Bytes 1.68 MiB 62.07 kiB Shape (440222,) (15889,) Count 152 Tasks 41 Chunks Type float32 numpy.ndarray",440222  1,

Unnamed: 0,Array,Chunk
Bytes,1.68 MiB,62.07 kiB
Shape,"(440222,)","(15889,)"
Count,152 Tasks,41 Chunks
Type,float32,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,3.36 MiB,124.13 kiB
Shape,"(440222,)","(15889,)"
Count,152 Tasks,41 Chunks
Type,object,numpy.ndarray
"Array Chunk Bytes 3.36 MiB 124.13 kiB Shape (440222,) (15889,) Count 152 Tasks 41 Chunks Type object numpy.ndarray",440222  1,

Unnamed: 0,Array,Chunk
Bytes,3.36 MiB,124.13 kiB
Shape,"(440222,)","(15889,)"
Count,152 Tasks,41 Chunks
Type,object,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,1.68 MiB,62.07 kiB
Shape,"(440222,)","(15889,)"
Count,152 Tasks,41 Chunks
Type,float32,numpy.ndarray
"Array Chunk Bytes 1.68 MiB 62.07 kiB Shape (440222,) (15889,) Count 152 Tasks 41 Chunks Type float32 numpy.ndarray",440222  1,

Unnamed: 0,Array,Chunk
Bytes,1.68 MiB,62.07 kiB
Shape,"(440222,)","(15889,)"
Count,152 Tasks,41 Chunks
Type,float32,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,1.68 MiB,62.07 kiB
Shape,"(440222,)","(15889,)"
Count,152 Tasks,41 Chunks
Type,float32,numpy.ndarray
"Array Chunk Bytes 1.68 MiB 62.07 kiB Shape (440222,) (15889,) Count 152 Tasks 41 Chunks Type float32 numpy.ndarray",440222  1,

Unnamed: 0,Array,Chunk
Bytes,1.68 MiB,62.07 kiB
Shape,"(440222,)","(15889,)"
Count,152 Tasks,41 Chunks
Type,float32,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,3.36 MiB,124.13 kiB
Shape,"(440222,)","(15889,)"
Count,152 Tasks,41 Chunks
Type,object,numpy.ndarray
"Array Chunk Bytes 3.36 MiB 124.13 kiB Shape (440222,) (15889,) Count 152 Tasks 41 Chunks Type object numpy.ndarray",440222  1,

Unnamed: 0,Array,Chunk
Bytes,3.36 MiB,124.13 kiB
Shape,"(440222,)","(15889,)"
Count,152 Tasks,41 Chunks
Type,object,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,3.36 MiB,124.13 kiB
Shape,"(440222,)","(15889,)"
Count,152 Tasks,41 Chunks
Type,object,numpy.ndarray
"Array Chunk Bytes 3.36 MiB 124.13 kiB Shape (440222,) (15889,) Count 152 Tasks 41 Chunks Type object numpy.ndarray",440222  1,

Unnamed: 0,Array,Chunk
Bytes,3.36 MiB,124.13 kiB
Shape,"(440222,)","(15889,)"
Count,152 Tasks,41 Chunks
Type,object,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,3.36 MiB,124.13 kiB
Shape,"(440222,)","(15889,)"
Count,152 Tasks,41 Chunks
Type,object,numpy.ndarray
"Array Chunk Bytes 3.36 MiB 124.13 kiB Shape (440222,) (15889,) Count 152 Tasks 41 Chunks Type object numpy.ndarray",440222  1,

Unnamed: 0,Array,Chunk
Bytes,3.36 MiB,124.13 kiB
Shape,"(440222,)","(15889,)"
Count,152 Tasks,41 Chunks
Type,object,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,3.36 MiB,124.13 kiB
Shape,"(440222,)","(15889,)"
Count,152 Tasks,41 Chunks
Type,object,numpy.ndarray
"Array Chunk Bytes 3.36 MiB 124.13 kiB Shape (440222,) (15889,) Count 152 Tasks 41 Chunks Type object numpy.ndarray",440222  1,

Unnamed: 0,Array,Chunk
Bytes,3.36 MiB,124.13 kiB
Shape,"(440222,)","(15889,)"
Count,152 Tasks,41 Chunks
Type,object,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,3.36 MiB,124.13 kiB
Shape,"(440222,)","(15889,)"
Count,152 Tasks,41 Chunks
Type,object,numpy.ndarray
"Array Chunk Bytes 3.36 MiB 124.13 kiB Shape (440222,) (15889,) Count 152 Tasks 41 Chunks Type object numpy.ndarray",440222  1,

Unnamed: 0,Array,Chunk
Bytes,3.36 MiB,124.13 kiB
Shape,"(440222,)","(15889,)"
Count,152 Tasks,41 Chunks
Type,object,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,3.36 MiB,124.13 kiB
Shape,"(440222,)","(15889,)"
Count,152 Tasks,41 Chunks
Type,object,numpy.ndarray
"Array Chunk Bytes 3.36 MiB 124.13 kiB Shape (440222,) (15889,) Count 152 Tasks 41 Chunks Type object numpy.ndarray",440222  1,

Unnamed: 0,Array,Chunk
Bytes,3.36 MiB,124.13 kiB
Shape,"(440222,)","(15889,)"
Count,152 Tasks,41 Chunks
Type,object,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,3.36 MiB,124.13 kiB
Shape,"(440222,)","(15889,)"
Count,152 Tasks,41 Chunks
Type,object,numpy.ndarray
"Array Chunk Bytes 3.36 MiB 124.13 kiB Shape (440222,) (15889,) Count 152 Tasks 41 Chunks Type object numpy.ndarray",440222  1,

Unnamed: 0,Array,Chunk
Bytes,3.36 MiB,124.13 kiB
Shape,"(440222,)","(15889,)"
Count,152 Tasks,41 Chunks
Type,object,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,3.36 MiB,124.13 kiB
Shape,"(440222,)","(15889,)"
Count,152 Tasks,41 Chunks
Type,object,numpy.ndarray
"Array Chunk Bytes 3.36 MiB 124.13 kiB Shape (440222,) (15889,) Count 152 Tasks 41 Chunks Type object numpy.ndarray",440222  1,

Unnamed: 0,Array,Chunk
Bytes,3.36 MiB,124.13 kiB
Shape,"(440222,)","(15889,)"
Count,152 Tasks,41 Chunks
Type,object,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,3.36 MiB,124.13 kiB
Shape,"(440222,)","(15889,)"
Count,152 Tasks,41 Chunks
Type,object,numpy.ndarray
"Array Chunk Bytes 3.36 MiB 124.13 kiB Shape (440222,) (15889,) Count 152 Tasks 41 Chunks Type object numpy.ndarray",440222  1,

Unnamed: 0,Array,Chunk
Bytes,3.36 MiB,124.13 kiB
Shape,"(440222,)","(15889,)"
Count,152 Tasks,41 Chunks
Type,object,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,1.68 MiB,62.07 kiB
Shape,"(440222,)","(15889,)"
Count,152 Tasks,41 Chunks
Type,float32,numpy.ndarray
"Array Chunk Bytes 1.68 MiB 62.07 kiB Shape (440222,) (15889,) Count 152 Tasks 41 Chunks Type float32 numpy.ndarray",440222  1,

Unnamed: 0,Array,Chunk
Bytes,1.68 MiB,62.07 kiB
Shape,"(440222,)","(15889,)"
Count,152 Tasks,41 Chunks
Type,float32,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,1.68 MiB,62.07 kiB
Shape,"(440222,)","(15889,)"
Count,152 Tasks,41 Chunks
Type,float32,numpy.ndarray
"Array Chunk Bytes 1.68 MiB 62.07 kiB Shape (440222,) (15889,) Count 152 Tasks 41 Chunks Type float32 numpy.ndarray",440222  1,

Unnamed: 0,Array,Chunk
Bytes,1.68 MiB,62.07 kiB
Shape,"(440222,)","(15889,)"
Count,152 Tasks,41 Chunks
Type,float32,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,3.36 MiB,124.13 kiB
Shape,"(440222,)","(15889,)"
Count,152 Tasks,41 Chunks
Type,object,numpy.ndarray
"Array Chunk Bytes 3.36 MiB 124.13 kiB Shape (440222,) (15889,) Count 152 Tasks 41 Chunks Type object numpy.ndarray",440222  1,

Unnamed: 0,Array,Chunk
Bytes,3.36 MiB,124.13 kiB
Shape,"(440222,)","(15889,)"
Count,152 Tasks,41 Chunks
Type,object,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,10.08 MiB,372.40 kiB
Shape,"(440222, 6)","(15889, 6)"
Count,152 Tasks,41 Chunks
Type,int32,numpy.ndarray
"Array Chunk Bytes 10.08 MiB 372.40 kiB Shape (440222, 6) (15889, 6) Count 152 Tasks 41 Chunks Type int32 numpy.ndarray",6  440222,

Unnamed: 0,Array,Chunk
Bytes,10.08 MiB,372.40 kiB
Shape,"(440222, 6)","(15889, 6)"
Count,152 Tasks,41 Chunks
Type,int32,numpy.ndarray


In [12]:
def fgt_525(ad):
    """Turn per-call read counts into the fraction of reads on allele 1.

    Rules applied, in this order:

    * the fraction is ``ad[:, :, 1] / (ad[:, :, 0] + ad[:, :, 1])``;
    * calls with a total depth below 5 are set to NaN;
    * calls where either allele has at most 1 read, or whose fraction is
      below 0.05 or above 0.95, are rounded to the nearest integer
      (i.e. collapsed to 0.0 or 1.0); NaN entries stay NaN;
    * all remaining calls keep their fractional value.

    Parameters
    ----------
    ad : array-like, indexed as ``ad[:, :, 0]`` and ``ad[:, :, 1]``
        Read counts for the two alleles of each call.
        (Presumably variants x samples x alleles -- confirm against caller.)

    Returns
    -------
    numpy.ndarray
        Float array with the shape of ``ad[:, :, 0]``.
    """
    ad = np.asarray(ad)
    # Do the arithmetic in float64: callers pass unsigned 16-bit counts, and
    # adding two of those wraps around once the total depth exceeds 65535,
    # which used to produce fractions greater than 1.
    reads_0 = ad[:, :, 0].astype('f8')
    reads_1 = ad[:, :, 1].astype('f8')
    depth = reads_0 + reads_1

    # Zero-depth calls give 0/0 = NaN, which is the wanted result here, so the
    # accompanying RuntimeWarnings are silenced instead of flooding the output.
    with np.errstate(divide='ignore', invalid='ignore'):
        fgt = reads_1 / depth
        fgt[depth < 5] = np.nan
        homs = (reads_0 <= 1) | (reads_1 <= 1) | (fgt < 0.05) | (fgt > 0.95)
        fgt[homs] = np.round(fgt[homs])
    return fgt
        
def sam_het(bb, snp_bins=None, call_bins=None, call_freqs=None):
    """Sample heterozygosity for one frequency bin.

    Sums ``2*f*(1-f)`` over the sample's calls that fall in bin ``bb`` and
    divides by the number of SNPs the population has in that bin.

    Parameters
    ----------
    bb : hashable
        Bin label; used as a key into ``snp_bins`` and compared against
        ``call_bins``.
    snp_bins : mapping, optional
        Bin label -> SNP selection for that bin, either the tuple returned by
        ``np.where`` / an index array, or a boolean mask. Defaults to the
        notebook-level ``bin_indices``.
    call_bins : array-like, optional
        Bin label of each of the sample's calls. Defaults to the
        notebook-level ``het_bins``.
    call_freqs : array-like, optional
        Frequency of each of the sample's calls, aligned with ``call_bins``.
        Defaults to the notebook-level ``sample_frequencies``.

    Returns
    -------
    float
        NaN when the bin holds no SNPs, otherwise the NaN-ignoring sum
        divided by the bin size.
    """
    # Calling with only `bb` keeps the original behaviour of reading the
    # notebook-level variables set by the per-sample loop.
    if snp_bins is None:
        snp_bins = bin_indices
    if call_bins is None:
        call_bins = het_bins
    if call_freqs is None:
        call_freqs = sample_frequencies

    members = snp_bins[bb]
    if isinstance(members, tuple):
        # np.where(...) returns a tuple holding one index array per dimension.
        members = members[0]
    members = np.asarray(members)
    # Count SNPs in the bin. np.count_nonzero on an index array counts the
    # non-zero *index values*, which drops variant 0 from its bin, so index
    # arrays are measured by length and only boolean masks by their True count.
    if members.dtype == bool:
        bin_size = int(np.count_nonzero(members))
    else:
        bin_size = members.size
    if bin_size == 0:
        return np.nan

    in_bin = np.asarray(call_bins == bb)
    freqs = np.asarray(call_freqs)[in_bin]
    return np.nansum(2 * freqs * (1 - freqs)) / bin_size

def het(idx,ff):
    """Mean of ``2*f*(1-f)`` over the entries of ``ff`` selected by ``idx``.

    Parameters
    ----------
    idx : index tuple (as returned by ``np.where``), index array or boolean mask
        Selection applied to ``ff``.
    ff : array-like
        Frequencies.

    Returns
    -------
    float
        NaN when ``idx`` selects nothing, otherwise the NaN-ignoring mean.
    """
    selected = np.asarray(ff)[idx]
    # Test emptiness on the selected values. np.count_nonzero(idx) on an index
    # array counts non-zero index values, so a selection consisting only of
    # element 0 used to be reported as empty.
    if selected.size == 0:
        return np.nan
    return np.nanmean(2*selected*(1-selected))

In [13]:
# Dataset dimensions. `.sizes` is the name -> length mapping; using
# `Dataset.dims[...]` as a mapping is deprecated in xarray.
N_var = variant_dataset_filtered.sizes['variants']
N_sam = variant_dataset_filtered.sizes['samples']

# The dataset's `call_AD` variable; sliced later as
# AD_PASS[variants, samples, 0:2].
AD_PASS = variant_dataset_filtered.call_AD

In [14]:
# Number of frequency bins: the binning step builds its edges with
# np.linspace(0, 0.5, N_bins+1), and the result tables get one 'bin_<i>'
# column per bin.
N_bins = 10

In [15]:
# Pull the variant coordinates into memory (the dataset variables are lazy).
chroms = variant_dataset_filtered.variant_chrom.data.compute()
pos = variant_dataset_filtered.variant_position.data.compute()

# One '<chrom>:<position>' label per variant. The previous version indexed with
# N_var-1 on every iteration, so every row carried the last variant's label.
variant_labels = [chroms[i] + ':' + str(pos[i]) for i in range(N_var)]
bin_columns = ['bin_' + str(i) for i in range(N_bins)]

# Per-variant frequency for every population (rows: variants, columns: populations).
pop_freq = pd.DataFrame(np.full([N_var, all_populations.size], np.nan),
                        columns=all_populations, dtype='f4',
                        index=variant_labels)

# Per-population heterozygosity for each frequency bin.
pop_het_values = pd.DataFrame(np.full([all_populations.size, N_bins], np.nan), dtype='f4',
                              index=all_populations, columns=bin_columns)

# Per-sample result: the Fws value followed by the per-bin sample heterozygosity.
fws = pd.DataFrame(np.full([N_sam, 1 + N_bins], np.nan), dtype='f4',
                   index=pv4_metadata_50_callable.Sample, columns=['fws'] + bin_columns)

In [16]:
# Fws per population, in three stages:
#   1. population allele frequency per SNP, computed in blocks of SNPs;
#   2. SNPs binned by folded (minor-allele) frequency, with the population
#      heterozygosity of every bin;
#   3. per sample: heterozygosity per bin, then the slope of the
#      through-origin fit of sample vs population heterozygosity; Fws = 1 - slope.
#
# NOTE(review): allelic depths are cast to 'u2' below, which wraps any negative
# value (e.g. a -1 missing-data sentinel) to 65535 -- confirm call_AD contains
# no negative entries.
block_size = 10000  # SNPs loaded per block when computing population frequencies
n_blocks = int(np.ceil(N_var/block_size))
# Positional slice 1..N_bins of the fws columns, i.e. the per-bin columns.
fws_bin_columns = fws.columns[1:N_bins+1]

for population in all_populations:

    logging.info(f'Starting population {population}')

    pop_selection = (pv4_metadata_50_callable.Population == population)
    # Positions and names of this population's samples, computed once instead
    # of once per sample.
    pop_sample_idx = np.flatnonzero(np.asarray(pop_selection))
    pop_sample_names = pv4_metadata_50_callable.Sample[pop_selection].values
    n_pop_samples = pop_sample_idx.size
    print(n_pop_samples)

    logging.info(f'[{population}] Data extraction complete!')

    # Calculate population allele frequency in SNP blocks.
    allele_freq = np.full(N_var, np.nan, dtype='f4')
    for block in range(n_blocks):
        # block_end is exclusive. The previous inclusive end was used directly
        # as a slice bound, which skipped the last SNP of every block.
        block_start = block*block_size
        block_end = min((block+1)*block_size, N_var)
        logging.info(f'[{population}] Processing SNPs {block_start} to {block_end-1} of {N_var}')

        fracGT = fgt_525(np.array(AD_PASS[block_start:block_end,pop_sample_idx,0:2],dtype='u2'))
        allele_freq[block_start:block_end] = np.nanmean(fracGT,axis=1)

    # Adjust to MAF.
    pf = np.where(allele_freq > 0.5, 1-allele_freq, allele_freq)
    # NOTE(review): the folding used to be applied in place through
    # `pop_freq[population].values`, which (when that is a view) also folded the
    # stored column. Writing the folded values explicitly keeps that outcome
    # without relying on chained assignment -- confirm the pop_freq file is
    # meant to hold folded frequencies.
    pop_freq[population] = pf

    # Bin SNPs according to their frequency.
    binning = pd.cut(pf, bins=np.linspace(0,0.5, N_bins+1), include_lowest = True)
    # Compute the indices for the SNPs in each bin.
    bin_indices = {}
    for b in binning.categories:
        bin_indices[b] = np.where(binning == b)

    # Compute the mean expected heterozygosity in the population for each bin.
    pop_bins_het = [het(bin_indices[b],pf) for b in binning.categories]

    pop_het_values.loc[population] = pop_bins_het

    # Calculate Fws for every sample of the population.
    # bin_indices, het_bins and sample_frequencies stay notebook-level names
    # because sam_het() reads them.
    for sam_idx in range(n_pop_samples):
        if sam_idx % 10 == 0:
            logging.info(f'[{population}] Sample {sam_idx} of {n_pop_samples}')

        sample_name = pop_sample_names[sam_idx]
        sample_ad = np.array(AD_PASS[:,pop_sample_idx[sam_idx],0:2],dtype='u2')
        sample_frequencies_GT = fgt_525(sample_ad.reshape([N_var,1,2]))

        # Get the indices of the mixed calls in the sample.
        het_indices = np.where((sample_frequencies_GT > 0) & (sample_frequencies_GT < 1))
        het_bins = binning[het_indices[0]]

        sample_frequencies = sample_frequencies_GT[het_indices]

        # MAF
        sample_frequencies[sample_frequencies > 0.5] = 1-sample_frequencies[sample_frequencies > 0.5]

        # Compute sample mean heterozygosity for each bin.
        sample_het = [sam_het(b) for b in binning.categories]

        # Single .loc call: the chained `fws.loc[name][...] = ...` form only
        # updates the frame when the intermediate row happens to be a view.
        fws.loc[sample_name, fws_bin_columns] = np.asarray(sample_het, dtype='f4')

        good_values = ((~np.isnan(pop_bins_het)) & (~np.isnan(sample_het)))
        xi=np.array(pop_bins_het)[good_values]
        yi=np.array(sample_het)[good_values]

        if xi.size == 0:
            # No bin has both a population and a sample value: nothing to fit.
            fval = np.nan
        else:
            # Least-squares slope of sample vs population heterozygosity through
            # the origin. lstsq returns the solution as a 1-element array, so
            # take the scalar explicitly.
            m = float(np.linalg.lstsq(xi[:,np.newaxis], yi, rcond=None)[0][0])

            if m < 0:
                fval = np.nan
            elif m > 1:
                fval = 0
            else:
                fval = 1-m

        fws.loc[sample_name, 'fws'] = np.float32(fval)

    logging.info(f'[{population}] Writing interim results')

    pop_freq.to_csv(pop_freq_fn, sep='\t',na_rep='NaN')
    pop_het_values.to_csv(pop_het_values_fn, sep='\t',na_rep='NaN',header=binning.categories)
    fws.to_csv(fws_fn, sep='\t',na_rep='NaN')

    logging.info(f'[{population}] Done!')


155


  


175
313
51
63
135
85


In [None]:
# Final write of the three result tables as tab-separated files with NaN
# written as 'NaN'. Only the heterozygosity table overrides its header, using
# the bin intervals from `binning`.
for _table, _out_fn, _csv_opts in (
    (pop_freq, pop_freq_fn, {}),
    (pop_het_values, pop_het_values_fn, {'header': binning.categories}),
    (fws, fws_fn, {}),
):
    _table.to_csv(_out_fn, sep='\t', na_rep='NaN', **_csv_opts)

# To Do 
* Change any names that don't make sense 
* Split FWS into functions