# Introduction

This notebook is used to create the sample metadata files for the internal release of Pf 6.0 and Pv 3.0. This brings together the following three pieces of information:
1. Summaries of read data 
1. Source code and run accessions
1. Summaries of genotypes

Item 1 is created during the build and stored as metadata in VR-PIPE. Item 2 was created by Jim from Solaris, and sent by email on 01/12/2016 17:31 and 06/12/2016 13:58. Item 3 was created using the notebook `20161205_sample_level_summaries.ipynb`.

Note that PH0385-C wasn't originally in the Pv file (it is annotated as Pf, but we also put it in the Pv build). I manually added it from the Pf file. This means `pv_30_all_samples_src_acc.tab` isn't exactly the same as the file Jim emailed.

In [1]:
%run _standard_imports.ipynb
%run _plotting_setup.ipynb


python 3.5.2 |Continuum Analytics, Inc.| (default, Jul  2 2016, 17:53:06) 
[GCC 4.4.7 20120313 (Red Hat 4.4.7-1)]
numpy 1.11.2
scipy 0.18.1
pandas 0.19.0
numexpr 2.6.1
pysam 0.8.4
petl 1.1.0
petlx 1.0.3
vcf 0.6.8
vcfnp 2.2.0
h5py 2.6.0
tables 3.3.0


In [2]:
output_dir = '/nfs/team112_internal/rp7/data/methods-dev/builds/Pf6.0/20161212_release_sample_metadata'
!mkdir -p {output_dir}

releases = ['pf_60', 'pv_30']
vrpipe_metadata_format = "%s/{release}_vrpipe_bam_summaries.txt" % output_dir
solaris_metadata_format = "%s/{release}_all_samples_src_acc.tab" % output_dir
genotype_summary_format = "/lustre/scratch109/malaria/rp7/data/methods-dev/builds/Pf6.0/20161205_sample_level_summaries/{release}_summaries.txt"

# hdf_fn = collections.OrderedDict()
# hdf_fn['pf_60'] = '/nfs/team112_internal/production/release_build/Pf/6_0_release_packages/hdf5/Pf_60.h5'
# hdf_fn['pv_30'] = '/nfs/team112_internal/production/release_build/Pv/3_0_release_packages/hdf5/Pv_30.h5'

sample_metadata_fn = collections.OrderedDict()
sample_metadata_fn['pf_60'] = '/nfs/team112_internal/production/release_build/Pf/6_0_release_packages/Pf_60_sample_metadata.txt'
sample_metadata_fn['pv_30'] = '/nfs/team112_internal/production/release_build/Pv/3_0_release_packages/Pv_30_sample_metadata.txt'

GENOME_FN = collections.OrderedDict()
genome_fn = collections.OrderedDict()
genome = collections.OrderedDict()
GENOME_FN['pf_60'] = "/lustre/scratch116/malaria/pfalciparum/resources/Pfalciparum.genome.fasta"
GENOME_FN['pv_30'] = "/lustre/scratch109/malaria/pvivax/resources/gatk/PvivaxP01.genome.fasta"
genome_fn['pf_60'] = "%s/Pfalciparum.genome.fasta" % output_dir
genome_fn['pv_30'] = "%s/PvivaxP01.genome.fasta" % output_dir

nfs_release_dir = collections.OrderedDict()
nfs_release_dir['pf_60'] = '/nfs/team112_internal/production/release_build/Pf/6_0_release_packages'
nfs_release_dir['pv_30'] = '/nfs/team112_internal/production/release_build/Pv/3_0_release_packages'


In [3]:
# List the contents of output_dir (all entries, long format, oldest first).
!ls -altr {output_dir}

total 8
drwxrwsr-x 10 rp7 team112 4096 Dec 12 14:47 ..
drwxrwsr-x  2 rp7 team112 4096 Dec 12 14:47 .


# Determine genome lengths

In [4]:
genome_length = collections.OrderedDict()
for release in releases:
    !cp {GENOME_FN[release]} {genome_fn[release]}
    genome = pyfasta.Fasta(genome_fn[release])
    genome
    genome_length[release] = 0
    for chrom in genome.keys():
        genome_length[release] += len(genome[chrom])
    print(release, genome_length[release])

pf_60 23332839
pv_30 29052596


# Get VR-PIPE metadata

In [5]:
# Header for the VR-PIPE summary table, one entry per column, in file order.
vrpipe_columns = [
    # BAM location and sample identity
    'path',
    'sample',
    'study',
    # depth of coverage
    'bases_of_1X_coverage',
    'bases_of_2X_coverage',
    'bases_of_5X_coverage',
    'mean_coverage',
    # library / read characteristics
    'mean_insert_size',
    'sd_insert_size',
    'avg_read_length',
    # callability breakdown (percentages)
    'bases_callable_percent',
    'bases_no_coverage_percent',
    'bases_low_coverage_percent',
    'bases_excessive_coverage_percent',
    'bases_poor_mapping_quality_percent',
    'bases_ref_n_percent',
    # read counts
    'reads',
    'reads_mapped',
    'reads_mapped_and_paired',
    'reads_properly_paired',
    'reads_qc_failed',
    'pairs_on_different_chromosomes',
    'non_primary_alignments',
    # sequencing centre
    'center_name',
]
# Show the comma-separated key list (every column except 'path') that is
# passed to `vrpipe-fileinfo --metadata` in the next cell.
print(*vrpipe_columns[1:], sep=",")

sample,study,bases_of_1X_coverage,bases_of_2X_coverage,bases_of_5X_coverage,mean_coverage,mean_insert_size,sd_insert_size,avg_read_length,bases_callable_percent,bases_no_coverage_percent,bases_low_coverage_percent,bases_excessive_coverage_percent,bases_poor_mapping_quality_percent,bases_ref_n_percent,reads,reads_mapped,reads_mapped_and_paired,reads_properly_paired,reads_qc_failed,pairs_on_different_chromosomes,non_primary_alignments,center_name


In [6]:
# For each release, query the `<release>_mergelanes` VR-PIPE setup for the
# metadata keys in vrpipe_columns[1:] and redirect the result to that release's
# vrpipe_bam_summaries file in output_dir.
# NOTE(review): 'path' is left out of the --metadata list; presumably
# vrpipe-fileinfo emits the file path as the first column itself (the loading
# cell applies the full vrpipe_columns header, path first) - TODO confirm.
for release in releases:
    !vrpipe-fileinfo --setup {release}_mergelanes --metadata {",".join(vrpipe_columns[1:])} \
> {vrpipe_metadata_format.format(release=release)}



# Load individual tables

In [8]:
# The two source/accession files use different field separators:
# tab for Pf, a single space for Pv.
delimiters = collections.OrderedDict([
    ('pf_60', '\t'),
    ('pv_30', ' '),
])

# Source code and run accession table for each release.
tbl_solaris = collections.OrderedDict()
for release in releases:
    solaris_fn = solaris_metadata_format.format(release=release)
    solaris_raw = etl.fromcsv(solaris_fn, delimiter=delimiters[release])
    # Replace the first row of the file with a uniform header.
    tbl_solaris[release] = solaris_raw.setheader(
        ['sample', 'source_code', 'study', 'run_accessions']
    )
    # Report the row count and preview the table.
    print(len(tbl_solaris[release].data()))
    tbl_solaris[release].display(index_header=True)


8247


0|sample,1|source_code,2|study,3|run_accessions
FP0008-C,KB02.14,1147-PF-MR-CONWAY,ERR1081237
FP0009-C,KB03.14,1147-PF-MR-CONWAY,ERR1081238
FP0015-C,KB18.14,1147-PF-MR-CONWAY,ERR1081239
FP0016-C,KB19.14,1147-PF-MR-CONWAY,ERR1081240
FP0017-C,KB20.14,1147-PF-MR-CONWAY,ERR1081241


734


0|sample,1|source_code,2|study,3|run_accessions
PD0165-C,WP_0240,1154-PV-TH-PRICE,ERR111724
PD0166-C,WPP_517,1154-PV-TH-PRICE,ERR111710
PD0167-C,WPP_812,1154-PV-TH-PRICE,ERR111727
PD0168-C,WP_1081,1154-PV-TH-PRICE,ERR111721
PD0169-C,WP_5062,1154-PV-TH-PRICE,ERR111728


In [9]:
def _is_selected_bam(rec):
    """Keep rows whose BAM path contains 'pe', plus sample PN0002-C regardless of path.

    NOTE(review): the paths shown in the output are named pe.1.markdup.bam, so
    'pe' presumably marks paired-end BAMs; why PN0002-C is exempt is not
    recorded here - TODO confirm.
    """
    return 'pe' in rec['path'] or rec['sample'] == 'PN0002-C'


# VR-PIPE read summary table for each release.
tbl_vrpipe = collections.OrderedDict()
for release in releases:
    vrpipe_fn = vrpipe_metadata_format.format(release=release)
    vrpipe_raw = etl.fromtsv(vrpipe_fn).setheader(vrpipe_columns)
    # Filter first, then turn numeric-looking strings into numbers.
    tbl_vrpipe[release] = vrpipe_raw.select(_is_selected_bam).convertnumbers()
    # Report the row count and preview the table.
    print(len(tbl_vrpipe[release].data()))
    tbl_vrpipe[release].display(index_header=True)


8247


0|path,1|sample,2|study,3|bases_of_1X_coverage,4|bases_of_2X_coverage,5|bases_of_5X_coverage,6|mean_coverage,7|mean_insert_size,8|sd_insert_size,9|avg_read_length,10|bases_callable_percent,11|bases_no_coverage_percent,12|bases_low_coverage_percent,13|bases_excessive_coverage_percent,14|bases_poor_mapping_quality_percent,15|bases_ref_n_percent,16|reads,17|reads_mapped,18|reads_mapped_and_paired,19|reads_properly_paired,20|reads_qc_failed,21|pairs_on_different_chromosomes,22|non_primary_alignments,23|center_name
/lustre/scratch116/malaria/pfalciparum/output/4/4/f/e/42104/4_bam_mark_duplicates_v2/pe.1.markdup.bam,PL0006-C,1039-PF-PFLAB9-BAKER,23328732,23323174,23303359,155.17,359.1,57.9,75,95.03,0.03,0.16,0.0,4.79,0.0,48462580,47724735,47069254,45992388,0,379610,121810,SC
/lustre/scratch116/malaria/pfalciparum/output/3/f/1/3/45100/4_bam_mark_duplicates_v2/pe.1.markdup.bam,WS0081-C,1040-PF-GB-HAMILTON,22734762,22315539,20965156,71.23,515.9,136.8,99,80.57,3.57,9.76,0.0,6.1,0.0,17065794,16721832,16568962,15040567,0,696794,291404,SC
/lustre/scratch116/malaria/pfalciparum/output/5/4/3/5/44258/4_bam_mark_duplicates_v2/pe.1.markdup.bam,QJ0018-C,1103-PF-PDN-GMSN-NGWA,23085894,22901856,22282958,82.26,245.4,83.2,99,86.85,1.72,4.64,0.0,6.79,0.0,20481614,19234352,19115971,17902861,0,552103,263904,SC
/lustre/scratch116/malaria/pfalciparum/output/d/6/0/c/41088/4_bam_mark_duplicates_v2/pe.1.markdup.bam,PH0458-CW,1044-PF-KH-FAIRHURST,23012247,22828712,22292781,41.68,338.4,425.9,74,88.53,1.94,4.89,0.0,4.64,0.0,13193460,13010248,12937989,11893443,0,199394,118346,SC
/lustre/scratch116/malaria/pfalciparum/output/3/9/6/1/38157/4_bam_mark_duplicates_v2/pe.1.markdup.bam,PD0986-C,1125-PF-TH-NOSTEN,22281579,21302986,18332682,15.17,240.8,79.1,99,70.37,5.72,19.97,0.0,3.93,0.0,4300147,3513687,3490149,3250191,0,109457,54893,SC


1007


0|path,1|sample,2|study,3|bases_of_1X_coverage,4|bases_of_2X_coverage,5|bases_of_5X_coverage,6|mean_coverage,7|mean_insert_size,8|sd_insert_size,9|avg_read_length,10|bases_callable_percent,11|bases_no_coverage_percent,12|bases_low_coverage_percent,13|bases_excessive_coverage_percent,14|bases_poor_mapping_quality_percent,15|bases_ref_n_percent,16|reads,17|reads_mapped,18|reads_mapped_and_paired,19|reads_properly_paired,20|reads_qc_failed,21|pairs_on_different_chromosomes,22|non_primary_alignments,23|center_name
/lustre/scratch116/malaria/pvivax/output/8/c/9/1/74387/4_bam_mark_duplicates_v2/pe.1.markdup.bam,PY0107-C,1157-PV-MULTI-PRICE,27628488,27373491,26735871,31.31,256.2,84.5,99,87.89,5.68,3.78,0.0,2.13,0.52,9218011,8733329,8669057,8527014,0,59796,33259,SC
/lustre/scratch116/malaria/pvivax/output/c/7/2/1/74031/4_bam_mark_duplicates_v2/pe.1.markdup.bam,PJ0038-Cx,1156-PV-ID-PRICE,27583887,27018196,24928907,10.56,251.1,98.6,99,81.78,5.64,11.18,0.0,0.88,0.52,3792717,3132690,3088366,3006902,0,33036,12497,SC
/lustre/scratch116/malaria/pvivax/output/0/9/e/4/73742/4_bam_mark_duplicates_v2/pe.1.markdup.bam,IVC10-OM-0303,chapelhill,28352150,27699929,23316288,9.65,297.3,38.7,99,64.38,2.47,31.37,0.0,1.27,0.52,12192991,3153915,2828060,2604838,0,77719,14963,CHAPELHILL
/lustre/scratch116/malaria/pvivax/output/3/d/2/b/74482/4_bam_mark_duplicates_v2/pe.1.markdup.bam,QS0013-C,1157-PV-MULTI-PRICE,27562512,27301558,26632165,19.17,227.5,83.6,99,87.25,6.09,3.95,0.0,2.2,0.52,6974159,5664480,5579789,5448822,0,53956,21763,SC
/lustre/scratch116/malaria/pvivax/output/a/b/a/8/74269/4_bam_mark_duplicates_v2/pe.1.markdup.bam,PV0062-C,1049-PV-VN-BONI,28478405,28357308,28057132,62.81,360.0,65.1,75,91.12,2.01,2.78,0.0,3.57,0.52,26003444,23865831,23477753,22913696,0,201014,30578,SC


In [10]:
# Per-sample genotype summary table for each release, with numeric-looking
# strings converted to numbers.
tbl_genotype_summary = collections.OrderedDict()
for release in releases:
    genotype_summary_fn = genotype_summary_format.format(release=release)
    tbl_genotype_summary[release] = etl.fromtsv(genotype_summary_fn).convertnumbers()
    # Report the row count and preview the table.
    print(len(tbl_genotype_summary[release].data()))
    tbl_genotype_summary[release].display(index_header=True)


7182


0|sample_id,1|num_variants,2|num_pass_variants,3|num_missing,4|num_pass_missing,5|num_called,6|num_pass_called,7|num_hom_ref,8|num_het,9|num_pass_het,10|num_hom_alt,11|num_pass_hom_alt,12|num_pass_non_ref,13|num_biallelic_het,14|num_biallelic_hom_alt,15|num_spanning_del_het,16|num_spanning_del_hom_alt,17|num_multiallelic_het,18|num_multiallelic_hom_alt,19|num_snp_het,20|num_snp_hom_alt,21|num_snp,22|num_indel_het,23|num_indel_hom_alt,24|num_indel,25|num_ins_het,26|num_ins_hom_alt,27|num_ins,28|num_del_het,29|num_del_hom_alt,30|num_del,31|num_coding_het,32|num_coding_hom_alt,33|num_coding,34|num_hq_snp_called,35|num_hq_snp_hom_ref,36|num_hq_snp_het,37|num_hq_snp_hom_alt,38|num_vhq_snp_called,39|num_vhq_snp_hom_ref,40|num_vhq_snp_het,41|num_vhq_snp_hom_alt,42|num_singleton,43|num_biallelic_singleton,44|num_hq_snp_singleton,45|num_vhq_snp_singleton,46|num_bi_nonsynonymous,47|num_bi_synonymous,48|num_frameshift,49|num_inframe,50|num_bi_frameshift,51|num_bi_inframe,52|num_hq_frameshift,53|num_hq_inframe,54|num_bi_frameshift_snpeff,55|num_bi_inframe_snpeff,56|num_bi_transition,57|num_bi_transversion,58|num_bi_AT_to_AT,59|num_bi_CG_to_CG,60|num_bi_AT_to_CG,61|num_bi_CG_to_AT,62|num_phased,63|num_phased_non_ref,64|num_phased_hom_ref,65|num_phased_missing,66|num_GQ_30,67|num_het_GQ_30,68|num_hom_alt_GQ_30,69|num_GQ_99,70|num_het_GQ_99,71|num_hom_alt_GQ_99,72|pc_pass,73|pc_missing,74|pc_pass_missing,75|pc_het,76|pc_pass_het,77|pc_hq_snp_het,78|pc_vhq_snp_het,79|pc_hom_alt,80|pc_pass_hom_alt,81|pc_snp,82|pc_biallelic,83|pc_spanning_del,84|pc_mutliallelic,85|pc_ins,86|pc_coding,87|pc_bi_nonsynonymous,88|pc_frameshift,89|pc_bi_frameshift,90|pc_hq_frameshift,91|pc_bi_frameshift_snpeff,92|pc_bi_transition,93|pc_bi_AT_to_AT,94|pc_bi_CG_to_CG,95|pc_bi_AT_to_CG,96|pc_bi_CG_to_AT,97|pc_phased,98|pc_phased_hom_ref,99|pc_phased_missing,100|pc_GQ_30,101|pc_het_GQ_30,102|pc_hom_alt_GQ_30,103|pc_GQ_99,104|pc_het_GQ_99,105|pc_hom_alt_GQ_99,106|mean_GQ,107|mean_GQ_hom_ref,108|mean_GQ_het,109
|mean_GQ_hom_alt,110|mean_DP,111|mean_DP_hom_ref,112|mean_DP_het,113|mean_DP_hom_alt,114|mean_indel_len,115|total_indel_len
FP0008-C,6051696,3114760,705552,214960,5346144,2899800,5173409,75703,30237,97032,65157,95394,3556,5476,10087,30724,16594,28957,9188,19951,29139,21049,21049,42098,4105,4728,8833,13145,31726,44871,13744,17108,30852,1042923,1037626,2260,3037,83095,79328,1295,2472,396,251,142,11,3510,1754,6589,7780,39,62,0,5,32,62,4656,3837,1752,472,3172,3097,20733,17693,2953,87,2098968,21382,28661,1195439,11750,8487,0.5424096320637828,0.1165874822529089,0.069013342922087,0.0141602994606954,0.0104272708462652,0.0021669864409932,0.0155845718755641,0.0181499039307583,0.0224694806538381,0.3054594628592993,0.0946810071912279,0.4278151665723211,0.4775038262364509,0.1644756442723074,0.3234165670796905,0.6667933130699089,0.4585566149349294,0.3861386138613861,0.0,0.3404255319148936,0.548216178028965,0.2062875309078064,0.0555751795596373,0.3734840456846815,0.3646532438478747,0.1854728808939765,0.1424299426035788,0.0041962089422659,0.723831988412994,0.7071468730363462,0.4398759918351059,0.4122487757776398,0.3885967523233125,0.1302546157742069,58.409232172,63.3459959079,60.7741508748,37.5281397241,23.9188714379,25.8241581283,23.5613652148,15.0279325322,-0.188319164237,-586569
PD0658-C,6051696,3114760,524040,85572,5527656,3029188,5303092,75208,19249,149356,110605,129854,93,8506,5260,53018,13896,49081,2529,33619,36148,16720,16720,33440,3780,6980,10760,10990,55816,66806,2783,29634,32417,1040633,1035712,20,4901,83094,78899,2,4193,6,0,0,0,3403,1490,7277,8424,32,55,1,10,29,54,4435,3597,1644,470,2918,3000,23340,20308,2964,68,2626162,10033,83883,2078763,2681,43101,0.5480058816974138,0.0865939068981654,0.0274730637352476,0.0136057670737831,0.006354508204839,1.921907146900012e-05,2.4069126531398172e-05,0.0270197711290282,0.0365130853548871,0.2783741740724198,0.0662205245891539,0.4487963405054908,0.4849831349053552,0.1387205734471289,0.2496419055246661,0.6954833435520131,0.4634736640978281,0.367816091954023,0.0909090909090909,0.3493975903614458,0.5521663346613546,0.2046812749003984,0.05851593625498,0.363296812749004,0.3735059760956175,0.1563910237651516,0.1269922879177377,0.0029134532990574,0.8669524638285904,0.5212218816561899,0.7584015189186746,0.6862443004527946,0.1392799625954595,0.3896840106685954,78.0518598544,81.1523677507,40.4480232739,63.7078613083,56.2662417008,58.2032839266,30.4262559094,45.9739794765,-0.283569841657,-883252
PD0659-C,6051696,3114760,549879,92015,5501817,3022745,5280371,73290,18457,148156,109727,128184,129,8492,4865,52431,13463,48804,2273,33241,35514,16184,16184,32368,3647,6904,10551,10566,55534,66100,2581,29373,31954,1040704,1035788,28,4888,83101,78906,2,4193,0,0,0,0,3397,1490,7168,8298,34,58,1,10,31,57,4446,3587,1636,471,2920,3006,22610,19950,2597,63,2642340,9722,85043,2104583,2626,43490,0.5494084954116067,0.0908636190581946,0.0295416019211753,0.0133210537536962,0.0061060393781149,2.6904864399483427e-05,2.406709907221333e-05,0.0269285583290029,0.0363004487642854,0.277054858640704,0.067254883604818,0.4469824627098546,0.4857626536853273,0.1376498675816362,0.2492822817200274,0.695109474114999,0.4634682529419371,0.3695652173913043,0.0909090909090909,0.3522727272727273,0.5534669488360513,0.2036599029005353,0.0586331383044939,0.3635005601892194,0.3742063986057513,0.1556356487549148,0.114860681114551,0.0027863777089783,0.8741524673765071,0.526737823048166,0.7750416943869786,0.696248939291935,0.1422766430080728,0.396347298294859,78.7335614943,82.0069675505,41.0608983042,64.7435635714,49.631911929,51.4609130711,28.0173917755,39.737557757,-0.279191013112,-869613
PD0660-C,6051696,3114760,834290,270657,5217406,2844103,5039331,59148,14764,118927,85574,100338,2135,7984,3694,39096,8935,38494,4347,27158,31505,10417,10417,20834,2350,5235,7585,6569,41954,48523,5708,27710,33418,1042193,1035409,1927,4857,83132,79071,26,4035,43,22,12,1,3801,2909,7081,7823,92,65,1,6,80,64,5333,4249,2138,458,3806,3180,22450,19644,2700,106,1905092,8557,45277,1220992,4043,20167,0.5451182062503858,0.137860527032422,0.0868949774621479,0.0113366680683849,0.005191091883803,0.001848985744483,0.000312755617572,0.0227942774627851,0.0300882211368575,0.3139887181327114,0.1008491299408001,0.4264585700332874,0.4726923000259124,0.1351857132672702,0.333054276545277,0.566467958271237,0.4751073537305421,0.5859872611464968,0.1428571428571428,0.5555555555555556,0.5565643915675225,0.2231266958881235,0.0477979544980171,0.3972030891254435,0.3318722604884158,0.1957782694492615,0.1202672605790645,0.0047216035634743,0.6698393131331741,0.5795854781901923,0.5290976231098231,0.4293065335538129,0.2738417772961257,0.2356673756047397,54.894927057,60.6195330139,48.4737875914,46.0782831234,26.0158997804,28.4419387229,30.846586291,22.2184425176,-0.206259551298,-642449
PD0662-C,6051696,3114760,525522,82397,5526174,3032363,5298188,77745,19295,150241,111246,130541,162,8625,5021,53465,14112,49156,2479,33924,36403,16816,16816,33632,3838,7035,10873,10976,55977,66953,2651,29992,32643,1040428,1035468,54,4906,83049,78880,2,4167,1,0,0,0,3426,1501,7411,8393,32,57,0,10,29,57,4504,3696,1694,492,2955,3059,23481,20691,2717,73,2654836,10422,86821,2145563,3143,46576,0.5487273835387738,0.0868387969256882,0.0264537235613658,0.014068503814755,0.006363024479589,5.19017173701592e-05,2.4082168358438997e-05,0.0271871642116227,0.0366862410601896,0.2788625795727013,0.0673121854436537,0.4480278226764005,0.4846599918799457,0.1397090946470331,0.2500593683210639,0.6953521412624315,0.4689319159706403,0.3595505617977528,0.0,0.3372093023255814,0.5492682926829269,0.2065853658536585,0.06,0.3603658536585366,0.3730487804878049,0.1585019265977738,0.1157105745070482,0.0031088965546612,0.8755007233632649,0.5401399326250323,0.7804415439656257,0.7075548013216095,0.1628919409173361,0.4186757276666127,79.3544054759,82.3621841726,42.9638248251,65.9847814753,58.9806819787,60.9063667585,33.6668566986,48.8595095554,-0.284364766467,-885728


1001


0|sample_id,1|num_variants,2|num_pass_variants,3|num_missing,4|num_pass_missing,5|num_called,6|num_pass_called,7|num_hom_ref,8|num_het,9|num_pass_het,10|num_hom_alt,11|num_pass_hom_alt,12|num_pass_non_ref,13|num_biallelic_het,14|num_biallelic_hom_alt,15|num_spanning_del_het,16|num_spanning_del_hom_alt,17|num_multiallelic_het,18|num_multiallelic_hom_alt,19|num_snp_het,20|num_snp_hom_alt,21|num_snp,22|num_indel_het,23|num_indel_hom_alt,24|num_indel,25|num_ins_het,26|num_ins_hom_alt,27|num_ins,28|num_del_het,29|num_del_hom_alt,30|num_del,31|num_coding_het,32|num_coding_hom_alt,33|num_coding,34|num_hq_snp_called,35|num_hq_snp_hom_ref,36|num_hq_snp_het,37|num_hq_snp_hom_alt,38|num_vhq_snp_called,39|num_vhq_snp_hom_ref,40|num_vhq_snp_het,41|num_vhq_snp_hom_alt,42|num_singleton,43|num_biallelic_singleton,44|num_hq_snp_singleton,45|num_vhq_snp_singleton,46|num_bi_nonsynonymous,47|num_bi_synonymous,48|num_frameshift,49|num_inframe,50|num_bi_frameshift,51|num_bi_inframe,52|num_hq_frameshift,53|num_hq_inframe,54|num_bi_frameshift_snpeff,55|num_bi_inframe_snpeff,56|num_bi_transition,57|num_bi_transversion,58|num_bi_AT_to_AT,59|num_bi_CG_to_CG,60|num_bi_AT_to_CG,61|num_bi_CG_to_AT,62|num_phased,63|num_phased_non_ref,64|num_phased_hom_ref,65|num_phased_missing,66|num_GQ_30,67|num_het_GQ_30,68|num_hom_alt_GQ_30,69|num_GQ_99,70|num_het_GQ_99,71|num_hom_alt_GQ_99,72|pc_pass,73|pc_missing,74|pc_pass_missing,75|pc_het,76|pc_pass_het,77|pc_hq_snp_het,78|pc_vhq_snp_het,79|pc_hom_alt,80|pc_pass_hom_alt,81|pc_snp,82|pc_biallelic,83|pc_spanning_del,84|pc_mutliallelic,85|pc_ins,86|pc_coding,87|pc_bi_nonsynonymous,88|pc_frameshift,89|pc_bi_frameshift,90|pc_hq_frameshift,91|pc_bi_frameshift_snpeff,92|pc_bi_transition,93|pc_bi_AT_to_AT,94|pc_bi_CG_to_CG,95|pc_bi_AT_to_CG,96|pc_bi_CG_to_AT,97|pc_phased,98|pc_phased_hom_ref,99|pc_phased_missing,100|pc_GQ_30,101|pc_het_GQ_30,102|pc_hom_alt_GQ_30,103|pc_GQ_99,104|pc_het_GQ_99,105|pc_hom_alt_GQ_99,106|mean_GQ,107|mean_GQ_hom_ref,108|mean_GQ_het,109
|mean_GQ_hom_alt,110|mean_DP,111|mean_DP_hom_ref,112|mean_DP_het,113|mean_DP_hom_alt,114|mean_indel_len,115|total_indel_len
Brazil01,4084419,1182447,2146665,77366,1937754,1105081,1857725,43912,27350,36117,25383,52733,11378,11306,7962,6808,8010,7269,15617,14390,30007,11733,11733,23466,2083,1522,3605,6684,6769,13453,11286,11016,22302,388899,378330,5170,5399,121217,114604,3047,3566,390,273,95,8,6216,4313,2134,3852,70,189,31,102,58,187,12643,8832,1722,2441,8442,8870,13075,11449,1601,25,317025,21423,7114,57159,8338,1756,0.5702896239667161,0.5255741391860139,0.0654287253466751,0.0226612872428595,0.0247493170183905,0.0132939400718438,0.0251367382462855,0.0186385887991974,0.0229693569973603,0.569036466728614,0.4301670680598486,0.2800902660573076,0.2897426658828437,0.2113377887208348,0.4229230273263421,0.5903694557887739,0.3564984964918142,0.2702702702702703,0.2330827067669172,0.236734693877551,0.5887310826542491,0.0801862630966239,0.1136670547147846,0.3931082654249127,0.4130384167636787,0.2171126239736028,0.1224474187380497,0.0019120458891013,0.2868794233182907,0.783290676416819,0.280266319977938,0.0517238102908293,0.3048628884826325,0.0691801599495725,23.2768225553,23.9084646904,61.7368555759,26.5957924595,14.6020176803,15.3025900178,13.3342595978,23.7348619155,-0.151584806761,-179241
mauritania,4084419,1182447,662563,6801,3421856,1175646,3036955,164289,9995,220612,63295,73290,319,23076,5938,20044,3738,20175,4486,32272,36758,5509,5509,11018,1275,4592,5867,3091,19284,22375,2014,27205,29219,406895,396294,68,10533,125865,118620,22,7223,2084,1397,538,380,6280,4285,3719,6134,112,226,44,118,92,223,12828,9115,1758,2540,9229,8416,16551,16053,496,2,1146498,8893,62593,1140502,7375,61524,0.3435696884965352,0.1622171966196416,0.0057516319970366,0.0480116638455855,0.0085017088477313,0.000167119281387,0.0001747904500854,0.0644714447364237,0.0538384853944129,0.501541820166462,0.3192113521626415,0.354509482876245,0.3262791649611133,0.2077402450251398,0.398676490653568,0.5944155229531471,0.3774484928448188,0.3313609467455621,0.2716049382716049,0.2920634920634921,0.5846055689741603,0.080116665907123,0.115754454723602,0.4205897097024108,0.3835391696668641,0.2190339746213671,0.0299679777656939,0.0001208386200229,0.9752068224618636,0.8897448724362181,0.9889090765463306,0.9701066477494076,0.7378689344672336,0.972019906785686,95.84867567,96.4590966983,83.7236618309,97.4310293072,419.354600248,419.472902583,246.873736868,453.430270953,-0.28494131238,-336928
IVC10-OM-0292,4084419,1182447,685572,8608,3398847,1173839,3073140,117052,5706,208655,54460,60166,389,19771,2753,17087,2564,17602,2325,27399,29724,3381,3381,6762,772,4382,5154,1841,16442,18283,1215,23008,24223,407355,398263,95,8997,125947,119940,13,5994,395,283,118,72,5442,3613,3021,5131,83,188,41,95,67,186,11070,7854,1569,2235,7512,7608,16373,14753,1594,26,1092034,3918,48011,335649,1904,12343,0.3453638836934996,0.1678505559787083,0.0072798188840599,0.0344387376071944,0.0048609732680546,0.000233211817702,0.0001032180202783,0.0613899360577278,0.0463947781595261,0.4940331748828241,0.3350729647973939,0.3297543463085463,0.3351726888940597,0.2199086913854162,0.4026027989229797,0.6009939260077305,0.3705839057899902,0.3062730627306273,0.3014705882352941,0.2648221343873518,0.5849714648065948,0.0829105897273303,0.1181039949270767,0.3969562460367787,0.4020291693088142,0.2452049330186484,0.0973554021865266,0.0015879802113235,0.9303098636184348,0.68664563617245,0.8815828130738157,0.2859412577022914,0.3336838415702769,0.2266434080058758,71.218775979,72.1300354772,57.4244654749,65.2862835108,28.3793920573,28.6896548628,27.7276550999,23.6087219978,-0.223808762676,-264642
IVC10-OM-0296,4084419,1182447,682270,9496,3402149,1172951,3076952,127729,5342,197468,53177,58519,349,19689,2590,16460,2403,17028,2124,27101,29225,3218,3218,6436,732,4334,5066,1732,15614,17346,1237,22454,23691,407305,398177,58,9070,125942,119832,7,6103,369,241,106,56,5507,3589,2833,4865,96,187,36,88,78,185,11012,7818,1576,2227,7439,7588,15556,13939,1586,31,1063927,3408,44823,179703,1749,6957,0.3447676747843789,0.1670421178630302,0.0080308039176385,0.0375436231628891,0.0045543249462253,0.0001423994304022,5.5581140525003575e-05,0.0580421374842783,0.045336079682783,0.4994104478887199,0.3424187016182778,0.3255352962285753,0.3320460021531468,0.2260396216312689,0.4048428715459936,0.6054309586631487,0.3680176669264744,0.3392226148409894,0.2903225806451613,0.2965779467680608,0.5848114710568242,0.0836962294211364,0.1182687201274561,0.39506107275624,0.4029739776951673,0.2381961414241528,0.1019542298791463,0.0019928002057084,0.907051530711854,0.6379633096218644,0.8429020065065724,0.1532058883960199,0.3274054661175589,0.1308272373394512,63.251817629,64.064401417,54.3148633471,58.415348741,24.8226609734,25.1298841024,21.7229502059,20.3678281964,-0.20844908905,-246480
IVC10-OM-0299,4084419,1182447,584444,10473,3499975,1171974,3154158,147765,4999,198052,52853,57852,373,19715,2348,16326,2278,16812,2021,27087,29108,2978,2978,5956,707,4332,5039,1548,15502,17050,1232,22074,23306,407309,398237,95,8977,125928,120016,9,5903,641,456,193,91,5379,3657,2669,4825,103,204,51,88,87,203,11098,7758,1517,2187,7492,7660,15382,13961,1361,60,1025611,3172,41964,96021,1474,4239,0.334852106086472,0.1430910981463948,0.0088570565953484,0.0422188729919499,0.0042654529878649,0.0002332381557981,7.14694110920526e-05,0.0565866899049278,0.0450974168368922,0.5031459586531148,0.3472308649657747,0.3227891861992671,0.3299799488349582,0.2281225949567658,0.4028555624697504,0.5952855245683931,0.3561515879370163,0.3355048859934853,0.3669064748201439,0.3,0.5885659736953754,0.0804518455663979,0.1159843020789138,0.3973271107339838,0.4062367416207043,0.2413226854732766,0.0884800416070732,0.0039006631127291,0.8751141236921638,0.6345269053810763,0.7939757440448035,0.0819309984692493,0.2948589717943589,0.080203583524114,56.5896695581,57.3247768198,52.4772954591,52.6962707888,22.0369877043,22.3273878444,19.4160832166,17.8907157588,-0.193370189108,-228650


# Create full metadata table

In [21]:
# Columns that lead the output file, in this order. Every other column follows
# in the order it appears in the joined table.
initial_columns = ['sample', 'study', 'source_code', 'run_accessions', 'center_name',
                   'pc_bases_callable', 'pc_bases_no_coverage', 'pc_bases_low_coverage', 'pc_bases_excessive_coverage',
                   'pc_bases_poor_mapping_quality', 'pc_bases_ref_n',
                   'bases_of_1X_coverage', 'bases_of_2X_coverage', 'bases_of_5X_coverage',
                   'pc_genome_covered_at_1x', 'pc_genome_covered_at_5x',
                  ]

# (VR-PIPE percentage column, output column) pairs. Each percentage (0-100) is
# replaced by a fraction (0-1) rounded to 4 decimal places.
percent_to_fraction_columns = [
    ('bases_callable_percent', 'pc_bases_callable'),
    ('bases_no_coverage_percent', 'pc_bases_no_coverage'),
    ('bases_low_coverage_percent', 'pc_bases_low_coverage'),
    ('bases_excessive_coverage_percent', 'pc_bases_excessive_coverage'),
    ('bases_poor_mapping_quality_percent', 'pc_bases_poor_mapping_quality'),
    ('bases_ref_n_percent', 'pc_bases_ref_n'),
]


def _ratio(numerator, denominator):
    """Return numerator / denominator rounded to 4 decimal places.

    Returns 0.0 when either value is the VR-PIPE placeholder 'unknown' or when
    the denominator is zero, so a single bad row cannot abort the whole build.
    """
    if numerator == 'unknown' or denominator == 'unknown' or denominator == 0:
        return 0.0
    return round(numerator / denominator, 4)


def _percent_to_fraction(column):
    """Return a row function converting percentage `column` to a 0-1 fraction ('unknown' -> 0.0)."""
    def compute(rec):
        return _ratio(rec[column], 100)
    return compute


def _ratio_of(numerator_column, denominator_column):
    """Return a row function computing numerator_column / denominator_column via _ratio."""
    def compute(rec):
        return _ratio(rec[numerator_column], rec[denominator_column])
    return compute


def _fraction_of_genome(column, total_bases):
    """Return a row function computing `column` / total_bases via _ratio.

    total_bases is bound when this factory is called, not when the row function
    runs, so each release's table keeps its own genome length.
    """
    def compute(rec):
        return _ratio(rec[column], total_bases)
    return compute


def _pc_pairs_on_different_chromosomes(rec):
    """Fraction of read pairs whose two mates map to different chromosomes.

    reads_properly_paired counts reads, so it is halved to get a number of
    pairs before being added to the denominator.
    """
    different_pairs = rec['pairs_on_different_chromosomes']
    properly_paired_reads = rec['reads_properly_paired']
    if different_pairs == 'unknown' or properly_paired_reads == 'unknown':
        return 0.0
    return _ratio(different_pairs, different_pairs + properly_paired_reads / 2)


def _build_sample_metadata(release):
    """Join read, source/accession and genotype summaries for one release.

    Returns a petl table sorted by sample. Only samples present in the genotype
    summary are kept, because the last join is an inner join.
    """
    tbl = tbl_vrpipe[release].leftjoin(tbl_solaris[release], key=['sample', 'study'])
    for percent_column, fraction_column in percent_to_fraction_columns:
        tbl = (
            tbl
            .addfield(fraction_column, _percent_to_fraction(percent_column))
            .cutout(percent_column)
        )
    total_bases = genome_length[release]
    return (
        tbl
        .addfield('pc_genome_covered_at_1x', _fraction_of_genome('bases_of_1X_coverage', total_bases))
        .addfield('pc_genome_covered_at_5x', _fraction_of_genome('bases_of_5X_coverage', total_bases))
        .addfield('pc_reads_mapped', _ratio_of('reads_mapped', 'reads'))
        .addfield('pc_mapped_reads_properly_paired', _ratio_of('reads_properly_paired', 'reads_mapped'))
        .addfield('pc_pairs_on_different_chromosomes', _pc_pairs_on_different_chromosomes)
        .addfield('pc_non_primary_alignments', _ratio_of('non_primary_alignments', 'reads_mapped'))
        .addfield('pc_reads_qc_failed', _ratio_of('reads_qc_failed', 'reads'))
        # These were wrongly accessioned and are currently being removed from ENA
        .convert('run_accessions', lambda x: 'NULL', where=lambda rec: rec['study'] == '1156-PV-ID-PRICE')
        # Must come after the derived fields, which test for the raw 'unknown' marker.
        .replaceall('unknown', 0)
        .cutout('path')
        .join(tbl_genotype_summary[release], lkey='sample', rkey='sample_id')
        .sort('sample')
    )


tbl_sample_metadata = collections.OrderedDict()
for release in releases:
    tbl_sample_metadata[release] = _build_sample_metadata(release)
    # Leading columns first, then everything else in its existing order.
    header = list(tbl_sample_metadata[release].header())
    column_order = initial_columns + [column for column in header if column not in initial_columns]
    tbl_sample_metadata[release] = tbl_sample_metadata[release].cut(column_order)
    # Report the row count, preview the table, then write the release file.
    print(len(tbl_sample_metadata[release].data()))
    tbl_sample_metadata[release].display(index_header=True)
    tbl_sample_metadata[release].totsv(sample_metadata_fn[release], lineterminator='\n')


7182


0|sample,1|study,2|source_code,3|run_accessions,4|center_name,5|pc_bases_callable,6|pc_bases_no_coverage,7|pc_bases_low_coverage,8|pc_bases_excessive_coverage,9|pc_bases_poor_mapping_quality,10|pc_bases_ref_n,11|bases_of_1X_coverage,12|bases_of_2X_coverage,13|bases_of_5X_coverage,14|pc_genome_covered_at_1x,15|pc_genome_covered_at_5x,16|mean_coverage,17|mean_insert_size,18|sd_insert_size,19|avg_read_length,20|reads,21|reads_mapped,22|reads_mapped_and_paired,23|reads_properly_paired,24|reads_qc_failed,25|pairs_on_different_chromosomes,26|non_primary_alignments,27|pc_reads_mapped,28|pc_mapped_reads_properly_paired,29|pc_pairs_on_different_chromosomes,30|pc_non_primary_alignments,31|pc_reads_qc_failed,32|num_variants,33|num_pass_variants,34|num_missing,35|num_pass_missing,36|num_called,37|num_pass_called,38|num_hom_ref,39|num_het,40|num_pass_het,41|num_hom_alt,42|num_pass_hom_alt,43|num_pass_non_ref,44|num_biallelic_het,45|num_biallelic_hom_alt,46|num_spanning_del_het,47|num_spanning_del_hom_alt,48|num_multiallelic_het,49|num_multiallelic_hom_alt,50|num_snp_het,51|num_snp_hom_alt,52|num_snp,53|num_indel_het,54|num_indel_hom_alt,55|num_indel,56|num_ins_het,57|num_ins_hom_alt,58|num_ins,59|num_del_het,60|num_del_hom_alt,61|num_del,62|num_coding_het,63|num_coding_hom_alt,64|num_coding,65|num_hq_snp_called,66|num_hq_snp_hom_ref,67|num_hq_snp_het,68|num_hq_snp_hom_alt,69|num_vhq_snp_called,70|num_vhq_snp_hom_ref,71|num_vhq_snp_het,72|num_vhq_snp_hom_alt,73|num_singleton,74|num_biallelic_singleton,75|num_hq_snp_singleton,76|num_vhq_snp_singleton,77|num_bi_nonsynonymous,78|num_bi_synonymous,79|num_frameshift,80|num_inframe,81|num_bi_frameshift,82|num_bi_inframe,83|num_hq_frameshift,84|num_hq_inframe,85|num_bi_frameshift_snpeff,86|num_bi_inframe_snpeff,87|num_bi_transition,88|num_bi_transversion,89|num_bi_AT_to_AT,90|num_bi_CG_to_CG,91|num_bi_AT_to_CG,92|num_bi_CG_to_AT,93|num_phased,94|num_phased_non_ref,95|num_phased_hom_ref,96|num_phased_missing,97|num_GQ_30,98|num_het_GQ_30
,99|num_hom_alt_GQ_30,100|num_GQ_99,101|num_het_GQ_99,102|num_hom_alt_GQ_99,103|pc_pass,104|pc_missing,105|pc_pass_missing,106|pc_het,107|pc_pass_het,108|pc_hq_snp_het,109|pc_vhq_snp_het,110|pc_hom_alt,111|pc_pass_hom_alt,112|pc_snp,113|pc_biallelic,114|pc_spanning_del,115|pc_mutliallelic,116|pc_ins,117|pc_coding,118|pc_bi_nonsynonymous,119|pc_frameshift,120|pc_bi_frameshift,121|pc_hq_frameshift,122|pc_bi_frameshift_snpeff,123|pc_bi_transition,124|pc_bi_AT_to_AT,125|pc_bi_CG_to_CG,126|pc_bi_AT_to_CG,127|pc_bi_CG_to_AT,128|pc_phased,129|pc_phased_hom_ref,130|pc_phased_missing,131|pc_GQ_30,132|pc_het_GQ_30,133|pc_hom_alt_GQ_30,134|pc_GQ_99,135|pc_het_GQ_99,136|pc_hom_alt_GQ_99,137|mean_GQ,138|mean_GQ_hom_ref,139|mean_GQ_het,140|mean_GQ_hom_alt,141|mean_DP,142|mean_DP_hom_ref,143|mean_DP_het,144|mean_DP_hom_alt,145|mean_indel_len,146|total_indel_len
FP0008-C,1147-PF-MR-CONWAY,KB02.14,ERR1081237,SC,0.8216,0.0265,0.0942,0.0,0.0577,0.0,22897930,22494301,21155419,0.9814,0.9067,34.09,232.6,81.3,99,8893235,7876872,7836096,7377894,0,208885,107847,0.8857,0.9367,0.0536,0.0137,0.0,6051696,3114760,705552,214960,5346144,2899800,5173409,75703,30237,97032,65157,95394,3556,5476,10087,30724,16594,28957,9188,19951,29139,21049,21049,42098,4105,4728,8833,13145,31726,44871,13744,17108,30852,1042923,1037626,2260,3037,83095,79328,1295,2472,396,251,142,11,3510,1754,6589,7780,39,62,0,5,32,62,4656,3837,1752,472,3172,3097,20733,17693,2953,87,2098968,21382,28661,1195439,11750,8487,0.5424096320637828,0.1165874822529089,0.069013342922087,0.0141602994606954,0.0104272708462652,0.0021669864409932,0.0155845718755641,0.0181499039307583,0.0224694806538381,0.3054594628592993,0.0946810071912279,0.4278151665723211,0.4775038262364509,0.1644756442723074,0.3234165670796905,0.6667933130699089,0.4585566149349294,0.3861386138613861,0.0,0.3404255319148936,0.548216178028965,0.2062875309078064,0.0555751795596373,0.3734840456846815,0.3646532438478747,0.1854728808939765,0.1424299426035788,0.0041962089422659,0.723831988412994,0.7071468730363462,0.4398759918351059,0.4122487757776398,0.3885967523233125,0.1302546157742069,58.409232172,63.3459959079,60.7741508748,37.5281397241,23.9188714379,25.8241581283,23.5613652148,15.0279325322,-0.188319164237,-586569
FP0009-C,1147-PF-MR-CONWAY,KB03.14,ERR1081238,SC,0.8885,0.0127,0.027,0.0,0.0718,0.0,23176020,23074498,22753562,0.9933,0.9752,115.68,249.4,85.3,99,28121601,27340516,27190475,25462624,0,781402,401309,0.9722,0.9313,0.0578,0.0147,0.0,6051696,3114760,459029,84770,5592667,3029990,5372312,80727,19396,139628,101341,120737,149,7689,5046,48657,14201,44995,2466,30860,33326,16930,16930,33860,4114,7946,12060,10662,49313,59975,3104,26099,29203,1040808,1036352,34,4422,83125,79870,2,3253,479,272,168,15,2956,1477,6416,7453,31,47,0,5,26,46,3996,3311,1507,392,2758,2650,22678,19872,2695,111,2608053,10493,76113,2079946,3119,39816,0.541779083217363,0.075851298545069,0.0272155800125852,0.0144344370941448,0.006401341258552,3.266692800209068e-05,2.406015037593985e-05,0.024966263859443,0.0334459849702474,0.2760214350199193,0.0649179621822639,0.4447932282564582,0.4902888095612778,0.1674186159505795,0.2418728310294276,0.6668170539138281,0.4626144639123224,0.3974358974358974,0.0,0.3611111111111111,0.5468728616395238,0.2062405912139044,0.0536471876283016,0.3774462843848364,0.3626659367729574,0.1645891483141042,0.1188376400035276,0.0048946115177705,0.8607464051036472,0.5409878325427924,0.7510583080885328,0.6864530906042594,0.1608063518251185,0.392891327300895,77.6572618757,80.6596696815,42.3997215921,63.1724770823,61.4682839127,63.5573406644,34.6579707156,48.8584975479,-0.253752134996,-790377
FP0015-C,1147-PF-MR-CONWAY,KB18.14,ERR1081239,SC,0.8845,0.0124,0.0313,0.0,0.0718,0.0,23169945,23053308,22651985,0.993,0.9708,120.9,238.1,79.0,99,30594162,28481371,28335665,26687134,0,751964,370588,0.9309,0.937,0.0533,0.013,0.0,6051696,3114760,462196,115386,5589500,2999374,5363122,103364,30437,123014,86367,116804,2035,6429,9664,42100,18738,37838,6959,26376,33335,23478,23478,46956,4940,6319,11259,14853,42329,57182,9091,21174,30265,1041257,1036405,1293,3559,83103,79691,706,2706,269,133,71,6,3204,1615,6541,7661,26,49,0,3,23,49,4379,3556,1613,442,2986,2894,21218,18587,2511,120,2559401,20484,60687,2064597,10730,30665,0.5366086412022543,0.0763746229156256,0.0370449087570149,0.018492530637803,0.0101477841709636,0.0012417683626616,0.0084954815108961,0.0220080508095536,0.0287950085584525,0.2853926235402897,0.07246327180576,0.4431697544604637,0.4843669737337762,0.1645066553673967,0.259109277079552,0.6648682299232206,0.4605689339529644,0.3466666666666667,0.0,0.3194444444444444,0.5518588531821046,0.203276622558286,0.0557025834908632,0.3763074984247007,0.3647132955261499,0.1591298243210848,0.1183429163917428,0.0056555754548025,0.8533117243798206,0.6729966816703354,0.7026642120254264,0.6883426341629953,0.3525314584223149,0.3550545926106035,76.5758434037,80.3652455968,57.4483687617,59.1472668959,61.9658323595,64.6949715011,52.2513059763,47.0408257784,-0.24147093195,-752124
FP0016-C,1147-PF-MR-CONWAY,KB19.14,ERR1081240,SC,0.5983,0.0783,0.2944,0.0,0.0289,0.0,21781881,20171385,15696745,0.9335,0.6727,9.33,242.6,82.4,99,10818502,2054666,2038902,1907925,0,59725,29154,0.1899,0.9286,0.0589,0.0142,0.0,6051696,3114760,1243151,485466,4808545,2629294,4713949,31941,10546,62655,41150,51696,1721,5250,3459,17346,5366,18554,3993,13943,17936,6553,6553,13106,1229,3055,4284,3804,18596,22400,7151,16465,23616,1040585,1036041,1242,3302,83035,79698,686,2651,107,61,48,4,3031,1487,4588,5851,39,51,0,5,37,51,3691,2961,1293,385,2508,2466,10342,9027,1244,71,891098,6508,6743,2381,2006,3,0.5467961722308932,0.2054219180870949,0.1558598415287213,0.0066425498773537,0.0040109626386398,0.0011935593920727,0.0082615764436683,0.0130299290117904,0.0156505890934981,0.3469514082327453,0.1348460229031259,0.4024489322191272,0.4627050448777468,0.1605456453305351,0.4568245125348189,0.6708720672864099,0.4395056997796724,0.4333333333333333,0.0,0.4204545454545454,0.5548707155742634,0.194377630787733,0.0578773301262778,0.3770294648226097,0.3707155742633794,0.174616991643454,0.1202862115644942,0.0068652098240185,0.3389115100859775,0.6171060117580125,0.1638639125151883,0.000905566285094,0.190214299260383,7.290400972053463e-05,18.6956895555,22.1302693438,47.4102977432,16.75963548,7.06715188329,8.3291009692,8.92869334345,5.76488456865,-0.0960273022641,-299102
FP0017-C,1147-PF-MR-CONWAY,KB20.14,ERR1081241,SC,0.8902,0.0122,0.0246,0.0,0.073,0.0,23174333,23085110,22780496,0.9932,0.9763,131.45,233.1,80.3,99,32198131,31259561,31077624,28875004,0,996203,499895,0.9709,0.9237,0.0645,0.016,0.0,6051696,3114760,438862,94691,5612834,3020069,5370269,120076,42724,122489,84615,127339,3067,6019,14799,42100,24858,36496,10334,26067,36401,32390,32390,64780,6347,6082,12429,20943,41326,62269,13184,19490,32674,1041206,1036003,1901,3302,83111,79462,1060,2589,531,285,164,20,3487,1677,6956,8319,29,66,0,5,25,66,4651,3812,1763,483,3156,3061,23723,20767,2835,121,2620060,31878,60120,2150796,20356,31154,0.5380649062487862,0.0725188443041421,0.0304007371354454,0.0213931144231238,0.0141466966483216,0.001825767427387,0.0127540277460263,0.0218230220241681,0.0280175717839559,0.2858590062745899,0.0713528455539936,0.446830900195541,0.4818162542504653,0.1663899970548073,0.2565906752840842,0.6752517428350117,0.4553846153846154,0.3052631578947368,0.0,0.2747252747252747,0.5495687108590335,0.2083185631572728,0.0570719602977667,0.3729174051754697,0.3616920713694907,0.1630843653554684,0.1195042785482443,0.0051005353454453,0.8675497149237319,0.7461380020597322,0.7105123205105478,0.712167834575965,0.476453515588428,0.3681853099332269,78.7031357151,82.0185648159,65.7366585526,59.9808899131,67.1338876832,69.6350966042,61.5276425428,48.8776930804,-0.262139298052,-816501


1001


0|sample,1|study,2|source_code,3|run_accessions,4|center_name,5|pc_bases_callable,6|pc_bases_no_coverage,7|pc_bases_low_coverage,8|pc_bases_excessive_coverage,9|pc_bases_poor_mapping_quality,10|pc_bases_ref_n,11|bases_of_1X_coverage,12|bases_of_2X_coverage,13|bases_of_5X_coverage,14|pc_genome_covered_at_1x,15|pc_genome_covered_at_5x,16|mean_coverage,17|mean_insert_size,18|sd_insert_size,19|avg_read_length,20|reads,21|reads_mapped,22|reads_mapped_and_paired,23|reads_properly_paired,24|reads_qc_failed,25|pairs_on_different_chromosomes,26|non_primary_alignments,27|pc_reads_mapped,28|pc_mapped_reads_properly_paired,29|pc_pairs_on_different_chromosomes,30|pc_non_primary_alignments,31|pc_reads_qc_failed,32|num_variants,33|num_pass_variants,34|num_missing,35|num_pass_missing,36|num_called,37|num_pass_called,38|num_hom_ref,39|num_het,40|num_pass_het,41|num_hom_alt,42|num_pass_hom_alt,43|num_pass_non_ref,44|num_biallelic_het,45|num_biallelic_hom_alt,46|num_spanning_del_het,47|num_spanning_del_hom_alt,48|num_multiallelic_het,49|num_multiallelic_hom_alt,50|num_snp_het,51|num_snp_hom_alt,52|num_snp,53|num_indel_het,54|num_indel_hom_alt,55|num_indel,56|num_ins_het,57|num_ins_hom_alt,58|num_ins,59|num_del_het,60|num_del_hom_alt,61|num_del,62|num_coding_het,63|num_coding_hom_alt,64|num_coding,65|num_hq_snp_called,66|num_hq_snp_hom_ref,67|num_hq_snp_het,68|num_hq_snp_hom_alt,69|num_vhq_snp_called,70|num_vhq_snp_hom_ref,71|num_vhq_snp_het,72|num_vhq_snp_hom_alt,73|num_singleton,74|num_biallelic_singleton,75|num_hq_snp_singleton,76|num_vhq_snp_singleton,77|num_bi_nonsynonymous,78|num_bi_synonymous,79|num_frameshift,80|num_inframe,81|num_bi_frameshift,82|num_bi_inframe,83|num_hq_frameshift,84|num_hq_inframe,85|num_bi_frameshift_snpeff,86|num_bi_inframe_snpeff,87|num_bi_transition,88|num_bi_transversion,89|num_bi_AT_to_AT,90|num_bi_CG_to_CG,91|num_bi_AT_to_CG,92|num_bi_CG_to_AT,93|num_phased,94|num_phased_non_ref,95|num_phased_hom_ref,96|num_phased_missing,97|num_GQ_30,98|num_het_GQ_30
,99|num_hom_alt_GQ_30,100|num_GQ_99,101|num_het_GQ_99,102|num_hom_alt_GQ_99,103|pc_pass,104|pc_missing,105|pc_pass_missing,106|pc_het,107|pc_pass_het,108|pc_hq_snp_het,109|pc_vhq_snp_het,110|pc_hom_alt,111|pc_pass_hom_alt,112|pc_snp,113|pc_biallelic,114|pc_spanning_del,115|pc_mutliallelic,116|pc_ins,117|pc_coding,118|pc_bi_nonsynonymous,119|pc_frameshift,120|pc_bi_frameshift,121|pc_hq_frameshift,122|pc_bi_frameshift_snpeff,123|pc_bi_transition,124|pc_bi_AT_to_AT,125|pc_bi_CG_to_CG,126|pc_bi_AT_to_CG,127|pc_bi_CG_to_AT,128|pc_phased,129|pc_phased_hom_ref,130|pc_phased_missing,131|pc_GQ_30,132|pc_het_GQ_30,133|pc_hom_alt_GQ_30,134|pc_GQ_99,135|pc_het_GQ_99,136|pc_hom_alt_GQ_99,137|mean_GQ,138|mean_GQ_hom_ref,139|mean_GQ_het,140|mean_GQ_hom_alt,141|mean_DP,142|mean_DP_hom_ref,143|mean_DP_het,144|mean_DP_hom_alt,145|mean_indel_len,146|total_indel_len
Brazil01,Broad,,,BROAD,0.5016,0.2212,0.2645,0.0,0.0075,0.0052,22756624,22310663,21033334,0.7833,0.724,72.11,237.6,150.4,99,42517489,41161952,40542979,37808178,0,192797,944505,0.9681,0.9185,0.0101,0.0229,0.0,4084419,1182447,2146665,77366,1937754,1105081,1857725,43912,27350,36117,25383,52733,11378,11306,7962,6808,8010,7269,15617,14390,30007,11733,11733,23466,2083,1522,3605,6684,6769,13453,11286,11016,22302,388899,378330,5170,5399,121217,114604,3047,3566,390,273,95,8,6216,4313,2134,3852,70,189,31,102,58,187,12643,8832,1722,2441,8442,8870,13075,11449,1601,25,317025,21423,7114,57159,8338,1756,0.5702896239667161,0.5255741391860139,0.0654287253466751,0.0226612872428595,0.0247493170183905,0.0132939400718438,0.0251367382462855,0.0186385887991974,0.0229693569973603,0.569036466728614,0.4301670680598486,0.2800902660573076,0.2897426658828437,0.2113377887208348,0.4229230273263421,0.5903694557887739,0.3564984964918142,0.2702702702702703,0.2330827067669172,0.236734693877551,0.5887310826542491,0.0801862630966239,0.1136670547147846,0.3931082654249127,0.4130384167636787,0.2171126239736028,0.1224474187380497,0.0019120458891013,0.2868794233182907,0.783290676416819,0.280266319977938,0.0517238102908293,0.3048628884826325,0.0691801599495725,23.2768225553,23.9084646904,61.7368555759,26.5957924595,14.6020176803,15.3025900178,13.3342595978,23.7348619155,-0.151584806761,-179241
Brazil02,Broad,,,BROAD,0.0939,0.6732,0.2251,0.0,0.0026,0.0052,10450978,6809040,3643590,0.3597,0.1254,33.98,215.6,156.0,100,30045277,27949243,27644773,26568530,0,67542,435685,0.9302,0.9506,0.0051,0.0156,0.0,4084419,1182447,3427954,793579,656465,388868,638093,4301,1290,14071,9350,10640,302,3883,544,2772,444,2695,621,5131,5752,669,669,1338,197,622,819,331,2573,2904,340,3990,4330,139269,137342,116,1811,43258,42164,38,1056,339,248,102,25,1143,767,520,757,27,26,6,7,26,25,2346,1590,313,448,1624,1551,2426,2266,145,15,106740,978,5611,71732,616,3803,0.592366691293519,0.8392757941827222,0.6711328287864065,0.0065517582810964,0.0033173210446732,0.0008329204632761,0.0008784502288594,0.0214345014585697,0.0240441486571278,0.5406015037593985,0.3933270676691729,0.3116541353383458,0.2950187969924812,0.2199838839645447,0.4069548872180451,0.5984293193717277,0.4072043852779953,0.5094339622641509,0.4615384615384615,0.5098039215686274,0.5960365853658537,0.0795223577235772,0.1138211382113821,0.4126016260162601,0.3940548780487805,0.2129699248120301,0.0597691673536685,0.0061830173124484,0.2744890296964523,0.7581395348837209,0.6001069518716577,0.1844636226174434,0.4775193798449612,0.4067379679144385,9.04423707786,26.6729856066,67.8682170543,55.4343315508,19.1441426127,54.6580554586,143.65503876,169.742780749,-0.0346214248926,-40938
Brazil03,Broad,,,BROAD,0.0047,0.7182,0.2714,0.0,0.0006,0.0052,8755806,5063045,1835872,0.3014,0.0632,4.54,190.8,128.9,100,1617191,532720,464127,444118,0,5246,7061,0.3294,0.8337,0.0231,0.0133,0.0,4084419,1182447,3577487,862287,506932,320160,503448,398,245,3086,2561,2806,135,1405,45,527,65,629,153,1713,1866,92,92,184,14,103,117,32,512,544,128,1211,1339,117880,117087,79,714,36467,36119,39,309,17,15,4,1,500,291,94,167,3,12,1,5,3,12,850,635,115,168,600,602,452,433,13,6,1189,125,44,415,20,26,0.6315639967490709,0.8758863867786336,0.7292394500556896,0.0007851151633749,0.0007652423788105,0.0006701730573464,0.00106946005978,0.0060876014929024,0.0079991254372813,0.6650035637918745,0.5488239486813971,0.2038488952245188,0.2473271560940841,0.1770045385779122,0.477191732002851,0.6321112515802781,0.3601532567049808,0.2,0.1666666666666666,0.2,0.5723905723905723,0.0774410774410774,0.1131313131313131,0.404040404040404,0.4053872053872054,0.154312188168211,0.0287610619469026,0.0132743362831858,0.003713768115942,0.5102040816326531,0.0171807887543928,0.0012962268865567,0.0816326530612244,0.0101522842639593,1.30026631215,4.74568462978,36.7591836735,8.75751659508,0.542229799729,1.88609565343,7.43673469388,4.47325263569,-0.00582436252957,-6887
Brazil08,Broad,,,BROAD,0.0586,0.7161,0.2188,0.0,0.0013,0.0052,8856798,6086966,3444515,0.3049,0.1186,23.74,274.2,204.6,98,14317402,13507571,13379742,12080973,0,28256,485896,0.9434,0.8944,0.0047,0.036,0.0,4084419,1182447,3510847,845737,573572,336710,561171,2912,871,9489,5939,6810,261,2907,319,1458,291,1574,436,3585,4021,435,435,870,133,351,484,241,1407,1648,223,2322,2545,116513,115203,82,1228,36456,35748,23,685,362,285,112,21,816,474,233,397,37,21,5,7,35,20,1723,1230,288,301,1181,1183,1570,1415,146,9,52481,576,2221,27662,292,1108,0.5870405110430774,0.8595707247468979,0.7152430510627538,0.0050769563367807,0.0025867957589617,0.0007037841270931,0.0006308975202984,0.0165436946015495,0.0176383237801075,0.5904552129221733,0.4651982378854625,0.2609397944199706,0.2738619676945668,0.2270168855534709,0.3737151248164464,0.6325581395348837,0.3698412698412698,0.6379310344827587,0.4166666666666667,0.6363636363636364,0.5834744327802235,0.0975279376904842,0.1019302404334575,0.3999322722654927,0.4006095496105655,0.2077826725403818,0.0929936305732484,0.0057324840764331,0.1558640967004247,0.661308840413318,0.3739686815962283,0.0821537821864512,0.3352468427095292,0.1865633945108604,4.80428890259,16.4265625947,57.2916188289,35.6613908065,6.44663058894,21.635498636,63.7439724455,57.106920357,-0.0173859800904,-20558
Brazil12,Broad,,,BROAD,0.3067,0.4302,0.2527,0.0,0.0052,0.0052,17329936,14332738,10162756,0.5965,0.3498,57.67,197.7,124.3,100,44567339,42877267,42367314,40728103,0,203899,730217,0.9621,0.9499,0.0099,0.017,0.0,4084419,1182447,2687667,433969,1396752,748478,1338017,13428,3707,45307,25746,29453,792,10970,1514,7285,1401,7491,1696,14325,16021,2011,2011,4022,606,1743,2349,1016,6898,7914,1181,10881,12062,261161,255803,339,5019,81642,78394,79,3169,843,667,270,37,3111,2202,1306,2168,113,106,25,53,100,102,6486,4494,879,1250,4429,4422,7527,6930,572,25,337242,2770,16497,218161,1691,10651,0.5358703620972084,0.6580292080709643,0.3670092613030436,0.0096137324306677,0.0049527173811387,0.0012980498619625,0.0009676392053109,0.0324373976196203,0.0343978046114915,0.5439513801650087,0.3993481139442502,0.2987471564866057,0.301904729569144,0.2288804443145279,0.4095338335653414,0.585544889892716,0.3759355210132412,0.5159817351598174,0.3205128205128205,0.495049504950495,0.5907103825136611,0.0800546448087431,0.1138433515482695,0.4033697632058288,0.4027322404371585,0.2352901232472074,0.0759930915371329,0.0033213763783711,0.4505703574453758,0.7472349608848126,0.6407597296667443,0.2914728288607013,0.4561640140275155,0.4136953313136021,26.6831003842,41.4643718925,65.4340437011,58.062145576,55.5787616697,84.128884253,140.448340977,163.582653616,-0.0958114824597,-113292


# Create files for individual studies

In [22]:
# For every release, work out which studies are present in the sample
# metadata and write one tab-separated metadata file per study.
studies = collections.OrderedDict()
for release in releases:
    release_tbl = tbl_sample_metadata[release]
    # Distinct values of the 'study' column for this release
    studies[release] = release_tbl.distinct('study').values('study').array()
    for study in studies[release]:
        print(release, study)
        # Output path: <release dir>/<study>/<Release>__<study>__sample_metadata.txt
        study_sample_metadata_fn = "%s/%s/%s__%s__sample_metadata.txt" % (
            nfs_release_dir[release], study, release.title(), study
        )
        # Keep only this study's rows and write them with Unix line endings
        study_tbl = release_tbl.selecteq('study', study)
        study_tbl.totsv(study_sample_metadata_fn, lineterminator='\n')


pf_60 1001-PF-ML-DJIMDE
pf_60 1004-PF-BF-OUEDRAOGO
pf_60 1006-PF-GM-CONWAY
pf_60 1007-PF-TZ-DUFFY
pf_60 1008-PF-SEA-RINGWALD
pf_60 1009-PF-KH-PLOWE
pf_60 1010-PF-TH-ANDERSON
pf_60 1011-PF-KH-SU
pf_60 1012-PF-KH-WHITE
pf_60 1013-PF-PEGB-BRANCH
pf_60 1014-PF-SSA-SUTHERLAND
pf_60 1015-PF-KE-NZILA
pf_60 1016-PF-TH-NOSTEN
pf_60 1017-PF-GH-AMENGA-ETEGO
pf_60 1018-PF-GB-NEWBOLD
pf_60 1020-PF-VN-BONI
pf_60 1021-PF-PG-MUELLER
pf_60 1022-PF-MW-OCHOLLA
pf_60 1023-PF-CO-ECHEVERRI-GARCIA
pf_60 1024-PF-UG-BOUSEMA
pf_60 1025-PF-KH-PLOWE
pf_60 1026-PF-GN-CONWAY
pf_60 1027-PF-KE-BULL
pf_60 1031-PF-SEA-PLOWE
pf_60 1044-PF-KH-FAIRHURST
pf_60 1052-PF-TRAC-WHITE
pf_60 1062-PF-PG-BARRY
pf_60 1083-PF-GH-CONWAY
pf_60 1093-PF-CM-APINJOH
pf_60 1094-PF-GH-AMENGA-ETEGO
pf_60 1095-PF-TZ-ISHENGOMA
pf_60 1096-PF-GH-GHANSAH
pf_60 1097-PF-ML-MAIGA
pf_60 1098-PF-ET-GOLASSA
pf_60 1100-PF-CI-YAVO
pf_60 1101-PF-CD-ONYAMBOKO
pf_60 1102-PF-MG-RANDRIANARIVELOJOSIA
pf_60 1103-PF-PDN-GMSN-NGWA
pf_60 1107-PF-KEN-KAMAU
pf_60 112

# Some sanity checks

In [23]:
# Sanity check: select pf_60 samples whose bases_of_5X_coverage holds the
# string 'unknown'. When last run this returned no rows (header only).
tbl_sample_metadata['pf_60'].selecteq('bases_of_5X_coverage', 'unknown')

sample,study,source_code,run_accessions,center_name,pc_bases_callable,pc_bases_no_coverage,pc_bases_low_coverage,pc_bases_excessive_coverage,pc_bases_poor_mapping_quality,pc_bases_ref_n,bases_of_1X_coverage,bases_of_2X_coverage,bases_of_5X_coverage,pc_genome_covered_at_1x,pc_genome_covered_at_5x,mean_coverage,mean_insert_size,sd_insert_size,avg_read_length,reads,reads_mapped,reads_mapped_and_paired,reads_properly_paired,reads_qc_failed,pairs_on_different_chromosomes,non_primary_alignments,pc_reads_mapped,pc_mapped_reads_properly_paired,pc_pairs_on_different_chromosomes,pc_non_primary_alignments,pc_reads_qc_failed,num_variants,num_pass_variants,num_missing,num_pass_missing,num_called,num_pass_called,num_hom_ref,num_het,num_pass_het,num_hom_alt,num_pass_hom_alt,num_pass_non_ref,num_biallelic_het,num_biallelic_hom_alt,num_spanning_del_het,num_spanning_del_hom_alt,num_multiallelic_het,num_multiallelic_hom_alt,num_snp_het,num_snp_hom_alt,num_snp,num_indel_het,num_indel_hom_alt,num_indel,num_ins_het,num_ins_hom_alt,num_ins,num_del_het,num_del_hom_alt,num_del,num_coding_het,num_coding_hom_alt,num_coding,num_hq_snp_called,num_hq_snp_hom_ref,num_hq_snp_het,num_hq_snp_hom_alt,num_vhq_snp_called,num_vhq_snp_hom_ref,num_vhq_snp_het,num_vhq_snp_hom_alt,num_singleton,num_biallelic_singleton,num_hq_snp_singleton,num_vhq_snp_singleton,num_bi_nonsynonymous,num_bi_synonymous,num_frameshift,num_inframe,num_bi_frameshift,num_bi_inframe,num_hq_frameshift,num_hq_inframe,num_bi_frameshift_snpeff,num_bi_inframe_snpeff,num_bi_transition,num_bi_transversion,num_bi_AT_to_AT,num_bi_CG_to_CG,num_bi_AT_to_CG,num_bi_CG_to_AT,num_phased,num_phased_non_ref,num_phased_hom_ref,num_phased_missing,num_GQ_30,num_het_GQ_30,num_hom_alt_GQ_30,num_GQ_99,num_het_GQ_99,num_hom_alt_GQ_99,pc_pass,pc_missing,pc_pass_missing,pc_het,pc_pass_het,pc_hq_snp_het,pc_vhq_snp_het,pc_hom_alt,pc_pass_hom_alt,pc_snp,pc_biallelic,pc_spanning_del,pc_mutliallelic,pc_ins,pc_coding,pc_bi_nonsynonymous,pc_frameshift,pc_bi_framesh
ift,pc_hq_frameshift,pc_bi_frameshift_snpeff,pc_bi_transition,pc_bi_AT_to_AT,pc_bi_CG_to_CG,pc_bi_AT_to_CG,pc_bi_CG_to_AT,pc_phased,pc_phased_hom_ref,pc_phased_missing,pc_GQ_30,pc_het_GQ_30,pc_hom_alt_GQ_30,pc_GQ_99,pc_het_GQ_99,pc_hom_alt_GQ_99,mean_GQ,mean_GQ_hom_ref,mean_GQ_het,mean_GQ_hom_alt,mean_DP,mean_DP_hom_ref,mean_DP_het,mean_DP_hom_alt,mean_indel_len,total_indel_len


In [24]:
# Sanity check: select pf_60 samples whose bases_of_5X_coverage equals 0.
# The rows displayed when last run all had fewer than 2,000 reads.
tbl_sample_metadata['pf_60'].selecteq('bases_of_5X_coverage', 0)

sample,study,source_code,run_accessions,center_name,pc_bases_callable,pc_bases_no_coverage,pc_bases_low_coverage,pc_bases_excessive_coverage,pc_bases_poor_mapping_quality,pc_bases_ref_n,bases_of_1X_coverage,bases_of_2X_coverage,bases_of_5X_coverage,pc_genome_covered_at_1x,pc_genome_covered_at_5x,mean_coverage,mean_insert_size,sd_insert_size,avg_read_length,reads,reads_mapped,reads_mapped_and_paired,reads_properly_paired,reads_qc_failed,pairs_on_different_chromosomes,non_primary_alignments,pc_reads_mapped,pc_mapped_reads_properly_paired,pc_pairs_on_different_chromosomes,pc_non_primary_alignments,pc_reads_qc_failed,num_variants,num_pass_variants,num_missing,num_pass_missing,num_called,num_pass_called,num_hom_ref,num_het,num_pass_het,num_hom_alt,num_pass_hom_alt,num_pass_non_ref,num_biallelic_het,num_biallelic_hom_alt,num_spanning_del_het,num_spanning_del_hom_alt,num_multiallelic_het,num_multiallelic_hom_alt,num_snp_het,num_snp_hom_alt,num_snp,num_indel_het,num_indel_hom_alt,num_indel,num_ins_het,num_ins_hom_alt,num_ins,num_del_het,num_del_hom_alt,num_del,num_coding_het,num_coding_hom_alt,num_coding,num_hq_snp_called,num_hq_snp_hom_ref,num_hq_snp_het,num_hq_snp_hom_alt,num_vhq_snp_called,num_vhq_snp_hom_ref,num_vhq_snp_het,num_vhq_snp_hom_alt,num_singleton,num_biallelic_singleton,num_hq_snp_singleton,num_vhq_snp_singleton,num_bi_nonsynonymous,num_bi_synonymous,num_frameshift,num_inframe,num_bi_frameshift,num_bi_inframe,num_hq_frameshift,num_hq_inframe,num_bi_frameshift_snpeff,num_bi_inframe_snpeff,num_bi_transition,num_bi_transversion,num_bi_AT_to_AT,num_bi_CG_to_CG,num_bi_AT_to_CG,num_bi_CG_to_AT,num_phased,num_phased_non_ref,num_phased_hom_ref,num_phased_missing,num_GQ_30,num_het_GQ_30,num_hom_alt_GQ_30,num_GQ_99,num_het_GQ_99,num_hom_alt_GQ_99,pc_pass,pc_missing,pc_pass_missing,pc_het,pc_pass_het,pc_hq_snp_het,pc_vhq_snp_het,pc_hom_alt,pc_pass_hom_alt,pc_snp,pc_biallelic,pc_spanning_del,pc_mutliallelic,pc_ins,pc_coding,pc_bi_nonsynonymous,pc_frameshift,pc_bi_framesh
ift,pc_hq_frameshift,pc_bi_frameshift_snpeff,pc_bi_transition,pc_bi_AT_to_AT,pc_bi_CG_to_CG,pc_bi_AT_to_CG,pc_bi_CG_to_AT,pc_phased,pc_phased_hom_ref,pc_phased_missing,pc_GQ_30,pc_het_GQ_30,pc_hom_alt_GQ_30,pc_GQ_99,pc_het_GQ_99,pc_hom_alt_GQ_99,mean_GQ,mean_GQ_hom_ref,mean_GQ_het,mean_GQ_hom_alt,mean_DP,mean_DP_hom_ref,mean_DP_het,mean_DP_hom_alt,mean_indel_len,total_indel_len
PA0174-C,1026-PF-GN-CONWAY,ZRY030,ERR059394,SC,0.0,0.9945,0.0055,0.0,0.0,0.0,136516,5415,0,0.0059,0.0,1.04,240.0,69.0,99,1734,1435,1383,1294,0,42,28,0.8276,0.9017,0.061,0.0195,0.0,6051696,3114760,6028384,3101755,23312,13005,23312,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,6235,6235,0,0,468,468,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.5578671928620453,0.9961478567330546,0.9958247184373756,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0128504282834,3.07773933103,,,0.00446936521594,1.02906574394,,,0.0,0
PA0194-C,1026-PF-GN-CONWAY,ZRB230,ERR059395,SC,0.0,0.9945,0.0055,0.0,0.0,0.0,133832,5940,0,0.0057,0.0,1.04,326.9,801.2,99,1543,1410,1392,1294,0,40,21,0.9138,0.9177,0.0582,0.0149,0.0,6051696,3114760,6029859,3101790,21837,12970,21835,0,0,2,2,2,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,6501,6501,0,0,506,506,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.5939460548610157,0.9963915900600426,0.99583595525819,0.0,0.0,0.0,0.0,9.158767229930853e-05,0.000154202004626,0.0,0.0,0.5,0.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.012858133532,3.08744602097,,6.0,0.00444721262633,1.03300431832,,2.0,-6.4210404654e-07,-2
PA0241-C,1026-PF-GN-CONWAY,ZRK285,ERR055502,SC,0.0,0.9989,0.0011,0.0,0.0,0.0,25607,2198,0,0.0011,0.0,1.09,275.0,107.4,99,325,280,279,256,0,8,3,0.8615,0.9143,0.0588,0.0107,0.0,6051696,3114760,6047475,3112132,4221,2628,4221,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1268,1268,0,0,89,89,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.6226012793176973,0.9993025095774803,0.9991562752828468,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00256295830176,3.03767123288,,,0.000897019353016,1.01636225266,,,0.0,0
PA0244-C,1026-PF-GN-CONWAY,ZRS049,ERR055485,SC,0.0,0.998,0.002,0.0,0.0,0.0,48050,3959,0,0.0021,0.0,1.09,257.3,73.8,99,705,527,513,478,0,18,9,0.7475,0.907,0.07,0.0171,0.0,6051696,3114760,6043738,3109987,7958,4773,7958,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2127,2127,0,0,148,148,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.5997738125157075,0.9986849967347996,0.998467618692933,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00466552800216,3.04462602137,,,0.00159883907588,1.01697045883,,,0.0,0
PA0322-C,1026-PF-GN-CONWAY,GNV058,ERR403210,SC,0.0,0.9989,0.0011,0.0,0.0,0.0,30063,1246,0,0.0013,0.0,1.04,279.7,397.0,99,1239,319,303,269,0,15,9,0.2575,0.8433,0.1003,0.0282,0.0,6051696,3114760,6047957,3112640,3739,2120,3739,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1434,1434,0,0,147,147,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.566996523134528,0.999382156671452,0.999319369710668,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00206211714546,3.02971698113,,,0.000702140774891,1.00990566038,,,0.0,0


In [25]:
# Count, per study, the pv_30 samples whose source_code is None.
# When last run only two studies appeared: Broad (190) and chapelhill (78).
tbl_sample_metadata['pv_30'].selectnone('source_code').valuecounts('study')

study,count,frequency
Broad,190,0.7089552238805971
chapelhill,78,0.291044776119403


In [26]:
# Cross-check of the pv_30 sample total. 190 (Broad) and 78 (chapelhill) are
# the samples without a source_code counted in the previous cell; 733 is
# presumably the number of samples that do have a source_code (TODO confirm).
# The sum is expected to match the total number of pv_30 samples.
733+190+78

1001

In [27]:
# Select the 1156-PV-ID-PRICE rows of the Solaris table whose run_accessions
# value is not the string 'NULL'.
# I manually checked the 4 run accessions this returns: none of them is
# available on the ENA, so presumably they have been removed from the ENA but
# Solaris didn't get updated.
tbl_solaris['pv_30'].selecteq('study', '1156-PV-ID-PRICE').selectne('run_accessions', 'NULL')

sample,source_code,study,run_accessions
PJ0024-C,UVM_14_RECODED,1156-PV-ID-PRICE,ERR662598
PJ0046-C,UVC_38_RECODED,1156-PV-ID-PRICE,ERR662600
PJ0093-C,UVC_44_RECODED,1156-PV-ID-PRICE,ERR662599
PJ0128-C,UVC_112_RECODED,1156-PV-ID-PRICE,ERR662597


In [28]:
# Apply the same filter (study 1156-PV-ID-PRICE, run_accessions != 'NULL') to
# the merged pv_30 sample metadata. When last run this returned no rows
# (header only), unlike the Solaris table queried in the previous cell.
tbl_sample_metadata['pv_30'].selecteq('study', '1156-PV-ID-PRICE').selectne('run_accessions', 'NULL')

sample,study,source_code,run_accessions,center_name,pc_bases_callable,pc_bases_no_coverage,pc_bases_low_coverage,pc_bases_excessive_coverage,pc_bases_poor_mapping_quality,pc_bases_ref_n,bases_of_1X_coverage,bases_of_2X_coverage,bases_of_5X_coverage,pc_genome_covered_at_1x,pc_genome_covered_at_5x,mean_coverage,mean_insert_size,sd_insert_size,avg_read_length,reads,reads_mapped,reads_mapped_and_paired,reads_properly_paired,reads_qc_failed,pairs_on_different_chromosomes,non_primary_alignments,pc_reads_mapped,pc_mapped_reads_properly_paired,pc_pairs_on_different_chromosomes,pc_non_primary_alignments,pc_reads_qc_failed,num_variants,num_pass_variants,num_missing,num_pass_missing,num_called,num_pass_called,num_hom_ref,num_het,num_pass_het,num_hom_alt,num_pass_hom_alt,num_pass_non_ref,num_biallelic_het,num_biallelic_hom_alt,num_spanning_del_het,num_spanning_del_hom_alt,num_multiallelic_het,num_multiallelic_hom_alt,num_snp_het,num_snp_hom_alt,num_snp,num_indel_het,num_indel_hom_alt,num_indel,num_ins_het,num_ins_hom_alt,num_ins,num_del_het,num_del_hom_alt,num_del,num_coding_het,num_coding_hom_alt,num_coding,num_hq_snp_called,num_hq_snp_hom_ref,num_hq_snp_het,num_hq_snp_hom_alt,num_vhq_snp_called,num_vhq_snp_hom_ref,num_vhq_snp_het,num_vhq_snp_hom_alt,num_singleton,num_biallelic_singleton,num_hq_snp_singleton,num_vhq_snp_singleton,num_bi_nonsynonymous,num_bi_synonymous,num_frameshift,num_inframe,num_bi_frameshift,num_bi_inframe,num_hq_frameshift,num_hq_inframe,num_bi_frameshift_snpeff,num_bi_inframe_snpeff,num_bi_transition,num_bi_transversion,num_bi_AT_to_AT,num_bi_CG_to_CG,num_bi_AT_to_CG,num_bi_CG_to_AT,num_phased,num_phased_non_ref,num_phased_hom_ref,num_phased_missing,num_GQ_30,num_het_GQ_30,num_hom_alt_GQ_30,num_GQ_99,num_het_GQ_99,num_hom_alt_GQ_99,pc_pass,pc_missing,pc_pass_missing,pc_het,pc_pass_het,pc_hq_snp_het,pc_vhq_snp_het,pc_hom_alt,pc_pass_hom_alt,pc_snp,pc_biallelic,pc_spanning_del,pc_mutliallelic,pc_ins,pc_coding,pc_bi_nonsynonymous,pc_frameshift,pc_bi_framesh
ift,pc_hq_frameshift,pc_bi_frameshift_snpeff,pc_bi_transition,pc_bi_AT_to_AT,pc_bi_CG_to_CG,pc_bi_AT_to_CG,pc_bi_CG_to_AT,pc_phased,pc_phased_hom_ref,pc_phased_missing,pc_GQ_30,pc_het_GQ_30,pc_hom_alt_GQ_30,pc_GQ_99,pc_het_GQ_99,pc_hom_alt_GQ_99,mean_GQ,mean_GQ_hom_ref,mean_GQ_het,mean_GQ_hom_alt,mean_DP,mean_DP_hom_ref,mean_DP_het,mean_DP_hom_alt,mean_indel_len,total_indel_len


In [29]:
# The following sample was in the file from Jim (Solaris) but not in the final VCF.
# antijoin keeps the rows of tbl_solaris['pv_30'] whose 'sample' value has no
# match in tbl_sample_metadata['pv_30'] (the metadata table presumably reflects
# the samples in the final VCF - confirm against the cells that build it).
tbl_solaris['pv_30'].antijoin(tbl_sample_metadata['pv_30'], key='sample')

sample,source_code,study,run_accessions
QR0002-CW,FA-02-MD,1157-PV-MULTI-PRICE,ERR779841


In [30]:
# Check that sample IDs are unique within each release: display every row
# whose 'sample' value occurs more than once (header only means no duplicates).
for release in releases:
    print(release)
    duplicated_samples = tbl_sample_metadata[release].duplicates('sample')
    duplicated_samples.displayall()

pf_60


sample,study,source_code,run_accessions,center_name,pc_bases_callable,pc_bases_no_coverage,pc_bases_low_coverage,pc_bases_excessive_coverage,pc_bases_poor_mapping_quality,pc_bases_ref_n,bases_of_1X_coverage,bases_of_2X_coverage,bases_of_5X_coverage,pc_genome_covered_at_1x,pc_genome_covered_at_5x,mean_coverage,mean_insert_size,sd_insert_size,avg_read_length,reads,reads_mapped,reads_mapped_and_paired,reads_properly_paired,reads_qc_failed,pairs_on_different_chromosomes,non_primary_alignments,pc_reads_mapped,pc_mapped_reads_properly_paired,pc_pairs_on_different_chromosomes,pc_non_primary_alignments,pc_reads_qc_failed,num_variants,num_pass_variants,num_missing,num_pass_missing,num_called,num_pass_called,num_hom_ref,num_het,num_pass_het,num_hom_alt,num_pass_hom_alt,num_pass_non_ref,num_biallelic_het,num_biallelic_hom_alt,num_spanning_del_het,num_spanning_del_hom_alt,num_multiallelic_het,num_multiallelic_hom_alt,num_snp_het,num_snp_hom_alt,num_snp,num_indel_het,num_indel_hom_alt,num_indel,num_ins_het,num_ins_hom_alt,num_ins,num_del_het,num_del_hom_alt,num_del,num_coding_het,num_coding_hom_alt,num_coding,num_hq_snp_called,num_hq_snp_hom_ref,num_hq_snp_het,num_hq_snp_hom_alt,num_vhq_snp_called,num_vhq_snp_hom_ref,num_vhq_snp_het,num_vhq_snp_hom_alt,num_singleton,num_biallelic_singleton,num_hq_snp_singleton,num_vhq_snp_singleton,num_bi_nonsynonymous,num_bi_synonymous,num_frameshift,num_inframe,num_bi_frameshift,num_bi_inframe,num_hq_frameshift,num_hq_inframe,num_bi_frameshift_snpeff,num_bi_inframe_snpeff,num_bi_transition,num_bi_transversion,num_bi_AT_to_AT,num_bi_CG_to_CG,num_bi_AT_to_CG,num_bi_CG_to_AT,num_phased,num_phased_non_ref,num_phased_hom_ref,num_phased_missing,num_GQ_30,num_het_GQ_30,num_hom_alt_GQ_30,num_GQ_99,num_het_GQ_99,num_hom_alt_GQ_99,pc_pass,pc_missing,pc_pass_missing,pc_het,pc_pass_het,pc_hq_snp_het,pc_vhq_snp_het,pc_hom_alt,pc_pass_hom_alt,pc_snp,pc_biallelic,pc_spanning_del,pc_mutliallelic,pc_ins,pc_coding,pc_bi_nonsynonymous,pc_frameshift,pc_bi_framesh
ift,pc_hq_frameshift,pc_bi_frameshift_snpeff,pc_bi_transition,pc_bi_AT_to_AT,pc_bi_CG_to_CG,pc_bi_AT_to_CG,pc_bi_CG_to_AT,pc_phased,pc_phased_hom_ref,pc_phased_missing,pc_GQ_30,pc_het_GQ_30,pc_hom_alt_GQ_30,pc_GQ_99,pc_het_GQ_99,pc_hom_alt_GQ_99,mean_GQ,mean_GQ_hom_ref,mean_GQ_het,mean_GQ_hom_alt,mean_DP,mean_DP_hom_ref,mean_DP_het,mean_DP_hom_alt,mean_indel_len,total_indel_len


pv_30


sample,study,source_code,run_accessions,center_name,pc_bases_callable,pc_bases_no_coverage,pc_bases_low_coverage,pc_bases_excessive_coverage,pc_bases_poor_mapping_quality,pc_bases_ref_n,bases_of_1X_coverage,bases_of_2X_coverage,bases_of_5X_coverage,pc_genome_covered_at_1x,pc_genome_covered_at_5x,mean_coverage,mean_insert_size,sd_insert_size,avg_read_length,reads,reads_mapped,reads_mapped_and_paired,reads_properly_paired,reads_qc_failed,pairs_on_different_chromosomes,non_primary_alignments,pc_reads_mapped,pc_mapped_reads_properly_paired,pc_pairs_on_different_chromosomes,pc_non_primary_alignments,pc_reads_qc_failed,num_variants,num_pass_variants,num_missing,num_pass_missing,num_called,num_pass_called,num_hom_ref,num_het,num_pass_het,num_hom_alt,num_pass_hom_alt,num_pass_non_ref,num_biallelic_het,num_biallelic_hom_alt,num_spanning_del_het,num_spanning_del_hom_alt,num_multiallelic_het,num_multiallelic_hom_alt,num_snp_het,num_snp_hom_alt,num_snp,num_indel_het,num_indel_hom_alt,num_indel,num_ins_het,num_ins_hom_alt,num_ins,num_del_het,num_del_hom_alt,num_del,num_coding_het,num_coding_hom_alt,num_coding,num_hq_snp_called,num_hq_snp_hom_ref,num_hq_snp_het,num_hq_snp_hom_alt,num_vhq_snp_called,num_vhq_snp_hom_ref,num_vhq_snp_het,num_vhq_snp_hom_alt,num_singleton,num_biallelic_singleton,num_hq_snp_singleton,num_vhq_snp_singleton,num_bi_nonsynonymous,num_bi_synonymous,num_frameshift,num_inframe,num_bi_frameshift,num_bi_inframe,num_hq_frameshift,num_hq_inframe,num_bi_frameshift_snpeff,num_bi_inframe_snpeff,num_bi_transition,num_bi_transversion,num_bi_AT_to_AT,num_bi_CG_to_CG,num_bi_AT_to_CG,num_bi_CG_to_AT,num_phased,num_phased_non_ref,num_phased_hom_ref,num_phased_missing,num_GQ_30,num_het_GQ_30,num_hom_alt_GQ_30,num_GQ_99,num_het_GQ_99,num_hom_alt_GQ_99,pc_pass,pc_missing,pc_pass_missing,pc_het,pc_pass_het,pc_hq_snp_het,pc_vhq_snp_het,pc_hom_alt,pc_pass_hom_alt,pc_snp,pc_biallelic,pc_spanning_del,pc_mutliallelic,pc_ins,pc_coding,pc_bi_nonsynonymous,pc_frameshift,pc_bi_framesh
ift,pc_hq_frameshift,pc_bi_frameshift_snpeff,pc_bi_transition,pc_bi_AT_to_AT,pc_bi_CG_to_CG,pc_bi_AT_to_CG,pc_bi_CG_to_AT,pc_phased,pc_phased_hom_ref,pc_phased_missing,pc_GQ_30,pc_het_GQ_30,pc_hom_alt_GQ_30,pc_GQ_99,pc_het_GQ_99,pc_hom_alt_GQ_99,mean_GQ,mean_GQ_hom_ref,mean_GQ_het,mean_GQ_hom_alt,mean_DP,mean_DP_hom_ref,mean_DP_het,mean_DP_hom_alt,mean_indel_len,total_indel_len


In [31]:
# For each release, tabulate how many samples share each average read length,
# ordered by read length so unusual lengths are easy to spot.
for release in releases:
    print(release)
    (
        tbl_sample_metadata[release]
        .valuecounts('avg_read_length')
        .sort('avg_read_length')
        .displayall()
    )

pf_60


avg_read_length,count,frequency
36,58,0.0080757449178501
37,10,0.0013923698134224
46,1,0.0001392369813422
47,1,0.0001392369813422
53,72,0.0100250626566416
57,1,0.0001392369813422
60,1,0.0001392369813422
62,2,0.0002784739626844
63,1,0.0001392369813422
64,1,0.0001392369813422


pv_30


avg_read_length,count,frequency
74,11,0.0109890109890109
75,28,0.0279720279720279
96,1,0.0009990009990009
97,3,0.0029970029970029
98,6,0.0059940059940059
99,792,0.7912087912087912
100,160,0.1598401598401598


In [32]:
# For each release, tabulate the number of samples per study, ordered by
# study identifier.
for release in releases:
    print(release)
    (
        tbl_sample_metadata[release]
        .valuecounts('study')
        .sort('study')
        .displayall()
    )

pf_60


study,count,frequency
1001-PF-ML-DJIMDE,96,0.0133667502088554
1004-PF-BF-OUEDRAOGO,57,0.0079365079365079
1006-PF-GM-CONWAY,79,0.0109997215260373
1007-PF-TZ-DUFFY,50,0.0069618490671122
1008-PF-SEA-RINGWALD,234,0.0325814536340852
1009-PF-KH-PLOWE,5,0.0006961849067112
1010-PF-TH-ANDERSON,108,0.0150375939849624
1011-PF-KH-SU,41,0.005708716235032
1012-PF-KH-WHITE,2,0.0002784739626844
1013-PF-PEGB-BRANCH,16,0.0022277917014759


pv_30


study,count,frequency
1020-PF-VN-BONI,1,0.0009990009990009
1044-PF-KH-FAIRHURST,83,0.0829170829170829
1046-PV-BR-FERRERIA,5,0.0049950049950049
1047-PV-LK-KARUNAWEERA,2,0.0019980019980019
1049-PV-VN-BONI,13,0.0129870129870129
1050-PV-PN-MUELLER,20,0.0199800199800199
1052-PF-TRAC-WHITE,17,0.0169830169830169
1062-PF-PG-BARRY,4,0.0039960039960039
1098-PF-ET-GOLASSA,96,0.0959040959040959
1102-PF-MG-RANDRIANARIVELOJOSIA,1,0.0009990009990009
