# Extract mutations in salivary proteins

## Setup

In [1]:
%run setup.ipynb

In [2]:
# download gene annotations from vectorbase
!wget \
    --no-clobber \
    -O ../data/Anopheles-gambiae-PEST_BASEFEATURES_AgamP4.4.gff3.gz \
    https://www.vectorbase.org/download/anopheles-gambiae-pestbasefeaturesagamp44gff3gz


File `../data/Anopheles-gambiae-PEST_BASEFEATURES_AgamP4.4.gff3.gz' already there; not retrieving.


In [3]:
# load the vectorbase geneset
geneset_agamp44 = allel.FeatureTable.from_gff3('../data/Anopheles-gambiae-PEST_BASEFEATURES_AgamP4.4.gff3.gz',
                                               attributes=['ID', 'Parent'])
geneset_agamp44 = geneset_to_pandas(geneset_agamp44)
geneset_agamp44.head()

Unnamed: 0,seqid,source,type,start,end,score,strand,phase,ID,Parent
0,2L,VectorBase,chromosome,1,49364325,-1,.,-1,2L,.
1,2L,VectorBase,gene,157348,186936,-1,-,-1,AGAP004677,.
2,2L,VectorBase,mRNA,157348,181305,-1,-,-1,AGAP004677-RA,AGAP004677
3,2L,VectorBase,three_prime_UTR,157348,157495,-1,-,-1,.,AGAP004677-RA
4,2L,VectorBase,exon,157348,157623,-1,-,-1,.,AGAP004677-RA


In [4]:
# subset to SG6
region_sg6 = SeqFeature('X', 2405458, 2406119)
geneset_sg6 = geneset_agamp44.query(region_sg6.query).copy()
geneset_sg6.type.value_counts()

three_prime_UTR    1
five_prime_UTR     1
mRNA               1
exon               1
gene               1
CDS                1
Name: type, dtype: int64

In [5]:
geneset_sg6

Unnamed: 0,seqid,source,type,start,end,score,strand,phase,ID,Parent
164723,X,VectorBase,gene,2405458,2406119,-1,-,-1,AGAP000150,.
164724,X,VectorBase,mRNA,2405458,2406119,-1,-,-1,AGAP000150-RA,AGAP000150
164725,X,VectorBase,three_prime_UTR,2405458,2405693,-1,-,-1,.,AGAP000150-RA
164726,X,VectorBase,exon,2405458,2406119,-1,-,-1,.,AGAP000150-RA
164727,X,VectorBase,CDS,2405694,2406041,-1,-,0,AGAP000150-PA,AGAP000150-RA
164728,X,VectorBase,five_prime_UTR,2406042,2406119,-1,-,-1,.,AGAP000150-RA


In [6]:
# subset to CE5
# NOTE: the table displayed for geneset_ce5 below also lists exon/CDS rows whose
# Parent is AGAP008003-RA, so this subset is not limited to AGAP008004 features.
region_ce5 = SeqFeature('3R', 3998920, 3999952)
geneset_ce5 = geneset_agamp44.query(region_ce5.query).copy()
geneset_ce5.type.value_counts()

CDS                4
exon               4
five_prime_UTR     1
mRNA               1
three_prime_UTR    1
gene               1
Name: type, dtype: int64

In [7]:
geneset_ce5

Unnamed: 0,seqid,source,type,start,end,score,strand,phase,ID,Parent
126178,3R,VectorBase,exon,3999031,3999129,-1,+,-1,.,AGAP008003-RA
126179,3R,VectorBase,CDS,3999031,3999129,-1,+,2,AGAP008003-PA,AGAP008003-RA
126189,3R,VectorBase,gene,3998920,3999952,-1,-,-1,AGAP008004,.
126190,3R,VectorBase,mRNA,3998920,3999952,-1,-,-1,AGAP008004-RA,AGAP008004
126191,3R,VectorBase,three_prime_UTR,3998920,3999312,-1,-,-1,.,AGAP008004-RA
126192,3R,VectorBase,exon,3998920,3999443,-1,-,-1,.,AGAP008004-RA
126193,3R,VectorBase,CDS,3999313,3999443,-1,-,2,AGAP008004-PA,AGAP008004-RA
126194,3R,VectorBase,exon,3999509,3999629,-1,-,-1,.,AGAP008004-RA
126195,3R,VectorBase,CDS,3999509,3999629,-1,-,0,AGAP008004-PA,AGAP008004-RA
126196,3R,VectorBase,CDS,3999708,3999767,-1,-,0,AGAP008004-PA,AGAP008004-RA


In [8]:
# setup a variant annotator
# NOTE(review): the FASTA file name says AgamP3 while the GFF3 annotations are
# AgamP4.4 — presumably both share the same chromosome coordinates; confirm.
annotator = veff.Annotator(
    fasta_path='../ngs.sanger.ac.uk/production/ag1000g/phase1/AR3/genome/Anopheles-gambiae-PEST_CHROMOSOMES_AgamP3.fa', 
    gff3_path=['../data/Anopheles-gambiae-PEST_BASEFEATURES_AgamP4.4.gff3.gz'],
)

## Extract table of variants

In [9]:
callset = zarr.open_group('../ngs.sanger.ac.uk/production/ag1000g/phase1/AR3.1/variation/main/zarr2/ag1000g.phase1.ar3', mode='r')
callset

Group(/, 8)
  arrays: 1; samples
  groups: 7; 2L, 2R, 3L, 3R, UNKN, X, Y_unplaced
  store: DirectoryStore

In [10]:
# what fields are available?
print(', '.join(callset['X/variants']))

ABHet, ABHom, AC, AF, ALT, AN, ANN, Accessible, BaseCounts, BaseQRankSum, CHROM, Coverage, CoverageMQ0, DP, DS, Dels, FILTER_FS, FILTER_HRun, FILTER_HighCoverage, FILTER_HighMQ0, FILTER_LowCoverage, FILTER_LowMQ, FILTER_LowQual, FILTER_NoCoverage, FILTER_PASS, FILTER_QD, FILTER_ReadPosRankSum, FILTER_RefN, FILTER_RepeatDUST, FS, HRun, HW, HaplotypeScore, HighCoverage, HighMQ0, InbreedingCoeff, LOF, LowCoverage, LowMQ, LowPairing, MLEAC, MLEAF, MQ, MQ0, MQRankSum, NDA, NMD, NoCoverage, OND, POS, QD, QUAL, REF, RPA, RU, ReadPosRankSum, RefMasked, RefN, RepeatDUST, RepeatMasker, RepeatTRF, STR, VariantType, is_snp, num_alleles, svlen


In [11]:
# what SNPEFF fields are available?
print(', '.join(callset['X/variants/ANN'].dtype.names))

Allele, Annotation, Annotation_Impact, Gene_Name, Gene_ID, Feature_Type, Feature_ID, Transcript_BioType, Rank, HGVS_c, HGVS_p, cDNA_pos, cDNA_length, CDS_pos, CDS_length, AA_pos, AA_length, Distance


In [12]:
samples = pandas.read_csv('../ngs.sanger.ac.uk/production/ag1000g/phase1/AR3/samples/samples.meta.txt',
                          sep='\t', index_col='index')
samples.head()

Unnamed: 0_level_0,ox_code,src_code,sra_sample_accession,population,country,region,contributor,contact,year,m_s,sex,n_sequences,mean_coverage,latitude,longitude
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
0,AB0085-C,BF2-4,ERS223996,BFS,Burkina Faso,Pala,Austin Burt,Sam O'Loughlin,2012,S,F,89905852,28.01,11.15,-4.235
1,AB0087-C,BF3-3,ERS224013,BFM,Burkina Faso,Bana,Austin Burt,Sam O'Loughlin,2012,M,F,116706234,36.76,11.233,-4.472
2,AB0088-C,BF3-5,ERS223991,BFM,Burkina Faso,Bana,Austin Burt,Sam O'Loughlin,2012,M,F,112090460,23.3,11.233,-4.472
3,AB0089-C,BF3-8,ERS224031,BFM,Burkina Faso,Bana,Austin Burt,Sam O'Loughlin,2012,M,F,145350454,41.36,11.233,-4.472
4,AB0090-C,BF3-10,ERS223936,BFM,Burkina Faso,Bana,Austin Burt,Sam O'Loughlin,2012,M,F,105012254,34.64,11.233,-4.472


In [13]:
def tabulate_variants(callset, snpeff, seqid, start, end, pop_ids, subpops):
    """Build a table of variants for a given callset and genome region.

    One output row is produced per alternate allele, so a site with
    ``num_alleles`` alleles contributes ``num_alleles - 1`` rows, told apart
    by the ``ALTIX`` column (index into the site's ALT/AC arrays).

    Parameters
    ----------
    callset : mapping
        Callset group; ``callset[seqid]['variants']`` and
        ``callset[seqid]['calldata/genotype']`` are read from it.
    snpeff : mapping
        Group providing ``snpeff[seqid]['variants']['ANN']``; may be the same
        object as `callset`.
    seqid : str
        Key of the chromosome/contig group.
    start, end : int
        Position range handed to ``SortedIndex.locate_range``.
    pop_ids : iterable of str
        Population identifiers; each yields one ``AF_<pop>`` column.
    subpops : dict
        Population identifier -> sample indices, passed to
        ``GenotypeArray.count_alleles_subpops``.

    Returns
    -------
    petl table
        Columns: CHROM, POS, num_alleles, REF, ALT, AC, ALTIX, the per-site
        quality fields, ``SNPEFF_*`` annotation fields, ``AF_*`` frequencies
        and ``check_allele``.
    """

    variants = callset[seqid]['variants']
    ann = snpeff[seqid]['variants']['ANN']
    pos = allel.SortedIndex(variants['POS'])
    loc = pos.locate_range(start, end)
    genotype = allel.GenotypeArray(callset[seqid]['calldata/genotype'][loc])
    acs = genotype.count_alleles_subpops(max_allele=3, subpops=subpops)

    # fields that are emitted explicitly (and per allele) by split_alleles
    allele_fields = [
        'CHROM',
        'POS',
        'num_alleles',
        'REF',
        'ALT',
        'AC',
    ]
    # per-site fields copied unchanged onto every row generated for the site
    site_fields = [
        'FILTER_PASS',
        'NoCoverage',
        'LowCoverage',
        'HighCoverage',
        'LowMQ',
        'HighMQ0',
        'RepeatDUST',
        'RepeatMasker',
        'RepeatTRF',
        'FS',
        'HRun',
        'QD',
        'ReadPosRankSum',
    ]
    variants_fields = allele_fields + site_fields
    ann_fields = ['Allele', 'Annotation', 'HGVS_c', 'HGVS_p', 'Feature_ID']
    # materialise once: pop_ids is iterated several times below
    pop_fields = list(pop_ids)

    # slice the annotation array a single time rather than once per field
    ann_region = ann[loc]
    cols = (
        [variants[f][loc] for f in variants_fields] +
        [ann_region[f] for f in ann_fields] +
        [acs[p].to_frequencies() for p in pop_fields]
    )

    def split_alleles(row):
        """Yield one output row for each alternate allele of a site row."""
        site_values = [row[f] for f in site_fields]
        for i in range(row['num_alleles'] - 1):
            # break down alleles
            out = [
                row['CHROM'],
                row['POS'],
                row['num_alleles'],
                row['REF'],
                row['ALT'][i],
                row['AC'][i],
                i,
            ]
            # add in remaining variant annotations
            out += site_values
            # SNPEFF annotation only applies to first allele
            if i == 0:
                out += [row[f] for f in ann_fields]
            else:
                out += [None] * len(ann_fields)
            # add in population allele frequencies (index 0 is the reference allele)
            out += [row[p][i+1] for p in pop_fields]
            yield out

    def decode_ascii(value):
        """Decode a bytes value to str, passing the None placeholders through."""
        # None is expected here (SNPEFF fields of non-first alleles); handle it
        # explicitly instead of relying on the conversion error being swallowed.
        return None if value is None else str(value, 'ascii')

    tbl = (
        etl
        .fromcolumns(cols, header=variants_fields + ann_fields + pop_fields)
        .rowmapmany(split_alleles, header=allele_fields + ['ALTIX'] + site_fields + ann_fields + pop_fields, failonerror=True)
        .convert('CHROM REF ALT Allele Annotation HGVS_c HGVS_p Feature_ID'.split(), decode_ascii)
        .rename({f: 'SNPEFF_' + f for f in ann_fields})
        .rename({p: 'AF_%s' % p for p in pop_fields})
        # sanity flag: the SNPEFF allele, when present, should equal this row's ALT
        .addfield('check_allele', lambda row: row['SNPEFF_Allele'] is None or row['SNPEFF_Allele'] == row['ALT'])
    )

    return tbl

In [14]:
pop_ids = 'AOM BFM GWA GNS BFS CMS GAS UGS KES'.split()

In [15]:
# map each population id to the list of index labels of its rows in the sample metadata
# NOTE(review): these labels are later used as sample indices into the genotype
# array, presumably because the metadata index matches callset sample order — confirm.
subpops = {p: samples[samples.population == p].index.values.tolist() for p in pop_ids}

In [16]:
# build a table of variants from phase 1
tbl_variants_sg6 = tabulate_variants(callset, callset, 
                                     seqid=region_sg6.seqid, start=region_sg6.start, end=region_sg6.end, 
                                     pop_ids=pop_ids, subpops=subpops)
tbl_variants_sg6

0|CHROM,1|POS,2|num_alleles,3|REF,4|ALT,5|AC,6|ALTIX,7|FILTER_PASS,8|NoCoverage,9|LowCoverage,10|HighCoverage,11|LowMQ,12|HighMQ0,13|RepeatDUST,14|RepeatMasker,15|RepeatTRF,16|FS,17|HRun,18|QD,19|ReadPosRankSum,20|SNPEFF_Allele,21|SNPEFF_Annotation,22|SNPEFF_HGVS_c,23|SNPEFF_HGVS_p,24|SNPEFF_Feature_ID,25|AF_AOM,26|AF_BFM,27|AF_GWA,28|AF_GNS,29|AF_BFS,30|AF_CMS,31|AF_GAS,32|AF_UGS,33|AF_KES,34|check_allele
X,2405458,2,A,G,3,0,True,0,2,4,0,0,False,False,False,1.5127,0,17.297,0.49902,G,splice_region_variant,n.*236T>C,.,AGAP000150-RA,0.0,0.0,0.0,0.0,0.0,0.00545454545455,0.0,0.0,0.0,True
X,2405459,2,A,T,1,0,True,0,2,5,0,0,False,False,False,2.082,0,13.539,-2.1465,T,splice_region_variant,n.*235T>A,.,AGAP000150-RA,0.0,0.0,0.0,0.0161290322581,0.0,0.0,0.0,0.0,0.0,True
X,2405460,3,A,G,49,0,True,0,3,5,0,0,False,False,False,0.7749,0,13.57,1.3779,G,splice_region_variant,n.*234T>C,.,AGAP000150-RA,0.0,0.036231884058,0.0434782608696,0.0645161290323,0.0432098765432,0.0381818181818,0.0446428571429,0.0145631067961,0.0,True
X,2405460,3,A,T,5,1,True,0,3,5,0,0,False,False,False,0.7749,0,13.57,1.3779,,,,,,0.0,0.0,0.0,0.0,0.0123456790123,0.00363636363636,0.0,0.00485436893204,0.0,True
X,2405461,2,C,A,4,0,True,0,3,5,0,0,False,False,False,6.2812,3,14.57,-1.6475,A,3_prime_UTR_variant,n.*233G>T,.,AGAP000150-RA,0.0,0.0144927536232,0.0108695652174,0.0,0.0,0.0,0.0,0.00485436893204,0.0,True


In [17]:
# build a table of variants from phase 1
tbl_variants_ce5 = tabulate_variants(callset, callset, 
                                     seqid=region_ce5.seqid, start=region_ce5.start, end=region_ce5.end, 
                                     pop_ids=pop_ids, subpops=subpops)
tbl_variants_ce5

0|CHROM,1|POS,2|num_alleles,3|REF,4|ALT,5|AC,6|ALTIX,7|FILTER_PASS,8|NoCoverage,9|LowCoverage,10|HighCoverage,11|LowMQ,12|HighMQ0,13|RepeatDUST,14|RepeatMasker,15|RepeatTRF,16|FS,17|HRun,18|QD,19|ReadPosRankSum,20|SNPEFF_Allele,21|SNPEFF_Annotation,22|SNPEFF_HGVS_c,23|SNPEFF_HGVS_p,24|SNPEFF_Feature_ID,25|AF_AOM,26|AF_BFM,27|AF_GWA,28|AF_GNS,29|AF_BFS,30|AF_CMS,31|AF_GAS,32|AF_UGS,33|AF_KES,34|check_allele
3R,3998920,2,T,C,1,0,True,0,1,3,0,0,False,False,False,4.8984,0,12.031,-1.7402,C,splice_region_variant,n.*393A>G,.,AGAP008004-RA,0.0,0.0,0.0,0.0,0.0,0.00181818181818,0.0,0.0,0.0,True
3R,3998922,2,T,C,898,0,True,0,1,3,0,0,False,False,False,18.328,0,23.312,3.8945,C,splice_region_variant,n.*391A>G,.,AGAP008004-RA,0.45,0.666666666667,0.608695652174,0.709677419355,0.567901234568,0.630909090909,0.5625,0.640776699029,0.204545454545,True
3R,3998924,2,G,T,14,0,True,0,1,3,0,0,False,False,False,3.8594,2,14.102,2.1875,T,3_prime_UTR_variant,n.*389C>A,.,AGAP008004-RA,0.0,0.0,0.0,0.0322580645161,0.037037037037,0.0109090909091,0.0,0.0,0.0,True
3R,3998927,2,C,G,7,0,True,0,1,4,0,0,False,False,False,1.4326,0,15.453,0.21204,G,3_prime_UTR_variant,n.*386G>C,.,AGAP008004-RA,0.0,0.0,0.0108695652174,0.0,0.0,0.00727272727273,0.0,0.00970873786408,0.0,True
3R,3998928,3,C,A,1,0,True,0,1,4,0,0,False,False,False,10.891,0,13.211,0.61377,A,3_prime_UTR_variant,n.*385G>T,.,AGAP008004-RA,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00485436893204,0.0,True


## Annotate effects

In [18]:
cds_effects = [
    'NON_SYNONYMOUS_CODING', 
    'SYNONYMOUS_CODING',    
]
intron_effects = [
    'INTRONIC', 
    'SPLICE_CORE',
    'SPLICE_REGION',        
]


In [19]:
def lpop(l, default=None):
    """Return the first item of `l`, or `default` if `l` has no first item.

    Despite the name, `l` is not modified: the item is read, not removed.
    """
    try:
        head = l[0]
    except IndexError:
        return default
    else:
        return head


In [20]:
def transcript_effect(transcript_id):
    """Build a row function that summarises the effect on one transcript.

    The returned function reads ``row.VEFF``, picks the first effect whose
    ``transcript_id`` matches, and returns:

    - ``(effect, aa_change)`` for effects listed in ``cds_effects``;
    - ``(effect, intron_cds_5prime, intron_5prime_dist, intron_cds_3prime,
      intron_3prime_dist)`` for effects listed in ``intron_effects``;
    - ``None`` otherwise, including when no effect matches the transcript.
    """
    def f(row):
        matching = [eff for eff in row.VEFF if eff.transcript_id == transcript_id]
        hit = lpop(matching)
        if not hit:
            return None
        if hit.effect in cds_effects:
            return (hit.effect, hit.aa_change)
        if hit.effect in intron_effects:
            return (hit.effect, hit.intron_cds_5prime, hit.intron_5prime_dist, hit.intron_cds_3prime, hit.intron_3prime_dist)
        return None
    return f


In [21]:
# annotate every SG6 variant row with its effect on transcript AGAP000150-RA
tbl_variants_sg6_eff = (
    tbl_variants_sg6
    # temporary column holding all effects predicted for the row's REF>ALT change
    .addfield('VEFF', lambda row: list(annotator.get_effects(chrom=row.CHROM, pos=row.POS, ref=row.REF, alt=row.ALT)))
    .addfield('AGAP000150-RA', transcript_effect('AGAP000150-RA'))
    # normalise the '.' and '' placeholders to None
    .replaceall('.', None)
    .replaceall('', None)
    .cutout('VEFF')
    .cache()
)

In [22]:
# annotate every CE5 variant row with its effect on transcript AGAP008004-RA
tbl_variants_ce5_eff = (
    tbl_variants_ce5
    # temporary column holding all effects predicted for the row's REF>ALT change
    .addfield('VEFF', lambda row: list(annotator.get_effects(chrom=row.CHROM, pos=row.POS, ref=row.REF, alt=row.ALT)))
    .addfield('AGAP008004-RA', transcript_effect('AGAP008004-RA'))
    # normalise the '.' and '' placeholders to None
    .replaceall('.', None)
    .replaceall('', None)
    .cutout('VEFF')
    .cache()
)

In [23]:
list(annotator.get_effects(chrom='X', pos=2405705, ref='G', alt='A'))

[VariantEffect(effect='SYNONYMOUS_CODING', impact='LOW', chrom='X', pos=2405705, ref='G', alt='A', vlen=0, ref_start=2405705, ref_stop=2405705, gene_id='AGAP000150', gene_start=2405458, gene_stop=2406119, gene_strand='-', transcript_id='AGAP000150-RA', transcript_start=2405458, transcript_stop=2406119, transcript_strand='-', cds_id='AGAP000150-PA', cds_start=2405694, cds_stop=2406041, cds_strand='-', intron_start=None, intron_stop=None, intron_5prime_dist=None, intron_3prime_dist=None, intron_cds_5prime=None, intron_cds_3prime=None, ref_cds_start=336, ref_cds_stop=336, ref_intron_start=None, ref_intron_stop=None, ref_start_phase=0, ref_codon='Ctg', alt_codon='Ttg', codon_change='Ctg/Ttg', aa_pos=113, ref_aa='L', alt_aa='L', aa_change='L113L')]

In [24]:
tbl_variants_sg6_eff.display(200)

0|CHROM,1|POS,2|num_alleles,3|REF,4|ALT,5|AC,6|ALTIX,7|FILTER_PASS,8|NoCoverage,9|LowCoverage,10|HighCoverage,11|LowMQ,12|HighMQ0,13|RepeatDUST,14|RepeatMasker,15|RepeatTRF,16|FS,17|HRun,18|QD,19|ReadPosRankSum,20|SNPEFF_Allele,21|SNPEFF_Annotation,22|SNPEFF_HGVS_c,23|SNPEFF_HGVS_p,24|SNPEFF_Feature_ID,25|AF_AOM,26|AF_BFM,27|AF_GWA,28|AF_GNS,29|AF_BFS,30|AF_CMS,31|AF_GAS,32|AF_UGS,33|AF_KES,34|check_allele,35|AGAP000150-RA
X,2405458,2,A,G,3,0,True,0,2,4,0,0,False,False,False,1.5127,0,17.297,0.49902,G,splice_region_variant,n.*236T>C,,AGAP000150-RA,0.0,0.0,0.0,0.0,0.0,0.00545454545455,0.0,0.0,0.0,True,
X,2405459,2,A,T,1,0,True,0,2,5,0,0,False,False,False,2.082,0,13.539,-2.1465,T,splice_region_variant,n.*235T>A,,AGAP000150-RA,0.0,0.0,0.0,0.0161290322581,0.0,0.0,0.0,0.0,0.0,True,
X,2405460,3,A,G,49,0,True,0,3,5,0,0,False,False,False,0.7749,0,13.57,1.3779,G,splice_region_variant,n.*234T>C,,AGAP000150-RA,0.0,0.036231884058,0.0434782608696,0.0645161290323,0.0432098765432,0.0381818181818,0.0446428571429,0.0145631067961,0.0,True,
X,2405460,3,A,T,5,1,True,0,3,5,0,0,False,False,False,0.7749,0,13.57,1.3779,,,,,,0.0,0.0,0.0,0.0,0.0123456790123,0.00363636363636,0.0,0.00485436893204,0.0,True,
X,2405461,2,C,A,4,0,True,0,3,5,0,0,False,False,False,6.2812,3,14.57,-1.6475,A,3_prime_UTR_variant,n.*233G>T,,AGAP000150-RA,0.0,0.0144927536232,0.0108695652174,0.0,0.0,0.0,0.0,0.00485436893204,0.0,True,
X,2405462,3,A,G,1,0,True,0,3,5,0,0,False,False,False,0.0,0,9.7734,0.033997,G,3_prime_UTR_variant,n.*232T>C,,AGAP000150-RA,0.0,0.0,0.0,0.0,0.00617283950617,0.0,0.0,0.0,0.0,True,
X,2405462,3,A,T,1,1,True,0,3,5,0,0,False,False,False,0.0,0,9.7734,0.033997,,,,,,0.0,0.0,0.0,0.0161290322581,0.0,0.0,0.0,0.0,0.0,True,
X,2405464,2,A,T,1,0,True,0,2,5,0,0,False,False,False,0.0,2,14.453,-0.50684,T,3_prime_UTR_variant,n.*230T>A,,AGAP000150-RA,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00485436893204,0.0,True,
X,2405465,2,T,A,10,0,True,0,2,5,0,0,False,False,False,4.543,3,14.75,1.9834,A,3_prime_UTR_variant,n.*229A>T,,AGAP000150-RA,0.0,0.0144927536232,0.0108695652174,0.0161290322581,0.0123456790123,0.00727272727273,0.0,0.0,0.0,True,
X,2405466,2,T,C,1,0,True,0,2,5,0,0,False,False,False,4.2656,2,10.219,-0.022995,C,3_prime_UTR_variant,n.*228A>G,,AGAP000150-RA,0.0,0.0,0.0108695652174,0.0,0.0,0.0,0.0,0.0,0.0,True,


In [25]:
tbl_variants_ce5_eff.display(200)

0|CHROM,1|POS,2|num_alleles,3|REF,4|ALT,5|AC,6|ALTIX,7|FILTER_PASS,8|NoCoverage,9|LowCoverage,10|HighCoverage,11|LowMQ,12|HighMQ0,13|RepeatDUST,14|RepeatMasker,15|RepeatTRF,16|FS,17|HRun,18|QD,19|ReadPosRankSum,20|SNPEFF_Allele,21|SNPEFF_Annotation,22|SNPEFF_HGVS_c,23|SNPEFF_HGVS_p,24|SNPEFF_Feature_ID,25|AF_AOM,26|AF_BFM,27|AF_GWA,28|AF_GNS,29|AF_BFS,30|AF_CMS,31|AF_GAS,32|AF_UGS,33|AF_KES,34|check_allele,35|AGAP008004-RA
3R,3998920,2,T,C,1,0,True,0,1,3,0,0,False,False,False,4.8984,0,12.031,-1.7402,C,splice_region_variant,n.*393A>G,,AGAP008004-RA,0.0,0.0,0.0,0.0,0.0,0.00181818181818,0.0,0.0,0.0,True,
3R,3998922,2,T,C,898,0,True,0,1,3,0,0,False,False,False,18.328,0,23.312,3.8945,C,splice_region_variant,n.*391A>G,,AGAP008004-RA,0.45,0.666666666667,0.608695652174,0.709677419355,0.567901234568,0.630909090909,0.5625,0.640776699029,0.204545454545,True,
3R,3998924,2,G,T,14,0,True,0,1,3,0,0,False,False,False,3.8594,2,14.102,2.1875,T,3_prime_UTR_variant,n.*389C>A,,AGAP008004-RA,0.0,0.0,0.0,0.0322580645161,0.037037037037,0.0109090909091,0.0,0.0,0.0,True,
3R,3998927,2,C,G,7,0,True,0,1,4,0,0,False,False,False,1.4326,0,15.453,0.21204,G,3_prime_UTR_variant,n.*386G>C,,AGAP008004-RA,0.0,0.0,0.0108695652174,0.0,0.0,0.00727272727273,0.0,0.00970873786408,0.0,True,
3R,3998928,3,C,A,1,0,True,0,1,4,0,0,False,False,False,10.891,0,13.211,0.61377,A,3_prime_UTR_variant,n.*385G>T,,AGAP008004-RA,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00485436893204,0.0,True,
3R,3998928,3,C,T,25,1,True,0,1,4,0,0,False,False,False,10.891,0,13.211,0.61377,,,,,,0.0,0.0,0.0760869565217,0.0,0.00617283950617,0.0109090909091,0.0714285714286,0.0145631067961,0.0,True,
3R,3998929,2,T,A,1,0,True,0,1,4,0,0,False,False,False,1.8691,1,7.9297,1.0527,A,3_prime_UTR_variant,n.*384A>T,,AGAP008004-RA,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00485436893204,0.0,True,
3R,3998930,3,A,G,758,0,True,0,1,4,0,0,False,False,False,18.766,0,29.156,5.8008,G,3_prime_UTR_variant,n.*383T>C,,AGAP008004-RA,0.441666666667,0.514492753623,0.358695652174,0.629032258065,0.530864197531,0.516363636364,0.535714285714,0.553398058252,0.204545454545,True,
3R,3998930,3,A,T,4,1,True,0,1,4,0,0,False,False,False,18.766,0,29.156,5.8008,,,,,,0.0,0.0,0.0,0.0161290322581,0.00617283950617,0.00363636363636,0.0,0.0,0.0,True,
3R,3998932,3,A,G,1,0,True,0,1,4,0,0,False,False,False,1.1514,0,15.031,3.2559,G,3_prime_UTR_variant,n.*381T>C,,AGAP008004-RA,0.0,0.0,0.0,0.0,0.0,0.00181818181818,0.0,0.0,0.0,True,


## Inspect missense variants

In [26]:
def simplify_missense_effect(v):
    """Reduce an effect tuple to its amino-acid change.

    Returns ``v[1]`` when `v` is non-empty and ``v[0]`` is
    ``'NON_SYNONYMOUS_CODING'``; returns the empty string for anything else,
    including ``None``.
    """
    is_missense = bool(v) and v[0] == 'NON_SYNONYMOUS_CODING'
    return v[1] if is_missense else ''

    
# Per-column cell styling passed to the table display calls below: each entry maps
# a column name to a function of the cell value that returns a CSS style string
# (red/orange background when the value crosses the threshold, '' otherwise).
# NOTE(review): the thresholds are hard-coded here; their origin is not shown in this notebook.
td_styles = {
    'FILTER_PASS': lambda v: 'background-color: red' if not v else '',
    'NoCoverage': lambda v: 'background-color: red' if v > 1 else '',
    'LowCoverage': lambda v: 'background-color: red' if v > 76 else '',
    'HighCoverage': lambda v: 'background-color: red' if v > 15 else '',
    'LowMQ': lambda v: 'background-color: red' if v > 76 else '',
    'HighMQ0': lambda v: 'background-color: red' if v > 1 else '',
    'RepeatDUST': lambda v: 'background-color: red' if v else '',
    'FS': lambda v: 'background-color: red' if v > 60 else '',
    'QD': lambda v: 'background-color: red' if v < 5 else '',
    'ReadPosRankSum': lambda v: 'background-color: red' if v < -8 else '',
    'HRun': lambda v: 'background-color: red' if v > 4 else '',
    'num_alleles': lambda v: 'background-color: orange' if v > 2 else '',
}


def tr_style(row):
    """Return a green row background whose opacity grows with the row's AC.

    The alpha channel is ``row['AC'] / 100`` capped at 1 and rendered with
    three decimals.
    """
    alpha = min(1, row['AC']/100)
    return 'background-color:rgba(0, 255, 0, %.3f)' % alpha


# SG6 missense table: keep rows whose AGAP000150-RA effect is NON_SYNONYMOUS_CODING,
# reduce that effect to its amino-acid change, filter on the AF_UGS column against
# 0.01, and drop the check_allele and SNPEFF_* columns before display.
tbl_variants_sg6_missense = (
    tbl_variants_sg6_eff
    .select(lambda row: row['AGAP000150-RA'] and row['AGAP000150-RA'][0] == 'NON_SYNONYMOUS_CODING')
    .convert('AGAP000150-RA', simplify_missense_effect)
    .ge('AF_UGS', 0.01)
    .cutout('check_allele', *[f for f in tbl_variants_sg6_eff.header() if f.startswith('SNPEFF')])
)
tbl_variants_sg6_missense.displayall(td_styles=td_styles, tr_style=tr_style, caption='SG6')

# CE5 missense table: keep rows whose AGAP008004-RA effect is NON_SYNONYMOUS_CODING,
# reduce that effect to its amino-acid change, filter on the AF_UGS column against
# 0.01, and drop the check_allele and SNPEFF_* columns before display.
tbl_variants_ce5_missense = (
    tbl_variants_ce5_eff
    .select(lambda row: row['AGAP008004-RA'] and row['AGAP008004-RA'][0] == 'NON_SYNONYMOUS_CODING')
    .convert('AGAP008004-RA', simplify_missense_effect)
    .ge('AF_UGS', 0.01)
    # take the SNPEFF column names from the CE5 table itself, not from the SG6 table
    .cutout('check_allele', *[f for f in tbl_variants_ce5_eff.header() if f.startswith('SNPEFF')])
)
tbl_variants_ce5_missense.displayall(td_styles=td_styles, tr_style=tr_style, caption='CE5')

0|CHROM,1|POS,2|num_alleles,3|REF,4|ALT,5|AC,6|ALTIX,7|FILTER_PASS,8|NoCoverage,9|LowCoverage,10|HighCoverage,11|LowMQ,12|HighMQ0,13|RepeatDUST,14|RepeatMasker,15|RepeatTRF,16|FS,17|HRun,18|QD,19|ReadPosRankSum,20|AF_AOM,21|AF_BFM,22|AF_GWA,23|AF_GNS,24|AF_BFS,25|AF_CMS,26|AF_GAS,27|AF_UGS,28|AF_KES,29|AGAP000150-RA
X,2405831,2,T,G,28,0,False,0,16,2,1,0,False,False,False,3200.0,1,0.049988,8.1562,0.0166666666667,0.0144927536232,0.0108695652174,0.0161290322581,0.00617283950617,0.0182481751825,0.0267857142857,0.0291262135922,0.0227272727273,T71P
X,2405832,2,T,A,5,0,True,0,17,2,1,0,False,False,False,1.2969,0,11.117,0.89502,0.0,0.0,0.0,0.0,0.0,0.0036496350365,0.0,0.0145631067961,0.0,E70D
X,2405870,2,T,G,53,0,False,0,11,2,0,0,False,False,False,3200.0,3,0.25,-16.484,0.0333333333333,0.0507246376812,0.0,0.0322580645161,0.037037037037,0.0327272727273,0.0446428571429,0.0291262135922,0.0568181818182,T58P
X,2405931,2,A,C,1389,0,True,0,13,2,0,0,False,False,False,7.3359,1,32.812,0.63184,0.966666666667,0.869565217391,0.880434782609,0.935483870968,0.913580246914,0.892727272727,0.848214285714,0.932038834951,1.0,N37K
X,2405968,3,T,A,20,0,True,0,9,2,0,0,False,False,False,2.9648,0,12.289,0.011002,0.00833333333333,0.0144927536232,0.0108695652174,0.0483870967742,0.0,0.0145454545455,0.0,0.0242718446602,0.0,H25L
X,2405969,2,G,A,72,0,True,0,8,2,0,0,False,False,False,1.7441,0,15.617,-0.085999,0.0,0.0507246376812,0.0108695652174,0.0483870967742,0.0740740740741,0.0509090909091,0.0535714285714,0.0679611650485,0.0113636363636,H25Y
X,2405982,3,T,G,186,1,True,0,9,2,0,0,False,False,False,0.3501,0,17.328,1.875,0.225,0.0797101449275,0.25,0.177419354839,0.16049382716,0.127272727273,0.0,0.0873786407767,0.0,E20D
X,2406028,2,A,C,297,0,True,0,6,2,0,0,False,False,False,0.66211,2,16.938,-2.1973,0.391666666667,0.188405797101,0.141304347826,0.161290322581,0.185185185185,0.174545454545,0.125,0.179611650485,0.272727272727,V5G


0|CHROM,1|POS,2|num_alleles,3|REF,4|ALT,5|AC,6|ALTIX,7|FILTER_PASS,8|NoCoverage,9|LowCoverage,10|HighCoverage,11|LowMQ,12|HighMQ0,13|RepeatDUST,14|RepeatMasker,15|RepeatTRF,16|FS,17|HRun,18|QD,19|ReadPosRankSum,20|AF_AOM,21|AF_BFM,22|AF_GWA,23|AF_GNS,24|AF_BFS,25|AF_CMS,26|AF_GAS,27|AF_UGS,28|AF_KES,29|AGAP008004-RA
3R,3999325,3,C,A,86,0,True,0,0,3,0,0,False,False,False,28.5,0,29.578,1.5811,0.308333333333,0.0434782608696,0.0108695652174,0.0483870967742,0.0308641975309,0.04,0.0535714285714,0.0291262135922,0.0,E100D
3R,3999371,2,G,A,94,0,True,0,1,1,0,0,False,False,False,34.781,2,15.43,-0.89697,0.0166666666667,0.00724637681159,0.108695652174,0.0806451612903,0.037037037037,0.0690909090909,0.0178571428571,0.106796116505,0.0909090909091,S85F
3R,3999372,2,A,T,59,0,True,0,0,1,0,0,False,False,False,8.4453,0,13.773,-1.0166,0.0,0.0217391304348,0.0652173913043,0.0806451612903,0.0740740740741,0.0454545454545,0.0178571428571,0.0291262135922,0.0,S85T
3R,3999423,2,C,T,10,0,True,0,0,1,0,0,False,False,False,0.76514,1,14.672,-0.91895,0.0,0.0,0.0,0.0,0.00617283950617,0.0109090909091,0.0,0.0145631067961,0.0,D68N
3R,3999425,3,G,C,6,1,True,0,0,1,0,0,False,False,False,9.9297,0,16.422,1.4482,0.0,0.0,0.0,0.0,0.0123456790123,0.00181818181818,0.0,0.0145631067961,0.0,A67G
3R,3999731,3,G,C,33,1,True,0,1,2,0,0,False,False,False,2.543,0,12.672,4.0117,0.00833333333333,0.0217391304348,0.0326086956522,0.0,0.0308641975309,0.0272727272727,0.0,0.0291262135922,0.0,L13V
3R,3999763,2,G,A,85,0,True,0,0,2,0,0,False,False,False,0.46094,2,14.703,0.96094,0.175,0.036231884058,0.0217391304348,0.0645161290323,0.0555555555556,0.0472727272727,0.0357142857143,0.0679611650485,0.0,A2V


In [27]:
# header positions 7-19 of the missense table, i.e. FILTER_PASS through
# ReadPosRankSum in the header displayed above; these columns are cut out of the
# summary tables in the following cells.
qflds = tbl_variants_sg6_missense.header()[7:20]

In [28]:
(tbl_variants_sg6_missense
 .true('FILTER_PASS')
 .cutout(*qflds)
 .teetsv('sg6_missense.txt')
 .displayall(td_styles=td_styles, tr_style=tr_style))

0|CHROM,1|POS,2|num_alleles,3|REF,4|ALT,5|AC,6|ALTIX,7|AF_AOM,8|AF_BFM,9|AF_GWA,10|AF_GNS,11|AF_BFS,12|AF_CMS,13|AF_GAS,14|AF_UGS,15|AF_KES,16|AGAP000150-RA
X,2405832,2,T,A,5,0,0.0,0.0,0.0,0.0,0.0,0.0036496350365,0.0,0.0145631067961,0.0,E70D
X,2405931,2,A,C,1389,0,0.966666666667,0.869565217391,0.880434782609,0.935483870968,0.913580246914,0.892727272727,0.848214285714,0.932038834951,1.0,N37K
X,2405968,3,T,A,20,0,0.00833333333333,0.0144927536232,0.0108695652174,0.0483870967742,0.0,0.0145454545455,0.0,0.0242718446602,0.0,H25L
X,2405969,2,G,A,72,0,0.0,0.0507246376812,0.0108695652174,0.0483870967742,0.0740740740741,0.0509090909091,0.0535714285714,0.0679611650485,0.0113636363636,H25Y
X,2405982,3,T,G,186,1,0.225,0.0797101449275,0.25,0.177419354839,0.16049382716,0.127272727273,0.0,0.0873786407767,0.0,E20D
X,2406028,2,A,C,297,0,0.391666666667,0.188405797101,0.141304347826,0.161290322581,0.185185185185,0.174545454545,0.125,0.179611650485,0.272727272727,V5G


In [29]:
(tbl_variants_ce5_missense
 .true('FILTER_PASS')
 .cutout(*qflds)
 .teetsv('ce5_missense.txt')
 .displayall(td_styles=td_styles, tr_style=tr_style))

0|CHROM,1|POS,2|num_alleles,3|REF,4|ALT,5|AC,6|ALTIX,7|AF_AOM,8|AF_BFM,9|AF_GWA,10|AF_GNS,11|AF_BFS,12|AF_CMS,13|AF_GAS,14|AF_UGS,15|AF_KES,16|AGAP008004-RA
3R,3999325,3,C,A,86,0,0.308333333333,0.0434782608696,0.0108695652174,0.0483870967742,0.0308641975309,0.04,0.0535714285714,0.0291262135922,0.0,E100D
3R,3999371,2,G,A,94,0,0.0166666666667,0.00724637681159,0.108695652174,0.0806451612903,0.037037037037,0.0690909090909,0.0178571428571,0.106796116505,0.0909090909091,S85F
3R,3999372,2,A,T,59,0,0.0,0.0217391304348,0.0652173913043,0.0806451612903,0.0740740740741,0.0454545454545,0.0178571428571,0.0291262135922,0.0,S85T
3R,3999423,2,C,T,10,0,0.0,0.0,0.0,0.0,0.00617283950617,0.0109090909091,0.0,0.0145631067961,0.0,D68N
3R,3999425,3,G,C,6,1,0.0,0.0,0.0,0.0,0.0123456790123,0.00181818181818,0.0,0.0145631067961,0.0,A67G
3R,3999731,3,G,C,33,1,0.00833333333333,0.0217391304348,0.0326086956522,0.0,0.0308641975309,0.0272727272727,0.0,0.0291262135922,0.0,L13V
3R,3999763,2,G,A,85,0,0.175,0.036231884058,0.0217391304348,0.0645161290323,0.0555555555556,0.0472727272727,0.0357142857143,0.0679611650485,0.0,A2V


In [75]:
tbl_variants_sg6_missense_bi = tbl_variants_sg6_missense.eq('num_alleles', 2).true('FILTER_PASS')
(tbl_variants_sg6_missense_bi
 .cutout(*qflds)
 .displayall(td_styles=td_styles, tr_style=tr_style))

0|CHROM,1|POS,2|num_alleles,3|REF,4|ALT,5|AC,6|ALTIX,7|AF_AOM,8|AF_BFM,9|AF_GWA,10|AF_GNS,11|AF_BFS,12|AF_CMS,13|AF_GAS,14|AF_UGS,15|AF_KES,16|AGAP000150-RA
X,2405832,2,T,A,5,0,0.0,0.0,0.0,0.0,0.0,0.0036496350365,0.0,0.0145631067961,0.0,E70D
X,2405931,2,A,C,1389,0,0.966666666667,0.869565217391,0.880434782609,0.935483870968,0.913580246914,0.892727272727,0.848214285714,0.932038834951,1.0,N37K
X,2405969,2,G,A,72,0,0.0,0.0507246376812,0.0108695652174,0.0483870967742,0.0740740740741,0.0509090909091,0.0535714285714,0.0679611650485,0.0113636363636,H25Y
X,2406028,2,A,C,297,0,0.391666666667,0.188405797101,0.141304347826,0.161290322581,0.185185185185,0.174545454545,0.125,0.179611650485,0.272727272727,V5G


In [84]:
tbl_variants_ce5_missense_bi = tbl_variants_ce5_missense.eq('num_alleles', 2).true('FILTER_PASS')
(tbl_variants_ce5_missense_bi
 .cutout(*qflds)
 .displayall(td_styles=td_styles, tr_style=tr_style))

0|CHROM,1|POS,2|num_alleles,3|REF,4|ALT,5|AC,6|ALTIX,7|AF_AOM,8|AF_BFM,9|AF_GWA,10|AF_GNS,11|AF_BFS,12|AF_CMS,13|AF_GAS,14|AF_UGS,15|AF_KES,16|AGAP008004-RA
3R,3999371,2,G,A,94,0,0.0166666666667,0.00724637681159,0.108695652174,0.0806451612903,0.037037037037,0.0690909090909,0.0178571428571,0.106796116505,0.0909090909091,S85F
3R,3999372,2,A,T,59,0,0.0,0.0217391304348,0.0652173913043,0.0806451612903,0.0740740740741,0.0454545454545,0.0178571428571,0.0291262135922,0.0,S85T
3R,3999423,2,C,T,10,0,0.0,0.0,0.0,0.0,0.00617283950617,0.0109090909091,0.0,0.0145631067961,0.0,D68N
3R,3999763,2,G,A,85,0,0.175,0.036231884058,0.0217391304348,0.0645161290323,0.0555555555556,0.0472727272727,0.0357142857143,0.0679611650485,0.0,A2V


In [46]:
callset_phased = zarr.open_group('../ngs.sanger.ac.uk/production/ag1000g/phase1/AR3.1/haplotypes/main/zarr2/ag1000g.phase1.ar3.1.haplotypes', mode='r')
callset_phased

Group(/, 6)
  arrays: 1; samples
  groups: 5; 2L, 2R, 3L, 3R, X
  store: DirectoryStore

In [85]:
pos_x = allel.SortedIndex(callset_phased['X/variants/POS'])
pos_x

0,1,2,3,4,...,4219274,4219275,4219276,4219277,4219278
49,80,110,137,149,...,24316800,24316805,24316819,24333445,24334916


In [86]:
loc_sg6_missense = pos_x.locate_keys(tbl_variants_sg6_missense_bi.values('POS').list())
np.count_nonzero(loc_sg6_missense)

4

In [87]:
pos_3r = allel.SortedIndex(callset_phased['3R/variants/POS'])

In [89]:
loc_ce5_missense = pos_3r.locate_keys(tbl_variants_ce5_missense_bi.values('POS').list())
np.count_nonzero(loc_ce5_missense)

4

In [95]:
genotypes_x = allel.GenotypeDaskArray(callset_phased['X/calldata/genotype'])
genotypes_x

Unnamed: 0,0,1,2,3,4,...,726,727,728,729,730,Unnamed: 12
0,0/0,0/0,0/0,0/0,0/0,...,0/0,0/0,0/0,0/0,0/0,
1,0/0,0/0,0/0,0/0,0/0,...,0/0,0/0,0/0,0/0,0/0,
2,0/0,0/0,0/0,0/0,0/0,...,0/0,0/0,0/0,0/0,0/0,
...,...,...,...,...,...,...,...,...,...,...,...,...
4219276,0/0,0/0,0/0,0/0,0/0,...,0/0,0/0,0/0,0/0,0/0,
4219277,0/0,1/0,1/0,0/1,0/0,...,0/0,0/0,0/0,0/0,0/0,
4219278,1/0,0/0,0/0,0/0,0/0,...,1/0,1/1,0/0,0/0,1/0,


In [96]:
genotypes_3r = allel.GenotypeDaskArray(callset_phased['3R/calldata/genotype'])
genotypes_3r

Unnamed: 0,0,1,2,3,4,...,768,769,770,771,772,Unnamed: 12
0,0/0,0/0,0/0,0/0,0/0,...,0/0,0/0,0/0,0/0,0/0,
1,0/0,0/0,0/0,0/0,0/0,...,0/0,0/0,0/0,0/0,0/0,
2,0/0,0/0,0/0,0/0,0/0,...,0/0,0/0,0/0,0/0,0/0,
...,...,...,...,...,...,...,...,...,...,...,...,...
10178800,0/0,0/0,0/0,0/0,0/0,...,0/0,0/0,0/0,0/0,0/0,
10178801,0/0,0/0,0/0,0/0,0/0,...,0/0,0/0,0/0,0/0,0/0,
10178802,0/0,0/0,0/0,0/0,0/0,...,0/0,0/0,0/0,0/0,0/0,


In [56]:
# boolean mask over the rows of the sample metadata selecting the UGS population
# NOTE(review): the mask is applied below to the sample axis of the phased
# genotypes, which assumes the metadata rows line up with callset_phased's
# sample order — confirm.
loc_ugs = (samples.population == 'UGS').values
np.count_nonzero(loc_ugs)

103

In [58]:
genotypes_sg6_missense_ugs = genotypes_x.subset(loc_sg6_missense, loc_ugs).compute()
genotypes_sg6_missense_ugs

Unnamed: 0,0,1,2,3,4,...,98,99,100,101,102
0,0/0,0/0,0/0,0/0,0/0,...,0/0,0/0,0/0,0/0,0/0
1,1/1,1/1,1/1,1/1,1/1,...,1/1,1/1,1/1,1/1,1/1
2,0/0,0/0,0/0,0/0,0/0,...,0/0,0/0,0/0,0/0,0/0
3,0/0,0/0,0/0,0/0,0/0,...,0/0,0/0,0/0,0/0,0/1


In [73]:
haps_sg6_ugs = genotypes_sg6_missense_ugs.to_haplotypes()
haps_sg6_ugs

Unnamed: 0,0,1,2,3,4,...,201,202,203,204,205
0,0,0,0,0,0,...,0,0,0,0,0
1,1,1,1,1,1,...,1,1,1,1,1
2,0,0,0,0,0,...,0,0,0,0,0
3,0,0,0,0,0,...,0,0,0,0,1


In [93]:
# print each distinct SG6 haplotype (one character per missense site) with its frequency
hd = haps_sg6_ugs.distinct()
hdf = haps_sg6_ugs.distinct_frequencies()
for s, f in zip(hd, hdf):
    # s groups the columns sharing one haplotype; show the first as representative
    representative = haps_sg6_ugs[:, list(s)[0]]
    print(''.join(representative.astype('U')), f)

0100 0.71359223301
0101 0.150485436893
0000 0.0728155339806
0110 0.0485436893204
1100 0.00970873786408
0001 0.00485436893204


In [97]:
genotypes_ce5_missense_ugs = genotypes_3r.subset(loc_ce5_missense, loc_ugs).compute()
genotypes_ce5_missense_ugs

Unnamed: 0,0,1,2,3,4,...,98,99,100,101,102
0,0/0,1/0,0/0,0/0,0/0,...,0/0,0/1,0/0,0/0,0/0
1,0/0,0/0,0/0,0/0,0/0,...,0/0,0/0,0/0,0/0,0/0
2,0/0,0/0,0/0,0/0,0/0,...,0/0,0/0,0/0,0/0,0/0
3,0/0,0/0,0/0,0/0,0/0,...,0/0,0/0,0/0,0/0,0/0


In [98]:
haps_ce5_ugs = genotypes_ce5_missense_ugs.to_haplotypes()
haps_ce5_ugs

Unnamed: 0,0,1,2,3,4,...,201,202,203,204,205
0,0,0,1,0,0,...,0,0,0,0,0
1,0,0,0,0,0,...,0,0,0,0,0
2,0,0,0,0,0,...,0,0,0,0,0
3,0,0,0,0,0,...,0,0,0,0,0


In [99]:
# print each distinct CE5 haplotype (one character per missense site) with its frequency
hd = haps_ce5_ugs.distinct()
hdf = haps_ce5_ugs.distinct_frequencies()
for s, f in zip(hd, hdf):
    # s groups the columns sharing one haplotype; show the first as representative
    representative = haps_ce5_ugs[:, list(s)[0]]
    print(''.join(representative.astype('U')), f)

0000 0.791262135922
1000 0.101941747573
0001 0.0582524271845
0100 0.0242718446602
0010 0.0145631067961
1001 0.00485436893204
0101 0.00485436893204


## Inspect splice site variants

In [41]:
def simplify_intron_effect(v):
    """Reduce a splice effect tuple to its nearer CDS and the distance to it.

    For a non-empty `v` whose ``v[0]`` is ``'SPLICE_REGION'`` or
    ``'SPLICE_CORE'``, compares the absolute values of ``v[2]`` and ``v[4]``
    and returns ``(v[1], v[2])`` when ``v[2]`` is strictly smaller in
    magnitude, else ``(v[3], v[4])``. Returns the empty string for any other
    input, including ``None``.
    """
    if not (v and v[0] in ['SPLICE_REGION', 'SPLICE_CORE']):
        return ''
    first_is_nearer = math.fabs(v[2]) < math.fabs(v[4])
    return (v[1], v[2]) if first_is_nearer else (v[3], v[4])

    
# Per-column cell styling for the table display: column name -> function of the
# cell value returning a CSS style string ('' means no highlighting).
# NOTE(review): this repeats, entry for entry, the td_styles dict already defined
# in the missense section earlier in this notebook.
td_styles = {
    'FILTER_PASS': lambda v: 'background-color: red' if not v else '',
    'NoCoverage': lambda v: 'background-color: red' if v > 1 else '',
    'LowCoverage': lambda v: 'background-color: red' if v > 76 else '',
    'HighCoverage': lambda v: 'background-color: red' if v > 15 else '',
    'LowMQ': lambda v: 'background-color: red' if v > 76 else '',
    'HighMQ0': lambda v: 'background-color: red' if v > 1 else '',
    'RepeatDUST': lambda v: 'background-color: red' if v else '',
    'FS': lambda v: 'background-color: red' if v > 60 else '',
    'QD': lambda v: 'background-color: red' if v < 5 else '',
    'ReadPosRankSum': lambda v: 'background-color: red' if v < -8 else '',
    'HRun': lambda v: 'background-color: red' if v > 4 else '',
    'num_alleles': lambda v: 'background-color: orange' if v > 2 else '',
}


def tr_style(row):
    """Return a green row background whose opacity grows with the row's AC.

    The alpha channel is ``row['AC'] / 100`` capped at 1 and rendered with
    three decimals.
    """
    opacity = min(1, row['AC']/100)
    css = 'background-color:rgba(0, 255, 0, %.3f)' % opacity
    return css


# SG6 splice table: keep rows whose AGAP000150-RA effect is SPLICE_REGION or
# SPLICE_CORE and reduce that effect with simplify_intron_effect.
# NOTE: the output saved below shows only the header row, i.e. no variant matched.
tbl_variants_sg6_splice = (
    tbl_variants_sg6_eff
    .select(lambda row: row['AGAP000150-RA'] and row['AGAP000150-RA'][0] in ['SPLICE_REGION', 'SPLICE_CORE'])
    .convert('AGAP000150-RA', simplify_intron_effect)
)
tbl_variants_sg6_splice.displayall(td_styles=td_styles, tr_style=tr_style)

0|CHROM,1|POS,2|num_alleles,3|REF,4|ALT,5|AC,6|ALTIX,7|FILTER_PASS,8|NoCoverage,9|LowCoverage,10|HighCoverage,11|LowMQ,12|HighMQ0,13|RepeatDUST,14|RepeatMasker,15|RepeatTRF,16|FS,17|HRun,18|QD,19|ReadPosRankSum,20|SNPEFF_Allele,21|SNPEFF_Annotation,22|SNPEFF_HGVS_c,23|SNPEFF_HGVS_p,24|SNPEFF_Feature_ID,25|AF_AOM,26|AF_BFM,27|AF_GWA,28|AF_GNS,29|AF_BFS,30|AF_CMS,31|AF_GAS,32|AF_UGS,33|AF_KES,34|check_allele,35|AGAP000150-RA
