In [64]:
import pandas as pd
import pybedtools

In [65]:
# import annotation beds
genes = pybedtools.BedTool('/Users/am60/data/genome/agamP3/Anopheles-gambiae-PEST_BASEFEATURES_AgamP3.8.gff3.gz')
repeats = pybedtools.BedTool('/Users/am60/data/genome/agamP3/Anopheles-gambiae-PEST_REPEATFEATURES_AgamP3.gff3.gz')

# import amplicon data and transform to bed
amplicon_data = pd.read_csv('/Users/am60/malaria/20180129_phylo_ampliseq/20180324_long_ampl_stats.csv', index_col=0)
amplicon_data['chr'] = [seq[10:] for seq in amplicon_data['seqid']]
amplicon_bed = amplicon_data[['chr', 'start', 'end']].to_string(header=False, index=False)
amplicon_beds = amplicon_bed.split('\n')
print(len(amplicon_beds))
amplicon_beds[:2]

74


['2L   5759550   5759934', '2L   6533965   6534240']

In [66]:
def bt_to_df(bt):
    '''
    Convert bedtool to pandas dataframe replacing empty files with None'''
    if len(bt) > 0:
        return bt.to_dataframe()
    else:
        return None

# collect gene and repeat features found in amplicons
ampl_annot = []
for a in amplicon_beds:
    a_bed = pybedtools.BedTool(a, from_string=True) 
    ag_gff = genes.intersect(a_bed)
    ar_gff = repeats.intersect(a_bed)
    ampl_annot.append([bt_to_df(a_bed), pd.concat([bt_to_df(ag_gff), bt_to_df(ar_gff)])])
    
print(ampl_annot[1])

[  chrom    start      end
0    2L  6533965  6534240,   seqname      source feature    start      end score strand frame  \
0      2L  VectorBase  contig  6533966  6534240     .      .     .   
1      2L  VectorBase    gene  6533966  6534240     .      +     .   
2      2L  VectorBase    mRNA  6533966  6534240     .      +     .   
3      2L  VectorBase    exon  6533970  6534240     .      +     .   
4      2L  VectorBase     CDS  6533970  6534240     .      +     2   
5      2L  VectorBase    mRNA  6533966  6534240     .      +     .   
6      2L  VectorBase    exon  6533970  6534240     .      +     .   
7      2L  VectorBase     CDS  6533970  6534240     .      +     2   

                                                    attributes  
0  ID=2L;molecule_type=dsDNA;GenBank:chromosome:AgamP3:2L:1...  
1                        ID=AGAP004943;biotype=protein_coding;  
2  ID=AGAP004943-RA;Parent=AGAP004943;Dbxref=RefSeq_NA:XM_0...  
3                            ID=E015172A;Parent=AGAP004

In [67]:
# all features in combined annotation
pd.concat([x[1].feature for x in ampl_annot]).unique()

array(['contig', 'gene', 'mRNA', 'exon', 'CDS', 'five_prime_utr',
       'repeat', 'three_prime_utr'], dtype=object)

In [68]:
# multiple exons spanned 
pd.options.display.max_colwidth = 60
ampl_annot[16][1]

Unnamed: 0,seqname,source,feature,start,end,score,strand,frame,attributes
0,2L,VectorBase,contig,45871632,45871844,.,.,.,ID=2L;molecule_type=dsDNA;GenBank:chromosome:AgamP3:2L:1...
1,2L,VectorBase,gene,45871632,45871844,.,+,.,ID=AGAP007340;biotype=protein_coding;
2,2L,VectorBase,mRNA,45871632,45871844,.,+,.,ID=AGAP007340-RB;Parent=AGAP007340;Dbxref=RefSeq_NA:XM_3...
3,2L,VectorBase,exon,45871665,45871666,.,+,.,ID=E025387B;Parent=AGAP007340-RB;
4,2L,VectorBase,exon,45871686,45871844,.,+,.,ID=E025388B;Parent=AGAP007340-RB;
5,2L,VectorBase,CDS,45871665,45871666,.,+,2,Parent=AGAP007340-RB;
6,2L,VectorBase,CDS,45871686,45871844,.,+,0,Parent=AGAP007340-RB;
7,2L,VectorBase,mRNA,45871632,45871844,.,+,.,ID=AGAP007340-RA;Parent=AGAP007340;Dbxref=RefSeq_NA:XM_3...
8,2L,VectorBase,exon,45871665,45871666,.,+,.,ID=E025387A;Parent=AGAP007340-RA;
9,2L,VectorBase,exon,45871686,45871844,.,+,.,ID=E025388A;Parent=AGAP007340-RA;


In [69]:
# five_prime annotation
ampl_annot[17][1]

Unnamed: 0,seqname,source,feature,start,end,score,strand,frame,attributes
0,2R,VectorBase,contig,1760379,1760568,.,.,.,ID=2R;molecule_type=dsDNA;GenBank:chromosome:AgamP3:2R:1...
1,2R,VectorBase,gene,1760379,1760568,.,-,.,ID=AGAP001235;biotype=protein_coding;
2,2R,VectorBase,mRNA,1760379,1760568,.,-,.,ID=AGAP001235-RA;Parent=AGAP001235;Dbxref=RefSeq_NA:XM_3...
3,2R,VectorBase,exon,1760379,1760568,.,-,.,ID=E058022A;Parent=AGAP001235-RA;
4,2R,VectorBase,five_prime_utr,1760444,1760568,.,-,.,Parent=AGAP001235-RA
5,2R,VectorBase,CDS,1760379,1760443,.,-,0,Parent=AGAP001235-RA;
6,2R,VectorBase,mRNA,1760379,1760568,.,-,.,ID=AGAP001235-RB;Parent=AGAP001235;Dbxref=RefSeq_NA:XM_0...
7,2R,VectorBase,exon,1760379,1760502,.,-,.,ID=E038474B;Parent=AGAP001235-RB;
8,2R,VectorBase,five_prime_utr,1760444,1760502,.,-,.,Parent=AGAP001235-RB
9,2R,VectorBase,CDS,1760379,1760443,.,-,0,Parent=AGAP001235-RB;


In [70]:
# repeat annotation
ampl_annot[20][1]

Unnamed: 0,seqname,source,feature,start,end,score,strand,frame,attributes
0,2R,VectorBase,contig,2218578,2218918,.,.,.,ID=2R;molecule_type=dsDNA;GenBank:chromosome:AgamP3:2R:1...
1,2R,VectorBase,gene,2218578,2218918,.,-,.,ID=AGAP001279;biotype=protein_coding;
2,2R,VectorBase,mRNA,2218578,2218918,.,-,.,ID=AGAP001279-RA;Parent=AGAP001279;Dbxref=RefSeq_NA:XM_3...
3,2R,VectorBase,exon,2218578,2218918,.,-,.,ID=E038650A;Parent=AGAP001279-RA;
4,2R,VectorBase,CDS,2218578,2218918,.,-,0,Parent=AGAP001279-RA;
0,2R,VectorBase,repeat,2218578,2218613,.,+,.,Name=trf; Note=trf;


In [71]:
# combine data
aa_data = pd.concat([pd.concat(x) for x in ampl_annot])
aa_data['seqid'] = aa_data['chrom'].fillna('') + aa_data['seqname'].fillna('')
cols = ['seqid','source','feature','start','end','score','strand','frame','attributes']
aa_data = aa_data[cols]
aa_data.head()

Unnamed: 0,seqid,source,feature,start,end,score,strand,frame,attributes
0,2L,,,5759550,5759934,,,,
0,2L,VectorBase,contig,5759551,5759934,.,.,.,ID=2L;molecule_type=dsDNA;GenBank:chromosome:AgamP3:2L:1...
1,2L,VectorBase,gene,5759551,5759934,.,+,.,ID=AGAP004902;biotype=protein_coding;
2,2L,VectorBase,mRNA,5759551,5759934,.,+,.,ID=AGAP004902-RA;Parent=AGAP004902;Dbxref=RefSeq_NA:XM_0...
3,2L,VectorBase,exon,5759551,5759934,.,+,.,ID=E015026A;Parent=AGAP004902-RA;


In [72]:
def get_attrs(d, feature, attr_id):
    out = []
    for attr in d[d.feature == feature]['attributes']:
        adict = dict()
        for a in attr.split(';'):
            aa = a.split('=')
            if len(aa) == 2:
                adict[aa[0]] = aa[1]
        try:
            out.append(adict[attr_id])
        except:
            pass
        
    return ';'.join(out)
attr_dict = dict()
for (i, x) in enumerate(ampl_annot):
    d = x[1]
    attr_dict[i] = {
    'gene': get_attrs(d, 'gene', 'ID'),
    'mRNA': get_attrs(d, 'mRNA', 'ID'),
    'exon': get_attrs(d, 'exon', 'ID'),
    'repeat': get_attrs(d, 'repeat', 'Name'),
    }
    attr_dict[i]['utr'] = ('Yes' if ('utr' in str(d['feature'])) else '')
    attr_dict[i]['intron'] = ('Yes' if (attr_dict[i]['mRNA'].count(';') < attr_dict[i]['exon'].count(';')) else '')

In [73]:
'utr' in str(ampl_annot[17][1]['feature'])

True

In [74]:
add_attr = pd.DataFrame(attr_dict).T[['gene','mRNA','exon','intron','utr','repeat']]
add_attr.head()
    

Unnamed: 0,gene,mRNA,exon,intron,utr,repeat
0,AGAP004902,AGAP004902-RA,E015026A,,,
1,AGAP004943,AGAP004943-RA;AGAP004943-RB,E015172A;E015175B,,,
2,AGAP004943,AGAP004943-RA;AGAP004943-RB,E015172A;E015175B,,,
3,AGAP004950,AGAP004950-RA,E015203A,,,
4,,,,,,


In [75]:
updated_data = pd.concat([amplicon_data, add_attr], axis=1)
updated_data.head()

Unnamed: 0,seqid,start,end,aligned_len,snvs,indels,target_start,target_end,target_aligned_len,target_snvs,target_indels,unid_species,chr,gene,mRNA,exon,intron,utr,repeat
0,AgamP3.chr2L,5759550,5759934,405,43,21,5759855,5759874,34,8,15,"[{'AquaS1', 'AgamP3', 'AaraD1', 'AgamM1', 'AmelC1', 'Aga...",2L,AGAP004902,AGAP004902-RA,E015026A,,,
1,AgamP3.chr2L,6533965,6534240,275,36,1,6534054,6534097,43,13,0,"[{'AquaS1', 'AgamP3', 'AepiE1', 'AaraD1', 'AgamM1', 'Ame...",2L,AGAP004943,AGAP004943-RA;AGAP004943-RB,E015172A;E015175B,,,
2,AgamP3.chr2L,6534097,6534337,240,34,0,6534240,6534272,32,8,0,"[{'AfunF1', 'AquaS1', 'AgamP3', 'AsteS1', 'AmacM1', 'Aar...",2L,AGAP004943,AGAP004943-RA;AGAP004943-RB,E015172A;E015175B,,,
3,AgamP3.chr2L,6993399,6993612,220,28,7,6993469,6993506,37,13,0,"[{'AquaS1', 'AgamP3', 'AaraD1', 'AgamM1', 'AgamS1', 'Ame...",2L,AGAP004950,AGAP004950-RA,E015203A,,,
4,AgamP3.chr2L,13325151,13325363,225,23,16,13325250,13325273,29,7,6,"[{'AquaS1', 'AgamP3', 'AaraD1', 'AgamM1', 'AmelC1', 'Ach...",2L,,,,,,


In [76]:
ud_file = '/Users/am60/malaria/20180129_phylo_ampliseq/20180327_amplicon_data.csv'
updated_data.to_csv(ud_file)

In [98]:
# write to file
#BAD - combination of 1-based gff and 0-based bed
aa_file = '/Users/am60/malaria/20180129_phylo_ampliseq/20180327_amplicon_annotations.tsv'
aa_data.to_csv(aa_file, sep='\t', index=False)

In [90]:
ampl_annot[0][0]

Unnamed: 0,chrom,start,end
0,2L,5759550,5759934
