Prepare FASTA files and source modifier tables for GenBank submission

In [1]:
from Bio import AlignIO, SeqIO
import os
import pandas as pd

In [2]:
# --- inputs ---
# per-sample sequence metadata table (read with pandas below)
SEQ_META = 'data/species_predictions.csv'
# partner-supplied sample metadata table (merged in for the source modifier tables)
PARTNER_META = '../7_species_id/data/0_samples.csv'
# working directory holding the input FASTA files; outputs are written here too
WD = '../../../data/phylo_ampl_dada2/coi_its2/work/'
# per-plate FASTA template; '{}' is filled with the plate number
FA = os.path.join(WD, 'seqman_fa/plate{}.fas')

# --- outputs ---
# '{}' in both templates is filled with the marker key ('coi' or 'its')
GB_FA = os.path.join(WD, 'genbank_{}.fas')
GB_SM = os.path.join(WD, 'genbank_{}.tsv')

In [3]:
# sample IDs whose species label is overridden with 'Anopheles sp.'
# when the sequence metadata is loaded
species_conflict = [
    'Amar-42',
    'Amar-5',
    'VBS00145',
]

In [4]:
# Load the per-sample sequence metadata; the first CSV column (sample ID)
# becomes the index.
smeta = pd.read_csv(SEQ_META, index_col=0)

# Missing partner species become '' so the string handling below is safe.
smeta['partner_species'] = smeta['partner_species'].fillna('')

# Organism = first two '_'-separated tokens of partner_species, joined by a space.
smeta['Organism'] = [
    ' '.join(name.split('_')[:2]) for name in smeta['partner_species']
]

# Samples with conflicting species calls are labelled at genus level only.
in_conflict = smeta.index.isin(species_conflict)
smeta.loc[in_conflict, ['partner_species', 'Organism']] = 'Anopheles sp.'

smeta.head()

Unnamed: 0,partner_species,amplicons_recovered,ampseq_species,bold_best_id,bold_top_%,bold_low_%,COI_length,COI_num_alignments,COI_species_predictions,COI_top10_species,...,ITS2_second_hit,ITS2_second_hit_species,ITS2_second_hit_hsps,ITS2_second_hit_length,ITS2_second_hit_identities,ITS2_second_hit_identity,ITS2_second_hit_score,COI_seqid,ITS2_seqid,Organism
Abro-21,Anopheles_brohieri,21.0,"Anopheles_brohieri, Anopheles_demeilloni, Anop...",,,,,,,,...,gi|374676298|gb|JN994151.1|,Anopheles_theileri,1.0,481.0,438.0,91.060291,344.0,,bro.21_A10-ITS2A.ab1,Anopheles brohieri
Abro-22,Anopheles_brohieri,62.0,Anopheles_brohieri,No_match,,,391.0,500.0,,"Anopheles_sp., Anopheles_theileri, Anopheles_a...",...,gi|374676298|gb|JN994151.1|,Anopheles_theileri,1.0,481.0,438.0,91.060291,344.0,bro.22_B10-HCO2198.ab1,bro.22_B10-ITS2A.ab1,Anopheles brohieri
Abro-30,Anopheles_brohieri,59.0,Anopheles_brohieri,No_match,,,616.0,500.0,,"Anopheles_sp., Anopheles_minimus, Anopheles_cf...",...,gi|374676298|gb|JN994151.1|,Anopheles_theileri,1.0,481.0,438.0,91.060291,344.0,bro.30_C10-HCO2198.ab1,bro.30_C10-ITS2A.ab1,Anopheles brohieri
Abro-33,Anopheles_brohieri,38.0,"Anopheles_brohieri, Anopheles_demeilloni, Anop...",No_match,,,616.0,500.0,,"Anopheles_sp., Anopheles_minimus, Anopheles_cf...",...,gi|374676298|gb|JN994151.1|,Anopheles_theileri,1.0,480.0,439.0,91.458333,350.0,bro.33_D10-HCO2198.ab1,bro.33_D10-ITS2A.ab1,Anopheles brohieri
Acar-191,Anopheles_carnevalei,49.0,Anopheles_carnevalei,Anopheles_carnevalei,99.5,92.59,620.0,500.0,,"Anopheles_darlingi, Anopheles_coustani, Anophe...",...,gi|471756220|gb|KC189966.1|,Anopheles_nili,1.0,400.0,334.0,83.5,183.0,car.191_A01-HCO2198.ab1,car.191_B11-ITS2A.ab1,Anopheles carnevalei


In [5]:
# Per-marker lookup tables, keyed by the short marker name ('coi' / 'its').

# text appended to each FASTA definition line
marker_names = dict(
    coi='cytochrome oxidase subunit I (COI) gene, partial cds; mitochondrial',
    its='5.8S ribosomal RNA gene, partial sequence; internal transcribed spacer 2, complete sequence; and 28S ribosomal RNA gene, partial sequence',
)

# plate numbers whose FASTA files hold each marker's sequences
marker_plates = dict(coi=(1, 2), its=(3, 4))

# metadata column that stores the sequence ID for each marker
marker_seqid_col = dict(coi='COI_seqid', its='ITS2_seqid')



In [6]:
# sequence data
# For every marker, read its plates' FASTA files, look each record up in the
# sequence metadata, attach a GenBank-style definition line and write all
# retained records to one FASTA file per marker.
for marker in ('coi','its'):
    out_seqs = list()
    print('marker',marker)
    for plate in marker_plates[marker]:
        in_seqs = SeqIO.parse(FA.format(plate), format='fasta')
        print('plate',plate)
        for seq in in_seqs:
            # match the record to its metadata row via the marker's seqid column
            q = '{} == "{}"'.format(marker_seqid_col[marker], seq.name)
            marker_data = smeta.query(q)
            # each sequence must map to exactly one sample
            assert marker_data.shape[0] == 1, q
            sample_id = marker_data.index[0]
            # .iloc[0]: the index holds sample IDs (strings), so integer `[0]`
            # would rely on pandas' deprecated positional fallback
            species_tax = marker_data['Organism'].iloc[0]
            species = marker_data['partner_species'].iloc[0].replace('_',' ')
            # no partner species - sample excluded from publication
            if species == '':
                continue
            title = '[organism={species_tax}] {species} isolate {sample_id} {marker}'.format(
                      species_tax=species_tax,
                      species=species,
                      sample_id=sample_id,
                      marker=marker_names[marker])
            seq.description = title
            # we sequenced COI from a reverse primer
            if marker == 'coi':
                seq.seq = seq.seq.reverse_complement()
            out_seqs.append(seq)
    SeqIO.write(out_seqs, GB_FA.format(marker), format='fasta')


marker coi
plate 1
plate 2
marker its
plate 3
plate 4


In [7]:
# prep for source modifier tables
pmeta = pd.read_csv(PARTNER_META, index_col=0)
meta = pd.merge(smeta, pmeta, left_index=True, right_index=True)

In [8]:
# remove duplicated theileri sample
# shape before de-duplication
display(meta.shape)
# keep the first row for every repeated index label
is_repeat = meta.index.duplicated(keep='first')
meta = meta.loc[~is_repeat]
# shape after de-duplication
meta.shape

(125, 61)

(124, 61)

In [9]:
# Rename partner metadata columns: original name -> name used downstream.
col_mapping = dict([
    ('Specimen Sex', 'Sex'),
    ('Collection Context', 'Isolation_source'),
])
meta = meta.rename(columns=col_mapping)

In [10]:
# the isolate name is the sample ID held in the index
meta = meta.assign(Isolate=meta.index)

In [11]:
def lat_lon(r):
    """Format a row's coordinates as '<lat> N|S <lon> E|W'.

    Parameters
    ----------
    r : object with numeric ``Latitude`` and ``Longitude`` attributes
        (for example a DataFrame row). Negative values are rendered as
        S / W, zero and positive values as N / E.

    Returns
    -------
    str
        Absolute values with five decimals followed by the hemisphere
        letter, e.g. '1.50000 S 30.25000 E'. An empty string is returned
        when either coordinate is missing, instead of text like
        'nan S nan W'.
    """
    # NaN compares False against 0, so missing values would otherwise be
    # labelled 'S'/'W' and printed as 'nan'
    if pd.isna(r.Latitude) or pd.isna(r.Longitude):
        return ''
    lat_suffix = ('N' if r.Latitude >= 0 else 'S')
    lon_suffix = ('E' if r.Longitude >= 0 else 'W')
    return '{:.5f} {} {:.5f} {}'.format(
            abs(r.Latitude),lat_suffix,
            abs(r.Longitude),lon_suffix)
# one formatted coordinate string per sample (row-wise apply)
meta = meta.assign(Lat_Lon=meta.apply(lat_lon, axis=1))

In [12]:
# parse the collection dates, then render them as DD-Mon-YYYY
collection_dates = pd.to_datetime(meta['Date of Collection'])
meta['Collection_date'] = collection_dates.dt.strftime('%d-%b-%Y')

In [13]:
# columns copied into every source modifier table, after Sequence_ID
add_cols = [
    'Collection_date',
    'Country',
    'Isolate',
    'Isolation_source',
    'Organism',
    'Lat_Lon'
]
# source modifier tables
# One tab-separated table per marker, one row per submitted sequence.
for marker in ('coi','its'):
    mid = marker_seqid_col[marker]
    smdf = meta[[mid, *add_cols]].rename(columns={mid:'Sequence_ID'})
    # keep only samples that have a sequence for this marker
    has_seq = smdf['Sequence_ID'].notna()
    # the FASTA export skips samples without a partner species (empty
    # Organism); skip them here too so both files list the same sequences
    has_species = smdf['Organism'] != ''
    smdf = smdf[has_seq & has_species]
    smdf.to_csv(GB_SM.format(marker), sep='\t', index=False)

## Sandbox

In [14]:
# translation for coi
# Sanity check: find a reading frame of the exported COI sequence that
# translates without a stop codon under NCBI translation table 5.
prot_seqs = []
for record in SeqIO.parse(GB_FA.format('coi'), format='fasta'):
    # no reverse complement here - the COI records were already
    # reverse-complemented before being written to this file
    for frame in range(3):
        candidate = record[frame:].translate(table=5)
        if '*' in candidate:
            # stop codon in this frame, try the next offset
            continue
        candidate.name = record.name
        prot_seqs.append(candidate)
        break
    else:
        # all three frames contain a stop codon
        raise ValueError(record.name)

    # sandbox: only the first record is checked
    break
print(prot_seqs)

[SeqRecord(seq=Seq('KDIGTLYFIFGAWAGMVGTSLSILIRAELGHPGAFIGDDQIYNVIVTAHAFIMI...NLN', ExtendedIUPACProtein()), id='<unknown id>', name='VBS00156_E09-HCO2198.ab1', description='<unknown description>', dbxrefs=[])]


