Blast COI and ITS2 sequences against nt and interpret results within a range of sequence similarity - 98% identical or higher for ITS2, 96% for COI. 

Use 7_species_id conda environment

In [1]:
from Bio.Blast import NCBIXML
from Bio import AlignIO, SeqIO
import os
import pandas as pd
from collections import defaultdict

In [2]:
# in
# working directory holding the per-plate inputs; '{}' in the templates below
# is filled with the plate number via str.format
WD = '../../../data/phylo_ampl_dada2/coi_its2/work/'
FA = os.path.join(WD, 'seqman_fa/plate{}.fas')
XML = os.path.join(WD, 'blast_xml/plate{}.xml')
AMPSEQ_PRED = '../7_species_id/data/4_spp_predictions.csv'
SAMPLE_META = '../7_species_id/data/1_sample_seq.csv'
BOLD_TSV = os.path.join(WD, 'bold/bold_plate{}.tsv') # generated from BOLD_SEQ, see below

# out
BOLD_SEQ = os.path.join(WD, 'bold/bold_plate{}.fas')
OD = 'data'
SPP_PREDS = os.path.join(OD, 'species_predictions.csv')

# within-species parameters
# minimum identity (identical positions / alignment length of a hit's first HSP)
# for a hit species to be reported as a species prediction
COI_WITHINSP_IDENTITY = 0.96
ITS2_WITHINSP_IDENTITY = 0.98
# minimum alignment length of a hit's first HSP relative to the query length;
# shorter matches are not considered for species predictions
MIN_WITHINSP_ALIGNED = 0.75

In [3]:
# create the output directory (and any missing parents); no error if it already exists
os.makedirs(OD, exist_ok=True)

## Sequence for BOLD

The sequences need to be reverse complemented. Plates are stored separately, as a combined submission (127 sequences) raises an error. 

Once created, the contents of each fasta file are pasted into the BOLD identification interface at http://www.boldsystems.org/index.php/IDS_OpenIdEngine. Note that you need to be signed in to submit multiple sequences at once. The resulting files have an xls extension, but they are plain-text TSVs.

In [4]:
# reverse complement every sequence of plates 1 and 2 and write one fasta file per plate
for plate_no in (1, 2):
    flipped = []
    for rec in SeqIO.parse(FA.format(plate_no), format='fasta'):
        rec.seq = rec.seq.reverse_complement()
        flipped.append(rec)

    SeqIO.write(flipped, BOLD_SEQ.format(plate_no), 'fasta')

## Parse BLAST results

In [5]:
def _sample_name(query):
    """Derive the sample ID from a BLAST query name.

    Names starting with 'empty' are returned unchanged. Otherwise the part
    before the first underscore is used; if that part does not start with
    'VBS' (african samples), it gets an 'A' prefix and dots are replaced with
    dashes, e.g. 'bro.22_B10-ITS2A.ab1' -> 'Abro-22'.
    """
    if query.startswith('empty'):
        return query
    name = query.split('_')[0]
    if not name.startswith('VBS'):
        name = 'A' + name.replace('.', '-')
    return name


def _hit_species(hit_def):
    """Return the first two space-separated words of a hit definition joined by '_'."""
    return '_'.join(hit_def.split(' ')[:2])


# sample ID -> {column name -> value}; both markers of a sample end up in the same inner dict
blast_dict = defaultdict(dict)

for plate in range(1, 5):
    # plates 1 and 2 are treated as COI, the remaining plates as ITS2
    is_coi = plate in [1, 2]
    marker = 'COI' if is_coi else 'ITS2'
    # within-species identity thresholds COI/ITS2
    withinsp_threshold = COI_WITHINSP_IDENTITY if is_coi else ITS2_WITHINSP_IDENTITY

    # NCBIXML.parse yields records lazily, so they are consumed while the file
    # is open; the context manager closes the handle once the plate is done
    with open(XML.format(plate)) as xml_handle:
        for record in NCBIXML.parse(xml_handle):
            marker_seq_name = record.query
            seq_name = _sample_name(marker_seq_name)
            sample_row = blast_dict[seq_name]

            sample_row[marker+'_seqid'] = marker_seq_name
            sample_row[marker+'_length'] = record.query_length
            sample_row[marker+'_num_alignments'] = len(record.alignments)

            match_spp = []     # all matching species, in hit order
            withinsp_spp = []  # matching species with high identity
            for aln in record.alignments:
                sp = _hit_species(aln.hit_def)
                if sp not in match_spp:
                    match_spp.append(sp)
                # only the first HSP of each hit is evaluated
                top_hsp = aln.hsps[0]
                # do not consider short matches
                if top_hsp.align_length / record.query_length < MIN_WITHINSP_ALIGNED:
                    continue
                # calculate identity
                identity = top_hsp.identities / top_hsp.align_length
                if identity >= withinsp_threshold and sp not in withinsp_spp:
                    withinsp_spp.append(sp)

            sample_row[marker+'_species_predictions'] = ', '.join(withinsp_spp)
            # slicing keeps at most the first ten species
            sample_row[marker+'_top10_species'] = ', '.join(match_spp[:10])

            # hit data for the two best hits; zip stops early if there are not enough hits
            for h, hit in zip(['first', 'second'], record.alignments):
                prefix = '{}_{}_hit'.format(marker, h)
                top_hsp = hit.hsps[0]
                sample_row[prefix] = hit.hit_id
                sample_row[prefix+'_species'] = _hit_species(hit.hit_def)
                sample_row[prefix+'_hsps'] = len(hit.hsps)
                sample_row[prefix+'_length'] = top_hsp.align_length
                sample_row[prefix+'_identities'] = top_hsp.identities
                # identity of the first HSP as a percentage
                sample_row[prefix+'_identity'] = top_hsp.identities / top_hsp.align_length * 100
                sample_row[prefix+'_score'] = top_hsp.score

In [6]:
# the outer keys of blast_dict (samples) become columns first; transposing
# turns them into rows, with one column per collected BLAST field
blast_df = pd.DataFrame(blast_dict).T
blast_df

Unnamed: 0,COI_first_hit,COI_first_hit_hsps,COI_first_hit_identities,COI_first_hit_identity,COI_first_hit_length,COI_first_hit_score,COI_first_hit_species,COI_length,COI_num_alignments,COI_second_hit,...,ITS2_second_hit,ITS2_second_hit_hsps,ITS2_second_hit_identities,ITS2_second_hit_identity,ITS2_second_hit_length,ITS2_second_hit_score,ITS2_second_hit_species,ITS2_seqid,ITS2_species_predictions,ITS2_top10_species
VBS00156,gi|1699503525|gb|MK685245.1|,1,596,98.8391,603,582,Anopheles_vagus,616,500,gi|1578806696|gb|MH425442.1|,...,gi|1698454428|gb|MN148589.1|,1,650,100,650,650,Anopheles_vagus,VBS00156_A06-ITS2A.ab1,"Anopheles_vagus, Anopheles_sp.","Anopheles_vagus, Anopheles_sp., Anopheles_subp..."
VBS00157,gi|1699503525|gb|MK685245.1|,1,598,99.1708,603,588,Anopheles_vagus,616,500,gi|1578806696|gb|MH425442.1|,...,gi|1698454428|gb|MN148589.1|,1,650,100,650,650,Anopheles_vagus,VBS00157_E06-ITS2A.ab1,"Anopheles_vagus, Anopheles_sp.","Anopheles_vagus, Anopheles_sp., Anopheles_subp..."
VBS00158,gi|1755716158|gb|MK628547.1|,1,607,99.183,612,597,Anopheles_vagus,616,500,gi|1383770353|gb|MF179260.1|,...,gi|1698454428|gb|MN148589.1|,1,645,100,645,645,Anopheles_vagus,VBS00158_H06-ITS2A.ab1,"Anopheles_vagus, Anopheles_sp.","Anopheles_vagus, Anopheles_sp., Anopheles_subp..."
Abro-22,gi|656438847|gb|KJ522835.1|,1,368,94.8454,388,328,Anopheles_sp.,391,500,gi|656438851|gb|KJ522837.1|,...,gi|374676298|gb|JN994151.1|,1,438,91.0603,481,344,Anopheles_theileri,bro.22_B10-ITS2A.ab1,,"Anopheles_sp., Anopheles_theileri, Anopheles_m..."
Abro-30,gi|656438861|gb|KJ522842.1|,1,580,94.1558,616,508,Anopheles_sp.,616,500,gi|656438847|gb|KJ522835.1|,...,gi|374676298|gb|JN994151.1|,1,438,91.0603,481,344,Anopheles_theileri,bro.30_C10-ITS2A.ab1,,"Anopheles_sp., Anopheles_theileri, Anopheles_m..."
Abro-33,gi|656438861|gb|KJ522842.1|,1,581,94.3182,616,511,Anopheles_sp.,616,500,gi|656438847|gb|KJ522835.1|,...,gi|374676298|gb|JN994151.1|,1,439,91.4583,480,350,Anopheles_theileri,bro.33_D10-ITS2A.ab1,,"Anopheles_sp., Anopheles_theileri, Anopheles_m..."
Acol-554,gi|1573759805|gb|MK300237.1|,1,616,100,616,616,Anopheles_gambiae,616,500,gi|1573759799|gb|MK300234.1|,...,gi|1727559942|gb|MN335028.1|,1,523,100,523,523,Anopheles_gambiae,col.554_D08-ITS2A.ab1,"Anopheles_gambiae, Anopheles_quadriannulatus, ...","Anopheles_gambiae, Anopheles_quadriannulatus, ..."
Acol-558,gi|1573759805|gb|MK300237.1|,1,616,100,616,616,Anopheles_gambiae,616,500,gi|1573759799|gb|MK300234.1|,...,gi|1727559942|gb|MN335028.1|,1,522,99.6183,524,517,Anopheles_gambiae,col.558_E08-ITS2A.ab1,"Anopheles_gambiae, Anopheles_quadriannulatus, ...","Anopheles_gambiae, Anopheles_quadriannulatus, ..."
Acol-570,gi|1573759805|gb|MK300237.1|,1,614,99.6753,616,610,Anopheles_gambiae,616,500,gi|1573759799|gb|MK300234.1|,...,gi|1727559942|gb|MN335028.1|,1,522,99.6183,524,517,Anopheles_gambiae,col.570_F08-ITS2A.ab1,"Anopheles_gambiae, Anopheles_quadriannulatus, ...","Anopheles_gambiae, Anopheles_quadriannulatus, ..."
Acol-645,gi|1573759797|gb|MK300233.1|,1,611,99.1883,616,601,Anopheles_gambiae,616,500,gi|1382879396|gb|MG753704.1|,...,gi|374676293|gb|JN994146.1|,1,518,98.855,524,506,Anopheles_quadriannulatus,col.645_G08-ITS2A.ab1,"Anopheles_quadriannulatus, Anopheles_gambiae, ...","Anopheles_quadriannulatus, Anopheles_gambiae, ..."


## Add BOLD species predictions

In [7]:
# read the BOLD result table of each plate (tab-separated) and stack them into one frame
bold_df = pd.concat([pd.read_csv(BOLD_TSV.format(plate_no), sep='\t') for plate_no in (1, 2)])
print(bold_df.shape)
bold_df.head()

(127, 5)


Unnamed: 0,Query ID,Best ID,Search DB,Top %,Low %
0,VBS00156_E09-HCO2198.ab1,Anopheles vagus,COI SPECIES DATABASE,99.33,89.72
1,VBS00157_H09-HCO2198.ab1,Anopheles vagus,COI SPECIES DATABASE,99.66,90.07
2,VBS00158_B11-HCO2198.ab1,Anopheles vagus,COI SPECIES DATABASE,99.32,89.74
3,bro.22_B10-HCO2198.ab1,No match,COI SPECIES DATABASE,,
4,bro.30_C10-HCO2198.ab1,No match,COI SPECIES DATABASE,,


In [8]:
# adjust bold results
# regex=False makes every pattern literal text regardless of the pandas default;
# in particular '.' must not be interpreted as the regex wildcard
bold_df.columns = 'bold_' + bold_df.columns.str.lower().str.replace(' ', '_', regex=False)
bold_df['bold_best_id'] = bold_df.bold_best_id.str.replace(' ', '_', regex=False)
# derive the sample ID from the query ID: names starting with 'empty' stay as they are,
# other names are cut at the first underscore, and non-VBS names get an 'A' prefix
# with dots replaced by dashes
bold_df['sample'] = bold_df.bold_query_id
non_empty = ~bold_df['sample'].str.startswith('empty')
non_vbs = ~bold_df['sample'].str.startswith('VBS')
bold_df.loc[non_empty, 'sample'] = bold_df.loc[non_empty, 'sample'].str.split('_').str.get(0)
bold_df.loc[non_empty & non_vbs, 'sample'] = (
    'A' + bold_df.loc[non_empty & non_vbs, 'sample'].str.replace('.', '-', regex=False)
)
bold_df = bold_df.set_index('sample')
print(bold_df.shape)
bold_df.head()

(127, 5)


Unnamed: 0_level_0,bold_query_id,bold_best_id,bold_search_db,bold_top_%,bold_low_%
sample,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
VBS00156,VBS00156_E09-HCO2198.ab1,Anopheles_vagus,COI SPECIES DATABASE,99.33,89.72
VBS00157,VBS00157_H09-HCO2198.ab1,Anopheles_vagus,COI SPECIES DATABASE,99.66,90.07
VBS00158,VBS00158_B11-HCO2198.ab1,Anopheles_vagus,COI SPECIES DATABASE,99.32,89.74
Abro-22,bro.22_B10-HCO2198.ab1,No_match,COI SPECIES DATABASE,,
Abro-30,bro.30_C10-HCO2198.ab1,No_match,COI SPECIES DATABASE,,


In [9]:
# outer join on the sample index keeps samples present in only one of the two tables
comb_df = blast_df.merge(bold_df, how='outer', left_index=True, right_index=True)
comb_df.shape

(144, 43)

## Add partner and ampseq species predictions

In [10]:
# load the sample metadata and, for repeated index values, keep only the first row
sm_df = (
    pd.read_csv(SAMPLE_META, index_col=0)
    .loc[lambda meta: ~meta.index.duplicated(keep='first')]
)
sm_df.shape

(163, 21)

In [11]:
# read amplicon sequencing species predictions matrix
# (first CSV column is used as the index; the cell below treats rows as samples)
ap_df = pd.read_csv(AMPSEQ_PRED, index_col=0)

In [12]:
# collect top predictions from ampseq
# every sample with BLAST results starts with an empty prediction
ap_dict = {sample: '' for sample in blast_df.index}

for sample, row in ap_df.iterrows():
    # all columns that share the row maximum are reported, comma-separated
    best = row.max()
    ap_dict[sample] = ', '.join(row[row == best].index.to_list())


In [13]:
# new columns are aligned to comb_df on the sample index; samples missing from
# the source series get NaN
# (sm_df was de-duplicated on its index when it was loaded, keeping the first row)
comb_df['partner_species'] = sm_df['Species']
comb_df['ampseq_species'] = pd.Series(ap_dict)
comb_df['amplicons_recovered'] = sm_df['Amplicons Recovered']
comb_df.shape

(144, 46)

In [16]:
# output columns, in the order they are written to the predictions table
cols = [
    # partner / amplicon sequencing
    'partner_species', 'amplicons_recovered', 'ampseq_species',
    # BOLD
    'bold_best_id', 'bold_top_%', 'bold_low_%',
    # COI BLAST
    'COI_length', 'COI_num_alignments', 'COI_species_predictions', 'COI_top10_species',
    'COI_first_hit', 'COI_first_hit_species', 'COI_first_hit_hsps', 'COI_first_hit_length',
    'COI_first_hit_identities', 'COI_first_hit_identity', 'COI_first_hit_score',
    'COI_second_hit', 'COI_second_hit_species', 'COI_second_hit_hsps', 'COI_second_hit_length',
    'COI_second_hit_identities', 'COI_second_hit_identity', 'COI_second_hit_score',
    # ITS2 BLAST
    'ITS2_length', 'ITS2_num_alignments', 'ITS2_species_predictions', 'ITS2_top10_species',
    'ITS2_first_hit', 'ITS2_first_hit_species', 'ITS2_first_hit_hsps', 'ITS2_first_hit_length',
    'ITS2_first_hit_identities', 'ITS2_first_hit_identity', 'ITS2_first_hit_score',
    'ITS2_second_hit', 'ITS2_second_hit_species', 'ITS2_second_hit_hsps', 'ITS2_second_hit_length',
    'ITS2_second_hit_identities', 'ITS2_second_hit_identity', 'ITS2_second_hit_score',
    # original sequence names
    'COI_seqid', 'ITS2_seqid',
]

In [17]:
# write the selected columns, in the order given by cols, to the predictions table
comb_df[cols].to_csv(SPP_PREDS)
# presumably a deliberate stop: raising here keeps a full run from continuing
# into the Sandbox cells below
raise Exception('Analysis ended!')

Exception: Analysis ended!

# Sandbox

Blasting via the web with BioPython takes about 20 minutes per sequence. In addition, a complete plate cannot be submitted - the limit for a single submission is probably around 10-20 sequences. Thus, we run BLAST on the Sanger farm using an internal copy of the nt database (see blast_nt.smk).

In [None]:
# the package is Bio.Blast (as in the NCBIXML import at the top of the notebook)
from Bio.Blast import NCBIWWW

In [None]:
%%time
# fasta string
# FA already starts with WD, so it is formatted directly rather than joined with WD again
f = FA.format(1)
with open(f) as fa_handle:
    fa_str = fa_handle.read()
result_handle = NCBIWWW.qblast("blastn", "nt", fa_str)
result_handle

In [None]:
%%time
# NOTE(review): `records` is only assigned in a later sandbox cell (plate 1 sequences
# read with SeqIO) - that cell has to be run before this one
result_handle = NCBIWWW.qblast("blastn", "nt", records[0].seq)
result_handle

In [None]:
# submit each sequence separately, TODO try batching ~10 seqs - whole dataset does not work
# current timing - ca 20 (with high SD) mins per sequence
for i, record in enumerate(records):
    print(i, record.name)
    xml_file = os.path.join(WD, record.name + '.xml')
    # skip sequences whose result file already exists
    if os.path.isfile(xml_file):
        continue
    # query with the sequence of the current record, so that each result file
    # holds the hits of the sequence it is named after
    result_handle = NCBIWWW.qblast("blastn", "nt", record.seq)
    with open(xml_file, "w") as out_handle:
        out_handle.write(result_handle.read())

In [None]:
# NOTE(review): relies on `xml_file` and `result_handle` left over from earlier cells;
# opening with "w" truncates xml_file, and if result_handle was already read this
# presumably writes nothing - confirm before running
with open(xml_file, "w") as out_handle:
    out_handle.write(result_handle.read())

In [None]:
# sequences of plate 1 as a list; `records` is used by the qblast cells above
records = list(SeqIO.parse(FA.format(1),format='fasta'))
records[0]

In [None]:
# reverse complement of the first sequence as a plain string
str(records[0].seq.reverse_complement())

In [None]:
xml_file = os.path.join(WD, 'VBS00156_E09-HCO2198.ab1.xml')
plate = 1
# NCBIXML.parse is lazy, so the first record is taken while the file is still
# open; the context manager then closes the handle
with open(xml_file) as xml_handle:
    blast_record = NCBIXML.parse(xml_handle)
    r = next(blast_record)

In [None]:
# identifier of the first hit of the parsed record
r.alignments[0].hit_id