## Extract _C. floridanus_ and _C. pseudogracilis_ haplotypes from metabarcoding data.

Before extracting the _Crangonyx spp._ haplotypes we need to import the functions.

In [None]:
import metaBEAT_global_misc_functions as mb

In [None]:
# Create the working directory that the following cells change into and write their output to.
mkdir Crangonyx_haplotypes

In [None]:
# Work inside the new directory; the relative paths used in later cells ('../../2-metaBEAT/...') start from here.
cd Crangonyx_haplotypes/

Now extract the OTU ids for the relevant species that we filtered above 2%.

In [None]:
OTU_table = mb.load_BIOM('../../2-metaBEAT/filtered.biom')

dictionary = mb.find_target_OTUs_by_taxonomy(OTU_table, target='Crangonyx_floridanus', level='all')

OTUs = {}
OTUs['Crangonyx_floridanus'] = dictionary.keys()

dictionary = mb.find_target_OTUs_by_taxonomy(OTU_table, target='Crangonyx_pseudogracilis', level='all')

OTUs['Crangonyx_pseudogracilis'] = dictionary.keys()

print ""
print OTUs


OTUs_as_list = []
for sp in OTUs:
    OTUs_as_list.extend(OTUs[sp])

print "\nOTU list:"
print OTUs_as_list

Identify samples contributing to each of the OTUs.

In [None]:
per_OTU_samples={}

samples = OTU_table.ids(axis='sample')

for OTU in OTUs_as_list:
#    print OTU
    per_OTU_samples[OTU] = []
    obs = OTU_table.data(OTU, axis='observation')
    for i in range(len(obs)):
        if int(obs[i]) > 0:
#            print "\t%s" %samples[i]
            per_OTU_samples[OTU].append(".".join(samples[i].split(".")[:-1]))

    
print "OTU centroid ID - # of samples"
for OTU in per_OTU_samples:
    print "%s - %s" %(OTU,len(per_OTU_samples[OTU]))

Identify centroid IDs for the relevant OTUs, including the queries that hit them (the 'H' records of the global `.uc` file).

In [None]:
per_OTU_centroids={}

print "OTU ids to process:"
for OTU in per_OTU_samples:
    print OTU,len(per_OTU_samples[OTU])
print "#######\n"

uc=open('../../2-metaBEAT/GLOBAL/global.uc', 'r')

for line in uc:
    if line.startswith('H'):
        if line.strip().split("\t")[9] in OTUs_as_list:
#            print "hit: %s\t%s" %(line.strip().split("\t")[9],line.strip().split("\t")[8])
            if not per_OTU_centroids.has_key(line.strip().split("\t")[9]):
                per_OTU_centroids[line.strip().split("\t")[9]]=[line.strip().split("\t")[9]]

                
            if line.strip().split("\t")[8].split("|")[0] in per_OTU_samples[line.strip().split("\t")[9]]:
                per_OTU_centroids[line.strip().split("\t")[9]].append(line.strip().split("\t")[8])
#                print "found sample: %s" %line.strip().split("\t")[8].split("|")[0]
                        
uc.close()


for OTU in per_OTU_centroids:
    print OTU,str(len(per_OTU_centroids[OTU]))
    for c in sorted(per_OTU_centroids[OTU]):
        print "\tfound centroid: "+c
#print per_OTU_centroids


Specify a unique id for each OTU.

In [None]:
OTUs_synonyms = []
for sp in OTUs:
    count=0
#    OTUs_as_list.extend(OTUs[sp])
    
    for otu in OTUs[sp]:
        OTUs_synonyms.append(sp+'_'+str(count)+'_OTU')
        count+=1
    
print OTUs_as_list
print OTUs_synonyms


Identify samples contributing to each of the OTUs.

In [None]:
per_OTU_samples={}
            
for otu in per_OTU_centroids:
    per_OTU_samples[otu] = []
    for centroid in per_OTU_centroids[otu]:
        per_OTU_samples[otu].append(centroid.split("|")[0])
#        print centroid.split("|")[1]

    per_OTU_samples[otu]=list(set(per_OTU_samples[otu]))
    print "\n"+otu,len(per_OTU_samples[otu]),sorted(per_OTU_samples[otu])


Create a global fasta file containing all reads for each OTU.

In [None]:
from Bio import SeqIO

for i in range(len(OTUs_as_list)):
    otu=OTUs_as_list[i]
    syn=OTUs_synonyms[i]
    print "OTU: %s -> %s.fa" %(otu,syn)
    read_ids_per_OTU=[]
    seqs_per_OTU=[]
    for sample in per_OTU_samples[otu]:
        seqs_per_sample=[]
#        print "Sample: %s" %sample
        read_ids=[]
#        collect relevant centroids
        centroids=[]
        for c in per_OTU_centroids[otu]:
#            print "centroid: %s" %c
            if c.startswith(sample+'|'):
                for c_ind in c.split("|")[1:]:
                    centroids.append(c_ind)
                
#        print "Centroids: %s" %centroids
        
#        extract read ids from uc file
        read_ids.extend(centroids[:])
        uc=open('../../2-metaBEAT/'+sample+'/'+sample+'.uc', 'r')
        for line in uc:
            if line.startswith('H'):
                if line.strip().split("\t")[9] in centroids:
#                    print line
                    read_ids.append(line.strip().split("\t")[8])
        uc.close()            
#        print "READ IDS: %i" %len(read_ids)
        
#        extract reads per sample
        
        fasta=open('../../2-metaBEAT/'+sample+'/'+sample+'_queries.fasta', 'r')     # trimmed.fasta', 'r')
        for r in SeqIO.parse(fasta, 'fasta'):
            if r.id in read_ids:
                r.id = sample+'|'+r.id
                r.description = r.id
                seqs_per_OTU.append(r)
                
            
        fasta.close()
        
        
#    Write out global fasta per OTU, containing all reads across all samples
    out=open(syn+'.fa', 'w')
    SeqIO.write(seqs_per_OTU, out, 'fasta')
    out.close()

Dereplicate all sequences at 95% identity match.

In [None]:
%%bash

# Dereplicate every *.fa file in the working directory with vsearch, on both strands.
# Output: derep_<file> (sequences) and derep_<file>.uc (cluster table).
# NOTE(review): '--id' is a search/clustering threshold; --derep_fulllength presumably
# merges identical sequences only, so '--id 0.95' may have no effect here — confirm
# against the vsearch manual for the installed version.
# NOTE(review): the '*.fa' glob also matches derep_*.fa files left over from a previous
# run of this cell, which would then be dereplicated again.
for OTU in $(ls -1 *.fa)
do
    vsearch --derep_fulllength $OTU \
    --strand both --output derep_$OTU \
    --uc derep_$OTU.uc --id 0.95
done

For each OTU, extract the most abundant dereplicated sequence as reference. Compare all other sequences to this one via `usearch_global`. Parse output and reverse complement any sequences if necessary.

Extract top sequence and write to file. 

In [None]:
from Bio import SeqIO

for i in range(len(OTUs_as_list)):
    otu=OTUs_as_list[i]
    syn=OTUs_synonyms[i]
    print syn
    seqs = SeqIO.parse(open('derep_'+syn+'.fa','r'),'fasta')

    seq = seqs.next()

#    seq.seq = seq.seq.reverse_complement()

    out=open('derep_'+syn+'.ref.fasta','w')
    SeqIO.write(seq, out, 'fasta')
    out.close()

Compare all sequences against the most abundant dereplicated sequences with `usearch_global`.

In [None]:
%%bash

# For every reference file (derep_<OTU>.ref.fasta) derive the name of the matching
# dereplicated read file by removing '.ref' and the trailing 'sta' (-> derep_<OTU>.fa),
# then search all of its sequences against the reference on both strands with an
# identity threshold of 0.9. Hits are written in blast6 tabular format to
# derep_<OTU>.fa.blast.out.
for ref in $(ls -1 derep* | grep "fasta")
do
    full=$(echo -e "$ref" | sed 's/\.ref//' | sed 's/sta$//')
    
    vsearch --usearch_global $full \
    --strand both \
    --db $ref \
    --id 0.9 \
    --blast6out $full.blast.out 
done

Parse output and identify sequences to reverse complement.

In [None]:
to_reverse = {}

for i in range(len(OTUs_as_list)):
    otu=OTUs_as_list[i]
    syn=OTUs_synonyms[i]
    print syn
    to_reverse[syn] = []
    blast = open('derep_'+syn+'.fa.blast.out','r')

    for rec in blast:
        cols = rec.strip().split("\t")
        if cols[6] > cols[7]:
            to_reverse[syn].append(cols[0])
        
    print to_reverse[syn]

Reverse complement if necessary.

In [None]:
from Bio import SeqIO

for i in range(len(OTUs_as_list)):
    otu=OTUs_as_list[i]
    syn=OTUs_synonyms[i]
    print syn
    if to_reverse[syn]:
        seqs = SeqIO.parse(open('derep_'+syn+'.fa','r'),'fasta')
        outseqs = []
        for s in seqs:
            print "search: %s" %s.id
            if s.id in to_reverse[syn]:
                print "#%s\t%s" %(to_reverse[syn].index(s.id), to_reverse[syn][to_reverse[syn].index(s.id)])
                s.seq = s.seq.reverse_complement()
                del(to_reverse[syn][to_reverse[syn].index(s.id)])
                outseqs.append(s)
        out = open('derep_'+syn+'.fa','w')
        SeqIO.write(outseqs, out, 'fasta')
        out.close()
    else:
        print "nothing to reverse"

Write top ten sequences to file and align for manual inspection.

In [None]:
from Bio import SeqIO

for i in range(len(OTUs_as_list)):
    otu=OTUs_as_list[i]
    syn=OTUs_synonyms[i]
    print syn
    seqs = SeqIO.parse(open('derep_'+syn+'.fa','r'),'fasta')
    count=0
    seqs_to_print = []
    for s in seqs:
        print count,s.id
        count+=1
        seqs_to_print.append(s)
        if count > 9:
            break
    out=open('derep_'+syn+'.top10.fa','w')
    SeqIO.write(seqs_to_print,out,'fasta')
    out.close()

Align top 10 with mafft.

In [None]:
%%bash

# Align every file whose name contains 'top10' with mafft (--localpair, up to 1000
# refinement iterations). The output name is the input name with the trailing 'fa'
# replaced by 'aln.fa'.
# NOTE(review): on a second run the '*top10*' glob also matches the *.top10.aln.fa
# files produced by the first run.
for file in $(ls -1 *top10*)
do
    out=$(echo -e "$file" | sed 's/fa$/aln.fa/')
    mafft --localpair --maxiterate 1000 $file > $out
done

## Representatives of observed OTUs

Curated minibarcode (Leray CO1 region) sequences were selected as representatives and saved to two separate files (one per taxon).

In [None]:
%%file Crangonyx_floridanus_OTU.minibc.ref.fasta
>CH101-pl1-1-1-Oct-nc|1_2105_17136_11806_1_ex
tttagcatctacagctgctcatagaggtgcttctgtagacttagctattttctctcttcacctagcaggtgcctcctctattttaggttcaattaactttatttccacagtaataaatatacgagtaaaaaatatattaatagaccaaatccctttatttgtttgagctattttcttcactactattcttcttcttcttctttctttacctgttctagcaggagctatcacaatacttttaacagaccgtaatctcaatacatcattctttgacccttctggggggggtgaccctatcttgtaccagcatctctt

In [None]:
%%file Crangonyx_pseudogracilis_OTU.minibc.ref.fasta
>CH304-pl1-3-7-Oct-nc|1_1106_17586_25029_1_ex
ctctatcatcaataacagcccacagaggttcatcagtagacctggctattttttctctccacctagctggtgcatcctcaattttaggagctatcaattttctatccacaataataaatataaaagtaaaaaaccttcttatagaccaagttcctttatttgtttgagcaattttttttacaacaattcttctccttctgtctctacctgttttagccggagctatcactatactattgacagaccgcaatcttaatacatcattctttgatccatcaggaggtggagaccctattctatatcaacatctttt

# Comparing all sequences against the minibc reference sequences.

Create blast databases.

In [None]:
%%bash

# Build one nucleotide BLAST database per species from its minibarcode reference fasta.
# The database names (<species>_minibc_ref) are what the blastn cell passes to -db.
makeblastdb -in Crangonyx_floridanus_OTU.minibc.ref.fasta -dbtype nucl -out Crangonyx_floridanus_minibc_ref
makeblastdb -in Crangonyx_pseudogracilis_OTU.minibc.ref.fasta -dbtype nucl -out Crangonyx_pseudogracilis_minibc_ref

Run Blast on all *_OTU.fa* files, excluding all the dereplicated files.

In [None]:
%%bash

# Blast every *.fa file whose name does not contain "derep" against the reference
# database of its species. The species is taken from the first two
# underscore-separated fields of the file name. Results are written in tabular
# format (-outfmt 6) to <file name without .fa>.vs.minibc.blastn.out.
for file in $(ls -1 *.fa | grep "derep" -v)
do
    prefix=$(echo -e "$file" | sed 's/\.fa$//')
    sp=$(echo -e "$file" | cut -d "_" -f 1,2)
        
    blastn -db $sp\_minibc_ref -query $file -outfmt 6 -out $prefix.vs.minibc.blastn.out
done

Parse outputs and identify clipping points for sequences longer than the reference.

In [None]:
full_length=313


global_clips = {}

for i in range(len(OTUs_as_list)):
    otu=OTUs_as_list[i]
    syn=OTUs_synonyms[i]
    print syn
    global_clips[syn] = {}
    blast = open(syn+'.vs.minibc.blastn.out','r')
    
    for OTU in blast:
        cols=OTU.strip().split("\t")
        temp = []
    #    print "#%s" %cols[0]
        if int(cols[8]) < int(cols[9]):
    #        print "\torientation ok"
            temp.append(0)
            #check the first end for overhang
            if int(cols[6]) == 1: #sequence alignment starts at position 1 -> lower clipping point is 0, i.e. not needed
    #            print "\t1 - first end starts with 1 - no clipping on this side"
                temp.append(0)
            else: #Alignment does not start at position 1, then specify clippoint to clip to same length as ref
                temp.append(int(cols[6])-int(cols[8]))
    #            print "\t1 - Alignment starts at pos %s vs. %s in ref - clip at: %s" %(cols[6],cols[8],temp[-1])
            
            #check second end for overhang
            if int(cols[9]) == full_length:
                temp.append(int(cols[7]))
    #            print "\t2 - Full lenght alignment ending at pos %s in query" %cols[7]
            else:
                temp.append((full_length-int(cols[9]))+int(cols[7])) 
    #            print "\t2 - incomplete alignment ends with query pos %s and ref pos %s - clip at: %s" %(cols[7], cols[9], temp[-1])
        
        
        else:
    #        print "\treverse complement"
            temp.append(1)
            if int(cols[6]) == 1: #sequence alignment starts at position 1 -> lower clipping point is 0, i.e. not needed
    #            print "\t1 - first end alignment starts at base 1 - no clipping on this side"
                temp.append(0)
            else:
                temp.append(int(cols[6])-(full_length-int(cols[8]))-1)
    #            print "\t1 - alignment starts with query pos %s and ref pos %s - clip at: %s" %(cols[6], cols[8], temp[-1])
            
            if int(cols[9]) == 1:
    #            print "\t2 - full length alignment ending in pos %s in query" %cols[7]
                temp.append(int(cols[7]))
            else:
                temp.append(int(cols[7])+int(cols[9]))
    #            print "\t2 - incomplete alignment ends with query %s at ref pos %s - clip at: %s" %(cols[7], cols[9], temp[-1])

        for i in range(len(temp)):
            if temp[i] < 0:
                temp[i] = 0
            
#    print "\t"+str(temp)
        global_clips[syn][cols[0]] = temp[:]
    
#print global_clips

Clip and reverse complement based on blast results if necessary.

In [None]:
from Bio import SeqIO

for i in range(len(OTUs_as_list)):
    otu=OTUs_as_list[i]
    syn=OTUs_synonyms[i]
    print syn

    outseqs = []
    seqs = SeqIO.parse(open(syn+'.fa','r'),'fasta')

    for s in seqs:
        s.seq = s.seq[global_clips[syn][s.id][1]:global_clips[syn][s.id][2]]
        if global_clips[syn][s.id][0] == 1:
            s.seq = s.seq.reverse_complement()
        
        outseqs.append(s)
    
    out = open(syn+'.clipped.fasta','w')
    SeqIO.write(outseqs,out,'fasta')
    out.close()

For each OTU, bin the reads per sample.

In [None]:
from Bio import SeqIO

for i in range(len(OTUs_as_list)):
    otu=OTUs_as_list[i]
    syn=OTUs_synonyms[i]
    print syn
    outseqs = {}
    
    seqs = SeqIO.parse(open(syn+'.clipped.fasta','r'),'fasta')
    for s in seqs:
        sample = s.id.split("|")[0]
        
        if not sample in outseqs:
            outseqs[sample] = []
        outseqs[sample].append(s)
        
    for sample in outseqs:
        print "\twriting data for sample: %s" %sample
        out=open(sample+'.'+syn+'.clipped.fasta','w')
        SeqIO.write(outseqs[sample],out,'fasta')
        out.close()
        
    

Identify most abundant OTU per sample.

In [None]:
import glob
import os
import shlex, subprocess

for i in range(len(OTUs_synonyms)):
    print OTUs_synonyms[i],OTUs_as_list[i]
    syn=OTUs_synonyms[i]
    hts_per_OTU=[]
    seqs_per_OTU=[]
    
    for f in glob.glob('*.'+syn+'.clipped.fasta'):
        prefix = f.replace('.fasta','')
        print "#"+prefix,

        #cluster at 100% similarity full length

        mb.vsearch_cluster_full_length(infile=f, cluster_match=float(1), threads=3, sampleID=prefix)
        
        #output original centroid id as chosen by vsearch
        cs=[]
        for c in per_OTU_centroids[OTUs_as_list[i]]:
            if f.split(".")[0] == c.split("|")[0]:
                cs.append(c)
        print " - %s" %cs,
                
        hts_per_OTU.append(mb.find_most_abundant_seq_from_uc(uc=prefix+'.uc'))
        print " -> %s" %hts_per_OTU[-1],
        
        if hts_per_OTU[-1] in cs:
            print " - OK"
        else:
            print " - adjust"
        

#        os.remove(f+'.uc')
#        os.remove(f+'_centroids.fasta')
#        os.remove(f)

    print "extracting hts for %s -> %s" %(syn,syn+'_hts.fasta')
    for r in SeqIO.parse(syn+'.clipped.fasta', 'fasta'):
        if r.id in hts_per_OTU:
            seqs_per_OTU.append(r)
                
    fasta.close()
#    print "final cleanup .. ",
#    os.remove(OTUs_synonyms[i]+'.fa')
#    print "DONE!\n"
        
    out=open(syn+'_hts.fasta', 'w')
    SeqIO.write(seqs_per_OTU, out, 'fasta')
    out.close()

For each original OTU, cluster the chosen (most abundant) haplotypes for each sample at 100% to remove redundancy and identify the set of unique observed haplotypes.

In [None]:
for f in glob.glob('*OTU_hts.fasta'):
    print f
    prefix = f.replace('.fasta','')
    mb.vsearch_cluster_full_length(infile=f, cluster_match=float(1), threads=3, sampleID=prefix)

Align the non-redundant sequences with mafft.

In [None]:
%%bash

# Align every file whose name contains '_hts_centroids' with mafft (--localpair, up
# to 1000 refinement iterations). The output name is the input name with the
# trailing 'fasta' replaced by 'aln.fasta'.
# NOTE(review): on a second run the glob also matches the *.aln.fasta files produced
# by the first run.
for file in $(ls -1 *_hts_centroids*)
do
    out=$(echo -e "$file" | sed 's/fasta$/aln.fasta/')
    mafft --localpair --maxiterate 1000 $file > $out
done


#### Manually inspect alignments and remove dubious bases. Curated alignments were saved to directory `haplotype_alignments`.
Cluster observed haplotypes again at 100% similarity to remove redundancy.

In [None]:
from Bio import SeqIO
import glob

for f in glob.glob('../haplotypes_alignments/*'):
    print f
    prefix = f.split("/")[1].replace("_centroids.aln.fasta","")
    print prefix
    out = ""
    seqs = SeqIO.parse(open(f,'r'), 'fasta')
    for s in seqs:
        out+=">%s\n%s\n" %(s.id,str(s.seq).replace("-","").upper())

    fh = open(prefix+'.fasta','w')
    fh.write(out)
    fh.close()
    
    mb.vsearch_cluster_full_length(infile=prefix+'.fasta', cluster_match=float(1), threads=3, sampleID=prefix)

Give unique names to haplotypes and write to file.

In [None]:
from Bio import SeqIO
import glob

for f in glob.glob('*_hts_centroids.fasta'):
    print f
    sp_prefix = f.split("_")[0][0]+f.split("_")[1][0]
    prefix = f.replace("_centroids.fasta","")
    print prefix
    
    count = 1

    seqs = SeqIO.parse(open(f, 'r'), 'fasta')

    fh=open(prefix+'.nr.fasta','w')
    for s in seqs:
        fh.write(">"+sp_prefix+'_UK_Mb-'+"%02d\n%s\n" %(count,s.seq))
        count+=1
    fh.close()

Concatenate all haplotypes into a single file.

In [None]:
from Bio import SeqIO
import glob

seqs = []

for i in range(len(OTUs_synonyms)):
    print OTUs_synonyms[i],OTUs_as_list[i]
    syn=OTUs_synonyms[i]
    sp="_".join(syn.split("_")[:2])
    print sp
#    prefix = sp.split("_")[0][0]+sp.split("_")[1][0]
#    print prefix
    for r in SeqIO.parse(open(syn+'_hts.nr.fasta','r'), 'fasta'):
        r.description = sp+'|'+r.id
        r.id = r.description
        seqs.append(r)
        
out=open('Crangonyx_from_metaBEAT.fasta', 'w')
SeqIO.write(seqs, out, 'fasta')
out.close()

Compare the observed haplotypes to the full set of sequences to identify the samples which contain sequences that receive full length hits from the haplotypes.

In [None]:
%%bash

# For every non-redundant haplotype file (<OTU>_hts.nr.fasta) derive the OTU prefix
# (text before the first '.', with '_hts' removed) and search the haplotypes against
# all clipped reads of that OTU (<OTU>.clipped.fasta) on both strands, with
# --id 0.97, --query_cov 1 and up to 100000 accepted hits per query.
# Hits are written in blast6 tabular format to <OTU>.vs.full.blast.out.
for s in $(ls -1 *_hts.nr.fasta)
do
    prefix=$(echo -e "$s" | cut -d "." -f 1 | sed 's/_hts//')
    vsearch --usearch_global $s  \
    --strand both \
    --db $prefix.clipped.fasta \
    --id 0.97 --query_cov 1 --maxaccepts 100000 \
    --blast6out $prefix.vs.full.blast.out 
done

Remove redundancy from Sanger sequences produced with Folmer primers.

In [None]:
%%bash

# Cluster the full Sanger sequence set with vsearch --cluster_fast at --id 1.0 on both
# strands (3 threads, --query_cov 1). The centroids are written to
# c.flor_c.pse_SANGER.nr.fasta and the cluster table to c.flor_c.pse_SANGER.nr.uc,
# both next to the input file.
vsearch --cluster_fast ../../4-infer_phylogeny/sequences/c.flor_c.pse_SANGER_full.fasta --id 1.0 --strand both --threads 3 \
--centroids ../../4-infer_phylogeny/sequences/c.flor_c.pse_SANGER.nr.fasta \
--uc ../../4-infer_phylogeny/sequences/c.flor_c.pse_SANGER.nr.uc --query_cov 1