## Extract _C. floridanus_ and _C. pseudogracilis_ haplotypes from metabarcoding data.

Before extracting the _Crangonyx spp._ haplotypes we need to import the functions.

In [1]:
import metaBEAT_global_misc_functions as mb

In [2]:
mkdir Crangonyx_haplotypes

mkdir: cannot create directory ‘Crangonyx_haplotypes’: File exists


In [3]:
cd Crangonyx_haplotypes/

/home/working/C_floridanus/3-extract_haplotypes/Crangonyx_haplotypes


Now extract the OTU ids for the relevant species that we filtered above 2%.

In [4]:
OTU_table = mb.load_BIOM('../../2-metaBEAT/filtered.biom')

dictionary = mb.find_target_OTUs_by_taxonomy(OTU_table, target='Crangonyx_floridanus', level='all')

OTUs = {}
OTUs['Crangonyx_floridanus'] = dictionary.keys()

dictionary = mb.find_target_OTUs_by_taxonomy(OTU_table, target='Crangonyx_pseudogracilis', level='all')

OTUs['Crangonyx_pseudogracilis'] = dictionary.keys()

print ""
print OTUs


OTUs_as_list = []
for sp in OTUs:
    OTUs_as_list.extend(OTUs[sp])

print "\nOTU list:"
print OTUs_as_list


Specified BIOM input format 'json' - ok!
SEARCH TERM: 'Crangonyx_floridanus'
Found taxonomy metadata with OTUs - ok!
Screening at taxonomic level: 'all'


Identified 1 OTU(s) assigned to 'Crangonyx_floridanus'.
SEARCH TERM: 'Crangonyx_pseudogracilis'
Found taxonomy metadata with OTUs - ok!
Screening at taxonomic level: 'all'


Identified 1 OTU(s) assigned to 'Crangonyx_pseudogracilis'.

{'Crangonyx_floridanus': [u'CH506-pl1-5-4-Oct-nc|1_1105_11317_10762_1_ex'], 'Crangonyx_pseudogracilis': [u'CH105-1-4-May-nc|1_2105_9715_15085_1_ex']}

OTU list:
[u'CH506-pl1-5-4-Oct-nc|1_1105_11317_10762_1_ex', u'CH105-1-4-May-nc|1_2105_9715_15085_1_ex']


Identify samples contributing to each of the OTUs.

In [5]:
per_OTU_samples={}

samples = OTU_table.ids(axis='sample')

for OTU in OTUs_as_list:
#    print OTU
    per_OTU_samples[OTU] = []
    obs = OTU_table.data(OTU, axis='observation')
    for i in range(len(obs)):
        if int(obs[i]) > 0:
#            print "\t%s" %samples[i]
            per_OTU_samples[OTU].append(".".join(samples[i].split(".")[:-1]))

    
print "OTU centroid ID - # of samples"
for OTU in per_OTU_samples:
    print "%s - %s" %(OTU,len(per_OTU_samples[OTU]))

OTU centroid ID - # of samples
CH105-1-4-May-nc|1_2105_9715_15085_1_ex - 75
CH506-pl1-5-4-Oct-nc|1_1105_11317_10762_1_ex - 77


Identify centroid IDS for the relevant OTUs, including query target alignments ('H').

In [6]:
per_OTU_centroids={}

print "OTU ids to process:"
for OTU in per_OTU_samples:
    print OTU,len(per_OTU_samples[OTU])
print "#######\n"

uc=open('../../2-metaBEAT/GLOBAL/global.uc', 'r')

for line in uc:
    if line.startswith('H'):
        if line.strip().split("\t")[9] in OTUs_as_list:
#            print "hit: %s\t%s" %(line.strip().split("\t")[9],line.strip().split("\t")[8])
            if not per_OTU_centroids.has_key(line.strip().split("\t")[9]):
                per_OTU_centroids[line.strip().split("\t")[9]]=[line.strip().split("\t")[9]]

                
            if line.strip().split("\t")[8].split("|")[0] in per_OTU_samples[line.strip().split("\t")[9]]:
                per_OTU_centroids[line.strip().split("\t")[9]].append(line.strip().split("\t")[8])
#                print "found sample: %s" %line.strip().split("\t")[8].split("|")[0]
                        
uc.close()


for OTU in per_OTU_centroids:
    print OTU,str(len(per_OTU_centroids[OTU]))
    for c in sorted(per_OTU_centroids[OTU]):
        print "\tfound centroid: "+c
#print per_OTU_centroids


OTU ids to process:
CH105-1-4-May-nc|1_2105_9715_15085_1_ex 75
CH506-pl1-5-4-Oct-nc|1_1105_11317_10762_1_ex 77
#######

CH105-1-4-May-nc|1_2105_9715_15085_1_ex 190
	found centroid: CH103-1-6-May-nc|1_1101_24330_23874_1_ex
	found centroid: CH103-1-6-May-nc|1_1102_10220_11006_1_ex
	found centroid: CH103-1-6-May-nc|1_1102_27565_22423_1_ex
	found centroid: CH103-1-6-May-nc|1_1102_7800_20792_1_ex
	found centroid: CH103-1-6-May-nc|1_1102_8729_18418_1_ex
	found centroid: CH103-1-6-May-nc|1_1103_11450_9537_1_ex
	found centroid: CH103-1-6-May-nc|1_1103_12563_12372_1_ex
	found centroid: CH103-1-6-May-nc|1_1103_17548_18310_1_ex
	found centroid: CH103-1-6-May-nc|1_1103_28869_13963_1_ex
	found centroid: CH103-1-6-May-nc|1_1104_22429_18099_1_ex
	found centroid: CH103-1-6-May-nc|1_1104_29122_17093_1_ex
	found centroid: CH103-1-6-May-nc|1_1104_6632_7010_1_ex
	found centroid: CH103-1-6-May-nc|1_1105_21661_12374_1_ex
	found centroid: CH103-1-6-May-nc|1_1105_25153_15980_1_ex
	found centroid: CH103-1-6-Ma

Specify a unique id for each OTU.

In [7]:
OTUs_synonyms = []
for sp in OTUs:
    count=0
#    OTUs_as_list.extend(OTUs[sp])
    
    for otu in OTUs[sp]:
        OTUs_synonyms.append(sp+'_'+str(count)+'_OTU')
        count+=1
    
print OTUs_as_list
print OTUs_synonyms


[u'CH506-pl1-5-4-Oct-nc|1_1105_11317_10762_1_ex', u'CH105-1-4-May-nc|1_2105_9715_15085_1_ex']
['Crangonyx_floridanus_0_OTU', 'Crangonyx_pseudogracilis_0_OTU']


Identify samples contributing to each of the OTUs.

In [8]:
per_OTU_samples={}
            
for otu in per_OTU_centroids:
    per_OTU_samples[otu] = []
    for centroid in per_OTU_centroids[otu]:
        per_OTU_samples[otu].append(centroid.split("|")[0])
#        print centroid.split("|")[1]

    per_OTU_samples[otu]=list(set(per_OTU_samples[otu]))
    print "\n"+otu,len(per_OTU_samples[otu]),sorted(per_OTU_samples[otu])



CH105-1-4-May-nc|1_2105_9715_15085_1_ex 75 ['CH103-1-6-May-nc', 'CH105-1-4-May-nc', 'CH106-1-3-May-nc', 'CH109-pl1-2-1-Oct-nc', 'CH110-pl1-2-2-Oct-nc', 'CH111-pl1-2-3-Oct-nc', 'CH112-pl1-2-4-Oct-nc', 'CH113-pl1-2-5-Oct-nc', 'CH114-pl1-2-6-Oct-nc', 'CH115-pl1-2-7-Oct-nc', 'CH116-pl1-2-8-Oct-nc', 'CH117-pl1-3-1-Oct-nc', 'CH118-pl1-3-2-Oct-nc', 'CH119-pl1-3-3-Oct-nc', 'CH201-2-6-May-nc', 'CH203-2-4-May-nc', 'CH204-2-3-May-nc', 'CH205-2-2-May-nc', 'CH206-2-1-May-nc', 'CH209-3-6-May-nc', 'CH301-pl1-3-4-Oct-nc', 'CH302-pl1-3-5-Oct-nc', 'CH303-pl1-3-6-Oct-nc', 'CH304-pl1-3-7-Oct-nc', 'CH305-pl1-3-8-Oct-nc', 'CH306-pl1-4-1-Oct-nc', 'CH307-pl1-4-2-Oct-nc', 'CH308-3-3-May-nc', 'CH308-pl1-4-3-Oct-nc', 'CH309-3-2-May-nc', 'CH309-pl1-4-4-Oct-nc', 'CH310-3-1-May-nc', 'CH310-pl1-4-5-Oct-nc', 'CH311-4-8-May-nc', 'CH312-4-7-May-nc', 'CH313-4-6-May-nc', 'CH314-4-5-May-nc', 'CH315-4-4-May-nc', 'CH316-4-3-May-nc', 'CH317-4-2-May-nc', 'CH401-4-1-May-nc', 'CH401-pl1-4-6-Oct-nc', 'CH402-5-8-May-nc', 'CH403-

Create a global fasta file containing all reads for each OTU.

In [9]:
from Bio import SeqIO

for i in range(len(OTUs_as_list)):
    otu=OTUs_as_list[i]
    syn=OTUs_synonyms[i]
    print "OTU: %s -> %s.fa" %(otu,syn)
    read_ids_per_OTU=[]
    seqs_per_OTU=[]
    for sample in per_OTU_samples[otu]:
        seqs_per_sample=[]
#        print "Sample: %s" %sample
        read_ids=[]
#        collect relevant centroids
        centroids=[]
        for c in per_OTU_centroids[otu]:
#            print "centroid: %s" %c
            if c.startswith(sample+'|'):
                for c_ind in c.split("|")[1:]:
                    centroids.append(c_ind)
                
#        print "Centroids: %s" %centroids
        
#        extract read ids from uc file
        read_ids.extend(centroids[:])
        uc=open('../../2-metaBEAT/'+sample+'/'+sample+'.uc', 'r')
        for line in uc:
            if line.startswith('H'):
                if line.strip().split("\t")[9] in centroids:
#                    print line
                    read_ids.append(line.strip().split("\t")[8])
        uc.close()            
#        print "READ IDS: %i" %len(read_ids)
        
#        extract reads per sample
        
        fasta=open('../../2-metaBEAT/'+sample+'/'+sample+'_queries.fasta', 'r')     # trimmed.fasta', 'r')
        for r in SeqIO.parse(fasta, 'fasta'):
            if r.id in read_ids:
                r.id = sample+'|'+r.id
                r.description = r.id
                seqs_per_OTU.append(r)
                
            
        fasta.close()
        
        
#    Write out global fasta per OTU, containing all reads across all samples
    out=open(syn+'.fa', 'w')
    SeqIO.write(seqs_per_OTU, out, 'fasta')
    out.close()

OTU: CH506-pl1-5-4-Oct-nc|1_1105_11317_10762_1_ex -> Crangonyx_floridanus_0_OTU.fa
OTU: CH105-1-4-May-nc|1_2105_9715_15085_1_ex -> Crangonyx_pseudogracilis_0_OTU.fa


Dereplicate all sequences at 95% identity match.

In [10]:
%%bash

for OTU in $(ls -1 *.fa)
do
    vsearch --derep_fulllength $OTU \
    --strand both --output derep_$OTU \
    --uc derep_$OTU.uc --id 0.95
done

vsearch v1.1.0_linux_x86_64, 15.6GB RAM, 8 cores
https://github.com/torognes/vsearch

vsearch v1.1.0_linux_x86_64, 15.6GB RAM, 8 cores
https://github.com/torognes/vsearch



Reading file Crangonyx_floridanus_0_OTU.fa 0%  Reading file Crangonyx_floridanus_0_OTU.fa 1%  Reading file Crangonyx_floridanus_0_OTU.fa 2%  Reading file Crangonyx_floridanus_0_OTU.fa 2%  Reading file Crangonyx_floridanus_0_OTU.fa 3%  Reading file Crangonyx_floridanus_0_OTU.fa 4%  Reading file Crangonyx_floridanus_0_OTU.fa 5%  Reading file Crangonyx_floridanus_0_OTU.fa 5%  Reading file Crangonyx_floridanus_0_OTU.fa 6%  Reading file Crangonyx_floridanus_0_OTU.fa 7%  Reading file Crangonyx_floridanus_0_OTU.fa 8%  Reading file Crangonyx_floridanus_0_OTU.fa 8%  Reading file Crangonyx_floridanus_0_OTU.fa 9%  Reading file Crangonyx_floridanus_0_OTU.fa 10%  Reading file Crangonyx_floridanus_0_OTU.fa 10%  Reading file Crangonyx_floridanus_0_OTU.fa 11%  Reading file Crangonyx_floridanus_0_OTU.fa 12%  Reading file Crangonyx_floridanus_0_OTU.fa 13%  Reading file Crangonyx_floridanus_0_OTU.fa 13%  Reading file Crangonyx_floridanus_0_OTU.fa 14%  Reading file Crangonyx_floridanus

For each OTU, extract the most abundant dereplicated sequence as reference. Compare all other sequences to this one via `usearch_global`. Parse output and reverse complement any sequences if necessary.

Extract top sequence and write to file. 

In [11]:
from Bio import SeqIO

for i in range(len(OTUs_as_list)):
    otu=OTUs_as_list[i]
    syn=OTUs_synonyms[i]
    print syn
    seqs = SeqIO.parse(open('derep_'+syn+'.fa','r'),'fasta')

    seq = seqs.next()

#    seq.seq = seq.seq.reverse_complement()

    out=open('derep_'+syn+'.ref.fasta','w')
    SeqIO.write(seq, out, 'fasta')
    out.close()

Crangonyx_floridanus_0_OTU
Crangonyx_pseudogracilis_0_OTU


Compare all sequences against the most abundant dereplicated sequences with `usearch_global`.

In [12]:
%%bash

for ref in $(ls -1 derep* | grep "fasta")
do
    full=$(echo -e "$ref" | sed 's/\.ref//' | sed 's/sta$//')
    
    vsearch --usearch_global $full \
    --strand both \
    --db $ref \
    --id 0.9 \
    --blast6out $full.blast.out 
done

vsearch v1.1.0_linux_x86_64, 15.6GB RAM, 8 cores
https://github.com/torognes/vsearch

vsearch v1.1.0_linux_x86_64, 15.6GB RAM, 8 cores
https://github.com/torognes/vsearch



Reading file derep_Crangonyx_floridanus_0_OTU.ref.fasta 0%  Reading file derep_Crangonyx_floridanus_0_OTU.ref.fasta 100%  Reading file derep_Crangonyx_floridanus_0_OTU.ref.fasta 100%
314 nt in 1 seqs, min 314, max 314, avg 314
Indexing sequences 0%  Indexing sequences 0%  Indexing sequences 100%
Masking 0%  Masking 100%
Counting unique k-mers 0%  Counting unique k-mers 0%  Counting unique k-mers 100%
Creating index of unique k-mers 0%  Creating index of unique k-mers 0%  Creating index of unique k-mers 100%
Searching 0%  Searching 1%  Searching 3%  Searching 4%  Searching 5%  Searching 7%  Searching 8%  Searching 10%  Searching 11%  Searching 14%  Searching 16%  Searching 17%  Searching 20%  Searching 20%  Searching 23%  Searching 23%  Searching 26%  Searching 27%  Searching 29%  Searching 30%  Searching 31%  Searching 32%  Searching 33%  Searching 34%  Searching 35%  Searching 36%  Searching 36%  Searching 39%  Searching 40%  Searching 44%  Searc

Parse output and identify sequences to reverse complement.

In [13]:
to_reverse = {}

for i in range(len(OTUs_as_list)):
    otu=OTUs_as_list[i]
    syn=OTUs_synonyms[i]
    print syn
    to_reverse[syn] = []
    blast = open('derep_'+syn+'.fa.blast.out','r')

    for rec in blast:
        cols = rec.strip().split("\t")
        if cols[6] > cols[7]:
            to_reverse[syn].append(cols[0])
        
    print to_reverse[syn]

Crangonyx_floridanus_0_OTU
[]
Crangonyx_pseudogracilis_0_OTU
[]


Reverse complement if necessary.

In [14]:
from Bio import SeqIO

for i in range(len(OTUs_as_list)):
    otu=OTUs_as_list[i]
    syn=OTUs_synonyms[i]
    print syn
    if to_reverse[syn]:
        seqs = SeqIO.parse(open('derep_'+syn+'.fa','r'),'fasta')
        outseqs = []
        for s in seqs:
            print "search: %s" %s.id
            if s.id in to_reverse[syn]:
                print "#%s\t%s" %(to_reverse[syn].index(s.id), to_reverse[syn][to_reverse[syn].index(s.id)])
                s.seq = s.seq.reverse_complement()
                del(to_reverse[syn][to_reverse[syn].index(s.id)])
                outseqs.append(s)
        out = open('derep_'+syn+'.fa','w')
        SeqIO.write(outseqs, out, 'fasta')
        out.close()
    else:
        print "nothing to reverse"

Crangonyx_floridanus_0_OTU
nothing to reverse
Crangonyx_pseudogracilis_0_OTU
nothing to reverse


Write top ten sequences to file and align for manual inspection.

In [15]:
from Bio import SeqIO

for i in range(len(OTUs_as_list)):
    otu=OTUs_as_list[i]
    syn=OTUs_synonyms[i]
    print syn
    seqs = SeqIO.parse(open('derep_'+syn+'.fa','r'),'fasta')
    count=0
    seqs_to_print = []
    for s in seqs:
        print count,s.id
        count+=1
        seqs_to_print.append(s)
        if count > 9:
            break
    out=open('derep_'+syn+'.top10.fa','w')
    SeqIO.write(seqs_to_print,out,'fasta')
    out.close()

Crangonyx_floridanus_0_OTU
0 pcr-NEG-pl1-7-5-Oct-nc|1_2110_26480_22422_1_ex
1 CH-BL2-pl1-7-4-Oct-nc|1_1102_10818_23263_1_ex
2 CH101-1-8-May-nc|1_2114_22385_22080_1_ex
3 CH101-pl1-1-1-Oct-nc|1_1101_19163_12271_1_ex
4 CH101-pl1-1-1-Oct-nc|1_1105_9774_10777_1_ex
5 CH101-pl1-1-1-Oct-nc|1_2105_17136_11806_1_ex
6 CH102-1-7-May-nc|1_1102_17379_27773_1_ex
7 CH102-pl1-1-2-Oct-nc|1_2112_22478_8174_1_ex
8 CH103-pl1-1-3-Oct-nc|1_1102_9913_11787_1_ex
9 CH104-1-5-May-nc|1_1103_23331_12069_1_ex
Crangonyx_pseudogracilis_0_OTU
0 CH113-pl1-2-5-Oct-nc|1_1102_10926_16218_1_ex
1 CH204-2-3-May-nc|1_1113_25654_15317_1_ex
2 CH304-pl1-3-7-Oct-nc|1_1106_17586_25029_1_ex
3 CH501-pl1-4-7-Oct-nc|1_1114_21367_11968_1_ex
4 CH103-1-6-May-nc|1_1101_24330_23874_1_ex
5 CH103-1-6-May-nc|1_1102_10220_11006_1_ex
6 CH103-1-6-May-nc|1_1102_27565_22423_1_ex
7 CH103-1-6-May-nc|1_1102_7800_20792_1_ex
8 CH103-1-6-May-nc|1_1102_8729_18418_1_ex
9 CH103-1-6-May-nc|1_1103_11450_9537_1_ex


Align top 10 with mafft.

In [16]:
%%bash

for file in $(ls -1 *top10*)
do
    out=$(echo -e "$file" | sed 's/fa$/aln.fa/')
    mafft --localpair --maxiterate 1000 $file > $out
done


nseq =  10
distance =  local
iterate =  16
cycle =  1
nthread = 0
lastonce = 0
generating 200PAM scoring matrix for nucleotides ... done
done
done
scoremtx = -1
    0 / 10    1 / 10    2 / 10    3 / 10    4 / 10    5 / 10    6 / 10    7 / 10    8 / 10

##### writing hat3
pairlocalalign (nuc) Version 7.123b alg=L, model=DNA200 (2),  2.000 ( 6.000), -0.099 (-0.297)
0 thread(s)
nthread = 0
blosum 62 / kimura 200
Loading 'hat3' ... 
done.
generating 200PAM scoring matrix for nucleotides ... done
done
done
scoremtx = -1
Gap Penalty = -1.53, +0.00, +0.00
Loading 'hat2' ... done.
Constructing a UPGMA tree ... 
    0 / 10
done.

Progressive alignment ... 
STEP     1 /9 cSTEP     2 /9 cSTEP     3 /9 cSTEP     4 /9 cSTEP     5 /9 cSTEP     6 /9 cSTEP     7 /9 cSTEP     8 /9 cSTEP     9 /9 c
done.
tbfast (nuc) Version 7.123b alg=A, model=DNA200 (2),  1.530 ( 4.590), -0.000 (-0.000)
0 thread(s)
nthread = 0
randomseed = 0
blosum 62 / kimura 200
poffset = 0
niter = 16
Loading 'ha

## Representatives of observed OTUs

Curated minibarcode (Leray CO1 region) sequences were selected as representatives and saved to two separate files (one per taxa).

In [17]:
%%file Crangonyx_floridanus_OTU.minibc.ref.fasta
>CH101-pl1-1-1-Oct-nc|1_2105_17136_11806_1_ex
tttagcatctacagctgctcatagaggtgcttctgtagacttagctattttctctcttcacctagcaggtgcctcctctattttaggttcaattaactttatttccacagtaataaatatacgagtaaaaaatatattaatagaccaaatccctttatttgtttgagctattttcttcactactattcttcttcttcttctttctttacctgttctagcaggagctatcacaatacttttaacagaccgtaatctcaatacatcattctttgacccttctggggggggtgaccctatcttgtaccagcatctctt

Writing Crangonyx_floridanus_OTU.minibc.ref.fasta


In [18]:
%%file Crangonyx_pseudogracilis_OTU.minibc.ref.fasta
>CH304-pl1-3-7-Oct-nc|1_1106_17586_25029_1_ex
ctctatcatcaataacagcccacagaggttcatcagtagacctggctattttttctctccacctagctggtgcatcctcaattttaggagctatcaattttctatccacaataataaatataaaagtaaaaaaccttcttatagaccaagttcctttatttgtttgagcaattttttttacaacaattcttctccttctgtctctacctgttttagccggagctatcactatactattgacagaccgcaatcttaatacatcattctttgatccatcaggaggtggagaccctattctatatcaacatctttt

Writing Crangonyx_pseudogracilis_OTU.minibc.ref.fasta


# Comparing all sequences against the minibc reference sequences.

Create blast databases.

In [19]:
%%bash

makeblastdb -in Crangonyx_floridanus_OTU.minibc.ref.fasta -dbtype nucl -out Crangonyx_floridanus_minibc_ref
makeblastdb -in Crangonyx_pseudogracilis_OTU.minibc.ref.fasta -dbtype nucl -out Crangonyx_pseudogracilis_minibc_ref



Building a new DB, current time: 07/24/2019 18:10:02
New DB name:   Crangonyx_floridanus_minibc_ref
New DB title:  Crangonyx_floridanus_OTU.minibc.ref.fasta
Sequence type: Nucleotide
Keep Linkouts: T
Keep MBits: T
Maximum file size: 1000000000B
Adding sequences from FASTA; added 1 sequences in 0.000191927 seconds.


Building a new DB, current time: 07/24/2019 18:10:02
New DB name:   Crangonyx_pseudogracilis_minibc_ref
New DB title:  Crangonyx_pseudogracilis_OTU.minibc.ref.fasta
Sequence type: Nucleotide
Keep Linkouts: T
Keep MBits: T
Maximum file size: 1000000000B
Adding sequences from FASTA; added 1 sequences in 0.000222921 seconds.


Run Blast on all *_OTU.fa* files, excluding all the dereplicated files.

In [20]:
%%bash

for file in $(ls -1 *.fa | grep "derep" -v)
do
    prefix=$(echo -e "$file" | sed 's/\.fa$//')
    sp=$(echo -e "$file" | cut -d "_" -f 1,2)
        
    blastn -db $sp\_minibc_ref -query $file -outfmt 6 -out $prefix.vs.minibc.blastn.out
done

Parse outputs and identify clipping points for sequences longer than the reference.

In [21]:
full_length=313


global_clips = {}

for i in range(len(OTUs_as_list)):
    otu=OTUs_as_list[i]
    syn=OTUs_synonyms[i]
    print syn
    global_clips[syn] = {}
    blast = open(syn+'.vs.minibc.blastn.out','r')
    
    for OTU in blast:
        cols=OTU.strip().split("\t")
        temp = []
    #    print "#%s" %cols[0]
        if int(cols[8]) < int(cols[9]):
    #        print "\torientation ok"
            temp.append(0)
            #check the first end for overhang
            if int(cols[6]) == 1: #sequence alignment starts at position 1 -> lower clipping point is 0, i.e. not needed
    #            print "\t1 - first end starts with 1 - no clipping on this side"
                temp.append(0)
            else: #Alignment does not start at position 1, then specify clippoint to clip to same length as ref
                temp.append(int(cols[6])-int(cols[8]))
    #            print "\t1 - Alignment starts at pos %s vs. %s in ref - clip at: %s" %(cols[6],cols[8],temp[-1])
            
            #check second end for overhang
            if int(cols[9]) == full_length:
                temp.append(int(cols[7]))
    #            print "\t2 - Full lenght alignment ending at pos %s in query" %cols[7]
            else:
                temp.append((full_length-int(cols[9]))+int(cols[7])) 
    #            print "\t2 - incomplete alignment ends with query pos %s and ref pos %s - clip at: %s" %(cols[7], cols[9], temp[-1])
        
        
        else:
    #        print "\treverse complement"
            temp.append(1)
            if int(cols[6]) == 1: #sequence alignment starts at position 1 -> lower clipping point is 0, i.e. not needed
    #            print "\t1 - first end alignment starts at base 1 - no clipping on this side"
                temp.append(0)
            else:
                temp.append(int(cols[6])-(full_length-int(cols[8]))-1)
    #            print "\t1 - alignment starts with query pos %s and ref pos %s - clip at: %s" %(cols[6], cols[8], temp[-1])
            
            if int(cols[9]) == 1:
    #            print "\t2 - full length alignment ending in pos %s in query" %cols[7]
                temp.append(int(cols[7]))
            else:
                temp.append(int(cols[7])+int(cols[9]))
    #            print "\t2 - incomplete alignment ends with query %s at ref pos %s - clip at: %s" %(cols[7], cols[9], temp[-1])

        for i in range(len(temp)):
            if temp[i] < 0:
                temp[i] = 0
            
#    print "\t"+str(temp)
        global_clips[syn][cols[0]] = temp[:]
    
#print global_clips

Crangonyx_floridanus_0_OTU
Crangonyx_pseudogracilis_0_OTU


Clip and reverese complement based on blast results if necessary.

In [22]:
from Bio import SeqIO

for i in range(len(OTUs_as_list)):
    otu=OTUs_as_list[i]
    syn=OTUs_synonyms[i]
    print syn

    outseqs = []
    seqs = SeqIO.parse(open(syn+'.fa','r'),'fasta')

    for s in seqs:
        s.seq = s.seq[global_clips[syn][s.id][1]:global_clips[syn][s.id][2]]
        if global_clips[syn][s.id][0] == 1:
            s.seq = s.seq.reverse_complement()
        
        outseqs.append(s)
    
    out = open(syn+'.clipped.fasta','w')
    SeqIO.write(outseqs,out,'fasta')
    out.close()

Crangonyx_floridanus_0_OTU
Crangonyx_pseudogracilis_0_OTU


For each OTU bin reads per sample.

In [23]:
from Bio import SeqIO

for i in range(len(OTUs_as_list)):
    otu=OTUs_as_list[i]
    syn=OTUs_synonyms[i]
    print syn
    outseqs = {}
    
    seqs = SeqIO.parse(open(syn+'.clipped.fasta','r'),'fasta')
    for s in seqs:
        sample = s.id.split("|")[0]
        
        if not sample in outseqs:
            outseqs[sample] = []
        outseqs[sample].append(s)
        
    for sample in outseqs:
        print "\twriting data for sample: %s" %sample
        out=open(sample+'.'+syn+'.clipped.fasta','w')
        SeqIO.write(outseqs[sample],out,'fasta')
        out.close()
        
    

Crangonyx_floridanus_0_OTU
	writing data for sample: CH101-pl1-1-1-Oct-nc
	writing data for sample: CH113-pl1-2-5-Oct-nc
	writing data for sample: CH206-2-1-May-nc
	writing data for sample: CH304-pl1-3-7-Oct-nc
	writing data for sample: CH112-pl1-2-4-Oct-nc
	writing data for sample: CH602-pl1-6-2-Oct-nc
	writing data for sample: CH603-pl1-6-3-Oct-nc
	writing data for sample: CH103-pl1-1-3-Oct-nc
	writing data for sample: CH502-pl1-4-8-Oct-nc
	writing data for sample: CH110-2-7-May-nc
	writing data for sample: CH102-1-7-May-nc
	writing data for sample: CH203-2-4-May-nc
	writing data for sample: CH114-pl1-2-6-Oct-nc
	writing data for sample: CH310-pl1-4-5-Oct-nc
	writing data for sample: CH308-pl1-4-3-Oct-nc
	writing data for sample: CH111-pl1-2-3-Oct-nc
	writing data for sample: CH609-pl1-7-1-Oct-nc
	writing data for sample: CH607-pl1-6-7-Oct-nc
	writing data for sample: CH104-1-5-May-nc
	writing data for sample: CH508-6-1-May-nc
	writing data for sample: CH606-pl1-6-6-Oct-nc
	writing d

Identify most abundant OTU per sample.

In [24]:
import glob
import os
import shlex, subprocess

for i in range(len(OTUs_synonyms)):
    print OTUs_synonyms[i],OTUs_as_list[i]
    syn=OTUs_synonyms[i]
    hts_per_OTU=[]
    seqs_per_OTU=[]
    
    for f in glob.glob('*.'+syn+'.clipped.fasta'):
        prefix = f.replace('.fasta','')
        print "#"+prefix,

        #cluster at 100% similarity full length

        mb.vsearch_cluster_full_length(infile=f, cluster_match=float(1), threads=3, sampleID=prefix)
        
        #output original centroid id as chosen by vsearch
        cs=[]
        for c in per_OTU_centroids[OTUs_as_list[i]]:
            if f.split(".")[0] == c.split("|")[0]:
                cs.append(c)
        print " - %s" %cs,
                
        hts_per_OTU.append(mb.find_most_abundant_seq_from_uc(uc=prefix+'.uc'))
        print " -> %s" %hts_per_OTU[-1],
        
        if hts_per_OTU[-1] in cs:
            print " - OK"
        else:
            print " - adjust"
        

#        os.remove(f+'.uc')
#        os.remove(f+'_centroids.fasta')
#        os.remove(f)

    print "extracting hts for %s -> %s" %(syn,syn+'_hts.fasta')
    for r in SeqIO.parse(syn+'.clipped.fasta', 'fasta'):
        if r.id in hts_per_OTU:
            seqs_per_OTU.append(r)
                
    fasta.close()
#    print "final cleanup .. ",
#    os.remove(OTUs_synonyms[i]+'.fa')
#    print "DONE!\n"
        
    out=open(syn+'_hts.fasta', 'w')
    SeqIO.write(seqs_per_OTU, out, 'fasta')
    out.close()

Crangonyx_floridanus_0_OTU CH506-pl1-5-4-Oct-nc|1_1105_11317_10762_1_ex
#CH206-2-1-May-nc.Crangonyx_floridanus_0_OTU.clipped  - ['CH206-2-1-May-nc|1_1114_27579_13621_1_ex']  -> CH206-2-1-May-nc|1_1114_27579_13621_1_ex  - OK
#CH105-pl1-1-5-Oct-nc.Crangonyx_floridanus_0_OTU.clipped  - ['CH105-pl1-1-5-Oct-nc|1_1111_17590_9870_1_ex']  -> CH105-pl1-1-5-Oct-nc|1_1111_17590_9870_1_ex  - OK
#CH504-pl1-5-2-Oct-nc.Crangonyx_floridanus_0_OTU.clipped  - ['CH504-pl1-5-2-Oct-nc|1_1103_6377_16927_1_ex']  -> CH504-pl1-5-2-Oct-nc|1_1103_6377_16927_1_ex  - OK
#pcr-NEG-pl1-7-7-Oct-nc.Crangonyx_floridanus_0_OTU.clipped  - ['pcr-NEG-pl1-7-7-Oct-nc|1_1101_9829_3153_1_ex']  -> pcr-NEG-pl1-7-7-Oct-nc|1_1101_9829_3153_1_ex  - OK
#CH208-3-7-May-nc.Crangonyx_floridanus_0_OTU.clipped  - ['CH208-3-7-May-nc|1_2107_13823_23739_1_ex', 'CH208-3-7-May-nc|1_1102_25191_15965_1_ex', 'CH208-3-7-May-nc|1_1107_24978_13102_1_ex', 'CH208-3-7-May-nc|1_2111_10616_10934_1_ex', 'CH208-3-7-May-nc|1_2113_13133_17253_1_ex']  -> CH208

For each original OTU, cluster the chosen (most abundant) haplotypes for each sample at 100% to remove redundancy and identify the set of unique observed haplotypes.

In [25]:
for f in glob.glob('*OTU_hts.fasta'):
    print f
    prefix = f.replace('.fasta','')
    mb.vsearch_cluster_full_length(infile=f, cluster_match=float(1), threads=3, sampleID=prefix)

Crangonyx_floridanus_0_OTU_hts.fasta
Crangonyx_pseudogracilis_0_OTU_hts.fasta


Align the non-redundant sequences with mafft.

In [26]:
%%bash

for file in $(ls -1 *_hts_centroids*)
do
    out=$(echo -e "$file" | sed 's/fasta$/aln.fasta/')
    mafft --localpair --maxiterate 1000 $file > $out
done



nseq =  64
distance =  local
iterate =  16
cycle =  1
nthread = 0
lastonce = 0
generating 200PAM scoring matrix for nucleotides ... done
done
done
scoremtx = -1
    0 / 64    1 / 64    2 / 64    3 / 64    4 / 64    5 / 64    6 / 64    7 / 64    8 / 64    9 / 64   10 / 64   11 / 64   12 / 64   13 / 64   14 / 64   15 / 64   16 / 64   17 / 64   18 / 64   19 / 64   20 / 64   21 / 64   22 / 64   23 / 64   24 / 64   25 / 64   26 / 64   27 / 64   28 / 64   29 / 64   30 / 64   31 / 64   32 / 64   33 / 64   34 / 64   35 / 64   36 / 64   37 / 64   38 / 64   39 / 64   40 / 64   41 / 64   42 / 64   43 / 64   44 / 64   45 / 64   46 / 64   47 / 64   48 / 64   49 / 64   50 / 64   51 / 64   52 / 64   53 / 64   54 / 64   55 / 64   56 / 64   57 / 64   58 / 64   59 / 64   60 / 64   61 / 64   62 / 64

##### writing hat3
pairlocalalign (nuc) Version 7.123b alg=L, model=DNA200 (2),  2.000 ( 6.000), -0.099 (-0.297)
0 thread(s)
nthread = 0
blosum

#### Manually inspect alignments and remove dubious bases. Curated alignments were saved to directory `haplotype_alignments`.
Cluster observed haplotypes again at 100% similarity to remove redundancy.

In [27]:
from Bio import SeqIO
import glob

for f in glob.glob('../haplotypes_alignments/*'):
    print f
    prefix = f.split("/")[1].replace("_centroids.aln.fasta","")
    print prefix
    out = ""
    seqs = SeqIO.parse(open(f,'r'), 'fasta')
    for s in seqs:
        out+=">%s\n%s\n" %(s.id,str(s.seq).replace("-","").upper())

    fh = open(prefix+'.fasta','w')
    fh.write(out)
    fh.close()
    
    mb.vsearch_cluster_full_length(infile=prefix+'.fasta', cluster_match=float(1), threads=3, sampleID=prefix)

Give unique names to haplotypes and write to file.

In [28]:
from Bio import SeqIO
import glob

for f in glob.glob('*_hts_centroids.fasta'):
    print f
    sp_prefix = f.split("_")[0][0]+f.split("_")[1][0]
    prefix = f.replace("_centroids.fasta","")
    print prefix
    
    count = 1

    seqs = SeqIO.parse(open(f, 'r'), 'fasta')

    fh=open(prefix+'.nr.fasta','w')
    for s in seqs:
        fh.write(">"+sp_prefix+'_UK_Mb-'+"%02d\n%s\n" %(count,s.seq))
        count+=1
    fh.close()

Crangonyx_pseudogracilis_0_OTU_hts_centroids.fasta
Crangonyx_pseudogracilis_0_OTU_hts
Crangonyx_floridanus_0_OTU_hts_centroids.fasta
Crangonyx_floridanus_0_OTU_hts


Concatenate all haplotypes into a single file.

In [29]:
from Bio import SeqIO
import glob

seqs = []

for i in range(len(OTUs_synonyms)):
    print OTUs_synonyms[i],OTUs_as_list[i]
    syn=OTUs_synonyms[i]
    sp="_".join(syn.split("_")[:2])
    print sp
#    prefix = sp.split("_")[0][0]+sp.split("_")[1][0]
#    print prefix
    for r in SeqIO.parse(open(syn+'_hts.nr.fasta','r'), 'fasta'):
        r.description = sp+'|'+r.id
        r.id = r.description
        seqs.append(r)
        
out=open('Crangonyx_from_metaBEAT.fasta', 'w')
SeqIO.write(seqs, out, 'fasta')
out.close()

Crangonyx_floridanus_0_OTU CH506-pl1-5-4-Oct-nc|1_1105_11317_10762_1_ex
Crangonyx_floridanus
Crangonyx_pseudogracilis_0_OTU CH105-1-4-May-nc|1_2105_9715_15085_1_ex
Crangonyx_pseudogracilis


Compare the observed haplotypes to the full set of sequences to identify the samples which contain sequences that receive full length hits from the haplotypes.

In [30]:
%%bash

for s in $(ls -1 *_hts.nr.fasta)
do
    prefix=$(echo -e "$s" | cut -d "." -f 1 | sed 's/_hts//')
    vsearch --usearch_global $s  \
    --strand both \
    --db $prefix.clipped.fasta \
    --id 0.97 --query_cov 1 --maxaccepts 100000 \
    --blast6out $prefix.vs.full.blast.out 
done

vsearch v1.1.0_linux_x86_64, 15.6GB RAM, 8 cores
https://github.com/torognes/vsearch

vsearch v1.1.0_linux_x86_64, 15.6GB RAM, 8 cores
https://github.com/torognes/vsearch



Reading file Crangonyx_floridanus_0_OTU.clipped.fasta 0%  Reading file Crangonyx_floridanus_0_OTU.clipped.fasta 1%  Reading file Crangonyx_floridanus_0_OTU.clipped.fasta 2%  Reading file Crangonyx_floridanus_0_OTU.clipped.fasta 2%  Reading file Crangonyx_floridanus_0_OTU.clipped.fasta 3%  Reading file Crangonyx_floridanus_0_OTU.clipped.fasta 4%  Reading file Crangonyx_floridanus_0_OTU.clipped.fasta 5%  Reading file Crangonyx_floridanus_0_OTU.clipped.fasta 5%  Reading file Crangonyx_floridanus_0_OTU.clipped.fasta 6%  Reading file Crangonyx_floridanus_0_OTU.clipped.fasta 7%  Reading file Crangonyx_floridanus_0_OTU.clipped.fasta 8%  Reading file Crangonyx_floridanus_0_OTU.clipped.fasta 8%  Reading file Crangonyx_floridanus_0_OTU.clipped.fasta 9%  Reading file Crangonyx_floridanus_0_OTU.clipped.fasta 10%  Reading file Crangonyx_floridanus_0_OTU.clipped.fasta 10%  Reading file Crangonyx_floridanus_0_OTU.clipped.fasta 11%  Reading file Crangonyx_floridanus_0_OTU.clipped.fasta

Remove redundancy from Sanger sequences produced with Folmer primers.

In [31]:
%%bash

vsearch --cluster_fast ../../4-infer_phylogeny/sequences/c.flor_c.pse_SANGER_full.fasta --id 1.0 --strand both --threads 3 \
--centroids ../../4-infer_phylogeny/sequences/c.flor_c.pse_SANGER.nr.fasta \
--uc ../../4-infer_phylogeny/sequences/c.flor_c.pse_SANGER.nr.uc --query_cov 1

vsearch v1.1.0_linux_x86_64, 15.6GB RAM, 8 cores
https://github.com/torognes/vsearch



Reading file ../../4-infer_phylogeny/sequences/c.flor_c.pse_SANGER_full.fasta 0%  Reading file ../../4-infer_phylogeny/sequences/c.flor_c.pse_SANGER_full.fasta 8%  Reading file ../../4-infer_phylogeny/sequences/c.flor_c.pse_SANGER_full.fasta 17%  Reading file ../../4-infer_phylogeny/sequences/c.flor_c.pse_SANGER_full.fasta 25%  Reading file ../../4-infer_phylogeny/sequences/c.flor_c.pse_SANGER_full.fasta 33%  Reading file ../../4-infer_phylogeny/sequences/c.flor_c.pse_SANGER_full.fasta 40%  Reading file ../../4-infer_phylogeny/sequences/c.flor_c.pse_SANGER_full.fasta 48%  Reading file ../../4-infer_phylogeny/sequences/c.flor_c.pse_SANGER_full.fasta 56%  Reading file ../../4-infer_phylogeny/sequences/c.flor_c.pse_SANGER_full.fasta 64%  Reading file ../../4-infer_phylogeny/sequences/c.flor_c.pse_SANGER_full.fasta 71%  Reading file ../../4-infer_phylogeny/sequences/c.flor_c.pse_SANGER_full.fasta 79%  Reading file ../../4-infer_phylogeny/sequences/c.flor_c.pse_SANGER_full.fasta 