# Goal: 

* Running CD-HIT on fullCyc OTUs & bac_genome1210 16S rRNA gene dataset
  * cutoff 97% seqID
  * Identify clusters (97% OTUs) that contain sequences from both datasets
    * the genomes in these clusters are the target genomes

# User variables

In [15]:
# `os` is imported in this cell because the cell runs before the "Init"
# import cell and already needs os.path.join; without it a fresh kernel
# fails here with a NameError.
import os

# directory holding the fullCyc OTU data; CD-HIT input/output goes in workDir
baseDir = '/home/nick/notebook/SIPSim/dev/fullCyc/'
workDir = os.path.join(baseDir, 'CD-HIT')

# 16S rRNA gene sequences of the bac_genome1210 genomes (fasta)
rnammerSeqs = '/home/nick/notebook/SIPSim/dev/bac_genome1210/rnammer/bac_genome1210_16S.fna'
# fullCyc OTU representative sequences (fasta)
otuRepFile = '/home/nick/notebook/SIPSim/dev/fullCyc/OTU_reps.fna'
# tab-delimited taxonomic assignments of the fullCyc OTUs
otuTaxFile = '/home/nick/notebook/fullCyc/data/MiSeq_16S/515f-806r/V4_Lib1-7/OTU_binning/otusn_tax/otusn_tax_assignments.txt'
# directory of the bac_genome1210 genomes
genomeDir = '/home/nick/notebook/SIPSim/dev/bac_genome1210/genomes/'
#otuTableFile = '/var/seq_data/priming_exp/data/otu_table.txt'

# Init

In [19]:
import glob
import itertools
import os
import random
import re
from pprint import pprint

In [4]:
%load_ext rpy2.ipython

In [5]:
%%R
library(ggplot2)
library(dplyr)
library(tidyr)
library(gridExtra)

Attaching package: ‘dplyr’


  res = super(Function, self).__call__(*new_args, **new_kwargs)

    filter, lag


  res = super(Function, self).__call__(*new_args, **new_kwargs)

    intersect, setdiff, setequal, union


  res = super(Function, self).__call__(*new_args, **new_kwargs)


In [6]:
# create the CD-HIT working directory (and any missing parents) on first use
workDirMissing = not os.path.isdir(workDir)
if workDirMissing:
    os.makedirs(workDir)

# Making input fasta

In [10]:
# concatenating sequences
!cd $workDir; \
    cat $otuRepFile $rnammerSeqs > ssu_all.fna
!cd $workDir; \
    grep -c ">" ssu_all.fna

8927


# CD-HIT run

In [11]:
!cd $workDir; \
    cd-hit-est -i ssu_all.fna -o ssu_all_cdhit -c 0.97 -d 0

Program: CD-HIT, V4.6, Feb 20 2014, 09:04:54
Command: cd-hit-est -i ssu_all.fna -o ssu_all_cdhit -c 0.97 -d
         0

Started: Sat Oct 17 12:57:17 2015
                            Output                              
----------------------------------------------------------------
total seq: 8927
longest and shortest : 3286 and 239
Total letters: 8078543
Sequences have been sorted

Approximated minimal memory consumption:
Sequence        : 9M
Buffer          : 1 X 12M = 12M
Table           : 1 X 16M = 16M
Miscellaneous   : 4M
Total           : 43M

Table limit with the given memory limit:
Max number of representatives: 465479
Max number of word counting entries: 94567221

comparing sequences from          0  to       8927
........
     8927  finished       5017  clusters

Apprixmated maximum memory consumption: 62M
writing new database
writing clustering information
program completed !

Total CPU time 6.87


## Finding clusters with sequences from both datasets

In [12]:
inFile = os.path.join(workDir, 'ssu_all_cdhit.clstr')

# tbl: {cluster_id : [row, ...]}
#   each row is one member line of the cluster, split on tab and ', '
#   (3 fields for the lines CD-HIT produced here; field 3 is '>' + sequence
#   name followed by '... *' or '... at <strand>/<identity>')
tbl = {}
with open(inFile, 'rb') as inFH:
    clst_id = None
    for line in inFH:
        line = line.rstrip()
        if not line:
            # tolerate blank lines instead of failing on tbl[None]
            continue
        if line.startswith('>'):
            # header line, e.g. '>Cluster 12'
            # remove the literal prefix; str.lstrip('>Cluster ') would strip
            # any leading run of those *characters*, not the prefix itself
            clst_id = re.sub(r'^>Cluster\s*', '', line)
            tbl[clst_id] = []
        else:
            tbl[clst_id].append(re.split('\t|, ', line))

print("Number of clusters loaded: {}".format(len(tbl)))

Number of clusters loaded: 5017


In [13]:
# clusters that have '>OTU'  and '>rRNA' (OTU and genome)
def shared_clust(x):
    """Return True if cluster `x` holds both an OTU and a genome sequence.

    `x` is a list of cluster member rows; the 3rd field of each row is the
    sequence name, starting with '>OTU' (OTU representative) or '>rRNA'
    (genome 16S rRNA gene).
    """
    has_otu = False
    has_genome = False
    for member in x:
        seq_name = member[2]
        if seq_name.startswith('>OTU'):
            has_otu = True
        if seq_name.startswith('>rRNA'):
            has_genome = True
    return has_otu and has_genome

# keep only the clusters that contain both OTU and genome sequences
tbl_f = dict((clst_id, members)
             for clst_id, members in tbl.items()
             if shared_clust(members))
print("Number of clusters with OTUs and genomes: {}".format(len(tbl_f)))

Number of clusters with OTUs and genomes: 162


### Getting taxonomic classification of OTUs in target clusters

In [16]:
# loading tax file
## tax: {OTU_ID : list of 8 taxonomic levels}
## only lines starting with 'OTU' are read; each must have 4 tab-delimited fields
tax = {}
with open(otuTaxFile, 'rb') as inFH:
    for line in inFH:
        line = line.rstrip()
        if line.startswith('OTU'):
            otu,cls,boot,_ = line.split('\t')
            # ';'-delimited levels; leading spaces/underscores removed
            cls = [x.lstrip(' __') for x in cls.split(';')]
            # pad short classifications so every OTU has (at least) 8 levels
            cls.extend(['Unclassified'] * (8 - len(cls)))
            tax[otu] = cls

In [17]:
def printDict(d, n=10):
    """Print the first `n` key/value pairs of dict `d`.

    Each key is pretty-printed on one line, followed by its value on the
    next. The count is checked after printing, so a non-empty dict always
    shows at least one pair, even when `n` is less than 1.
    """
    for shown, (key, value) in enumerate(d.items(), 1):
        pprint(key)
        print(value)
        if shown >= n:
            break

In [20]:
# peek at a few parsed entries: OTU ID, then its list of taxonomic levels
printDict(tax, n=3)

'OTU.8469'
['Bacteria', 'Proteobacteria', 'Deltaproteobacteria', 'Bdellovibrionales', 'Bacteriovoracaceae', 'Peredibacter', 'uncultured_bacterium', 'Unclassified']
'OTU.11582'
['Bacteria', 'Acidobacteria', 'DA023', 'uncultured_bacterium', 'Unclassified', 'Unclassified', 'Unclassified', 'Unclassified']
'OTU.5687'
['Bacteria', 'Actinobacteria', 'Thermoleophilia', 'Solirubrobacterales', '480-2', 'uncultured_bacterium', 'Unclassified', 'Unclassified']


In [21]:
# adding taxonomic classifications to OTUs in target clusters
## each row gets a 4th element: the taxonomy list of the sequence, or None
## if the sequence has no entry in `tax` (`tax` only holds 'OTU...' IDs,
## so genome rRNA rows always get None)

for clstr,x in tbl_f.items():
    for y in x:
        ID = y[2].lstrip('>')
        # drop the CD-HIT suffix ('... *' or '... at +/NN.NN%') from the name
        ID = re.sub(r'\.\.\..+', '', ID)
        # slice assignment instead of append keeps this cell idempotent:
        # re-running it replaces the annotation rather than stacking
        # another element onto the row
        y[3:] = [tax.get(ID)]

In [22]:
# gut check: manual check of OTU classifications & genome names 
## output is truncated to 100 characters per line

for clstr,x in tbl_f.items():
    print('Cluster: {}'.format(clstr))
    for y in x:
        ID = y[2].lstrip('>')
        if ID.startswith('OTU'):
            # classifications
            # (slicing never raises IndexError, so no try/except is needed;
            #  the real failure mode is an OTU without a taxonomy entry)
            taxonomy = y[3] if len(y) > 3 else None
            if taxonomy is None:
                print('NA (no taxonomic classification)')
            else:
                print(':'.join(taxonomy)[:100])
        elif ID.startswith('rRNA'):
            # genome names
            print(ID[:100])

Cluster: 210
Bacteria:Proteobacteria:Gammaproteobacteria:Xanthomonadales:Xanthomonadaceae:Pseudoxanthomonas:Uncla
rRNA_CP003093_Pseudoxanthomonas_spadix_BD_a59_3414517-3416053_DIR-... *
Cluster: 668
Bacteria:Firmicutes:Clostridia:Clostridiales:Clostridiaceae:Clostridium:uncultured_bacterium:Unclass
rRNA_CP006903_Clostridium_botulinum_202F_2486924-2488425_DIR-... at +/99.67%
rRNA_CP006903_Clostridium_botulinum_202F_3019041-3020542_DIR+... at +/99.73%
rRNA_CP006903_Clostridium_botulinum_202F_2480928-2482428_DIR-... at +/98.60%
rRNA_CP006903_Clostridium_botulinum_202F_2390736-2392237_DIR-... at +/99.87%
rRNA_CP006903_Clostridium_botulinum_202F_2781623-2783125_DIR+... *
rRNA_CP006903_Clostridium_botulinum_202F_2407620-2409121_DIR-... at +/99.73%
rRNA_CP006903_Clostridium_botulinum_202F_2513527-2515028_DIR-... at +/99.87%
rRNA_CP006903_Clostridium_botulinum_202F_2605272-2606773_DIR+... at +/99.73%
rRNA_CP006903_Clostridium_botulinum_202F_2499962-2501463_DIR-... at +/99.47%
rRNA_CP006903_Clo

__Notes:__

* At least most of the taxonomic classifications make sense for the genomes in each cluster

### Writing out a list of target genomes and their corresponding OTUs

* If an OTU clusters with multiple genomes, 1 genome is selected at random
  * i.e., each OTU is associated with exactly 1 target genome (1-to-1 association)

In [23]:
def write_targets(tbl, fh):
    """Write a tab-delimited table pairing each OTU with one genome of its cluster.

    Parameters
    ----------
    tbl : dict
        {cluster_id : rows}. Each row is a list whose 3rd field is the
        sequence name ('>OTU...' or '>rRNA_...', optionally followed by the
        CD-HIT '...' suffix). OTU rows carry their taxonomy (a list of
        levels, or None) as the 4th field.
    fh : file-like object
        Open, writable handle; one header line plus one line per OTU is
        written with the columns
        cluster, ssu_ID, target_genome, OTU, OTU_taxonomy.

    Notes
    -----
    * The genomes of a cluster are shuffled and then cycled over the OTUs,
      so every OTU gets exactly one genome. The shuffle uses the global
      `random` state; call `random.seed()` beforehand for a reproducible
      table.
    * OTUs without a taxonomy are written with 'NA' in the last column.
    * Clusters without any genome sequence produce no rows.
    """
    fh.write('\t'.join(['cluster', 'ssu_ID', 'target_genome', 'OTU', 'OTU_taxonomy']) + '\n')
    for clstr,rows in tbl.items():
        # parsing cluster; getting all OTUs and genome_IDs
        targets = []
        otus = []
        for row in rows:
            ID = row[2].lstrip('>')
            # drop the CD-HIT suffix ('... *' or '... at +/NN.NN%')
            ID = re.sub(r'\.\.\..+', '', ID)
            if ID.startswith('OTU'):
                taxonomy = row[3] if len(row) > 3 else None
                # a missing classification must not abort the whole table
                tax_str = 'NA' if taxonomy is None else ':'.join(taxonomy)
                otus.append([ID, tax_str])
            elif ID.startswith('rRNA'):
                targets.append(ID)

        # writing out list
        ## only 1 randomly selected genome is associated with each OTU
        random.shuffle(targets)
        for otu, target in zip(otus, itertools.cycle(targets)):
            # remove the literal 'rRNA_' prefix; str.lstrip('rRNA_') would
            # also eat leading 'N', 'A', 'R', 'r' or '_' characters of the
            # accession itself (e.g. 'NC_000913' -> 'C_000913')
            genome = re.sub(r'^rRNA_', '', target)
            # remove the '_<start>-<end>_DIR<strand>' gene location
            genome = re.sub(r'_\d+-\d+_DIR.+', '', genome)
            fh.write('\t'.join([clstr, target, genome] + otu) + '\n')

In [24]:
# fixed seed: write_targets() pairs OTUs with genomes via random.shuffle(),
# so without it the table would differ on every run of the notebook
random.seed(42)

outFile = os.path.join(workDir, 'target_taxa.txt')
with open(outFile, 'wb') as oFH:
    write_targets(tbl_f, oFH)

#### Gut check on written file

In [25]:
# number of data rows (`tail -n +2` skips the header line)
!printf "Number of rows in table: "
!cd $workDir; \
    tail -n +2 target_taxa.txt | wc -l

Number of rows in table: 199


In [26]:
# number of distinct values in column 1 ('cluster')
!printf "Number of clusters: "
!cd $workDir; \
    tail -n +2 target_taxa.txt | cut -f 1 | sort -u | wc -l

Number of clusters: 162


In [27]:
# number of distinct values in column 3 ('target_genome')
!printf "Number of target genomes: "
!cd $workDir; \
    tail -n +2 target_taxa.txt | cut -f 3 | sort -u | wc -l

Number of target genomes: 169


In [28]:
# number of distinct values in column 4 ('OTU')
!printf "Number of OTUs: "
!cd $workDir; \
    tail -n +2 target_taxa.txt | cut -f 4 | sort -u | wc -l

Number of OTUs: 199
