# Goal: 

* Running CD-HIT on fullCyc OTUs & bac_genome1210 16S rRNA gene dataset
  * cutoff 97% seqID
  * ID OTUs with taxa from both datasets
    * target genomes

# Setting variables

In [13]:
import os

# Project root and the CD-HIT working directory beneath it.
baseDir = '/home/nick/notebook/SIPSim/dev/fullCyc/'
workDir = os.path.join(baseDir, 'CD-HIT')

# Input sequences: 16S genes called from the reference genomes, and the
# OTU representative sequences (stored directly under baseDir).
# NOTE(review): the notebook goal text mentions bac_genome1210 while this path
# points at bac_genome1147 -- confirm which dataset is intended.
rnammerSeqs = '/home/nick/notebook/SIPSim/dev/bac_genome1147/rnammer/bac_genome1147_16S.fna'
otuRepFile = os.path.join(baseDir, 'OTU_reps.fna')

# Taxonomy assignments for the OTUs and the directory of genome fasta files.
otuTaxFile = '/var/seq_data/fullCyc/MiSeq_16SrRNA/515f-806r/lib1-7/OTU_binning/otusn_tax/otusn_tax_assignments.txt'
genomeDir = '/var/seq_data/ncbi_db/genome/Jan2016/bac_complete_spec-rep1_rn/'

# Alternative inputs kept for reference (currently disabled).
#genomeDir = '/home/nick/notebook/SIPSim/dev/bac_genome1210/genomes/'
#otuTableFile = '/var/seq_data/priming_exp/data/otu_table.txt'

# Init

In [3]:
import re
import glob
import itertools
import random
from pprint import pprint
%load_ext rpy2.ipython

In [4]:
%%R
# R packages loaded into the rpy2 session started by `%load_ext rpy2.ipython`.
# NOTE(review): none of these are referenced by the cells visible in this
# section -- confirm they are still needed before keeping the load.
library(ggplot2)
library(dplyr)
library(tidyr)
library(gridExtra)

Attaching package: ‘dplyr’


  res = super(Function, self).__call__(*new_args, **new_kwargs)

    filter, lag


  res = super(Function, self).__call__(*new_args, **new_kwargs)

    intersect, setdiff, setequal, union


  res = super(Function, self).__call__(*new_args, **new_kwargs)


In [5]:
# Make sure the CD-HIT working directory exists before anything is written to it.
if os.path.isdir(workDir):
    pass  # already present; nothing to create
else:
    os.makedirs(workDir)

# Making input fasta

In [8]:
# concatenating sequences
# OTU representatives and genome 16S sequences go into one fasta so that
# CD-HIT can cluster both datasets together; the `cd` determines where
# ssu_all.fna is written.
!cd $workDir; \
    cat $otuRepFile $rnammerSeqs > ssu_all.fna
!printf 'Number of sequences: '
# counting lines that contain ">" (fasta header lines) in the combined file
!cd $workDir; \
    grep -c ">" ssu_all.fna

Number of sequences: 8868


# CD-HIT run

In [9]:
# Cluster the combined fasta with cd-hit-est.
#   -c 0.97 : the 97% sequence identity cutoff stated in the notebook goal
#   -d 0    : per the CD-HIT user guide, write sequence names up to the first
#             whitespace into the .clstr file instead of truncating them
# Outputs: ssu_all_cdhit (representatives) and ssu_all_cdhit.clstr (clusters).
!cd $workDir; \
    cd-hit-est -i ssu_all.fna -o ssu_all_cdhit -c 0.97 -d 0

Program: CD-HIT, V4.6, Feb 20 2014, 09:04:54
Command: cd-hit-est -i ssu_all.fna -o ssu_all_cdhit -c 0.97 -d
         0

Started: Fri Jan 15 13:12:33 2016
                            Output                              
----------------------------------------------------------------
total seq: 8868
longest and shortest : 3198 and 239
Total letters: 7975133
Sequences have been sorted

Approximated minimal memory consumption:
Sequence        : 9M
Buffer          : 1 X 12M = 12M
Table           : 1 X 16M = 16M
Miscellaneous   : 4M
Total           : 43M

Table limit with the given memory limit:
Max number of representatives: 464937
Max number of word counting entries: 94568193

comparing sequences from          0  to       8868
........
     8868  finished       4959  clusters

Apprixmated maximum memory consumption: 61M
writing new database
writing clustering information
program completed !

Total CPU time 6.84


## Finding clusters with sequences from both datasets

In [10]:
# Parse the CD-HIT .clstr file into tbl: dict(cluster ID : list of member rows).
# Each member row is the line split on tab / ', ', i.e.
# [member index, sequence length, '>name... <status>'].
inFile = os.path.join(workDir, 'ssu_all_cdhit.clstr')

tbl = {}
# text mode: the '>' and regex comparisons below operate on str
with open(inFile, 'r') as inFH:
    clst_id = None
    for line in inFH:
        line = line.rstrip()
        if not line:
            # a blank line would otherwise be stored as a malformed member row
            continue
        if line.startswith('>'):
            # remove the literal '>Cluster ' prefix; str.lstrip() would treat
            # the argument as a set of characters rather than a prefix
            clst_id = re.sub(r'^>Cluster\s+', '', line)
            tbl[clst_id] = []
        else:
            tbl[clst_id].append(re.split('\t|, ', line))

print("Number of clusters loaded: {}".format(len(tbl)))

Number of clusters loaded: 4959


In [11]:
# clusters that have '>OTU'  and '>rRNA' (OTU and genome)
def shared_clust(x):
    """Return True if cluster `x` holds both OTU and genome sequences.

    Args:
    x -- list of cluster member rows; the third field of each row is the
         sequence name as written by CD-HIT (with its leading '>')
    """
    seq_names = [row[2] for row in x]
    has_otu = any(name.startswith('>OTU') for name in seq_names)
    has_genome = any(name.startswith('>rRNA') for name in seq_names)
    return has_otu and has_genome

# Keep only the clusters that contain at least one OTU and one genome sequence.
tbl_f = dict((clst, members) for clst, members in tbl.items()
             if shared_clust(members))
print("Number of clusters with OTUs and genomes: {}".format(len(tbl_f)))

Number of clusters with OTUs and genomes: 159


### Getting taxonomic classification of OTUs in target clusters

In [14]:
# loading tax file
# tax: dict(OTU ID : list of nTaxLevels taxonomic names), padded with
# 'Unclassified' when the assignment is shallower than nTaxLevels ranks.
nTaxLevels = 8

tax = {}
# text mode: lines are compared against and split on str values
with open(otuTaxFile, 'r') as inFH:
    for line in inFH:
        line = line.rstrip()
        if not line.startswith('OTU'):
            continue
        # exactly four tab-delimited columns are expected; only the OTU ID and
        # the ';'-delimited classification are used
        otu,cls,boot,_ = line.split('\t')
        # drop leading spaces/underscores from each rank name
        cls = [x.lstrip(' __') for x in cls.split(';')]
        # pad short classifications up to nTaxLevels entries
        cls += ['Unclassified'] * (nTaxLevels - len(cls))
        tax[otu] = cls

In [15]:
def printDict(d, n=10):
    """Print keys (via pprint) and values of `d`, stopping after `n` items.

    Args:
    d -- dict to preview
    n -- number of items after which printing stops; the first item is
         always printed when `d` is non-empty
    """
    for shown, (key, value) in enumerate(d.items(), 1):
        pprint(key)
        print(value)
        if shown >= n:
            break

In [16]:
# preview three entries of the taxonomy dict (OTU ID -> list of rank names)
printDict(tax, n=3)

'OTU.8469'
['Bacteria', 'Proteobacteria', 'Deltaproteobacteria', 'Bdellovibrionales', 'Bacteriovoracaceae', 'Peredibacter', 'uncultured_bacterium', 'Unclassified']
'OTU.11582'
['Bacteria', 'Acidobacteria', 'DA023', 'uncultured_bacterium', 'Unclassified', 'Unclassified', 'Unclassified', 'Unclassified']
'OTU.5687'
['Bacteria', 'Actinobacteria', 'Thermoleophilia', 'Solirubrobacterales', '480-2', 'uncultured_bacterium', 'Unclassified', 'Unclassified']


In [17]:
# adding taxonomic classifications to OTUs in target clusters
# After this cell, field 3 of every member row holds the taxonomy list for
# that sequence, or None when the ID is not in `tax` (e.g. genome sequences).
# The value is assigned by position rather than appended so that re-running
# the cell does not keep growing the rows.

for clstr,x in tbl_f.items():
    for y in x:
        ID = y[2].lstrip('>')
        # drop the '... *' / '... at +/NN%' suffix that CD-HIT adds to names
        ID = re.sub(r'\.\.\..+', '', ID)
        y[3:] = [tax.get(ID)]

In [18]:
# gut check: manual check of OTU classifications & genome names
# Output lines are truncated to 100 characters. Slicing never raises
# IndexError, so no exception handling is needed around the truncation.

for clstr,x in tbl_f.items():
    print('Cluster: {}'.format(clstr))
    for y in x:
        ID = y[2].lstrip('>')
        if ID.startswith('OTU'):
            # classifications; an OTU without a taxonomy entry carries None
            print(':'.join(y[3] or ['no taxonomy found'])[:100])
        elif ID.startswith('rRNA'):
            # genome names
            print(ID[:100])

Cluster: 212
Bacteria:Proteobacteria:Gammaproteobacteria:Pseudomonadales:Moraxellaceae:Acinetobacter:Unclassified
rRNA_NC_010611_Acinetobacter_baumannii_ACICU__Acinetobacter_baumannii_ACICU_3391663-3393187_DIR-... 
rRNA_NC_010611_Acinetobacter_baumannii_ACICU__Acinetobacter_baumannii_ACICU_40143-41667_DIR+... at +
rRNA_NC_010611_Acinetobacter_baumannii_ACICU__Acinetobacter_baumannii_ACICU_3891232-3892756_DIR-... 
rRNA_NC_010611_Acinetobacter_baumannii_ACICU__Acinetobacter_baumannii_ACICU_686949-688473_DIR+... at
rRNA_NC_010611_Acinetobacter_baumannii_ACICU__Acinetobacter_baumannii_ACICU_216608-218132_DIR+... at
rRNA_NC_010611_Acinetobacter_baumannii_ACICU__Acinetobacter_baumannii_ACICU_3424880-3426404_DIR-... 
rRNA_NC_014259_Acinetobacter_oleivorans_DR1__Acinetobacter_oleivorans_DR1_3479770-3481295_DIR-... at
rRNA_NC_014259_Acinetobacter_oleivorans_DR1__Acinetobacter_oleivorans_DR1_18773-20298_DIR+... at +/1
rRNA_NC_014259_Acinetobacter_oleivorans_DR1__Acinetobacter_oleivorans_DR1_4120

__Notes:__

* At least most of the taxonomic classifications make sense for the genomes in each cluster

### Writing out a list of target genomes and their corresponding OTUs

* If an OTU has multiple associations with a genome, selecting 1 at random
  * i.e., 1-to-1 association

In [24]:
# making an index of scaffolds for each genome
# glob returns files in arbitrary order; sorting makes the index deterministic
# (if two files share a sequence ID, the later file in the list wins).
genome_list = sorted(glob.glob(os.path.join(genomeDir, '*.fna')))

def fasta_seqID(fastaFiles):
    """Index fasta header lines by the file in which they were found.

    Args:
    fastaFiles -- iterable of fasta file paths

    Returns:
    dict(header line without the leading '>' or trailing whitespace : file path).
    If the same header occurs in several files, the last file read wins.
    """
    file_seqID = {}
    for f in fastaFiles:
        # text mode: the '>' checks below are str operations, which a
        # binary-mode handle would break under Python 3
        with open(f, 'r') as iFH:
            for line in iFH:
                if line.startswith('>'):
                    file_seqID[line.lstrip('>').rstrip()] = f
    return file_seqID

# sequence name -> genome fasta file, for every genome file found
seqID_fastaID = fasta_seqID(genome_list)
index_msg = 'Index length: {}'
print(index_msg.format(len(seqID_fastaID)))

Index length: 1231


In [25]:
def write_targets(tbl, oFH, seqID_fastaID, seed=None):
    """Write a tab-delimited table pairing each OTU with one genome sequence
    from the same cluster.

    Within a cluster the genome 16S sequences are shuffled and then cycled
    over the OTUs, so every OTU gets exactly one (randomly chosen) target.

    Args:
    tbl -- cd-hit results: dict(cluster ID : list of member rows), where
           row[2] is the '>name...' field and row[3] is the taxonomy list
           (or None) for OTU rows
    oFH -- output file handle (must accept str)
    seqID_fastaID -- dict(seqName : fastaFileName)
    seed -- optional seed for a private random generator, making the
            OTU-genome pairing reproducible; None uses the global `random` state
    """
    rng = random if seed is None else random.Random(seed)

    oFH.write('\t'.join(['cluster', 'ssu_ID', 'genome_fileID', 'genomeID',
                         'genome_seqID', 'OTU', 'OTU_taxonomy']) + '\n')
    for clstr,rows in tbl.items():
        # parsing cluster; getting all OTUs and genome_IDs
        targets = []
        otus = []
        for row in rows:
            ID = row[2].lstrip('>')
            # drop the '... *' / '... at +/NN%' suffix added by CD-HIT
            ID = re.sub(r'\.\.\..+', '', ID)
            if ID.startswith('OTU'):
                # an OTU missing from the taxonomy table carries None;
                # write an empty taxonomy field instead of failing on join
                taxonomy = row[3] if len(row) > 3 and row[3] else []
                otus.append([ID, ':'.join(taxonomy)])
            elif ID.startswith('rRNA'):
                targets.append(ID)

        # writing out list
        ## 1 randomly selected genome sequence is associated with each OTU
        rng.shuffle(targets)
        for otu, target in zip(otus, itertools.cycle(targets)):
            # genome sequence name
            seqID = target[5:]  # removing rRNA_
            # removing the trailing '_<start>-<end>_DIR<strand>' coordinates
            seqID = re.sub(r'_\d+-\d+_DIR.+', '', seqID)
            # genome file name
            try:
                fileID = seqID_fastaID[seqID]
            except KeyError:
                msg = 'Cannot find "{}"'
                print(msg.format(seqID))
                fileID = ''
                genomeID = ''
            else:
                # genome name: file base name without its extension
                genomeID = os.path.splitext(os.path.split(fileID)[1])[0]

            # writing row
            oFH.write('\t'.join([clstr, target, fileID, genomeID, seqID] + otu) + '\n')

In [26]:
# Write the OTU-to-genome target table into the working directory.
outFile = os.path.join(workDir, 'target_taxa.txt')
# text mode: write_targets writes str rows to the handle
with open(outFile, 'w') as oFH:
    write_targets(tbl_f, oFH, seqID_fastaID)

### Gut check on written file

* Multiple OTUs will likely cluster with some representative genomes
  * Each table row pairs one OTU with one genome, so the number of target genomes should be <= the number of target OTUs

In [27]:
!printf "Number of rows in table: "
# tail -n +2 skips the header line, so only data rows are counted
!cd $workDir; \
    tail -n +2 target_taxa.txt | wc -l

Number of rows in table: 194


In [28]:
!printf "Number of clusters: "
# column 1 of target_taxa.txt is 'cluster'; count its unique values
!cd $workDir; \
    tail -n +2 target_taxa.txt | cut -f 1 | sort -u | wc -l

Number of clusters: 159


In [29]:
!printf "Number of target genomes: "
# column 3 of target_taxa.txt is 'genome_fileID'; count its unique values
!cd $workDir; \
    tail -n +2 target_taxa.txt | cut -f 3 | sort -u | wc -l

Number of target genomes: 165


In [30]:
!printf "Number of OTUs: "
!cd $workDir; \
    tail -n +2 target_taxa.txt | cut -f 4 | sort -u | wc -l

Number of OTUs: 165
