# Goal

Accuracy as a function of isotope incorporation

### Variable parameters:

* atom % isotope incorporation
  * 0, 25, 50, 100
* % taxa that incorporate
  * __1, 5, 10, 25, 50__
* n-reps (stochastic: taxon abundances & which taxa incorporate)
  * 10 

## Init

In [1]:
import os
import glob
import itertools
import nestly

In [2]:
%load_ext rpy2.ipython
%load_ext pushnote

In [3]:
%%R
library(ggplot2)
library(dplyr)
library(tidyr)
library(gridExtra)

http://groups.google.com/group/ggplot2.

  res = super(Function, self).__call__(*new_args, **new_kwargs)
Attaching package: ‘dplyr’


  res = super(Function, self).__call__(*new_args, **new_kwargs)

    filter, lag


  res = super(Function, self).__call__(*new_args, **new_kwargs)

    intersect, setdiff, setequal, union


  res = super(Function, self).__call__(*new_args, **new_kwargs)


### BD min/max

In [4]:
# Buoyant density (BD) window used for the simulated gradients.
# G+C content (percent) is mapped to BD with the linear relation
#   BD = (G+C / 100) * 0.098 + 1.66
# and the upper bound is then widened by the maximum 13C-induced BD shift.

## min G+C cutoff
min_GC = 13.5
## max G+C cutoff
max_GC = 80
## max shift in BD from 13C incorporation
max_13C_shift_in_BD = 0.036


def GC_to_BD(GC):
    """Convert a G+C content given in percent to a buoyant density."""
    return GC / 100.0 * 0.098 + 1.66


min_BD = GC_to_BD(min_GC)
max_BD = GC_to_BD(max_GC) + max_13C_shift_in_BD

# Single-argument print(...) gives the same output on Python 2 and also
# runs on Python 3 (the bare print statement does not).
print('Min BD: {}'.format(min_BD))
print('Max BD: {}'.format(max_BD))

Min BD: 1.67323
Max BD: 1.7744


# Nestly

* assuming fragments already simulated

In [5]:
# Directory layout: everything for this run is built under buildDir,
# which sits inside workDir.
workDir = '/home/nick/notebook/SIPSim/dev/bac_genome1147/'
buildDir = os.path.join(workDir, 'atomIncorp_taxaIncorp')
R_dir = '/home/nick/notebook/SIPSim/lib/R/'

# Input files (the fragment KDEs live in the validation folder of workDir)
fragFile = os.path.join(workDir, 'validation', 'ampFrags_kde.pkl')
genome_index = '/var/seq_data/ncbi_db/genome/Jan2016/bac_complete_spec-rep1_rn/genome_index.txt'

In [6]:
# Create the build directory if it does not exist yet, then make it the
# working directory for the cells that follow.
if not os.path.isdir(buildDir):
    os.makedirs(buildDir)
%cd $buildDir

/home/nick/notebook/SIPSim/dev/bac_genome1147/atomIncorp_taxaIncorp


In [7]:
# making an experimental design file for qSIP
x = range(1,7)
y = ['control', 'treatment']

expDesignFile = os.path.join(buildDir, 'qSIP_exp_design.txt')
with open(expDesignFile, 'wb') as outFH:
    for i,z in itertools.izip(x,itertools.cycle(y)):
        line = '\t'.join([str(i),z])
        outFH.write(line + '\n')

!head $expDesignFile       

1	control
2	treatment
3	control
4	treatment
5	control
6	treatment


In [8]:
# Display the full path of the experimental design file
expDesignFile 

'/home/nick/notebook/SIPSim/dev/bac_genome1147/atomIncorp_taxaIncorp/qSIP_exp_design.txt'

## Nestly params

In [9]:
# Nestly tree: the varying parameters are added with the default
# create_dir behaviour, every other value is added with create_dir=False.
nest = nestly.Nest()

# varying params: reduced set used for test runs
for key, levels in [('fracIncorp', [0.5]),
                    ('percTaxa', [10]),
                    ('rep', range(1,3))]:
    nest.add(key, levels)


# varying params: full design (currently disabled)
#nest.add('fracIncorp', [0, 0.25, 0.5, 1])
#nest.add('percTaxa', [1, 5, 10, 25, 50])
#nest.add('rep', range(1,11))

## set params, followed by input/output files (same order as before)
constants = [
    ('abs', '1e9'),
    ('np', 10),
    ('subsample_dist', 'lognormal'),
    ('subsample_mean', 9.432),
    ('subsample_scale', 0.5),
    ('subsample_min', 10000),
    ('subsample_max', 30000),
    ('min_BD', min_BD),
    ('max_BD', max_BD),
    ('DBL_scaling', 0.5),
    ('bandwidth', 0.8),
    ('heavy_BD_min', 1.71),
    ('heavy_BD_max', 1.75),
    ('topTaxaToPlot', 100),
    ('padj', 0.1),
    ('log2', 0.25),
    ### input/output files
    ('buildDir', buildDir),
    ('R_dir', R_dir),
    ('genome_index', genome_index),
    ('fragFile', fragFile),
    ('exp_design', expDesignFile),
]
for key, value in constants:
    nest.add(key, [value], create_dir=False)


# building directory tree
nest.build(buildDir)

# bash file to run
bashFile = os.path.join(buildDir, 'SIPSimRun.sh')

## Experimental design

In [10]:
%%writefile $bashFile
#!/bin/bash
# Experimental-design template, run by nestrun in each nest directory.
# Brace-wrapped fields are nest parameters (see the nest.add calls).
# Files written here: incorp.config, incorporators.txt, comm.txt, fracs.txt
#
# NOTE(review): KDE_selectTaxa -f is given fracIncorp and incorpConfigExample
# --percTaxa is given percTaxa; no option in this script visibly sets the
# atom % incorporation listed in the notebook goal. Confirm against the
# SIPSim CLI that -f is meant to receive fracIncorp and not a taxa fraction.

echo '#-- Experimental design --#'

echo '# Making an isotope incorporation config file'
echo '## 3 replicate gradients for control & treatment'
SIPSim incorpConfigExample \
  --percTaxa {percTaxa} \
  --n_reps 3 \
  > incorp.config

echo '# Selecting incorporator taxa'
echo '## This is to make the gradient replicates consistent (qSIP finds mean among replicates)'
SIPSim KDE_selectTaxa \
    -f {fracIncorp} \
    {fragFile} \
    > incorporators.txt

echo '# Creating a community file (3 replicate control, 3 replicate treatment)'
SIPSim communities \
    --config incorp.config \
    {genome_index} \
    > comm.txt    

echo '# simulating gradient fractions'
SIPSim gradient_fractions \
    --BD_min {min_BD} \
    --BD_max {max_BD} \
    comm.txt \
    > fracs.txt        

Writing /home/nick/notebook/SIPSim/dev/bac_genome1147/atomIncorp_taxaIncorp/SIPSimRun.sh


In [11]:
!chmod 777 $bashFile
!cd $workDir; \
    nestrun --template-file $bashFile -d atomIncorp_taxaIncorp --log-file exp_design.log -j 10

2016-03-19 13:13:17,834 * INFO * Template: ./SIPSimRun.sh
2016-03-19 13:13:17,836 * INFO * [213609] Started ./SIPSimRun.sh in atomIncorp_taxaIncorp/0.5/10/2
2016-03-19 13:13:17,838 * INFO * [213610] Started ./SIPSimRun.sh in atomIncorp_taxaIncorp/0.5/10/1
2016-03-19 13:13:21,451 * INFO * [213609] atomIncorp_taxaIncorp/0.5/10/2 Finished with 0
2016-03-19 13:13:21,451 * INFO * [213610] atomIncorp_taxaIncorp/0.5/10/1 Finished with 0


## SIPSim pipeline

In [None]:
%%writefile $bashFile
#!/bin/bash
# SIPSim pipeline template, run by nestrun in each nest directory.
# Brace-wrapped fields are nest parameters (see the nest.add calls).
# Reads comm.txt, fracs.txt, incorp.config and incorporators.txt, which the
# experimental-design script writes into the same directory.
# Each step consumes the file written by the step before it:
#   fragFile -> *_dif.pkl -> *_dif_DBL.pkl -> *_dif_DBL_inc.pkl
#   -> OTU table -> _PCR -> _PCR_sub -> wide table + sample metadata

echo '#-- SIPSim pipeline --#'    
    
echo '# Adding diffusion'    
SIPSim diffusion \
    -n 100000 \
    --bw {bandwidth} \
    --np {np} \
    {fragFile} \
    > ampFrags_KDE_dif.pkl    

echo '# Adding DBL contamination; abundance-weighted smearing'
SIPSim DBL \
    -n 100000 \
    --comm comm.txt \
    --commx {DBL_scaling} \
    --np {np} \
    ampFrags_KDE_dif.pkl \
    > ampFrags_KDE_dif_DBL.pkl 

echo '# Adding isotope incorporation to BD distribution'
SIPSim isotope_incorp \
    -n 100000 \
    --comm comm.txt \
    --taxa incorporators.txt \
    --np {np} \
    ampFrags_KDE_dif_DBL.pkl \
    incorp.config \
    > ampFrags_KDE_dif_DBL_inc.pkl

echo '# Simulating an OTU table'
SIPSim OTU_table \
    --abs {abs} \
    --np {np} \
    ampFrags_KDE_dif_DBL_inc.pkl \
    comm.txt \
    fracs.txt \
    > OTU_abs{abs}.txt
    
echo '# Simulating PCR'
SIPSim OTU_PCR \
    OTU_abs{abs}.txt \
    > OTU_abs{abs}_PCR.txt    
    
echo '# Subsampling from the OTU table (simulating sequencing of the DNA pool)'
SIPSim OTU_subsample \
    --dist {subsample_dist} \
    --dist_params mean:{subsample_mean},sigma:{subsample_scale} \
    --min_size {subsample_min} \
    --max_size {subsample_max} \
    OTU_abs{abs}_PCR.txt \
    > OTU_abs{abs}_PCR_sub.txt
        
echo '# Making a wide-formatted table'
SIPSim OTU_wideLong -w \
    OTU_abs{abs}_PCR_sub.txt \
    > OTU_abs{abs}_PCR_sub_w.txt
    
echo '# Making metadata (phyloseq: sample_data)'
SIPSim OTU_sampleData \
    OTU_abs{abs}_PCR_sub.txt \
    > OTU_abs{abs}_PCR_sub_meta.txt
    
    

#-- removing large intermediate files --#
# (clean-up is currently disabled, so the three .pkl intermediates are kept)
#rm -f ampFrags_KDE_dif.pkl
#rm -f ampFrags_KDE_dif_DBL.pkl
#rm -f ampFrags_KDE_dif_DBL_inc.pkl    

Overwriting /home/nick/notebook/SIPSim/dev/bac_genome1147/atomIncorp_taxaIncorp/SIPSimRun.sh


In [None]:
!chmod 777 $bashFile
!cd $workDir; \
    nestrun --template-file $bashFile -d atomIncorp_taxaIncorp --log-file SIPSim_pipeline.log -j 2

2016-03-19 13:13:21,736 * INFO * Template: ./SIPSimRun.sh
2016-03-19 13:13:21,738 * INFO * [213735] Started ./SIPSimRun.sh in atomIncorp_taxaIncorp/0.5/10/2
2016-03-19 13:13:21,741 * INFO * [213736] Started ./SIPSimRun.sh in atomIncorp_taxaIncorp/0.5/10/1


In [None]:
# Line magic from the pushnote extension loaded in the Init section;
# presumably sends a notification with this message - confirm.
%pushnote SIPSim pipeline complete

## HR-SIP

In [None]:
%%writefile $bashFile
#!/bin/bash
# HR-SIP analysis template, run by nestrun in each nest directory.
# Brace-wrapped fields are nest parameters (see the nest.add calls).
# Inputs are the OTU tables and metadata written by the SIPSim pipeline script.

#-- R analysis --#
# put the R script directory first on PATH so the .r scripts below resolve
export PATH={R_dir}:$PATH
   

# plotting 'raw' taxon abundances
OTU_taxonAbund.r \
    OTU_abs{abs}.txt \
    -r {topTaxaToPlot} \
    -o OTU_abs{abs}
# plotting 'sequenced' taxon abundances
OTU_taxonAbund.r \
    OTU_abs{abs}_PCR_sub.txt \
    -r {topTaxaToPlot} \
    -o OTU_abs{abs}_PCR_sub

# running DeSeq2
## making phyloseq object from OTU table
phyloseq_make.r \
    OTU_abs{abs}_PCR_sub_w.txt \
    -s OTU_abs{abs}_PCR_sub_meta.txt \
    > OTU_abs{abs}_PCR_sub.physeq
## filtering phyloseq object to just 'heavy' fractions
phyloseq_edit.r \
    OTU_abs{abs}_PCR_sub.physeq \
    --BD_min {heavy_BD_min} \
    --BD_max {heavy_BD_max} \
    > OTU_abs{abs}_PCR_sub_filt.physeq

## making ordination
phyloseq_ordination.r \
    OTU_abs{abs}_PCR_sub_filt.physeq \
    OTU_abs{abs}_PCR_sub_filt_bray-NMDS.pdf

## DESeq2
phyloseq_DESeq2.r \
    OTU_abs{abs}_PCR_sub_filt.physeq \
    --log2 {log2} \
    --hypo greater \
    > OTU_abs{abs}_PCR_sub_filt_DESeq2
    
## Confusion matrix
# NOTE(review): no redirect in this notebook writes BD-shift_stats.txt;
# presumably an earlier SIPSim step creates it as a side effect - confirm.
DESeq2_confuseMtx.r \
    -o HR-SIP_DESeq2-cMtx \
    BD-shift_stats.txt \
    OTU_abs{abs}_PCR_sub_filt_DESeq2 \
    --padj {padj} 

In [None]:
!chmod 777 $bashFile
!cd $workDir; \
    nestrun --template-file $bashFile -d atomIncorp_taxaIncorp --log-file HR-SIP.log -j 10

### Aggregating the confusion matrix data

In [None]:
# all data
!nestagg delim \
        -d $buildDir \
        -k fracIncorp,percTaxa,rep \
        -o HR-SIP_DESeq2-cMtx_data.txt \
        --tab \
        HR-SIP_DESeq2-cMtx_data.txt

# overall
!nestagg delim \
        -d $buildDir \
        -k fracIncorp,percTaxa,rep \
        -o HR-SIP_DESeq2-cMtx_overall.txt \
        --tab \
        HR-SIP_DESeq2-cMtx_overall.txt

# by class
!nestagg delim \
        -d $buildDir \
        -k fracIncorp,percTaxa,rep \
        -o HR-SIP_DESeq2-cMtx_byClass.txt \
        --tab \
        HR-SIP_DESeq2-cMtx_byClass.txt

## qSIP

In [None]:
%%writefile $bashFile
#!/bin/bash
# qSIP template, run by nestrun in each nest directory.
# Brace-wrapped fields are nest parameters (see the nest.add calls).
# Inputs are the pre-PCR and the subsampled OTU tables written by the SIPSim
# pipeline script, plus the experimental design file passed as exp_design.

#-- qSIP --#
echo '# qSIP'
SIPSim qSIP \
    --reps 3 \
    OTU_abs{abs}.txt \
    OTU_abs{abs}_PCR_sub.txt \
    > OTU_abs{abs}_PCR_sub_qSIP.txt
        
# atom excess
echo '# qSIP: atom excess'
SIPSim qSIP_atomExcess \
    --np {np} \
    OTU_abs{abs}_PCR_sub_qSIP.txt \
    {exp_design} \
    > OTU_abs{abs}_PCR_sub_qSIP_atom.txt  

In [None]:
!chmod 777 $bashFile
!cd $workDir; \
    nestrun --template-file $bashFile -d atomIncorp_taxaIncorp --log-file qSIP.log -j 2

In [None]:
# Line magic from the pushnote extension loaded in the Init section;
# presumably sends a notification with this message - confirm.
%pushnote qSIP complete