# RATIONAL PARAMETER GENERATOR -  TOEHOLD DATABASE
## by Luis Soenksen
### Rev 0.1 20190306

#### This code requires the following pre-installs:
> ##### Biopython
> In anaconda env: `conda install -c anaconda biopython` or `pip install biopython`<br>
> Ref: https://github.com/biopython/biopython
> ##### ViennaRNA
> In anaconda env: `conda install -c bioconda viennarna`<br>
> Ref: https://github.com/ViennaRNA/ViennaRNA
> ##### RNAsketch
> In anaconda env: `conda install -c bioconda rnasketch`<br>
> Ref: https://github.com/ViennaRNA/RNAsketch
> ##### Pysster (Python 3.5+)
> In anaconda env: `pip3 install pysster`<br>
> Ref: https://github.com/budach/pysster

In [1]:
# Clear memory from IPython
from IPython import get_ipython
# `InteractiveShell.magic()` is deprecated; `run_line_magic` is the supported
# way to invoke a line magic programmatically (magic name and arguments are
# passed separately).
get_ipython().run_line_magic('reset', '-sf')

In [2]:
## Import Libraries
# General system libraries
import os
import numpy as np
import pandas as pd
from time import time
from IPython.display import Image

# Multiprocessing
import multiprocessing
from multiprocessing import Pool, cpu_count

# DNA/RNA Analysis Libraries (Biopython, ViennaRNA, pysster) 
# Biopython Lib
import Bio
from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from Bio.Alphabet import generic_rna, generic_dna, generic_protein, IUPAC

# ViennaRNA Lib
import RNA

# Pysster Lib
from pysster import utils
from pysster.Data import Data
from pysster.Grid_Search import Grid_Search
from pysster.One_Hot_Encoder import One_Hot_Encoder
from pysster.Alphabet_Encoder import Alphabet_Encoder

# Import TPOT libs
from tpot import TPOTRegressor

# Import sklearn libs
from sklearn import preprocessing
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import explained_variance_score, mean_absolute_error
from sklearn.metrics import mean_squared_error, mean_squared_log_error
from sklearn.metrics import median_absolute_error, r2_score

# Math & Visualization Libs
import seaborn as sns
import matplotlib.pyplot as plt
from scipy import stats

# Progress Bar
from tqdm import tqdm

# Warnings
import warnings

Using TensorFlow backend.


In [3]:
# Create Data folder if not existent
data_folder = "data/"
# exist_ok=True makes creation idempotent and removes the check-then-create
# race of a separate os.path.isdir() test.
os.makedirs(data_folder, exist_ok=True)

In [4]:
# Build the path of the Toehold dataset file (.csv) inside the data folder
data_filename = "2019-02-21_toehold_dataset_proc.csv"
data_proc_path = "".join([data_folder, data_filename])

# Load the table and replace every NaN (in any column) with zero
data_proc = pd.read_csv(data_proc_path).fillna(0)

# Preview the first rows of the NaN-free table
print('Full Database without NaNs:')
data_proc.head()

Full Database without NaNs:


Unnamed: 0,off_id,on_id,source_sequence,sequence_id,pre_seq,promoter,trigger,loop1,switch,loop2,...,Cbn2_on,Cbn3_on,Cbn4_on,Cbn1_off,Cbn2_off,Cbn3_off,Cbn4_off,ON,OFF,ON_OFF
0,AACCAAACACACAAACGCACAAAAAAAAAAAAAAAAAATGGAAAAC...,AACTGTTTTCCATTTTTTTTTTTTTTTTTTAACCAAACACACAAAC...,smallpox,smallpox_tile_2626,CTCTGGGCTAACTGTCGCGC,TAATACGACTCACTATAGGG,AACTGTTTTCCATTTTTTTTTTTTTTTTTT,AACCAAACACACAAACGCAC,AAAAAAAAAAAAAAAAAATGGAAAACAGTT,AACAGAGGAGA,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.333333,0.0
1,AACCAAACACACAAACGCACAAAAAAAAAAAAATGGAAAACAGTTA...,TTAGTAACTGTTTTCCATTTTTTTTTTTTTAACCAAACACACAAAC...,smallpox,smallpox_tile_2625,CTCTGGGCTAACTGTCGCGC,TAATACGACTCACTATAGGG,TTAGTAACTGTTTTCCATTTTTTTTTTTTT,AACCAAACACACAAACGCAC,AAAAAAAAAAAAATGGAAAACAGTTACTAA,AACAGAGGAGA,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,AACCAAACACACAAACGCACAAAAAAAAATTACTACTATTGTTAAT...,CTAAATTAACAATAGTAGTAATTTTTTTTTAACCAAACACACAAAC...,smallpox,smallpox_tile_4951,CTCTGGGCTAACTGTCGCGC,TAATACGACTCACTATAGGG,CTAAATTAACAATAGTAGTAATTTTTTTTT,AACCAAACACACAAACGCAC,AAAAAAAAATTACTACTATTGTTAATTTAG,AACAGAGGAGA,...,0.077983,0.063451,0.0,1.0,0.0,0.0,0.0,0.068295,0.0,0.068295
3,AACCAAACACACAAACGCACAAAAAAAATAACGTAGGACTACTACT...,TCCAAGTAGTAGTCCTACGTTATTTTTTTTAACCAAACACACAAAC...,smallpox,smallpox_tile_6492,CTCTGGGCTAACTGTCGCGC,TAATACGACTCACTATAGGG,TCCAAGTAGTAGTCCTACGTTATTTTTTTT,AACCAAACACACAAACGCAC,AAAAAAAATAACGTAGGACTACTACTTGGA,AACAGAGGAGA,...,0.0,0.0,0.0,0.883458,0.116542,0.0,0.0,0.0,0.038847,-0.038847
4,AACCAAACACACAAACGCACAAAAAAAATGGAAAACAGTTACTAAT...,ACATATTAGTAACTGTTTTCCATTTTTTTTAACCAAACACACAAAC...,smallpox,smallpox_tile_2624,CTCTGGGCTAACTGTCGCGC,TAATACGACTCACTATAGGG,ACATATTAGTAACTGTTTTCCATTTTTTTT,AACCAAACACACAAACGCAC,AAAAAAAATGGAAAACAGTTACTAATATGT,AACAGAGGAGA,...,0.167182,0.037408,0.0,0.815066,0.0,0.184934,0.0,0.080666,0.123289,-0.042624


In [5]:
# Keep only rows that have an ON_OFF value, then discard the per-gate
# count/fraction columns that are not needed downstream.
# NOTE(review): the loading cell already applied fillna(0) to the whole table,
#               so this dropna presumably removes no rows -- confirm that this
#               is the intended order of operations.
data_proc_on_off = (
    data_proc
    .dropna(subset=['ON_OFF'])
    .drop(columns=['Cb1_on','Cb2_on','Cb3_on','Cb4_on','Cb1_off','Cb2_off','Cb3_off','Cb4_off',
                   'Cbn1_on','Cbn2_on','Cbn3_on','Cbn4_on','Cbn1_off','Cbn2_off','Cbn3_off','Cbn4_off',
                   'On_Gate1_counts','On_Gate2_counts','On_Gate3_counts','On_Gate4_counts',
                   'Off_Gate1_counts', 'Off_Gate2_counts', 'Off_Gate3_counts', 'Off_Gate4_counts'])
)

# Save the reduced table next to the input file and preview it
data_path_proc_on_off = "".join([data_folder, data_filename.replace('.csv','_proc_on_off.csv')])
data_proc_on_off.to_csv(path_or_buf=data_path_proc_on_off, index=False)
print('ON-OFF Database without NaNs:')
display(data_proc_on_off.head())

ON-OFF Database without NaNs:


Unnamed: 0,off_id,on_id,source_sequence,sequence_id,pre_seq,promoter,trigger,loop1,switch,loop2,stem1,atg,stem2,linker,post_linker,nupack_mfe,ON,OFF,ON_OFF
0,AACCAAACACACAAACGCACAAAAAAAAAAAAAAAAAATGGAAAAC...,AACTGTTTTCCATTTTTTTTTTTTTTTTTTAACCAAACACACAAAC...,smallpox,smallpox_tile_2626,CTCTGGGCTAACTGTCGCGC,TAATACGACTCACTATAGGG,AACTGTTTTCCATTTTTTTTTTTTTTTTTT,AACCAAACACACAAACGCAC,AAAAAAAAAAAAAAAAAATGGAAAACAGTT,AACAGAGGAGA,AACTGT,ATG,CCATTTTTT,AACCTGGCGGCAGCGCAAAAGATGCG,TAAAGGAGAA,-13.0,0.0,0.333333,0.0
1,AACCAAACACACAAACGCACAAAAAAAAAAAAATGGAAAACAGTTA...,TTAGTAACTGTTTTCCATTTTTTTTTTTTTAACCAAACACACAAAC...,smallpox,smallpox_tile_2625,CTCTGGGCTAACTGTCGCGC,TAATACGACTCACTATAGGG,TTAGTAACTGTTTTCCATTTTTTTTTTTTT,AACCAAACACACAAACGCAC,AAAAAAAAAAAAATGGAAAACAGTTACTAA,AACAGAGGAGA,TTAGTA,ATG,GTTTTCCAT,AACCTGGCGGCAGCGCAAAAGATGCG,TAAAGGAGAA,-16.4,0.0,0.0,0.0
2,AACCAAACACACAAACGCACAAAAAAAAATTACTACTATTGTTAAT...,CTAAATTAACAATAGTAGTAATTTTTTTTTAACCAAACACACAAAC...,smallpox,smallpox_tile_4951,CTCTGGGCTAACTGTCGCGC,TAATACGACTCACTATAGGG,CTAAATTAACAATAGTAGTAATTTTTTTTT,AACCAAACACACAAACGCAC,AAAAAAAAATTACTACTATTGTTAATTTAG,AACAGAGGAGA,CTAAAT,ATG,CAATAGTAG,AACCTGGCGGCAGCGCAAAAGATGCG,TAAAGGAGAA,-13.2,0.068295,0.0,0.068295
3,AACCAAACACACAAACGCACAAAAAAAATAACGTAGGACTACTACT...,TCCAAGTAGTAGTCCTACGTTATTTTTTTTAACCAAACACACAAAC...,smallpox,smallpox_tile_6492,CTCTGGGCTAACTGTCGCGC,TAATACGACTCACTATAGGG,TCCAAGTAGTAGTCCTACGTTATTTTTTTT,AACCAAACACACAAACGCAC,AAAAAAAATAACGTAGGACTACTACTTGGA,AACAGAGGAGA,TCCAAG,ATG,TAGTCCTAC,AACCTGGCGGCAGCGCAAAAGATGCG,TAAAGGAGAA,-21.5,0.0,0.038847,-0.038847
4,AACCAAACACACAAACGCACAAAAAAAATGGAAAACAGTTACTAAT...,ACATATTAGTAACTGTTTTCCATTTTTTTTAACCAAACACACAAAC...,smallpox,smallpox_tile_2624,CTCTGGGCTAACTGTCGCGC,TAATACGACTCACTATAGGG,ACATATTAGTAACTGTTTTCCATTTTTTTT,AACCAAACACACAAACGCAC,AAAAAAAATGGAAAACAGTTACTAATATGT,AACAGAGGAGA,ACATAT,ATG,TAACTGTTT,AACCTGGCGGCAGCGCAAAAGATGCG,TAAAGGAGAA,-14.9,0.080666,0.123289,-0.042624


In [6]:
# Function to calculate Ensemble Defect from sequence. 
def ensemble_defect_calc(seq, n_db, i_db):
    """Score a DNA/RNA sequence against two target structures.

    The partition function of `seq` is computed once with ViennaRNA and the
    ensemble defect is then evaluated for each of the two dot-bracket targets.

    Args:
        seq: Nucleotide sequence (String)
        n_db: Native (calculated) Dot-Bracket representation of sequence (String)
        i_db: Ideal (pre-defined) Dot-Bracket representation of sequence (String)

    Returns:
        Tuple `(n_ed, i_ed)`:
            n_ed: ensemble defect of `seq` with respect to `n_db`
            i_ed: ensemble defect of `seq` with respect to `i_db`
    """
    # NOTE: The ensemble defect measures how many nucleotides are, on average
    #       over the Boltzmann-weighted ensemble of (unpseudoknotted) secondary
    #       structures, paired differently from the target structure (0 is best).
    #       https://www.tbi.univie.ac.at/RNA/ViennaRNA/
    # NOTE(review): whether ViennaRNA reports the value as an absolute count or
    #       normalized by sequence length should be confirmed in its docs.

    # The partition function must be filled in before any ensemble defect query
    compound = RNA.fold_compound(seq)
    compound.pf()

    # First the native (computed) structure, then the ideal (designed) one
    return tuple(compound.ensemble_defect(target) for target in (n_db, i_db))

In [7]:
# Function to create rational parameters from toehold DNA sequences.

# Dataframe columns that are concatenated (in this order) into the base
# toehold sequence.
_TOEHOLD_PART_COLUMNS = ('trigger', 'loop1', 'switch', 'loop2', 'stem1',
                         'atg', 'stem2', 'linker', 'post_linker')

# Regions analysed for every toehold, in output-column order:
#   (name, prepend GGG?, slice start, slice stop (exclusive), ideal dot-bracket)
# Regions whose ideal structure is None only get sequence, dot-bracket and MFE
# columns; the others additionally get native/ideal ensemble defect columns.
# In dot-bracket notation each unpaired base is a dot and each base pair a
# matching pair of parentheses.
_TOEHOLD_REGIONS = (
    ('SwitchOFF', True, 30, 109,
     '...................................(((((((((...((((((...........))))))...)))))))))'),
    ('SwitchOFF_GFP', True, 30, 145,
     '...................................(((((((((...((((((...........))))))...)))))))))..(((.......(((((.....)))))..)))....'),
    ('SwitchOFF_NoTo', False, 62, 145,
     '(((((((((...((((((...........))))))...)))))))))..(((.......(((((.....)))))..)))....'),
    ('SwitchON', True, 0, 109,
     '...((((((((((((((((((((((((((((((....................)))))))))))))))))))))))))))))).............................'),
    ('SwitchON_GFP', True, 0, 145,
     '...((((((((((((((((((((((((((((((....................))))))))))))))))))))))))))))))...............................(((.......(((((.....)))))..)))....'),
    ('Trigger', True, 0, 30, None),
    ('ToeholdOFF', True, 30, 62, None),
    ('ToeholdON', True, 0, 62,
     '.....................((((((((((((....................))))))))))))'),
    ('Stem', False, 62, 109,
     '(((((((((...((((((...........))))))...)))))))))'),
    ('AscendingStem', False, 62, 100, None),
    ('DescendingStem', False, 80, 109, None),
    ('StemTop', False, 74, 97,
     '((((((...........))))))'),
    ('RBS_GFP', False, 80, 145, None),
    ('RBS_Linker', False, 80, 135, None),
)


def toehold_rational_param_calc(toehold):
    """Produce rational parameters given toehold DNA sequence.
    by Luis Soenksen 2019-02-01

    The base sequence is assembled from the part columns of the first row and
    then sliced into fixed regions (0-based, inclusive coordinates):

        GGG    - Trigger - Loop1  - Switch - Loop2  - Stem1  - AUG    - Stem2    - Linker   - Post-linker
        [-3,-1]  [0,29]    [30,49]  [50,79]  [80,90]  [91,96]  [97,99]  [100,108]  [109,134]  [135,144]

    where GGG is the last three bases of the 'promoter' column. For every
    region the minimum-free-energy fold (ViennaRNA `RNA.fold`) is computed;
    for regions with a pre-defined ideal structure the native and ideal
    ensemble defects are computed as well (`ensemble_defect_calc`).

    Regions (GGG-prefixed ones marked with +GGG):
        SwitchOFF      +GGG [30,108]     SwitchOFF_GFP  +GGG [30,144]
        SwitchOFF_NoTo      [62,144]     SwitchON       +GGG [0,108]
        SwitchON_GFP   +GGG [0,144]      Trigger        +GGG [0,29]
        ToeholdOFF     +GGG [30,61]      ToeholdON      +GGG [0,61]
        Stem                [62,108]     AscendingStem       [62,99]
        DescendingStem      [80,108]     StemTop             [74,96]
        RBS_GFP             [80,144]     RBS_Linker          [80,134]

    NOTE(review): the slice coordinates and the ideal dot-bracket strings
    assume every part has the length implied by the layout above (145 nt in
    total); this is not checked here -- confirm against the dataset.

    Args:
        toehold: Toehold dataframe object; only its first row is used, and it
            must provide the 'promoter' column plus the part columns
            trigger, loop1, switch, loop2, stem1, atg, stem2, linker and
            post_linker as strings.

    Returns:
        params: Single-row Pandas dataframe with, per region,
            'seq_<region>', 'db_seq_<region>', 'mfe_seq_<region>' and, where an
            ideal structure exists, 'n_ed_seq_<region>' and 'i_ed_seq_<region>'.
    """
    # Fetch the first row once, by position, so the function does not depend
    # on the dataframe carrying the index label 0.
    row = toehold.iloc[0]
    ggg = row['promoter'][-3:]
    seq = ''.join(row[column] for column in _TOEHOLD_PART_COLUMNS)

    # COMPILATION OF ALL PARAMETERS INTO SINGLE DATAFRAME
    data = {}
    for name, with_ggg, start, stop, ideal_db in _TOEHOLD_REGIONS:
        key = 'seq_' + name
        region_seq = (ggg if with_ggg else '') + seq[start:stop]

        # MFE structure (dot-bracket) and energy of the region
        (region_db, region_mfe) = RNA.fold(region_seq)
        data[key] = [region_seq]
        data['db_' + key] = [region_db]
        data['mfe_' + key] = [region_mfe]

        # Ensemble defects against the computed and the ideal structure
        if ideal_db is not None:
            (n_ed, i_ed) = ensemble_defect_calc(region_seq, region_db, ideal_db)
            data['n_ed_' + key] = [n_ed]
            data['i_ed_' + key] = [i_ed]

    params = pd.DataFrame(data)
    return params

In [8]:
# Define function to perform per processor
def toehold_batch_processor(data):
    """Compute rational parameters for every toehold in a dataframe chunk.

    Each row is passed (as a one-row dataframe with a reset index) to
    `toehold_rational_param_calc`, and the returned parameter columns are
    appended to the right of the original columns.

    Args:
        data: Pandas dataframe of toeholds, one toehold per row.

    Returns:
        Pandas dataframe with one row per input row holding the original
        columns followed by the calculated parameters. Every row carries the
        index label 0, because each one-row result is concatenated as is. An
        empty dataframe is returned when `data` has no rows.
    """
    # The module-level progress bar `pbar` is advanced by (CPU count - 1) per
    # processed row -- presumably so that a single worker's copy of the bar
    # approximates the progress of all workers; verify against the caller.
    progress_step = multiprocessing.cpu_count() - 1

    # Collect the per-row results and concatenate once at the end; growing a
    # dataframe with DataFrame.append inside the loop is quadratic and
    # DataFrame.append no longer exists in pandas 2.0.
    extended_rows = []
    for position in range(len(data)):
        # Select by position so duplicate index labels cannot yield several rows
        toehold = data.iloc[[position]].reset_index(drop=True)
        # Calculate parameters and generate new extended toehold dataframe
        params = toehold_rational_param_calc(toehold)
        extended_rows.append(pd.concat([toehold, params], axis=1))
        #Update TQDM progress bar
        pbar.update(progress_step)

    # pd.concat refuses an empty list, and a chunk can legitimately be empty
    if not extended_rows:
        return pd.DataFrame(data=None)
    return pd.concat(extended_rows)

In [9]:
def parallelize_dataframe(df, func):
    """Apply `func` to row-wise chunks of `df` in parallel worker processes.

    Args:
        df: Pandas dataframe to process.
        func: Picklable callable that takes a dataframe chunk and returns a
            dataframe.

    Returns:
        Pandas dataframe obtained by concatenating the per-chunk results in
        chunk order.
    """
    # Leave one core free to not freeze the machine, but always keep at least
    # one worker so single-core machines do not end up with an empty pool.
    num_cores = max(1, multiprocessing.cpu_count() - 1)
    num_partitions = num_cores  # number of partitions to split dataframe

    # Split row positions instead of the dataframe itself: np.array_split on a
    # DataFrame goes through DataFrame.swapaxes, which pandas has deprecated.
    position_chunks = np.array_split(np.arange(len(df)), num_partitions)
    df_split = [df.iloc[positions] for positions in position_chunks]

    pool = multiprocessing.Pool(num_cores)
    try:
        results = pool.map(func, df_split)
    finally:
        # Shut the workers down even if a chunk raised
        pool.close()
        pool.join()
    return pd.concat(results)

In [10]:
#Initialize the process bar to count until the number of rows in dataframe.
#The context manager closes the bar even if the processing raises; `pbar`
#stays a module-level name because the batch processor updates it.
with tqdm(total=len(data_proc_on_off)) as pbar:
    # Parallel processing to obtain toehold parameters
    data_proc_on_off_with_params = parallelize_dataframe(data_proc_on_off, toehold_batch_processor)

100%|█████████▉| 243984/244000 [2:03:13<00:00, 51.32it/s]


In [12]:
# Write the parameter-augmented dataframe to a CSV next to the input file
data_path_proc = "".join([data_folder, data_filename.replace('.csv','_with_params.csv')])
data_proc_on_off_with_params.to_csv(path_or_buf=data_path_proc, index=False)

# Preview the first rows, including the calculated parameters
display(data_proc_on_off_with_params.head())

Unnamed: 0,off_id,on_id,source_sequence,sequence_id,pre_seq,promoter,trigger,loop1,switch,loop2,...,seq_Stem,seq_StemTop,seq_SwitchOFF,seq_SwitchOFF_GFP,seq_SwitchOFF_NoTo,seq_SwitchON,seq_SwitchON_GFP,seq_ToeholdOFF,seq_ToeholdON,seq_Trigger
0,AACCAAACACACAAACGCACAAAAAAAAAAAAAAAAAATGGAAAAC...,AACTGTTTTCCATTTTTTTTTTTTTTTTTTAACCAAACACACAAAC...,smallpox,smallpox_tile_2626,CTCTGGGCTAACTGTCGCGC,TAATACGACTCACTATAGGG,AACTGTTTTCCATTTTTTTTTTTTTTTTTT,AACCAAACACACAAACGCAC,AAAAAAAAAAAAAAAAAATGGAAAACAGTT,AACAGAGGAGA,...,AAAAAATGGAAAACAGTTAACAGAGGAGAAACTGTATGCCATTTTTT,ACAGTTAACAGAGGAGAAACTGT,GGGAACCAAACACACAAACGCACAAAAAAAAAAAAAAAAAATGGAA...,GGGAACCAAACACACAAACGCACAAAAAAAAAAAAAAAAAATGGAA...,AAAAAATGGAAAACAGTTAACAGAGGAGAAACTGTATGCCATTTTT...,GGGAACTGTTTTCCATTTTTTTTTTTTTTTTTTAACCAAACACACA...,GGGAACTGTTTTCCATTTTTTTTTTTTTTTTTTAACCAAACACACA...,GGGAACCAAACACACAAACGCACAAAAAAAAAAAA,GGGAACTGTTTTCCATTTTTTTTTTTTTTTTTTAACCAAACACACA...,GGGAACTGTTTTCCATTTTTTTTTTTTTTTTTT
0,AACCAAACACACAAACGCACAAAAAAAAAAAAATGGAAAACAGTTA...,TTAGTAACTGTTTTCCATTTTTTTTTTTTTAACCAAACACACAAAC...,smallpox,smallpox_tile_2625,CTCTGGGCTAACTGTCGCGC,TAATACGACTCACTATAGGG,TTAGTAACTGTTTTCCATTTTTTTTTTTTT,AACCAAACACACAAACGCAC,AAAAAAAAAAAAATGGAAAACAGTTACTAA,AACAGAGGAGA,...,ATGGAAAACAGTTACTAAAACAGAGGAGATTAGTAATGGTTTTCCAT,TACTAAAACAGAGGAGATTAGTA,GGGAACCAAACACACAAACGCACAAAAAAAAAAAAATGGAAAACAG...,GGGAACCAAACACACAAACGCACAAAAAAAAAAAAATGGAAAACAG...,ATGGAAAACAGTTACTAAAACAGAGGAGATTAGTAATGGTTTTCCA...,GGGTTAGTAACTGTTTTCCATTTTTTTTTTTTTAACCAAACACACA...,GGGTTAGTAACTGTTTTCCATTTTTTTTTTTTTAACCAAACACACA...,GGGAACCAAACACACAAACGCACAAAAAAAAAAAA,GGGTTAGTAACTGTTTTCCATTTTTTTTTTTTTAACCAAACACACA...,GGGTTAGTAACTGTTTTCCATTTTTTTTTTTTT
0,AACCAAACACACAAACGCACAAAAAAAAATTACTACTATTGTTAAT...,CTAAATTAACAATAGTAGTAATTTTTTTTTAACCAAACACACAAAC...,smallpox,smallpox_tile_4951,CTCTGGGCTAACTGTCGCGC,TAATACGACTCACTATAGGG,CTAAATTAACAATAGTAGTAATTTTTTTTT,AACCAAACACACAAACGCAC,AAAAAAAAATTACTACTATTGTTAATTTAG,AACAGAGGAGA,...,CTACTATTGTTAATTTAGAACAGAGGAGACTAAATATGCAATAGTAG,ATTTAGAACAGAGGAGACTAAAT,GGGAACCAAACACACAAACGCACAAAAAAAAATTACTACTATTGTT...,GGGAACCAAACACACAAACGCACAAAAAAAAATTACTACTATTGTT...,CTACTATTGTTAATTTAGAACAGAGGAGACTAAATATGCAATAGTA...,GGGCTAAATTAACAATAGTAGTAATTTTTTTTTAACCAAACACACA...,GGGCTAAATTAACAATAGTAGTAATTTTTTTTTAACCAAACACACA...,GGGAACCAAACACACAAACGCACAAAAAAAAATTA,GGGCTAAATTAACAATAGTAGTAATTTTTTTTTAACCAAACACACA...,GGGCTAAATTAACAATAGTAGTAATTTTTTTTT
0,AACCAAACACACAAACGCACAAAAAAAATAACGTAGGACTACTACT...,TCCAAGTAGTAGTCCTACGTTATTTTTTTTAACCAAACACACAAAC...,smallpox,smallpox_tile_6492,CTCTGGGCTAACTGTCGCGC,TAATACGACTCACTATAGGG,TCCAAGTAGTAGTCCTACGTTATTTTTTTT,AACCAAACACACAAACGCAC,AAAAAAAATAACGTAGGACTACTACTTGGA,AACAGAGGAGA,...,GTAGGACTACTACTTGGAAACAGAGGAGATCCAAGATGTAGTCCTAC,CTTGGAAACAGAGGAGATCCAAG,GGGAACCAAACACACAAACGCACAAAAAAAATAACGTAGGACTACT...,GGGAACCAAACACACAAACGCACAAAAAAAATAACGTAGGACTACT...,GTAGGACTACTACTTGGAAACAGAGGAGATCCAAGATGTAGTCCTA...,GGGTCCAAGTAGTAGTCCTACGTTATTTTTTTTAACCAAACACACA...,GGGTCCAAGTAGTAGTCCTACGTTATTTTTTTTAACCAAACACACA...,GGGAACCAAACACACAAACGCACAAAAAAAATAAC,GGGTCCAAGTAGTAGTCCTACGTTATTTTTTTTAACCAAACACACA...,GGGTCCAAGTAGTAGTCCTACGTTATTTTTTTT
0,AACCAAACACACAAACGCACAAAAAAAATGGAAAACAGTTACTAAT...,ACATATTAGTAACTGTTTTCCATTTTTTTTAACCAAACACACAAAC...,smallpox,smallpox_tile_2624,CTCTGGGCTAACTGTCGCGC,TAATACGACTCACTATAGGG,ACATATTAGTAACTGTTTTCCATTTTTTTT,AACCAAACACACAAACGCAC,AAAAAAAATGGAAAACAGTTACTAATATGT,AACAGAGGAGA,...,AAACAGTTACTAATATGTAACAGAGGAGAACATATATGTAACTGTTT,ATATGTAACAGAGGAGAACATAT,GGGAACCAAACACACAAACGCACAAAAAAAATGGAAAACAGTTACT...,GGGAACCAAACACACAAACGCACAAAAAAAATGGAAAACAGTTACT...,AAACAGTTACTAATATGTAACAGAGGAGAACATATATGTAACTGTT...,GGGACATATTAGTAACTGTTTTCCATTTTTTTTAACCAAACACACA...,GGGACATATTAGTAACTGTTTTCCATTTTTTTTAACCAAACACACA...,GGGAACCAAACACACAAACGCACAAAAAAAATGGA,GGGACATATTAGTAACTGTTTTCCATTTTTTTTAACCAAACACACA...,GGGACATATTAGTAACTGTTTTCCATTTTTTTT


-----------------------------------------------------------------------------------------------------------------