# REVERSE COMPLEMENT CALCULATOR (DNA SEQUENCES)
## by Luis Soenksen
### Rev 0.1 20190218

In [1]:
## Import Libraries
# General system libraries
import os
import numpy as np
import pandas as pd
from time import time
from IPython.display import Image

# Multiprocessing
import multiprocessing

# DNA/RNA Analysis Libraries (Biopython, ViennaRNA, pysster)
# Biopython Lib
import Bio
from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from Bio.Alphabet import Alphabet, DNAAlphabet, ProteinAlphabet
from Bio.Alphabet import generic_rna, generic_dna, generic_protein, IUPAC
# ViennaRNA Lib
import RNA
# pysster Lib
from pysster import utils
from pysster.Data import Data
from pysster.Grid_Search import Grid_Search
from pysster.One_Hot_Encoder import One_Hot_Encoder
from pysster.Alphabet_Encoder import Alphabet_Encoder

# Import TPOT libs
from tpot import TPOTRegressor

# Import sklearn libs
from sklearn import preprocessing
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import explained_variance_score, mean_absolute_error
from sklearn.metrics import mean_squared_error, mean_squared_log_error
from sklearn.metrics import median_absolute_error, r2_score

# Math & Visualization Libs
import seaborn as sns
import matplotlib.pyplot as plt
from scipy import stats

# Progress Bar
from tqdm import tqdm

# Warnings
import warnings

Using TensorFlow backend.


In [2]:
# Create Data folder if not existent
data_folder = "data/gate_data/"
if not os.path.isdir(data_folder):
    os.makedirs(data_folder)
    
for filename in os.listdir(data_folder):
    if filename.endswith(".csv"): 
        print(os.path.join(data_folder, filename))

data/gate_data/Off_Gate4_R3_Rev_counts.csv
data/gate_data/Off_Gate1_R3_Rev_counts.csv
data/gate_data/Off_Gate2_R3_Rev_counts.csv
data/gate_data/Off_Gate3_R3_Rev_counts.csv
data/gate_data/On_Gate2_R3_Rev_counts.csv
data/gate_data/On_Gate3_R3_Rev_counts.csv
data/gate_data/Off_Gate1_R3_Rev_counts_RevComp.csv
data/gate_data/On_Gate1_R3_Rev_counts.csv
data/gate_data/On_Gate4_R3_Rev_counts.csv


In [3]:
# Define path to load desired Toehold dataset file (.csv)
#data_filename = "Off_Gate1_R3_Rev_counts.csv"
data_filename = "Off_Gate2_R3_Rev_counts.csv"
#data_filename = "Off_Gate3_R3_Rev_counts.csv"
#data_filename = "Off_Gate4_R3_Rev_counts.csv"
#data_filename = "On_Gate1_R3_Rev_counts.csv"
#data_filename = "On_Gate2_R3_Rev_counts.csv"
#data_filename = "On_Gate3_R3_Rev_counts.csv"
#data_filename = "On_Gate4_R3_Rev_counts.csv"

data_path = data_folder + data_filename
data = pd.read_csv(data_path, low_memory=False)

#Show dataframe
data.head()

Unnamed: 0,counts,seq
0,1,AAAAAAAAAAATGGAAAATCTCCTCTGTTTTTTCCATTTTTTTTTT...
1,1,AAAAAAAAAACATGGAAAATCTCCTCTGTTTTTTACATTTTTTTTT...
2,1,AAAAAAAAAACATGGAAAATTTCCTCTGTTTTTTTCATTTTTTTTT...
3,1,AAAAAAAAAACATTGTCGCTCTCCTCTGTTGCGACAGCATTTTTTT...
4,1,AAAAAAAAACATAAAAAATCTCCTCTGTTTTTTTTTTTTTTTTTGT...


In [4]:
#Initialize the process bar to count until the number of rows in dataframe
pbar = tqdm(total=data.shape[0])

#Iterate over dataframe data
for index, row in data.iterrows():
    # Calculate and update DNA reverse complement
    i_seq = Seq(data['seq'][index], generic_dna)
    data.at[index,'seq'] = str(i_seq.reverse_complement())
    #Update process bar
    pbar.update(1)
#Conclude process bar
pbar.close()

 81%|████████  | 428287/531712 [00:40<00:09, 10442.91it/s]

TypeError: The sequence data given to a Seq object should be a string (not another Seq object etc)

 81%|████████  | 429351/531712 [01:00<00:09, 10442.91it/s]

In [None]:
data_path_proc = (data_folder + data_filename.replace('.csv','_RevComp.csv'))
data.to_csv(data_path_proc, index=False)

#Show reverse complement data
display(data.head())