# Format DNA sequence
1. Paste your DNA sequence into the first cell
1. Run the first cell (Command-Enter on macOS, Ctrl-Enter elsewhere)
1. Each of the other cells can then be run on its own, in any order, once the first cell has been run

In [1]:
# Paste the DNA sequence between the triple quotes and make sure the sequence itself contains no quotes.
# Position numbers and spaces (as in the GenBank-style sample below) are fine; they are removed by clean().
dna = '''

        1 atgaataata atatgtcatt aaagaaaaag agcattctcg cgttagcttg ctatgtttgt
       61 tttttaatta cggttattgg cagtattact tattgtgttg ttgaacctcc tattcgtgaa
      121 aatcttgaaa acaatcttaa tttacgcact caacttttag ctagggaaat agaggatccg
      181 cttaatcatt ctctttccat attgcatgca cttgttgggg tcgctgcaag cggttactct
      241 atcgatgttc ttgaagatat gacttattct gttttcaaag agagcgacga tattattatt
      301 agtggtggca tttggccaga acccaataca ttggagccat ccaaacaatt agctagcatt
      361 ttttattctc gtgatgaacg aggaaaaatc acctctgtta acgattacaa tattcctagc
      421 aatacacctt atcaacaaga gtcatggtat acatcggttt cacatgaaaa cagtagccat


      '''


### PARAMETERS (OPTIONAL) ###

# Name of the molecule; fasta_format uses it as the FASTA header (the text after '>').
# Replace dna_molecule with your molecule's name.
# Best to avoid spaces and special characters; underscores are fine.
dna_name = 'dna_molecule'

# Set this to True if the pasted sequence is RNA rather than DNA.
is_rna = False

# Leave this function alone.
# It removes spaces and non-DNA characters from your DNA sequence.

# IUPAC nucleotide codes accepted for each nucleic acid type.
# The two alphabets differ only in T (DNA) versus U (RNA).
dna_bases = 'GATCRYSWKMBVDHN'
rna_bases = 'GAUCRYSWKMBVDHN'

def clean(sequence, nucleic_acid='dna'):
    '''Clean up a pasted nucleotide sequence.

       Every character that is not an accepted base code for the chosen
       nucleic acid (position numbers, spaces, newlines, punctuation, ...)
       is dropped, and the remaining bases are returned in uppercase.

       sequence:     the raw text to clean
       nucleic_acid: 'dna' (default) keeps thymine and drops uracil;
                     'rna' keeps uracil and drops thymine

       Returns the cleaned sequence as an uppercase string.
       Raises ValueError if nucleic_acid is neither 'dna' nor 'rna'.
    '''
    # Explicit lookup instead of eval(): the argument is never executed as
    # code, and a typo produces a clear error rather than a NameError.
    alphabets = {'dna': dna_bases, 'rna': rna_bases}
    try:
        acceptable_bases = alphabets[nucleic_acid]
    except KeyError:
        raise ValueError(
            "nucleic_acid must be 'dna' or 'rna', got %r" % (nucleic_acid,)
        ) from None

    uppercased = (character.upper() for character in sequence)
    sequence_clean = ''.join(base for base in uppercased if base in acceptable_bases)

    # TODO: do this in a class instead, so that the raw and cleaned lengths
    # (len(sequence), len(sequence_clean)) can be kept and reported.

    return sequence_clean

# Wish list:
# output fasta format 60/line (now provided by fasta_format in a later cell)

# Here is the cleaned version of the sequence you submitted.
# Characters outside the selected alphabet are dropped silently; the RNA
# alphabet is used when is_rna is True, otherwise the DNA alphabet.

print(clean(dna, 'rna' if is_rna else 'dna'))

ATGAATAATAATATGTCATTAAAGAAAAAGAGCATTCTCGCGTTAGCTTGCTATGTTTGTTTTTTAATTACGGTTATTGGCAGTATTACTTATTGTGTTGTTGAACCTCCTATTCGTGAAAATCTTGAAAACAATCTTAATTTACGCACTCAACTTTTAGCTAGGGAAATAGAGGATCCGCTTAATCATTCTCTTTCCATATTGCATGCACTTGTTGGGGTCGCTGCAAGCGGTTACTCTATCGATGTTCTTGAAGATATGACTTATTCTGTTTTCAAAGAGAGCGACGATATTATTATTAGTGGTGGCATTTGGCCAGAACCCAATACATTGGAGCCATCCAAACAATTAGCTAGCATTTTTTATTCTCGTGATGAACGAGGAAAAATCACCTCTGTTAACGATTACAATATTCCTAGCAATACACCTTATCAACAAGAGTCATGGTATACATCGGTTTCACATGAAAACAGTAGCCAT


In [2]:
# Lowercase

# Honour the is_rna switch so that uracil is kept for RNA input.
print(clean(dna, 'rna' if is_rna else 'dna').lower())

atgaataataatatgtcattaaagaaaaagagcattctcgcgttagcttgctatgtttgttttttaattacggttattggcagtattacttattgtgttgttgaacctcctattcgtgaaaatcttgaaaacaatcttaatttacgcactcaacttttagctagggaaatagaggatccgcttaatcattctctttccatattgcatgcacttgttggggtcgctgcaagcggttactctatcgatgttcttgaagatatgacttattctgttttcaaagagagcgacgatattattattagtggtggcatttggccagaacccaatacattggagccatccaaacaattagctagcattttttattctcgtgatgaacgaggaaaaatcacctctgttaacgattacaatattcctagcaatacaccttatcaacaagagtcatggtatacatcggtttcacatgaaaacagtagccat


In [3]:
# Fasta format with 60 characters to a line
def fasta_format(sequence, name=None, width=60):
    '''Return the sequence as FASTA text.

       The first line is the header '>' + name; the sequence follows,
       wrapped to at most `width` characters per line. There is no trailing
       newline after the last sequence line. An empty sequence yields just
       the header line followed by a newline.

       sequence: the (already cleaned) sequence string to wrap
       name:     header text; defaults to the notebook-level dna_name
       width:    maximum number of characters per sequence line (default 60)

       Raises ValueError if width is smaller than 1.
    '''
    if width < 1:
        # A negative step would otherwise silently produce no sequence lines.
        raise ValueError('width must be a positive integer')
    if name is None:
        # Looked up at call time so later changes to dna_name are picked up.
        name = dna_name

    wrapped_lines = [sequence[start:start + width]
                     for start in range(0, len(sequence), width)]
    return '>' + name + '\n' + '\n'.join(wrapped_lines)

# Honour the is_rna switch so that uracil is kept for RNA input.
print(fasta_format(clean(dna, 'rna' if is_rna else 'dna')))

>dna_molecule
ATGAATAATAATATGTCATTAAAGAAAAAGAGCATTCTCGCGTTAGCTTGCTATGTTTGT
TTTTTAATTACGGTTATTGGCAGTATTACTTATTGTGTTGTTGAACCTCCTATTCGTGAA
AATCTTGAAAACAATCTTAATTTACGCACTCAACTTTTAGCTAGGGAAATAGAGGATCCG
CTTAATCATTCTCTTTCCATATTGCATGCACTTGTTGGGGTCGCTGCAAGCGGTTACTCT
ATCGATGTTCTTGAAGATATGACTTATTCTGTTTTCAAAGAGAGCGACGATATTATTATT
AGTGGTGGCATTTGGCCAGAACCCAATACATTGGAGCCATCCAAACAATTAGCTAGCATT
TTTTATTCTCGTGATGAACGAGGAAAAATCACCTCTGTTAACGATTACAATATTCCTAGC
AATACACCTTATCAACAAGAGTCATGGTATACATCGGTTTCACATGAAAACAGTAGCCAT


In [4]:
# Reverse complement (DNA only right now)
def reverse_complement(sequence):
    '''Return the reverse complement of a DNA sequence.

       Each base, including the IUPAC ambiguity codes, is replaced by its
       complement and the result is reversed. Letter case is preserved, so
       lowercase input gives lowercase output. Characters that are not in
       the DNA alphabet (uracil, for example) are left as they are and only
       reversed along with the rest.

       sequence: the DNA sequence string

       Returns the reverse-complemented string.
    '''
    bases = 'GATCRYSWKMBVDHN'
    partners = 'CTAGYRSWMKVBHDN'
    # Map both cases; with an uppercase-only table, lowercase input would be
    # reversed without being complemented.
    complement = str.maketrans(bases + bases.lower(), partners + partners.lower())
    return sequence.translate(complement)[::-1]

# Cleaned with clean()'s default DNA alphabet: reverse_complement's translation table holds DNA codes only (T, not U).
print(reverse_complement(clean(dna)))

ATGGCTACTGTTTTCATGTGAAACCGATGTATACCATGACTCTTGTTGATAAGGTGTATTGCTAGGAATATTGTAATCGTTAACAGAGGTGATTTTTCCTCGTTCATCACGAGAATAAAAAATGCTAGCTAATTGTTTGGATGGCTCCAATGTATTGGGTTCTGGCCAAATGCCACCACTAATAATAATATCGTCGCTCTCTTTGAAAACAGAATAAGTCATATCTTCAAGAACATCGATAGAGTAACCGCTTGCAGCGACCCCAACAAGTGCATGCAATATGGAAAGAGAATGATTAAGCGGATCCTCTATTTCCCTAGCTAAAAGTTGAGTGCGTAAATTAAGATTGTTTTCAAGATTTTCACGAATAGGAGGTTCAACAACACAATAAGTAATACTGCCAATAACCGTAATTAAAAAACAAACATAGCAAGCTAACGCGAGAATGCTCTTTTTCTTTAATGACATATTATTATTCAT
