Following along with Chapter 8 on Dictionaries 

In [None]:
# storing data
# counting numbers of As or Ts in a sequence is easy
# counting numbers of ATs or other dinucleotides is ok
# but counting numbers of all trinucleodies gets complicated 
# even if you use a loop

In [1]:
dna = "AATGATCGATCGTACGCTGA"
all_counts = []
for base1 in ['A', 'T', 'G', 'C']:
    for base2 in ['A', 'T', 'G', 'C']:
           for base3 in ['A', 'T', 'G', 'C']:
              trinucleotide = base1 + base2 + base3
              count = dna.count(trinucleotide)
              print("count is " + str(count) + " for " + trinucleotide)
              all_counts.append(count)
print(all_counts)

count is 0 for AAA
count is 1 for AAT
count is 0 for AAG
count is 0 for AAC
count is 0 for ATA
count is 0 for ATT
count is 1 for ATG
count is 2 for ATC
count is 0 for AGA
count is 0 for AGT
count is 0 for AGG
count is 0 for AGC
count is 0 for ACA
count is 0 for ACT
count is 1 for ACG
count is 0 for ACC
count is 0 for TAA
count is 0 for TAT
count is 0 for TAG
count is 1 for TAC
count is 0 for TTA
count is 0 for TTT
count is 0 for TTG
count is 0 for TTC
count is 2 for TGA
count is 0 for TGT
count is 0 for TGG
count is 0 for TGC
count is 0 for TCA
count is 0 for TCT
count is 2 for TCG
count is 0 for TCC
count is 0 for GAA
count is 2 for GAT
count is 0 for GAG
count is 0 for GAC
count is 1 for GTA
count is 0 for GTT
count is 0 for GTG
count is 0 for GTC
count is 0 for GGA
count is 0 for GGT
count is 0 for GGG
count is 0 for GGC
count is 0 for GCA
count is 1 for GCT
count is 0 for GCG
count is 0 for GCC
count is 0 for CAA
count is 0 for CAT
count is 0 for CAG
count is 0 for CAC
count is 0 f

In [None]:
# all those at the bottom are unreadable 
# how do you know which count is for which trinucleotide?
# you can modify the loop to make two lists 

In [2]:
dna = "AATGATCGATCGTACGCTGA"
all_trinucleotides = []
all_counts = []
for base1 in ['A', 'T', 'G', 'C']:
    for base2 in ['A', 'T', 'G', 'C']:
           for base3 in ['A', 'T', 'G', 'C']:
              trinucleotide = base1 + base2 + base3
              count = dna.count(trinucleotide)
              all_trinucleotides.append(trinucleotide)
              all_counts.append(count)
print(all_counts)
print(all_trinucleotides)


[0, 1, 0, 0, 0, 0, 1, 2, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 2, 0, 0, 2, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0]
['AAA', 'AAT', 'AAG', 'AAC', 'ATA', 'ATT', 'ATG', 'ATC', 'AGA', 'AGT', 'AGG', 'AGC', 'ACA', 'ACT', 'ACG', 'ACC', 'TAA', 'TAT', 'TAG', 'TAC', 'TTA', 'TTT', 'TTG', 'TTC', 'TGA', 'TGT', 'TGG', 'TGC', 'TCA', 'TCT', 'TCG', 'TCC', 'GAA', 'GAT', 'GAG', 'GAC', 'GTA', 'GTT', 'GTG', 'GTC', 'GGA', 'GGT', 'GGG', 'GGC', 'GCA', 'GCT', 'GCG', 'GCC', 'CAA', 'CAT', 'CAG', 'CAC', 'CTA', 'CTT', 'CTG', 'CTC', 'CGA', 'CGT', 'CGG', 'CGC', 'CCA', 'CCT', 'CCG', 'CCC']


In [3]:
# with this, you can index the two lists in the same way 
i = all_trinucleotides.index('TGA')
c = all_counts[i]
print('count for TGA is ' + str(c))

count for TGA is 2


In [None]:
# but this requires you to make sure neither list changes at all 
# also too slow 
# To find the index of a given trinucleotide in the all_trinucleotides list,
# Python has to look at each element one at a time until it finds the one we're looking for. 
# This means that as the size of the list grows, the time taken to look up the count for a 
# given element will grow alongside it

In [None]:
# what we want is a way to store pairs of data 
# this is a key and value pair, in a dictionary

In [None]:
# creating a dictionary 
# Each pair of data, consisting of a key and a value, is called an item

In [1]:
# you can make a dictionary like this:
enzymes = { 'EcoRI':r'GAATTC', 'AvaII':r'GG(A|T)CC', 'BisI':'GC[ATGC]GC' }
# or like this: 
enzymes = {
   'EcoRI' : r'GAATTC',
   'AvaII' : r'GG(A|T)CC',
   'BisI'  : r'GC[ATGC]GC'
}

In [2]:
# how to retreive information (the value)
print(enzymes['BisI'])
# looks very similar to using a list, but instead of giving the index of the element we want,
# we're giving the key for the value that we want to retrieve

GC[ATGC]GC


In [None]:
# keys have to be strings or numbers 
# and you cannot use a key twice 
# but the value can be anything 

In [5]:
# most often you'll want to create an empty dictionary and write to it over time 

enzymes = {}
enzymes['EcoRI'] = r'GAATTC'
enzymes['AvaII] =  r'GG(A|T)CC'
enzymes['BisI'] =  r'GC[ATGC]GC'

SyntaxError: invalid syntax (4016866751.py, line 5)

In [7]:
# remove a key from the dictionary 
# also tells you the value 
enzymes = {
   'EcoRI' : r'GAATTC',
   'AvaII' : r'GG(A|T)CC',
   'BisI'  : r'GC[ATGC]GC'
}
# remove the EcoRI enzyme from the dict
enzymes.pop('EcoRI')

'GAATTC'

In [None]:
# how to do the trinucleotide count from above with a dictionary 

In [8]:
dna = "AATGATCGATCGTACGCTGA"
counts = {} # make an empty dictionary 
for base1 in ['A', 'T', 'G', 'C']:
    for base2 in ['A', 'T', 'G', 'C']:
       for base3 in ['A', 'T', 'G', 'C']:
           trinucleotide = base1 + base2 + base3
           count = dna.count(trinucleotide)
           counts[trinucleotide] = count
print(counts)

{'AAA': 0, 'AAT': 1, 'AAG': 0, 'AAC': 0, 'ATA': 0, 'ATT': 0, 'ATG': 1, 'ATC': 2, 'AGA': 0, 'AGT': 0, 'AGG': 0, 'AGC': 0, 'ACA': 0, 'ACT': 0, 'ACG': 1, 'ACC': 0, 'TAA': 0, 'TAT': 0, 'TAG': 0, 'TAC': 1, 'TTA': 0, 'TTT': 0, 'TTG': 0, 'TTC': 0, 'TGA': 2, 'TGT': 0, 'TGG': 0, 'TGC': 0, 'TCA': 0, 'TCT': 0, 'TCG': 2, 'TCC': 0, 'GAA': 0, 'GAT': 2, 'GAG': 0, 'GAC': 0, 'GTA': 1, 'GTT': 0, 'GTG': 0, 'GTC': 0, 'GGA': 0, 'GGT': 0, 'GGG': 0, 'GGC': 0, 'GCA': 0, 'GCT': 1, 'GCG': 0, 'GCC': 0, 'CAA': 0, 'CAT': 0, 'CAG': 0, 'CAC': 0, 'CTA': 0, 'CTT': 0, 'CTG': 1, 'CTC': 0, 'CGA': 1, 'CGT': 1, 'CGG': 0, 'CGC': 1, 'CCA': 0, 'CCT': 0, 'CCG': 0, 'CCC': 0}


In [9]:
# you can easily look up a count of a specific trinucleotide 
print(counts['TGA'])

2


In [10]:
# what if you don't want to have all those zeros in there 
dna = "AATGATCGATCGTACGCTGA"
counts = {}
for base1 in ['A', 'T', 'G', 'C']:
    for base2 in ['A', 'T', 'G', 'C']:
       for base3 in ['A', 'T', 'G', 'C']:
           trinucleotide = base1 + base2 + base3
           count = dna.count(trinucleotide)
           if count > 0:
                counts[trinucleotide] = count
print(counts)

{'AAT': 1, 'ATG': 1, 'ATC': 2, 'ACG': 1, 'TAC': 1, 'TGA': 2, 'TCG': 2, 'GAT': 2, 'GTA': 1, 'GCT': 1, 'CTG': 1, 'CGA': 1, 'CGT': 1, 'CGC': 1}


In [11]:
# if you look up a trinuc that has a number it prints out fine 
print(counts['TGA'])

2


In [12]:
# but if you look up a trinuc that is zero it gives you an error 
print(counts['AAA'])

KeyError: 'AAA'

In [14]:
# you can use an if statement to check if a trinuc exitsts if you don't want an error 
if 'AAA' in counts:
    print(counts('AAA'))

In [None]:
# but that won't print anything if it's not there (unless you do an else)

In [None]:
# you can use the get method
# it works just like square brackets if used like this

In [15]:
print(counts['TGA'])
print(counts.get('TGA'))

2
2


In [None]:
# but it also takes an optional second argument 
# which is the default value to be returned if the key isn't present in the dictionary

In [16]:
# we know if the trinuc is not in the dictionary, then the count is 0 
print("count for TGA is " + str(counts.get('TGA', 0)))
print("count for AAA is " + str(counts.get('AAA', 0)))
print("count for GTA is " + str(counts.get('GTA', 0)))
print("count for TTT is " + str(counts.get('TTT', 0)))

count for TGA is 2
count for AAA is 0
count for GTA is 1
count for TTT is 0


In [17]:
# what if we wanted to take our counts dictionary variable from 
# the code above and print out all trinucleotides where the count was 2
# we can add that into the loop
for base1 in ['A', 'T', 'G', 'C']:
    for base2 in ['A', 'T', 'G', 'C']:
       for base3 in ['A', 'T', 'G', 'C']:
           trinucleotide = base1 + base2 + base3
           if counts.get(trinucleotide, 0) == 2:
              print(trinucleotide)

ATC
TGA
TCG
GAT


In [None]:
# But it seems inefficient to go through the whole process of generating all possible trinucleotides again
# we can use the keys method instead

In [18]:
# When used on a dictionary, the keys method returns a list of all the keys in the dictionary
print(counts.keys())

dict_keys(['AAT', 'ATG', 'ATC', 'ACG', 'TAC', 'TGA', 'TCG', 'GAT', 'GTA', 'GCT', 'CTG', 'CGA', 'CGT', 'CGC'])


In [19]:
# now it is easy to make code hat prints all trinucs with counts of 2 
for trinucleotide in counts.keys():
    if counts.get(trinucleotide) == 2:
       print(trinucleotide)

ATC
TGA
TCG
GAT


In [None]:
# important note: dictionaries are inherently unordered 
# when we use the keys method to iterate over a dictionary, 
# we can't rely on processing the items in the same order that we added them

In [20]:
# If we want to control the order in which keys are printed we can use the sorted method 
# to sort the list before processing it
for trinucleotide in sorted(counts.keys()):
    if counts.get(trinucleotide) == 2:
       print(trinucleotide)

ATC
GAT
TCG
TGA


In [None]:
# In the example code above, the first thing we need to do inside the loop is to look up the value 
# for the current key. This is a very common pattern when iterating over dictionaries – 
# so common, in fact, that Python has a special shorthand for it
# this is how you could do it 
# for key in my_dict.keys():
    # value = my_dict.get(key)
    # do something with key and value

In [None]:
# but you can also use the items method to iterate over pairs of data 
# for key, value in my_dict.items():
   # do something with key and value

In [None]:
# the items method returns a pair of values, so you have to give it two variable names 
# at the start of the loop

In [21]:
# this does the same thing as code above 
for trinucleotide, count in counts.items():
    if count == 2:
       print(trinucleotide)

ATC
TGA
TCG
GAT


Exerciese: DNA translation 

In [None]:
# Write a program that will translate a DNA sequence into protein. 
# Your program should use the standard genetic code which can be found:
# https://www.ncbi.nlm.nih.gov/Taxonomy/taxonomyhome.html/index.cgi?chapter=tgencodes#SG1

In [None]:
# thoughts
# need to make a dictionary for all the trinucs to protein codes 
# should make a function to take a DNA sequence and split into trinucleotides(?)
# or maybe use the code above to make a dictionary of all codons in the sequence? 
# can't do a find and replace because it has to go in specific order... 

In [1]:
# start by making a dictionary of all the trinucleotides and their proteins 
protein_code = {
    "TTT" : r"Phe",
    "TTC" : r"Phe",
    "TTA" : r"Leu",
    "TTG" : r"Leu",
    "CTT" : r"Leu",
    "CTC" : r"Leu",
    "CTA" : r"Leu",
    "CTG" : r"Leu",
    "ATT" : r"Ile",
    "ATC" : r"Ile",
    "ATA" : r"Ile",
    "ATG" : r"Met",
    "GTT" : r"Val",
    "GTC" : r"Val",
    "GTA" : r"Val",
    "GTG" : r"Val",
    "TCT" : r"Ser",
    "TCC" : r"Ser",
    "TCA" : r"Ser",
    "TCG" : r"Ser",
    "CCT" : r"Pro",
    "CCC" : r"Pro",
    "CCA" : r"Pro",
    "CCG" : r"Pro",
    "ACT" : r"Thr",
    "ACC" : r"Thr",
    "ACA" : r"Thr",
    "ACG" : r"Thr",
    "GCT" : r"Ala",
    "GCC" : r"Ala",
    "GCA" : r"Ala",
    "GCG" : r"Ala",
    "TAT" : r"Tyr",
    "TAC" : r"Tyr",
    "TAA" : r"Ter",
    "TAG" : r"Ter",
    "CAT" : r"His",
    "CAC" : r"His",
    "CAA" : r"Gln",
    "CAG" : r"Gln",
    "AAT" : r"Asn",
    "AAC" : r"Asn",
    "AAA" : r"Lys",
    "AAG" : r"Lys",
    "GAT" : r"Asp",
    "GAC" : r"Asp",
    "GAA" : r"Glu",
    "GAG" : r"Glu",
    "TGT" : r"Cys",
    "TGC" : r"Cys",
    "TGA" : r"Ter",
    "TGG" : r"Trp",
    "CGT" : r"Arg",
    "CGC" : r"Arg",
    "CGA" : r"Arg",
    "CGG" : r"Arg",
    "AGT" : r"Ser",
    "AGC" : r"Ser",
    "AGA" : r"Arg",
    "AGG" : r"Arg",
    "GGT" : r"Gly",
    "GGC" : r"Gly",
    "GGA" : r"Gly",
    "GGG" : r"Gly"
}

In [2]:
# I did some googling to help me figure out a way to split up a DNA sequence into codons 
# https://stackoverflow.com/questions/41006466/how-to-split-up-a-string-every-every-three-indices-either-starting-at-index-0-1

def codons(seq,frame):
    n = len(seq)
    for i in range(frame - 1, n - 2, 3):
        yield seq[i:i+3]

In [88]:
test = 'CTCTTGAGGCGGCGACGCTGATGTTAGACGCCGGTGCTCATGAGTAAC' # make a test sequence 
# I can make it print all the codons 
for codon in codons(test,1): print(codon)

CTC
TTG
AGG
CGG
CGA
CGC
TGA
TGT
TAG
ACG
CCG
GTG
CTC
ATG
AGT
AAC


In [89]:
# can also make it into a list 
# and I can save that list
# I think this is what I will need 
for i in range(1,2):
    print(list(codons(test,i)))
    sequence_codons = list(codons(test,i))
    
print(sequence_codons)

['CTC', 'TTG', 'AGG', 'CGG', 'CGA', 'CGC', 'TGA', 'TGT', 'TAG', 'ACG', 'CCG', 'GTG', 'CTC', 'ATG', 'AGT', 'AAC']
['CTC', 'TTG', 'AGG', 'CGG', 'CGA', 'CGC', 'TGA', 'TGT', 'TAG', 'ACG', 'CCG', 'GTG', 'CTC', 'ATG', 'AGT', 'AAC']


In [95]:
# I went to google again to figure out how to replace items in a list with key value pairs 
# https://stackoverflow.com/questions/59154213/how-to-replace-items-in-list-with-a-keys-from-dictionary-in-python

# this doesn't like it if not all the trinucs are in the list....  
# need to remove a key from the dictionary if it's not in the sequence

for key, value in protein_code.items(): 
    #index = sequence_codons.index(key)
    if key != sequence_codons[index]: 
        protein_code.pop(key)
        print(protein_code)
    #sequence_codons[index] = value
    #print(sequence_condons)

IndexError: list index out of range

In [23]:
# cannot do the method that reverses the dictionary because there are multiple codons for each AA
# reversed_dict = {protein_code[k]:k for k in protein_code}
# reversed_dict
# result = [reversed_dict[elem] for elem in sequence_codons]

{'Phe': 'TTC',
 'Leu': 'CTG',
 'Ile': 'ATA',
 'Met': 'ATG',
 'Val': 'GTG',
 'Ser': 'AGC',
 'Pro': 'CCG',
 'Thr': 'ACG',
 'Ala': 'GCG',
 'Tyr': 'TAC',
 'Ter': 'TGA',
 'His': 'CAC',
 'Gln': 'CAG',
 'Asn': 'AAC',
 'Lys': 'AAG',
 'Asp': 'GAC',
 'Glu': 'GAG',
 'Cys': 'TGC',
 'Trp': 'TGG',
 'Arg': 'AGG',
 'Gly': 'GGG'}

In [24]:
[key for key, value in sorted(protein_code.items(), key = lambda x:sequence_codons.index(x[1]))]

ValueError: 'Phe' is not in list

In [26]:
for AA, replacement in protein_code.items():
    test_protein = sequence_condons.replace(AA, replacement)

AttributeError: 'list' object has no attribute 'replace'

In [33]:
for key, val in protein_code.items():
    for i, v in enumerate(sequence_codons):
        if v == val:
            sequence_conds[i] = key
            sequence_conds


In [35]:
test_protein = []
for trinuc, AA in protein_code.items():
    for codon in sequence_codons:
        if sequence_codons[codon] == trinuc:
            sequence_codons[codon] = trinuc
            print(sequence_codons)
            

TypeError: list indices must be integers or slices, not str

In [None]:
if a_list[i] == 'aple':
        a_list[i] = 'apple'