Following along with the chapter 

In [None]:
# making a function to get AT content out of a sequence

In [1]:
def get_at_content(dna):
    length = len(dna)
    a_count = dna.count('A')
    t_count = dna.count('T')
    at_content = (a_count + t_count) / length
    return at_content

In [None]:
# def means define, then get_at_content(dna) is the function with the argument (dna). The argument is a string. 
# Everything inside the function is indented in the same way 
# return says what to output
# "dna" doesn't mean anything, we put in the sequence or the string

In [2]:
get_at_content("ATGACTGGACCA")

0.5

In [3]:
# or define as a variable 
AT_content = get_at_content("ATGACTGGACCA")
print(AT_content)

0.5


In [None]:
# any variables created inside the function only exist inside the function. Cannot look at t_content

In [4]:
# Expanding the function 
def get_at_content(dna):
    length = len(dna)
    a_count = dna.count('A')
    t_count = dna.count('T')
    at_content = (a_count + t_count) / length
    return at_content

my_at_content = get_at_content("ATGCGCGATCGATCGAATCG")
print(str(my_at_content))
print(get_at_content("ATGCATGCAACTGTAGC"))
print(get_at_content("aactgtagctagctagcagcgta"))

0.45
0.5294117647058824
0.0


In [None]:
# too many sig figs, and a 0 value

In [5]:
# can we round the at content to lessent the SF
# can we make sure the case of the sequence is always upper
def get_at_content(dna):
    length = len(dna)
    a_count = dna.upper().count('A')
    t_count = dna.upper().count('T')
    at_content = (a_count + t_count) / length
    return round(at_content, 2)
my_at_content = get_at_content("ATGCGCGATCGATCGAATCG")
print(str(my_at_content))
print(get_at_content("ATGCATGCAACTGTAGC"))
print(get_at_content("aactgtagctagctagcagcgta"))

0.45
0.53
0.52


In [6]:
# what if you want to be able to specify how many significant figures to go to? 
# you can add in an argument to the function 
def get_at_content(dna, sig_figs):
    length = len(dna)
    a_count = dna.upper().count('A')
    t_count = dna.upper().count('T')
    at_content = (a_count + t_count) / length
    return round(at_content, sig_figs)
test_dna = "ATGCATGCAACTGTAGC"
print(get_at_content(test_dna, 1))
print(get_at_content(test_dna, 2))
print(get_at_content(test_dna, 3))

0.5
0.53
0.529


In [11]:
# functions can print instead of return

def print_at_content(dna):
    length = len(dna)
    a_count = dna.upper().count('A')
    t_count = dna.upper().count('T')
    at_content = (a_count + t_count) / length
    print(str(round(at_content, 2)))
get_at_content(test_dna,2)

0.53

In [None]:
# however calculating the AT content and printing it are two different jobs
# want to write functions that do only one job 
# that way you are able to do anything with the AT content later not just print it 

In [13]:
# you can also call functions with keyword arguments 
get_at_content(dna="ATCGTGACTCG", sig_figs=2)
# order doesn't matter because you specify the arguments 
get_at_content(sig_figs=2, dna="ATCGTGACTCG")

0.45

In [15]:
# you can also set a default value for an argument 
# set it at the beginning when defining arguments 
def get_at_content(dna, sig_figs=2):
    length = len(dna)
    a_count = dna.upper().count('A')
    t_count = dna.upper().count('T')
    at_content = (a_count + t_count) / length
    return round(at_content, sig_figs)

get_at_content("ATCGTGACTCG")
get_at_content("ATCGTGACTCG", 3)
get_at_content("ATCGTGACTCG", sig_figs=4)

0.4545

In [17]:
# sometimes you need to test functions 
# use the assert function 
assert get_at_content("ATGC") == 0.5
# we know that the AT content of ATGC should be 50% 
# will give an error if false

In [18]:
# what if your dna input has unknown bases?
assert get_at_content("ATGCNNNNNNNNNN") == 0.5

AssertionError: 

In [19]:
# you get an error, but what if you still need to know the AT content? 
# you can modify the function to remove possible Ns 
def get_at_content(dna, sig_figs=2):
    dna = dna.replace('N', '')
    length = len(dna)
    a_count = dna.upper().count('A')
    t_count = dna.upper().count('T')
    at_content = (a_count + t_count) / length
    return round(at_content, sig_figs)

In [20]:
# you can do a bunch of assertations to make sure every part of the function works 
assert get_at_content("A") == 1
assert get_at_content("G") == 0
assert get_at_content("ATGC") == 0.5
assert get_at_content("AGG") == 0.33
assert get_at_content("AGG", 1) == 0.3
assert get_at_content("AGG", 5) == 0.33333

Exercises

In [None]:
# percentage of amino acid residues 
# Write a function that takes two arguments – a protein sequence and an amino acid residue code 
# – and returns the percentage of the protein that the amino acid makes up
# use this MSRSLLLRFLLFLLLLPPLP as a test sequence

In [26]:
def get_aa_content(protein, aa, sig_figs=1):
    length = len(protein)
    aa_num = protein.upper().count(aa.upper())
    aa_content = aa_num / length 
    aa_percent = aa_content * 100
    return round(aa_percent, sig_figs)
  
# my own test    
get_aa_content("MSRS", "S", 2)

50.0

In [31]:
# test the function further with assert
assert get_aa_content("MSRSLLLRFLLFLLLLPPLP", "M") == 5
assert get_aa_content("MSRSLLLRFLLFLLLLPPLP", "r") == 10
assert get_aa_content("MSRSLLLRFLLFLLLLPPLP", "L") == 50
assert get_aa_content("MSRSLLLRFLLFLLLLPPLP", "Y") == 0

In [None]:
#Modify the function from part one so that it accepts a list of amino acid residues rather than a single one. 
# If no list is given, the function should return the percentage of hydrophobic amino acid residues 
# (A, I, L, M, F, W, Y and V)

In [None]:
# supposed to be a for loop but I can't do those 

In [40]:
def get_aa_content(protein, aa=["A", "I", "L", "M", "F", "W", "Y", "V"], sig_figs=1):
    length = len(protein)
    convert = "".join(aa)
    makeup = convert.upper()
    convertback = list(makeup)
    aa_num = protein.upper().count(convertback)
    aa_content = aa_num / length 
    aa_percent = aa_content * 100
    return round(aa_percent, sig_figs)
get_aa_content("MSRS", ["S", "M"], 2)

SM


TypeError: must be str, not list