# Amino acid calculator
#### Data was taken from hmdb.ca/proteins/HMDBP02075; a FASTA header line was added to each file to give practice handling one.

## Import the gene (DNA) sequence from the FASTA file and make sure its length is valid


In [2]:
## Define path to the albumin gene txt file (one header line, then the sequence lines)

albumin_gene_file = 'albumin_gene.txt'

# Open the file in read mode; the `with` block closes the file for us when it ends
with open(albumin_gene_file, 'r') as file:
    # .read() reads the entire content of the file into one string
    # .strip() removes any leading or trailing white space
    # .splitlines() makes a list with one element per line; unlike .split('\n') it also
    #     understands Windows-style '\r\n' line endings, so no stray '\r' sneaks into the sequence
    # [1:] selects the second line of the file to the end (because we don't want the header line!)
    sequence_lines = file.read().strip().splitlines()[1:]

# Glue the lines back together with nothing in between, trimming white space around each line,
# so the whole gene ends up as one continuous string with no new line characters
gene_sequence = ''.join(line.strip() for line in sequence_lines)

# An empty sequence would pass the "divisible by 3" check below (0 % 3 == 0),
# so catch it explicitly instead of silently translating nothing
if not gene_sequence:
    raise ValueError("No DNA sequence found after the header line")

# Make sure the DNA sequence length is divisible by 3
# try changing this number and see what happens
if len(gene_sequence) % 3 != 0:
    raise ValueError("Invalid DNA sequence length")


### Extract codons from the gene sequence using a loop

In [3]:
# Chop the gene sequence into codons (non-overlapping chunks of 3 letters).
# range(0, len(gene_sequence), 3) gives the start position of every chunk (0, 3, 6, 9, ...)
# and the slice [start:start + 3] grabs the three letters that begin at that position.
albumin_codons = [
    gene_sequence[start:start + 3]
    for start in range(0, len(gene_sequence), 3)
]

# Show the resulting list of codons
print(albumin_codons)

['ATG', 'AAG', 'TGG', 'GTA', 'ACC', 'TTT', 'ATT', 'TCC', 'CTT', 'CTT', 'TTT', 'CTC', 'TTT', 'AGC', 'TCG', 'GCT', 'TAT', 'TCC', 'AGG', 'GGT', 'GTG', 'TTT', 'CGT', 'CGA', 'GAT', 'GCA', 'CAC', 'AAG', 'AGT', 'GAG', 'GTT', 'GCT', 'CAT', 'CGG', 'TTT', 'AAA', 'GAT', 'TTG', 'GGA', 'GAA', 'GAA', 'AAT', 'TTC', 'AAA', 'GCC', 'TTG', 'GTG', 'TTG', 'ATT', 'GCC', 'TTT', 'GCT', 'CAG', 'TAT', 'CTT', 'CAG', 'CAG', 'TGT', 'CCA', 'TTT', 'GAA', 'GAT', 'CAT', 'GTA', 'AAA', 'TTA', 'GTG', 'AAT', 'GAA', 'GTA', 'ACT', 'GAA', 'TTT', 'GCA', 'AAA', 'ACA', 'TGT', 'GTT', 'GCT', 'GAT', 'GAG', 'TCA', 'GCT', 'GAA', 'AAT', 'TGT', 'GAC', 'AAA', 'TCA', 'CTT', 'CAT', 'ACC', 'CTT', 'TTT', 'GGA', 'GAC', 'AAA', 'TTA', 'TGC', 'ACA', 'GTT', 'GCA', 'ACT', 'CTT', 'CGT', 'GAA', 'ACC', 'TAT', 'GGT', 'GAA', 'ATG', 'GCT', 'GAC', 'TGC', 'TGT', 'GCA', 'AAA', 'CAA', 'GAA', 'CCT', 'GGG', 'AGA', 'AAT', 'GAA', 'TGC', 'TTC', 'TTG', 'CAA', 'CAC', 'AAA', 'GAT', 'GAC', 'AAC', 'CCA', 'AAC', 'CTC', 'CCC', 'CGA', 'TTG', 'GTG', 'AGA', 'CCA', 'GAG'

### Define Dictionary to relate codons to amino acids
##### Note: there is an easier way to do this using the 'biopython' library. Feel free to give that a try on your own!

In [4]:
# Dictionary with key: value pairs of codon: amino acid (one-letter code; '*' marks a stop codon).
#
# Instead of typing out all 64 pairs by hand, we generate the codons in a fixed order and
# pair each one with the matching letter from a 64-character string:
#   - the three nested loops run every base position through T, C, A, G, with the third
#     base changing fastest, giving TTT, TTC, TTA, TTG, TCT, TCC, ... , GGA, GGG
#   - the amino acid string is laid out in the same order, one row per first base
#   - zip() pairs codon number n with amino acid number n, and dict() turns the pairs into a lookup
codon_table = dict(zip(
    (
        first + second + third
        for first in 'TCAG'
        for second in 'TCAG'
        for third in 'TCAG'
    ),
    'FFLLSSSSYY**CC*W'    # codons starting with T
    'LLLLPPPPHHQQRRRR'    # codons starting with C
    'IIIMTTTTNNKKSSRR'    # codons starting with A
    'VVVVAAAADDEEGGGG',   # codons starting with G
))

### Iterate through our codon list and build the amino acid sequence


In [5]:
# Translate every codon, in order, and glue the one-letter amino acid codes into a single string.
# .get(codon, 'X') returns the dictionary value for that codon,
# or an 'X' when the codon is not a key in the dictionary (an unknown codon).
calculated_amino_acid_sequence = ''.join(
    codon_table.get(codon, 'X') for codon in albumin_codons
)

print(calculated_amino_acid_sequence)

MKWVTFISLLFLFSSAYSRGVFRRDAHKSEVAHRFKDLGEENFKALVLIAFAQYLQQCPFEDHVKLVNEVTEFAKTCVADESAENCDKSLHTLFGDKLCTVATLRETYGEMADCCAKQEPGRNECFLQHKDDNPNLPRLVRPEVDVMCTAFHDNEETFLKKYLYEIARRHPYFYAPELLFFAKRYKAAFTECCQAADKAACLLPKLDELRDEGKASSAKQRLKCASLQKFGERAFKAWAVARLSQRFPKAEFAEVSKLVTDLTKVHTECCHGDLLECADDRADLAKYICENQDSISSKLKECCEKPLLEKSHCIAEVENDEMPADLPSLAADFVESKDVCKNYAEAKDVFLGMFLYEYARRHPDYSVVLLLRLAKTYETTLEKCCAAADPHECYAKVFDEFKPLVEEPQNLIKQNCELFEQLGEYKFQNALLVRYTKKVPEVSTPTLVEVSRNLGKVGSKCCKHPEAKRMPCAEDYLSVVLNQLCVLHEKTPVSDRVTKCCTESLVNRRPCFSALEVDETYVPKEFNAETFTFHADICTLSEKERQIKKQTALVELVKHKPKATKEQLKAVMDDFAAFVEKCCKADDKETCFAEEGKKLVAASQAALGL*


## Let's check how we did!

In [6]:
# For every amino acid in the reference file, check whether we calculated the same amino acid
# at that position, then report the fraction we got right (1.0 would be a perfect match)

albumin_aa_file = 'albumin_aa.txt'


# Open the file in read mode; the `with` block closes the file for us when it ends
with open(albumin_aa_file, 'r') as file:
    # .read() reads the entire content of the file into one string
    # .strip() removes any leading or trailing white space
    # .splitlines() makes a list with one element per line; unlike .split('\n') it also
    #     understands Windows-style '\r\n' line endings, so no stray '\r' sneaks into the sequence
    # [1:] selects the second line of the file to the end (because we don't want the header line!)
    reference_lines = file.read().strip().splitlines()[1:]

# Glue the lines back together with nothing in between, trimming white space around each line
true_aa_sequence = ''.join(line.strip() for line in reference_lines)

# Without a reference there is nothing to score against (and we would divide by zero below)
if not true_aa_sequence:
    raise ValueError("No amino acid sequence found after the header line")

# this will get updated soon
score_numerator = 0
# we use the length of the true amino acid sequence because our dictionary has a * for stop codons,
# which is not present in the true aa sequence. this makes the true aa sequence shorter than the
# calculated aa sequence by 1 amino acid
score_denominator = len(true_aa_sequence)

# check how many we got right and print out the indices of incorrect amino acids
for i, true_aa in enumerate(true_aa_sequence):
    # the slice [i:i + 1] gives the single letter at position i, or an empty string when the
    # calculated sequence is too short -- so a missing amino acid is counted as a mismatch
    # instead of stopping the loop with an IndexError
    if calculated_amino_acid_sequence[i:i + 1] == true_aa:
        score_numerator += 1
    else:
        print(i)

result = score_numerator / score_denominator
print(round(result, 4))



120
440
0.9967


## We got two incorrect! Not bad! It is harder to go the other way, because multiple codons can encode the same amino acid, but feel free to give it a try! Or try to make mRNA from the DNA!