# Written to calculate weight of a protein sequence
## Written by: Wheaton Schroeder
## Latest version: 06/21/2023

#### Import packages

In [1]:
#import necessary packages

import pandas as pd
import numpy as np
import cobra
from collections import OrderedDict
from copy import deepcopy
import os
import urllib.request
import re
import certifi
import ssl

#reached only if every import above succeeded; acts as a quick visual confirmation
print("success")

success


#### function to read the amino acid weights for calculating protein mass into a dictionary

In [2]:
def make_aa_mass_dic():
  """Read the amino acid mass table from 'PROTEIN_amino_acid_map.txt'.

  The file is opened relative to the current working directory. Its first
  line is treated as a header and skipped. Every following non-blank line
  is split on whitespace; the first field becomes the dictionary key and
  the last field becomes the value.

  Returns:
    dict: maps the first field of each data row to the last field of that
    row. Values are kept as the strings read from the file, so callers
    must convert them (e.g. with float()) before doing arithmetic.

  Raises:
    OSError: if the file cannot be opened (e.g. FileNotFoundError).
  """

  #initialize the dictionary
  aa_mass = {}

  #the context manager closes the file even if parsing fails part-way
  with open('PROTEIN_amino_acid_map.txt','r') as file_handle:

    #skip the first line, as it is a header (no error if the file is empty)
    next(file_handle, None)

    #for each remaining line
    for line in file_handle:

      #split by whitespace, we want the first and last items
      items = line.split()

      #skip blank lines (e.g. a trailing empty line at the end of the file),
      #which would otherwise add an empty-string entry to the dictionary
      if not items:
        continue

      aa_mass[items[0]] = items[-1]

  return aa_mass

#build the amino acid mass lookup table by reading it from file
aa_mass = make_aa_mass_dic()

#bare expression: when run as a notebook cell this displays the dictionary contents
aa_mass

{'A': '72.08',
 'C': '104.14',
 'D': '115.08',
 'E': '129.11',
 'F': '148.17',
 'G': '58.05',
 'H': '138.14',
 'I': '114.16',
 'K': '130.18',
 'L': '114.16',
 'M': '132.2',
 'N': '115.1',
 'P': '98.12',
 'Q': '129.13',
 'R': '158.19',
 'S': '88.08',
 'T': '102.1',
 'V': '100.13',
 'W': '187.21',
 'Y': '164.17'}

#### function to calculate weight of proteins in g/mmol from amino acid sequence

In [3]:
def calc_aa_mass(aaseq, mass_dict):
  """Calculate the mass of a protein from its amino acid sequence.

  Each of the 20 one-letter residue codes below is counted in aaseq, in
  both upper and lower case, and each count is multiplied by the matching
  entry of mass_dict. Any other character in aaseq is ignored.

  Args:
    aaseq: amino acid sequence as a string of one-letter codes.
    mass_dict: mapping from upper-case one-letter code to mass. Values may
      be numbers or numeric strings (each is passed through float()).
      The masses are expected in g/mol.

  Returns:
    float: the summed mass divided by 1000, i.e. g/mmol when the masses in
    mass_dict are in g/mol. Returns 0.0 for an empty sequence.

  Raises:
    KeyError: if mass_dict lacks one of the 20 residue codes.
    ValueError: if a mass value cannot be converted to float.
  """

  #running total, in the units of mass_dict
  mass = 0.0

  #residues are visited in the same fixed order for every call so that the
  #floating point sum is reproducible
  for residue in 'ACDEFGHIKLMNPQRSTVWY':

    #note: need float here because the table read from file stores masses as strings
    residue_mass = float(mass_dict[residue])

    #count upper and lower case occurrences separately, both use the same mass
    mass += (aaseq.count(residue) * residue_mass
             + aaseq.count(residue.lower()) * residue_mass)

  #so far, mass is at g/mol, need to divide by 1000 to get the g/mmol
  return mass/1000


#test calc_aa_mass using the protein sequence of Clo1313_1396
#the mass table is read from file by make_aa_mass_dic() for this call
mass = calc_aa_mass('MSRMTLKSSMKKRILSLVIAVVFLSLTGVFPSGLIETKVSAAKITENYQFDSRIRLNSIGFIPNHSKKATIAANCSTFYVVKEDGTIVYTGTATSMFDNDTKETVYIADFSSVNEEGTYYLAVPGVGKSVNFKIAMNVYEDAFKTAMLGMYLLRCGTSVSATYNGIHYSHGPCHTNDAYLDYINGQHTKKDSTKGWHDAGDYNKYVVNAGITVGSMFLAWEHFKDQLEPVALEIPEKNNSIPDFLDELKYEIDWILTMQYPDGSGRVAHKVSTRNFGGFIMPENEHDERFFVPWSSAATADFVAMTAMAARIFRPYDPQYAEKCINAAKVSYEFLKNNPANVFANQSGFSTGEYATVSDADDRLWAAAEMWETLGDEEYLRDFENRAAQFSKKIEADFDWDNVANLGMFTYLLSERPGKNPALVQSIKDSLLSTADSIVRTSQNHGYGRTLGTTYYWGCNGTVVRQTMILQVANKISPNNDYVNAALDAISHVFGRNYYNRSYVTGLGINPPMNPHDRRSGADGIWEPWPGYLVGGGWPGPKDWVDIQDSYQTNEIAINWNAALIYALAGFVNYNSAQNEVLYGDVNDDGKVNSTDLTLLKRYVLKAVSTLPSSKAEKNADVNRDGRVNSSDVTILSRYLIRVIEKLPI', make_aa_mass_dic())

#calc_aa_mass returns the summed mass divided by 1000 (g/mmol); it is labelled as kDa in the message
print("Example mass of protein from Clo1313_1396 calculated as: "+str(mass)+" kDa")

Example mass of protein from Clo1313_1396 calculated as: 73.03244999999998 kDa


#### Calculate protein mass

In [29]:
#amino acid sequence (one-letter codes) of the protein to analyze; replace this string to analyze another protein
aa_seq = 'MTKIANKYEVIDNVEKLEKALKRLREAQSVYATYTQEQVDKIFFEAAMAANKMRIPLAKMAVEETGMGVVEDKVIKNHYASEYIYNAYKNTKTCGVIEEDPAFGIKKIAEPLGVIAAVIPTTNPTSTAIFKTLIALKTRNAIIISPHPRAKNSTIEAAKIVLEAAVKAGAPEGIIGWIDVPSLELTNLVMREADVILATGGPGLVKAAYSSGKPAIGVGAGNTPAIIDDSADIVLAVNSIIHSKTFDNGMICASEQSVIVLDGVYKEVKKEFEKRGCYFLNEDETEKVRKTIIINGALNAKIVGQKAHTIANLAGFEVPETTKILIGEVTSVDISEEFAHEKLCPVLAMYRAKDFDDALDKAERLVADGGFGHTSSLYIDTVTQKEKLQKFSERMKTCRILVNTPSSQGGIGDLYNFKLAPSLTLGCGSWGGNSVSDNVGVKHLLNIKTVAERRENMLWFRTPEKIYIKRGCLPVALDELKNVMGKKKAFIVTGNFLYNNGYTKPITDKLDEMGIVHKTFFDVSPDPSLASAKAGAAEMLAFQPDTIIAVGGGSAMDAAKIMWVMYEHPEVDFMDMAMRFMDIRKRVYTFPKMGQKAYFIAIPTSAGTGSEVTPFAVITDEKTGIKYPLADYELLPDMAIVDADMMMNAPKGLTAASGIDALTHALEAYVSMLATDYTDSLALRAIKMIFEYLPRAYENGASDPVAREKMANAATIAGMAFANAFLGVCHSMAHKLGAFYHLPHGVANALMINEVIRFNSSEAPTKMGTFPQYDHPRTLERYAEIADYIGLKGKNNEEKVENLIKAIDELKEKVGIRKTIKDYDIDEKEFLDRLDEMVEQAFDDQCTGTNPRYPLMNEIRQMYLNAYYGGAKK'

#the mass table is re-read from file by make_aa_mass_dic() for this call
mass = calc_aa_mass(aa_seq,make_aa_mass_dic())

#bare expression: when run as a notebook cell this displays the calculated mass
mass

96.79497000000002

In [5]:
#NOTE: counting the empty string returns len(aa_seq) + 1 (one empty match at every position, including both ends),
#so the displayed value is one more than the number of residues; use len(aa_seq) if the sequence length is wanted
aa_seq.count('')

375

#### Count amino acids by type (used to manually build new sequences for knocked-in genes)

In [6]:
#count upper-case 'A' (alanine) residues in aa_seq and print the tally
res = aa_seq.count('A')
print("ala = ", res)

ala =  41


In [7]:
#count upper-case 'C' (cysteine) residues in aa_seq and print the tally
res = aa_seq.count('C')
print("cys = ", res)

cys =  4


In [8]:
#count upper-case 'D' (aspartate) residues in aa_seq and print the tally
res = aa_seq.count('D')
print("asp = ", res)

asp =  26


In [9]:
#count upper-case 'E' (glutamate) residues in aa_seq and print the tally
res = aa_seq.count('E')
print("glu = ", res)

glu =  13


In [10]:
#count upper-case 'F' (phenylalanine) residues in aa_seq and print the tally
res = aa_seq.count('F')
print("phe = ", res)

phe =  11


In [11]:
#count upper-case 'G' (glycine) residues in aa_seq and print the tally
res = aa_seq.count('G')
print("gly = ", res)

gly =  27


In [12]:
#count upper-case 'H' (histidine) residues in aa_seq and print the tally
res = aa_seq.count('H')
print("his = ", res)

his =  8


In [13]:
#count upper-case 'I' (isoleucine) residues in aa_seq and print the tally
res = aa_seq.count('I')
print("ile = ", res)

ile =  21


In [14]:
#count upper-case 'K' (lysine) residues in aa_seq and print the tally
res = aa_seq.count('K')
print("lys = ", res)

lys =  30


In [15]:
#count upper-case 'L' (leucine) residues in aa_seq and print the tally
res = aa_seq.count('L')
print("leu = ", res)

leu =  26


In [16]:
#count upper-case 'M' (methionine) residues in aa_seq and print the tally
res = aa_seq.count('M')
print("met = ", res)

met =  9


In [17]:
#count upper-case 'N' (asparagine) residues in aa_seq and print the tally
res = aa_seq.count('N')
print("asn = ", res)

asn =  13


In [18]:
#count upper-case 'P' (proline) residues in aa_seq and print the tally
res = aa_seq.count('P')
print("pro = ", res)

pro =  12


In [19]:
#count upper-case 'Q' (glutamine) residues in aa_seq and print the tally
res = aa_seq.count('Q')
print("gln = ", res)

gln =  11


In [20]:
#count upper-case 'R' (arginine) residues in aa_seq and print the tally
res = aa_seq.count('R')
print("arg = ", res)

arg =  17


In [21]:
#count upper-case 'S' (serine) residues in aa_seq and print the tally
res = aa_seq.count('S')
print("ser = ", res)

ser =  25


In [22]:
#count upper-case 'T' (threonine) residues in aa_seq and print the tally
res = aa_seq.count('T')
print("thr = ", res)

thr =  24


In [23]:
#count upper-case 'W' (tryptophan) residues in aa_seq and print the tally
res = aa_seq.count('W')
print("trp = ", res)

trp =  5


In [24]:
#count upper-case 'V' (valine) residues in aa_seq and print the tally
res = aa_seq.count('V')
print("val = ", res)

val =  45


In [25]:
#count upper-case 'Y' (tyrosine) residues in aa_seq and print the tally
res = aa_seq.count('Y')
print("tyr = ", res)

tyr =  6
