# Written to calculate weight of each protein sequence in C. therm genome and report results
## Written by: Wheaton Schroeder
## Latest version: 07/12/2023

#### Import packages

In [1]:
#import necessary packages

import pandas as pd
import numpy as np
import cobra
from collections import OrderedDict
from copy import deepcopy
import os
import urllib.request
import re
import certifi
import ssl

print("success")

success


#### Function to read the amino acid weights used for calculating protein mass into a dictionary

In [2]:
def make_aa_mass_dic(map_path='PROTEIN_amino_acid_map.txt'):
  """Read the amino acid mass table into a dictionary.

  The file is whitespace-delimited. Its first line is a header and is
  skipped. On every other non-blank line the first field is used as the
  key (the amino acid code) and the last field as the value (its mass).

  Parameters
  ----------
  map_path : str, optional
      Path of the mass table. Defaults to 'PROTEIN_amino_acid_map.txt'
      in the current working directory.

  Returns
  -------
  dict
      Maps amino acid code -> mass. Masses are kept as the strings read
      from the file; convert with float() before doing arithmetic.
  """

  #initialize the dictionary
  aa_mass = {}

  #the with-block guarantees the file is closed, even if parsing fails
  with open(map_path, 'r') as file_handle:

    #skip the first line, as it is a header (the default avoids an error on an empty file)
    next(file_handle, None)

    #for each remaining line
    for line in file_handle:

      #split on any run of whitespace, we want the first and last items
      items = line.split()

      #blank lines carry no data and would otherwise create an empty-string key
      if not items:

        continue

      aa_mass[items[0]] = items[-1]

  return aa_mass

#build the amino acid -> mass lookup from the map file
aa_mass = make_aa_mass_dic()

#check it worked: leaving the dictionary as the last expression displays it as the cell output
aa_mass

{'A': '72.08',
 'C': '104.14',
 'D': '115.08',
 'E': '129.11',
 'F': '148.17',
 'G': '58.05',
 'H': '138.14',
 'I': '114.16',
 'K': '130.18',
 'L': '114.16',
 'M': '132.2',
 'N': '115.1',
 'P': '98.12',
 'Q': '129.13',
 'R': '158.19',
 'S': '88.08',
 'T': '102.1',
 'V': '100.13',
 'W': '187.21',
 'Y': '164.17'}

#### Function to calculate the weight of proteins in g/mmol from the amino acid sequence

In [3]:
def calc_aa_mass(aaseq, mass_dict):
  """Calculate the mass of a protein from its amino acid sequence.

  Each of the 20 standard one-letter amino acid codes is counted in the
  sequence (upper and lower case both count) and multiplied by its mass
  from mass_dict. Any other character (whitespace, line breaks,
  non-standard codes) contributes nothing.

  Parameters
  ----------
  aaseq : str
      Amino acid sequence in one-letter codes.
  mass_dict : dict
      Maps the upper-case one-letter code to its mass in g/mol. Values
      may be strings or numbers; they are converted with float().

  Returns
  -------
  float
      Mass of the sequence in g/mmol (the g/mol total divided by 1000).

  Raises
  ------
  KeyError
      If mass_dict lacks one of the 20 standard codes.
  """

  #the 20 standard residues, in the order in which their masses are accumulated
  #note: the order is fixed so the floating-point total is reproducible
  residues = 'ACDEFGHIKLMNPQRSTVWY'

  #accumulate with an explicit loop rather than sum(), which may use compensated summation and change the last digits
  mass = 0.0

  for residue in residues:

    #note: need float here, the dictionary read from file stores the masses as strings
    residue_mass = float(mass_dict[residue])

    #count upper and lower case occurrences separately, then add their contributions
    upper_mass = aaseq.count(residue) * residue_mass
    lower_mass = aaseq.count(residue.lower()) * residue_mass

    mass += upper_mass + lower_mass

  #so far, mass is at g/mol, need to divide by 1000 to get the g/mmol
  return mass / 1000


#test this function using the amino acid sequence of Clo1313_1396
#the mass table is re-read from file here so that this check does not depend on earlier variables
mass = calc_aa_mass('MSRMTLKSSMKKRILSLVIAVVFLSLTGVFPSGLIETKVSAAKITENYQFDSRIRLNSIGFIPNHSKKATIAANCSTFYVVKEDGTIVYTGTATSMFDNDTKETVYIADFSSVNEEGTYYLAVPGVGKSVNFKIAMNVYEDAFKTAMLGMYLLRCGTSVSATYNGIHYSHGPCHTNDAYLDYINGQHTKKDSTKGWHDAGDYNKYVVNAGITVGSMFLAWEHFKDQLEPVALEIPEKNNSIPDFLDELKYEIDWILTMQYPDGSGRVAHKVSTRNFGGFIMPENEHDERFFVPWSSAATADFVAMTAMAARIFRPYDPQYAEKCINAAKVSYEFLKNNPANVFANQSGFSTGEYATVSDADDRLWAAAEMWETLGDEEYLRDFENRAAQFSKKIEADFDWDNVANLGMFTYLLSERPGKNPALVQSIKDSLLSTADSIVRTSQNHGYGRTLGTTYYWGCNGTVVRQTMILQVANKISPNNDYVNAALDAISHVFGRNYYNRSYVTGLGINPPMNPHDRRSGADGIWEPWPGYLVGGGWPGPKDWVDIQDSYQTNEIAINWNAALIYALAGFVNYNSAQNEVLYGDVNDDGKVNSTDLTLLKRYVLKAVSTLPSSKAEKNADVNRDGRVNSSDVTILSRYLIRVIEKLPI', make_aa_mass_dic())

#calc_aa_mass returns g/mmol, which is numerically the same as kDa
print("Example mass of protein from Clo1313_1396 calculated as: "+str(mass)+" kDa")

Example mass of protein from Clo1313_1396 calculated as: 73.03244999999998 kDa


#### create

#### Read the annotated genome

In [6]:
#state the file
genome_file = "Ctherm_DSM1313_genome_annotated.gbff"

#read the file
with open(genome_file) as genome_handle:
    data = genome_handle.read()

#remove the unnecessary bit before the feature table
#note: the two-name unpacking requires exactly one "FEATURES" marker, so a file with several records fails here rather than being silently truncated
junk1, keep1 = data.split("FEATURES")

#remove the ACGT genome, not needed for this context (same single-marker requirement for "CONTIG")
keep2, junk2 = keep1.split("CONTIG")

#split the annotation at each gene
genes = keep2.split("     gene            ")

#first element is still annotation stuff we don't need
genes.pop(0)

#read the amino acid masses once, rather than re-reading the file for every gene
residue_masses = make_aa_mass_dic()

#compile the patterns once, outside the loop
#the translation pattern stops at the first closing quote, so it cannot run on into a later quoted qualifier
locus_pattern = re.compile(r'/old_locus_tag="(?P<locus>.+?)"')
trans_pattern = re.compile(r'/translation="(?P<trans>[^"]+)"')

#collect the output rows in a list and join once at the end, starting with the header
out_rows = ["old_locus\tmass\ttranslation"]

#for each gene
for gene in genes:

    #read the old locus tag
    locus_match = locus_pattern.search(gene)

    if locus_match is None:

        #no locus found, skip to next element
        continue

    #read the translation
    trans_match = trans_pattern.search(gene)

    if trans_match is None:

        #no translation (e.g. not a protein-coding gene), skip to next element
        continue

    #remove formatting from the translation so all we have left is the AA sequence
    translation = trans_match['trans'].replace("\n","").replace(" ","")

    #calculate the weight of the protein from the cleaned sequence
    #note: stored under its own name so the aa_mass dictionary from the earlier cell is not overwritten
    protein_mass = calc_aa_mass(translation, residue_masses)

    out_rows.append(locus_match['locus']+"\t"+str(protein_mass)+"\t"+translation)

#create the string that gets written to the output file (no trailing newline)
out_str = "\n".join(out_rows)

#for debugging, comment if unused
print(out_str)

old_locus	mass	translation
Clo1313_0001	50.72788	MNTQLNEIWQKTLGLLKNELTEISFNTWIKTIDPLSLTGNTINLAVPAEFNKGILESRYQTLIKNAIKQVTFKEYEIAFIVPSQENLNKLTKQTESAGNEDSPLSVLNPKYTFDTFVIGNSNRFAHAAALAVAEAPGKAYNPLFIYGGVGLGKTHLMHAIGHYILEQNSSQRVLYVSSEKFTNELINAIKDNRNEEFRSKYRNIDVLLIDDIQFIAGKERTEEEFFHTFNALYEANKQIILSSDKPPKEISLEDRLRSRFEWGLIADMQAPDLETRIAILRKKAQLENLTVPNEVIVFIADKIASNIRELEGALNRVIAYSSLTENEITVELASEALKDILSANKAKVLNCTTIQEAVARYFDIRPEEFKSKKRTRDIAFPRQIAMYLCRELTEMSLPKIGEEFGGRDHTTVIHACEKISEEIESNSETRRAVSEIKRNLLGK
Clo1313_0002	41.53706	MKIVCSKEQLMEGINVVQKAVPTKATLTILEGILLEAYDNFKMTGNDLELGIECLIDADILEKGSIVLNSKMFGDIVRRLPDSEVLIEVKENNTVIIECDNSHFELRGMPSDSFPSLPSIEKENMIKVSQKAIRDMIRQTLFAVSMEGTRPILTGSLIECAGNEITFVSIDGFRMALRKNFNNEGFSEFSVVVPAKTLSEIGKILQPVDEDIYIYSSQNQILFEIGNCKVVSRLLEGEYLNYKSIIPPEYETSVRLRTEDLLSSLERASLITSDEKKYPVKFNIIDDKIIITSNTEIGAVREEIRVEVNGSNMEVGFNPRYFIEALRVIDDELVDIYFNSSVGPCTIRPLEGDSFAYMILPVRINK
Clo1313_0003	7.94928	MENIKINTEFIKLDQFLKWTKTVSMGSEAKLMIRSGLVKVNGEVELRRGRKLRTGDIVEINDKKFQIV
Clo1313_0004	43.436319999999

#### Write results to a file

In [7]:
#write the tab-separated table (header, then one row per locus) next to the notebook
tsv_handle = open('./locus_to_aa.tsv', 'w')
try:
    tsv_handle.write(out_str)
finally:
    #always release the file, even if the write fails
    tsv_handle.close()

#### Calculate protein mass

In [None]:
#calculate the mass (g/mmol) of a single amino acid sequence, re-reading the mass table from file
mass = calc_aa_mass('MSYLTVGLILILALFAIRFSNRYGIPALLLFIVLGMFFSIGIDFGDYEFADTFATVALMVIMFYGGFGTNWKMGKPVAKEAIVLSSLGVITTALMTGLFCHYVLGFKLLEGMLIGSIVGSTDYASVSNILRSKNLNLKYNTASLLELESGSNDPTAFTMTMVFLSAIIGTKLSVPVLILSQVVLGIVMGCIFSFVIGKLLKNYSLESDGLYAVFMASIILVTYAATDLLGGNGYLALYILGIYLGNMEFKGKRNIMFFFDGFTEIMQIGLFFILGLLSELPKFIAGLPWALAIMLFMIVIARPVTVYGLMLPFRLKFNQLNIISLAGIRGAAAIAFAIMAVNSPAVLSVDVYHIVFGICVLSSLIQGSLMPFAAKRLDMLDPGDTVLKTFNYYQDKSEIGFLETRIGPNSSLIGKKVKDLNLTFDFIVAKIERNGKTIVPRGHVTIKENDLIVIGGAVHFDKTGHELTEFTISKGHKWQNKYIKDLGLPHNHLIIMIQREGNEIIVPVGDTLLLEGDKIIMIKAEHPLEFPLANEMAT',make_aa_mass_dic())

#display the result as the cell output
mass