In [67]:
import Bio
import json
import requests
import pandas as pd
from Bio.SeqUtils.ProtParam import ProteinAnalysis

#### Cleaning and Preparing the Data

In [68]:
prot_seq = pd.read_csv('Disprot_sequence.csv') 
prot_seq

Unnamed: 0,uniref50,sequence
0,UniRef50_P03265,MASREEEQRETTPERGRGAARRPPTMEDVSSPSPSPPPPRAPPKKR...
1,UniRef50_P49913,MKTQRDGHSLGRWSLVLLLLGLVMPLAIIAQVLSYKEAVLRAIDGI...
2,UniRef50_P03045,MDAQTRRRERRAEKQAQWKAANPLLVGVSAKPVNRPILSLNRKPKS...
3,UniRef50_P99999,MGDVEKGKKIFVQKCAQCHTVEKGGKHKTGPNLHGLFGRKTGQAPG...
4,UniRef50_P27695,MPKRGKKGAVAEDGDELRTEPEAKKSKTAAKKNDKEAAGEGPALYE...
...,...,...
2361,UniRef50_P05777,MSLLTEVETYVLSIIPSGPLKAEIAQRLEDVFAGKNTDLEVLMEWL...
2362,UniRef50_P03431,MDVNPTLLFLKVPAQNAISTTFPYTGDPPYSHGTGTGYTMDTVNRT...
2363,UniRef50_P03466,MATKGTKRSYEQMETDGERQNATEIRASVGKMIDGIGRFYIQMCTE...
2364,UniRef50_P03495,MDPNTVSSFQVDCFLWHVRKRVADQELGDAPFLDRLRRDQKSLRGR...


In [69]:
import re
def clean_prot(line):
    """ cleaning AAs by removing invalid AA's
    """
        
    # remove here 
    line = re.sub("U", "", line)
    line = re.sub("X", "", line)
    line = re.sub("J", "", line)
    line = re.sub("U", "", line)
    
    # replace here
    line = re.sub("Z", "Q", line)
    line = re.sub("B", "N", line)
    
    return line

X = "MDSVYVDIDADSAFLKALQRAYPMFEVEPRQVTSNDHANARAFSHLAIKLIEQEIDPDSTILDIGSAPARRMMSDRKYHCVCPMRSAEDPERLANYARKLASAAGKVLDRNISGKIGDLQAVMAVPDTETPTFCLHTDVSCRQRADVAIYQDVYAVHAPTSLYHQAIKGVRVAYWVGFDTTPFMYNAMAGAYPSYSTNWADEQVLKAKNIGLCSTDLTEGRRGKLSIMRGKKLKPCDRVLFSVGSTLYPESRTLLKSWHLPSVFHLKGKLSFTCRCDTVVSCEGYVVKRITMSPGLYGKTIGYAVTHHADGFLMCKTTDTVDGERVSFSVCTYVPATICDQMTGILATEVTPEDAQKLLVGLNQRIVVNGRTQRNTNTMKNYLLPVVAQAFSKWAKECRKDMEDEKLLGVRERTLTCCCLWAFKKQKTHTVYKRPDTQSIQKVQAEFDSFVVPGLWSSGLSIPLRTRIKWLLRKVPKTDLIPYSGNAQEAQDAEKEAEEEREAELTHEALPPLQAAQEDVQVEIDVEQLEDRAGAGIIETPRGAIKVTAQLTDHVVGEYLVLSPQTVLRSQKLSLIHALAEQVKTCTHSGRAGRYAVEAYDGRVLVPSGYAISPEDFQSLSESATMVYNEREFVNRKLHHIAMHGPALNTDEESYELVRAERTEHEYVYDVDQRRCCKKEEAAGLVLVGDLTNPPYHEFAYEGLKIRPACPYKIAVIGVFGVPGSGKSAIIKNLVTRQDLVTSGKKENCQEISTDVMRQRGLEISARTVDSLLLNGCNRPVDVLYVDEAFACHSGTLLALIALVRPRQKVVLCGDPKQCGFFNMMQMKVNYNHNICTQVYHKSISRRCTLPVTAIVSSLHYEGKMRTTNEYNMPIVVDTTGSTKPDPGDLVLTCFRGWVKQLQIDYRGHEVMTAAASQGLTRKGVYAVRQKVNENPLYASTSEHVNVLLTRTEGKLVWKTLSGDPWIKTLQNPPKGNFKATIKEWEVEHASIMAGICSHQVTFDTFQNKANVCWAKSLVPILETAGIKLNDRQWSQIIQAFKEDKAYSPEVALNEICTRMYGVDLDSGLFSKPLVSVYYADNHWDNRPGGKMFGFNPEAASILERKYPFTKGKWNINKQICVTTRRIEDFNPTTNIIPVNRRLPHSLVAEHRPVKGERMEWLVNKINGHHVLLVSGYNLALPTKRVTWVAPLGVRGADYTYNLELGLPATLGRYDLVVINIHTPFRIHHYQQCVDHAMKLQMLGGDSLRLLKPGGSLLIRAYGYADRTSERVICVLGRKFRSSRALKPPCVTSNTEMFFLFSNFDNGRRNFTTHVMNNQLNAAFVGQATRAGCAPSYRVKRMDIAKNDEECVVNAANPRGLPGDGVCKAVYKKWPESFKNSATPVGTAKTVMCGTYPVIHAVGPNFSNYSESEGDRELAAAYREVAKEVTRLGVNSVAIPLLSTGVYSGGKDRLTQSLNHLFTAMDSTDADVVIYCRDKEWEKKISEAIQMRTQVELLDEHISIDCDVIRVHPDSSLAGRKGYSTTEGALYSYLEGTRFHQTAVDMAEIYTMWPKQTEANEQVCLYALGESIESIRQKCPVDDADASSPPKTVPCLCRYAMTPERVTRLRMNHVTNIIVCSSFPLPKYKIEGVQKVKCSKVMLFDHNVPSRVSPREYRSSQESVQEVSTTTSLTHSQFDLSADGETLPVPSDLDADAPALEPALDDGAVHTLPTIIGNLAAVSDWVMSTVPVAPPRRRRGRNLTVTCDEREGNITPMASVRFFRAELCPAVQETAETRDTAISLQAPPSTTMELSHPPISFGAPSETFPITFGDFDEGEIESLSSELLTFGDFLPGEVDDLTDSDWSTCPDTDDELXLDRAGGYIFSSDTGPGHLQQKSVRQSVLPVNTLEEVHEEKCYPPKLDELKEQLLLKKLQESASMANRSRYQSRKVENMKATIIQRLKRGCKLYLMAETPKVPTYRTIYPAPVYSPPINVRLSNPESAVAACNEFLARNYPTVSSYQITDEYDAYLDMVDGSESCLDRATFNPSKLRSYPKQHAYHAPSIRSAVPSPFQNTLQNVLAAATKRNCNVTQMRELPTLDSAVFNVECFKKFACNREYWEEFAASPIRITTENLTTYVTKLKGPKAAALFAKTHNLLPLQDVPMDRFTVDMKRDVKVTPGTKHTEERPKVQVIQAAEPLATAYLCGIHRELVRRLNAVLLPNVHTLFDMSAEDFDAIIAAHFKPGDAVLETDIASFDKSQDDSLALTALMLLEDLGVDHSLLDLIEAAFGEISSCHLPTGTRFKFGAMMKSGMFLTLFVNTLLNITIASRVLEDRLTKSACAAFIGDDNIIHGVVSDELMAARCATWMNMEVKIIDAVVSQKAPYFCGGFILHDTVTGTACRVADPLKRLFKLGKPLAAGDEQDEDRRRALADEVIRWQRTGLIDELEKAVYSRYEVQGISVAVMSMATFASSRSNFEKLRGPVITLYGGPK"
X_X = "JJJMDSVYVXXXDIDADSAZZQQFLKAXLQRAYPM"
X_clean = clean_prot(X)
X_Xclean = clean_prot(X_X)

In [70]:
# replacing all instances of X in AA sequences
seq_with_X = prot_seq['sequence'].apply(clean_prot)
seq_with_X

0       MASREEEQRETTPERGRGAARRPPTMEDVSSPSPSPPPPRAPPKKR...
1       MKTQRDGHSLGRWSLVLLLLGLVMPLAIIAQVLSYKEAVLRAIDGI...
2       MDAQTRRRERRAEKQAQWKAANPLLVGVSAKPVNRPILSLNRKPKS...
3       MGDVEKGKKIFVQKCAQCHTVEKGGKHKTGPNLHGLFGRKTGQAPG...
4       MPKRGKKGAVAEDGDELRTEPEAKKSKTAAKKNDKEAAGEGPALYE...
                              ...                        
2361    MSLLTEVETYVLSIIPSGPLKAEIAQRLEDVFAGKNTDLEVLMEWL...
2362    MDVNPTLLFLKVPAQNAISTTFPYTGDPPYSHGTGTGYTMDTVNRT...
2363    MATKGTKRSYEQMETDGERQNATEIRASVGKMIDGIGRFYIQMCTE...
2364    MDPNTVSSFQVDCFLWHVRKRVADQELGDAPFLDRLRRDQKSLRGR...
2365    MNPNQKIITIGSVCMTIGMANLILQIGNIISIWISHSIQLGNQNQI...
Name: sequence, Length: 2366, dtype: object

In [71]:
""" testing if the replace_X function works in clearing out unambiguous AAs (X)
"""
X = X_Xclean
no_X = "MASVHESLYFNPMMTNGVVHANVFGIKDWVT"

analyze_protX = ProteinAnalysis(X)
analyze_prot_no = ProteinAnalysis(no_X)

count1 = analyze_protX.count_amino_acids()['A']
count2= analyze_prot_no.count_amino_acids()['A']

print(f"analyze_protX {count1} | analyze_prot_no {count2}")


count1 = analyze_protX.molecular_weight()
count2= analyze_prot_no.molecular_weight()

print(f"analyze_protX {count1} | analyze_prot_no {count2}")
  
# gravy(self)


analyze_protX 4 | analyze_prot_no 2
analyze_protX 3232.5972999999994 | analyze_prot_no 3496.0002999999997


In [72]:
#adding the clean_prot to our dataframe
prot_seq["cleaned sequence"] = seq_with_X 
prot_seq

Unnamed: 0,uniref50,sequence,cleaned sequence
0,UniRef50_P03265,MASREEEQRETTPERGRGAARRPPTMEDVSSPSPSPPPPRAPPKKR...,MASREEEQRETTPERGRGAARRPPTMEDVSSPSPSPPPPRAPPKKR...
1,UniRef50_P49913,MKTQRDGHSLGRWSLVLLLLGLVMPLAIIAQVLSYKEAVLRAIDGI...,MKTQRDGHSLGRWSLVLLLLGLVMPLAIIAQVLSYKEAVLRAIDGI...
2,UniRef50_P03045,MDAQTRRRERRAEKQAQWKAANPLLVGVSAKPVNRPILSLNRKPKS...,MDAQTRRRERRAEKQAQWKAANPLLVGVSAKPVNRPILSLNRKPKS...
3,UniRef50_P99999,MGDVEKGKKIFVQKCAQCHTVEKGGKHKTGPNLHGLFGRKTGQAPG...,MGDVEKGKKIFVQKCAQCHTVEKGGKHKTGPNLHGLFGRKTGQAPG...
4,UniRef50_P27695,MPKRGKKGAVAEDGDELRTEPEAKKSKTAAKKNDKEAAGEGPALYE...,MPKRGKKGAVAEDGDELRTEPEAKKSKTAAKKNDKEAAGEGPALYE...
...,...,...,...
2361,UniRef50_P05777,MSLLTEVETYVLSIIPSGPLKAEIAQRLEDVFAGKNTDLEVLMEWL...,MSLLTEVETYVLSIIPSGPLKAEIAQRLEDVFAGKNTDLEVLMEWL...
2362,UniRef50_P03431,MDVNPTLLFLKVPAQNAISTTFPYTGDPPYSHGTGTGYTMDTVNRT...,MDVNPTLLFLKVPAQNAISTTFPYTGDPPYSHGTGTGYTMDTVNRT...
2363,UniRef50_P03466,MATKGTKRSYEQMETDGERQNATEIRASVGKMIDGIGRFYIQMCTE...,MATKGTKRSYEQMETDGERQNATEIRASVGKMIDGIGRFYIQMCTE...
2364,UniRef50_P03495,MDPNTVSSFQVDCFLWHVRKRVADQELGDAPFLDRLRRDQKSLRGR...,MDPNTVSSFQVDCFLWHVRKRVADQELGDAPFLDRLRRDQKSLRGR...


#### Analyzing the AA Sequences & Adding additional Characteristics
 <a ref = https://biopython.org/wiki/ProtParam read more about functions used here \>
+ molecular weight | prot.molecular_weight()
+ hydrophobicity | prot.gravy()

In [73]:
# applying the Protein Analysis function of BioPython to convert
# amino acids to be evaluated though built in functions (molecular weight)
analyze_prot = prot_seq["cleaned sequence"].apply(ProteinAnalysis)

In [74]:
# GOOD LUCK :D
analyze_prot[0].count_amino_acids()['A']

47

In [76]:
# testing to see if molecular weight is working on a smaller dataset
list_prot = list(analyze_prot[:10])
molecular_w = [x.molecular_weight() for x in list_prot]
merge_list = tuple(zip(list(prot_seq["cleaned sequence"][:10]), molecular_w))
merge_list

(('MASREEEQRETTPERGRGAARRPPTMEDVSSPSPSPPPPRAPPKKRMRRRIESEDEEDSSQDALVPRTPSPRPSTSAADLAIAPKKKKKRPSPKPERPPSPEVIVDSEEEREDVALQMVGFSNPPVLIKHGKGGKRTVRRLNEDDPVARGMRTQEEEEEPSEAESEITVMNPLSVPIVSAWEKGMEAARALMDKYHVDNDLKANFKLLPDQVEALAAVCKTWLNEEHRGLQLTFTSKKTFVTMMGRFLQAYLQSFAEVTYKHHEPTGCALWLHRCAEIEGELKCLHGSIMINKEHVIEMDVTSENGQRALKEQSSKAKIVKNRWGRNVVQISNTDARCCVHDAACPANQFSGKSCGMFFSEGAKAQVAFKQIKAFMQALYPNAQTGHGHLLMPLRCECNSKPGHAPFLGRQLPKLTPFALSNAEDLDADLISDKSVLASVHHPALIVFQCCNPVYRNSRAQGGGPNCDFKISAPDLLNALVMVRSLWSENFTELPRMVVPEFKWSTKHQYRNVSLPVAHSDARQNPFDF',
  59138.73500000009),
 ('MKTQRDGHSLGRWSLVLLLLGLVMPLAIIAQVLSYKEAVLRAIDGINQRSSDANLYRLLDLDPRPTMDGDPDTPKPVSFTVKETVCPRTTQQSPEDCDFKKDGLVKRCMGTVTLNQARGSFDISCDKDNKRFALLGDFFRKSKEKIGKEFKRIVQRIKDFLRNLVPRTES',
  19301.1649),
 ('MDAQTRRRERRAEKQAQWKAANPLLVGVSAKPVNRPILSLNRKPKSRVESALNPIDLTVLAEYHKQIESNLQRIERKNQRTWYSKPGERGITCSGRQKIKGKSIPLI',
  12298.116199999986),
 ('MGDVEKGKKIFVQKCAQCHTVEKGGKHKTGPNLHGLFGRKTGQAPGFTYTDANKNKGITWKEETLMEYLENPKKYIPGTKMIFAGIKKKTEREDLIAYLKKATNE',
  118

In [78]:
list_prot2 = list(analyze_prot)
prot_seq["molecular weight"] = [x.molecular_weight() for x in list_prot2]
prot_seq["hydrophobicity (GRAVY)" ] = [x.gravy() for x in list_prot2]
prot_seq["instability index"] = [x.instability_index() for x in list_prot2]
prot_seq["aromaticity"] = [x.aromaticity() for x in list_prot2]
prot_seq


Unnamed: 0,uniref50,sequence,cleaned sequence,molecular weight,hydrophobicity (GRAVY),instability index,aromaticity
0,UniRef50_P03265,MASREEEQRETTPERGRGAARRPPTMEDVSSPSPSPPPPRAPPKKR...,MASREEEQRETTPERGRGAARRPPTMEDVSSPSPSPPPPRAPPKKR...,59138.7350,-0.611153,58.945936,0.058601
1,UniRef50_P49913,MKTQRDGHSLGRWSLVLLLLGLVMPLAIIAQVLSYKEAVLRAIDGI...,MKTQRDGHSLGRWSLVLLLLGLVMPLAIIAQVLSYKEAVLRAIDGI...,19301.1649,-0.441765,37.586471,0.064706
2,UniRef50_P03045,MDAQTRRRERRAEKQAQWKAANPLLVGVSAKPVNRPILSLNRKPKS...,MDAQTRRRERRAEKQAQWKAANPLLVGVSAKPVNRPILSLNRKPKS...,12298.1162,-0.919626,46.321495,0.037383
3,UniRef50_P99999,MGDVEKGKKIFVQKCAQCHTVEKGGKHKTGPNLHGLFGRKTGQAPG...,MGDVEKGKKIFVQKCAQCHTVEKGGKHKTGPNLHGLFGRKTGQAPG...,11832.6058,-0.875238,15.507619,0.085714
4,UniRef50_P27695,MPKRGKKGAVAEDGDELRTEPEAKKSKTAAKKNDKEAAGEGPALYE...,MPKRGKKGAVAEDGDELRTEPEAKKSKTAAKKNDKEAAGEGPALYE...,35554.0730,-0.583019,43.411950,0.088050
...,...,...,...,...,...,...,...
2361,UniRef50_P05777,MSLLTEVETYVLSIIPSGPLKAEIAQRLEDVFAGKNTDLEVLMEWL...,MSLLTEVETYVLSIIPSGPLKAEIAQRLEDVFAGKNTDLEVLMEWL...,27892.9815,-0.214683,37.745635,0.051587
2362,UniRef50_P03431,MDVNPTLLFLKVPAQNAISTTFPYTGDPPYSHGTGTGYTMDTVNRT...,MDVNPTLLFLKVPAQNAISTTFPYTGDPPYSHGTGTGYTMDTVNRT...,86575.4670,-0.514399,43.107820,0.088507
2363,UniRef50_P03466,MATKGTKRSYEQMETDGERQNATEIRASVGKMIDGIGRFYIQMCTE...,MATKGTKRSYEQMETDGERQNATEIRASVGKMIDGIGRFYIQMCTE...,56376.2004,-0.603414,44.810462,0.078313
2364,UniRef50_P03495,MDPNTVSSFQVDCFLWHVRKRVADQELGDAPFLDRLRRDQKSLRGR...,MDPNTVSSFQVDCFLWHVRKRVADQELGDAPFLDRLRRDQKSLRGR...,25867.4491,-0.337826,52.071739,0.052174
