In [1]:
import pandas as pd
from Bio.SeqUtils import ProtParam
from Bio.Seq import Seq
import numpy as np

In [2]:
# Carregar arquivo .tsv exportado do UniProt
caminho_arquivo = "uniprotkb.tsv"  # Altere para o nome do seu arquivo
df = pd.read_csv(caminho_arquivo, sep="\t")

In [3]:
# Função para calcular propriedades da sequência
def calcular_propriedades(seq_str):
    try:
        seq = Seq(seq_str)
        analyser = ProtParam.ProteinAnalysis(str(seq))

        pi = analyser.isoelectric_point()
        gravy = analyser.gravy()
        charge = analyser.charge_at_pH(7.0)
        aa_count = analyser.count_amino_acids()

        total = sum(aa_count.values())
        polar = sum([aa_count.get(aa, 0) for aa in ['Q', 'N', 'H', 'S', 'T', 'Y', 'C', 'W']])
        apolar = sum([aa_count.get(aa, 0) for aa in ['A', 'V', 'L', 'I', 'P', 'F', 'M', 'G']])

        proporcao_polar = polar / total if total > 0 else 0
        proporcao_apolar = apolar / total if total > 0 else 0

        return pd.Series([pi, gravy, charge, proporcao_polar, proporcao_apolar])

    except Exception as e:
        return pd.Series([np.nan]*5)

In [4]:
# Aplicar função
df[['Ponto_Isoeletrico', 'Hidrofobicidade', 'Carga_Total', 
    'Proporcao_Polar', 'Proporcao_Apolar']] = df['Sequence'].apply(calcular_propriedades)

In [5]:
print(df.head())

        Entry   Entry Name                                           Organism  \
0  A0A009IHW8  ABTIR_ACIB9           Acinetobacter baumannii (strain 1295743)   
1  A0A023I7E1   ENG1_RHIMI                                  Rhizomucor miehei   
2  A0A024B7W1   POLG_ZIKVF  Zika virus (isolate ZIKV/Human/French Polynesi...   
3  A0A024RXP8   GUX1_HYPJR  Hypocrea jecorina (strain ATCC 56765 / BCRC 32...   
4  A0A024SC78  CUTI1_HYPJR  Hypocrea jecorina (strain ATCC 56765 / BCRC 32...   

   Organism (ID)                                      Protein names  \
0        1310613  2' cyclic ADP-D-ribose synthase AbTIR (2'cADPR...   
1           4839  Glucan endo-1,3-beta-D-glucosidase 1 (Endo-1,3...   
2        2043570  Genome polyprotein [Cleaved into: Capsid prote...   
3        1344414  Exoglucanase 1 (EC 3.2.1.91) (1,4-beta-cellobi...   
4        1344414                             Cutinase (EC 3.1.1.74)   

                                    Protein families  \
0                             

In [6]:
print(df.describe)

<bound method NDFrame.describe of              Entry   Entry Name  \
0       A0A009IHW8  ABTIR_ACIB9   
1       A0A023I7E1   ENG1_RHIMI   
2       A0A024B7W1   POLG_ZIKVF   
3       A0A024RXP8   GUX1_HYPJR   
4       A0A024SC78  CUTI1_HYPJR   
...            ...          ...   
472561      Q9ZVQ8  PP2B8_ARATH   
472562      Q9ZVQ9  PP2B7_ARATH   
472563      Q9ZVR0  PP2B6_ARATH   
472564      Q9ZVR1  PP2B5_ARATH   
472565      Q9ZVR3  PP2B4_ARATH   

                                                 Organism  Organism (ID)  \
0                Acinetobacter baumannii (strain 1295743)        1310613   
1                                       Rhizomucor miehei           4839   
2       Zika virus (isolate ZIKV/Human/French Polynesi...        2043570   
3       Hypocrea jecorina (strain ATCC 56765 / BCRC 32...        1344414   
4       Hypocrea jecorina (strain ATCC 56765 / BCRC 32...        1344414   
...                                                   ...            ...   
472561       

In [7]:
print(df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 472566 entries, 0 to 472565
Data columns (total 16 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   Entry              472566 non-null  object 
 1   Entry Name         472566 non-null  object 
 2   Organism           472566 non-null  object 
 3   Organism (ID)      472566 non-null  int64  
 4   Protein names      472566 non-null  object 
 5   Protein families   442737 non-null  object 
 6   Sequence           472566 non-null  object 
 7   Gene Names         461521 non-null  object 
 8   Gene Ontology IDs  472566 non-null  object 
 9   Length             472566 non-null  int64  
 10  Mass               472566 non-null  int64  
 11  Ponto_Isoeletrico  470582 non-null  float64
 12  Hidrofobicidade    470582 non-null  float64
 13  Carga_Total        470582 non-null  float64
 14  Proporcao_Polar    470582 non-null  float64
 15  Proporcao_Apolar   470582 non-null  float64
dtypes:

In [8]:
print(df.columns)

Index(['Entry', 'Entry Name', 'Organism', 'Organism (ID)', 'Protein names',
       'Protein families', 'Sequence', 'Gene Names', 'Gene Ontology IDs',
       'Length', 'Mass', 'Ponto_Isoeletrico', 'Hidrofobicidade', 'Carga_Total',
       'Proporcao_Polar', 'Proporcao_Apolar'],
      dtype='object')


In [9]:
df_selecionado = df[['Sequence', 'Mass', 'Ponto_Isoeletrico', 'Hidrofobicidade', 'Carga_Total', 'Proporcao_Polar', 'Proporcao_Apolar', 'Length', 'Gene Ontology IDs']]

print(df_selecionado.head())

                                            Sequence    Mass  \
0  MSLEQKKGADIISKILQIQNSIGKTTSPSTLKTKLSEISRKEQENA...   30922   
1  MRFQVIVAAATITMITSYIPGVASQSTSDGDDLFVPVSNFDPKSIF...   89495   
2  MKNPKKKSGGFRIVNMLKRGVARVSPFGGLKRLPAGLLLGHGPIRM...  379113   
3  MYRKLAVISAFLATARAQSACTLQSETHPPLTWQKCSSGGTCTQQT...   54111   
4  MRSLAILTTLLAGHAFAYPKPAPQSVNRRDWPSINEFLSELAKVMP...   25924   

   Ponto_Isoeletrico  Hidrofobicidade  Carga_Total  Proporcao_Polar  \
0           6.990139        -0.667286    -0.017768         0.308550   
1           5.426839        -0.443090   -17.929917         0.336683   
2           8.665405        -0.147327    39.096809         0.273736   
3           4.601423        -0.433658   -18.260172         0.455253   
4           4.543448         0.225806   -11.187354         0.290323   

   Proporcao_Apolar  Length                                  Gene Ontology IDs  
0          0.371747     269  GO:0003953; GO:0007165; GO:0019677; GO:0050135...  
1          0.453518     79

In [10]:
#apagar colunas que tenham proporçao polar ou apolar null
# df_selecionado

df_selecionado = df_selecionado.dropna(subset=['Proporcao_Polar', 'Proporcao_Apolar'])

df_selecionado

Unnamed: 0,Sequence,Mass,Ponto_Isoeletrico,Hidrofobicidade,Carga_Total,Proporcao_Polar,Proporcao_Apolar,Length,Gene Ontology IDs
0,MSLEQKKGADIISKILQIQNSIGKTTSPSTLKTKLSEISRKEQENA...,30922,6.990139,-0.667286,-0.017768,0.308550,0.371747,269,GO:0003953; GO:0007165; GO:0019677; GO:0050135...
1,MRFQVIVAAATITMITSYIPGVASQSTSDGDDLFVPVSNFDPKSIF...,89495,5.426839,-0.443090,-17.929917,0.336683,0.453518,796,GO:0000272; GO:0005576; GO:0042973; GO:0052861...
2,MKNPKKKSGGFRIVNMLKRGVARVSPFGGLKRLPAGLLLGHGPIRM...,379113,8.665405,-0.147327,39.096809,0.273736,0.498685,3423,GO:0003724; GO:0003725; GO:0003968; GO:0004252...
3,MYRKLAVISAFLATARAQSACTLQSETHPPLTWQKCSSGGTCTQQT...,54111,4.601423,-0.433658,-18.260172,0.455253,0.412451,514,GO:0005576; GO:0016162; GO:0030245; GO:0030248
4,MRSLAILTTLLAGHAFAYPKPAPQSVNRRDWPSINEFLSELAKVMP...,25924,4.543448,0.225806,-11.187354,0.290323,0.560484,248,GO:0005576; GO:0016052; GO:0050525
...,...,...,...,...,...,...,...,...,...
472561,MTKTRCMHEHFRKIVQRVKKTLRLSASDKSHGVAELDDLPEECVSI...,35239,8.758046,-0.297049,5.821549,0.255738,0.442623,305,GO:0030246
472562,MTKTRCMHVHFRKILQRVKKTLRLSASDQQSQGVTEPLSLGDLPEE...,35685,8.327139,-0.291857,3.088849,0.319218,0.423453,307,GO:0030246
472563,MGQKLGVDSRQKIRQVLGSSSKVQKHDVESIGGGGGEIVPGHSPFD...,34480,5.425305,-0.351792,-7.098072,0.286645,0.449511,307,GO:0030246
472564,MGQKHGVDTRGKGAEFCGCWEILTEFINGSSASFDDLPDDCLAIIS...,32015,8.843918,-0.235563,5.000495,0.285211,0.478873,284,GO:0030246


In [11]:
# realizar a limpega do gene ontology ids
# só iremos utilizar 0005198 (Estrutural)  0005215 (Transporte) 0090722 (Receptora) 0003824 (Enzima)
# Só iremos utilizar:

#   0005198 (Estrutural)
##Lista filhos
# GO:0008147
# GO:0008316
# GO:0140094
# GO:0140073
# GO:0160123
# GO:0097493
# GO:0043886
# GO:0097099
# GO:0039660
# GO:0005201
# GO:0005200
# GO:0019911
# GO:0003735
# GO:0030527
# GO:0030281
# GO:0030280
# GO:0008307
# GO:0005199
# GO:0017056
# GO:0016490
# GO:0042302
# GO:0005213
# GO:0098918
# GO:0005212
# GO:0140756


#   0005215 (Transporte)
##Lista filhos
# GO:0032409
# GO:0160187
# GO:0141110
# GO:0022857
# GO:0005319
# GO:0032410
# GO:0140318
# GO:1990351
# GO:0032411
# GO:0141108
# GO:0141109


#   0005488 (Receptora)
## Lista filhos
# GO:1990300 
# GO:0003682
# GO:0008289
# GO:1901265
# GO:0035274
# GO:0030246
# GO:0035275
# GO:1902314
# GO:0097367
# GO:0019215
# GO:0015643
# GO:0033218
# GO:0046906
# GO:1901681
# GO:0005527
# GO:0072341
# GO:0043546
# GO:0046812
# GO:1901338
# GO:0031409
# GO:0003676
# GO:0043515
# GO:1905594
# GO:0034617
# GO:0051100
# GO:0005549
# GO:0051099
# GO:0003823
# GO:0097160
# GO:0050840
# GO:0051098
# GO:0042277
# GO:0051192
# GO:0005515
# GO:1904483
# GO:0050809
# GO:0042562
# GO:0044877
# GO:0043176
# GO:0097243
# GO:0016597
# GO:0036094

#   0003824 (Enzima)
## Lista filhos
# GO:0016787
# GO:0008047
# GO:0016740
# GO:0140640
# GO:0016491
# GO:0009975
# GO:0016218
# GO:0016829
# GO:0140096
# GO:0032451
# GO:0016853
# GO:0043085
# GO:0043086
# GO:0030234
# GO:0004857
# GO:0061783
# GO:0016874
# GO:1904091
# GO:1902494
# GO:0050790
# GO:0044867
# GO:0160215

# Lista dos GO IDs das classes desejadas (sem o prefixo "GO:")
# go_ids_desejados = ['0005198', '0005215', '0005488', '0003824']

# Dicionário de mapeamento GO IDs (pai + filhos) para os rótulos
go_mapeamento = {
    '0005198': 'Estrutural',
    '0008147': 'Estrutural', '0008316': 'Estrutural', '0140094': 'Estrutural', '0140073': 'Estrutural',
    '0160123': 'Estrutural', '0097493': 'Estrutural', '0043886': 'Estrutural', '0097099': 'Estrutural',
    '0039660': 'Estrutural', '0005201': 'Estrutural', '0005200': 'Estrutural', '0019911': 'Estrutural',
    '0003735': 'Estrutural', '0030527': 'Estrutural', '0030281': 'Estrutural', '0030280': 'Estrutural',
    '0008307': 'Estrutural', '0005199': 'Estrutural', '0017056': 'Estrutural', '0016490': 'Estrutural',
    '0042302': 'Estrutural', '0005213': 'Estrutural', '0098918': 'Estrutural', '0005212': 'Estrutural', '0140756': 'Estrutural',

    '0005215': 'Transporte',
    '0032409': 'Transporte', '0160187': 'Transporte', '0141110': 'Transporte', '0022857': 'Transporte',
    '0005319': 'Transporte', '0032410': 'Transporte', '0140318': 'Transporte', '1990351': 'Transporte',
    '0032411': 'Transporte', '0141108': 'Transporte', '0141109': 'Transporte',

    '0005488': 'Receptora',
    '1990300': 'Receptora', '0003682': 'Receptora', '0008289': 'Receptora', '1901265': 'Receptora',
    '0035274': 'Receptora', '0030246': 'Receptora', '0035275': 'Receptora', '1902314': 'Receptora',
    '0097367': 'Receptora', '0019215': 'Receptora', '0015643': 'Receptora', '0033218': 'Receptora',
    '0046906': 'Receptora', '1901681': 'Receptora', '0005527': 'Receptora', '0072341': 'Receptora',
    '0043546': 'Receptora', '0046812': 'Receptora', '1901338': 'Receptora', '0031409': 'Receptora',
    '0003676': 'Receptora', '0043515': 'Receptora', '1905594': 'Receptora', '0034617': 'Receptora',
    '0051100': 'Receptora', '0005549': 'Receptora', '0051099': 'Receptora', '0003823': 'Receptora',
    '0097160': 'Receptora', '0050840': 'Receptora', '0051098': 'Receptora', '0042277': 'Receptora',
    '0051192': 'Receptora', '0005515': 'Receptora', '1904483': 'Receptora', '0050809': 'Receptora',
    '0042562': 'Receptora', '0044877': 'Receptora', '0043176': 'Receptora', '0097243': 'Receptora',
    '0016597': 'Receptora', '0036094': 'Receptora',

    '0003824': 'Enzima',
    '0016787': 'Enzima', '0008047': 'Enzima', '0016740': 'Enzima', '0140640': 'Enzima',
    '0016491': 'Enzima', '0009975': 'Enzima', '0016218': 'Enzima', '0016829': 'Enzima',
    '0140096': 'Enzima', '0032451': 'Enzima', '0016853': 'Enzima', '0043085': 'Enzima',
    '0043086': 'Enzima', '0030234': 'Enzima', '0004857': 'Enzima', '0061783': 'Enzima',
    '0016874': 'Enzima', '1904091': 'Enzima', '1902494': 'Enzima', '0050790': 'Enzima',
    '0044867': 'Enzima', '0160215': 'Enzima',
}

# Aplica a limpeza e mapeamento
df_selecionado['Rotulos'] = df_selecionado['Gene Ontology IDs'].apply(
    lambda x: ';'.join(sorted(set(
        go_mapeamento.get(go.replace('GO:', '').strip())
        for go in str(x).split(';')
        if go_mapeamento.get(go.replace('GO:', '').strip()) is not None
    )))
)

# Remove linhas sem nenhum rótulo identificado
df_selecionado = df_selecionado[df_selecionado['Rotulos'].str.len() > 0]

# Visualizar as classes encontradas
print(df_selecionado['Rotulos'].value_counts())


Rotulos
Estrutural                         54715
Receptora                          15818
Enzima                             13758
Transporte                          2651
Enzima;Estrutural                   1153
Enzima;Receptora                     732
Estrutural;Receptora                 289
Receptora;Transporte                  49
Estrutural;Transporte                 21
Enzima;Transporte                     12
Enzima;Receptora;Transporte            5
Enzima;Estrutural;Receptora            4
Estrutural;Receptora;Transporte        2
Name: count, dtype: int64


In [12]:
df_selecionado

Unnamed: 0,Sequence,Mass,Ponto_Isoeletrico,Hidrofobicidade,Carga_Total,Proporcao_Polar,Proporcao_Apolar,Length,Gene Ontology IDs,Rotulos
2,MKNPKKKSGGFRIVNMLKRGVARVSPFGGLKRLPAGLLLGHGPIRM...,379113,8.665405,-0.147327,39.096809,0.273736,0.498685,3423,GO:0003724; GO:0003725; GO:0003968; GO:0004252...,Estrutural;Receptora
6,MMKMKQQGLVADLLPNIRVMKTFGHFVFNYYNDNSSKYLHKVYCCV...,54109,7.961926,0.301046,2.557886,0.315900,0.529289,478,GO:0004984; GO:0005549; GO:0005886; GO:0007165...,Receptora
10,MKFLAFGLIYFHFCILNRCEYITSSTIQKCYNSSNEPNNCSQKAVI...,82663,8.429322,-0.246361,9.643771,0.446092,0.394879,742,GO:0005886; GO:0005911; GO:0007338; GO:0008289...,Receptora
20,MALIRLVAPERVFSDLASMVAYPNFQVQDKITLLGSAGGDFTFTTT...,69928,5.811867,-0.029032,-7.584984,0.328725,0.499232,651,GO:0016829; GO:0098015; GO:0098671; GO:0098994...,Enzima
21,MEENKKTVDGSVDFTEEQEALVVKSWNAMKNNSCDLSLKFFTKILE...,39609,6.211729,-0.494017,-4.417179,0.282051,0.441595,351,GO:0001666; GO:0005344; GO:0005634; GO:0005737...,Enzima
...,...,...,...,...,...,...,...,...,...,...
472561,MTKTRCMHEHFRKIVQRVKKTLRLSASDKSHGVAELDDLPEECVSI...,35239,8.758046,-0.297049,5.821549,0.255738,0.442623,305,GO:0030246,Receptora
472562,MTKTRCMHVHFRKILQRVKKTLRLSASDQQSQGVTEPLSLGDLPEE...,35685,8.327139,-0.291857,3.088849,0.319218,0.423453,307,GO:0030246,Receptora
472563,MGQKLGVDSRQKIRQVLGSSSKVQKHDVESIGGGGGEIVPGHSPFD...,34480,5.425305,-0.351792,-7.098072,0.286645,0.449511,307,GO:0030246,Receptora
472564,MGQKHGVDTRGKGAEFCGCWEILTEFINGSSASFDDLPDDCLAIIS...,32015,8.843918,-0.235563,5.000495,0.285211,0.478873,284,GO:0030246,Receptora
