In [2]:
#!pip install tensorflow keras
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import Embedding, Conv1D, MaxPooling1D, LSTM, Flatten, Dense, Input
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.metrics import Precision, Recall
from tensorflow.keras.callbacks import EarlyStopping
import h5py


In [1]:
data_dir = 'data/train/'

In [6]:
train_set = pd.read_csv(data_dir+'train_set.tsv', delimiter='\t')
print(train_set.shape)
train_set.head()

(4277047, 3)


Unnamed: 0,Protein_ID,aspect,GO_term
0,P91124,cellular_component,GO:0005575
1,P91124,cellular_component,GO:0110165
2,P91124,cellular_component,GO:0005737
3,P91124,cellular_component,GO:0005622
4,P91124,cellular_component,GO:0043226


In [9]:
train_set.groupby('Protein_ID').count()['GO_term'].sort_values(ascending=False)

Protein_ID
Q02248    494
P05067    449
P31749    443
Q62226    438
P01137    428
         ... 
P06436      2
P16011      2
P68930      2
P42487      2
O49139      2
Name: GO_term, Length: 123969, dtype: int64

In [13]:
train_set.groupby('GO_term').count()['Protein_ID'].sort_values(ascending=False)

GO_term
GO:0005575    84638
GO:0110165    83893
GO:0008150    83064
GO:0005622    67293
GO:0043226    58004
              ...  
GO:0015038       49
GO:0042169       49
GO:0016894       48
GO:0140463       48
GO:0005160       48
Name: Protein_ID, Length: 3004, dtype: int64

In [30]:
train_set.groupby('aspect').count()

Unnamed: 0_level_0,Protein_ID,GO_term
aspect,Unnamed: 1_level_1,Unnamed: 2_level_1
biological_process,2634883,2634883
cellular_component,1109632,1109632
molecular_function,532532,532532


In [73]:
train_set.head()

Unnamed: 0,Protein_ID,aspect,GO_term
0,P91124,cellular_component,GO:0005575
1,P91124,cellular_component,GO:0110165
2,P91124,cellular_component,GO:0005737
3,P91124,cellular_component,GO:0005622
4,P91124,cellular_component,GO:0043226


In [67]:
train_set.groupby('aspect').nunique()

Unnamed: 0_level_0,Protein_ID,GO_term
aspect,Unnamed: 1_level_1,Unnamed: 2_level_1
biological_process,83064,1487
cellular_component,84638,678
molecular_function,55698,839


In [15]:
len(train_set['GO_term'].unique())

3004

In [42]:
train_ids_df = pd.read_csv(data_dir+'train_ids.txt',header = None)
train_ids_df.columns = ['Protein_ID']
print(train_ids_df.shape)
train_ids_df.head()

(123969, 1)


Unnamed: 0,Protein_ID
0,P91124
1,Q55DL5
2,O81027
3,Q04418
4,Q8IXT2


In [5]:
# Load train_embeddings.h5 containing the prott5 embeddings

train_embeddings = []
protein_ids = []

with h5py.File(data_dir+'train_embeddings.h5', 'r') as f:
    for protein_id in f.keys():  # protein ids
        embeddings = f[protein_id][:]
        train_embeddings.append(embeddings)
        protein_ids.append(protein_id)

# Convert the list of embeddings to a numpy array
prott5_embeddings = np.array(train_embeddings)

# Create a DataFrame from the embeddings array
column_num = prott5_embeddings.shape[1]
train_df = pd.DataFrame(prott5_embeddings, columns=["Column_" + str(i) for i in range(1, column_num + 1)])

# Set protein_ids as the index of the DataFrame
#train_df.index = protein_ids

print(train_df.shape)
train_df.head()

(123969, 1024)


Unnamed: 0,Column_1,Column_2,Column_3,Column_4,Column_5,Column_6,Column_7,Column_8,Column_9,Column_10,...,Column_1015,Column_1016,Column_1017,Column_1018,Column_1019,Column_1020,Column_1021,Column_1022,Column_1023,Column_1024
0,0.068176,-0.046478,0.001752,-0.008583,0.003763,0.046265,-0.059662,-0.050385,-0.005173,0.008865,...,-0.040771,-0.013138,-0.049591,-0.101074,0.066406,0.00898,-0.003506,-0.024612,0.03476,-0.031006
1,-0.016434,-0.001583,0.003889,0.073425,0.012428,0.028168,-0.040375,-0.093811,-0.017807,0.025497,...,0.011879,-0.033325,-0.031342,-0.005245,0.014732,0.08197,0.017456,-0.032959,0.053192,0.029907
2,0.007904,0.087708,-0.001715,0.037659,0.017883,0.025589,-0.011749,-0.084717,-0.016266,-0.034973,...,0.004829,-0.049713,-0.027176,-0.037415,-0.006241,-0.039703,0.001784,0.004719,-0.004288,0.001847
3,0.002447,0.007053,0.064453,0.007271,-0.033569,-0.009933,-0.022186,-0.083862,-0.003841,-0.018631,...,-0.053589,-0.002508,-0.016647,-0.069458,0.042206,-0.051758,-0.025436,0.057373,0.099121,0.032898
4,0.049316,0.020691,0.108643,0.016342,-0.051056,-0.017334,-0.042084,-0.154053,0.007347,0.029907,...,-0.100647,-0.063293,0.002346,-0.104675,-0.000757,-0.047485,0.003002,-0.036774,0.103577,0.005245


In [62]:
prot_id_has_cc = train_set[train_set['aspect'] == 'cellular_component']['Protein_ID'].unique().tolist()
prot_id_has_cc

['P91124',
 'Q55DL5',
 'O81027',
 'Q04418',
 'Q7ZT12',
 'Q07627',
 'Q03370',
 'Q9M647',
 'B1AMW7',
 'A1Z840',
 'Q05595',
 'P9WLA1',
 'P30041',
 'Q9Y0Y5',
 'Q96DV4',
 'P0C342',
 'P54144',
 'Q8R2Z3',
 'Q9VEX1',
 'Q9UYD1',
 'Q13287',
 'A0A2R8YGX0',
 'Q9UUI3',
 'Q57ZS4',
 'Q06496',
 'Q387U0',
 'Q8VYE2',
 'P45879',
 'Q5RL73',
 'C9JQU7',
 'P40616',
 'Q6MZP7',
 'O08700',
 'Q9SUV1',
 'Q8N661',
 'F4J5N9',
 'Q99720',
 'Q86XP1',
 'Q9Z0I9',
 'P51427',
 'Q969Z0',
 'Q12117',
 'B7Z0K2',
 'C6KSN5',
 'P56971',
 'A0A7I2V594',
 'Q54JT7',
 'Q38BM5',
 'G5EE70',
 'Q8SXA8',
 'O62531',
 'Q02375',
 'Q8VYN9',
 'Q9H8V3',
 'P33891',
 'D2Y5Q2',
 'A2A2V1',
 'Q9VUF8',
 'P40031',
 'Q9BXY0',
 'Q9ESD1',
 'P78344',
 'Q9JM54',
 'P28331',
 'P39683',
 'P35577',
 'P76090',
 'A0A0A0MQW1',
 'Q5UCC4',
 'Q5W0B1',
 'Q3EC77',
 'Q6A332',
 'A0A6Q8PFQ9',
 'C9JRZ8',
 'Q15007',
 'Q5XJA3',
 'Q9SRB0',
 'Q21966',
 'Q96IL0',
 'P63027',
 'P55850',
 'Q583J9',
 'A0A0A0MS29',
 'E5RFP0',
 'Q9VF10',
 'O14359',
 'Q9LZB8',
 'A0A1D8PGT0',
 'C0LGG7

In [66]:
len(prot_id_has_cc)

84638

In [71]:
id_embed_df = pd.concat((train_ids_df, train_df), axis=1)
cc_train = id_embed_df[id_embed_df['Protein_ID'].isin(prot_id_has_cc)]

In [72]:
cc_train

Unnamed: 0,Protein_ID,Column_1,Column_2,Column_3,Column_4,Column_5,Column_6,Column_7,Column_8,Column_9,...,Column_1015,Column_1016,Column_1017,Column_1018,Column_1019,Column_1020,Column_1021,Column_1022,Column_1023,Column_1024
0,P91124,0.068176,-0.046478,0.001752,-0.008583,0.003763,0.046265,-0.059662,-0.050385,-0.005173,...,-0.040771,-0.013138,-0.049591,-0.101074,0.066406,0.008980,-0.003506,-0.024612,0.034760,-0.031006
1,Q55DL5,-0.016434,-0.001583,0.003889,0.073425,0.012428,0.028168,-0.040375,-0.093811,-0.017807,...,0.011879,-0.033325,-0.031342,-0.005245,0.014732,0.081970,0.017456,-0.032959,0.053192,0.029907
2,O81027,0.007904,0.087708,-0.001715,0.037659,0.017883,0.025589,-0.011749,-0.084717,-0.016266,...,0.004829,-0.049713,-0.027176,-0.037415,-0.006241,-0.039703,0.001784,0.004719,-0.004288,0.001847
3,Q04418,0.002447,0.007053,0.064453,0.007271,-0.033569,-0.009933,-0.022186,-0.083862,-0.003841,...,-0.053589,-0.002508,-0.016647,-0.069458,0.042206,-0.051758,-0.025436,0.057373,0.099121,0.032898
5,Q7ZT12,0.056488,0.019241,0.112122,0.019608,-0.055939,-0.016129,-0.045105,-0.152466,0.003454,...,-0.096985,-0.064880,0.009117,-0.106934,0.004780,-0.051544,0.001547,-0.038788,0.106018,0.013321
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
123964,Q8LE52,0.052277,-0.062469,0.046478,0.046082,-0.041992,-0.009956,0.027161,-0.061401,0.038422,...,-0.036865,-0.017426,0.019196,-0.034882,-0.044739,0.024338,-0.055084,0.013901,0.059326,0.014725
123965,Q9LY87,-0.011299,-0.036957,0.029297,0.031891,0.006027,0.079468,-0.014832,-0.047791,0.055023,...,-0.040680,-0.006996,0.018005,-0.024414,0.061401,0.041229,-0.021011,-0.014709,0.019791,0.052887
123966,P22082,0.040405,-0.013908,0.025421,0.075012,-0.050293,0.058685,-0.032135,-0.073975,0.061798,...,-0.066711,-0.011276,0.002800,-0.014954,0.015190,0.010483,-0.010162,0.027557,0.027039,0.017517
123967,P63001,-0.008362,-0.026291,0.037354,0.033264,-0.044861,-0.003189,-0.009598,-0.061432,0.027451,...,-0.037476,-0.020966,0.011360,0.027267,-0.006855,-0.005241,-0.057404,-0.012024,0.000682,0.045898


In [78]:
## TAKEN FROM OTHER NOTEBOOK
## COMPILE Y LABLES INTO DF
#num_of_labels = 1500 # CURRENTLY TAKING ALL
num_of_labels = 687 # NUMBER OF CC GO TERMS

# Take value counts in descending order and fetch first 1500 `GO term ID` as labels
labels = train_set['GO_term'].value_counts().index[:num_of_labels].tolist()
#labels = train_set['GO_term'].value_counts().tolist()
# note added filtering for ONLY cellular component
#train_set_updated = train_set.loc[train_set['GO_term'].isin(labels)]
#train_set_updated = train_set.loc[train_set['GO_term'].isin(labels) & (train_set['aspect'] == 'cellular_component')]
train_set_updated = train_set


# Create an empty dataframe of required size for storing the labels,
# i.e, train_size x num_of_labels (142246 x 1500)
train_size = np.array(protein_ids).shape[0] # len(X)
train_labels = np.zeros((train_size ,num_of_labels))

# Convert from numpy to pandas series for better handling
series_train_protein_ids = pd.Series(protein_ids)

# Loop through each label
for i in range(num_of_labels):
    # For each label, fetch the corresponding train_terms data
    n_train_terms = train_set_updated[train_set_updated['GO_term'] ==  labels[i]]
    
    # Fetch all the unique EntryId aka proteins related to the current label(GO term ID)
    label_related_proteins = n_train_terms['Protein_ID'].unique()
    
    # In the series_train_protein_ids pandas series, if a protein is related
    # to the current label, then mark it as 1, else 0.
    # Replace the ith column of train_Y with with that pandas series.
    train_labels[:,i] =  series_train_protein_ids.isin(label_related_proteins).astype(float)
    

# Convert train_Y numpy into pandas dataframe
labels_df = pd.DataFrame(data = train_labels, columns = labels)
print(labels_df.shape)

(123969, 687)


In [85]:
X = cc_train
y = labels_df.iloc[cc_train.index]
y

Unnamed: 0,GO:0005575,GO:0110165,GO:0008150,GO:0005622,GO:0043226,GO:0009987,GO:0003674,GO:0043229,GO:0043227,GO:0005737,...,GO:0099513,GO:0060090,GO:0006644,GO:0070848,GO:0019941,GO:0099177,GO:0050804,GO:1902532,GO:0005667,GO:0016798
0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,1.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
123964,1.0,1.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
123965,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
123966,1.0,1.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
123967,1.0,1.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [89]:
#  Split the data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42
)

In [99]:
X_train.iloc[:,1:]

Unnamed: 0,Column_1,Column_2,Column_3,Column_4,Column_5,Column_6,Column_7,Column_8,Column_9,Column_10,...,Column_1015,Column_1016,Column_1017,Column_1018,Column_1019,Column_1020,Column_1021,Column_1022,Column_1023,Column_1024
57275,0.067383,0.030548,0.011909,0.045471,-0.006641,0.045868,-0.069641,-0.041473,-0.015045,-0.093079,...,-0.063416,-0.002037,-0.056732,-0.028687,0.029861,-0.021347,-0.038757,0.034363,-0.034424,-0.005363
52450,0.031525,0.120117,0.030441,0.011177,-0.015251,0.007626,-0.009270,-0.074280,0.023544,-0.049194,...,-0.049438,-0.005131,-0.036255,-0.018921,0.042023,-0.016327,-0.034637,-0.038910,0.007427,0.013496
95625,0.051056,0.008263,0.076172,0.035522,0.027252,0.106445,-0.057098,-0.048828,-0.009148,-0.005997,...,-0.019196,0.032745,0.022354,-0.081177,-0.008156,-0.020996,-0.029602,0.045105,0.023331,0.057587
82884,0.005070,-0.087769,-0.000177,0.012009,-0.031586,0.063477,-0.005371,-0.080261,-0.009590,0.039459,...,0.026810,-0.013573,0.019730,-0.088745,0.008553,0.083008,0.035980,0.033356,0.042480,0.028503
6587,0.048859,0.030548,0.028473,-0.003517,0.001398,0.072449,-0.032684,-0.060638,0.038696,-0.057465,...,-0.086853,-0.044769,-0.004410,0.012207,0.023636,-0.014793,-0.029343,-0.018967,-0.007362,0.019119
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8779,0.018784,0.037048,0.030228,0.042175,0.011719,0.020966,-0.031525,-0.046387,-0.000349,0.012962,...,0.014015,0.002598,-0.003323,0.001747,0.046783,0.005169,-0.026703,-0.013672,-0.030441,-0.000006
79816,0.018982,0.042206,0.025177,0.014626,-0.019135,0.024353,-0.030472,-0.031647,0.053802,-0.000672,...,-0.023254,0.017014,0.012650,-0.019287,0.023941,-0.009338,0.023270,0.032288,0.015869,-0.004513
112326,-0.059113,-0.035980,0.055695,-0.006058,-0.028595,0.024200,-0.040802,-0.094666,-0.003313,-0.015854,...,0.046387,-0.019028,0.029236,-0.084106,-0.029053,-0.004059,0.003288,-0.012566,0.042389,0.071411
1209,-0.057343,-0.015625,-0.013527,0.034576,0.040253,0.017014,0.001482,-0.088562,0.021774,0.001718,...,-0.042542,-0.014931,0.013908,-0.056580,0.017578,-0.004860,-0.037170,0.017471,0.076111,0.005451


In [100]:
import tensorflow as tf
INPUT_SHAPE = [train_df.shape[1]]
BATCH_SIZE = 5120

model = tf.keras.Sequential([
    tf.keras.layers.BatchNormalization(input_shape=INPUT_SHAPE),    
    tf.keras.layers.Dense(units=256, activation='relu'),
    tf.keras.layers.Dense(units=256, activation='relu'),
    tf.keras.layers.Dense(units=256, activation='relu'),
    tf.keras.layers.Dense(units=num_of_labels,activation='sigmoid')
])


# Compile model
model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
    loss='binary_crossentropy',
    metrics=['binary_accuracy', tf.keras.metrics.AUC(multi_label=True, curve='PR', name='average_precision')]
)

hist = model.fit(
    X_train.iloc[:,1:], y_train,
    validation_data=(X_val.iloc[:,1:], y_val),
    batch_size=BATCH_SIZE,
    epochs=200
)

Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Epoch 16/200
Epoch 17/200
Epoch 18/200
Epoch 19/200
Epoch 20/200
Epoch 21/200
Epoch 22/200
Epoch 23/200
Epoch 24/200
Epoch 25/200
Epoch 26/200
Epoch 27/200
Epoch 28/200
Epoch 29/200
Epoch 30/200
Epoch 31/200
Epoch 32/200
Epoch 33/200
Epoch 34/200
Epoch 35/200
Epoch 36/200
Epoch 37/200
Epoch 38/200
Epoch 39/200
Epoch 40/200
Epoch 41/200
Epoch 42/200
Epoch 43/200
Epoch 44/200
Epoch 45/200
Epoch 46/200
Epoch 47/200
Epoch 48/200
Epoch 49/200
Epoch 50/200
Epoch 51/200
Epoch 52/200
Epoch 53/200
Epoch 54/200
Epoch 55/200
Epoch 56/200
Epoch 57/200
Epoch 58/200
Epoch 59/200
Epoch 60/200
Epoch 61/200
Epoch 62/200
Epoch 63/200
Epoch 64/200
Epoch 65/200
Epoch 66/200
Epoch 67/200
Epoch 68/200
Epoch 69/200
Epoch 70/200
Epoch 71/200
Epoch 72/200
Epoch 73/200
Epoch 74/200
Epoch 75/200
Epoch 76/200
Epoch 77/200
Epoch 78

In [98]:
X_train

Unnamed: 0,Protein_ID,Column_1,Column_2,Column_3,Column_4,Column_5,Column_6,Column_7,Column_8,Column_9,...,Column_1015,Column_1016,Column_1017,Column_1018,Column_1019,Column_1020,Column_1021,Column_1022,Column_1023,Column_1024
57275,P23500,0.067383,0.030548,0.011909,0.045471,-0.006641,0.045868,-0.069641,-0.041473,-0.015045,...,-0.063416,-0.002037,-0.056732,-0.028687,0.029861,-0.021347,-0.038757,0.034363,-0.034424,-0.005363
52450,Q94BP3,0.031525,0.120117,0.030441,0.011177,-0.015251,0.007626,-0.009270,-0.074280,0.023544,...,-0.049438,-0.005131,-0.036255,-0.018921,0.042023,-0.016327,-0.034637,-0.038910,0.007427,0.013496
95625,P02315,0.051056,0.008263,0.076172,0.035522,0.027252,0.106445,-0.057098,-0.048828,-0.009148,...,-0.019196,0.032745,0.022354,-0.081177,-0.008156,-0.020996,-0.029602,0.045105,0.023331,0.057587
82884,Q06525,0.005070,-0.087769,-0.000177,0.012009,-0.031586,0.063477,-0.005371,-0.080261,-0.009590,...,0.026810,-0.013573,0.019730,-0.088745,0.008553,0.083008,0.035980,0.033356,0.042480,0.028503
6587,Q7DA74,0.048859,0.030548,0.028473,-0.003517,0.001398,0.072449,-0.032684,-0.060638,0.038696,...,-0.086853,-0.044769,-0.004410,0.012207,0.023636,-0.014793,-0.029343,-0.018967,-0.007362,0.019119
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8779,Q9NUD9,0.018784,0.037048,0.030228,0.042175,0.011719,0.020966,-0.031525,-0.046387,-0.000349,...,0.014015,0.002598,-0.003323,0.001747,0.046783,0.005169,-0.026703,-0.013672,-0.030441,-0.000006
79816,Q9W0M7,0.018982,0.042206,0.025177,0.014626,-0.019135,0.024353,-0.030472,-0.031647,0.053802,...,-0.023254,0.017014,0.012650,-0.019287,0.023941,-0.009338,0.023270,0.032288,0.015869,-0.004513
112326,P71620,-0.059113,-0.035980,0.055695,-0.006058,-0.028595,0.024200,-0.040802,-0.094666,-0.003313,...,0.046387,-0.019028,0.029236,-0.084106,-0.029053,-0.004059,0.003288,-0.012566,0.042389,0.071411
1209,Q57XW0,-0.057343,-0.015625,-0.013527,0.034576,0.040253,0.017014,0.001482,-0.088562,0.021774,...,-0.042542,-0.014931,0.013908,-0.056580,0.017578,-0.004860,-0.037170,0.017471,0.076111,0.005451


In [97]:

# Check for any object types or anomalies
print(pd.DataFrame(X_train).info())
print(pd.DataFrame(y_train).info())

<class 'pandas.core.frame.DataFrame'>
Index: 67710 entries, 57275 to 22443
Columns: 1025 entries, Protein_ID to Column_1024
dtypes: float16(1024), object(1)
memory usage: 133.3+ MB
None
<class 'pandas.core.frame.DataFrame'>
Index: 67710 entries, 57275 to 22443
Columns: 687 entries, GO:0005575 to GO:0016798
dtypes: float64(687)
memory usage: 355.4 MB
None


In [91]:
exp_train = train_set
exp_train = train_set[train_set['Protein_ID'].isin(prot_id_has_cc)] #TAKE ONLY EXAMPLES THAT HAVE CC INFO
exp_train = train_set[train_set['aspect'] == 'cellular_component'] # TAKE ONLY CELLULAR COMPONENT INFORMATION; NECESSARY

Unnamed: 0,Protein_ID,aspect,GO_term
0,P91124,cellular_component,GO:0005575
1,P91124,cellular_component,GO:0110165
2,P91124,cellular_component,GO:0005737
3,P91124,cellular_component,GO:0005622
4,P91124,cellular_component,GO:0043226
...,...,...,...
4277042,P28271,biological_process,GO:0010608
4277043,P28271,biological_process,GO:0080090
4277044,P28271,biological_process,GO:0006417
4277045,P28271,biological_process,GO:0051246
