Data preparation

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import Embedding, Conv1D, MaxPooling1D, LSTM, Flatten, Dense, Input
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.metrics import Precision, Recall
from tensorflow.keras.callbacks import EarlyStopping
import h5py
import tensorflow as tf
from google.colab import drive
drive.mount('/content/drive/')
data_dir = '/content/drive/MyDrive/bioproject/biological_data_pfp/biological_data_pfp/train/'

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


In [3]:
train_set = pd.read_csv(data_dir+'train_set.tsv', delimiter='\t')
print(train_set.shape)
train_set.head()

(4277047, 3)


Unnamed: 0,Protein_ID,aspect,GO_term
0,P91124,cellular_component,GO:0005575
1,P91124,cellular_component,GO:0110165
2,P91124,cellular_component,GO:0005737
3,P91124,cellular_component,GO:0005622
4,P91124,cellular_component,GO:0043226


In [4]:
train_ids_df = pd.read_csv(data_dir+'train_ids.txt',header = None)
train_ids_df.columns = ['Protein_ID']
print(train_ids_df.shape)
train_ids_df.head()

(123969, 1)


Unnamed: 0,Protein_ID
0,P91124
1,Q55DL5
2,O81027
3,Q04418
4,Q8IXT2


In [5]:
train_embeddings = []
protein_ids = []

with h5py.File(data_dir+'train_embeddings.h5', 'r') as f:
    for protein_id in f.keys():  # protein ids
        embeddings = f[protein_id][:]
        train_embeddings.append(embeddings)
        protein_ids.append(protein_id)

# Convert the list of embeddings to a numpy array
prott5_embeddings = np.array(train_embeddings)

# Create a DataFrame from the embeddings array
column_num = prott5_embeddings.shape[1]
train_df = pd.DataFrame(prott5_embeddings, columns=["Column_" + str(i) for i in range(1, column_num + 1)])

# Set protein_ids as the index of the DataFrame
#train_df.index = protein_ids

print(train_df.shape)
train_df.head()

(123969, 1024)


Unnamed: 0,Column_1,Column_2,Column_3,Column_4,Column_5,Column_6,Column_7,Column_8,Column_9,Column_10,...,Column_1015,Column_1016,Column_1017,Column_1018,Column_1019,Column_1020,Column_1021,Column_1022,Column_1023,Column_1024
0,0.068176,-0.046478,0.001752,-0.008583,0.003763,0.046265,-0.059662,-0.050385,-0.005173,0.008865,...,-0.040771,-0.013138,-0.049591,-0.101074,0.066406,0.00898,-0.003506,-0.024612,0.03476,-0.031006
1,-0.016434,-0.001583,0.003889,0.073425,0.012428,0.028168,-0.040375,-0.093811,-0.017807,0.025497,...,0.011879,-0.033325,-0.031342,-0.005245,0.014732,0.08197,0.017456,-0.032959,0.053192,0.029907
2,0.007904,0.087708,-0.001715,0.037659,0.017883,0.025589,-0.011749,-0.084717,-0.016266,-0.034973,...,0.004829,-0.049713,-0.027176,-0.037415,-0.006241,-0.039703,0.001784,0.004719,-0.004288,0.001847
3,0.002447,0.007053,0.064453,0.007271,-0.033569,-0.009933,-0.022186,-0.083862,-0.003841,-0.018631,...,-0.053589,-0.002508,-0.016647,-0.069458,0.042206,-0.051758,-0.025436,0.057373,0.099121,0.032898
4,0.049316,0.020691,0.108643,0.016342,-0.051056,-0.017334,-0.042084,-0.154053,0.007347,0.029907,...,-0.100647,-0.063293,0.002346,-0.104675,-0.000757,-0.047485,0.003002,-0.036774,0.103577,0.005245


In [6]:
print(len(protein_ids))
protein_ids[0]

123969


'A0A009IHW8'

In [7]:
id_embed_df = pd.concat((pd.DataFrame(protein_ids, columns=['Protein_ID']), train_df), axis=1)   ################################# Concatenating protein_ids instead of train_ids_df
id_embed_df

Unnamed: 0,Protein_ID,Column_1,Column_2,Column_3,Column_4,Column_5,Column_6,Column_7,Column_8,Column_9,...,Column_1015,Column_1016,Column_1017,Column_1018,Column_1019,Column_1020,Column_1021,Column_1022,Column_1023,Column_1024
0,A0A009IHW8,0.068176,-0.046478,0.001752,-0.008583,0.003763,0.046265,-0.059662,-0.050385,-0.005173,...,-0.040771,-0.013138,-0.049591,-0.101074,0.066406,0.008980,-0.003506,-0.024612,0.034760,-0.031006
1,A0A021WW32,-0.016434,-0.001583,0.003889,0.073425,0.012428,0.028168,-0.040375,-0.093811,-0.017807,...,0.011879,-0.033325,-0.031342,-0.005245,0.014732,0.081970,0.017456,-0.032959,0.053192,0.029907
2,A0A021WZA4,0.007904,0.087708,-0.001715,0.037659,0.017883,0.025589,-0.011749,-0.084717,-0.016266,...,0.004829,-0.049713,-0.027176,-0.037415,-0.006241,-0.039703,0.001784,0.004719,-0.004288,0.001847
3,A0A023FBW4,0.002447,0.007053,0.064453,0.007271,-0.033569,-0.009933,-0.022186,-0.083862,-0.003841,...,-0.053589,-0.002508,-0.016647,-0.069458,0.042206,-0.051758,-0.025436,0.057373,0.099121,0.032898
4,A0A023FBW7,0.049316,0.020691,0.108643,0.016342,-0.051056,-0.017334,-0.042084,-0.154053,0.007347,...,-0.100647,-0.063293,0.002346,-0.104675,-0.000757,-0.047485,0.003002,-0.036774,0.103577,0.005245
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
123964,X6RLK1,0.052277,-0.062469,0.046478,0.046082,-0.041992,-0.009956,0.027161,-0.061401,0.038422,...,-0.036865,-0.017426,0.019196,-0.034882,-0.044739,0.024338,-0.055084,0.013901,0.059326,0.014725
123965,X6RLN4,-0.011299,-0.036957,0.029297,0.031891,0.006027,0.079468,-0.014832,-0.047791,0.055023,...,-0.040680,-0.006996,0.018005,-0.024414,0.061401,0.041229,-0.021011,-0.014709,0.019791,0.052887
123966,X6RLP6,0.040405,-0.013908,0.025421,0.075012,-0.050293,0.058685,-0.032135,-0.073975,0.061798,...,-0.066711,-0.011276,0.002800,-0.014954,0.015190,0.010483,-0.010162,0.027557,0.027039,0.017517
123967,X6RLR1,-0.008362,-0.026291,0.037354,0.033264,-0.044861,-0.003189,-0.009598,-0.061432,0.027451,...,-0.037476,-0.020966,0.011360,0.027267,-0.006855,-0.005241,-0.057404,-0.012024,0.000682,0.045898


In [8]:
prot_id_has_cc = train_set[train_set['aspect'] == 'cellular_component']['Protein_ID'].unique().tolist()
cc_train = id_embed_df[id_embed_df['Protein_ID'].isin(prot_id_has_cc)]
prot_id_has_mf = train_set[train_set['aspect'] == 'molecular_function']['Protein_ID'].unique().tolist() ###################### Other aspects:
mf_train = id_embed_df[id_embed_df['Protein_ID'].isin(prot_id_has_mf)]                                  ######################
prot_id_has_bp = train_set[train_set['aspect'] == 'biological_process']['Protein_ID'].unique().tolist() ######################
bp_train = id_embed_df[id_embed_df['Protein_ID'].isin(prot_id_has_bp)]                                  ######################


In [9]:
cc_train

Unnamed: 0,Protein_ID,Column_1,Column_2,Column_3,Column_4,Column_5,Column_6,Column_7,Column_8,Column_9,...,Column_1015,Column_1016,Column_1017,Column_1018,Column_1019,Column_1020,Column_1021,Column_1022,Column_1023,Column_1024
1,A0A021WW32,-0.016434,-0.001583,0.003889,0.073425,0.012428,0.028168,-0.040375,-0.093811,-0.017807,...,0.011879,-0.033325,-0.031342,-0.005245,0.014732,0.081970,0.017456,-0.032959,0.053192,0.029907
2,A0A021WZA4,0.007904,0.087708,-0.001715,0.037659,0.017883,0.025589,-0.011749,-0.084717,-0.016266,...,0.004829,-0.049713,-0.027176,-0.037415,-0.006241,-0.039703,0.001784,0.004719,-0.004288,0.001847
12,A0A023GPJ3,0.015121,0.011017,0.021698,-0.025116,0.039612,0.008392,-0.051453,-0.025406,-0.020264,...,-0.027298,-0.029373,-0.051300,-0.062500,0.023376,0.066040,0.024963,-0.040497,0.026001,0.018082
15,A0A023GUT0,-0.004139,-0.012878,0.071594,0.016052,-0.039825,0.022476,-0.110901,-0.085144,0.010040,...,-0.027267,-0.075562,0.030533,0.075623,-0.061554,-0.042664,0.042511,-0.025208,0.014473,0.057037
17,A0A023IM54,-0.016510,0.025253,0.043335,0.015579,-0.016785,0.031143,-0.068115,-0.047333,0.101135,...,-0.006344,-0.015419,0.016693,-0.050568,-0.024155,-0.000398,-0.006649,0.007561,0.051880,0.052826
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
123964,X6RLK1,0.052277,-0.062469,0.046478,0.046082,-0.041992,-0.009956,0.027161,-0.061401,0.038422,...,-0.036865,-0.017426,0.019196,-0.034882,-0.044739,0.024338,-0.055084,0.013901,0.059326,0.014725
123965,X6RLN4,-0.011299,-0.036957,0.029297,0.031891,0.006027,0.079468,-0.014832,-0.047791,0.055023,...,-0.040680,-0.006996,0.018005,-0.024414,0.061401,0.041229,-0.021011,-0.014709,0.019791,0.052887
123966,X6RLP6,0.040405,-0.013908,0.025421,0.075012,-0.050293,0.058685,-0.032135,-0.073975,0.061798,...,-0.066711,-0.011276,0.002800,-0.014954,0.015190,0.010483,-0.010162,0.027557,0.027039,0.017517
123967,X6RLR1,-0.008362,-0.026291,0.037354,0.033264,-0.044861,-0.003189,-0.009598,-0.061432,0.027451,...,-0.037476,-0.020966,0.011360,0.027267,-0.006855,-0.005241,-0.057404,-0.012024,0.000682,0.045898


In [10]:
mf_train

Unnamed: 0,Protein_ID,Column_1,Column_2,Column_3,Column_4,Column_5,Column_6,Column_7,Column_8,Column_9,...,Column_1015,Column_1016,Column_1017,Column_1018,Column_1019,Column_1020,Column_1021,Column_1022,Column_1023,Column_1024
0,A0A009IHW8,0.068176,-0.046478,0.001752,-0.008583,0.003763,0.046265,-0.059662,-0.050385,-0.005173,...,-0.040771,-0.013138,-0.049591,-0.101074,0.066406,0.008980,-0.003506,-0.024612,0.034760,-0.031006
3,A0A023FBW4,0.002447,0.007053,0.064453,0.007271,-0.033569,-0.009933,-0.022186,-0.083862,-0.003841,...,-0.053589,-0.002508,-0.016647,-0.069458,0.042206,-0.051758,-0.025436,0.057373,0.099121,0.032898
4,A0A023FBW7,0.049316,0.020691,0.108643,0.016342,-0.051056,-0.017334,-0.042084,-0.154053,0.007347,...,-0.100647,-0.063293,0.002346,-0.104675,-0.000757,-0.047485,0.003002,-0.036774,0.103577,0.005245
5,A0A023FDY8,0.056488,0.019241,0.112122,0.019608,-0.055939,-0.016129,-0.045105,-0.152466,0.003454,...,-0.096985,-0.064880,0.009117,-0.106934,0.004780,-0.051544,0.001547,-0.038788,0.106018,0.013321
6,A0A023FF81,-0.000163,0.041138,0.098633,0.012909,-0.031494,-0.016129,-0.014793,-0.157837,-0.018585,...,-0.054810,0.015991,-0.014160,-0.086670,0.051880,-0.059387,-0.004559,0.048309,0.110474,0.027573
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
123918,X5KCU9,0.030869,-0.041443,-0.026169,0.019669,0.008804,0.005413,-0.055847,-0.071655,0.000172,...,-0.053162,0.002834,-0.033478,-0.068848,0.033661,0.056885,0.000184,0.009277,0.021133,0.016586
123919,X5KJC0,0.053864,0.097351,0.010437,0.018051,-0.049103,0.045563,-0.038574,-0.054688,0.066528,...,0.008675,0.034363,0.033905,-0.014412,0.088074,-0.040558,0.013382,-0.019440,-0.031494,-0.006958
123920,X5L1L5,0.052826,0.097229,0.010933,0.022873,-0.047455,0.046021,-0.037720,-0.054840,0.068909,...,0.005947,0.033142,0.035278,-0.013283,0.085327,-0.039551,0.012573,-0.017914,-0.030838,-0.008064
123921,X5L565,0.053955,0.097717,0.010117,0.022079,-0.047607,0.045807,-0.037964,-0.053772,0.068237,...,0.006489,0.033722,0.035767,-0.013329,0.088318,-0.039795,0.011810,-0.018219,-0.031830,-0.008003


In [11]:
bp_train

Unnamed: 0,Protein_ID,Column_1,Column_2,Column_3,Column_4,Column_5,Column_6,Column_7,Column_8,Column_9,...,Column_1015,Column_1016,Column_1017,Column_1018,Column_1019,Column_1020,Column_1021,Column_1022,Column_1023,Column_1024
0,A0A009IHW8,0.068176,-0.046478,0.001752,-0.008583,0.003763,0.046265,-0.059662,-0.050385,-0.005173,...,-0.040771,-0.013138,-0.049591,-0.101074,0.066406,0.008980,-0.003506,-0.024612,0.034760,-0.031006
1,A0A021WW32,-0.016434,-0.001583,0.003889,0.073425,0.012428,0.028168,-0.040375,-0.093811,-0.017807,...,0.011879,-0.033325,-0.031342,-0.005245,0.014732,0.081970,0.017456,-0.032959,0.053192,0.029907
8,A0A023FFD0,0.021286,0.043488,0.130493,-0.020630,-0.053345,-0.022629,-0.033478,-0.135376,0.018372,...,-0.065613,-0.061371,0.019577,-0.103210,0.091614,-0.088135,-0.035736,-0.008224,0.113586,0.057220
12,A0A023GPJ3,0.015121,0.011017,0.021698,-0.025116,0.039612,0.008392,-0.051453,-0.025406,-0.020264,...,-0.027298,-0.029373,-0.051300,-0.062500,0.023376,0.066040,0.024963,-0.040497,0.026001,0.018082
13,A0A023GPK8,-0.006481,0.010765,-0.005814,0.027664,0.030334,0.023651,-0.018524,-0.036652,0.031708,...,0.020264,0.021179,-0.001020,-0.069702,0.036530,0.028183,-0.004238,-0.016342,0.045044,0.017700
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
123918,X5KCU9,0.030869,-0.041443,-0.026169,0.019669,0.008804,0.005413,-0.055847,-0.071655,0.000172,...,-0.053162,0.002834,-0.033478,-0.068848,0.033661,0.056885,0.000184,0.009277,0.021133,0.016586
123919,X5KJC0,0.053864,0.097351,0.010437,0.018051,-0.049103,0.045563,-0.038574,-0.054688,0.066528,...,0.008675,0.034363,0.033905,-0.014412,0.088074,-0.040558,0.013382,-0.019440,-0.031494,-0.006958
123920,X5L1L5,0.052826,0.097229,0.010933,0.022873,-0.047455,0.046021,-0.037720,-0.054840,0.068909,...,0.005947,0.033142,0.035278,-0.013283,0.085327,-0.039551,0.012573,-0.017914,-0.030838,-0.008064
123921,X5L565,0.053955,0.097717,0.010117,0.022079,-0.047607,0.045807,-0.037964,-0.053772,0.068237,...,0.006489,0.033722,0.035767,-0.013329,0.088318,-0.039795,0.011810,-0.018219,-0.031830,-0.008003


In [12]:
#This is a useless code block
print(train_set['GO_term'].value_counts().index[:].tolist())
print(len(train_set['GO_term'].value_counts().index[:].tolist()))
train_set_mf=train_set[train_set['aspect']=='molecular_function']
print(train_set_mf['GO_term'].value_counts().index[:].tolist())
print(len(train_set_mf['GO_term'].value_counts().index[:].tolist()))

['GO:0005575', 'GO:0110165', 'GO:0008150', 'GO:0005622', 'GO:0043226', 'GO:0009987', 'GO:0003674', 'GO:0043229', 'GO:0043227', 'GO:0005737', 'GO:0043231', 'GO:0065007', 'GO:0005488', 'GO:0050789', 'GO:0050794', 'GO:0008152', 'GO:0050896', 'GO:0005515', 'GO:0005634', 'GO:0071704', 'GO:0003824', 'GO:0032501', 'GO:0016020', 'GO:0044237', 'GO:0032502', 'GO:0044238', 'GO:0048856', 'GO:0006807', 'GO:0071944', 'GO:0019222', 'GO:0071840', 'GO:0005829', 'GO:0007275', 'GO:0048518', 'GO:0016043', 'GO:0031323', 'GO:0032991', 'GO:0043170', 'GO:0060255', 'GO:0051716', 'GO:0005886', 'GO:0043228', 'GO:0043232', 'GO:0031974', 'GO:0043233', 'GO:0070013', 'GO:0048522', 'GO:0009889', 'GO:0009058', 'GO:0048519', 'GO:0031326', 'GO:0006950', 'GO:1901576', 'GO:0010556', 'GO:0080090', 'GO:0048731', 'GO:0031981', 'GO:0010468', 'GO:0044249', 'GO:1901564', 'GO:0042221', 'GO:0051171', 'GO:0051179', 'GO:0048523', 'GO:0012505', 'GO:0097159', 'GO:1901360', 'GO:0051234', 'GO:0034641', 'GO:0048869', 'GO:0030154', 'GO:0

In [13]:
train_set_cc=train_set[train_set['aspect']=='cellular_component']         ######################## Restricting to only the cellular component aspects

labels_cc = train_set_cc['GO_term'].value_counts().index[:].tolist()      ######################## We take all the go terms used for cc aspects

train_set_updated = train_set_cc                                         ####################### Here I copy train_set_cc only to keep the code after coherent

train_size = cc_train.shape[0]                                            ########################## The n° of rows of cc_train is the number of rows of train_labels
train_labels = np.zeros((train_size , len(labels_cc))) ########################## The length of labels_cc is how many go terms we need to predict

series_train_protein_ids = cc_train['Protein_ID']   ###################### restrict to only cc

for i in range(len(labels_cc)):   ##############################  we go go-term by go-term

    n_train_terms = train_set_updated[train_set_updated['GO_term'] ==  labels_cc[i]]  ########################## I added _cc after labels for coherency

    label_related_proteins = n_train_terms['Protein_ID'].unique()

    train_labels[:,i] =  series_train_protein_ids.isin(label_related_proteins).astype(float)


labels_cc_df = pd.DataFrame(data = train_labels, columns = labels_cc)
print(labels_cc_df.shape)
labels_cc_df

(84638, 678)


Unnamed: 0,GO:0005575,GO:0110165,GO:0005622,GO:0043226,GO:0043229,GO:0043227,GO:0005737,GO:0043231,GO:0005634,GO:0016020,...,GO:0005885,GO:0005751,GO:0032809,GO:0098688,GO:0090571,GO:0043194,GO:0031010,GO:0042470,GO:0000935,GO:0031941
0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
84633,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
84634,1.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
84635,1.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
84636,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [14]:
train_set_mf=train_set[train_set['aspect']=='molecular_function']         ######################## Restricting to only the cellular component aspects

labels_mf = train_set_mf['GO_term'].value_counts().index[:].tolist()      ######################## We take all the go terms used for cc aspects

train_set_updated = train_set_mf                                         ####################### Here I copy train_set_cc only to keep the code after coherent

train_size = mf_train.shape[0]                                            ########################## The n° of rows of cc_train is the number of rows of train_labels
train_labels = np.zeros((train_size , len(labels_mf))) ########################## The length of labels_cc is how many go terms we need to predict

series_train_protein_ids = mf_train['Protein_ID']   ###################### restrict to only cc

for i in range(len(labels_mf)):   ##############################  we go go-term by go-term

    n_train_terms = train_set_updated[train_set_updated['GO_term'] ==  labels_mf[i]]  ########################## I added _cc after labels for coherency

    label_related_proteins = n_train_terms['Protein_ID'].unique()

    train_labels[:,i] =  series_train_protein_ids.isin(label_related_proteins).astype(float)


labels_mf_df = pd.DataFrame(data = train_labels, columns = labels_mf)
print(labels_mf_df.shape)
labels_mf_df

(55698, 839)


Unnamed: 0,GO:0003674,GO:0005488,GO:0005515,GO:0003824,GO:0097159,GO:0003676,GO:0016740,GO:0016787,GO:0140096,GO:0003677,...,GO:0010485,GO:0010314,GO:0070492,GO:0042169,GO:0008556,GO:0015038,GO:0043014,GO:0005160,GO:0016894,GO:0140463
0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
55693,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
55694,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
55695,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
55696,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [15]:
train_set_bp=train_set[train_set['aspect']=='biological_process']         ######################## Restricting to only the cellular component aspects

labels_bp = train_set_bp['GO_term'].value_counts().index[:].tolist()      ######################## We take all the go terms used for cc aspects

train_set_updated = train_set_bp                                         ####################### Here I copy train_set_cc only to keep the code after coherent

train_size = bp_train.shape[0]                                            ########################## The n° of rows of cc_train is the number of rows of train_labels
train_labels = np.zeros((train_size , len(labels_bp))) ########################## The length of labels_cc is how many go terms we need to predict

series_train_protein_ids = bp_train['Protein_ID']   ###################### restrict to only cc

for i in range(len(labels_bp)):   ##############################  we go go-term by go-term

    n_train_terms = train_set_updated[train_set_updated['GO_term'] ==  labels_bp[i]]  ########################## I added _cc after labels for coherency

    label_related_proteins = n_train_terms['Protein_ID'].unique()

    train_labels[:,i] =  series_train_protein_ids.isin(label_related_proteins).astype(float)
    if i%100==0:
      print(i)

labels_bp_df = pd.DataFrame(data = train_labels, columns = labels_bp)
print(labels_bp_df.shape)
labels_bp_df

0
100
200
300
400
500
600
700
800
900
1000
1100
1200
1300
1400
(83064, 1487)


Unnamed: 0,GO:0008150,GO:0009987,GO:0065007,GO:0050789,GO:0050794,GO:0008152,GO:0050896,GO:0071704,GO:0032501,GO:0044237,...,GO:0008356,GO:0032680,GO:0030522,GO:0048546,GO:0002702,GO:0072073,GO:1901989,GO:0061014,GO:0006814,GO:0019722
0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
83059,1.0,1.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
83060,1.0,1.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
83061,1.0,1.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
83062,1.0,1.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


Models:

In [16]:
X_cc = cc_train
y_cc = labels_cc_df
X_train_cc, X_val_cc, y_train_cc, y_val_cc = train_test_split(X_cc,y_cc,test_size=0.2,random_state=42)

X_mf = mf_train
y_mf = labels_mf_df
X_train_mf, X_val_mf, y_train_mf, y_val_mf = train_test_split(X_mf,y_mf,test_size=0.2,random_state=42)

X_bp = bp_train
y_bp = labels_bp_df
X_train_bp, X_val_bp, y_train_bp, y_val_bp = train_test_split(X_bp,y_bp,test_size=0.2,random_state=42)

CC models.

In [17]:
INPUT_SHAPE = [train_df.shape[1]]
BATCH_SIZE = 5120

model_cc = tf.keras.Sequential([
    tf.keras.layers.BatchNormalization(input_shape=INPUT_SHAPE),
    tf.keras.layers.Dense(units=256, activation='relu'),
    tf.keras.layers.Dense(units=256, activation='relu'),
    tf.keras.layers.Dense(units=256, activation='relu'),
    tf.keras.layers.Dense(units=len(labels_cc),activation='sigmoid')
])


# Compile model
model_cc.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
    loss='binary_crossentropy',
    metrics=['binary_accuracy', tf.keras.metrics.AUC(multi_label=True, curve='PR', name='average_precision')]
)

early_stopper = EarlyStopping(
    monitor='val_auc',     # The performance metric to monitor
    patience=10,            # Number of epochs with no improvement after which training will be stopped
    restore_best_weights=True # Restores model weights from the epoch with the best value of the monitored quantity
)


hist_cc = model_cc.fit(
    X_train_cc.iloc[:,1:], y_train_cc,
    validation_data=(X_val_cc.iloc[:,1:], y_val_cc),
    batch_size=BATCH_SIZE,
    epochs=80,
    callbacks=early_stopper
)
#val_loss: 10:0.40 min:0.0319 at 54

Epoch 1/80
Epoch 2/80
Epoch 3/80
Epoch 4/80
Epoch 5/80
Epoch 6/80
Epoch 7/80
Epoch 8/80
Epoch 9/80
Epoch 10/80
Epoch 11/80
Epoch 12/80
Epoch 13/80
Epoch 14/80
Epoch 15/80
Epoch 16/80
Epoch 17/80
Epoch 18/80
Epoch 19/80
Epoch 20/80
Epoch 21/80
Epoch 22/80
Epoch 23/80
Epoch 24/80
Epoch 25/80
Epoch 26/80
Epoch 27/80
Epoch 28/80
Epoch 29/80
Epoch 30/80
Epoch 31/80
Epoch 32/80
Epoch 33/80
Epoch 34/80
Epoch 35/80
Epoch 36/80
Epoch 37/80
Epoch 38/80
Epoch 39/80
Epoch 40/80
Epoch 41/80
Epoch 42/80
Epoch 43/80
Epoch 44/80
Epoch 45/80
Epoch 46/80
Epoch 47/80
Epoch 48/80
Epoch 49/80
Epoch 50/80
Epoch 51/80
Epoch 52/80
Epoch 53/80
Epoch 54/80
Epoch 55/80
Epoch 56/80
Epoch 57/80
Epoch 58/80
Epoch 59/80
Epoch 60/80
Epoch 61/80
Epoch 62/80
Epoch 63/80
Epoch 64/80


regularization done badly

In [19]:
INPUT_SHAPE = [train_df.shape[1]]
BATCH_SIZE = 5120

model_cc_l1 = tf.keras.Sequential([
    tf.keras.layers.BatchNormalization(input_shape=INPUT_SHAPE),
    tf.keras.layers.Dense(units=256, activation='relu',kernel_regularizer='l1'),
    tf.keras.layers.Dense(units=256, activation='relu',kernel_regularizer='l1'),
    tf.keras.layers.Dense(units=256, activation='relu',kernel_regularizer='l1'),
    tf.keras.layers.Dense(units=len(labels_cc),activation='sigmoid',kernel_regularizer='l1')
])


# Compile model
model_cc_l1.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
    loss='binary_crossentropy',
    metrics=['binary_accuracy', tf.keras.metrics.AUC(multi_label=True, curve='PR', name='average_precision')]
)

hist_cc_l1 = model_cc_l1.fit(
    X_train_cc.iloc[:,1:], y_train_cc,
    validation_data=(X_val_cc.iloc[:,1:], y_val_cc),
    batch_size=BATCH_SIZE,
    epochs=80,callbacks=early_stopper
)
#val_loss: 10:0.40 min:0.0319 at 54

Epoch 1/80
Epoch 2/80
Epoch 3/80
Epoch 4/80
Epoch 5/80
Epoch 6/80
Epoch 7/80
Epoch 8/80
Epoch 9/80
Epoch 10/80
Epoch 11/80
Epoch 12/80
Epoch 13/80
Epoch 14/80
Epoch 15/80
Epoch 16/80
Epoch 17/80
Epoch 18/80
Epoch 19/80
Epoch 20/80
Epoch 21/80
Epoch 22/80
Epoch 23/80
Epoch 24/80
Epoch 25/80
Epoch 26/80
Epoch 27/80
Epoch 28/80
Epoch 29/80
Epoch 30/80
Epoch 31/80
Epoch 32/80
Epoch 33/80
Epoch 34/80
Epoch 35/80
Epoch 36/80
Epoch 37/80
Epoch 38/80
Epoch 39/80
Epoch 40/80
Epoch 41/80
Epoch 42/80
Epoch 43/80
Epoch 44/80
Epoch 45/80
Epoch 46/80
Epoch 47/80
Epoch 48/80
Epoch 49/80
Epoch 50/80
Epoch 51/80
Epoch 52/80
Epoch 53/80
Epoch 54/80
Epoch 55/80
Epoch 56/80
Epoch 57/80
Epoch 58/80
Epoch 59/80
Epoch 60/80
Epoch 61/80
Epoch 62/80
Epoch 63/80
Epoch 64/80
Epoch 65/80
Epoch 66/80
Epoch 67/80
Epoch 68/80
Epoch 69/80
Epoch 70/80
Epoch 71/80
Epoch 72/80
Epoch 73/80
Epoch 74/80
Epoch 75/80
Epoch 76/80
Epoch 77/80
Epoch 78/80
Epoch 79/80
Epoch 80/80


In [1]:
#no errors here, just colab ram crashing. To be re tested, from what I remember it has a similar rate to the original model non reg
from keras.regularizers import l1, l2, l1_l2
INPUT_SHAPE = [train_df.shape[1]]
BATCH_SIZE = 5120

model_cc_l1 = tf.keras.Sequential([
    tf.keras.layers.BatchNormalization(input_shape=INPUT_SHAPE),
    tf.keras.layers.Dense(units=256, activation='relu',kernel_regularizer=l1(0.0000001)),
    tf.keras.layers.Dense(units=256, activation='relu',kernel_regularizer=l1(0.0000001)),
    tf.keras.layers.Dense(units=256, activation='relu',kernel_regularizer=l1(0.0000001)),
    tf.keras.layers.Dense(units=len(labels_cc),activation='sigmoid',kernel_regularizer=l1(0.0000001))
])
early_stopper = EarlyStopping(
    monitor='val_auc',     # The performance metric to monitor
    patience=10,            # Number of epochs with no improvement after which training will be stopped
    restore_best_weights=True # Restores model weights from the epoch with the best value of the monitored quantity
)

# Compile model
model_cc_l1.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
    loss='binary_crossentropy',
    metrics=['binary_accuracy', tf.keras.metrics.AUC(multi_label=True, curve='PR', name='average_precision')]
)

hist_cc_l1 = model_cc_l1.fit(
    X_train_cc.iloc[:,1:], y_train_cc,
    validation_data=(X_val_cc.iloc[:,1:], y_val_cc),
    batch_size=BATCH_SIZE,
    epochs=80,callbacks=early_stopper
)
#val_loss: 10:0.40 min:0.0319 at 54

NameError: name 'train_df' is not defined

In [None]:
#never tried, probably doesn't work unless we put a small constant
INPUT_SHAPE = [train_df.shape[1]]
BATCH_SIZE = 5120

model_cc_l2 = tf.keras.Sequential([
    tf.keras.layers.BatchNormalization(input_shape=INPUT_SHAPE),
    tf.keras.layers.Dense(units=256, activation='relu',kernel_regularizer='l2'),
    tf.keras.layers.Dense(units=256, activation='relu',kernel_regularizer='l2'),
    tf.keras.layers.Dense(units=256, activation='relu',kernel_regularizer='l2'),
    tf.keras.layers.Dense(units=len(labels_cc),activation='sigmoid',kernel_regularizer='l2')
])


# Compile model
model_cc_l2.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
    loss='binary_crossentropy',
    metrics=['binary_accuracy', tf.keras.metrics.AUC(multi_label=True, curve='PR', name='average_precision')]
)

hist_cc_l2 = model_cc_l2.fit(
    X_train_cc.iloc[:,1:], y_train_cc,
    validation_data=(X_val_cc.iloc[:,1:], y_val_cc),
    batch_size=BATCH_SIZE,
    epochs=80,callbacks=early_stopper
)

mf,bp models. never tested

In [None]:
INPUT_SHAPE = [train_df.shape[1]]
BATCH_SIZE = 5120

model_mf = tf.keras.Sequential([
    tf.keras.layers.BatchNormalization(input_shape=INPUT_SHAPE),
    tf.keras.layers.Dense(units=256, activation='relu'),
    tf.keras.layers.Dense(units=256, activation='relu'),
    tf.keras.layers.Dense(units=256, activation='relu'),
    tf.keras.layers.Dense(units=len(labels_mf),activation='sigmoid')
])


# Compile model
model_mf.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
    loss='binary_crossentropy',
    metrics=['binary_accuracy', tf.keras.metrics.AUC(multi_label=True, curve='PR', name='average_precision')]
)

hist_mf = model_mf.fit(
    X_train_mf.iloc[:,1:], y_train_mf,
    validation_data=(X_val_mf.iloc[:,1:], y_val_mf),
    batch_size=BATCH_SIZE,
    epochs=200
)

In [None]:
INPUT_SHAPE = [train_df.shape[1]]
BATCH_SIZE = 5120

model_bp = tf.keras.Sequential([
    tf.keras.layers.BatchNormalization(input_shape=INPUT_SHAPE),
    tf.keras.layers.Dense(units=256, activation='relu'),
    tf.keras.layers.Dense(units=256, activation='relu'),
    tf.keras.layers.Dense(units=256, activation='relu'),
    tf.keras.layers.Dense(units=len(labels_bp),activation='sigmoid')
])


# Compile model
model_bp.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
    loss='binary_crossentropy',
    metrics=['binary_accuracy', tf.keras.metrics.AUC(multi_label=True, curve='PR', name='average_precision')]
)

hist_bp = model_bp.fit(
    X_train_bp.iloc[:,1:], y_train_bp,
    validation_data=(X_val_bp.iloc[:,1:], y_val_bp),
    batch_size=BATCH_SIZE,
    epochs=200
)