In [None]:
!pip install tensorflow keras
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import Embedding, Conv1D, MaxPooling1D, LSTM, Flatten, Dense, Input
from tensorflow.keras.utils import to_categorical
import h5py




In [None]:
from google.colab import drive

# Folder on the mounted Drive that holds every training file used below.
# The trailing slash matters: file paths are built by plain string concatenation.
data_dir = '/content/drive/MyDrive/BiologicalData/biological_data_pfp/train/'

# Attach Google Drive to the Colab VM so `data_dir` becomes readable.
drive.mount('/content/drive/')


Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


In [None]:
# Load every raw training artefact from `data_dir`:
#   train_set          - annotation table read from train_set.tsv
#   protein_sequences  - {FASTA header text: amino-acid sequence}
#   train_ids_df       - one protein ID per row
#   train_protein2ipr  - InterPro domain table
#   embedding_ids / embeddings_list / prott5_embeddings - ProtT5 vectors and their keys

# Annotation table (tab-separated).
train_set = pd.read_csv(data_dir+'train_set.tsv', delimiter='\t')

# Parse the FASTA file. Chunks of a sequence are collected in a list and joined
# once at the end, instead of growing a string with `+=` for every line.
# The whole header text after '>' is used as the key.
# NOTE(review): assumes headers contain nothing but the protein ID - confirm.
sequence_chunks = {}
with open(data_dir+'train.fasta', 'r') as fasta_file:
    current_protein = None
    for raw_line in fasta_file:
        if raw_line.startswith('>'):
            current_protein = raw_line.strip()[1:]
            sequence_chunks[current_protein] = []
            continue
        chunk = raw_line.strip()
        if not chunk:
            # Blank lines carry no residues; skipping them also avoids a
            # KeyError when the file starts with an empty line.
            continue
        if current_protein is None:
            raise ValueError('train.fasta: sequence data found before the first ">" header')
        sequence_chunks[current_protein].append(chunk)

protein_sequences = {pid: ''.join(chunks) for pid, chunks in sequence_chunks.items()}

# Load Protein IDs

train_ids_df = pd.read_csv(data_dir+'train_ids.txt',header = None)
train_ids_df.columns = ['Protein_ID']

# Load train_protein2ipr dat file

train_protein2ipr = pd.read_csv(data_dir+'train_protein2ipr.dat',header = None, sep='\t')
train_protein2ipr.columns = ['Protein_ID','IPR_ID','desc','db','start','end']

# Load ProtT5 embeddings.
# The HDF5 keys are stored next to the vectors so that row i of
# `prott5_embeddings` can always be traced back to `embedding_ids[i]`;
# without the keys the rows cannot be matched to proteins afterwards.
embedding_ids = []
embeddings_list = []
with h5py.File(data_dir+'train_embeddings.h5', 'r') as f:
    for protein_id in f.keys(): #protein ids
        # Read the full dataset stored under this key into memory.
        embeddings = f[protein_id][:]
        embedding_ids.append(protein_id)
        embeddings_list.append(embeddings)


prott5_embeddings = np.array(embeddings_list)
assert len(embedding_ids) == prott5_embeddings.shape[0], 'embedding IDs and rows are out of sync'
print(prott5_embeddings.shape)

(123969, 1024)


In [None]:
# Label encoding is restricted to the first 10,000 annotation rows: according to
# the author, encoding the full table exhausts Colab's 12GB of RAM (a local
# 32GB machine would cope, but would train on CPU).
subset_train_set = train_set.head(10000)


def _fit_one_hot(annotations, aspect):
    """Fit a LabelEncoder on one aspect's GO terms and one-hot encode them.

    Args:
        annotations: DataFrame with 'aspect' and 'GO_term' columns.
        aspect: value of the 'aspect' column to select.

    Returns:
        (encoder, one_hot): the fitted LabelEncoder and the `to_categorical`
        matrix with one row per selected annotation row.
    """
    go_terms = annotations.loc[annotations['aspect'] == aspect, 'GO_term']
    encoder = LabelEncoder()
    class_indices = encoder.fit_transform(go_terms)
    return encoder, to_categorical(class_indices)


label_encoder_mf, labels_mf = _fit_one_hot(subset_train_set, 'molecular_function')
label_encoder_bp, labels_bp = _fit_one_hot(subset_train_set, 'biological_process')
label_encoder_cc, labels_cc = _fit_one_hot(subset_train_set, 'cellular_component')


In [None]:
# Build the model inputs and targets with exactly one row per protein:
#   X     - ProtT5 embedding of the protein
#   y_mf  - multi-hot vector over the molecular_function GO terms
#   y_bp  - multi-hot vector over the biological_process GO terms
#   y_cc  - multi-hot vector over the cellular_component GO terms
# Row i of X, y_mf, y_bp and y_cc always refers to the same protein.

# Map protein IDs to their corresponding ProtT5 embeddings.
# `embeddings_list` was filled by iterating over the keys of the HDF5 file, so
# reading the keys again in the same way recovers which protein each vector
# belongs to. (Zipping the vectors with train_set['Protein_ID'] would pair them
# with annotation rows instead, which is unrelated to the HDF5 order.)
with h5py.File(data_dir+'train_embeddings.h5', 'r') as f:
    embedding_protein_ids = list(f.keys())
assert len(embedding_protein_ids) == len(embeddings_list), \
    'HDF5 keys and loaded embeddings are out of sync - reload the embeddings'
protein_id_to_embedding = dict(zip(embedding_protein_ids, embeddings_list))


def _multi_hot_by_protein(annotations, aspect, encoder):
    """Collect the GO terms of one aspect into a multi-hot vector per protein.

    Args:
        annotations: DataFrame with 'Protein_ID', 'aspect' and 'GO_term' columns.
        aspect: value of the 'aspect' column to select.
        encoder: LabelEncoder already fitted on the GO terms of this aspect
            for the same `annotations` rows.

    Returns:
        dict mapping protein ID -> float32 vector of length len(encoder.classes_)
        with 1.0 at the index of every GO term annotated for that protein.
    """
    rows = annotations[annotations['aspect'] == aspect]
    class_indices = encoder.transform(rows['GO_term'])
    n_classes = len(encoder.classes_)
    targets = {}
    for pid, class_index in zip(rows['Protein_ID'], class_indices):
        if pid not in targets:
            targets[pid] = np.zeros(n_classes, dtype='float32')
        targets[pid][int(class_index)] = 1.0
    return targets


# Only the rows the label encoders were fitted on are used; GO terms outside
# that subset are unknown to the encoders.
targets_mf = _multi_hot_by_protein(subset_train_set, 'molecular_function', label_encoder_mf)
targets_bp = _multi_hot_by_protein(subset_train_set, 'biological_process', label_encoder_bp)
targets_cc = _multi_hot_by_protein(subset_train_set, 'cellular_component', label_encoder_cc)

# A protein with no annotation in an aspect gets an all-zero target for it.
no_terms_mf = np.zeros(len(label_encoder_mf.classes_), dtype='float32')
no_terms_bp = np.zeros(len(label_encoder_bp.classes_), dtype='float32')
no_terms_cc = np.zeros(len(label_encoder_cc.classes_), dtype='float32')

X = []
y_mf, y_bp, y_cc = [], [], []

for protein_id in subset_train_set['Protein_ID'].drop_duplicates():
    sequence = protein_sequences.get(protein_id, '')  # Get protein sequence from fasta file
    embedding = protein_id_to_embedding.get(protein_id)  # Get ProtT5 embedding

    # Ensure both sequence and embedding are available
    if sequence == '' or embedding is None or len(embedding) == 0:
        continue

    X.append(embedding)  # Use ProtT5 embedding as input
    y_mf.append(targets_mf.get(protein_id, no_terms_mf))
    y_bp.append(targets_bp.get(protein_id, no_terms_bp))
    y_cc.append(targets_cc.get(protein_id, no_terms_cc))

assert len(X) > 0, 'no protein has both a FASTA sequence and a ProtT5 embedding - check the ID formats'

# Convert lists to arrays
X = np.array(X)
y_mf = np.array(y_mf)
y_bp = np.array(y_bp)
y_cc = np.array(y_cc)



In [None]:
# Hold out 20% of the samples for validation. All four arrays are split in one
# call so the rows of X and of the three target matrices stay aligned;
# random_state pins the shuffle so the split is reproducible.
split_arrays = train_test_split(X, y_mf, y_bp, y_cc, test_size=0.2, random_state=42)
(
    X_train, X_val,
    y_train_mf, y_val_mf,
    y_train_bp, y_val_bp,
    y_train_cc, y_val_cc,
) = split_arrays

In [None]:
# Build and train one network with a separate output head per GO sub-ontology.
#
# Why the original version failed at `fit`: the head widths were computed with
# len(np.unique(labels_*)). On a one-hot matrix np.unique returns its distinct
# values (0 and 1), not the classes, so every head had 2 units while the targets
# had one column per GO term. The widths are now read from the target matrices.

# Feature size comes from the training matrix itself, so the (large) global
# `prott5_embeddings` array is neither copied nor reshaped here.
embedding_dim = X_train.shape[-1]

# Conv1D / LSTM expect (batch, time steps, features); every protein is fed as a
# sequence of length 1.
X_train_reshaped = np.expand_dims(X_train, axis=1)
X_val_reshaped = np.expand_dims(X_val, axis=1)
print(X_train_reshaped.shape)

# One output unit per column of the corresponding target matrix.
num_classes_mf = y_train_mf.shape[1]
num_classes_bp = y_train_bp.shape[1]
num_classes_cc = y_train_cc.shape[1]

# Shared trunk.
# NOTE(review): with a single time step, Conv1D(kernel_size=3, padding='same')
# only sees zero padding around the one real step, MaxPooling1D(pool_size=1)
# returns its input unchanged and the LSTM runs for exactly one step. A plain
# Dense stack would probably be simpler; the original layers are kept as-is.
input_layer = Input(shape=(1, embedding_dim))
conv1d_layer = Conv1D(filters=64, kernel_size=3, activation='relu', padding='same')(input_layer)
maxpooling_layer = MaxPooling1D(pool_size=1)(conv1d_layer)
lstm_layer = LSTM(units=100, return_sequences=True)(maxpooling_layer)
flatten_layer = Flatten()(lstm_layer)

# Define output layers for each sub-ontology.
# Sigmoid scores every GO term independently, which is valid for one-hot and
# for multi-hot target rows alike (softmax would force the scores of a head to
# sum to 1 and so could not represent several terms for the same protein).
output_layer_mf = Dense(units=num_classes_mf, activation='sigmoid', name='output_mf')(flatten_layer)
output_layer_bp = Dense(units=num_classes_bp, activation='sigmoid', name='output_bp')(flatten_layer)
output_layer_cc = Dense(units=num_classes_cc, activation='sigmoid', name='output_cc')(flatten_layer)

# Create the model with multiple outputs
# Probably will need to change this and create a model for each sub-ontology
model = Model(inputs=input_layer, outputs=[output_layer_mf, output_layer_bp, output_layer_cc])

# Loss and metrics are keyed by output-layer name so each head is configured
# explicitly.
output_names = ['output_mf', 'output_bp', 'output_cc']
model.compile(
    optimizer='adam',
    loss={name: 'binary_crossentropy' for name in output_names},
    metrics={name: ['accuracy'] for name in output_names}
)

# Train, monitoring the held-out split that was created earlier.
model.fit(
    X_train_reshaped,
    {'output_mf': y_train_mf, 'output_bp': y_train_bp, 'output_cc': y_train_cc},
    validation_data=(
        X_val_reshaped,
        {'output_mf': y_val_mf, 'output_bp': y_val_bp, 'output_cc': y_val_cc},
    ),
    epochs=10,
    batch_size=32
)


(123969, 1024)
(123969, 1, 1024)
(900, 1, 1024)
Epoch 1/10


ValueError: ignored