In [2]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MultiLabelBinarizer
from collections import defaultdict

# Residue alphabet: the 20 standard amino acids followed by 'X'.
# Each letter is mapped to its position in this list (A -> 0, ..., X -> 20).
# NOTE(review): index 0 ('A') is the same value the padding helpers in this
# file use as filler, so padded positions are indistinguishable from alanine.
amino_acids = [residue for residue in "ACDEFGHIKLMNPQRSTVWYX"]
amino_acid_to_index = dict(zip(amino_acids, range(len(amino_acids))))

# Function to read FASTA file
def read_fasta(file_path):
    """Parse a FASTA file into a mapping of record ID -> sequence string.

    The record ID is the first whitespace-delimited token of a header line
    with the leading '>' removed. Sequence lines that follow a header are
    concatenated (surrounding whitespace stripped). If the same ID appears
    more than once, its sequence lines are appended to the existing entry.

    Blank lines are ignored, and any sequence text that appears before the
    first header is skipped instead of being stored under the key ``None``.

    Args:
        file_path: Path of the FASTA file to read.

    Returns:
        A ``defaultdict(str)`` keyed by record ID, in order of first
        appearance. Headers that are never followed by sequence text do not
        produce an entry.
    """
    # Collect pieces per record and join once at the end instead of growing
    # a string with repeated "+=".
    pieces_by_id = defaultdict(list)
    current_id = None
    with open(file_path, 'r') as file:
        for raw_line in file:
            line = raw_line.strip()
            if not line:
                continue
            if line.startswith(">"):
                current_id = line.split()[0][1:]
            elif current_id is not None:
                pieces_by_id[current_id].append(line)
            # else: text before the first header has no record to belong to.

    sequences = defaultdict(str)
    for record_id, pieces in pieces_by_id.items():
        sequences[record_id] = "".join(pieces)
    return sequences

# Function to convert sequence to indices
def convert_sequence_to_indices(sequence):
    """Translate a residue string into a list of integer indices.

    Each character is looked up in the module-level ``amino_acid_to_index``
    mapping. Characters that are not in the mapping are dropped, so the
    result can be shorter than the input.
    """
    indices = []
    for residue in sequence:
        if residue in amino_acid_to_index:
            indices.append(amino_acid_to_index[residue])
    return indices

# Function to pad sequences
def pad_sequences_manually(sequences, maxlen):
    """Right-pad (and, if needed, truncate) index lists to a fixed length.

    Args:
        sequences: Iterable of lists of integer indices.
        maxlen: Target length of every row in the result.

    Returns:
        A NumPy array with one row per input sequence. Rows shorter than
        ``maxlen`` are filled with trailing zeros; rows longer than
        ``maxlen`` are cut to their first ``maxlen`` items so that the
        result is always rectangular.
    """
    # [0] * negative is [], so over-long rows receive no padding after the cut.
    padded_sequences = [seq[:maxlen] + [0] * (maxlen - len(seq)) for seq in sequences]
    return np.array(padded_sequences)

# Read sequences
train_sequences_path = "/kaggle/input/cafa-5-protein-function-prediction/Train/train_sequences.fasta"
train_sequences = read_fasta(train_sequences_path)
# Keep only the first whitespace-delimited token of every ID.
corrected_train_sequences = {protein_id.split()[0]: sequence for protein_id, sequence in train_sequences.items()}

# Read terms: the file's own header row is skipped and replaced by fixed column names.
train_terms_path = "/kaggle/input/cafa-5-protein-function-prediction/Train/train_terms.tsv"
train_terms = pd.read_csv(train_terms_path, sep='\t', names=['protein', 'term', 'aspect'], skiprows=1)
# One list of terms per protein, indexed by protein ID.
train_terms_grouped = train_terms.groupby('protein')['term'].apply(list)

# Identify proteins with both sequences and terms
proteins_with_both_corrected = set(corrected_train_sequences.keys()) & set(train_terms_grouped.keys())

# Select a subset. Set iteration order for strings changes between kernel
# restarts, so sort the IDs first to make the chosen 500 proteins reproducible.
subset_protein_ids_corrected = sorted(proteins_with_both_corrected)[:500]
if not subset_protein_ids_corrected:
    raise ValueError("No proteins have both a sequence and GO term annotations.")
subset_sequences_corrected = [corrected_train_sequences[protein_id] for protein_id in subset_protein_ids_corrected]
subset_terms_corrected = [train_terms_grouped[protein_id] for protein_id in subset_protein_ids_corrected]

# Convert and pad sequences to the length of the longest sequence in the subset
indexed_subset_sequences_corrected = [convert_sequence_to_indices(seq) for seq in subset_sequences_corrected]
max_length_corrected = max(len(seq) for seq in indexed_subset_sequences_corrected)
padded_subset_sequences_corrected = pad_sequences_manually(indexed_subset_sequences_corrected, maxlen=max_length_corrected)

# Binarize labels: one column per distinct term seen in the subset
mlb = MultiLabelBinarizer()
binarized_labels_corrected = mlb.fit_transform(subset_terms_corrected)


In [15]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense
from tensorflow.keras.layers import Dropout

# Stacked-LSTM classifier: embedding -> three LSTM layers (the first two
# return full sequences and are each followed by dropout) -> sigmoid output.
# NOTE(review): vocab_size, embedding_dim and output_dim are not defined in
# any cell visible in this notebook; they must exist in the kernel before
# this cell runs — confirm where they come from.
model = Sequential()
model.add(Embedding(vocab_size, embedding_dim, input_length=max_length_corrected))
model.add(LSTM(128, return_sequences=True))
model.add(Dropout(0.2))
model.add(LSTM(64, return_sequences=True))
model.add(Dropout(0.2))
model.add(LSTM(32))
model.add(Dense(output_dim, activation='sigmoid'))


In [16]:
from tensorflow.keras.losses import BinaryCrossentropy

# Compile with label-smoothed binary cross-entropy.
# NOTE(review): a later cell in this notebook calls model.compile() again
# with plain 'binary_crossentropy' before fitting, which replaces this
# configuration — confirm which loss is intended.
model.compile(
    optimizer='adam',
    loss=BinaryCrossentropy(label_smoothing=0.05),
    metrics=['accuracy'],
)



In [17]:
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import LearningRateScheduler

def lr_schedule(epoch):
    """Step-decay learning rate: 0.001, divided by 10 after every 10 epochs.

    ``epoch`` is the zero-based epoch index, so epochs 0-9 use 0.001,
    epochs 10-19 use 0.0001, and so on.

    The signature intentionally takes only ``epoch``; presumably Keras'
    LearningRateScheduler would otherwise pass the current learning rate as
    a second argument — verify before adding parameters.
    """
    completed_decades = epoch // 10
    decay_factor = 0.1 ** completed_decades
    return 0.001 * decay_factor

# Re-compile with an explicit Adam optimizer and plain binary cross-entropy,
# then train on the padded subset.
# NOTE(review): this compile replaces any loss/optimizer configured by an
# earlier compile call (including a label-smoothed loss) — confirm intent.
# NOTE(review): with epochs=10, lr_schedule returns 0.001 for every epoch
# (epoch // 10 is always 0), so the scheduler never lowers the rate.
model.compile(
    optimizer=Adam(learning_rate=0.001),
    loss='binary_crossentropy',
    metrics=['accuracy'],
)
model.fit(
    padded_subset_sequences_corrected,
    binarized_labels_corrected,
    epochs=10,
    batch_size=32,
    callbacks=[LearningRateScheduler(lr_schedule)],
)






Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.callbacks.History at 0x7f58109c7790>

In [18]:
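# Save the entire model (architecture and weights) to a single HDF5 file.
# NOTE(review): currently disabled. The inference cell below loads a file with
# this name from a Kaggle dataset, so presumably this line was run once to
# export the model — confirm, then either re-enable or remove this cell.
# model.save('protein_function_model.h5')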

In [1]:
from collections import defaultdict
import numpy as np
from tensorflow.keras.models import load_model

# Residue alphabet used to encode sequences for inference: the 20 standard
# amino acids followed by 'X', each mapped to its position (A -> 0, ..., X -> 20).
# NOTE(review): index 0 ('A') equals the filler value used when padding below.
amino_acids = [residue for residue in "ACDEFGHIKLMNPQRSTVWYX"]
amino_acid_to_index = dict(zip(amino_acids, range(len(amino_acids))))

# Function to read FASTA file
def read_fasta(file_path):
    """Parse a FASTA file into a mapping of record ID -> sequence string.

    The record ID is the first whitespace-delimited token of a header line
    with the leading '>' removed. Sequence lines that follow a header are
    concatenated (surrounding whitespace stripped). If the same ID appears
    more than once, its sequence lines are appended to the existing entry.

    Blank lines are ignored, and any sequence text that appears before the
    first header is skipped instead of being stored under the key ``None``.

    Args:
        file_path: Path of the FASTA file to read.

    Returns:
        A ``defaultdict(str)`` keyed by record ID, in order of first
        appearance. Headers that are never followed by sequence text do not
        produce an entry.
    """
    # Collect pieces per record and join once at the end instead of growing
    # a string with repeated "+=".
    pieces_by_id = defaultdict(list)
    current_id = None
    with open(file_path, 'r') as file:
        for raw_line in file:
            line = raw_line.strip()
            if not line:
                continue
            if line.startswith(">"):
                current_id = line.split()[0][1:]
            elif current_id is not None:
                pieces_by_id[current_id].append(line)
            # else: text before the first header has no record to belong to.

    sequences = defaultdict(str)
    for record_id, pieces in pieces_by_id.items():
        sequences[record_id] = "".join(pieces)
    return sequences

# Function to truncate or pad sequences to a specific length
def truncate_or_pad_sequences(sequences, maxlen):
    """Force every index list to exactly ``maxlen`` items.

    Sequences longer than ``maxlen`` keep only their first ``maxlen``
    items; shorter (or equal-length) ones are extended with trailing zeros.
    The adjusted rows are returned as a NumPy array.
    """
    adjusted = []
    for seq in sequences:
        if len(seq) > maxlen:
            adjusted.append(seq[:maxlen])
        else:
            missing = maxlen - len(seq)
            adjusted.append(seq + [0] * missing)
    return np.array(adjusted)

# Read test sequences
test_sequences_raw = read_fasta('/kaggle/input/cafa-5-protein-function-prediction/Test (Targets)/testsuperset.fasta')

# Define the size of the subset (e.g., 500 proteins)
subset_size = 500

# Select a subset of test sequences (first `subset_size` records in file order)
subset_test_sequences = list(test_sequences_raw.values())[:subset_size]

# Convert the test sequences to integer indices; residues outside the alphabet are dropped
indexed_subset_test_sequences = [[amino_acid_to_index[aa] for aa in seq if aa in amino_acid_to_index] for seq in subset_test_sequences]

# Load the trained model (compile=False: only inference is needed here)
model_path = '/kaggle/input/cafa-model/protein_function_model.h5'
trained_model = load_model(model_path,compile=False)

# Truncate or pad the test sequences to the sequence length the model was
# built with. Read it from the loaded model so it cannot drift from the
# training setup; 5000 is kept as the fallback when the model does not
# declare a fixed input length.
max_length_model = 5000
declared_input_shape = getattr(trained_model, 'input_shape', None)
if (
    isinstance(declared_input_shape, tuple)
    and len(declared_input_shape) == 2
    and isinstance(declared_input_shape[1], int)
):
    max_length_model = declared_input_shape[1]
padded_subset_test_sequences = truncate_or_pad_sequences(indexed_subset_test_sequences, maxlen=max_length_model)

# Make predictions on the preprocessed test data
predictions = trained_model.predict(padded_subset_test_sequences)

# Display the shape of the predictions
print(predictions.shape)


caused by: ['/opt/conda/lib/python3.10/site-packages/tensorflow_io/python/ops/libtensorflow_io_plugins.so: undefined symbol: _ZN3tsl6StatusC1EN10tensorflow5error4CodeESt17basic_string_viewIcSt11char_traitsIcEENS_14SourceLocationE']
caused by: ['/opt/conda/lib/python3.10/site-packages/tensorflow_io/python/ops/libtensorflow_io.so: undefined symbol: _ZTVN10tensorflow13GcsFileSystemE']


(500, 31466)


In [4]:
import pandas as pd

# IDs of the proteins that were scored, in the same order as the prediction rows
subset_test_protein_ids = list(test_sequences_raw.keys())[:subset_size]

train_terms = pd.read_csv('/kaggle/input/cafa-5-protein-function-prediction/Train/train_terms.tsv', sep='\t')
# Label the prediction columns in sorted term order. The training cell builds
# its label matrix with MultiLabelBinarizer, whose columns follow the sorted
# class labels, whereas Series.unique() returns order of first appearance and
# would attach scores to the wrong GO terms.
# NOTE(review): assumes the loaded model was trained on labels binarized over
# all terms in train_terms.tsv — confirm against the training run.
unique_go_terms = sorted(train_terms['term'].unique())
if len(unique_go_terms) != predictions.shape[1]:
    raise ValueError(
        f"Model outputs {predictions.shape[1]} scores per protein but "
        f"train_terms.tsv has {len(unique_go_terms)} distinct terms."
    )

# Convert predictions to DataFrame (rows: proteins, columns: GO terms)
predictions_df = pd.DataFrame(predictions, columns=unique_go_terms, index=subset_test_protein_ids)

# Melt DataFrame to match submission format: one (protein, term, confidence) row per score
submission_df = (
    predictions_df
    .reset_index()
    .melt(id_vars='index', var_name='term', value_name='confidence')
    .rename(columns={'index': 'protein'})
)

# Save submission file
# NOTE(review): the file is written with a header row; check the competition's
# expected format in case a header-less file is required.
submission_df.to_csv('submission.tsv', sep='\t', index=False)
