## Setup

### Configuration

In [1]:
import sys
import os
import yaml
import torch
from google.colab import drive, files

# Mount Google Drive so the project folder is reachable from the Colab runtime
drive.mount('/content/drive')

# Set directories and configuration file (all located under the mounted drive)
PROJECT_DIR = "/content/drive/MyDrive/pfp"
CONFIG_FILE = f"{PROJECT_DIR}/config.yaml"
SRC_DIR = f"{PROJECT_DIR}/src"

# Add the src directory to the Python path.
# The membership check keeps this cell idempotent: re-running it does not
# append a duplicate entry to sys.path.
if SRC_DIR not in sys.path:
    sys.path.append(SRC_DIR)

# Choose device: CUDA when torch reports it as available, otherwise CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Device: {device}")

# Load the configuration data from the YAML file
with open(CONFIG_FILE, 'r') as f:
    config_data = yaml.safe_load(f)

# Import custom modules. These imports come after the sys.path update above;
# the modules are presumably located in SRC_DIR (verify against the project layout).
import data_preprocessing as dp
import utils

# Get parameters: the random seed is read from the config's 'parameters' section
SEED = config_data['parameters']['RANDOM_SEED']

Mounted at /content/drive
Device: cuda


### Libraries

In [2]:
import pandas as pd
import keras
import random
import tensorflow as tf
import numpy as np
import h5py

Random seed set as 42


## Load original data

In [3]:
# Load train data.
# dp.load_original_data is a project helper whose implementation is not visible
# here. With test=False its three return values are unpacked as the training
# annotation table, the protein embeddings, and the protein IDs — presumably
# embeddings and IDs are aligned by position; TODO confirm against the helper.
train_set, train_embeddings, train_ids = dp.load_original_data(config_data, test=False)

## Build custom test and training sets

In [4]:
# Fix the random state before sampling so the draw can be repeated
# (assumes utils.set_random_seed seeds NumPy's global RNG — confirm in utils)
utils.set_random_seed(SEED)

# Draw the custom test set: 1000 protein IDs sampled without replacement
n_examples = 1000
custom_test_ids = np.random.choice(a=train_ids, size=n_examples, replace=False)

Random seed set as 42


In [5]:
# Split the original protein IDs and embeddings into the custom test part and
# the remaining training part; the split itself is done by the project helper
# dp.build_custom_datasets, which returns five values unpacked below.
(
    custom_test_idx,
    custom_test_embeddings,
    custom_train_ids,
    custom_train_idx,
    custom_train_embeddings,
) = dp.build_custom_datasets(train_ids, train_embeddings, custom_test_ids)

In [6]:
# Boolean mask over the annotation table: True where the row's protein ID
# is one of the sampled custom test IDs
in_custom_test = train_set['protein_id'].isin(custom_test_ids)

# Matching rows form the custom test set; all other rows stay in training
custom_test_set = train_set.loc[in_custom_test]
custom_train_set = train_set.loc[~in_custom_test]

In [7]:
# Sanity check: no protein ID may appear in both the training and the test IDs
intersection = set(custom_train_ids).intersection(set(custom_test_ids))
if intersection:
    print("Train and test set overlap.")
else:
    print("Train and test set don't overlap.")

Train and test set don't overlap.


In [8]:
# Ground truth for evaluation with cafaeval: the custom test annotations
# with the 'aspect' column removed
custom_test_ground_truth = custom_test_set.drop(['aspect'], axis='columns')

In [9]:
# Count the proteins in each split once, then report the numbers
n_train_proteins = len(custom_train_set['protein_id'].unique())
n_test_proteins = len(custom_test_ids)
print(f"Number of proteins in the training set: {n_train_proteins}")
print(f"Number of proteins in the custom test set: {n_test_proteins}")
print(f"Total number of proteins: {n_train_proteins + n_test_proteins}")

# The comparison is left as the cell's last expression so the notebook
# displays its boolean result directly below the question
print("Are the custom training set protein IDs in alphabetical order?")
list(custom_train_ids) == sorted(custom_train_ids)

Number of proteins in the training set: 122969
Number of proteins in the custom test set: 1000
Total number of proteins: 123969
Are the custom training set protein IDs in alphabetical order?


True

## Save files for download

In [10]:
# Save custom test set files (written to the current working directory)

# Annotation table as TSV, header included, without the DataFrame index
custom_test_set.to_csv('custom_test_set.tsv', sep='\t', index=False)

# Embeddings as a single HDF5 dataset named 'custom_test_embeddings'
with h5py.File('custom_test_embeddings.h5', 'w') as h5f:
    h5f.create_dataset('custom_test_embeddings', data=custom_test_embeddings)

# Protein IDs, one per line, written as plain strings
np.savetxt('custom_test_ids.tsv', custom_test_ids, delimiter='\t', fmt='%s')

# Indices are written with fmt='%d': np.savetxt's default format ('%.18e')
# would store them as scientific-notation floats (e.g. 1.2e+01).
# Assumes custom_test_idx holds integer positions — confirm in dp.build_custom_datasets.
np.savetxt('custom_test_indices.tsv', custom_test_idx, delimiter='\t', fmt='%d')

# Ground truth for cafaeval: no header row and no index column
custom_test_ground_truth.to_csv('custom_test_ground_truth.tsv', sep='\t', index=False, header=False)

In [11]:
# Save custom train set files (written to the current working directory)

# Annotation table as TSV, header included, without the DataFrame index
custom_train_set.to_csv('custom_train_set.tsv', sep='\t', index=False)

# Embeddings as a single HDF5 dataset named 'custom_train_embeddings'
with h5py.File('custom_train_embeddings.h5', 'w') as h5f:
    h5f.create_dataset('custom_train_embeddings', data=custom_train_embeddings)

# Protein IDs, one per line, written as plain strings
np.savetxt('custom_train_ids.tsv', custom_train_ids, delimiter='\t', fmt='%s')

# Indices are written with fmt='%d': np.savetxt's default format ('%.18e')
# would store them as scientific-notation floats (e.g. 1.2e+01).
# Assumes custom_train_idx holds integer positions — confirm in dp.build_custom_datasets.
np.savetxt('custom_train_indices.tsv', custom_train_idx, delimiter='\t', fmt='%d')