In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
!kaggle kernels output kwahnguyen/baseline-protein-prediction -p ./cafa6-test/

In [None]:
from kaggle_secrets import UserSecretsClient
import os 

# Read the Kaggle API credentials from the notebook's secret store and expose
# them as environment variables (presumably for the `kaggle` CLI calls in the
# neighbouring cells — those cells must run after this one to see them).
user_secrets = UserSecretsClient()
os.environ["KAGGLE_KEY"] = user_secrets.get_secret("KAGGLE_KEY")
os.environ["KAGGLE_USERNAME"] = user_secrets.get_secret("KAGGLE_USERNAME")

In [None]:
!kaggle competitions download -c cafa-6-protein-function-prediction

Downloading cafa-6-protein-function-prediction.zip to /kaggle/working
  0%|                                               | 0.00/91.3M [00:00<?, ?B/s]
100%|██████████████████████████████████████| 91.3M/91.3M [00:00<00:00, 1.39GB/s]


In [None]:
# !unzip /kaggle/working/cafa-6-protein-function-prediction.zip -d /kaggle/working

Archive:  /kaggle/working/cafa-6-protein-function-prediction.zip
  inflating: /kaggle/working/IA.tsv  
  inflating: /kaggle/working/Test/testsuperset-taxon-list.tsv  
  inflating: /kaggle/working/Test/testsuperset.fasta  
  inflating: /kaggle/working/Train/go-basic.obo  
  inflating: /kaggle/working/Train/train_sequences.fasta  
  inflating: /kaggle/working/Train/train_taxonomy.tsv  
  inflating: /kaggle/working/Train/train_terms.tsv  
  inflating: /kaggle/working/sample_submission.tsv  


In [None]:
# !pip install biopython

Collecting biopython
  Downloading biopython-1.86-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl.metadata (13 kB)
Downloading biopython-1.86-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl (3.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.2/3.2 MB[0m [31m66.0 MB/s[0m eta [36m0:00:00[0m:00:01[0m
[?25hInstalling collected packages: biopython
Successfully installed biopython-1.86


In [None]:
import zipfile
import os

# Competition archive; the path is relative to the current working directory.
zip_path = "cafa-6-protein-function-prediction.zip"
# Folder that receives the unpacked IA.tsv, Train/, Test/ and sample submission.
extract_dir = "./cafa6_data"

# Unpack everything; the context manager closes the archive afterwards.
with zipfile.ZipFile(zip_path, 'r') as zip_ref:
    zip_ref.extractall(extract_dir)

# Display the top-level entries as a quick sanity check of the extraction.
os.listdir(extract_dir)

['IA.tsv', 'Train', 'Test', 'sample_submission.tsv']

In [None]:
import os

# NOTE(review): this relative path differs from the `./cafa6_data` folder the
# archive is extracted into and from the `/kaggle/working/cafa6_data/...` paths
# used later in the notebook — confirm which location is intended.
dir_path = '../data/Train'
os.listdir(dir_path)

['train_terms.tsv',
 'train_taxonomy.tsv',
 'go-basic.obo',
 'train_sequences.fasta']

In [8]:
# Calculating Naive Frequency given a sequence
def naive_freq(sequence):
    """
    Relative frequency of each of the 20 standard amino acids in a sequence.

    Input: an amino acid sequence (one-letter codes) of variable length
    Output: a flat list of 20 values ordered as 'ARNDCQEGHILKMFPSTWYV'; each
        value is that residue's count divided by the full sequence length
        (unknown residues are skipped in the counts but still counted in the
        length). An empty sequence yields the untouched all-zero count list.
    """
    alphabet = 'ARNDCQEGHILKMFPSTWYV'
    position_of = dict(zip(alphabet, range(len(alphabet))))

    counts = [0] * 20

    for residue in sequence:
        slot = position_of.get(residue)
        if slot is None:
            # Anything outside the 20-letter alphabet is reported and ignored
            print(f"Warning: Skipping unknown amino acid {residue}")
            continue
        counts[slot] += 1

    # Normalise by the raw sequence length; nothing to divide for an empty input
    n_residues = len(sequence)
    if n_residues <= 0:
        return counts
    return [c / n_residues for c in counts]

In [9]:
# Three-class groupings of the 20 standard amino acids, one grouping per
# physicochemical property. The keys 1/2/3 are the class labels returned by
# get_group() and consumed by aa_ctd(); within each property the three sets are
# disjoint and together contain all 20 residues.
# NOTE(review): some assignments and labels differ from the commonly published
# CTD tables (e.g. "A" listed as hydrophobic, "H" listed as positively charged,
# the Solvent class 2/3 labels) — confirm they are intentional.
CTD_GROUPS_BY_LIST = {
    "Hydrophobicity": {
        1: {"A", "V", "L", "I", "M", "F", "W", "C"},               # Hydrophobic
        2: {"G", "H", "Y", "P", "T", "S"},          # Neutral
        3: {"R", "K", "Q", "E", "D", "N"}           # Polar
    },

    "Charge": {
        1: {"D", "E"},      # Negative
        2: {"A", "G", "I", "L", "M", "F", "P", "Q", "S", "T", "W", "Y", "V", "N", "C"}, # Neutral
        3: {"K", "R", "H"}     # Positive
    },

    "VanDerWaals": {
        1: {"A", "G", "S", "C"},  # Small volume
        2: {"T", "D", "P", "N", "V"},   # Medium volume
        3: {"E", "Q", "L", "I", "F", "Y", "M", "H", "K", "R", "W"}   # Large volume
    },

    "Polarity": {
        1: {"L", "A", "W", "F", "C", "M", "V", "I", "Y"},   # Low polarity
        2: {"P", "T", "S", "G", "H"},    # Medium polarity
        3: {"Q", "N", "E", "D", "K", "R"}   # High polarity
    },

    "Polarizability": {
        1: {"G", "A", "S", "D", "C"},   # Small
        2: {"T", "P", "N", "H", "E", "Q", "K"},   # Medium
        3: {"M", "I", "L", "V", "F", "Y", "W", "R"}   # presumably Large (label was missing) — TODO confirm
    },

    "SecondStructure": {
        1: {"E", "A", "L", "M", "Q", "K", "R", "H"},  # Helix
        2: {"V", "I", "Y", "C", "W", "F", "T"},  # Strand
        3: {"G", "N", "P", "S", "D"}  # Coil
    },

    "Solvent": {
        1: {"A", "L", "F", "C", "G", "I", "V", "W"},  # Buried
        2: {"R", "K", "Q", "E", "D", "N"},  # Intermediate (NOTE(review): often tabulated as "exposed" — confirm)
        3: {"M", "S", "P", "T", "H", "Y"}  # Exposed (NOTE(review): often tabulated as "intermediate" — confirm)
    }
}

def get_group(aa, property_map):
    """Return the label of the first class whose residue set contains `aa`, or None if no class does."""
    return next(
        (label for label, members in property_map.items() if aa in members),
        None,
    )

In [10]:
import math

def aa_ctd(sequence, physicochem):
    """
    Compute the 21 Composition/Transition/Distribution (CTD) descriptors of a
    sequence for one physicochemical property.

    Each residue is mapped to class 1, 2 or 3 through
    CTD_GROUPS_BY_LIST[physicochem]; residues that belong to no class are
    dropped first, so every fraction below is relative to the number of
    *recognised* residues.

    - Composition (C), 3 values: fraction of residues in class 1, 2 and 3.
    - Transition (T), 3 values: fraction of adjacent residue pairs that switch
      between classes 1-2, 1-3 and 2-3 (in either direction).
    - Distribution (D), 15 values: for each class, the relative position
      (1-based index / length) of the occurrence at which the first, 25%, 50%,
      75% and 100% of that class's residues have been seen; five zeros for a
      class that never occurs.

    Input:
        sequence: an amino acid sequence of variable length
        physicochem: a key of CTD_GROUPS_BY_LIST, e.g. "Hydrophobicity"
    Output: a list of 21 floats ordered C (3) + T (3) + D (15); all zeros when
        the sequence contains no recognised residue
    Raises: KeyError if `physicochem` is not a key of CTD_GROUPS_BY_LIST
    """
    property_map = CTD_GROUPS_BY_LIST[physicochem]

    # Translate residues to class labels, skipping anything unclassified
    groups = []
    for aa in sequence:
        g = get_group(aa, property_map)
        if g is not None:
            groups.append(g)

    L = len(groups)
    if L == 0:
        return [0.0] * 21  # safe fallback

    # Composition: share of each class
    N = {1: 0, 2: 0, 3: 0}
    for g in groups:
        N[g] += 1

    composition = [N[1]/L, N[2]/L, N[3]/L]

    # Transitions: count class changes between neighbours, ignoring direction
    T = {(1, 2): 0, (1, 3): 0, (2, 3): 0}
    for g1, g2 in zip(groups, groups[1:]):
        if g1 != g2:
            T[(min(g1, g2), max(g1, g2))] += 1

    # A single recognised residue has no neighbour pairs; report zero
    # transitions instead of dividing by zero.
    denom = L - 1
    if denom > 0:
        transition = [T[(1, 2)]/denom, T[(1, 3)]/denom, T[(2, 3)]/denom]
    else:
        transition = [0.0, 0.0, 0.0]

    # Distribution: 1-based positions of every class member
    positions = {1: [], 2: [], 3: []}
    for i, g in enumerate(groups):
        positions[g].append(i + 1)

    P_k = [0.0, 0.25, 0.5, 0.75, 1.0]
    distribution = []

    for g in [1, 2, 3]:
        pos_list = positions[g]
        Ng = len(pos_list)
        if Ng == 0:
            distribution.extend([0.0]*5)
            continue
        for pk in P_k:
            if pk == 0:
                idx = 0  # the very first occurrence of the class
            else:
                # occurrence that completes the pk fraction of the class
                idx = math.ceil(Ng * pk) - 1
            distribution.append(pos_list[idx] / L)

    return composition + transition + distribution

# Smoke test on a toy sequence: the result should hold 21 values
# (3 composition + 3 transition + 15 distribution).
sequence = "AEAAAEAEEAAAAAEAEEEAAEEAEEEAAE"
ctd = aa_ctd(sequence, "Hydrophobicity")
print(len(ctd))

21


In [11]:
def dipeptide_composition(seq):
    """Compute dipeptide composition.

    Input: an amino acid sequence (case-insensitive) of any length
    Output: a list of 400 floats, one per ordered pair of the 20 standard
        residues, ordered as [a+b for a in AA for b in AA] with
        AA = "ACDEFGHIKLMNPQRSTVWY". Each value is the number of times that
        pair occurs at adjacent positions divided by len(seq) - 1 (the number
        of adjacent pairs). Pairs containing a non-standard residue are not
        counted but still contribute to the denominator. Sequences shorter
        than two residues yield all zeros.
    """
    AA = "ACDEFGHIKLMNPQRSTVWY"
    n_aa = len(AA)
    rank = {aa: i for i, aa in enumerate(AA)}

    seq = seq.upper()
    length = len(seq)-1 if len(seq)>1 else 1

    # One pass over the adjacent pairs instead of rescanning the sequence for
    # each of the 400 dipeptides; slot a*20+b matches the a+b ordering above.
    counts = [0] * (n_aa * n_aa)
    for left, right in zip(seq, seq[1:]):
        i = rank.get(left)
        j = rank.get(right)
        if i is not None and j is not None:
            counts[i * n_aa + j] += 1

    return [c / length for c in counts]

In [12]:
def extract_accession(header):
    """Pull the accession out of a FASTA header such as >sp|A0JP26|POTB3_HUMAN.

    Leading '>' characters are removed; when the remainder has at least two
    '|'-separated fields the second one is returned, otherwise the whole
    remainder is returned unchanged.
    """
    cleaned = header.lstrip('>')
    fields = cleaned.split('|')
    return fields[1] if len(fields) > 1 else cleaned

In [13]:
from Bio import SeqIO

# Property names handed to aa_ctd() by load_fasta_features(), in feature order;
# each one is used as a key into CTD_GROUPS_BY_LIST.
bio_properties = ["Hydrophobicity", "Charge", "VanDerWaals", "Polarity",
                  "Polarizability", "SecondStructure", "Solvent"]

def load_fasta_features(fasta_path):
    """
    Read every record of a FASTA file and turn each sequence into one
    handcrafted feature row.

    For each record the row is the concatenation of naive_freq(seq), then
    aa_ctd(seq, prop) for every entry of bio_properties in order, then
    dipeptide_composition(seq). The sequence is upper-cased first and the
    record id is reduced to its accession with extract_accession().

    Returns (ids, X): the accessions in file order and the matrix obtained by
    stacking the feature rows with np.vstack.
    """
    accessions = []
    rows = []

    for rec in SeqIO.parse(fasta_path, "fasta"):
        accession = extract_accession(rec.id)
        residues = str(rec.seq).upper()

        # --- Feature extraction: frequency, CTD per property, dipeptides ---
        row = list(naive_freq(residues))
        for prop in bio_properties:
            row += aa_ctd(residues, prop)
        row += dipeptide_composition(residues)

        accessions.append(accession)
        rows.append(row)

    return accessions, np.vstack(rows)

In [14]:
import pandas as pd

# train_terms.tsv starts with a header row. Passing `names=` alone makes pandas
# treat that row as data, so header=0 is given to consume it and let the names
# below replace it.
# NOTE(review): assumes the file does ship with a header line — confirm.
df = pd.read_csv("/kaggle/working/cafa6_data/Train/train_terms.tsv", sep="\t",
                 header=0, names=["EntryID", "GO", "Ont"])

# Normalise stray whitespace so ids and terms compare cleanly later on.
df["EntryID"] = df["EntryID"].str.strip()
df["GO"] = df["GO"].str.strip()
df["Ont"] = df["Ont"].str.strip()

# protein id -> list of GO terms, one dictionary per ontology
labels_MF = {}
labels_BP = {}
labels_CC = {}

# Aspect code used in the file -> dictionary that collects its annotations
labels_by_aspect = {"F": labels_MF, "P": labels_BP, "C": labels_CC}

for entry, go, ont in zip(df["EntryID"], df["GO"], df["Ont"]):
    target = labels_by_aspect.get(ont)
    if target is not None:  # rows with any other aspect value are ignored
        target.setdefault(entry, []).append(go)


In [15]:
import numpy as np

def build_Y(train_ids, label_dict):
    """Build a binary protein-by-term label matrix for one ontology.

    The column vocabulary is every GO term that appears anywhere in
    `label_dict`, sorted. Row i belongs to train_ids[i]; an id that is missing
    from `label_dict` keeps an all-zero row.

    Returns (Y, term_to_index, all_terms): the uint8 matrix, the term -> column
    mapping, and the sorted term list.
    """
    vocabulary = set()
    for annotated in label_dict.values():
        vocabulary.update(annotated)
    all_terms = sorted(vocabulary)
    term_to_index = dict(zip(all_terms, range(len(all_terms))))

    Y = np.zeros((len(train_ids), len(all_terms)), dtype=np.uint8)

    # Switch on the cell of every annotated (protein, term) pair
    for row, pid in enumerate(train_ids):
        for go in label_dict.get(pid, ()):
            Y[row, term_to_index[go]] = 1

    return Y, term_to_index, all_terms

In [16]:
import numpy as np

# Earlier variant, kept for reference: ids and features read from training_data.npz.
# train_ids = np.load("/kaggle/input/training-data/training_data.npz")["Train_ID"]
# X_train = np.load("/kaggle/input/training-data/training_data.npz")["X_train"]
# Current inputs: ids and embedding matrix from the cafa-6-t5-embeddings dataset.
# presumably row i of X_train belongs to train_ids[i] — TODO confirm
train_ids = np.load("/kaggle/input/cafa-6-t5-embeddings/train_ids.npy")
X_train = np.load("/kaggle/input/cafa-6-t5-embeddings/train_embeds.npy")

In [17]:
# One label matrix per ontology (MF, BP, CC), rows aligned with train_ids;
# each call also returns that ontology's term -> column map and sorted term list.
Y_MF, mf_term_to_idx, mf_terms = build_Y(train_ids, labels_MF)
Y_BP, bp_term_to_idx, bp_terms = build_Y(train_ids, labels_BP)
Y_CC, cc_term_to_idx, cc_terms = build_Y(train_ids, labels_CC)

In [18]:
from sklearn.decomposition import PCA
from sklearn.neighbors import KNeighborsClassifier

# Reduce X_train to 100 principal components.
# (The earlier "567" figure was the width of the handcrafted feature vector:
# 20 + 7*21 + 400; X_train now comes from the embedding file.)
pca = PCA(n_components=100)
X_reduced = pca.fit_transform(X_train)

# Molecular-function classifier: 5 nearest neighbours, Euclidean distance,
# votes weighted by inverse distance, all CPU cores.
knn = KNeighborsClassifier(
    n_neighbors=5,
    metric="euclidean",
    weights="distance",
    n_jobs=-1
)

print("Training PCA + KNN...")
knn.fit(X_reduced, Y_MF)

# Sanity check: predict the first 10 training rows and show the output shape
pred = knn.predict(X_reduced[:10])
print(pred.shape)


Training PCA + KNN...
(10, 6616)


In [19]:
# Reuse the projection produced by the PCA that was already fitted on X_train.
# Fitting a second PCA here (no random_state is set) can give a slightly
# different basis when the randomized solver is selected, and it also rebinds
# `pca`, so later `pca.transform(X_test)` calls would no longer match the space
# the MF classifier `knn` was trained in.
X_train_reduced = X_reduced

# BP
knn_bp = KNeighborsClassifier(n_neighbors=5, metric="euclidean", weights="distance", n_jobs=-1)
knn_bp.fit(X_train_reduced, Y_BP)

# CC
knn_cc = KNeighborsClassifier(n_neighbors=5, metric="euclidean", weights="distance", n_jobs=-1)
knn_cc.fit(X_train_reduced, Y_CC)


In [20]:
import numpy as np 

# Earlier variant, kept for reference: test features read from test_data.npz.
# test_data = np.load("/kaggle/input/test-set/test_data.npz")
# print(test_data)
# Current inputs: test embedding matrix and ids from the cafa-6-t5-embeddings dataset.
# presumably row i of X_test belongs to test_ids[i] — TODO confirm
X_test = np.load("/kaggle/input/cafa-6-t5-embeddings/test_embeds.npy")
test_ids = np.load("/kaggle/input/cafa-6-t5-embeddings/test_ids.npy")

In [21]:
print(Y_MF.shape)
print(Y_BP.shape)
print(Y_CC.shape)
print(len(mf_terms))
print(len(bp_terms))
print(len(cc_terms))
print(test_ids[:5])

(82404, 6616)
(82404, 16858)
(82404, 2651)
6616
16858
2651
['A0A0C5B5G6' 'A0A1B0GTW7' 'A0JNW5' 'A0JP26' 'A0PK11']


In [22]:
# Project the test embeddings with the PCA object currently bound to `pca`
# (no re-fitting on test data); the printed shape is (n_test_proteins, 100).
X_test_reduced = pca.transform(X_test)
print(X_test_reduced.shape)

(224309, 100)


In [23]:
import math
import pandas as pd
from tqdm.notebook import tqdm
import numpy as np

# --- 1. GO Term Loading and Mapping ---
def load_go_terms(obo_path):
    """Parses GO.obo to map GO IDs to their root ontology (MFO, BPO, CCO).

    Only `[Term]` stanzas are read. A stanza is recorded once it has both an
    `id:` line and a `namespace:` line naming one of the three ontologies. A
    stanza ends at a blank line, at the next `[...]` stanza header, or at the
    end of the file, so the final term is kept even when the file does not end
    with a blank line.

    Args:
        obo_path: path to an OBO file such as go-basic.obo.

    Returns:
        dict mapping GO id (e.g. "GO:0008150") to 'BPO', 'MFO' or 'CCO'.

    Raises:
        OSError: if the file cannot be opened.
    """
    # Checked in this order; the first namespace marker found wins.
    namespace_to_root = (
        ('biological_process', 'BPO'),
        ('molecular_function', 'MFO'),
        ('cellular_component', 'CCO'),
    )
    go_terms = {}
    current_term = None

    def _store(term):
        # Record a finished stanza only when both id and root were seen
        if term and term.get('id') and term.get('root'):
            go_terms[term['id']] = term['root']

    # Read the GO.obo file
    with open(obo_path, 'r', encoding='utf-8') as f:
        for raw_line in f:
            line = raw_line.strip()
            if line.startswith('['):
                # A new stanza header closes the previous stanza even when the
                # separating blank line is missing; non-Term stanzas are skipped.
                _store(current_term)
                current_term = {} if line.startswith('[Term]') else None
            elif current_term is None:
                continue
            elif line == '':
                _store(current_term)
                current_term = None
            elif line.startswith('id:'):
                current_term['id'] = line.split(':', 1)[1].strip()
            elif line.startswith('namespace:'):
                namespace = line.split(':', 1)[1]
                for marker, root in namespace_to_root:
                    if marker in namespace:
                        current_term['root'] = root
                        break

    # Flush the last stanza when the file ends without a trailing blank line
    _store(current_term)
    return go_terms

# Load the GO terms dictionary
# NOTE: Update the path to 'go-basic.obo' if needed for your environment.
GO_OBO_PATH = '/kaggle/working/cafa6_data/Train/go-basic.obo' 
go_terms_dict = load_go_terms(GO_OBO_PATH)


# --- 2. Ensemble Class ---
class ProteinPredictions:
    """Accumulates per-protein GO-term scores from several sources and exports them."""
    def __init__(self):
        # protein id -> {'CCO' | 'MFO' | 'BPO' -> {GO term -> score}}
        self.predictions = {}

    def add_prediction(self, protein, go_term, score, branch, bonus=0):
        """Record one (protein, GO term) score inside the given branch.

        A term seen for the first time stores `score` as is. A term that is
        already present keeps the larger of the stored and new score and adds
        `bonus` on top (rewarding agreement between sources). The stored value
        is capped at 1.
        """
        per_protein = self.predictions.setdefault(
            protein, {'CCO': {}, 'MFO': {}, 'BPO': {}}
        )

        value = float(score)
        branch_scores = per_protein[branch]

        if go_term in branch_scores:
            value = max(branch_scores[go_term], value) + bonus

        # Cap at 1 so merged scores remain valid
        branch_scores[go_term] = 1 if value > 1 else value

    def get_predictions(self, output_file='submission_ensemble.tsv', top=35):
        """Write the merged predictions as tab-separated protein / term / score rows.

        For every protein and every branch, only the `top` highest-scoring
        terms are written (no header lines are emitted).
        """
        with open(output_file, 'w') as out:
            for protein, branches in tqdm(self.predictions.items(), desc="Writing Final Submission"):
                for scored_terms in branches.values():
                    # Highest score first, then keep the leading `top` entries
                    ranked = sorted(scored_terms.items(), key=lambda kv: kv[1], reverse=True)
                    out.writelines(
                        f"{protein}\t{term}\t{value:.6f}\n" for term, value in ranked[:top]
                    )

        print(f"\nEnsemble CAFA submission saved to: {output_file}")
        print(f"\nEnsemble CAFA submission saved to: {output_file}")

In [30]:
# --- 1. Generate the PCA + KNN baseline predictions ---
# The "CTD" names are kept from an earlier feature set; the inputs are now the
# PCA-reduced embeddings in X_test_reduced and the three KNN classifiers.
# NOTE: make_cafa_submission_batch must already be defined when this cell runs
# (run the cell that defines it first), otherwise this call raises NameError.
CTD_SUBMISSION_FILE = "ctd_temp_submission.tsv"

make_cafa_submission_batch(
    test_ids,
    X_test_reduced,
    knn, mf_terms,
    knn_bp, bp_terms,
    knn_cc, cc_terms,
    output_path=CTD_SUBMISSION_FILE, # Saves the baseline predictions to a temp file
    batch_size=1000
)


# --- 2. Initialize Ensemble and Load Predictions ---
def _load_submission_scores(path, predictor, go_roots, bonus, desc):
    """Feed every `protein<TAB>GO term<TAB>score` row of a file into `predictor`.

    Lines starting with AUTHOR/MODEL/END, lines that do not have exactly three
    tab-separated fields, and lines whose score is not a number are skipped.
    Rows whose GO term is missing from `go_roots` (GO id -> 'BPO'/'MFO'/'CCO')
    are ignored as well. `bonus` is forwarded to
    ProteinPredictions.add_prediction.
    """
    with open(path, 'r') as f:
        for item in tqdm(f, desc=desc):
            if item.startswith(('AUTHOR', 'MODEL', 'END')):
                continue
            try:
                protein_id, go_term, score_str = item.strip().split('\t')
                score = float(score_str)
            except ValueError:
                # Header lines or malformed rows
                continue
            root = go_roots.get(go_term)
            if root is not None:
                predictor.add_prediction(protein_id, go_term, score, root, bonus=bonus)


predictor = ProteinPredictions()

# Load baseline predictions first (no bonus yet)
print(f"Loading CTD predictions from: {CTD_SUBMISSION_FILE}")
_load_submission_scores(CTD_SUBMISSION_FILE, predictor, go_terms_dict,
                        bonus=0.0, desc="Processing CTD scores")

# Load BLAST-based predictions with a slight bonus to reward consensus:
# a term already present from the baseline gets max(score) + 0.01.
BLAST_SUBMISSION_PATH = '/kaggle/input/blast-quick-sprof-zero-pred/submission.tsv'
print(f"\nLoading BLAST predictions from: {BLAST_SUBMISSION_PATH}")
_load_submission_scores(BLAST_SUBMISSION_PATH, predictor, go_terms_dict,
                        bonus=0.01, desc="Processing BLAST scores")

# --- 3. Final Submission Export ---
FINAL_SUBMISSION_FILE = "final_ensemble_submission.tsv"
predictor.get_predictions(FINAL_SUBMISSION_FILE, top=35)

Processing proteins: 100%|██████████| 225/225 [1:52:49<00:00, 30.09s/it]



Batch CAFA submission saved to: ctd_temp_submission.tsv
Loading CTD predictions from: ctd_temp_submission.tsv


Processing CTD scores: 7850815it [00:20, 389919.60it/s]



Loading BLAST predictions from: /kaggle/input/blast-quick-sprof-zero-pred/submission.tsv


Processing BLAST scores: 11977931it [00:26, 454366.27it/s]
Writing Final Submission: 100%|██████████| 279437/279437 [00:12<00:00, 23012.11it/s]


Ensemble CAFA submission saved to: final_ensemble_submission.tsv





In [29]:
import numpy as np
import pandas as pd
from tqdm import tqdm

# (Assume the get_probabilities function is the robust, RAM-efficient version)
# ...
def get_probabilities(knn_model, X_batch):
    """
    Build the (N_batch, N_terms) float32 matrix of positive-class scores from a
    fitted multi-label classifier.

    `knn_model.predict_proba(X_batch)` is expected to return one
    (N_batch, N_classes) array per term. When the model exposes `classes_`,
    the column belonging to class label 1 is used for each term; a term whose
    training labels never contained class 1 scores 0.0 everywhere (taking the
    only available column would report the probability of class 0 instead).
    Without `classes_`, the last column is used (index 1 of two, index 0 of one).

    A single-output model that returns one 2-D array instead of a list is
    treated as a one-term model.

    Args:
        knn_model: fitted classifier with `predict_proba` (and usually `classes_`).
        X_batch: feature matrix; `X_batch.shape[0]` rows are scored.

    Returns:
        np.ndarray of shape (N_batch, N_terms), dtype float32.

    Raises:
        ValueError: if a term has more than two (or zero) probability columns.
    """
    # 1. Get the list of probability arrays (one array per term)
    prob_list = knn_model.predict_proba(X_batch)
    classes_list = getattr(knn_model, "classes_", None)

    # Single-output models return a bare array and a flat `classes_`
    if not isinstance(prob_list, (list, tuple)):
        prob_list = [prob_list]
        if classes_list is not None:
            classes_list = [classes_list]

    N_batch = X_batch.shape[0]
    N_terms = len(prob_list)

    # 2. Pre-allocate the final score matrix (N_batch rows, N_terms columns)
    # Using float32 saves half the memory compared to float64.
    final_scores = np.empty((N_batch, N_terms), dtype=np.float32)

    # 3. Populate the matrix column-by-column
    for j, P_term in enumerate(prob_list):
        n_classes = P_term.shape[1]
        if n_classes not in (1, 2):
            raise ValueError("Unexpected number of classes for a GO term.")

        if classes_list is None:
            # No class labels available: fall back to the positional convention
            final_scores[:, j] = P_term[:, n_classes - 1]
            continue

        positive = np.flatnonzero(np.asarray(classes_list[j]) == 1)
        if positive.size:
            final_scores[:, j] = P_term[:, positive[0]]
        else:
            # Term was never positive in training: its probability is zero
            final_scores[:, j] = 0.0

    return final_scores

def make_cafa_submission_batch(
    test_ids,
    X_test_reduced,
    knn_mf, mf_terms,
    knn_bp, bp_terms,
    knn_cc, cc_terms,
    output_path="cafa_submission.tsv",
    threshold=0.1,
    top_k=35,  # Maximum number of predictions per protein
    batch_size=1000
):
    """
    Score the test proteins in batches and write `protein<TAB>GO term<TAB>score`
    rows to `output_path`.

    For every protein, the scores of the three ontology models are filtered
    with `score >= threshold`, pooled together, sorted by descending score
    (ties keep MF, BP, CC and term order) and cut to the best `top_k` rows —
    the limit applies to the three ontologies combined, not to each one.
    Proteins with no score above the threshold produce no rows.

    Args:
        test_ids: protein ids, aligned row by row with X_test_reduced.
        X_test_reduced: feature matrix handed to the models batch by batch.
        knn_mf, knn_bp, knn_cc: fitted models accepted by get_probabilities().
        mf_terms, bp_terms, cc_terms: GO terms in each model's column order.
        output_path: file to (over)write.
        threshold: minimum score for a prediction to be kept.
        top_k: maximum number of rows written per protein.
        batch_size: number of proteins scored per model call.
    """
    N = len(test_ids)

    # Loop-invariant: turn each term list into an array once so that boolean
    # masks can index it, instead of rebuilding it for every protein.
    models = (knn_mf, knn_bp, knn_cc)
    term_arrays = [np.asarray(terms) for terms in (mf_terms, bp_terms, cc_terms)]

    with open(output_path, "w") as f:
        # Loop over batch indices
        for start in tqdm(range(0, N, batch_size), desc="Processing proteins"):
            end = min(start + batch_size, N)

            X_batch = X_test_reduced[start:end]
            ids_batch = test_ids[start:end]

            # 1. One (batch, n_terms) score matrix per ontology: MF, BP, CC
            score_matrices = [get_probabilities(model, X_batch) for model in models]

            # 2. Write rows to file
            for i, pid in enumerate(ids_batch):
                # Pool the thresholded predictions of all three ontologies
                candidates = []
                for matrix, terms in zip(score_matrices, term_arrays):
                    scores = matrix[i]
                    mask = scores >= threshold
                    candidates.extend(zip(terms[mask], scores[mask]))

                # Stable sort by descending score, then keep the best top_k
                candidates.sort(key=lambda pair: pair[1], reverse=True)
                for go_term, score in candidates[:top_k]:
                    f.write(f"{pid}\t{go_term}\t{score:.6f}\n")

    print(f"\nBatch CAFA submission saved to: {output_path}")

In [None]:
# Write the stand-alone KNN predictions (all three ontologies) to submission.tsv.
# NOTE(review): this runs the same prediction pass that produces
# ctd_temp_submission.tsv in the ensemble cell, which took close to two hours
# in the recorded run — consider reusing that file instead of recomputing.
make_cafa_submission_batch(
    test_ids,
    X_test_reduced,
    knn, mf_terms,
    knn_bp, bp_terms,
    knn_cc, cc_terms,
    output_path="submission.tsv",
    batch_size=1000
)

In [None]:
# test_ids, X_test = load_fasta_features("/kaggle/working/cafa6_data/Test/testsuperset.fasta")

In [None]:
# import numpy as np

# np.savez('/kaggle/working/test_data.npz', 
#          test_ids=test_ids, 
#          X_test=X_test)

### **Ensemble Stacking**

In [None]:
from sklearn.model_selection import KFold
kf = KFold(n_splits=5, shuffle=True, random_state=42)

In [None]:
# Presumably N = number of training proteins and L = number of GO-term columns.
# NOTE(review): `y_train` is not defined in the visible part of the notebook —
# it must be created before this cell can run.
N, L = y_train.shape
# Zero-initialised matrices shaped like y_train, one per base model; the fold
# loop fills their validation rows.
Z_knn   = np.zeros((N, L))
Z_mlp   = np.zeros((N, L))
Z_blast = np.zeros((N, L))

In [None]:
# Out-of-fold loop: each base model is trained on the `tr` rows and its
# predictions for the held-out `val` rows are stored in the matching Z_* matrix.
# NOTE(review): this cell is a sketch — X_ids, embeddings, esm, seqs, KNNModel,
# train_mlp, make_blast_db and blast_predict are not defined in the visible part
# of the notebook, and `KNNModel(...)` passes a literal Ellipsis placeholder.
# NOTE(review): assigning to `knn` here rebinds the name used for the MF
# classifier earlier in the notebook.
for fold, (tr, val) in enumerate(kf.split(X_ids)):

    # ---- KNN ----
    knn = KNNModel(...)
    knn.fit(embeddings[tr], y_train[tr])
    Z_knn[val] = knn.predict_proba(embeddings[val])

    # ---- MLP ----
    mlp = train_mlp(esm[tr], y_train[tr])
    Z_mlp[val] = mlp.predict_proba(esm[val])

    # ---- BLAST ----
    # Database is built from the training fold only; validation rows are the queries
    blast_db = make_blast_db(seqs[tr])
    Z_blast[val] = blast_predict(
        queries=seqs[val],
        db=blast_db
    )


In [None]:
# For GO term g: stack the three base-model score columns side by side, giving
# one row per protein and one column per base model (KNN, MLP, BLAST).
# NOTE(review): `g` is not defined in this cell; it is presumably a column index
# with 0 <= g < L and has to be set before this cell is run.
X_meta_g = np.stack([
    Z_knn[:, g],
    Z_mlp[:, g],
    Z_blast[:, g]
], axis=1)


In [None]:
from sklearn.linear_model import LogisticRegression

# One logistic-regression meta-model per GO term; meta_models[g] combines the
# three base-model scores of column g.
meta_models = []

for g in range(L):
    # Build the meta-features for *this* term inside the loop. Using a single
    # X_meta_g computed once outside would fit every term's meta-model on the
    # same three columns.
    X_meta_g = np.stack([
        Z_knn[:, g],
        Z_mlp[:, g],
        Z_blast[:, g]
    ], axis=1)

    meta = LogisticRegression(
        penalty="l2",
        C=1.0,
        max_iter=1000
    )
    # NOTE(review): fit() needs both classes in y_train[:, g]; a column that is
    # constant would have to be handled separately — confirm whether that occurs.
    meta.fit(X_meta_g, y_train[:, g])
    meta_models.append(meta)
