In [1]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm.auto import tqdm
import gc
import os
import sys
from collections import defaultdict

# ====================================================
# 1. C·∫§U H√åNH (CONFIGURATION)
# ====================================================
class Config:
    # --- Paths ---
    ESM_DIR = '/kaggle/input/cafa6-protein-embeddings-esm2'
    TRAIN_TERMS = '/kaggle/input/cafa-6-protein-function-prediction/Train/train_terms.tsv'
    TEST_FASTA = '/kaggle/input/cafa-6-protein-function-prediction/Test/testsuperset.fasta'
    GO_OBO = '/kaggle/input/cafa-6-protein-function-prediction/Train/go-basic.obo'
    
    # Dataset m·ªõi c·∫ßn add ƒë·ªÉ l·ªçc Negative (T√¨m dataset ch·ª©a file n√†y tr√™n Kaggle)
    # N·∫øu kh√¥ng t√¨m th·∫•y file n√†y, code s·∫Ω t·ª± ƒë·ªông b·ªè qua b∆∞·ªõc l·ªçc Negative
    GOA_FILE = '/kaggle/input/protein-go-annotations/goa_uniprot_all.csv'
    
    # Foldseek
    HOMOLOGY_FILE = '/kaggle/input/foldseek-cafa/foldseek_submission.tsv'
    
    # --- Model & Training ---
    NUM_LABELS = 1500
    EMBED_DIM = 1280
    BATCH_SIZE = 128
    LR = 1e-3
    EPOCHS = 10
    
    # --- Post-processing ---
    THRESHOLD = 0.01
    DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

print(f"‚öôÔ∏è Running on device: {Config.DEVICE}")

# ====================================================
# 2. MODULE: NEGATIVE PROPAGATION (M·ªöI)
# ====================================================
class NegativePropagator:
    def __init__(self, obo_path, goa_path):
        self.obo_path = obo_path
        self.goa_path = goa_path
        self.children = defaultdict(list)
        self.negative_pairs = set()

    def parse_obo(self):
        """ƒê·ªçc file OBO ƒë·ªÉ hi·ªÉu quan h·ªá cha-con (Parent-Child)"""
        print("   Parsing OBO ontology...")
        term = None
        with open(self.obo_path, 'r') as f:
            for line in f:
                line = line.strip()
                if line.startswith('[Term]'):
                    term = None
                elif line.startswith('id: GO:'):
                    term = line[4:]
                elif line.startswith('is_a: GO:') and term:
                    parent = line.split()[1]
                    self.children[parent].append(term) # Parent -> Children

    def get_descendants(self, root_term):
        """T√¨m t·∫•t c·∫£ con ch√°u c·ªßa m·ªôt term"""
        descendants = set()
        stack = [root_term]
        while stack:
            current = stack.pop()
            if current in self.children:
                kids = self.children[current]
                for kid in kids:
                    if kid not in descendants:
                        descendants.add(kid)
                        stack.append(kid)
        return descendants

    def load_negatives(self, target_ids):
        """Load c√°c Negative Annotations t·ª´ GOA v√† lan truy·ªÅn xu·ªëng con ch√°u"""
        if not os.path.exists(self.goa_path):
            print(f"‚ö†Ô∏è GOA file not found at {self.goa_path}. Skipping Negative Prop.")
            return

        print("   Loading Negative Annotations (NOT)...")
        # ƒê·ªçc file l·ªõn, ch·ªâ l·∫•y c√°c c·ªôt c·∫ßn thi·∫øt
        # C·∫•u tr√∫c th∆∞·ªùng l√†: ID, GO_ID, Qualifier
        # Qualifier ch·ª©a "NOT" l√† negative
        target_set = set(target_ids)
        
        # ƒê·ªçc t·ª´ng chunk ƒë·ªÉ tr√°nh tr√†n RAM
        chunk_iter = pd.read_csv(self.goa_path, chunksize=500000, 
                                 usecols=['protein_id', 'go_term', 'qualifier'])
        
        count = 0
        for chunk in chunk_iter:
            # L·ªçc c√°c d√≤ng c√≥ 'NOT' v√† thu·ªôc t·∫≠p Test
            neg_chunk = chunk[
                (chunk['qualifier'].str.contains('NOT', na=False)) & 
                (chunk['protein_id'].isin(target_set))
            ]
            
            for _, row in neg_chunk.iterrows():
                pid = row['protein_id']
                term = row['go_term']
                
                # Th√™m ch√≠nh n√≥
                self.negative_pairs.add(f"{pid}_{term}")
                
                # Th√™m t·∫•t c·∫£ con ch√°u (N·∫øu kh√¥ng c√≥ cha th√¨ kh√¥ng c√≥ con)
                descendants = self.get_descendants(term)
                for d in descendants:
                    self.negative_pairs.add(f"{pid}_{d}")
            
            count += len(neg_chunk)
            
        print(f"   Found {count} negative roots. Expanded to {len(self.negative_pairs)} blocked pairs.")

    def filter_submission(self, df_sub):
        """L·ªçc b·ªè c√°c c·∫∑p negative kh·ªèi DataFrame submission"""
        print("   Applying Negative Filter...")
        initial_len = len(df_sub)
        
        # T·∫°o c·ªôt key ƒë·ªÉ so s√°nh
        df_sub['key'] = df_sub['Id'] + '_' + df_sub['Term']
        
        # Gi·ªØ l·∫°i nh·ªØng d√≤ng KH√îNG n·∫±m trong t·∫≠p negative
        # D√πng set ƒë·ªÉ tra c·ª©u c·ª±c nhanh
        df_clean = df_sub[~df_sub['key'].isin(self.negative_pairs)].copy()
        
        df_clean.drop(columns=['key'], inplace=True)
        removed = initial_len - len(df_clean)
        print(f"   üö´ Removed {removed} negative predictions.")
        
        return df_clean

# ====================================================
# 3. C√ÅC H√ÄM H·ªñ TR·ª¢ KH√ÅC (NH∆Ø C≈®)
# ====================================================

def compute_fmax(y_true, y_pred_probs, steps=10):
    best_f1 = 0.0
    best_threshold = 0.0
    thresholds = np.linspace(0.1, 0.6, steps)
    for t in thresholds:
        y_pred_bin = (y_pred_probs >= t).astype(int)
        score = f1_score(y_true, y_pred_bin, average='samples', zero_division=0)
        if score > best_f1:
            best_f1 = score
            best_threshold = t
    return best_f1, best_threshold

def load_safe_ids(path):
    try:
        ids = np.load(path, allow_pickle=True)
        if len(ids) > 0 and isinstance(ids[0], (bytes, np.bytes_)):
            return [i.decode('utf-8') for i in ids]
        return ids.tolist()
    except: return []

def get_train_data(train_ids_list):
    df = pd.read_csv(Config.TRAIN_TERMS, sep="\t")
    top_terms = df['term'].value_counts().index[:Config.NUM_LABELS].tolist()
    term_to_idx = {t: i for i, t in enumerate(top_terms)}
    pid_to_idx = {pid: i for i, pid in enumerate(train_ids_list)}
    y = np.zeros((len(train_ids_list), Config.NUM_LABELS), dtype=np.float32)
    df = df[df['term'].isin(top_terms) & df['EntryID'].isin(train_ids_list)]
    row_idx = df['EntryID'].map(pid_to_idx)
    col_idx = df['term'].map(term_to_idx)
    valid = row_idx.notna() & col_idx.notna()
    y[row_idx[valid].astype(int), col_idx[valid].astype(int)] = 1.0
    return y, {i: t for t, i in term_to_idx.items()}

# ====================================================
# 4. MODEL ARCHITECTURE
# ====================================================
class ResidualBlock(nn.Module):
    def __init__(self, in_features, hidden_features, dropout=0.3):
        super().__init__()
        self.block = nn.Sequential(
            nn.Linear(in_features, hidden_features),
            nn.BatchNorm1d(hidden_features),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(hidden_features, in_features),
            nn.BatchNorm1d(in_features)
        )
        self.relu = nn.ReLU()
    def forward(self, x): return self.relu(x + self.block(x))

class ProteinClassifier(nn.Module):
    def __init__(self, input_dim=1280, num_classes=1500):
        super().__init__()
        self.bn_input = nn.BatchNorm1d(input_dim)
        self.layer1 = nn.Linear(input_dim, 512)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(0.3)
        self.res_block = ResidualBlock(512, 256)
        self.out = nn.Linear(512, num_classes)
    def forward(self, x):
        x = self.bn_input(x)
        x = self.dropout(self.relu(self.layer1(x)))
        x = self.res_block(x)
        return self.out(x)

# ====================================================
# 5. TRAINING PIPELINE (ESM-2)
# ====================================================
def run_esm_pipeline():
    print("\n" + "="*30 + "\nüöÄ TRAINING ESM-2 MODEL\n" + "="*30)
    
    # A. Load Data
    try:
        all_pids = pd.read_csv(os.path.join(Config.ESM_DIR, "protein_ids.csv"))['protein_id'].tolist()
    except:
        all_pids = load_safe_ids(os.path.join(Config.ESM_DIR, "protein_ids.npy"))
    all_embeds = np.load(os.path.join(Config.ESM_DIR, "protein_embeddings.npy"), mmap_mode='r')
    pid_to_idx_map = {pid: i for i, pid in enumerate(all_pids)}
    
    # B. Prepare Train
    df_terms = pd.read_csv(Config.TRAIN_TERMS, sep="\t")
    train_targets = set(df_terms['EntryID'].unique())
    valid_pids = [p for p in train_targets if p in pid_to_idx_map]
    
    train_indices = [pid_to_idx_map[p] for p in valid_pids]
    X = np.array([all_embeds[i] for i in train_indices])
    y, term_map = get_train_data(valid_pids)
    
    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.1, random_state=42)
    
    # C. Train
    train_ds = torch.utils.data.TensorDataset(torch.tensor(X_train).float(), torch.tensor(y_train).float())
    val_ds = torch.utils.data.TensorDataset(torch.tensor(X_val).float())
    train_dl = DataLoader(train_ds, batch_size=Config.BATCH_SIZE, shuffle=True, num_workers=0)
    val_dl = DataLoader(val_ds, batch_size=Config.BATCH_SIZE, shuffle=False, num_workers=0)
    
    model = ProteinClassifier(input_dim=X.shape[1], num_classes=Config.NUM_LABELS).to(Config.DEVICE)
    optimizer = torch.optim.Adam(model.parameters(), lr=Config.LR)
    criterion = nn.BCEWithLogitsLoss()
    
    for epoch in range(Config.EPOCHS):
        model.train()
        total_loss = 0
        for X_b, y_b in train_dl:
            X_b, y_b = X_b.to(Config.DEVICE), y_b.to(Config.DEVICE)
            optimizer.zero_grad()
            out = model(X_b)
            loss = criterion(out, y_b)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
        print(f"   Epoch {epoch+1}/{Config.EPOCHS} | Loss: {total_loss/len(train_dl):.4f}")
        
    # D. Validation
    model.eval()
    val_probs = []
    with torch.no_grad():
        for (X_b,) in val_dl:
            X_b = X_b.to(Config.DEVICE)
            val_probs.append(torch.sigmoid(model(X_b)).cpu().numpy())
    val_probs = np.vstack(val_probs)
    fmax, thresh = compute_fmax(y_val, val_probs)
    print(f"   üî• Validation F-Max: {fmax:.4f} (Thresh: {thresh:.2f})")
    
    del X_train, y_train, X_val, y_val, train_ds, val_ds
    gc.collect()
    
    # E. Predict Test
    print("   Predicting Test Data...")
    test_pids = []
    with open(Config.TEST_FASTA, 'r') as f:
        for line in f:
            if line.startswith('>'): test_pids.append(line.strip()[1:].split()[0])
            
    X_test = np.zeros((len(test_pids), all_embeds.shape[1]), dtype=np.float32)
    for i, pid in enumerate(test_pids):
        if pid in pid_to_idx_map:
            X_test[i] = all_embeds[pid_to_idx_map[pid]]
            
    test_ds = torch.utils.data.TensorDataset(torch.tensor(X_test).float())
    test_dl = DataLoader(test_ds, batch_size=Config.BATCH_SIZE, shuffle=False, num_workers=0)
    
    preds = []
    with torch.no_grad():
        for (X_b,) in tqdm(test_dl, desc="Inference"):
            X_b = X_b.to(Config.DEVICE)
            preds.append(torch.sigmoid(model(X_b)).cpu().numpy())
    all_preds = np.vstack(preds)
    
    # Format
    dl_results = []
    idx_to_term = term_map
    for i, pid in enumerate(tqdm(test_pids, desc="Formatting")):
        scores = all_preds[i]
        indices = np.where(scores >= Config.THRESHOLD)[0]
        for idx in indices:
            dl_results.append((pid, idx_to_term[idx], scores[idx]))
            
    df_dl = pd.DataFrame(dl_results, columns=['Id', 'Term', 'Score'])
    del all_embeds, X_test, all_preds, model
    gc.collect()
    
    return df_dl, test_pids # Tr·∫£ v·ªÅ c·∫£ list test_pids ƒë·ªÉ d√πng cho filter negative

# ====================================================
# 6. MAIN EXECUTION
# ====================================================
def main():
    # 1. Train & Predict Deep Learning
    df_dl, test_pids = run_esm_pipeline()
    print(f"‚úÖ DL Predictions: {len(df_dl)} rows")
    
    # 2. Merge Foldseek
    print("\n" + "="*30 + "\nü§ù MERGING FOLDSEEK\n" + "="*30)
    try:
        print(f"   Loading: {Config.HOMOLOGY_FILE}")
        df_hom = pd.read_csv(Config.HOMOLOGY_FILE, sep='\t', header=None, names=['Id', 'Term', 'Score_Homology'])
        
        print("   Merging...")
        df_final = pd.merge(df_dl, df_hom, on=['Id', 'Term'], how='outer')
        df_final['Score'].fillna(0, inplace=True)
        df_final['Score_Homology'].fillna(0, inplace=True)
        
        # Max Pooling Strategy
        df_final['Final_Score'] = np.maximum(df_final['Score'], df_final['Score_Homology'])
        
    except Exception as e:
        print(f"‚ö†Ô∏è Merge Failed: {e}. Using DL Only.")
        df_final = df_dl
        df_final['Final_Score'] = df_final['Score']

    # 3. APPLY NEGATIVE PROPAGATION (M·ªöI)
    print("\n" + "="*30 + "\nüõë NEGATIVE PROPAGATION\n" + "="*30)
    
    # Kh·ªüi t·∫°o Propagator
    neg_prop = NegativePropagator(Config.GO_OBO, Config.GOA_FILE)
    
    # Parse c·∫•u tr√∫c c√¢y GO
    neg_prop.parse_obo()
    
    # Load Negative t·ª´ GOA Uniprot (ch·ªâ load cho c√°c protein trong t·∫≠p Test)
    neg_prop.load_negatives(target_ids=test_pids)
    
    # L·ªçc b·ªè c√°c d·ª± ƒëo√°n sai
    if len(neg_prop.negative_pairs) > 0:
        df_final = neg_prop.filter_submission(df_final)
    else:
        print("   No negative annotations found or file missing.")

    # 4. Save
    print("\nüíæ SAVING SUBMISSION...")
    submission = df_final[df_final['Final_Score'] >= Config.THRESHOLD][['Id', 'Term', 'Final_Score']]
    submission['Final_Score'] = submission['Final_Score'].round(3)
    submission.columns = ['Id', 'Term', 'Score']
    
    out_path = 'submission.tsv'
    submission.to_csv(out_path, sep='\t', header=False, index=False)
    
    size_mb = os.path.getsize(out_path) / (1024 * 1024)
    print(f"‚úÖ DONE! File size: {size_mb:.2f} MB")

if __name__ == "__main__":
    main()

‚öôÔ∏è Running on device: cuda

üöÄ TRAINING ESM-2 MODEL
   Epoch 1/10 | Loss: 0.0192
   Epoch 2/10 | Loss: 0.0124
   Epoch 3/10 | Loss: 0.0119
   Epoch 4/10 | Loss: 0.0115
   Epoch 5/10 | Loss: 0.0112
   Epoch 6/10 | Loss: 0.0110
   Epoch 7/10 | Loss: 0.0107
   Epoch 8/10 | Loss: 0.0105
   Epoch 9/10 | Loss: 0.0103
   Epoch 10/10 | Loss: 0.0101
   üî• Validation F-Max: 0.3267 (Thresh: 0.21)
   Predicting Test Data...


Inference:   0%|          | 0/1753 [00:00<?, ?it/s]

Formatting:   0%|          | 0/224309 [00:00<?, ?it/s]

‚úÖ DL Predictions: 9901479 rows

ü§ù MERGING FOLDSEEK
   Loading: /kaggle/input/foldseek-cafa/foldseek_submission.tsv
   Merging...


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_final['Score'].fillna(0, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_final['Score_Homology'].fillna(0, inplace=True)



üõë NEGATIVE PROPAGATION
   Parsing OBO ontology...
   Loading Negative Annotations (NOT)...
   Found 5294 negative roots. Expanded to 112458 blocked pairs.
   Applying Negative Filter...
   üö´ Removed 4359 negative predictions.

üíæ SAVING SUBMISSION...
‚úÖ DONE! File size: 483.83 MB


In [2]:
# Script th·ª±c hi·ªán Hierarchy Propagation (PHI√äN B·∫¢N T·ªêI ∆ØU H√ìA)
!pip install pronto
import pandas as pd
from pronto import Ontology
from tqdm import tqdm
import numpy as np
from joblib import Parallel, delayed
import os

print("B·∫Øt ƒë·∫ßu th·ª±c hi·ªán Hierarchy Propagation (T·ªêI ∆ØU H√ìA)...")

# --- 1. T·∫£i file c·∫•u tr√∫c GO ---
print("ƒêang t·∫£i file go-basic.obo v√†o b·ªô nh·ªõ...")
with open('/kaggle/input/cafa-6-protein-function-prediction/Train/go-basic.obo', 'rb') as f:
    go = Ontology(f)
print(f"ƒê√£ t·∫£i th√†nh c√¥ng Ontology v·ªõi {len(go)} terms.")

# --- 2. ƒê·ªçc file submission ƒë√£ t·∫°o tr∆∞·ªõc ƒë√≥ ---
submission_file = 'submission.tsv' # ƒê·∫£m b·∫£o ƒë√¢y l√† file ƒë√£ ƒë∆∞·ª£c merge v·ªõi homology
print(f"ƒêang ƒë·ªçc file submission g·ªëc: {submission_file}")
original_df = pd.read_csv(submission_file, sep='\t', header=None, names=['ProteinID', 'GOTermID', 'Confidence'])
print(f"S·ªë l∆∞·ª£ng d·ª± ƒëo√°n ban ƒë·∫ßu: {len(original_df)}")

# --- 3. TI·ªÄN T√çNH TO√ÅN (PRE-COMPUTATION) ---
# L·∫•y t·∫•t c·∫£ c√°c GO term duy nh·∫•t c√≥ trong submission c·ªßa b·∫°n
unique_terms = original_df['GOTermID'].unique()
print(f"Ti·ªÅn t√≠nh to√°n b·∫£n ƒë·ªì t·ªï ti√™n cho {len(unique_terms)} GO terms duy nh·∫•t...")

# T·∫°o m·ªôt map: term -> {t·∫≠p h·ª£p c√°c t·ªï ti√™n}
term_ancestors_map = {}
for term_id in tqdm(unique_terms, desc="Building Ancestor Map"):
    try:
        term = go[term_id]
        # L·∫•y ID c·ªßa t·∫•t c·∫£ c√°c superclasses (t·ªï ti√™n) v√† l∆∞u l·∫°i
        ancestors = {ancestor.id for ancestor in term.superclasses() if ancestor.id != term_id}
        term_ancestors_map[term_id] = ancestors
    except KeyError:
        term_ancestors_map[term_id] = set() # N·∫øu term kh√¥ng c√≥ trong file obo

print("B·∫£n ƒë·ªì t·ªï ti√™n ƒë√£ ƒë∆∞·ª£c t·∫°o.")

# --- 4. X·ª¨ L√ù SONG SONG (PARALLELIZATION) ---

def process_chunk(df_chunk):
    """
    H√†m n√†y x·ª≠ l√Ω m·ªôt ph·∫ßn nh·ªè c·ªßa DataFrame.
    N√≥ s·∫Ω ƒë∆∞·ª£c ch·∫°y tr√™n m·ªôt nh√¢n CPU ri√™ng.
    """
    propagated_predictions = {}
    # S·ª≠ d·ª•ng itertuples() nhanh h∆°n nhi·ªÅu so v·ªõi iterrows()
    for row in df_chunk.itertuples(index=False):
        protein_id = row.ProteinID
        term_id = row.GOTermID
        confidence = row.Confidence

        # Th√™m d·ª± ƒëo√°n g·ªëc
        key = (protein_id, term_id)
        if key not in propagated_predictions or confidence > propagated_predictions[key]:
            propagated_predictions[key] = confidence
        
        # L·∫•y c√°c t·ªï ti√™n t·ª´ b·∫£n ƒë·ªì ƒë√£ t√≠nh to√°n tr∆∞·ªõc
        if term_id in term_ancestors_map:
            for ancestor_id in term_ancestors_map[term_id]:
                ancestor_key = (protein_id, ancestor_id)
                if ancestor_key not in propagated_predictions or confidence > propagated_predictions[ancestor_key]:
                    propagated_predictions[ancestor_key] = confidence
                    
    return propagated_predictions

# L·∫•y s·ªë l∆∞·ª£ng nh√¢n CPU, -1 nghƒ©a l√† d√πng t·∫•t c·∫£
n_jobs = os.cpu_count()
print(f"Chia d·ªØ li·ªáu th√†nh {n_jobs} ph·∫ßn v√† x·ª≠ l√Ω song song...")

# Chia DataFrame th√†nh c√°c chunk nh·ªè
df_chunks = np.array_split(original_df, n_jobs)

# Ch·∫°y x·ª≠ l√Ω song song
# delayed(process_chunk)(chunk) chu·∫©n b·ªã l·ªánh ƒë·ªÉ ch·∫°y h√†m process_chunk v·ªõi ƒë·∫ßu v√†o l√† chunk
parallel = Parallel(n_jobs=n_jobs, backend='multiprocessing')
results_list = parallel(delayed(process_chunk)(chunk) for chunk in tqdm(df_chunks, desc="Parallel Processing"))

# --- 5. T·ªïng h·ª£p k·∫øt qu·∫£ ---
print("T·ªïng h·ª£p k·∫øt qu·∫£ t·ª´ c√°c nh√¢n CPU...")
final_propagated_predictions = {}
for chunk_dict in tqdm(results_list, desc="Merging results"):
    for key, value in chunk_dict.items():
        if key not in final_propagated_predictions or value > final_propagated_predictions[key]:
            final_propagated_predictions[key] = value

print("Qu√° tr√¨nh Propagation ho√†n t·∫•t.")

# --- 6. Chuy·ªÉn ƒë·ªïi v√† L∆∞u file ---
final_list = [{'ProteinID': k[0], 'GOTermID': k[1], 'Confidence': v} for k, v in final_propagated_predictions.items()]
final_df = pd.DataFrame(final_list)

print(f"S·ªë l∆∞·ª£ng d·ª± ƒëo√°n sau khi propagation v√† deduplication: {len(final_df)}")

final_submission_file = 'submission_propagated.tsv'
final_df.to_csv(final_submission_file, sep='\t', header=False, index=False)

print(f"\\nƒê√£ t·∫°o file submission cu·ªëi c√πng: {final_submission_file}")
print("Top 5 d√≤ng c·ªßa file cu·ªëi c√πng:")
print(final_df.head())

Collecting pronto
  Downloading pronto-2.7.2-py3-none-any.whl.metadata (10 kB)
Collecting fastobo<0.15.0,>=0.13.0 (from pronto)
  Downloading fastobo-0.14.1-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (6.4 kB)
Downloading pronto-2.7.2-py3-none-any.whl (62 kB)
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m62.2/62.2 kB[0m [31m2.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fastobo-0.14.1-cp311-cp311-manylinux_2_28_x86_64.whl (2.3 MB)
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m2.3/2.3 MB[0m [31m42.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: fastobo, pronto
Successfully installed fastobo-0.14.1 pronto-2.7.2
B·∫Øt ƒë·∫ßu th·ª±c hi·ªán Hierarchy Propagation (T·ªêI ∆ØU H√ìA)...
ƒêang t·∫£i file go-basic.obo v√†o b·ªô nh·ªõ...
ƒê√£ t·∫£i th√†nh c√¥ng Ontolo

Building Ancestor Map: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 30743/30743 [00:03<00:00, 9476.08it/s]
  return bound(*args, **kwds)


B·∫£n ƒë·ªì t·ªï ti√™n ƒë√£ ƒë∆∞·ª£c t·∫°o.
Chia d·ªØ li·ªáu th√†nh 4 ph·∫ßn v√† x·ª≠ l√Ω song song...


Parallel Processing: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 4/4 [00:00<00:00,  5.75it/s]


T·ªïng h·ª£p k·∫øt qu·∫£ t·ª´ c√°c nh√¢n CPU...


Merging results: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 4/4 [00:21<00:00,  5.38s/it]


Qu√° tr√¨nh Propagation ho√†n t·∫•t.
S·ªë l∆∞·ª£ng d·ª± ƒëo√°n sau khi propagation v√† deduplication: 39973787
\nƒê√£ t·∫°o file submission cu·ªëi c√πng: submission_propagated.tsv
Top 5 d√≤ng c·ªßa file cu·ªëi c√πng:
    ProteinID    GOTermID  Confidence
0  A0A017SE81  GO:0004497       0.019
1  A0A017SE81  GO:0016491       0.438
2  A0A017SE81  GO:0003674       0.438
3  A0A017SE81  GO:0003824       0.438
4  A0A017SE81  GO:0005515       0.039
