In [1]:
# %load_ext autotime
# !pip install autotime
%load_ext autoreload
%autoreload 2

import sys
from pathlib import Path
import warnings
import pandas as pd
import numpy as np
import time
import esm
import biotite.structure.io as bsio
from proteinttt.models.esmfold import ESMFoldTTT, DEFAULT_ESMFOLD_TTT_CFG
from proteinttt.utils.structure import calculate_tm_score, lddt_score
import torch
import argparse
import os


pd.set_option("display.max_columns", 500)
pd.set_option("display.max_rows", 100)
pd.set_option("max_colwidth", 1000)

  from .autonotebook import tqdm as notebook_tqdm
  __import__("pkg_resources").declare_namespace(__name__)


In [2]:
base_path = Path("/scratch/project/open-35-8/pimenol1/ProteinTTT/ProteinTTT/data/bfvd/")
SUMMARY_PATH = base_path / 'proteinttt_results.tsv'
MSA_PATH = Path("/scratch/project/open-35-8/antonb/bfvd/bfvd_msa")
SUBSET_PATH = base_path / 'proteinttt_msa_testset.tsv'

# Choosing data

In [5]:
df = pd.read_csv(SUBSET_PATH, sep="\t")

In [None]:
from Bio import SeqIO


def check_a3m_lengths(filepath):
    lengths = {}
    try:
        for record in SeqIO.parse(filepath, format="fasta"):
            lengths[record.id] = len(record.seq)

        first_record_id = list(lengths.keys())[0]
        reference_length = lengths[first_record_id]

        print(f"Reference sequence: '{first_record_id}' (Length: {reference_length})")

        is_consistent = True
        for seq_id, length in lengths.items():
            if length != reference_length:
                print(f"MISMATCH: '{seq_id}' has length {length}")
                is_consistent = False

        if is_consistent:
            print(f"All {len(lengths)} sequences have the same length: {reference_length}")

    except Exception as e:
        print(f"An error occurred: {e}")

file_to_check = "/scratch/project/open-35-8/antonb/bfvd/bfvd_msa/A0A1M7XUY2.a3m"
check_a3m_lengths(file_to_check)

--- Checking file: /scratch/project/open-35-8/antonb/bfvd/bfvd_msa/A0A1M7XUY2.a3m ---
Reference sequence: 'UniRef100_A0A1M7XUY2' (Length: 437)
MISMATCH: 'A0A1M7XUY2' has length 464
MISMATCH: 'A0A6G8MXV6' has length 440
MISMATCH: 'A0A6G8MYC0' has length 440
MISMATCH: 'A0A2R8FFW9' has length 438
MISMATCH: 'A0A1M7XUT6' has length 440
MISMATCH: 'A0A6G8MYN2' has length 440
MISMATCH: 'A0A285PWM6' has length 442
MISMATCH: 'A0A1M7XUJ5' has length 440
MISMATCH: 'A0A2R8FFG4' has length 444
MISMATCH: 'A0A285PX37' has length 466
MISMATCH: 'A0A6G8MYJ4' has length 440
MISMATCH: 'A0A2R8FEN7' has length 440
MISMATCH: 'A0A6G8MZ46' has length 443
MISMATCH: 'A0A6G8MXZ6' has length 441
MISMATCH: 'A0A1M7XUZ9' has length 440
MISMATCH: 'A0A2R8FF08' has length 441
MISMATCH: 'A0A2R8FFG3' has length 440
MISMATCH: 'A0A285PY62' has length 440
MISMATCH: 'A0A6G8MZ79' has length 440
MISMATCH: 'A0A285PXS9' has length 443
MISMATCH: 'A0A6G8MY88' has length 442
MISMATCH: 'A0A2R8FFP0' has length 441
MISMATCH: 'A0A2R8FFR8