In [4]:
# Optional per-cell timing via the `autotime` extension (currently disabled).
# To enable it, install the package first, then load the extension:
# %pip install ipython-autotime
# %load_ext autotime
# autoreload mode 2: re-import all modules before each cell executes, so edits
# to imported packages are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

import sys
from pathlib import Path
import warnings
import pandas as pd
import numpy as np
import time
import esm
import biotite.structure.io as bsio
from proteinttt.models.esmfold import ESMFoldTTT, DEFAULT_ESMFOLD_TTT_CFG
from proteinttt.utils.structure import calculate_tm_score, lddt_score
import torch
import argparse
import os


# Widen pandas' display limits so wide result tables and long sequence strings
# are shown without truncation. Fully-qualified option names are used
# throughout: the bare "max_colwidth" shorthand is resolved by pattern matching
# and can become ambiguous if a future pandas adds a similarly named option.
pd.set_option("display.max_columns", 500)
pd.set_option("display.max_rows", 100)
pd.set_option("display.max_colwidth", 1000)

  from .autonotebook import tqdm as notebook_tqdm
  __import__("pkg_resources").declare_namespace(__name__)


In [5]:
# Root directory for this notebook's BFVD files.
base_path = Path("/scratch/project/open-35-8/pimenol1/ProteinTTT/ProteinTTT/data/bfvd/")

# TSV files located directly under base_path.
SUMMARY_PATH = base_path.joinpath('proteinttt_results.tsv')
SUBSET_PATH = base_path.joinpath('proteinttt_msa_testset.tsv')

# Directory that holds the per-protein .a3m alignment files.
MSA_PATH = Path("/scratch/project/open-35-8/antonb/bfvd/bfvd_msa")

# Choosing data

In [6]:
# Load the tab-separated file at SUBSET_PATH into a DataFrame.
# NOTE(review): the rendered frame shows an "Unnamed: 0" column, which suggests
# the TSV was written together with its index. Passing index_col=0 would avoid
# that column -- confirm nothing downstream relies on it before changing.
df = pd.read_csv(SUBSET_PATH, sep="\t")

In [7]:
# Render the whole frame as the cell output (display.max_rows is raised to 100
# in the first cell, so frames up to that size are shown in full).
df

Unnamed: 0,id,nmsa,pLDDT_AlphaFold,version,sequence,length,pLDDT_before,pLDDT_after,tm_score_before,lddt_before,tm_score_after,lddt_after,time,difference_plddt
0,A0A0H4SQ70,10124,91.406506,BASE,MQSIDFGRALDFTLRWEGGYVNHPSDPGGATNRGITQVTYNQWRTQKGLPTREVRLIEEDEVRSIYWQFYWAPVEGRTAPSWVQFRVCLFDTFVQFGVFGGTFLWQKVCGVPADGQWGPVTSRATENLVSTKGPLWSGMALVGERVRYRAQRVSQNRSQLAFLQGWLNRDSDLLLYLLNLR,181,85.136031,87.117266,0.98978,0.990052,0.98613,0.981236,67.120121,1.981235
1,Q80B01,5665,91.960021,BASE,ENRAFDENVTERVMHVLVAAGADVNAASVVDRTPLHVCLTGMSTHPGTIAALLRFGADVNAVDLCGMSPLAVLVRSRAATAELVRMLLDAGADAHAVDSRLDSLLHQHFQSARPRPEVVRELIRHGCSPRARNRIGNTPLHEAAKHSSCKHSLVGPLLAAGASVDARNNTGRTPLHLAAASNPRACRRLIALGADVVARSYAGVTPLAQLIADNNSALVTAALDTQPEPRAVAESLRATTPVGETACSRLCVAYVVARAPSEVLGEPERALHAAFVAECLAEVAAIHAVRCGTPPVSLLEILVSARTPRSLLSRRARRLAESRTTVYRAPLRARIAAMRHRSRLVERALRTLRGCVLPREVLERVLRCLSTQDLRASGLAE,381,90.829035,90.912645,0.88055,0.896696,0.88207,0.897574,0.639445,0.083609
2,A0A1S5R1Q9,57,94.034389,BASE+LOGAN,MSLDIPKGMNIHSAATYARDYAAKQRANYVTIVFNDINLTVYATSNPDDIALIYYLESEVRRLKR,65,58.079447,83.771889,0.50373,0.623064,0.67245,0.756843,34.663036,25.692443
3,A0A2I4Q1U7,17,91.027945,BASE+LOGAN,MNYCDIAHELRMEREKQEKRIIKKMAVLLAHYKADKQPTHDEFVDFCNMYLNVSKATGYRWLKALNDGEL,70,78.155634,87.247534,0.60368,0.743309,0.94927,0.969441,35.922151,9.091901
4,A0A218MN90,538,90.865686,BASE,METFVGFVLYLYTSAGTLLEFTPKDSLSDCLRIKRTIERTDPPQRGKERWVCKQGKLKLKVIDGKKYPVEVLDY,74,81.071421,81.071421,0.95585,0.950043,0.95585,0.950043,34.210736,0.0
5,A0A3S7URH3,1016,90.055809,BASE,MPHEIWGQRFVSTREPAWHRIGFHFAEPILPSEAVAIAGFEEPELAPLFLEDGTPVDYYAVVMGDEVFGVVSRYWRLFRLEEIVPTLDELSRTFPLSAAGQLKKGRIVFFAFEQRTEILGEDYIRYLVVLHSYEPGRSWKVLYTPTRVVCMNTLVASEKDREWEYRVYHNAPRGILEAQIVMAQYRVLQETVDKKLEAFARIGDFDAQLDVLFEHVYPYPEPPSELDKQRYGKEVEERYERAMERVKQVRNLALQSYQRFNDEFPKFGNTAYAALQAVTEVADWRGNLKSKESPLVGPRAKEKQAAWQFLSTLI,314,77.239656,77.239656,0.48237,0.768121,0.48237,0.768121,160.64179,0.0
6,A0A5B9NC75,582,91.237163,BASE,MVSSDFFEADDLFIMDSYAFGDRGILMSQDKDSWLSPMARFDIPTGTVWPALDNPFGWIKWDDTQAMPVRAHGLKFFWWQMLAGDDADNVKGITLLDGKPCGKRTAFDAIYPITSEQDAAEFVVAAYARNNQDVLAEAECLFLRRSQSDSAYQYLMSLLTTPSLRDWVQSLHEYHKQHIQWIQEHPDNGENDS,193,87.290006,90.58224,0.97195,0.940903,0.98274,0.969921,69.603494,3.292234
7,Q5GQY6,718,90.340547,BASE+LOGAN+12CY,MKINFKSHQITDREVDFTQDDQVRLFEIMKQEFINHITYSRFDSIIHLTKFEQQLYNFCKDYNVDIEYSKDRVAFFTALIKEMKYQ,86,57.228919,60.270521,0.39317,0.64958,0.39527,0.648882,35.569681,3.041602
8,K8DUZ9,123,90.277649,BASE,MNAFEKRAQLKDIKPGAILYEVFSINGVKAEMGPKKIITGLPFQHLSIGLFVDAITVYDDWEGRQHMSLMDHNVIGRNNYNFHALFLSKKDAQEYVDQINNDQLPPELRETSRKMHREWIVRRAEDALYDM,131,67.014851,73.684647,0.78825,0.73338,0.8896,0.899537,44.113523,6.669796
9,A0A0A0RPE0,30,90.647829,BASE+LOGAN+12CY,MLILIMMVVAVVGPLVALIVTEVCDAKWDRKYNEVMEQRYAERRVRKWEQRYNAHSDTLAS,61,74.841414,74.841414,0.87703,1.0,0.87703,1.0,34.537524,0.0


In [3]:
from Bio import SeqIO


def check_a3m_lengths(filepath):
    """Report whether all sequences in an A3M alignment share one aligned length.

    Each record is read with ``SeqIO.parse(filepath, format="fasta")``. Lowercase
    letters (which the A3M format uses for insertions) are removed before the
    record is measured; every other character is counted as-is. The first
    record in the file is the reference, and every record whose stripped length
    differs from it is reported.

    Results are printed, not returned. Any exception raised while reading or
    checking the file is caught and printed as ``An error occurred: ...``.

    Args:
        filepath: Location of the alignment file, passed unchanged to
            ``SeqIO.parse``.

    Returns:
        None.
    """
    try:
        # One (id, length) entry per record, in file order. A dict keyed by
        # record.id would merge records that share an id, which can hide a
        # mismatch, under-count the total and even change the reference length.
        lengths = []
        for record in SeqIO.parse(filepath, format="fasta"):
            # Strip lowercase letters in a single pass over the plain string
            # instead of calling Seq.replace once per lowercase character.
            aligned = "".join(ch for ch in str(record.seq) if not ch.islower())
            lengths.append((record.id, len(aligned)))

        if not lengths:
            # Say what is wrong instead of surfacing an opaque IndexError.
            print(f"An error occurred: no sequences found in {filepath}")
            return

        first_record_id, reference_length = lengths[0]

        print(f"Reference sequence: '{first_record_id}' (Length: {reference_length})")

        mismatches = [
            (seq_id, length)
            for seq_id, length in lengths
            if length != reference_length
        ]
        for seq_id, length in mismatches:
            print(f"MISMATCH: '{seq_id}' has length {length}")

        if not mismatches:
            print(f"All {len(lengths)} sequences have the same length: {reference_length}")

    except Exception as e:
        # Deliberately broad: this is a best-effort notebook diagnostic, so a
        # missing or unparsable file is reported rather than raised.
        print(f"An error occurred: {e}")

# Sanity-check one alignment from the MSA directory. The path is built from
# MSA_PATH (defined in the paths cell) rather than repeating the absolute
# directory; str() keeps the argument a plain string as before.
file_to_check = str(MSA_PATH / "A0A1M7XUY2.a3m")
check_a3m_lengths(file_to_check)

Reference sequence: 'UniRef100_A0A1M7XUY2' (Length: 437)
All 2527 sequences have the same length: 437
