In [1]:
from collections import defaultdict
from pathlib import Path
from typing import Optional

from Bio import SeqIO, pairwise2
from Bio.SubsMat import MatrixInfo as matlist



In [2]:
class FASTAinfo:
    """Container for one FASTA entry and its similarity score.

    Attributes:
        name: identifier of the entry.
        type_info: label of the group the entry belongs to.
        sequence: the sequence as a plain string.
        score: similarity score; ``None`` until a score has been assigned.
    """

    def __init__(
        self,
        name: str,
        type_info: str,
        sequence: str,
        score: Optional[float] = None,
    ):
        self.name = name
        self.type_info = type_info
        self.sequence = sequence
        # Entries are created with score=None and receive a float percentage
        # later, so the annotation is Optional[float] rather than int.
        self.score = score

In [3]:
# Load the list of IDs to exclude, one per line of the text file.
# NOTE(review): the file name suggests these are entries longer than 300 - confirm.
error_file = "/home/light/mqy/ncaa/notebooks/tmp/id_length_larger_than_300_20241104.txt"
with open(error_file, "r") as id_file:
    # Iterating the handle yields the same lines as readlines(); strip() drops
    # the trailing newline and any surrounding whitespace.
    error_id = [raw_line.strip() for raw_line in id_file]


In [4]:
# Collect the first sequence of every training entry's "<name>-pep.fasta".
# An entry is used only if it is a directory, contains no file matching
# "*-protein-*.fasta", and its name is not listed in error_id.
training_fastas = []
training_set_path = Path("/home/light/mqy/ncaa/data/training")
excluded_ids = set(error_id)  # set lookup instead of scanning the list per directory
for subdir in training_set_path.iterdir():
    if not subdir.is_dir() or subdir.name in excluded_ids:
        continue
    # any() stops at the first match instead of materialising the whole glob.
    if any(subdir.glob("*-protein-*.fasta")):
        continue
    fasta_path = subdir / f"{subdir.name}-pep.fasta"
    # Only the first record is needed, so stop parsing after it; the explicit
    # `with` closes the handle even though the parser is not exhausted.
    with open(fasta_path) as handle:
        first_record = next(SeqIO.parse(handle, "fasta"), None)
    if first_record is None:
        # Previously surfaced as a bare IndexError with no hint of which file.
        raise ValueError(f"No FASTA records found in {fasta_path}")
    training_fastas.append(str(first_record.seq))

In [5]:
# Build {entry name -> FASTAinfo} for the test set. The directory layout read
# here is <test_root>/<type_info>/<entry>/<entry>-pep.fasta; the first-level
# directory name becomes the entry's type_info.
# Note: entries are keyed by name only, so an entry name that appears under
# two type directories keeps only the one visited last.
test_fastas = {}
test_root_path = Path("/home/light/mqy/ncaa/data/test_all")
for subdir in test_root_path.iterdir():
    if not subdir.is_dir():
        continue
    type_info = subdir.name
    for subsubdir in subdir.iterdir():
        if not subsubdir.is_dir():
            continue
        fasta_name = subsubdir.name
        fasta_path = subsubdir / f"{fasta_name}-pep.fasta"
        # Only the first record is needed, so stop parsing after it; the
        # explicit `with` closes the handle even though parsing stops early.
        with open(fasta_path) as handle:
            first_record = next(SeqIO.parse(handle, "fasta"), None)
        if first_record is None:
            # Previously surfaced as a bare IndexError with no hint of which file.
            raise ValueError(f"No FASTA records found in {fasta_path}")
        test_fastas[fasta_name] = FASTAinfo(
            name=fasta_name,
            type_info=type_info,
            sequence=str(first_record.seq),
            score=None,  # filled in once similarities are computed
        )

In [6]:
def align_sequences(seq_a, seq_b, match=2, mismatch=-1, gap_open=-0.5, gap_extend=-0.1):
    """Globally align two sequences and return the first alignment reported.

    Args:
        seq_a: first sequence.
        seq_b: second sequence.
        match: score for a pair of identical characters.
        mismatch: score for a pair of differing characters.
        gap_open: score applied when a gap is opened.
        gap_extend: score applied when a gap is extended.

    Returns:
        The first element of the list returned by
        ``pairwise2.align.globalms``.

    Raises:
        ValueError: if ``globalms`` returns no alignment.
    """
    alignments = pairwise2.align.globalms(
        seq_a, seq_b, match, mismatch, gap_open, gap_extend
    )
    if not alignments:
        # Indexing an empty result would raise an uninformative IndexError.
        raise ValueError(
            f"No alignment produced for sequences of length {len(seq_a)} and {len(seq_b)}"
        )
    return alignments[0]

In [7]:
# For every test entry, store the highest similarity percentage found against
# any training sequence. The running best starts at 0, so percentages at or
# below zero never replace it.
for fasta_info in test_fastas.values():
    best_similarity = 0
    for training_seq in training_fastas:
        alignment = align_sequences(fasta_info.sequence, training_seq)
        aligned_len_a = len(alignment.seqA)
        aligned_len_b = len(alignment.seqB)
        # Sum of both aligned lengths minus their difference,
        # i.e. twice the shorter of the two.
        max_score = aligned_len_a + aligned_len_b - abs(aligned_len_a - aligned_len_b)
        similarity = alignment.score / max_score * 100
        # max() keeps the current best unless the new value is strictly larger.
        best_similarity = max(best_similarity, similarity)

    fasta_info.score = best_similarity

In [8]:
# Bucket the test entries by their type_info label.
type_groups = defaultdict(list)
for entry in test_fastas.values():
    type_groups[entry.type_info].append(entry)

# Print every entry of every group with its score, followed by a blank line
# per group. No score threshold is applied here.
for group_label, group_entries in type_groups.items():
    print(f"Type: {group_label}")
    for entry in group_entries:
        print(f"  Name: {entry.name} with Score: {entry.score:.3f}")
    print()

Type: linear_monomer
  Name: 6n68A with Score: 16.286
  Name: 2norA with Score: 100.000
  Name: 1v50A with Score: 100.000
  Name: 2l87A with Score: 17.115
  Name: 2mzaA with Score: 18.182
  Name: 2n0nA with Score: 14.286
  Name: 2LDA with Score: 11.429
  Name: 6cmhA with Score: 100.000
  Name: 3zs2D with Score: 15.326
  Name: 3cmhA with Score: 100.000
  Name: 1vm5A with Score: 15.227
  Name: 2LDD with Score: 11.786
  Name: 1geaA with Score: 13.824
  Name: 2fbsN with Score: 17.292
  Name: 5kgyA with Score: 16.744
  Name: 2LDC with Score: 11.739
  Name: 6QXB with Score: 22.222
  Name: 1mxqA with Score: 100.000
  Name: 2mylA with Score: 14.286
  Name: 7jgyA with Score: 16.286
  Name: 2mz2A with Score: 18.182
  Name: 2mymA with Score: 14.394

Type: linear_complex
  Name: 1d5eA with Score: 16.250
  Name: 6gb1B with Score: 27.564
  Name: 3oqza with Score: 15.385
  Name: 3or0a with Score: 11.346
  Name: 2fx8P with Score: 11.200
  Name: 2k7lB with Score: 11.071
  Name: 2rlnS with Score: 15.385