In [13]:
import os
import numpy as np
from Bio.PDB import PDBParser
from tqdm import tqdm
import json

In [14]:
def get_confidence_and_len(pdb_file):
    """Summarise the C-alpha B-factors of a PDB file.

    Parses ``pdb_file`` with Biopython's ``PDBParser`` and collects the
    B-factor of every atom named ``'CA'`` whose B-factor is not ``None``.
    NOTE(review): for predicted models the B-factor column presumably holds a
    per-residue confidence score (hence the function name) -- confirm against
    the data source.

    Args:
        pdb_file: Path to the PDB file to parse.

    Returns:
        A ``(mean_bfactor, count)`` tuple, where ``count`` is the number of
        CA atoms that contributed, or ``(None, None)`` when no such atom was
        found.
    """
    structure = PDBParser(QUIET=True).get_structure('pdb_structure', pdb_file)

    ca_bfactors = []
    for atom in structure.get_atoms():
        # Only C-alpha atoms count: one per residue, so the tally doubles as
        # the chain length used by the caller.
        if atom.get_name() != 'CA':
            continue
        bfactor = atom.get_bfactor()
        if bfactor is not None:
            ca_bfactors.append(bfactor)

    if not ca_bfactors:
        return None, None
    return np.mean(ca_bfactors), len(ca_bfactors)

In [15]:
def clean_pdb_folder(folder_path, confidence_threshold=70, min_length=50, max_length=100):
    """Delete every ``.pdb`` file in a folder that fails the quality filters.

    Each file is scored with ``get_confidence_and_len``. A file is kept only
    when its mean confidence is at least ``confidence_threshold`` and its
    length lies in ``[min_length, max_length]``.

    WARNING: files are removed from disk with ``os.remove`` and cannot be
    recovered; run this on a copy of the data if the originals matter.

    Args:
        folder_path: Directory to scan (non-recursive). Only names ending in
            ``.pdb`` are considered; everything else is left alone.
        confidence_threshold: Minimum mean confidence a file needs to be kept.
        min_length: Minimum length (inclusive) a file needs to be kept.
        max_length: Maximum length (inclusive) a file may have to be kept.

    Returns:
        None. The folder is modified in place.
    """
    file_list = [filename for filename in os.listdir(folder_path) if filename.endswith('.pdb')]
    for filename in tqdm(file_list, desc="Processing files"):
        pdb_file = os.path.join(folder_path, filename)
        confidence, length = get_confidence_and_len(pdb_file)

        # get_confidence_and_len returns (None, None) when a file has no CA
        # atoms. Comparing None with a number raises TypeError in Python 3,
        # which used to abort the whole clean-up midway. Such a file cannot
        # satisfy the filters, so treat it as failing them.
        unscorable = confidence is None or length is None
        if (
            unscorable
            or confidence < confidence_threshold
            or length < min_length
            or length > max_length
        ):
            os.remove(pdb_file)

In [16]:
# Destructive step: permanently deletes the .pdb files in this folder that
# fail the confidence / length filters below.
clean_pdb_folder(
    "swissprot_pdb_v4",
    confidence_threshold=70,
    min_length=50,
    max_length=100,
)

Processing files: 100%|██████████| 115/115 [00:01<00:00, 110.54it/s]
