In [1]:
#!pip install pypdb
import requests
import os
import string
from collections import defaultdict
from collections import Counter
import re
import shutil
import statistics
from datetime import date
#import hail as hl
import glob
import time
import pytrimal
# Import from installed package
#from pypdb.clients.pdb.pdb_client import *
import dask.dataframe as dd
import json
import pandas as pd
#import plotly as px
from Bio.PDB import PDBParser, PDBIO, Select, MMCIFParser, Structure, Chain, Atom
from Bio.PDB import Model as Bio_Model
from Bio.PDB import Chain as Bio_chain
from Bio.SeqIO import PirIO
from Bio.SeqRecord import SeqRecord
from Bio import SeqIO, Align, PDB, Seq, AlignIO
#from Bio import pairwise2
from io import StringIO
from modeller import *
from modeller.automodel import *
from modeller.parallel import job, local_slave
import matplotlib.pyplot as plt
import matplotlib.style as style
import logging
import subprocess
import shlex
from subprocess import PIPE, run
import numpy as np
import math
import seaborn as sns
from concurrent.futures import ThreadPoolExecutor, wait
from functools import partial
from bs4 import BeautifulSoup  #required later to download SIFT files.
import atomium
from itertools import compress
from sklearn import metrics
from sklearn.cluster import AffinityPropagation
from sklearn.datasets import make_blobs
from sklearn.cluster import MeanShift
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import AgglomerativeClustering
from scipy.cluster.hierarchy import dendrogram, linkage
import matplotlib.gridspec as gridspe
import plotly.express as px
import plotly.io as pio
from bravado.client import SwaggerClient
from pycanal import Canal
#import hdbscan
from sklearn.cluster import DBSCAN
from pathlib import Path
import concurrent.futures
import threading
from threading import Lock
from concurrent.futures import ProcessPoolExecutor, as_completed
from packman import molecule
from packman.apps import predict_hinge

from sklearn.cluster import OPTICS, cluster_optics_dbscan

#logging.getLogger("requests").setLevel(logging.WARNING)


In [116]:
class DownloadPipe:
    """Download every PDB file needed for the downstream analysis of one UniProt ID.

    The constructor groups the requested chains by PDB entry and writes the
    comma-separated list of unique entries to ``pdb_list.csv`` inside
    ``work_dir``. ``paralellized_download`` hands that list to the
    ``batch_download_modified.sh`` script (expected inside ``work_dir``) and
    ``retrieve_meta`` collects per-entry meta data from the downloaded files.
    """

    def __init__(self, templates, work_dir, download_type="pdb"):
        """Store the configuration and prepare the download list.

        Args:
            templates: iterable of strings whose first four characters are the
                PDB ID and whose last character is the chain (e.g. ``'5ltu_A'``).
            work_dir: directory that holds the download script and receives
                all downloaded / generated files.
            download_type: stored for later use; the download itself currently
                always requests PDB files.
        """
        # Storage for potential use.
        self.work_dir = work_dir
        # Download PDB or also mmCIF (currently only PDB)
        self.download_type = download_type
        # All pdb files that we need to download
        self.pdbs_to_download = templates
        # The bash script location which will download the pdbs.
        self.download_script = os.path.join(work_dir, "batch_download_modified.sh")
        # The location for the temporary file that is required for the download_script as input.
        self.download_tmp = os.path.join(work_dir, "pdb_list.csv")
        # The list of chains that will be used later to fetch correct structures.
        self.chain_dict = self._setup_download_list()
        # Meta information per PDB code, filled by retrieve_meta().
        self.meta_dict = None
        # Reserved for high resolution structures if the user wants to separate based on resolution.
        self.high_resolution = None

    def paralellized_download(self):
        """Run ``_download_files`` through a thread pool and collect its output.

        A single task is submitted: it passes ``self.download_tmp`` (the
        comma-separated PDB-ID file), the flag ``'p'`` (download PDB files) and
        ``self.work_dir`` (the output directory) to the download script.

        Returns:
            list: the entries produced by ``_download_files`` (stdout lines of
            the script and/or ``"Error downloading: ..."`` strings).
        """
        collected = []
        with ThreadPoolExecutor() as executor:
            futures_pdb = [executor.submit(self._download_files, self.download_tmp, 'p', self.work_dir)]

            # Gather the results so that failures reported by the script are not lost.
            for future in as_completed(futures_pdb):
                collected.extend(future.result())

        return collected

    def _setup_download_list(self):
        """Group chains by PDB ID and write the unique IDs to ``self.download_tmp``.

        Returns:
            defaultdict(list): PDB ID -> list of requested chain letters.
        """
        chain_dict = defaultdict(list)
        # Parse through all PDBs and their associated chains which we need to download.
        for pdb in self.pdbs_to_download:
            # First four characters are the PDB ID, the last character is the chain
            # (so only single-character chain IDs are supported).
            chain_dict[pdb[:4]].append(pdb[-1])

        # We only want to download each pdb file once, no matter how many of its
        # chains were requested.
        with open(self.download_tmp, "w") as pdb_tar:
            pdb_tar.write(",".join(chain_dict.keys()))

        print(chain_dict)
        return chain_dict

    def _download_files(self, download_tmp, download_type, path)->list:
        """Invoke the download script once and return what it reported.

        Args:
            download_tmp: path of the comma-separated PDB-ID file (``-f``).
            download_type: single-letter format flag passed as ``-<flag>``.
            path: output directory (``-o``).

        Returns:
            list: one list with the script's stdout lines, plus an
            ``"Error downloading: ..."`` string for every failure detected.
        """
        results = []

        # Pass the arguments as a list so that paths containing whitespace
        # reach the script as single arguments.
        bash_curl_cmd_rdy = [self.download_script, "-f", download_tmp, "-o", path, f"-{download_type}"]

        try:
            result = run(bash_curl_cmd_rdy, stdout=PIPE, stderr=PIPE, universal_newlines=True)
            # Skip the last empty element produced by the trailing newline.
            results.append(result.stdout.split("\n")[:-1])
            if result.returncode != 0:
                results.append(
                    f"Error downloading: exit code {result.returncode}: {result.stderr.strip()}"
                )
        except Exception as e:
            results.append(f"Error downloading: {e}")

        return results

    def retrieve_meta(self)->dict:
        '''Collect meta information for every ``.pdb`` file in ``self.work_dir``.

        For each file the following fields are read through atomium:

        - Title
        - Keywords
        - PDBcode
        - Authors
        - Deposition date (ISO formatted, or None when the file has none)
        - Technique
        - Resolution
        - R_value
        - R_free
        - Classification
        - Organism
        - Expression System
        - Number of amino acids in the asymmetric unit
        - Mass of amino acids in the asymmetric unit (Da)
        - Number of amino acids in the biological unit (only if assembly 1 can be built)
        - Mass of amino acids in the biological unit (Da) (only if assembly 1 can be built)

        The result is stored in ``self.meta_dict``, written in a human readable
        form to ``human_meta_data.txt`` inside ``self.work_dir`` and returned.

        Returns:
            dict: 4-character file prefix -> dict of the fields above.
        '''
        # Grab all PDB files which contain the meta information.
        pdbs_to_retrieve = [f for f in os.listdir(self.work_dir) if f.endswith(".pdb")]

        # Here we store info about ALL pdbs.
        meta_dictionary = {}

        for pdbs in pdbs_to_retrieve:
            # We store the 4-digit code for easier access.
            pdb_code = pdbs[:4]
            # We need the full path to fetch meta data.
            fullp = os.path.join(self.work_dir, pdbs)

            pdb = atomium.open(fullp)
            deposition_date = pdb.deposition_date
            sub_dict = {
                "title": pdb.title,
                "key_words": pdb.keywords,
                "code": pdb.code,
                "authors": pdb.authors,
                # isoformat because it is a date object; guard against a missing date.
                "deposition_date": deposition_date.isoformat() if deposition_date is not None else None,
                "technique": pdb.technique,
                "resolution": pdb.resolution,
                "r_val": pdb.rvalue,
                "r_free": pdb.rfree,
                "classification": pdb.classification,
                "organism": pdb.source_organism,
                "expression_system": pdb.expression_system,
                "number_of_residues_asymmetric_unit": len(pdb.model.residues()),
                "mass_dalton_asymetric_unit": f"{pdb.model.mass:.2f}",
            }

            try:
                # Build the biological assembly (assembly number 1).
                assembly = pdb.generate_assembly(1)
                sub_dict['number_of_residues_biological_unit'] = len(assembly.residues())
                sub_dict['mass_dalton_biological_unit'] = f"{assembly.mass:.2f}"
            except Exception:
                # Best effort: the entry is kept without the biological-unit fields.
                print(f"We could not build the assembly for: {pdb_code}")

            meta_dictionary[pdb_code] = sub_dict

        # Store meta info as a txt file next to the structures. The instance's own
        # work_dir is used so the method does not depend on a notebook global.
        with open(os.path.join(self.work_dir, "human_meta_data.txt"), "w") as human_fh:
            for entries in meta_dictionary.values():
                for info, val in entries.items():
                    if isinstance(val, list):
                        val = ", ".join(map(str, val))
                    human_fh.write(f"{info}:{val}\n")

                # Terminate the separator line so the next record starts on its own line.
                human_fh.write("-" * 80 + "\n")

        self.meta_dict = meta_dictionary
        return meta_dictionary

        


In [173]:
class PDBCleaning:
    """Cleaning steps that operate on the PDB files fetched into ``work_dir``.

    Typical order of calls: ``setup_cutoff`` (optional resolution filter),
    ``parallel_shift_calculation`` (author vs. UniProt numbering offsets) and
    ``parallel_renumbering`` (apply the offsets to the files).
    """

    def __init__(self, work_dir, chain_dict=None, meta_dict=None):
        """Store the working directory and the optional chain / meta dictionaries.

        Args:
            work_dir: directory containing the ``.pdb`` files and helper scripts.
            chain_dict: mapping PDB ID -> list of chains (as built by DownloadPipe).
            meta_dict: mapping PDB ID -> meta data dict containing at least
                ``'resolution'`` and ``'code'``.
        """
        self.work_dir = work_dir
        self.chain_dict = chain_dict
        # Chain dict restricted to the structures kept by setup_cutoff(apply_filter=True).
        self.filtered_chain_dict = None
        self.meta_dict = meta_dict
        # "<pdb id>_<chain>" -> numbering shift, filled by parallel_shift_calculation().
        self.shifts = None
        # Lower-case PDB codes that passed the resolution cutoff.
        self.filtered_structures = None
        self.renumbered = False

    def setup_cutoff(self, cutoff, apply_filter=False):
        '''Select the structures whose resolution is at most ``cutoff``.

        Entries without a resolution value are skipped instead of raising.
        The kept (lower-case) codes are stored in ``self.filtered_structures``.

        Args:
            cutoff: maximal accepted resolution.
            apply_filter: when True, the kept codes are additionally restricted
                to ``.pdb`` files present in ``work_dir`` and ``self.chain_dict``
                is reduced to those entries (also stored in
                ``self.filtered_chain_dict``).
        '''
        # If there is no meta dict we cant proceed and filter based on resolution.
        if not self.meta_dict:
            print("We have no meta dict to implement a cutoff")
            return

        pdbs_to_keep = []
        for pdb_key, single_pdb in self.meta_dict.items():
            resolution = single_pdb.get('resolution')
            # No resolution recorded: cannot be compared against the cutoff.
            if resolution is None or resolution > cutoff:
                continue
            # Normalize to lower case in order to have uniform list members.
            code = single_pdb.get('code') or pdb_key
            pdbs_to_keep.append(code.lower())

        self.filtered_structures = pdbs_to_keep

        # Now if we directly want to apply the filter to remove entries that dont match our criteria.
        if apply_filter:
            on_disk = {f[:4].lower() for f in os.listdir(self.work_dir) if f.endswith(".pdb")}

            # Intersection of kept codes and files on disk, in the order of pdbs_to_keep.
            kept_on_disk = [code for code in pdbs_to_keep if code in on_disk]
            self.filtered_structures = kept_on_disk

            if self.chain_dict:
                kept_lookup = set(kept_on_disk)
                filtered_dict = {
                    pdb: chains
                    for pdb, chains in self.chain_dict.items()
                    if pdb[:4].lower() in kept_lookup
                }
                self.chain_dict = filtered_dict
                self.filtered_chain_dict = filtered_dict

    def parallel_shift_calculation(self):
        '''Compute, per chain, the offset between author and UniProt numbering.

        Only PDB IDs that are both present as ``.pdb`` file in ``work_dir`` and
        listed in ``self.chain_dict`` are queried. The merged result is stored
        in ``self.shifts``.
        '''
        if not self.chain_dict:
            print("We have no chain dict, so there is nothing to compute shifts for.")
            return

        files_on_disk = {f[0:4] for f in os.listdir(self.work_dir) if f.endswith(".pdb")}
        # The first 4 characters of each chain_dict key are the pdb code.
        pdbs_to_retrieve = files_on_disk & {key[:4] for key in self.chain_dict.keys()}

        link_path = "https://www.ebi.ac.uk/pdbe/api/mappings/uniprot"
        shift_dict = {}

        with ThreadPoolExecutor() as executor:
            tasks = ((link_path, pdb) for pdb in pdbs_to_retrieve)

            # Each task returns a small dict; merge them into one.
            for result in executor.map(self._calculate_shift, tasks):
                shift_dict.update(result)

        self.shifts = shift_dict

    def _calculate_shift(self, args):
        """Query the mapping service for one PDB ID and derive per-chain shifts.

        Args:
            args: tuple ``(link_path, pdb)``; the first four characters of
                ``pdb`` are appended to ``link_path`` to form the request URL.

        Returns:
            dict: ``"<pdb id>_<chain id>"`` -> ``unp_start - author_start``.
            When a chain has several mappings the last one wins. An empty dict
            is returned when the response body is not valid JSON.
        """
        link_path, pdb = args
        shift_dict = {}

        resp = self._get_url(f"{link_path}/{pdb[0:4]}")
        try:
            payload = resp.json()
        except ValueError:
            print(f"No usable mapping returned for: {pdb[0:4]}")
            return shift_dict

        for pdb_id, pdb_info in payload.items():
            for uniprot_info in pdb_info.get('UniProt', {}).values():
                for mapping in uniprot_info['mappings']:
                    unp_start = mapping['unp_start']
                    author_start = mapping['start']['author_residue_number']

                    # Without an author residue number we assume no offset.
                    if author_start is None:
                        author_start = unp_start

                    shift_dict[f"{pdb_id}_{mapping['chain_id']}"] = unp_start - author_start

        return shift_dict

    def parallel_renumbering(self):
        """Apply the previously computed shifts to every structure in ``chain_dict``.

        Requires ``parallel_shift_calculation`` to have been called first.
        Sets ``self.renumbered`` to True once all files were processed.
        """
        if not self.shifts:
            print("You first need to obtain shifts which will be used as reference in order to start renumbering.\nCall first .parallel_shift_calculation()")
            return

        relevant_files = list((self.chain_dict or {}).keys())

        with ProcessPoolExecutor() as executor:
            # Using partial to create a function with fixed parameters (shift_dict, path)
            renumber_structure_partial = partial(self._renumber_structure, shift_dict=self.shifts, path=self.work_dir)
            # Consume the iterator so that exceptions raised in a worker surface here.
            list(executor.map(renumber_structure_partial, relevant_files))

        self.renumbered = True

    def _renumber_structure(self, files, shift_dict, path):
        '''Run pdb_shiftres_by_chain.py on one PDB file for every chain that is shifted.

        Will apply renumbering to ALL structures if you did not set a cutoff previously and applied filter.
        If filter applied for resolution will only renumber those structures that are left after filtering.

        Args:
            files: 4-character PDB ID; the file ``<path>/<files>.pdb`` is edited in place.
            shift_dict: ``"<pdb id>_<chain>"`` -> shift.
            path: directory holding both the PDB file and the shift script.

        The original file is only replaced when the script exits successfully,
        so a failing script can no longer leave an empty PDB file behind.
        '''
        filepath = os.path.join(path, f"{files}.pdb")
        tmp_path = f"{filepath}_tmp"
        script = os.path.join(path, "pdb_shiftres_by_chain.py")

        for keys, vals in shift_dict.items():
            if files != keys[0:4]:
                continue

            shift = int(vals)
            # Dont renumber if there is no shift (shifts are numeric, not strings).
            if shift == 0:
                continue

            chain = keys[-1]
            # TODO(review): open question from the original code - shift by `shift` or `shift + 1`?
            bash_cmd_rdy = ["python", script, filepath, str(shift), chain]

            try:
                with open(tmp_path, "w") as fh_tmp:
                    result = run(bash_cmd_rdy, stdout=fh_tmp, stderr=PIPE, universal_newlines=True)
            except OSError as error:
                print(f"Renumbering of {files} chain {chain} could not be started: {error}")
                if os.path.exists(tmp_path):
                    os.remove(tmp_path)
                continue

            if result.returncode != 0:
                print(f"Renumbering of {files} chain {chain} failed: {result.stderr.strip()}")
                os.remove(tmp_path)
                continue

            # Now replace the original one with the temp file (after it was closed).
            os.replace(tmp_path, filepath)

    def _get_url(self, url, timeout=60):
        """GET ``url`` and return the response object.

        Non-OK responses are printed and still returned so the caller can
        decide what to do with them. Connection level failures are reported
        and re-raised.

        Args:
            url: address to request.
            timeout: seconds to wait for the server before giving up.
        """
        try:
            response = requests.get(url, timeout=timeout)
        except requests.RequestException as error:
            print(f"Request to {url} failed: {error}")
            raise

        if not response.ok:
            print(response.text)

        return response



    

In [221]:
class PdbBuilder:
    """Build assemblies for a set of PDB files and relabel ("rechain") their chains.

    For every structure the file is opened with atomium, classified as NMR
    ensemble or not, split into single-chain files, relabelled and (for
    oligomers) merged again through the ``pdb_merge.py`` / ``pdb_tidy.py``
    scripts that are expected inside ``work_dir``.
    """

    def __init__(self, work_dir, structures):
        """
        Args:
            work_dir: directory with the ``.pdb`` files and helper scripts.
            structures: iterable of file stems; ``<work_dir>/<stem>.pdb`` is processed.
        """
        self.work_dir = work_dir
        # Here we store the structures that need to be built.
        self.structures = structures

    def build_assembly(self):
        """Process all structures in parallel.

        Returns:
            defaultdict(str): PDB file name -> number of accepted chains
            (oligomeric state). Files that failed to process are missing.
        """
        # These files need to be opened, rechained and assemblies built.
        full_pdb_paths = [os.path.join(self.work_dir, f"{file}.pdb") for file in self.structures]

        oligostates = defaultdict(str)

        # This letterdict is used for rechaining: 0 -> 'A', ..., 25 -> 'Z'.
        letterdict = {i: chr(65 + i) for i in range(26)}

        with ProcessPoolExecutor() as executor:
            process_func = partial(self._process_pdb, letterdict=letterdict)

            for result in executor.map(process_func, full_pdb_paths):
                oligostates.update(result)

        return oligostates

    def _process_pdb(self, path:str,letterdict:dict)->dict:
        """Dispatch one file to the NMR or the non-NMR handler.

        Any exception is printed and turned into an empty dict so that one
        broken file does not stop the whole batch.

        Returns:
            dict: ``{file name: oligomeric state}`` or ``{}`` on failure.
        """
        try:
            pdb_file_name = os.path.basename(path)

            pdb_file = atomium.open(path)
            # We filter according to model count: more than 5 deposited models is treated as NMR.
            if len(pdb_file.models) > 5:
                print("we go into _NMR_ensemble")
                # This gives back the oligomeric state of the NMR structure (mostly monomer).
                return {pdb_file_name: self._NMR_ensemble(path=path, letterdict=letterdict)}

            return {pdb_file_name: self._non_NMR_structures(path=path, letterdict=letterdict)}

        except Exception as error:
            print("process pdb did not work")
            print(error)

            return {}

    @staticmethod
    def _select_accepted_chains(seq_chains):
        """Return the IDs of the chains that count as subunits.

        Chains are visited from longest to shortest; a chain is accepted when
        it is the first one or longer than 80% of the shortest chain accepted
        so far. This keeps small peptides from being counted as subunits
        regardless of the order in which the chains were listed.

        Args:
            seq_chains: list of ``(chain id, sequence length)`` tuples.
        """
        accepted_chains = []
        min_accepted_length = float('inf')

        for chain_id, length in sorted(seq_chains, key=lambda item: item[1], reverse=True):
            if not accepted_chains or length > 0.8 * min_accepted_length:
                accepted_chains.append(chain_id)
                min_accepted_length = min(min_accepted_length, length)

        return accepted_chains

    @staticmethod
    def _classify_oligomer(accepted_chains):
        """Classify accepted chain IDs as homo-oligomer, monomer or hetero-oligomer.

        Repeated IDs mean homo-oligomer, a single ID means monomer, several
        distinct IDs mean hetero-oligomer; an empty list yields None.
        """
        distinct = set(accepted_chains)

        if len(distinct) != len(accepted_chains):
            return "homo_oligomer"
        if len(accepted_chains) == 1:
            return "monomer"
        if len(accepted_chains) > 1:
            return "hetero_oligomer"
        return None

    @staticmethod
    def _chain_index_from_path(path_to_pdb):
        """Extract the integer after the last ``_`` of the file stem (``3h95_12.pdb`` -> 12)."""
        stem = os.path.splitext(os.path.basename(path_to_pdb))[0]
        return int(stem.rsplit("_", 1)[-1])

    def _merge_and_tidy(self, chain_paths, output_stem):
        """Merge single-chain files and tidy the result into ``<output_stem>.pdb``.

        Runs ``pdb_merge.py`` on ``chain_paths`` into a temporary file, runs
        ``pdb_tidy.py`` on that file and removes the temporary file afterwards.

        Returns:
            str: path of the tidied output file.
        """
        merge_output_file = f"{output_stem}_tmp.pdb"
        tidy_output_file = f"{output_stem}.pdb"

        # Arguments are passed as a list so paths with whitespace stay intact.
        merge_command_rdy = ["python", os.path.join(self.work_dir, "pdb_merge.py"), *chain_paths]
        with open(merge_output_file, "w") as fh_out:
            result_pdbs = run(merge_command_rdy, stdout=fh_out, stderr=PIPE, universal_newlines=True)
        if result_pdbs.returncode != 0:
            print(f"pdb_merge.py failed for {output_stem}: {result_pdbs.stderr.strip()}")

        tidy_command_rdy = ["python", os.path.join(self.work_dir, "pdb_tidy.py"), merge_output_file]
        with open(tidy_output_file, "w") as fh_out2:
            results_tidy = run(tidy_command_rdy, stdout=fh_out2, stderr=PIPE, universal_newlines=True)
        if results_tidy.returncode != 0:
            print(f"pdb_tidy.py failed for {output_stem}: {results_tidy.stderr.strip()}")

        # We remove the untidy intermediate file.
        os.remove(merge_output_file)
        return tidy_output_file

    # Helper function for XRAY and CRYO-EM ensembles.
    def _non_NMR_structures(self, path:str, letterdict:dict):
        """Split the first assembly of an X-ray / cryo-EM file into chains and rechain them.

        The accepted chains are saved as ``<work_dir>/<pdb id>_<index>.pdb`` and
        handed to ``_merge_pdb_chains`` for relabelling / merging.

        Returns:
            int: number of accepted chains (oligomeric state).

        Raises:
            ValueError: if the file declares no assembly.
        """
        pdb_name = os.path.basename(path)[:4]

        pdb_file = atomium.open(path)
        if not pdb_file.assemblies:
            raise ValueError(f"{pdb_name} has no biological assembly records")

        # We take the first assembly (the biological unit built from the asymmetric unit).
        assembly = pdb_file.generate_assembly(1)

        # Part 1: decide which chains count as subunits.
        seq_chains = [(chain.id, len(chain.sequence)) for chain in assembly.chains()]
        accepted_chains = self._select_accepted_chains(seq_chains)
        # This excludes small peptides etc. from being mistaken as oligomer subunits.
        oligostate = len(accepted_chains)

        # Part 2: investigate the oligomeric state.
        accepted_chains_set = set(accepted_chains)
        oligomeric_status = self._classify_oligomer(accepted_chains)

        # Part 3: save the accepted chains individually (still with their original IDs).
        # Paths are appended in ascending index order, so no extra sorting is needed.
        path_list = []
        for idx, chain in enumerate(assembly.chains()):
            if chain.id in accepted_chains_set:
                path_to_pdb = f"{self.work_dir}/{pdb_name}_{idx}.pdb"
                path_list.append(path_to_pdb)
                chain.save(path_to_pdb)

        # Part 4: rechain (and merge, for oligomers) according to the oligomeric status.
        self._merge_pdb_chains(path_list, pdb_name=pdb_name, oligomeric_status=oligomeric_status,
                      letterdict=letterdict, accepted_chains = accepted_chains, accepted_chains_set=accepted_chains_set)

        # We return the oligostate of this file; the caller merges it into its result dict.
        return oligostate

    # Helper function for NMR ensembles.
    def _NMR_ensemble(self, path:str, letterdict:dict):
        """Split an NMR ensemble into one PDB file per model and chain.

        Every chain of model ``i`` is relabelled by position (``letterdict``)
        and saved as ``<dir>/<pdb id>_<i>_<letter>.pdb``. Models with several
        chains are additionally merged into ``<dir>/<pdb id>_<i>_<letters>.pdb``.

        Returns:
            int: 1, or the number of chains of the last multi-chain model.
        """
        print(f"we currently open with atomium: {path}")

        pdb_name = os.path.basename(path)[:4] # 4 digit ID
        base_dir = os.path.dirname(path)

        pdb_file = atomium.open(path)

        oligostate = 1 # default initialized

        for i, model in enumerate(pdb_file.models):
            model_chains = list(model.chains())
            # True if the model has multiple chains, in which case we need to merge.
            multi_chain = len(model_chains) > 1
            chain_paths = []
            new_chains = []

            for j, chain in enumerate(model_chains):
                # Relabel the chain by its position within the model.
                new_chain = chain.copy(id=letterdict[j])

                save_location = f"{base_dir}/{pdb_name}_{i}_{letterdict[j]}.pdb"

                print(f"We save NMR structure chain at : {save_location=}")
                new_chain.save(save_location) # e.g 4ND5_0_A.pdb 4ND5_1_A etc..

                if multi_chain:
                    chain_paths.append(save_location)
                    new_chains.append(new_chain.id)

            if multi_chain:
                oligostate = len(new_chains)

                print(f"This is {new_chains=}")
                # Stem without extension; _merge_and_tidy appends ".pdb" exactly once.
                save_nmr_oligomer = f"{base_dir}/{pdb_name}_{i}_{''.join(new_chains)}"
                self._merge_and_tidy(chain_paths, save_nmr_oligomer)

        return oligostate

    def _merge_pdb_chains(self, path_list:list, pdb_name:str, oligomeric_status:str, letterdict:dict,
                     accepted_chains:list, accepted_chains_set:set):
        """Route the single-chain files to the rechaining routine matching ``oligomeric_status``.

        Unknown statuses are only reported.
        """
        if oligomeric_status == "homo_oligomer":
            print("we move into pure oligomer now!")
            self._pure_oligomer_rechaining(path_list=path_list, letterdict=letterdict, pdb_name=pdb_name)

        elif oligomeric_status == "hetero_oligomer":
            print("we move into hetero oligomer now!")
            self._mixed_oligomer_rechaining(path_list=path_list, letterdict=letterdict, pdb_name=pdb_name,
                                  accepted_chains=accepted_chains,
                                   accepted_chains_set=accepted_chains_set)

        elif oligomeric_status == "monomer":
            self._monomeric_rechaining(path_list=path_list, letterdict=letterdict, pdb_name=pdb_name)

        else:
            print(f"There was an issue with: {oligomeric_status=}")

        return

    def _pure_oligomer_rechaining(self, path_list:list, letterdict:dict, pdb_name:str):
        """Relabel the chains of a homo-oligomer by their file index and merge them.

        Each input file ``<pdb id>_<index>.pdb`` is saved again as
        ``<pdb id>_<letter>.pdb`` with ``letter = letterdict[index]``; the
        relabelled files are then merged into ``<work_dir>/<pdb id>_<letters>.pdb``.
        """
        # The directory does not change, so no reason to evaluate it inside the loop.
        directory = os.path.dirname(path_list[0])

        # Paths of the relabelled chains, and their letters, for the later merge.
        lst_to_merge_paths = []
        chain_lst = []

        parser = PDBParser(QUIET=True)

        for path_to_pdb in path_list:
            # Full index from the file name (works for indices >= 10 as well).
            new_chain = letterdict[self._chain_index_from_path(path_to_pdb)]

            individual_structure = parser.get_structure("default", path_to_pdb)

            for models in individual_structure:
                for chain in models:
                    if chain.id != new_chain:
                        chain.id = new_chain

                    io = PDBIO()
                    io.set_structure(chain)
                    save_location = os.path.join(directory, f"{pdb_name}_{chain.id}.pdb")
                    lst_to_merge_paths.append(save_location)
                    chain_lst.append(chain.id)
                    io.save(save_location)

        # Now we merge the single chains into one tidy pdb.
        self._merge_and_tidy(lst_to_merge_paths, f"{self.work_dir}/{pdb_name}_{''.join(chain_lst)}")

    def _mixed_oligomer_rechaining(self, accepted_chains:list,
                               accepted_chains_set:set,
                               path_list:list,
                               letterdict:dict, pdb_name:str):
        """Relabel the chains of a hetero-oligomer block-wise and merge them.

        With ``shift`` distinct chain IDs and ``blocksize`` copies of each, the
        chain at position ``i`` of block ``b`` receives ``letterdict[b + i*shift]``
        (e.g. A A B B C C becomes A D B E C F). Files from ``path_list`` are
        consumed in order.
        """
        chain_seq_len = len(accepted_chains)       # e.g 6
        shift = len(accepted_chains_set)           # e.g 3
        blocksize = chain_seq_len // shift         # e.g 2
        block_count = chain_seq_len // blocksize   # e.g 3

        j = 0
        new_chain_seq = []
        lst_to_merge_paths = []

        parser = PDBParser(QUIET=True)

        for blocks in range(block_count):
            for i in range(blocksize):
                # first iteration A D, second iteration B E, third iteration C F
                new_chain = letterdict[blocks + i * shift]
                new_chain_seq.append(new_chain)

                path_to_pdb = path_list[j]
                directory = os.path.dirname(path_to_pdb)
                j += 1

                # Open the correct pdb and rechain it.
                structure_template = parser.get_structure("default", path_to_pdb)

                for models in structure_template:
                    for chain in models:
                        if chain.id != new_chain:
                            chain.id = new_chain

                        io = PDBIO()
                        io.set_structure(chain)
                        save_location = f"{directory}/{pdb_name}_{chain.id}.pdb"
                        io.save(save_location)
                        lst_to_merge_paths.append(save_location)

        self._merge_and_tidy(lst_to_merge_paths, f"{self.work_dir}/{pdb_name}_{''.join(new_chain_seq)}")

    def _monomeric_rechaining(self, path_list:list,
                          letterdict:dict,
                          pdb_name:str):
        """Relabel the single chain of a monomer to ``A``.

        The structure in ``path_list[0]`` is saved as ``<pdb id>_A.pdb`` in the
        same directory and the input file is removed afterwards.
        """
        parser = PDBParser(QUIET=True)
        # We only have 1 path in this list.
        pdb = path_list[0]
        dir_name = os.path.dirname(pdb)
        structure_template = parser.get_structure("default", pdb)

        # The new chain ID is always A.
        new_chain = "A"
        for model in structure_template:
            for original_chain in model:
                if original_chain.id != new_chain:
                    original_chain.id = new_chain

        save_path = os.path.join(dir_name, f"{pdb_name}_{new_chain}.pdb")

        # Save the modified structure and drop the index-named intermediate file.
        io = PDBIO()
        io.set_structure(structure_template)
        io.save(save_path)
        os.remove(pdb)
        

In [224]:
# Template identifiers to fetch. Every entry has the form "<4-character ID>_<chain letter>"
# (presumably an RCSB PDB ID plus chain - confirm against DownloadPipe).
templates = ['5ltu_A', '5ltu_B', '7nnj_A', '7nnj_B', '2duk_A', '2duk_B', '3mcf_A', '3mcf_B', '7tn4_A', '2q9p_A', '2fvv_A', '6pck_A', '6pcl_A', '6wo7_A', '6wo8_A', '6wo9_A', '6woa_A', '6wob_A', '6woc_A', '6wod_A', '6woe_A', '6wof_A', '6wog_A', '6woh_A', '6woi_A', '7aut_A', '7aui_A', '7auk_A', '7aul_A', '7aum_A', '7aun_A', '7auo_A', '7aup_A', '7auq_A', '7aur_A', '7aus_A', '7auu_A', '7auj_A', '3h95_A', '3i7u_A', '3i7u_B', '3i7u_C', '3i7u_D', '3i7v_A', '3i7v_B', '4hfq_A', '4hfq_B']
# Working directory handed to every pipeline stage below.
# NOTE(review): absolute, user-specific path - the notebook will not run elsewhere without editing it.
work_dir = "/home/micnag/bioinformatics/rcsb_retrieved_pdbs/Test_OOP_pipeline"

# Stage 1: download the templates and collect their metadata.
PdbEnsemble = DownloadPipe(templates=templates, work_dir=work_dir)
PdbEnsemble.paralellized_download()
PdbEnsemble.retrieve_meta()
# Keep the two attributes the cleaning stage is constructed from.
PdbEnsemble_meta = PdbEnsemble.meta_dict
PdbEnsemble_chains = PdbEnsemble.chain_dict
#shifts = PdbEnsemble.parallel_shift_calculation()



defaultdict(<class 'list'>, {'5ltu': ['A', 'B'], '7nnj': ['A', 'B'], '2duk': ['A', 'B'], '3mcf': ['A', 'B'], '7tn4': ['A'], '2q9p': ['A'], '2fvv': ['A'], '6pck': ['A'], '6pcl': ['A'], '6wo7': ['A'], '6wo8': ['A'], '6wo9': ['A'], '6woa': ['A'], '6wob': ['A'], '6woc': ['A'], '6wod': ['A'], '6woe': ['A'], '6wof': ['A'], '6wog': ['A'], '6woh': ['A'], '6woi': ['A'], '7aut': ['A'], '7aui': ['A'], '7auk': ['A'], '7aul': ['A'], '7aum': ['A'], '7aun': ['A'], '7auo': ['A'], '7aup': ['A'], '7auq': ['A'], '7aur': ['A'], '7aus': ['A'], '7auu': ['A'], '7auj': ['A'], '3h95': ['A'], '3i7u': ['A', 'B', 'C', 'D'], '3i7v': ['A', 'B'], '4hfq': ['A', 'B']})


In [243]:
# Stage 2: clean the downloaded structures, using the metadata and chain
# dictionaries produced by the download stage.
PDB_Cleaner = PDBCleaning(work_dir=work_dir, meta_dict=PdbEnsemble_meta, chain_dict=PdbEnsemble_chains)
# NOTE(review): the unit of `cutoff` is not visible here (presumably resolution in Angstrom) - confirm.
PDB_Cleaner.setup_cutoff(cutoff=2.5, apply_filter=True)  #apply filter to only include structures that are of good quality
PDB_Cleaner.parallel_shift_calculation()  # compute shift for each structure
PDB_Cleaner.parallel_renumbering()  # renumber based on shifts.
#PDB_cleaned_ensemble.chain_dict

# Structures kept by the cleaner; passed to the builder stage below.
struct = PDB_Cleaner.filtered_structures

In [244]:
# Show the structure IDs taken from PDB_Cleaner.filtered_structures.
print(struct)

['3i7v', '2q9p', '7auu', '3mcf', '7aur', '7aup', '3h95', '7auq', '6woa', '6woi', '4hfq', '6wod', '7auk', '6pck', '6wob', '7aun', '7aut', '7auj', '6wof', '6wo9', '5ltu', '7nnj', '6wo8', '7aui', '6wo7', '7aum', '3i7u', '2fvv', '6woc', '7aul', '7aus', '6pcl', '6wog', '7tn4', '6woh', '6woe']


In [245]:
# Stage 3: set up the builder on the same working directory, restricted to
# the structures selected by the cleaning stage.
PDB_Builder = PdbBuilder(work_dir=work_dir, structures=struct) #structures that are filtered

In [246]:
# Run the assembly build. The displayed result maps each "<id>.pdb" file name
# to an integer (presumably the number of chains in the assembly - confirm).
PDB_Builder.build_assembly()

we move into pure oligomer now!
we move into hetero oligomer now!
we move into hetero oligomer now!
we move into hetero oligomer now!


defaultdict(str,
            {'3i7v.pdb': 1,
             '2q9p.pdb': 1,
             '7auu.pdb': 1,
             '3mcf.pdb': 1,
             '7aur.pdb': 1,
             '7aup.pdb': 1,
             '3h95.pdb': 2,
             '7auq.pdb': 1,
             '6woa.pdb': 1,
             '6woi.pdb': 1,
             '4hfq.pdb': 2,
             '6wod.pdb': 1,
             '7auk.pdb': 1,
             '6pck.pdb': 1,
             '6wob.pdb': 1,
             '7aun.pdb': 1,
             '7aut.pdb': 1,
             '7auj.pdb': 1,
             '6wof.pdb': 1,
             '6wo9.pdb': 1,
             '5ltu.pdb': 2,
             '7nnj.pdb': 1,
             '6wo8.pdb': 1,
             '7aui.pdb': 1,
             '6wo7.pdb': 1,
             '7aum.pdb': 1,
             '3i7u.pdb': 4,
             '2fvv.pdb': 1,
             '6woc.pdb': 1,
             '7aul.pdb': 1,
             '7aus.pdb': 1,
             '6pcl.pdb': 1,
             '6wog.pdb': 1,
             '7tn4.pdb': 1,
             '6woh.pdb': 1,
   