In [2]:
import pyhmmer

def get_hmm_names(hmm_file_path):
    """
    Collect the name of every HMM stored in a multi-HMM file.

    Parameters:
    -----------
    hmm_file_path : str
        Path to the HMM file

    Returns:
    --------
    list
        Decoded HMM names, in the order they are read from the file
    """
    with pyhmmer.plan7.HMMFile(hmm_file_path) as profiles:
        # The list is fully built before the file is closed on leaving `with`.
        return [profile.name.decode() for profile in profiles]

This notebook parses HMM names and reorganizes HMM databases.

In [8]:
import re

In [None]:
# Peek at the first five entries.
# NOTE(review): virion_list is not defined in any visible cell — presumably the
# result of get_hmm_names() on the virion HMM file; confirm and add that cell.
virion_list[:5]

['seeker_phage_cluster0/PFAM:PF03354.15-Terminase',
 'seeker_phage_cluster10/PFAM:PF03354.15-Terminase',
 'seeker_phage_cluster11/PFAM:PF04860.12-Portal',
 'seeker_phage_cluster12/PDB:3CPE_A-Terminase',
 'seeker_phage_cluster13/PFAM:PF06381.11-Portal']

In [22]:

# Show the text that follows the first hyphen of each of the first five names.
for hmm_id in virion_list[:5]:
    print(re.sub("(.*?)-", "", hmm_id, count=1))

Terminase
Terminase
Portal
Terminase
Portal


In [26]:
# Count how many entries of virion_list share each parsed annotation label.
name_dict = {}
for hmm_id in virion_list:
    # Drop everything up to and including the first hyphen, strip trailing
    # hyphens, and lower-case so labels differing only by case are merged.
    annot_name = re.sub("(.*?)-", "", hmm_id, count=1).rstrip("-").lower()
    name_dict[annot_name] = name_dict.get(annot_name, 0) + 1


In [52]:
from typing import Dict, List, LiteralString
import pyhmmer

def get_hmm_categories(hmm_file_path):
    """
    Group the HMMs of a multi-HMM file by their parsed annotation label.

    The label is the part of the HMM name after its first "-", with trailing
    hyphens stripped and lower-cased. A name without any hyphen is used whole
    (lower-cased) as its own label.

    Parameters:
    -----------
    hmm_file_path : str
        Path to the HMM file

    Returns:
    --------
    dict
        Parsed annotation label -> list of full HMM names carrying that label,
        in the order they are read from the file
    """
    categories: Dict[str, List[str]] = {}

    with pyhmmer.plan7.HMMFile(hmm_file_path) as profiles:
        for profile in profiles:
            # full ID, decoded once and reused for both key and value
            full_id = profile.name.decode()

            # parsed label: remove everything up to and including the first hyphen
            label = re.sub("(.*?)-", "", full_id, count=1).rstrip("-").lower()

            categories.setdefault(label, []).append(full_id)

    return categories

In [53]:
# Group the virion HMMs by parsed annotation label.
virion_hmm_path = "/Users/michaeltisza/mike_tisza/github_repos/Cenote-Taker3/hmmscan_DBs/v3.1.1/Virion_HMMs.h3m"
virion_dict = get_hmm_categories(virion_hmm_path)

In [54]:
# Group the DNA replication HMMs by parsed annotation label.
dnarep_hmm_path = "/Users/michaeltisza/mike_tisza/github_repos/Cenote-Taker3/hmmscan_DBs/v3.1.1/DNA_rep_HMMs.h3m"
dnarep_dict = get_hmm_categories(dnarep_hmm_path)

In [3]:
import pyhmmer
import os
import re

def split_hmms_annot(hmm_file_path, outdir, overwrite=False):
    """
    Split a multi-HMM file into one .hmm file per parsed annotation label.

    The label is the part of the HMM name after its first "-", with trailing
    hyphens stripped and lower-cased. Every HMM is written to
    ``<outdir>/<label>.hmm``; HMMs sharing a label end up in the same file,
    in the order they are read.

    Parameters:
    -----------
    hmm_file_path : str
        Path to the HMM file
    outdir : str
        Directory that receives the per-label .hmm files. It is created if it
        does not exist yet.
    overwrite : bool, optional
        If False (default), HMMs are appended to any file already present in
        `outdir`, so calling the function twice with the same arguments
        duplicates every HMM. If True, each output file is truncated the
        first time it is written during this call.

    Returns:
    --------
    str
        `outdir`, unchanged
    """
    if outdir:
        os.makedirs(outdir, exist_ok=True)

    # Output paths already written during this call (used only for overwrite).
    started = set()

    with pyhmmer.plan7.HMMFile(hmm_file_path) as hmm_file:
        for hmm in hmm_file:

            # full ID
            hmm_name = hmm.name.decode()

            # parse name
            annot_name = re.sub("(.*?)-", "", hmm_name, count=1).rstrip("-").lower()

            # A path separator inside the label would point into a
            # subdirectory of outdir that does not exist; flatten it instead.
            for sep in filter(None, (os.sep, os.altsep)):
                annot_name = annot_name.replace(sep, "_")

            outf = os.path.join(outdir, f"{annot_name}.hmm")

            # "ab" creates the file when it is missing, so a separate "wb"
            # branch is only needed to truncate on the first write.
            mode = "wb" if overwrite and outf not in started else "ab"
            started.add(outf)

            with open(outf, mode) as output_file:
                hmm.write(output_file)

    return outdir

In [58]:
# Write one .hmm file per annotation label for the DNA replication HMMs.
dnarep_src = "/Users/michaeltisza/mike_tisza/github_repos/Cenote-Taker3/hmmscan_DBs/v3.1.1/DNA_rep_HMMs.h3m"
dnarep_dest = "/Users/michaeltisza/mike_tisza/sandbox/split_ct3_hmms/dnarep"
dnarep_out = split_hmms_annot(dnarep_src, dnarep_dest)

In [4]:
# Write one .hmm file per annotation label for the virion HMMs.
virion_src = "/Users/michaeltisza/mike_tisza/github_repos/Cenote-Taker3/hmmscan_DBs/v3.1.1/Virion_HMMs.h3m"
virion_dest = "/Users/michaeltisza/mike_tisza/sandbox/split_ct3_hmms/virion"
virion_out = split_hmms_annot(virion_src, virion_dest)