### to run this code, you need to have standalone EMBOSS (getorf), BLAST, and HMMER installed

In [1]:
import subprocess
import os
from glob import glob
from Bio import SearchIO
from Bio import SeqIO

## load and adjust all paths

In [2]:
outdir = "/home/majnusova/all/projects/plv/data/eustig_orfs/" #adjust
genomes = glob("/home/majnusova/all/projects/plv/data/genomes_test/*.fasta")
hmm_model = "/home/majnusova/all/projects/bilabrum/data/hmm/Packiging_ATPase.hmm" #adjust
output_directory = "/home/majnusova/all/projects/plv/data/eustigs_hmmout/" #adjust
orf_files = glob("/home/majnusova/all/projects/plv/data/eustig_orfs/*_orfs.fa") #adjust: list of orf databases
blastables_outdir = "/home/majnusova/all/projects/plv/data/blastables/"
scaffolds_dir = "/home/majnusova/all/projects/plv/data/eve_outscaffolds/"
blastables_outdir = "/home/majnusova/all/projects/plv/data/blastables/"

### Extract ORFs from genomes

In [69]:
def run_getorf(genome, orf_file):
    command = ["getorf", "-sequence", genome, "-outseq", orf_file, "-table", "0", "-minsize", "360", "-find", "1"] #adjust (120 aa, START to STOP, standard gen. code)
    subprocess.run(command, check=True)

outdir = "/home/majnusova/all/projects/plv/data/eustig_orfs/" #adjust
genomes = glob("/home/majnusova/all/projects/plv/data/genomes_test/*.fasta") #adjust - this is just a test dataset

for genome in genomes:
    orf_file = f"{outdir}{genome.split('/')[-1].replace('.fasta', '_orfs.fa')}"
    run_getorf(genome, orf_file)

Find and extract open reading frames (ORFs)
Find and extract open reading frames (ORFs)


KeyboardInterrupt: 

### hmmsearch with HMM against those extracted ORFs

In [None]:
# running hhmmsearch against multiple databases at once and saving the resulting files using the .hmmout suffix
def run_hmmsearch(orf_file, hmm_model, output_file):
    command = ["hmmsearch", "-o", output_file, hmm_model, orf_file]
    subprocess.run(command, check=True)

hmm_model = "/home/majnusova/all/projects/bilabrum/data/hmm/Packiging_ATPase.hmm" #adjust
output_directory = "/home/majnusova/all/projects/plv/data/eustigs_hmmout/" #adjust
orf_files = glob("/home/majnusova/all/projects/plv/data/eustig_orfs/*_orfs.fa") #adjust: list of orf databases


# iterate over the list of database files and run hmmsearch 
for orf_file in orf_files:
    output_file = f"{output_directory}{orf_file.split('/')[-1].replace('_orfs.fa', '_atp.hmmout')}" 
    run_hmmsearch(orf_file, hmm_model, output_file) #needs to be nested inside the loop!


### saving IDs of homologs above inclusion threshold found by HMMER (in all the genomes)

In [3]:
# saving IDs of sequences above the inclustion treshold into ids_list
ids_list_orfs = []
for file in glob("/home/majnusova/all/projects/plv/data/eustigs_hmmout/*_atp.hmmout"):
    hmmer_file = SearchIO.read(file, "hmmer3-text")
    for record in hmmer_file:
        if record.is_included:
            ids_list_orfs.append(record.id)
len(ids_list_orfs) 


280

In [4]:
print(ids_list_orfs)


['NODE_5080_length_5696_cov_26.6876_1', 'NODE_381_length_27838_cov_14.2215_25', 'NODE_7293_length_4272_cov_22.9291_3', 'NODE_5397_length_5445_cov_37.4879_2', 'NODE_6066_length_4954_cov_14.3046_4', 'NODE_847_length_17049_cov_12.9829_3', 'NODE_7343_length_4249_cov_22.2454_2', 'NODE_19673_length_1550_cov_34.6107_1', 'NODE_1409_length_12728_cov_15.5927_2', 'NODE_1574_length_11923_cov_14.5027_1', 'NODE_711_length_18631_cov_14.7806_4', 'NODE_15445_length_2031_cov_23.9084_2', 'NODE_15112_length_2083_cov_16.6144_1', 'NODE_12470_length_2567_cov_17.4088_2', 'NODE_34064_length_837_cov_18.9655_1', 'NODE_4057_length_6629_cov_27.157_1', 'NODE_865_length_16875_cov_12.6059_4', 'NODE_869_length_16855_cov_12.4798_4', 'NODE_6477_length_4696_cov_21.0899_1', 'NODE_17368_length_1781_cov_15.3621_1', 'NODE_18418_length_1671_cov_21.013_1', 'NODE_352_length_29307_cov_12.7034_4', 'NODE_31105_length_924_cov_16.8895_1', 'NODE_4114_length_6570_cov_18.8181_3', 'NODE_64132_length_412_cov_37.521_1', 'NODE_21363_length

### Extracting scaffolds with the viral homologs found by HMMER (ids_list), scaffolds shorter than 18000 nt are discarded

In [24]:
# ids_list contains IDs of ORFs, not scaffolds! this list needs to be modified accordingly
ids_list_scaffolds = []
for id in ids_list_orfs:
    id_modified = id.rsplit('_', 1)[0] # removing the number of orf -> id of the scaffold
    ids_list_scaffolds.append(id_modified)
print(ids_list_scaffolds)

['NODE_5080_length_5696_cov_26.6876', 'NODE_381_length_27838_cov_14.2215', 'NODE_7293_length_4272_cov_22.9291', 'NODE_5397_length_5445_cov_37.4879', 'NODE_6066_length_4954_cov_14.3046', 'NODE_847_length_17049_cov_12.9829', 'NODE_7343_length_4249_cov_22.2454', 'NODE_19673_length_1550_cov_34.6107', 'NODE_1409_length_12728_cov_15.5927', 'NODE_1574_length_11923_cov_14.5027', 'NODE_711_length_18631_cov_14.7806', 'NODE_15445_length_2031_cov_23.9084', 'NODE_15112_length_2083_cov_16.6144', 'NODE_12470_length_2567_cov_17.4088', 'NODE_34064_length_837_cov_18.9655', 'NODE_4057_length_6629_cov_27.157', 'NODE_865_length_16875_cov_12.6059', 'NODE_869_length_16855_cov_12.4798', 'NODE_6477_length_4696_cov_21.0899', 'NODE_17368_length_1781_cov_15.3621', 'NODE_18418_length_1671_cov_21.013', 'NODE_352_length_29307_cov_12.7034', 'NODE_31105_length_924_cov_16.8895', 'NODE_4114_length_6570_cov_18.8181', 'NODE_64132_length_412_cov_37.521', 'NODE_21363_length_1410_cov_17.5911', 'NODE_729_length_18418_cov_17.5

#### ids_list_scaffolds = all viral scaffolds above inclusion threshold (may be shorter than 20000)

In [25]:
print(len(ids_list_scaffolds))

280


#### to be able to extract the scaffolds, blastable databases need to be created first (for genome and orfs)

In [26]:
def makeblastdb(file_path, db_type, blastables_outdir):
    db_name = f"{blastables_outdir}/{file_path.split('/')[-1].split('.')[0]}"
    command = ["makeblastdb", "-in", file_path, "-dbtype", db_type, "-out", db_name, "-parse_seqids"]
    subprocess.run(command, check=True)

blastables_outdir = "/home/majnusova/all/projects/plv/data/blastables/"


for genome_path in genomes:
    makeblastdb(genome_path, "nucl", blastables_outdir) # misto file_path volame s genome_path

for orf_file_path in orf_files:
    makeblastdb(orf_file_path, "prot", blastables_outdir)




Building a new DB, current time: 01/11/2024 13:40:36
New DB name:   /home/majnusova/all/projects/plv/data/blastables/Chic10_scaffolds
New DB title:  /home/majnusova/all/projects/plv/data/genomes_test/Chic10_scaffolds.fasta
Sequence type: Nucleotide
Deleted existing Nucleotide BLAST database named /home/majnusova/all/projects/plv/data/blastables/Chic10_scaffolds
Keep MBits: T
Maximum file size: 3000000000B
Adding sequences from FASTA; added 758531 sequences in 9.85493 seconds.


KeyboardInterrupt: 

### extracting all viral scaffolds present in ids_list_scaffolds and deleting those shorter than 20000

In [46]:
# this code needs to be rewritten, it works but its repetitive 
scaffolds_of_interest = []

def run_blastdbcmd(id, genome_db, outscaffold):
    if os.path.exists(outscaffold) and os.path.getsize(outscaffold) > 0:
        record = SeqIO.read(outscaffold, "fasta")
        if len(record.seq) < 20000:
            print(f"Notice: Scaffold {id} is shorter than 20000 nucleotides, removing file.")
            os.remove(outscaffold)
            return False
        else:
            scaffolds_of_interest.append(id)  
            return True

    command = ["blastdbcmd", "-entry", str(id), "-db", genome_db, "-out", outscaffold]
    result = subprocess.run(command, text=True, capture_output=True)

    if result.returncode == 0:
        if os.path.getsize(outscaffold) > 0:
            record = SeqIO.read(outscaffold, "fasta")
            if len(record.seq) < 20000:
                print(f"Notice: Scaffold {id} is shorter than 20000 nucleotides, removing file.")
                os.remove(outscaffold)
                return False
            else:
                scaffolds_of_interest.append(id) 
                return True
        else:
            print(f"Notice: The file created for ID {id} is empty, removing.")
            os.remove(outscaffold)
            return False
    else:
        print(f"Warning: ID {id} not found in database {genome_db}.")
        if os.path.exists(outscaffold) and os.path.getsize(outscaffold) == 0:
            os.remove(outscaffold)
        return False

scaffolds_dir = "/home/majnusova/all/projects/plv/data/eve_outscaffolds/"
blastables_outdir = "/home/majnusova/all/projects/plv/data/blastables/"
prot_database = "_orfs."

for id in ids_list_scaffolds:
    for filename in os.listdir(blastables_outdir):
        if prot_database not in filename:
            base_name = filename.split('.')[0]
            genome_db = os.path.join(blastables_outdir, base_name)
            outscaffold = f"{scaffolds_dir}/{id}_EVE.fasta"
            if run_blastdbcmd(id, genome_db, outscaffold):
                break  # Exit the loop if successful

print('Scaffolds of interest:', scaffolds_of_interest)


Notice: Scaffold NODE_8926_length_10478_cov_112.808095 is shorter than 20000 nucleotides, removing file.
Notice: Scaffold NODE_8926_length_10478_cov_112.808095 is shorter than 20000 nucleotides, removing file.
Notice: Scaffold NODE_8926_length_10478_cov_112.808095 is shorter than 20000 nucleotides, removing file.
Notice: Scaffold NODE_8926_length_10478_cov_112.808095 is shorter than 20000 nucleotides, removing file.
Notice: Scaffold NODE_8926_length_10478_cov_112.808095 is shorter than 20000 nucleotides, removing file.
Notice: Scaffold NODE_8926_length_10478_cov_112.808095 is shorter than 20000 nucleotides, removing file.
Notice: Scaffold NODE_8926_length_10478_cov_112.808095 is shorter than 20000 nucleotides, removing file.
Notice: Scaffold NODE_8926_length_10478_cov_112.808095 is shorter than 20000 nucleotides, removing file.
Notice: Scaffold NODE_8926_length_10478_cov_112.808095 is shorter than 20000 nucleotides, removing file.
Notice: Scaffold NODE_8926_length_10478_cov_112.808095 

In [48]:
len(scaffolds_of_interest)
scaffolds_of_interest

['Vischeria_C74_contig_15',
 'Vischeria_C74_contig_12',
 'Vischeria_C74_contig_9',
 'Vischeria_C74_contig_46',
 'Vischeria_C74_contig_4',
 'Vischeria_C74_contig_13',
 'Vischeria_C74_contig_27',
 'Vischeria_C74_contig_13',
 'Vischeria_C74_contig_33',
 'Vischeria_C74_contig_10',
 'Vischeria_C74_contig_10',
 'Vischeria_C74_contig_19',
 'Vischeria_C74_contig_2',
 'Vischeria_C74_contig_4',
 'Vischeria_C74_contig_33',
 'Vischeria_C74_contig_16',
 'Vischeria_C74_contig_25',
 'Vischeria_C74_contig_9',
 'Vischeria_C74_contig_13',
 'Vischeria_C74_contig_9',
 'Vischeria_C74_contig_8',
 'Vischeria_C74_contig_3',
 'Vischeria_C74_contig_18',
 'Vischeria_C74_contig_9',
 'Vischeria_C74_contig_16',
 'Vischeria_C74_contig_12',
 'Vischeria_C74_contig_16',
 'NODE_541_length_101851_cov_46.471309',
 'NODE_55_length_463973_cov_37.984469',
 'NODE_1103_length_61346_cov_37.814180',
 'NODE_320_length_153881_cov_39.008816',
 'NODE_3853_length_25035_cov_37.882483',
 'NODE_4047_length_24047_cov_37.496871',
 'NODE_1

In [49]:
# melo by sedet - jen ids skafoldu nad inclusion treshold a 20000+
set_scaffolds_of_interest = set(scaffolds_of_interest)
len(set_scaffolds_of_interest)

86

### saving items present in set_scaffolds_of_interest into individual files which are gonna serve as inputs for -seqidilist (blasting only against the selected scaffolds, not whole genomes)

In [44]:
seqid_dir = "/home/majnusova/all/projects/plv/data/seqidlists/" # na konci skriptu muze byt cela slozka smazana
for seqid in set_scaffolds_of_interest:
    with open(f"{seqid_dir}/{seqid}.txt", "w") as f:
        f.write(f"{seqid}")

## muze byt nize ve skriptu. 
## Turning the extracted scaffolds into single line files to make it possible to search for the repeats flanking the EVEs

## prepare blastn to find DRs (shortest one has 48 nt, longest one 101 nt) - je treba vybrat jen cast skafoldu, nektere jsou velke?

In [11]:
ids_dict_orfs = {} #dictionary of lists - potrebujeme vedet, proti jakemu genomu scaffoldy blastovat, vracime se zpet k hemmru (IDs casto neobsahuji nazev genomu, ze ktereho pochazi)

for file in glob("/home/majnusova/all/projects/plv/data/eustigs_hmmout/*_atp.hmmout"):
    hmm_records = SearchIO.read(file, "hmmer3-text")
    ids_dict_orfs[file] = [] # create empty list in the dictionary for the current file
    for record in hmm_records:
        if record.is_included:
            ids_dict_orfs[file].append(record.id) # append the ID to the list associated with the current file


# count the total number of IDs across all files:
total_ids = sum(len(id_list) for id_list in ids_dict_orfs.values())
print(f"Total number of IDs across all files: {total_ids}")

# To see the count of IDs per file:
for file, id_list in ids_dict_orfs.items():
    print(f"{file}: {len(id_list)} IDs")

Total number of IDs across all files: 280
/home/majnusova/all/projects/plv/data/eustigs_hmmout/Chic10_scaffolds_atp.hmmout: 82 IDs
/home/majnusova/all/projects/plv/data/eustigs_hmmout/Vischeria_C74_genome_v1_atp.hmmout: 27 IDs
/home/majnusova/all/projects/plv/data/eustigs_hmmout/Characiopsis_New_scaffolds_atp.hmmout: 171 IDs


In [18]:
# ok, vse sedi
for file, id_list in ids_dict_orfs.items():
    print(f"{file}: {id_list}'\n'", sep="\n")

/home/majnusova/all/projects/plv/data/eustigs_hmmout/Chic10_scaffolds_atp.hmmout: ['NODE_5080_length_5696_cov_26.6876_1', 'NODE_381_length_27838_cov_14.2215_25', 'NODE_7293_length_4272_cov_22.9291_3', 'NODE_5397_length_5445_cov_37.4879_2', 'NODE_6066_length_4954_cov_14.3046_4', 'NODE_847_length_17049_cov_12.9829_3', 'NODE_7343_length_4249_cov_22.2454_2', 'NODE_19673_length_1550_cov_34.6107_1', 'NODE_1409_length_12728_cov_15.5927_2', 'NODE_1574_length_11923_cov_14.5027_1', 'NODE_711_length_18631_cov_14.7806_4', 'NODE_15445_length_2031_cov_23.9084_2', 'NODE_15112_length_2083_cov_16.6144_1', 'NODE_12470_length_2567_cov_17.4088_2', 'NODE_34064_length_837_cov_18.9655_1', 'NODE_4057_length_6629_cov_27.157_1', 'NODE_865_length_16875_cov_12.6059_4', 'NODE_869_length_16855_cov_12.4798_4', 'NODE_6477_length_4696_cov_21.0899_1', 'NODE_17368_length_1781_cov_15.3621_1', 'NODE_18418_length_1671_cov_21.013_1', 'NODE_352_length_29307_cov_12.7034_4', 'NODE_31105_length_924_cov_16.8895_1', 'NODE_4114_le

### blasting

In [5]:
def blastn_repeats(query, output, database):
    command = ["blastn", "-query", query, "-out", output, "-db", database] 
    subprocess.run(command, check=True)

output_dir = "/home/majnusova/all/projects/plv/data/blastn_repeats/"  
blastables_outdir = "/home/majnusova/all/projects/plv/data/blastables/"  # the assemblies = databases
path_to_input = "/home/majnusova/all/projects/plv/data/eve_outscaffolds/"

# ensure the output directory exists
os.makedirs(output_dir, exist_ok=True)

for assembly, ids in ids_dict_orfs.items(): # keys = names of the assemblies
    database_name = assembly.split("/")[-1].replace("_atp.hmmout", "") # getting the real names of the assemblies (key: /home/majnusova/all/projects/plv/data/eustigs_hmmout/Chic10_scaffolds_atp.hmmout)
    database_path = f"{blastables_outdir}/{database_name}" # navigating the folder with blastable assemblies

    for id in ids:
        edit_id = "_".join(id.split("_")[:-1]) # removing again the number of orf to get the real name of each scaffold
        input_path = f"{path_to_input}/{edit_id}_EVE.fasta" # path to the extracted nt scaffolds found by HMMER
        output_path = f"{output_dir}/{edit_id}_blastn_results.txt"

        # check if the input file exists - scaffolds shorter than 20000 were already discarded so many inputs do not exist anymore
        if os.path.exists(input_path):
            blastn_repeats(input_path, output_path, database_path)
        else:
            # If the file does not exist skip to the next ID - maybe the only way how to overcome the problems with new BLAST?
            print(f"Input file does not exist, skipping ID: {edit_id}")


Input file does not exist, skipping ID: NODE_5080_length_5696_cov_26.6876
Input file does not exist, skipping ID: NODE_7293_length_4272_cov_22.9291
Input file does not exist, skipping ID: NODE_5397_length_5445_cov_37.4879
Input file does not exist, skipping ID: NODE_6066_length_4954_cov_14.3046
Input file does not exist, skipping ID: NODE_847_length_17049_cov_12.9829
Input file does not exist, skipping ID: NODE_7343_length_4249_cov_22.2454
Input file does not exist, skipping ID: NODE_19673_length_1550_cov_34.6107
Input file does not exist, skipping ID: NODE_1409_length_12728_cov_15.5927
Input file does not exist, skipping ID: NODE_1574_length_11923_cov_14.5027
Input file does not exist, skipping ID: NODE_711_length_18631_cov_14.7806
Input file does not exist, skipping ID: NODE_15445_length_2031_cov_23.9084
Input file does not exist, skipping ID: NODE_15112_length_2083_cov_16.6144
Input file does not exist, skipping ID: NODE_12470_length_2567_cov_17.4088
Input file does not exist, skipp

In [None]:
more efficient blast: dict? for each blastn, a text file containing the name of the scaffold should be created (-seqidlist option to blast only against a selected scaffold)

## TSD - nejspis nelze zautomatizovat? nebo zkusit vyextrahovat malou cast scaffoldu, treba plus 10 nt down i upstream od oubou casti repetice a projet to blastn-short?

## extract orfs between tirs - pujde v pohode i bez vymezeneho TSD. kontrola spravnosti elementu - v orfech musi byt protein identifikovany hmmerem hned na zacatku... 