### to run this code, you need to have standalone EMBOSS (getorf), BLAST, and HMMER installed

In [1]:
import subprocess
import os
from glob import glob
from Bio import SearchIO
from Bio import SeqIO 
from Bio.Blast import NCBIXML 

## load and adjust all paths

In [2]:
outdir = "/home/majnusova/all/projects/plv/data/eustig_orfs/" #adjust
genomes = glob("/home/majnusova/all/projects/plv/data/genomes_test/*.fasta")
hmm_model = "/home/majnusova/all/projects/bilabrum/data/hmm/Packiging_ATPase.hmm" #adjust
output_directory = "/home/majnusova/all/projects/plv/data/eustigs_hmmout/" #adjust
orf_files = glob("/home/majnusova/all/projects/plv/data/eustig_orfs/*_orfs.fa") #adjust: list of orf databases
blastables_outdir = "/home/majnusova/all/projects/plv/data/blastables/"
scaffolds_dir = "/home/majnusova/all/projects/plv/data/eve_outscaffolds/"
blastables_outdir = "/home/majnusova/all/projects/plv/data/blastables/"

### Extract ORFs from genomes

In [4]:
def run_getorf(genome, orf_file):
    command = ["getorf", "-sequence", genome, "-outseq", orf_file, "-table", "0", "-minsize", "360", "-find", "1"] #adjust (120 aa, START to STOP, standard gen. code)
    subprocess.run(command, check=True)

outdir = "/home/majnusova/all/projects/plv/data/eustig_orfs/" #adjust
genomes = glob("/home/majnusova/all/projects/plv/data/genomes_test/*.fasta") #adjust - this is just a test dataset

for genome in genomes:
    orf_file = f"{outdir}{genome.split('/')[-1].replace('.fasta', '_orfs.fa')}"
    run_getorf(genome, orf_file)

Find and extract open reading frames (ORFs)
Find and extract open reading frames (ORFs)
Find and extract open reading frames (ORFs)


### hmmsearch with HMM against those extracted ORFs -> hmmout

In [None]:
# running hhmmsearch against multiple databases at once and saving the resulting files using the .hmmout suffix
def run_hmmsearch(orf_file, hmm_model, output_file):
    command = ["hmmsearch", "-o", output_file, hmm_model, orf_file]
    subprocess.run(command, check=True)

hmm_model = "/home/majnusova/all/projects/bilabrum/data/hmm/Packiging_ATPase.hmm" #adjust
output_directory = "/home/majnusova/all/projects/plv/data/eustigs_hmmout/" #adjust
orf_files = glob("/home/majnusova/all/projects/plv/data/eustig_orfs/*_orfs.fa") #adjust: list of orf databases


# iterate over the list of database files and run hmmsearch 
for orf_file in orf_files:
    output_file = f"{output_directory}{orf_file.split('/')[-1].replace('_orfs.fa', '_atp.hmmout')}" 
    run_hmmsearch(orf_file, hmm_model, output_file) #needs to be nested inside the loop!


### saving IDs of ALL homologs above inclusion threshold found by HMMER and name of the assembly they were found in
### genome_scaffold_ids - genome: ids

In [3]:
# names of viral orfs and the genome they originate from
genome_scaffold_orfs = {}

for file in glob("/home/majnusova/all/projects/plv/data/eustigs_hmmout/*_atp.hmmout"):
    genome_name = os.path.basename(file).split('_atp.hmmout')[0] # getting the real names of the genomes; os.path.basename(file) extracts the base name from the file's full path.
    if genome_name not in genome_scaffold_orfs:
        genome_scaffold_orfs[genome_name] = []

    hmmer_file = SearchIO.read(file, "hmmer3-text")
    
    for record in hmmer_file:
        if record.is_included:
            scaffold_id = record.id  # scaffold ID - removing _orf
            genome_scaffold_orfs[genome_name].append(scaffold_id)
            
total_viral_orfs = sum(len(i)for i in genome_scaffold_orfs.values())
total_viral_orfs

280

In [4]:
genome_scaffold_orfs

{'Chic10_scaffolds': ['NODE_5080_length_5696_cov_26.6876_1',
  'NODE_381_length_27838_cov_14.2215_25',
  'NODE_7293_length_4272_cov_22.9291_3',
  'NODE_5397_length_5445_cov_37.4879_2',
  'NODE_6066_length_4954_cov_14.3046_4',
  'NODE_847_length_17049_cov_12.9829_3',
  'NODE_7343_length_4249_cov_22.2454_2',
  'NODE_19673_length_1550_cov_34.6107_1',
  'NODE_1409_length_12728_cov_15.5927_2',
  'NODE_1574_length_11923_cov_14.5027_1',
  'NODE_711_length_18631_cov_14.7806_4',
  'NODE_15445_length_2031_cov_23.9084_2',
  'NODE_15112_length_2083_cov_16.6144_1',
  'NODE_12470_length_2567_cov_17.4088_2',
  'NODE_34064_length_837_cov_18.9655_1',
  'NODE_4057_length_6629_cov_27.157_1',
  'NODE_865_length_16875_cov_12.6059_4',
  'NODE_869_length_16855_cov_12.4798_4',
  'NODE_6477_length_4696_cov_21.0899_1',
  'NODE_17368_length_1781_cov_15.3621_1',
  'NODE_18418_length_1671_cov_21.013_1',
  'NODE_352_length_29307_cov_12.7034_4',
  'NODE_31105_length_924_cov_16.8895_1',
  'NODE_4114_length_6570_cov_1

In [5]:
# creating a dict of lists (genomes: ids)
genome_scaffold_ids = {}

for file in glob("/home/majnusova/all/projects/plv/data/eustigs_hmmout/*_atp.hmmout"):
    genome_name = os.path.basename(file).split('_atp.hmmout')[0] # getting the real names of the genomes; os.path.basename(file) extracts the base name from the file's full path.
    if genome_name not in genome_scaffold_ids:
        genome_scaffold_ids[genome_name] = []

    hmmer_file = SearchIO.read(file, "hmmer3-text")
    
    for record in hmmer_file:
        if record.is_included:
            scaffold_id = record.id.rsplit("_", 1)[0] # scaffold ID - removing _orf
            genome_scaffold_ids[genome_name].append(scaffold_id)

In [6]:
genome_scaffold_ids

{'Chic10_scaffolds': ['NODE_5080_length_5696_cov_26.6876',
  'NODE_381_length_27838_cov_14.2215',
  'NODE_7293_length_4272_cov_22.9291',
  'NODE_5397_length_5445_cov_37.4879',
  'NODE_6066_length_4954_cov_14.3046',
  'NODE_847_length_17049_cov_12.9829',
  'NODE_7343_length_4249_cov_22.2454',
  'NODE_19673_length_1550_cov_34.6107',
  'NODE_1409_length_12728_cov_15.5927',
  'NODE_1574_length_11923_cov_14.5027',
  'NODE_711_length_18631_cov_14.7806',
  'NODE_15445_length_2031_cov_23.9084',
  'NODE_15112_length_2083_cov_16.6144',
  'NODE_12470_length_2567_cov_17.4088',
  'NODE_34064_length_837_cov_18.9655',
  'NODE_4057_length_6629_cov_27.157',
  'NODE_865_length_16875_cov_12.6059',
  'NODE_869_length_16855_cov_12.4798',
  'NODE_6477_length_4696_cov_21.0899',
  'NODE_17368_length_1781_cov_15.3621',
  'NODE_18418_length_1671_cov_21.013',
  'NODE_352_length_29307_cov_12.7034',
  'NODE_31105_length_924_cov_16.8895',
  'NODE_4114_length_6570_cov_18.8181',
  'NODE_64132_length_412_cov_37.521',


In [7]:
total_ids = sum(len(id) for id in genome_scaffold_ids.values())
total_ids

280

### Extracting scaffolds with the viral homologs found by HMMER (ids_list), scaffolds shorter than 18000 nt are discarded

#### ids_list_scaffolds = all viral scaffolds above inclusion threshold (may be shorter than 20000)

## to be able to extract the scaffolds, blastable databases need to be created first (for genome and orfs)

In [9]:
def makeblastdb(file_path, db_type, blastables_outdir):
    db_name = f"{blastables_outdir}/{file_path.split('/')[-1].split('.')[0]}"
    command = ["makeblastdb", "-in", file_path, "-dbtype", db_type, "-out", db_name, "-parse_seqids"]
    subprocess.run(command, check=True)

blastables_outdir = "/home/majnusova/all/projects/plv/data/blastables/"


for genome_path in genomes:
    makeblastdb(genome_path, "nucl", blastables_outdir) # misto file_path volame s genome_path

for orf_file_path in orf_files:
    makeblastdb(orf_file_path, "prot", blastables_outdir)




Building a new DB, current time: 02/22/2024 17:09:02
New DB name:   /home/majnusova/all/projects/plv/data/blastables/Chic10_scaffolds
New DB title:  /home/majnusova/all/projects/plv/data/genomes_test/Chic10_scaffolds.fasta
Sequence type: Nucleotide
Deleted existing Nucleotide BLAST database named /home/majnusova/all/projects/plv/data/blastables/Chic10_scaffolds
Keep MBits: T
Maximum file size: 3000000000B
Adding sequences from FASTA; added 758531 sequences in 9.7346 seconds.




Building a new DB, current time: 02/22/2024 17:09:14
New DB name:   /home/majnusova/all/projects/plv/data/blastables/Vischeria_C74_genome_v1
New DB title:  /home/majnusova/all/projects/plv/data/genomes_test/Vischeria_C74_genome_v1.fasta
Sequence type: Nucleotide
Deleted existing Nucleotide BLAST database named /home/majnusova/all/projects/plv/data/blastables/Vischeria_C74_genome_v1
Keep MBits: T
Maximum file size: 3000000000B
Adding sequences from FASTA; added 55 sequences in 0.471926 seconds.




Building a 

### modifying a dictionary for blastdbcmd
#### genome_scaffold_orfs_descr contains the same IDs as genome_scaffold_ids + description

In [10]:
# creating a dict of lists (genomes: ids)
# this dict contains also the description - it allows the usage of the -range option of blastdbcmd command
genome_scaffold_orfs_descr = {}

for file in glob("/home/majnusova/all/projects/plv/data/eustigs_hmmout/*_atp.hmmout"):
    genome_name = os.path.basename(file).split('_atp.hmmout')[0] # getting the real names of the genomes; os.path.basename(file) extracts the base name from the file's full path.
    if genome_name not in genome_scaffold_orfs_descr:
        genome_scaffold_orfs_descr[genome_name] = []

    hmmer_file = SearchIO.read(file, "hmmer3-text")
    
    for record in hmmer_file:
        if record.is_included:
            genome_scaffold_orfs_descr[genome_name].append(record.id + ";" + record.description)

In [11]:
genome_scaffold_orfs_descr

{'Chic10_scaffolds': ['NODE_5080_length_5696_cov_26.6876_1;[649 - 1389] No definition line found',
  'NODE_381_length_27838_cov_14.2215_25;[7239 - 6505] (REVERSE SENSE) No definition line found',
  'NODE_7293_length_4272_cov_22.9291_3;[1285 - 551] (REVERSE SENSE) No definition line found',
  'NODE_5397_length_5445_cov_37.4879_2;[929 - 1711] No definition line found',
  'NODE_6066_length_4954_cov_14.3046_4;[1701 - 964] (REVERSE SENSE) No definition line found',
  'NODE_847_length_17049_cov_12.9829_3;[16205 - 15435] (REVERSE SENSE) No definition line found',
  'NODE_7343_length_4249_cov_22.2454_2;[3283 - 4017] No definition line found',
  'NODE_19673_length_1550_cov_34.6107_1;[1091 - 348] (REVERSE SENSE) No definition line found',
  'NODE_1409_length_12728_cov_15.5927_2;[12037 - 11291] (REVERSE SENSE) No definition line found',
  'NODE_1574_length_11923_cov_14.5027_1;[11636 - 10887] (REVERSE SENSE) No definition line found',
  'NODE_711_length_18631_cov_14.7806_4;[11717 - 12442] No defin

In [12]:
total_ids = sum(len(id) for id in genome_scaffold_orfs_descr.values())
total_ids

280

### Extracting scaffolds (30000nt downstream and upstream of the viral genes) containing viral genes detected by HMMER. 
### Scaffolds shorter than 20000 nts are discarded.

In [10]:
scaffolds_of_interest = []

def run_blastdbcmd(id, genome_db, outscaffold, range):
    command = ["blastdbcmd", "-entry", str(id), "-db", genome_db, "-out", outscaffold, "-range", range]
    print(f"Running command: {' '.join(command)}")  

    result = subprocess.run(command, text=True, capture_output=True)

    if result.returncode != 0:
        print(f"Warning: blastdbcmd failed for ID {id} in database {genome_db}. Error: {result.stderr}")
        if os.path.exists(outscaffold):
            os.remove(outscaffold)
        return False, None

    if os.path.exists(outscaffold) and os.path.getsize(outscaffold) > 0:
        record = SeqIO.read(outscaffold, "fasta")
        if len(record.seq) < 20000:
            print(f"Notice: Scaffold {id} is shorter than 20000 nucleotides, removing file.")
            os.remove(outscaffold)
            return False, None
        else:
            scaffolds_of_interest.append(id)
            return True, len(record.seq)
    else:
        print(f"Notice: No scaffold file created for ID {id}.")
        return False, None

scaffolds_dir = "/home/majnusova/all/projects/plv/data/eve_outscaffolds/"
blastables_outdir = "/home/majnusova/all/projects/plv/data/blastables/"
prot_database = "_orfs."

for genome, ids in genome_scaffold_orfs_descr.items(): #ids = NODE_15445_length_2031_cov_23.9084_2;[1007 - 1726] No definition line found
    print(f"Processing genome: {genome}")  
    genome_dir = os.path.join(scaffolds_dir, genome)
    os.makedirs(genome_dir, exist_ok=True)
    for i in ids:
        res_scaf_orf = i.split(";")[0]
        id_full, descr = i.split(';')
        id = id_full.rsplit('_', 1)[0]  
        print(f"Processing ID: {id}")  

        if "REVERSE" in descr:
            start_orf_part = descr.split(" - ")[1]
            start_orf = start_orf_part.split("]")[0]
            end_orf_part = descr.split(" - ")[0]
            end_orf = end_orf_part.split("[")[1]
        else:
            start_orf_part = descr.split(" - ")[0]
            start_orf = start_orf_part.split("[")[1]
            end_orf_part = descr.split(" - ")[1]
            end_orf = end_orf_part.split("]")[0]

        end_extended_orf = int(end_orf) + 30000 # when a range passed to blastdbcmd exceeds the length of the scaffold, blastdbcmd will by default extract the whole scaffold! :-)
        if int(start_orf) <= 30000:
            start_extended_orf = 1
        else:
            start_extended_orf = int(start_orf) - 30000

        for filename in os.listdir(blastables_outdir):
            if prot_database not in filename:
                base_name = filename.split('.')[0]
                genome_db = os.path.join(blastables_outdir, base_name)
                outscaffold = f"{genome_dir}/{res_scaf_orf}.fasta"

                success, scaffold_length = run_blastdbcmd(id, genome_db, outscaffold, f"{start_extended_orf}-{end_extended_orf}")
                if success:
                    break

print('Scaffolds of interest:', scaffolds_of_interest)


Processing genome: Chic10_scaffolds
Processing ID: NODE_5080_length_5696_cov_26.6876
Running command: blastdbcmd -entry NODE_5080_length_5696_cov_26.6876 -db /home/majnusova/all/projects/plv/data/blastables/Chic10_scaffolds -out /home/majnusova/all/projects/plv/data/eve_outscaffolds/Chic10_scaffolds/NODE_5080_length_5696_cov_26.6876_1.fasta -range 1-31389
Notice: Scaffold NODE_5080_length_5696_cov_26.6876 is shorter than 20000 nucleotides, removing file.
Running command: blastdbcmd -entry NODE_5080_length_5696_cov_26.6876 -db /home/majnusova/all/projects/plv/data/blastables/Characiopsis_New_scaffolds -out /home/majnusova/all/projects/plv/data/eve_outscaffolds/Chic10_scaffolds/NODE_5080_length_5696_cov_26.6876_1.fasta -range 1-31389
Error: [blastdbcmd] Entry or entries not found in BLAST database

Running command: blastdbcmd -entry NODE_5080_length_5696_cov_26.6876 -db /home/majnusova/all/projects/plv/data/blastables/Vischeria_C74_genome_v1 -out /home/majnusova/all/projects/plv/data/eve

KeyboardInterrupt: 

### list containing IDs of scaffolds (not orfs) - input for blastn

In [15]:
#sedi, v poradku; jen ids skafoldu nad inclusion treshold a 20000+
len(scaffolds_of_interest)

137

In [16]:
#len(scaffolds_of_interest)
scaffolds_of_interest

['NODE_381_length_27838_cov_14.2215',
 'NODE_352_length_29307_cov_12.7034',
 'NODE_45_length_194274_cov_4.45832',
 'NODE_16_length_371769_cov_13.539',
 'NODE_1_length_1589435_cov_13.0886',
 'NODE_10_length_626405_cov_4.34645',
 'NODE_70_length_144148_cov_190.155',
 'NODE_498_length_23434_cov_17.1238',
 'NODE_33_length_241831_cov_197.129',
 'NODE_89_length_113149_cov_3.22244',
 'NODE_33_length_241831_cov_197.129',
 'NODE_146_length_60639_cov_3.37158',
 'NODE_95_length_100189_cov_309.803',
 'NODE_122_length_74444_cov_387.187',
 'NODE_381_length_27838_cov_14.2215',
 'NODE_55_length_165510_cov_190.906',
 'NODE_58_length_162532_cov_4.15186',
 'NODE_98_length_98422_cov_62.5464',
 'NODE_43_length_199121_cov_7.50572',
 'NODE_2_length_969053_cov_7.62238',
 'NODE_37_length_230641_cov_4.40646',
 'NODE_11_length_515168_cov_3.2278',
 'NODE_63_length_153037_cov_4.46219',
 'NODE_244_length_37664_cov_11.8205',
 'NODE_104_length_85964_cov_12.9853',
 'NODE_66_length_150458_cov_3.12336',
 'NODE_3_length_

In [14]:
scaffolds_of_interest= ['NODE_381_length_27838_cov_14.2215',
 'NODE_352_length_29307_cov_12.7034',
 'NODE_45_length_194274_cov_4.45832',
 'NODE_16_length_371769_cov_13.539',
 'NODE_1_length_1589435_cov_13.0886',
 'NODE_10_length_626405_cov_4.34645',
 'NODE_70_length_144148_cov_190.155',
 'NODE_498_length_23434_cov_17.1238',
 'NODE_33_length_241831_cov_197.129',
 'NODE_89_length_113149_cov_3.22244',
 'NODE_33_length_241831_cov_197.129',
 'NODE_146_length_60639_cov_3.37158',
 'NODE_95_length_100189_cov_309.803',
 'NODE_122_length_74444_cov_387.187',
 'NODE_381_length_27838_cov_14.2215',
 'NODE_55_length_165510_cov_190.906',
 'NODE_58_length_162532_cov_4.15186',
 'NODE_98_length_98422_cov_62.5464',
 'NODE_43_length_199121_cov_7.50572',
 'NODE_2_length_969053_cov_7.62238',
 'NODE_37_length_230641_cov_4.40646',
 'NODE_11_length_515168_cov_3.2278',
 'NODE_63_length_153037_cov_4.46219',
 'NODE_244_length_37664_cov_11.8205',
 'NODE_104_length_85964_cov_12.9853',
 'NODE_66_length_150458_cov_3.12336',
 'NODE_3_length_955702_cov_6.81221',
 'NODE_28_length_282822_cov_9.9159',
 'NODE_13_length_427009_cov_190.15',
 'NODE_94_length_101213_cov_13.992',
 'NODE_2_length_969053_cov_7.62238',
 'NODE_95_length_100189_cov_309.803',
 'NODE_3_length_955702_cov_6.81221',
 'NODE_312_length_31651_cov_4.15834',
 'NODE_2_length_969053_cov_7.62238',
 'NODE_34_length_241557_cov_7.5501',
 'NODE_9_length_666414_cov_4.15495',
 'NODE_26_length_312622_cov_7.43917',
 'NODE_1_length_1589435_cov_13.0886',
 'Vischeria_C74_contig_15',
 'Vischeria_C74_contig_12',
 'Vischeria_C74_contig_9',
 'Vischeria_C74_contig_46',
 'Vischeria_C74_contig_4',
 'Vischeria_C74_contig_13',
 'Vischeria_C74_contig_27',
 'Vischeria_C74_contig_13',
 'Vischeria_C74_contig_33',
 'Vischeria_C74_contig_10',
 'Vischeria_C74_contig_10',
 'Vischeria_C74_contig_19',
 'Vischeria_C74_contig_2',
 'Vischeria_C74_contig_4',
 'Vischeria_C74_contig_33',
 'Vischeria_C74_contig_16',
 'Vischeria_C74_contig_25',
 'Vischeria_C74_contig_9',
 'Vischeria_C74_contig_13',
 'Vischeria_C74_contig_9',
 'Vischeria_C74_contig_8',
 'Vischeria_C74_contig_3',
 'Vischeria_C74_contig_18',
 'Vischeria_C74_contig_9',
 'Vischeria_C74_contig_16',
 'Vischeria_C74_contig_12',
 'Vischeria_C74_contig_16',
 'NODE_541_length_101851_cov_46.471309',
 'NODE_55_length_463973_cov_37.984469',
 'NODE_1103_length_61346_cov_37.814180',
 'NODE_320_length_153881_cov_39.008816',
 'NODE_3853_length_25035_cov_37.882483',
 'NODE_4047_length_24047_cov_37.496871',
 'NODE_1923_length_42336_cov_37.970586',
 'NODE_520_length_104572_cov_40.248251',
 'NODE_867_length_72283_cov_40.422873',
 'NODE_4244_length_22945_cov_43.032228',
 'NODE_4874_length_20116_cov_37.345077',
 'NODE_829_length_74096_cov_65.354085',
 'NODE_866_length_72325_cov_37.472691',
 'NODE_1465_length_50215_cov_45.222007',
 'NODE_3667_length_26145_cov_48.502378',
 'NODE_2774_length_32621_cov_36.854996',
 'NODE_4742_length_20636_cov_41.178511',
 'NODE_3601_length_26483_cov_35.616072',
 'NODE_8_length_920040_cov_17.887887',
 'NODE_171_length_238445_cov_572.810663',
 'NODE_311_length_156469_cov_32.611502',
 'NODE_15_length_759953_cov_292.559295',
 'NODE_197_length_219627_cov_52.866117',
 'NODE_152_length_259012_cov_32.238044',
 'NODE_38_length_520036_cov_47.857456',
 'NODE_27_length_600914_cov_14.349319',
 'NODE_179_length_230196_cov_11.871036',
 'NODE_132_length_285120_cov_137.485885',
 'NODE_2364_length_36824_cov_1984.494571',
 'NODE_254_length_184797_cov_7.251294',
 'NODE_377_length_136897_cov_6.550351',
 'NODE_1634_length_47073_cov_31.715572',
 'NODE_3763_length_25504_cov_3.576867',
 'NODE_113_length_322384_cov_46.569383',
 'NODE_4384_length_22242_cov_940.677600',
 'NODE_398_length_128959_cov_17.006463',
 'NODE_619_length_90376_cov_5.509109',
 'NODE_330_length_150702_cov_302.813457',
 'NODE_155_length_252422_cov_128.823373',
 'NODE_395_length_129935_cov_617.085062',
 'NODE_153_length_255781_cov_13.970744',
 'NODE_182_length_228583_cov_79.382664',
 'NODE_48_length_497299_cov_6.449061',
 'NODE_117_length_315731_cov_6.054322',
 'NODE_4297_length_22640_cov_6.481496',
 'NODE_4572_length_21411_cov_2.919518',
 'NODE_965_length_66675_cov_6.078110',
 'NODE_115_length_316723_cov_17.494170',
 'NODE_1_length_1214521_cov_6.930895',
 'NODE_65_length_422212_cov_6.412126',
 'NODE_531_length_103019_cov_5.844194',
 'NODE_173_length_232916_cov_265.596429',
 'NODE_3018_length_30458_cov_4.146045',
 'NODE_35_length_543908_cov_16.984931',
 'NODE_32_length_556671_cov_424.843629',
 'NODE_145_length_273263_cov_31.304994',
 'NODE_741_length_79403_cov_185.477208',
 'NODE_217_length_208486_cov_6.840141',
 'NODE_208_length_212550_cov_43.804855',
 'NODE_34_length_549459_cov_93.351604',
 'NODE_9_length_911479_cov_90.897796',
 'NODE_10_length_908469_cov_12.406104',
 'NODE_39_length_517803_cov_7.286389',
 'NODE_2119_length_39630_cov_2.885243',
 'NODE_1064_length_63035_cov_6.873725',
 'NODE_62_length_444389_cov_18.131093',
 'NODE_135_length_281329_cov_13.280030',
 'NODE_176_length_232184_cov_8.619279',
 'NODE_1_length_1214521_cov_6.930895',
 'NODE_2425_length_36271_cov_3.294192',
 'NODE_157_length_250643_cov_11.185995']

In [15]:
#set protoze potrebujeme scaffold pouzit jako seqidlist v ramci blastn jen jednou
set_scaffolds_of_interest = set(scaffolds_of_interest)
len(set_scaffolds_of_interest)

118

In [16]:
# ted potrebujeme srovnat scaffolds_of_interest s genome_scaffold_ids a pak to pouzijeme pro seqidlist
# ziskame dict: genomes a jejich filtered scaffolds (vice virovych orfu na jednom skafoldu -> redundance)
real_genome_scaffold_ids = {}

for genome, ids in genome_scaffold_orfs_descr.items():
    truncated_ids = [id.split(';')[0] for id in ids] 
    real_genome_scaffold_ids[genome] = truncated_ids

filtered_genome_scaffold_ids = {}

for genome, ids in real_genome_scaffold_ids.items():
    for scaffold_id in ids:
        truncated_id = scaffold_id.split('_')[:-1] 
        truncated_id = '_'.join(truncated_id)
        if truncated_id in set_scaffolds_of_interest:
            if genome not in filtered_genome_scaffold_ids:
                filtered_genome_scaffold_ids[genome] = []  # initialize the list if it doesn't exist
            filtered_genome_scaffold_ids[genome].append(scaffold_id)  

print(filtered_genome_scaffold_ids)

{'Chic10_scaffolds': ['NODE_381_length_27838_cov_14.2215_25', 'NODE_352_length_29307_cov_12.7034_4', 'NODE_45_length_194274_cov_4.45832_227', 'NODE_16_length_371769_cov_13.539_405', 'NODE_1_length_1589435_cov_13.0886_269', 'NODE_10_length_626405_cov_4.34645_166', 'NODE_70_length_144148_cov_190.155_76', 'NODE_498_length_23434_cov_17.1238_3', 'NODE_33_length_241831_cov_197.129_26', 'NODE_89_length_113149_cov_3.22244_46', 'NODE_33_length_241831_cov_197.129_40', 'NODE_146_length_60639_cov_3.37158_5', 'NODE_95_length_100189_cov_309.803_24', 'NODE_122_length_74444_cov_387.187_15', 'NODE_381_length_27838_cov_14.2215_15', 'NODE_55_length_165510_cov_190.906_69', 'NODE_58_length_162532_cov_4.15186_9', 'NODE_98_length_98422_cov_62.5464_9', 'NODE_43_length_199121_cov_7.50572_77', 'NODE_2_length_969053_cov_7.62238_389', 'NODE_37_length_230641_cov_4.40646_312', 'NODE_11_length_515168_cov_3.2278_362', 'NODE_63_length_153037_cov_4.46219_187', 'NODE_244_length_37664_cov_11.8205_8', 'NODE_104_length_859

In [14]:
sum(len(id) for id in filtered_genome_scaffold_ids.values())

137

## BLASTn to identify viral repeats - otestovat, jak to bude hledat repetice kdyz se bude blastovat ne proti celemu skafoldu ale proti tem frakcim /home/majnusova/all/projects/plv/data/eve_outscaffolds/

In [19]:
def blastn_repeats(query, output, database, seqid, outfmt):
    command = ["blastn",
               "-query", str(query), 
               "-out", str(output),
               "-db", str(database), 
               "-strand", "minus",
              # "-dust", "no",
            #   "-word_size", "9",
               "-seqidlist", str(seqid), 
               "-num_threads", "5", # zmenit na clusteru
               "-outfmt", str(outfmt)]
    subprocess.run(command, check=True) 


output_dir = "/home/majnusova/all/projects/plv/data/blastn_tirs/"
blastables_outdir = "/home/majnusova/all/projects/plv/data/blastables/"
path_to_input = "/home/majnusova/all/projects/plv/data/eve_outscaffolds/"

for genome, scaffold_ids in filtered_genome_scaffold_ids.items():
    database = f"{blastables_outdir}/{genome}" 
    outdir = os.path.join(output_dir, genome)
    os.makedirs(outdir, exist_ok=True)
    
    for scaffold_id in scaffold_ids:  
        query_file = f"{scaffold_id}.fasta" # scaffold_id = NODE_135_length_281329_cov_13.280030_127 -> NODE_135_length_281329_cov_13.280030_127.fasta
        query = os.path.join(path_to_input, genome, query_file) #/home/majnusova/all/projects/plv/data/eve_outscaffolds/NODE_135_length_281329_cov_13.280030_127.fasta
        seqidlist_file = '_'.join(scaffold_id.split('_')[:-1]) #NODE_135_length_281329_cov_13.280030
        seqidlist_path = os.path.join(outdir, f"{seqidlist_file}_seqid.txt") #/home/majnusova/all/projects/plv/data/alias_test/NODE_135_length_281329_cov_13.280030
        
        with open(seqidlist_path, 'w') as file:
            file.write(seqidlist_file + '\n')  

        output_file = f"{scaffold_id}_repeats.txt"
        output = os.path.join(outdir, output_file)
        output_file2 = f"{scaffold_id}_repeats_human.txt"
        output2 = os.path.join(outdir, output_file2)

    # Checking if the query file exists to avoid the error!
        if os.path.exists(query):
            # outfmt 6 output
            blastn_repeats(query, output, database, seqidlist_path, 5) # should be 5
            # human-readable output
            blastn_repeats(query, output2, database, seqidlist_path, 0)
        else:
            print(f"Query file {query} does not exist, skipping this scaffold ID.")



## Searching for suitable repeats in the blast outputs

In [None]:
TIRs: 200-1200 nt long; musi byt od sebe vzdalene 15 000-35 000
len(hsp.match)) = delka zalignovaneho useku vcetne gapu a mismatchu (Identities = 338/403 (84%)) -> len(hsp.match)) = 403

In [22]:
from Bio.Blast import NCBIXML

blastout_dir = "/home/majnusova/all/projects/plv/data/blastn_tirs/Vischeria_C74_genome_v1/Vischeria_C74_contig_16_620_repeats.txt"

hsp_info = {}

with open(blastout_dir) as blast_handle:
    blast_records = NCBIXML.parse(blast_handle)
    for blast_record in blast_records:
        for hsp_num, hsp in enumerate(alignment.hsps, start=1): # enumerating each HSP - the number will be used as a key to store additional values 
            hsp_info[hsp_num] = {            
            'bit_score': hsp.bits,
            'score': hsp.score,
            'evalue': hsp.expect,
            'gaps': hsp.gaps,
            'identities': hsp.identities,
            'aln_len' : len(hsp.match)
            }
hsp_info

{1: {'bit_score': 974.305,
  'score': 527.0,
  'evalue': 0.0,
  'gaps': 0,
  'identities': 527,
  'aln_len': 527},
 2: {'bit_score': 974.305,
  'score': 527.0,
  'evalue': 0.0,
  'gaps': 0,
  'identities': 527,
  'aln_len': 527},
 3: {'bit_score': 366.757,
  'score': 198.0,
  'evalue': 6.93911e-100,
  'gaps': 19,
  'identities': 338,
  'aln_len': 403},
 4: {'bit_score': 344.597,
  'score': 186.0,
  'evalue': 3.25137e-93,
  'gaps': 4,
  'identities': 208,
  'aln_len': 218},
 5: {'bit_score': 331.671,
  'score': 179.0,
  'evalue': 2.53131e-89,
  'gaps': 5,
  'identities': 274,
  'aln_len': 320},
 6: {'bit_score': 327.977,
  'score': 177.0,
  'evalue': 3.27446e-88,
  'gaps': 2,
  'identities': 200,
  'aln_len': 211},
 7: {'bit_score': 313.204,
  'score': 169.0,
  'evalue': 9.16885e-84,
  'gaps': 1,
  'identities': 216,
  'aln_len': 239},
 8: {'bit_score': 302.124,
  'score': 163.0,
  'evalue': 1.98471e-80,
  'gaps': 6,
  'identities': 206,
  'aln_len': 226},
 9: {'bit_score': 296.584,
  '

In [17]:
blastout_dir = "/home/majnusova/all/projects/plv/data/blastn_tirs/"
#def find_tirs(blastout, singleline_scaffold, outf)
hsp_info = {}
for genome in filtered_genome_scaffold_ids.keys(): #dict obsahuje genomy a ids virovych scaffoldu vcetne _orfnumber
    fullpath = os.path.join(blastout_dir, genome)
    for blastout in os.listdir(fullpath):
        blastout_path = os.path.join(fullpath, blastout)
        if blastout.endswith("repeats.txt"):
            with open(blastout_path) as blast_handle:
                blast_records = NCBIXML.parse(blast_handle)
                for blast_record in blast_records:
                    for hsp_num, hsp in enumerate(alignment.hsps, start=1): # enumerating each HSP - the number will be used as a key to store additional values 
                        hsp_info[hsp_num] = {            
                        'bit_score': hsp.bits,
                        'score': hsp.score,
                        'evalue': hsp.expect,
                        'gaps': hsp.gaps,
                        'identities': hsp.identities,
                        'aln_len' : len(hsp.match)
                        }
hsp_info

NameError: name 'alignment' is not defined

In [21]:
from collections import defaultdict
import os
from Bio.Blast import NCBIXML

blastout_dir = "/home/majnusova/all/projects/plv/data/blastn_tirs/"

for genome in filtered_genome_scaffold_ids.keys():
    fullpath = os.path.join(blastout_dir, genome)
    for blastout in os.listdir(fullpath):
        blastout_path = os.path.join(fullpath, blastout)
        if blastout.endswith("repeats.txt"):
            hsp_info = defaultdict(list)
            with open(blastout_path) as blast_handle:
                blast_records = NCBIXML.parse(blast_handle)
                for blast_record in blast_records:
                    for alignment in blast_record.alignments:
                        for hsp in alignment.hsps:
                            key = (hsp.bits, hsp.score, hsp.expect, hsp.gaps, hsp.identities, len(hsp.match))
                            hsp_info[key].append(hsp)

            for key, hsps in hsp_info.items():
                if len(hsps) > 1:
                    if len(hsp.match) >= 200 and len(hsp.match) <= 1200:
                        print(f"Soubor: {blastout}, Dvojice HSP s identickými hodnotami (bit_score, evalue, identities): {key}")
                        for hsp in hsps:
                            print(f"HSP: bit_score={hsp.bits}, evalue={hsp.expect}, identities={hsp.identities}")
                        print("-----")


Soubor: NODE_352_length_29307_cov_12.7034_4_repeats.txt, Dvojice HSP s identickými hodnotami (bit_score, evalue, identities): (183.939, 99.0, 3.6499e-47, 37, 216, 265)
HSP: bit_score=183.939, evalue=3.6499e-47, identities=216
HSP: bit_score=183.939, evalue=3.6499e-47, identities=216
-----
Soubor: Vischeria_C74_contig_33_10_repeats.txt, Dvojice HSP s identickými hodnotami (bit_score, evalue, identities): (1157.12, 626.0, 0.0, 1, 637, 642)
HSP: bit_score=1157.12, evalue=0.0, identities=637
HSP: bit_score=1157.12, evalue=0.0, identities=637
-----
Soubor: NODE_4244_length_22945_cov_43.032228_19_repeats.txt, Dvojice HSP s identickými hodnotami (bit_score, evalue, identities): (558.808, 302.0, 3.18082e-160, 0, 304, 305)
HSP: bit_score=558.808, evalue=3.18082e-160, identities=304
HSP: bit_score=558.808, evalue=3.18082e-160, identities=304
-----
Soubor: NODE_3601_length_26483_cov_35.616072_22_repeats.txt, Dvojice HSP s identickými hodnotami (bit_score, evalue, identities): (1411.96, 764.0, 0.0

In [None]:
----------------------------------------ok

## Turning the extracted scaffolds into single line files to make it possible to search for the repeats flanking the EVEs

In [None]:
scaffolds_dir = "/home/majnusova/all/projects/plv/data/eve_outscaffolds/"
singleline_dir = "/home/majnusova/all/projects/plv/data/eve_singleline_scaffolds/" 

for outscaffold in os.listdir(scaffolds_dir):  #os.listdir lists all files/directories; not paths
    outscaffold_path = os.path.join(scaffolds_dir, outscaffold) # paths to all the files listed
    with open(outscaffold_path, "r") as infile:
        lines = infile.readlines()
    header = lines[0]  # fasta header
    joined_lines = ''.join(line.strip() for line in lines[1:])  # Join all lines except the first

    output_file_path = os.path.join(singleline_dir, outscaffold)  
    with open(output_file_path, "w") as outfile:
        outfile.write(header) 
        outfile.write(joined_lines)  

### Find suitable repeats and  separate them from the rest of the genome (2x \n at both 5' and 3' end)

## prepare blastn to find DRs (shortest one has 48 nt, longest one 101 nt) 

In [None]:
kdyz se na jednom scaffoldu nachazi vice virovych proteinu (v hmmmout contig1_2, contig1_13...), tak dat id contigu do setu pro blastn,
aby se to zblastovalo jen jednou (cely skafold) a pak zkusit odentrovat vsechny vhodne repetice, co to najde a hledat pak jestli v nich je 
protein nalezeny hmmerem - kdyz jo, ulozit pod hmmer orf id jako soubor s EVE

In [1]:
from Bio import SeqIO
from collections import defaultdict

# Načtení sekvence
record = SeqIO.read("sliding_test.fasta", "fasta")
sequence = str(record.seq).upper()

# Funkce pro hledání přímých opakování
def find_direct_repeats(seq, min_size=3, max_size=8):
    repeats = defaultdict(list)
    length = len(seq)
    
    # Procházení sekvencí posuvným oknem
    for size in range(min_size, max_size + 1):
        for i in range(length - size + 1):
            fragment = seq[i:i+size]
            if seq.count(fragment) > 1:
                repeats[fragment].append(i)
    
    # Filtrace opakování, která se vyskytují více než jednou
    direct_repeats = {k: v for k, v in repeats.items() if len(v) > 1}
    return direct_repeats

# Použití funkce
direct_repeats = find_direct_repeats(sequence)
for repeat, positions in direct_repeats.items():
    print(f"Repeat: {repeat}, Positions: {positions}")


Repeat: TGA, Positions: [0, 454, 477, 539, 662, 869, 872, 881, 891, 998, 1106, 1122, 1178, 1228, 1288, 1300, 1359, 1498, 1530, 1728, 1742, 1807, 1817, 1954, 1966, 1970, 2001, 2006, 2050, 2055, 2078, 2086, 2093, 2137, 2146, 2209, 2237, 2540, 2545, 2555, 2602, 2774, 2895, 2953, 2981, 3118, 3207, 3252, 3270, 3302, 3329, 3381, 3401, 3422, 3511, 3575, 3590, 3673, 3681, 3746, 3772, 3788, 3952, 3998, 4054, 4115, 4201, 4204, 4247, 4276, 4283, 4295, 4298, 4322, 4328, 4345, 4376, 4408, 4412, 4503, 4861, 4874, 4886, 4913, 4919, 4987, 4995, 5002, 5007, 5058, 5064, 5172, 5196, 5284, 5299, 5304, 5370, 5409, 5430, 5464, 5479, 5499, 5527, 5634, 5714, 5809, 5866, 5929, 6014, 6210, 6279, 6328, 6385, 6396, 6472, 6493, 6522, 6532, 6564, 6594, 6616, 6625, 6631, 6634, 6684, 6691, 6718, 6730, 6790, 6798, 7003, 7012, 7039, 7153, 7188, 7275, 7330, 7365, 7414, 7462, 7534, 7557, 7581, 7596, 7608, 7627, 7698, 7765, 7768, 7797, 7870, 7873, 7954, 8005, 8008, 8062, 8071, 8076, 8145, 8176, 8179, 8212, 8299, 8370, 837

## extract orfs between tirs - pujde v pohode i bez vymezeneho TSD. kontrola spravnosti elementu - v orfech musi byt protein identifikovany hmmerem hned na zacatku... 

In [None]:
blastn -query visch13_scaf.txt -out visch13_repeats_dust_plus.txt -db Vischeria_C74_genome_v1 -word_size 8 -seqidlist seqid.txt -num_threads 5 -strand plus
