In [None]:
import pandas as pd
from Bio import SeqIO

# Read the tab-separated CheckV results table supplied by the workflow.
checkv_df = pd.read_csv(str(snakemake.input.checkv_results), sep="\t")

# Build the row mask. The gene-count thresholds always apply; the
# completeness threshold is only added when the parameter is not the
# empty string (i.e. when a minimum completeness was actually configured).
min_completeness = snakemake.params.min_completeness
keep_rows = (
    (checkv_df["viral_genes"] >= snakemake.params.min_viral_genes)
    & (checkv_df["host_genes"] <= snakemake.params.max_bacterial_genes)
)
if min_completeness != '':
    keep_rows = (checkv_df["completeness"] >= min_completeness) & keep_rows
checkv_filtered = checkv_df[keep_rows]

# Contig identifiers that passed the filter; a set gives O(1) membership tests.
filtered_contigs = set(checkv_filtered["contig_id"])
filtered_seqs = []

# Provirus records: drop everything from the last underscore onwards to get
# the identifier used in the results table (presumably a per-provirus suffix
# added to the parent contig name -- confirm against the CheckV output).
# Retained records are written out under that shortened identifier.
for provirus in SeqIO.parse(str(snakemake.input.checkv_proviruses), "fasta"):
    parent_id = provirus.id.rpartition('_')[0]
    if parent_id not in filtered_contigs:
        continue
    provirus.id = parent_id
    filtered_seqs.append(provirus)

# Virus records: identifiers are compared against the table unchanged.
filtered_seqs.extend(
    virus
    for virus in SeqIO.parse(str(snakemake.input.checkv_viruses), "fasta")
    if virus.id in filtered_contigs
)

# Write the retained proviruses followed by the retained viruses as FASTA.
SeqIO.write(filtered_seqs, str(snakemake.output), "fasta")
