In [49]:
import pandas as pd
import re
from pathlib import Path

In [57]:
# Sample identifiers. Each one names a top-level directory and is also the
# prefix of the files inside "<id>/<id>_bakta_annotation/".
bacteria = [
    "3500",
    "3502",
    "3516",
]

# Infix placed between the sample id and "_kofamscan" in the file stem:
# ".hypotheticals" selects the hypotheticals variant, "" the plain one.
version = [
    ".hypotheticals",
    "",
]

def txt_to_csv(file_path: str) -> str:
    """Convert a whitespace-delimited ``<file_path>.txt`` table into ``<file_path>.csv``.

    Parsing rules, applied line by line:

    * blank lines and lines starting with ``#`` are ignored;
    * a leading stand-alone ``*`` token is dropped;
    * the first five remaining tokens become ``Gene``, ``KO``, ``Threshold``,
      ``Score`` and ``E-value``; any further tokens are joined with single
      spaces into ``KO Definition`` (empty string when there are none);
    * a line with fewer than five data tokens is skipped with a warning
      naming the file and line number, instead of aborting the whole
      conversion with an ``IndexError``.

    ``Threshold``, ``Score`` and ``E-value`` are converted to numbers; values
    that cannot be parsed become NaN (written as empty cells).

    Args:
        file_path: Path of the input file *without* the ``.txt`` extension.

    Returns:
        The path of the CSV file that was written (``<file_path>.csv``).

    Raises:
        OSError: If the input cannot be read or the output cannot be written.
    """
    import warnings  # function-scope import: only needed for malformed lines

    columns = ["Gene", "KO", "Threshold", "Score", "E-value", "KO Definition"]
    n_fixed = len(columns) - 1  # every column except the free-text definition

    rows = []
    infile = f"{file_path}.txt"
    with open(infile, "r") as f:
        for line_no, line in enumerate(f, start=1):
            line = line.strip()
            if not line or line.startswith("#"):
                continue

            parts = line.split()
            if parts[0] == "*":  # skip the marker
                parts = parts[1:]

            if len(parts) < n_fixed:
                # Truncated or otherwise malformed record: report and move on
                # rather than failing with an uninformative IndexError.
                warnings.warn(
                    f"{infile}:{line_no}: expected at least {n_fixed} fields, "
                    f"got {len(parts)}; line skipped",
                    stacklevel=2,
                )
                continue

            gene, ko, threshold, score, e_value = parts[:n_fixed]
            definition = " ".join(parts[n_fixed:])
            rows.append([gene, ko, threshold, score, e_value, definition])

    df = pd.DataFrame(rows, columns=columns)

    # errors="coerce" turns non-numeric placeholders into NaN instead of raising.
    for col in ["Threshold", "Score", "E-value"]:
        df[col] = pd.to_numeric(df[col], errors="coerce")

    outfile = f"{file_path}.csv"
    df.to_csv(outfile, index=False)
    return outfile

# Convert every sample/variant table to CSV and keep the returned CSV paths.
# Stems are generated sample-major (all variants of one sample before the
# next sample), so `files` follows that order.
files = list(
    map(
        txt_to_csv,
        (
            f"{b}/{b}_bakta_annotation/{b}{v}_kofamscan"
            for b in bacteria
            for v in version
        ),
    )
)

In [63]:
# Identifiers compared for exact equality against the "KO" column.
ko = [
    "K01580",
    "K20265",
]

# Terms searched for, case-insensitively, inside the "KO Definition" column.
ko_definition = [
    "gadR",
]

def find_genes(filepath, ko, ko_definition):
    """Return the rows of a CSV whose KO or KO definition matches the query.

    A row is kept when its ``KO`` value is one of ``ko`` (exact match after
    stripping whitespace) OR its ``KO Definition`` contains any of the
    ``ko_definition`` terms as a literal, case-insensitive substring.

    Args:
        filepath: CSV file with at least ``KO`` and ``KO Definition`` columns.
        ko: Iterable of KO identifiers; a single string is treated as one
            identifier.
        ko_definition: Iterable of search terms; a single string is treated
            as one term. Empty terms are ignored, and when no usable term
            remains the definition criterion matches nothing.

    Returns:
        A new DataFrame (copy) with the matching rows. Missing values in all
        columns are replaced by empty strings.
    """
    # A bare string would otherwise be iterated character by character
    # (ko_definition) or rejected by isin (ko).
    if isinstance(ko, str):
        ko = [ko]
    if isinstance(ko_definition, str):
        ko_definition = [ko_definition]

    df = pd.read_csv(filepath).fillna("")
    df["KO"] = df["KO"].astype(str).str.strip()
    df["KO Definition"] = df["KO Definition"].astype(str).str.strip()

    mask_ko = df["KO"].isin(ko)

    # Escape each term so regex metacharacters in it are matched literally.
    terms = [re.escape(term) for term in ko_definition if term]
    if terms:
        mask_def = df["KO Definition"].str.contains(
            "|".join(terms), case=False, na=False
        )
    else:
        # An empty pattern matches every string, which would select the
        # whole table; with no terms the definition filter selects nothing.
        mask_def = pd.Series(False, index=df.index)

    return df.loc[mask_ko | mask_def].copy()

In [65]:
# Filter each converted table and write the hits next to it as
# "<stem>_scan.csv". NOTE: the output is tab-delimited even though the
# file name ends in ".csv".
for file in files:
    result = find_genes(file, ko, ko_definition)

    # with_suffix("") removes the trailing ".csv" before "_scan.csv" is appended.
    outfile = f"{Path(file).with_suffix('')}_scan.csv"

    result.to_csv(outfile, sep="\t", index=False)
    print(f"Saved: {outfile}")

Saved: 3500/3500_bakta_annotation/3500.hypotheticals_kofamscan_scan.csv
Saved: 3500/3500_bakta_annotation/3500_kofamscan_scan.csv
Saved: 3502/3502_bakta_annotation/3502.hypotheticals_kofamscan_scan.csv
Saved: 3502/3502_bakta_annotation/3502_kofamscan_scan.csv
Saved: 3516/3516_bakta_annotation/3516.hypotheticals_kofamscan_scan.csv
Saved: 3516/3516_bakta_annotation/3516_kofamscan_scan.csv
