In [1]:
from Bio import SeqIO
import os
import yaml

In [16]:
# Read the notebook configuration: dataset root and the genome of interest
with open('config.yaml', 'r') as config_file:
    config = yaml.safe_load(config_file)

data_path = config['data_path']
my_genome = config['my_genome']

# Every sub-directory of data_path is treated as one genome; plain files are ignored
genomes = []
for entry in os.listdir(data_path):
    if os.path.isdir(os.path.join(data_path, entry)):
        genomes.append(entry)

# Show the full path of each genome directory that was discovered
print("Genome files in data path:")
for genome_name in genomes:
    genome_path = os.path.join(data_path, genome_name)
    print(genome_path)

# Path of the genome selected in the config file
my_genome_path = os.path.join(data_path, my_genome)
print(f"My genome path: {my_genome_path}")

Genome files in data path:
data/ncbi_dataset/data\GCA_001457635.1
data/ncbi_dataset/data\GCA_019046945.1
data/ncbi_dataset/data\GCA_019048645.1
data/ncbi_dataset/data\GCA_900475505.1
data/ncbi_dataset/data\GCA_900636475.1
data/ncbi_dataset/data\GCA_900637025.1
My genome path: data/ncbi_dataset/data\GCA_900636475.1


In [17]:
import os
import pandas as pd
from Bio import SeqIO


class GenomeManager:
    """Locate and load the sequence/annotation files of one genome directory.

    On construction, ``<base_path>/<genome_id>`` is scanned once for a FASTA
    (``.fna``), a GTF (``.gtf``) and a GenBank flat file (``.gbff``). The
    ``read_*`` methods parse whichever file was found and raise
    ``FileNotFoundError`` when the corresponding file is absent.

    Attributes:
        genome_dir: directory that was scanned.
        fna_path: path of the selected ``.fna`` file, or None if none exists.
        gtf_path: path of the selected ``.gtf`` file, or None if none exists.
        gbff_path: path of the selected ``.gbff`` file, or None if none exists.
    """

    def __init__(self, base_path: str, genome_id: str):
        """
        base_path: path to dataset folder (e.g. data/ncbi_dataset/data)
        genome_id: genome folder name (e.g. GCA_900636475.1)

        Raises whatever ``os.listdir`` raises (e.g. FileNotFoundError) when
        the genome directory does not exist.
        """
        self.genome_dir = os.path.join(base_path, genome_id)
        self.fna_path = None
        self.gtf_path = None
        self.gbff_path = None
        self._find_files()

    def _find_files(self) -> None:
        """Locate .fna, .gtf, and .gbff files within the genome directory.

        Entries are visited in sorted (code-point) order and the first regular
        file with each extension is kept, so the selection is reproducible
        when a directory holds several candidates. ``os.listdir`` alone
        returns entries in arbitrary order.
        """
        found = {}
        for file in sorted(os.listdir(self.genome_dir)):
            candidate = os.path.join(self.genome_dir, file)
            # A sub-directory named e.g. "old.fna" cannot be parsed; skip it.
            if not os.path.isfile(candidate):
                continue
            for extension in (".fna", ".gtf", ".gbff"):
                if file.endswith(extension):
                    # setdefault keeps the first match and ignores later ones.
                    # NOTE(review): with accession-prefixed assembly names
                    # ("GCA_...") uppercase sorts before lowercase helper files
                    # such as "cds_from_genomic.fna" -- confirm for other layouts.
                    found.setdefault(extension, candidate)
                    break

        # Only overwrite an attribute when a file was actually found.
        self.fna_path = found.get(".fna", self.fna_path)
        self.gtf_path = found.get(".gtf", self.gtf_path)
        self.gbff_path = found.get(".gbff", self.gbff_path)

    def read_fna(self) -> list:
        """Read FASTA (.fna) file and return list of sequences.

        Raises:
            FileNotFoundError: no .fna file was found in the genome directory.
        """
        if not self.fna_path:
            raise FileNotFoundError("No .fna file found in genome directory.")
        sequences = list(SeqIO.parse(self.fna_path, "fasta"))
        print(f"Loaded {len(sequences)} sequences from {self.fna_path}")
        return sequences

    def read_gtf(self) -> "pd.DataFrame":
        """Read GTF file into a pandas DataFrame.

        The file is parsed as headerless, tab-separated text with the nine
        column names listed below.

        Raises:
            FileNotFoundError: no .gtf file was found in the genome directory.
        """
        if not self.gtf_path:
            raise FileNotFoundError("No .gtf file found in genome directory.")
        # NOTE: comment="#" makes pandas ignore the remainder of a line after
        # any "#", not only whole-line comments, so an attribute value that
        # contains "#" is cut short at that character.
        df = pd.read_csv(
            self.gtf_path,
            sep="\t",
            comment="#",
            header=None,
            names=[
                "seqid", "source", "type", "start", "end",
                "score", "strand", "phase", "attributes"
            ],
        )
        print(f"Loaded {len(df)} records from {self.gtf_path}")
        return df

    def read_gbff(self) -> list:
        """Read GenBank (.gbff) file and return list of records.

        Raises:
            FileNotFoundError: no .gbff file was found in the genome directory.
        """
        if not self.gbff_path:
            raise FileNotFoundError("No .gbff file found in genome directory.")
        records = list(SeqIO.parse(self.gbff_path, "genbank"))
        print(f"Loaded {len(records)} GenBank records from {self.gbff_path}")
        return records

    def summary(self) -> None:
        """Display available genome files (None is printed for missing ones)."""
        print(f"Genome directory: {self.genome_dir}")
        print(f"FASTA: {self.fna_path}")
        print(f"GTF: {self.gtf_path}")
        print(f"GBFF: {self.gbff_path}")
