In [136]:
from Bio import SeqIO
import pandas as pd
import gffutils
import os
import yaml

In [137]:
# Read the project configuration, which tells us where the genome data lives
with open('config.yaml', 'r') as config_file:
    config = yaml.safe_load(config_file)

data_path = config['data_path']
my_genome = config['my_genome']

# Each sub-directory directly under the data path is treated as one genome
genomes = [
    entry
    for entry in os.listdir(data_path)
    if os.path.isdir(os.path.join(data_path, entry))
]

# Show the full path of every genome directory that was found
print("Genome files in data path:")
for genome_name in genomes:
    genome_path = os.path.join(data_path, genome_name)
    print(genome_path)

# Path of the genome singled out in the configuration
my_genome_path = os.path.join(data_path, my_genome)
print(f"My genome path: {my_genome_path}")

Genome files in data path:
data/ncbi_dataset/data\GCA_001457635.1
data/ncbi_dataset/data\GCA_019046945.1
data/ncbi_dataset/data\GCA_019048645.1
data/ncbi_dataset/data\GCA_900475505.1
data/ncbi_dataset/data\GCA_900636475.1
data/ncbi_dataset/data\GCA_900637025.1
My genome path: data/ncbi_dataset/data\GCA_900636475.1


In [180]:
class GenomeManager:
    """Locate and read the sequence and annotation files of one genome folder.

    On construction the genome directory is scanned once and the paths of the
    FASTA, GTF, GenBank and GFF files found there are stored on the instance
    (``fna_path``, ``gtf_path``, ``gbff_path``, ``gff_path``); a path stays
    ``None`` when no file with a matching extension exists.
    """

    # Recognised file extensions, compared against the lower-cased file name.
    _FASTA_EXTS = (".fna", ".fa", ".fasta")
    _GTF_EXTS = (".gtf",)
    _GENBANK_EXTS = (".gbff", ".gbk", ".gb")
    _GFF_EXTS = (".gff", ".gff3")

    # Names given to the nine tab-separated GFF columns, in file order.
    _GFF_COLUMNS = (
        "seqid", "source", "feature", "start", "end",
        "score", "strand", "phase", "attributes",
    )

    def __init__(self, base_path: str, genome_id: str):
        """
        base_path: root directory containing all genome subfolders (e.g. "data/ncbi_dataset/data")
        genome_id: name of the genome folder (e.g. "GCA_900636475.1")

        Raises whatever ``os.listdir`` raises if the genome folder cannot be listed.
        """
        self.genome_dir = os.path.join(base_path, genome_id)
        self.fna_path = None
        self.gtf_path = None
        self.gbff_path = None
        self.gff_path = None
        self._find_files()

    def _find_files(self):
        """Scan the genome directory and assign file paths for .fna, .gtf, .gbff, .gff"""
        # os.listdir returns entries in arbitrary order; sorting makes the
        # result deterministic when several files share an extension (the
        # last match in sorted order is the one kept).
        for fname in sorted(os.listdir(self.genome_dir)):
            lower = fname.lower()
            full = os.path.join(self.genome_dir, fname)
            if lower.endswith(self._FASTA_EXTS):
                self.fna_path = full
            elif lower.endswith(self._GTF_EXTS):
                self.gtf_path = full
            elif lower.endswith(self._GENBANK_EXTS):
                self.gbff_path = full
            elif lower.endswith(self._GFF_EXTS):
                self.gff_path = full

    def summary(self):
        """Print what files are detected in this genome folder."""
        print(f"Genome directory: {self.genome_dir}")
        print(f"  FASTA (.fna): {self.fna_path}")
        print(f"  GTF (.gtf):   {self.gtf_path}")
        print(f"  GBFF (.gbff): {self.gbff_path}")
        print(f"  GFF (.gff/.gff3): {self.gff_path}")

    def read_fna(self, sequence_itself: bool = True):
        """Read the FASTA file, which must contain exactly one record.

        sequence_itself: when True return only the record's ``.seq``;
            when False return the whole parsed record.

        Raises FileNotFoundError if no FASTA file was detected and
        ValueError if the file does not hold exactly one sequence.
        """
        if not self.fna_path:
            raise FileNotFoundError("No .fna file found in genome directory.")
        genome = list(SeqIO.parse(self.fna_path, "fasta"))

        if len(genome) != 1:
            raise ValueError(f"Expected 1 sequence in .fna file, found {len(genome)}")

        record = genome[0]
        return record.seq if sequence_itself else record

    def load_gff_from_gffutils(self):
        """Build an in-memory gffutils database from the GFF file and return it.

        Raises FileNotFoundError if no GFF file was detected.
        """
        if not self.gff_path:
            raise FileNotFoundError("No .gff file found in genome directory.")

        gff_db = gffutils.create.create_db(data=self.gff_path, dbfn=':memory:', force=True)
        return gff_db

    def load_gff(self):
        """Load the GFF file into a pandas DataFrame with the nine GFF columns.

        Lines that start with "#" (comments and directives) are dropped and
        reading stops at a "##FASTA" directive, so an embedded sequence
        section is never parsed as annotation rows. A "#" inside a data line
        is kept as data.

        Raises FileNotFoundError if no GFF file was detected.
        """
        if not self.gff_path:
            raise FileNotFoundError("No .gff file found in genome directory.")

        import io  # local import: only this method needs an in-memory text buffer

        # Filter comment lines ourselves instead of using read_csv(comment="#"):
        # that option also cuts a data line short at the first "#" it contains,
        # which silently truncates the attributes column.
        data_lines = []
        with open(self.gff_path, "r", encoding="utf-8") as handle:
            for line in handle:
                if line.startswith("##FASTA"):
                    break
                if line.startswith("#"):
                    continue
                data_lines.append(line)

        gff_db = pd.read_csv(
            io.StringIO("".join(data_lines)),
            sep="\t",
            header=None,
            names=list(self._GFF_COLUMNS),
            dtype={
                "seqid": str,
                "source": str,
                "feature": str,
                "start": int,
                "end": int,
                "score": str,    # may be "." or numeric
                "strand": str,
                "phase": str,
                "attributes": str
            },
            # In case there are "bad lines" (e.g. wrong number of columns), skip them
            on_bad_lines="skip"
        )

        return gff_db

    def extract_genes(self, find_method: str = "random", n: int = 100, random_state=None):
        """Extract gene sequences from the FASTA file based on GFF annotations.

        find_method: "random" samples n gene rows, "first" takes the first n.
        n: number of genes to return; must not exceed the number of gene rows.
        random_state: forwarded to ``DataFrame.sample`` when find_method is
            "random"; pass a fixed value to make the selection reproducible.

        Returns a list of ``(attributes, sequence)`` tuples. Genes on the "-"
        strand are reverse-complemented. The seqid column is not consulted:
        all coordinates are applied to the single sequence from ``read_fna``.

        Raises ValueError for an unknown find_method, a negative n, a GFF
        without gene rows, or fewer gene rows than requested.
        """
        if find_method not in ("random", "first"):
            raise ValueError(f"Unknown find_method: {find_method}")
        if n < 0:
            # head(n) with a negative n would silently mean "all but the last |n|"
            raise ValueError(f"n must be non-negative, got {n}")

        gff_db = self.load_gff()
        genome_seq = self.read_fna(sequence_itself=True)

        # Filter for gene features
        genes = gff_db[gff_db['feature'] == 'gene']
        if genes.empty:
            raise ValueError("No gene features found in GFF file.")
        if len(genes) < n:
            raise ValueError(f"Requested {n} genes, but only found {len(genes)} in GFF file.")

        if find_method == "random":
            sampled_genes = genes.sample(n=n, random_state=random_state)
        else:
            sampled_genes = genes.head(n)

        gene_sequences = []
        for row in sampled_genes.itertuples(index=False):
            # GFF coordinates are 1-based with an inclusive end; a Python slice
            # is 0-based with an exclusive end, so only the start is shifted.
            gene_seq = genome_seq[row.start - 1:row.end]
            if row.strand == '-':
                gene_seq = gene_seq.reverse_complement()
            gene_sequences.append((row.attributes, gene_seq))

        return gene_sequences

In [181]:
# Build a manager for the genome named in the config and print the files it detected.
# NOTE(review): this rebinds `my_genome`, which held the genome folder name (a str)
# after the config cell; from here on it is a GenomeManager instance.
my_genome = GenomeManager(config['data_path'], config['my_genome'])
my_genome.summary()

Genome directory: data/ncbi_dataset/data\GCA_900636475.1
  FASTA (.fna): data/ncbi_dataset/data\GCA_900636475.1\GCA_900636475.1_42197_F01_genomic.fna
  GTF (.gtf):   data/ncbi_dataset/data\GCA_900636475.1\genomic.gtf
  GBFF (.gbff): data/ncbi_dataset/data\GCA_900636475.1\genomic.gbff
  GFF (.gff/.gff3): data/ncbi_dataset/data\GCA_900636475.1\genomic.gff


In [182]:
# Read the single FASTA record of this genome as a bare sequence and display it.
genome = my_genome.read_fna(sequence_itself=True)
genome

Seq('GAATTATCGGAGGAGCGACTTGGCGACAAGTTGTGATGGTGGTAGGAATTGTTT...AGG')

In [183]:
# Parse the GFF annotation into a DataFrame and preview its first rows.
gff_database = my_genome.load_gff()
gff_database.head()

Unnamed: 0,seqid,source,feature,start,end,score,strand,phase,attributes
0,LR134283.1,EMBL,region,1,1953654,.,+,.,ID=LR134283.1:1..1953654;Dbxref=taxon:1328;Is_...
1,LR134283.1,EMBL,gene,641,1258,.,+,.,ID=gene-NCTC10713_00001;Name=NCTC10713_00001;g...
2,LR134283.1,EMBL,CDS,641,1258,.,+,0,ID=cds-VED97112.1;Parent=gene-NCTC10713_00001;...
3,LR134283.1,EMBL,gene,1272,1577,.,+,.,ID=gene-NCTC10713_00002;Name=NCTC10713_00002;g...
4,LR134283.1,EMBL,CDS,1272,1577,.,+,0,ID=cds-VED97113.1;Parent=gene-NCTC10713_00002;...


In [187]:
# Show only the annotation rows whose feature type is 'gene'.
gff_database[gff_database['feature'] == 'gene']

Unnamed: 0,seqid,source,feature,start,end,score,strand,phase,attributes
1,LR134283.1,EMBL,gene,641,1258,.,+,.,ID=gene-NCTC10713_00001;Name=NCTC10713_00001;g...
3,LR134283.1,EMBL,gene,1272,1577,.,+,.,ID=gene-NCTC10713_00002;Name=NCTC10713_00002;g...
5,LR134283.1,EMBL,gene,1662,1958,.,+,.,ID=gene-NCTC10713_00003;Name=NCTC10713_00003;g...
7,LR134283.1,EMBL,gene,2374,2577,.,+,.,ID=gene-NCTC10713_00004;Name=NCTC10713_00004;g...
9,LR134283.1,EMBL,gene,2683,3081,.,+,.,ID=gene-NCTC10713_00005;Name=NCTC10713_00005;g...
...,...,...,...,...,...,...,...,...,...
3929,LR134283.1,EMBL,gene,1950438,1951094,.,+,.,ID=gene-NCTC10713_01983;Name=pac;gbkey=Gene;ge...
3931,LR134283.1,EMBL,gene,1951091,1951522,.,+,.,ID=gene-NCTC10713_01984;Name=ssp5_3;gbkey=Gene...
3933,LR134283.1,EMBL,gene,1951602,1952372,.,+,.,ID=gene-NCTC10713_01985;Name=NCTC10713_01985;g...
3935,LR134283.1,EMBL,gene,1952484,1952726,.,+,.,ID=gene-NCTC10713_01986;Name=NCTC10713_01986;g...


In [186]:
# Pick 1000 gene rows at random and slice their sequences out of the genome.
# The call passes no seed, so the selection differs between runs.
genes = my_genome.extract_genes(find_method="random", n=1000)
print(f"Extracted {len(genes)} gene sequences.")
print(genes[:5])  # Print first 5 gene sequences as a sample

Extracted 1000 gene sequences.
[('ID=gene-NCTC10713_01960;Name=NCTC10713_01960;gbkey=Gene;gene_biotype=protein_coding;locus_tag=NCTC10713_01960', Seq('ATGGAAATTAAAGCAGTCTTTTTTGATATTGATGGGACATTAGTAAATGATAGT...TAA')), ('ID=gene-NCTC10713_00310;Name=punA;gbkey=Gene;gene=punA;gene_biotype=protein_coding;locus_tag=NCTC10713_00310', Seq('ATGAATTTAATGGATAAAATTAACGAAACAGCACAGTTTTTAAAAGATAAAGGA...TAA')), ('ID=gene-NCTC10713_01044;Name=NCTC10713_01044;gbkey=Gene;gene_biotype=protein_coding;locus_tag=NCTC10713_01044', Seq('ATGGGAGGACAATTGTTATATATTGTTTTATTCATATTCTTTATTTGGTACCTG...TAA')), ('ID=gene-NCTC10713_01892;Name=NCTC10713_01892;gbkey=Gene;gene_biotype=protein_coding;locus_tag=NCTC10713_01892', Seq('ATGGTGGAAGTACAGTATTCAGAACAAGCAAGTGATTTAATGGAGCAGTTTTCT...TGA')), ('ID=gene-NCTC10713_01905;Name=rpsI;gbkey=Gene;gene=rpsI;gene_biotype=protein_coding;locus_tag=NCTC10713_01905', Seq('ATGTCACAAGCACAATATGCAGGTACTGGACGTCGTAAAAACGCTGTTGCACGC...TAA'))]
