In [1]:
import sqlite3
from Bio import SeqIO, SeqFeature, SeqRecord
from collections import defaultdict
import pickle
import glob
import os
import pandas
from tqdm.notebook import tqdm

In [2]:
#####################################################################################################
#:::::::::::::::::::::::::::::::THESE HELPERS WORK WITH BOTH METHODS::::::::::::::::::::::::::::::::#
#####################################################################################################
def get_tables(db_name):
    """
    Lists the tables defined in an SQLite database.

    Args:
        db_name (str): Name of the SQLite database file

    Returns:
        pandas.DataFrame: One row per table, in a single 'name' column
    """
    conn = sqlite3.connect(db_name)
    try:
        # sqlite_master is the schema catalogue; type='table' leaves out
        # the other kinds of schema objects stored there.
        return pandas.read_sql_query("SELECT name FROM sqlite_master WHERE type='table';", conn)
    finally:
        # Close even when the query raises so a failed call does not leak the handle.
        conn.close()

def get_columns(db_name, table):
    """
    Describes the columns of one table using PRAGMA table_info.

    Args:
        db_name (str): Name of the SQLite database file
        table (str): Name of the table to describe

    Returns:
        pandas.DataFrame: The PRAGMA table_info result, one row per column
    """
    conn = sqlite3.connect(db_name)
    try:
        # The table name is interpolated straight into the statement,
        # so only pass trusted names.
        return pandas.read_sql_query(f"PRAGMA table_info({table});", conn)
    finally:
        # Close even when the statement raises so the handle is not leaked.
        conn.close()

def table_head(db_name, table, n=10):
    """
    Returns the first rows of a table as a DataFrame.

    Args:
        db_name (str): Name of the SQLite database file
        table (str): Name of the table to preview (interpolated into the
            query, so only pass trusted names)
        n (int): Maximum number of rows to return; defaults to 10

    Returns:
        pandas.DataFrame: Up to `n` rows of `table`
    """
    conn = sqlite3.connect(db_name)
    try:
        # The row limit is bound as a parameter; the result gets its own
        # local name instead of shadowing the function.
        head = pandas.read_sql_query(f"SELECT * FROM {table} LIMIT ?;", conn, params=(n,))
    finally:
        # Close even when the query raises (e.g. unknown table).
        conn.close()
    return head

def get_all_of_gene(db_name, table, gene):
    """
    Returns every CDS row of a table whose 'gene' column equals `gene`.

    Args:
        db_name (str): Name of the SQLite database file
        table (str): Name of the table to query (interpolated into the
            query, so only pass trusted names)
        gene (str): Gene name to match exactly

    Returns:
        pandas.DataFrame: Matching rows; empty if nothing matches
    """
    conn = sqlite3.connect(db_name)
    try:
        # Bind the gene name instead of splicing it into the SQL text, so
        # names containing quotes neither break nor alter the query.
        query = f"SELECT * FROM {table} WHERE feature_type = 'CDS' AND gene = ?"
        return pandas.read_sql_query(query, conn, params=(gene,))
    finally:
        # Close even when the query raises so the handle is not leaked.
        conn.close()

def get_all_of_product(db_name, table, product):
    """
    Returns every CDS row of a table whose 'product' column equals `product`.

    Args:
        db_name (str): Name of the SQLite database file
        table (str): Name of the table to query (interpolated into the
            query, so only pass trusted names)
        product (str): Product description to match exactly

    Returns:
        pandas.DataFrame: Matching rows; empty if nothing matches
    """
    conn = sqlite3.connect(db_name)
    try:
        # Bind the product text instead of splicing it into the SQL: a value
        # containing an apostrophe (e.g. "3'-5' exonuclease") would otherwise
        # terminate the string literal early.
        query = f"SELECT * FROM {table} WHERE feature_type = 'CDS' AND product = ?"
        return pandas.read_sql_query(query, conn, params=(product,))
    finally:
        # Close even when the query raises so the handle is not leaked.
        conn.close()

def dump_table_to_df(db_name, table):
    """
    Loads an entire table into a DataFrame.

    Args:
        db_name (str): Name of the SQLite database file
        table (str): Name of the table to load (interpolated into the
            query, so only pass trusted names)

    Returns:
        pandas.DataFrame: Every row and column of `table`
    """
    conn = sqlite3.connect(db_name)
    try:
        return pandas.read_sql_query(f"SELECT * FROM {table};", conn)
    finally:
        # Close even when the query raises (e.g. unknown table).
        conn.close()

def read_table_to_dict(db_name, table_name):
    """
    Reads an SQLite table and returns a list of dictionaries containing the data.
    Each dictionary corresponds to one row in the table, keyed by column name.
    
    Args:
        db_name (str): Name of the SQLite database file
        table_name (str): Name of the table to read from (interpolated into
            the query, so only pass trusted names)
        
    Returns:
        list: List of dictionaries, one per row, in the order the query returns them
    """
    conn = sqlite3.connect(db_name)
    try:
        cursor = conn.cursor()
        cursor.execute(f'SELECT * FROM {table_name}')
        
        # Column names come from the cursor description (first item of each entry)
        columns = [description[0] for description in cursor.description]
        
        # Pair every row tuple with the column names
        return [dict(zip(columns, row)) for row in cursor.fetchall()]
    finally:
        # Close even when the query raises (e.g. unknown table), so the
        # handle is not leaked.
        conn.close()

def verify_data(original_dict, retrieved_dict):
    """
    Checks that a retrieved record contains every expected gene field.
    
    The set of expected fields is fixed inside this function; `original_dict`
    is accepted but not consulted.
    
    Args:
        original_dict (dict): Original dictionary structure (currently unused)
        retrieved_dict (dict): Retrieved dictionary from database
        
    Returns:
        bool: True if every expected field is present, False otherwise
    """
    required_fields = (
        "feature_type", "gene", "locus_tag", "note", "protein_id",
        "product", "sequence", "replicon", "replicon_name", "start",
        "end", "strand", "db_xrefs", "assembly", "translation",
        "inference", "transl_table",
    )
    
    # Stop at the first missing field.
    for field in required_fields:
        if field not in retrieved_dict:
            return False
    return True
    
def read_and_pickle_genes(db_name: str, table_name: str, pickle_file: str) -> list:
    """
    Reads genes from SQLite database, saves them to a pickle file,
    and returns them.
    
    Args:
        db_name (str): Name of the SQLite database file
        table_name (str): Name of the table to read from
        pickle_file (str): Name of the pickle file to save to; opened in
            'wb' mode, so an existing file is overwritten
    
    Returns:
        list: The rows that were pickled, as returned by read_table_to_dict
    """
    # Read from database using read_table_to_dict
    genes = read_table_to_dict(db_name, table_name)
    
    # Save to pickle file
    with open(pickle_file, 'wb') as f:
        pickle.dump(genes, f)
    
    return genes


In [7]:
# SQLite database file read by the export loop.
db = 'Bbss_db_v1.db'
# Single-table settings; NOTE(review): not referenced by the cells visible
# in this part of the notebook, which iterate over `tables` instead — confirm
# whether they are still needed.
table_name = 'ncbi_gb'
pickle_file = 'bakta_genbank_dict_v1.pkl'

In [15]:
# Pickle output paths
ncbi_wp_genbank_pickle = 'ncbi_genbank_dict_v1.pkl'
bakta_wp_genbank_pickle = 'bakta_genbank_dict_v1.pkl'

# Database table name -> pickle file the table is exported to.
# NOTE(review): the "ncbi_gb" table goes to the bakta-named pickle and
# "reference" to the ncbi-named one — confirm this pairing is intended.
tables = dict(
    reference=ncbi_wp_genbank_pickle,
    ncbi_gb=bakta_wp_genbank_pickle,
)

In [18]:
# Export each table to its pickle file and report what was read.
for table, pkl in tables.items():
    genes = read_and_pickle_genes(db, table, pkl)
    print(f'{table}: Total number of genes retrieved: {len(genes)}')

    # Nothing to list for an empty table
    if not genes:
        continue

    # Show the field names of the first record as a quick schema check
    print("\nFirst gene entry fields:")
    for key in genes[0]:
        print(f"- {key}")

reference: Total number of genes retrieved: 115116

First gene entry fields:
- id
- feature_type
- gene
- locus_tag
- note
- protein_id
- product
- sequence
- replicon
- replicon_name
- start
- end
- strand
- db_xrefs
- assembly
- translation
- inference
- transl_table
ncbi_gb: Total number of genes retrieved: 116186

First gene entry fields:
- id
- feature_type
- gene
- locus_tag
- note
- protein_id
- product
- sequence
- replicon
- replicon_name
- start
- end
- strand
- db_xrefs
- assembly
- translation
- inference
- transl_table


In [13]:
# Preview only the first few records: the export loop reports 100k+ entries
# per table, and each record carries a full nucleotide sequence, so displaying
# the whole list floods the notebook output.
genes[:3]

[{'id': 1,
  'feature_type': 'gene',
  'gene': 'unknown',
  'locus_tag': 'BB_U01',
  'note': 'unknown',
  'protein_id': 'unknown',
  'product': 'unknown',
  'sequence': 'ATGTATAAGTCTGTAAAAGAACAACAAGAAAAAGGAATAGATCATACATGCAGAATACTTATTCTTACCGAAACAATATTTGAAATAAATTTAATATTAGAAAATTATTCTCAAAAAACTCTACTCAAAAAGTATAACGAAAATCTCAAAAACAAAAATCTACCTCCTAGTAATATATCAACAATGAAAAAATACTTAAATCAATTAGAAAAAGAAATAAAAATCATAGCAAAATTCTATTTTAAAAACGATCAATCTCTAATTTATTATAAACTTAATTATACCCTAGAAAAAATTTGGTTAAAACTAATAGAATTATTCTACAAAGAATTAAAACAATTTATACAAAAGAACACTACTACTTAA',
  'replicon': 'AE001582.2',
  'replicon_name': 'lp21',
  'start': 252,
  'end': 618,
  'strand': 1,
  'db_xrefs': '',
  'assembly': 'B31',
  'translation': "['']",
  'inference': '',
  'transl_table': ''},
 {'id': 2,
  'feature_type': 'CDS',
  'gene': 'unknown',
  'locus_tag': 'BB_U01',
  'note': 'unknown',
  'protein_id': 'AAF07699.2',
  'product': 'conserved hypothetical protein',
  'sequence': 'ATGTATAAGTCTGTAAAAGAACAACAAGAAAAAGGAATAGATCATACATGCAGAATACTT