In [18]:
import sqlite3
from Bio import SeqIO, SeqFeature, SeqRecord
from collections import defaultdict
import pickle
import glob
import os

In [19]:
list_of_assemblies = glob.glob('/home/mf019/longread_pangenome/longread_analysis/paired_assemblies/paired_only/*/annotation/*/*.gbff')

In [56]:
# Lets set up our parsing function. This is just the barebones function. I'll expand it with additional helpers
# once I reach that point :)
# primarily to include the plasmid name for the replicon.
# and to
def parse_genbank(file_path):
    assembly_id = os.path.basename(file_path).split('.')[0]
    assemblies = SeqIO.parse(file_path, "genbank")
    genes = []
    for record in assemblies:
        for feature in record.features:
            if feature.type in ["gene", "CDS", "mRNA", "tRNA", "rRNA", "signal_peptide"]:
                gene_info = {
                    "feature_type": feature.type,
                    "gene_id": feature.id,
                    "gene": feature.qualifiers.get("gene", ["unknown"])[0],
                    "gene_name": feature.qualifiers.get("name", ["unknown"])[0],
                    "gene_family": feature.qualifiers.get("note", ["unknown"])[0],
                    "protein_id": feature.qualifiers.get("protein_id", ["unknown"])[0],
                    "product": feature.qualifiers.get("product", ["unknown"])[0],
                    "sequence": str(feature.extract(record.seq)),
                    "contig": record.id,
                    "replicon_name": "Placeholder for merging with calls :)",
                    "start": feature.location.start,
                    "end": feature.location.end,
                    "strand": feature.location.strand,
                    "dbxrefs": feature.qualifiers.get("dbxrefs",["No crossrefs?"])[0],
                    "assembly": assembly_id,
                    "refseq_id": feature.qualifiers.get("RefSeq", ["No RefSeq?"])[0],
                    "gene_seq": str(feature.extract(record.seq)),
                }
                genes.append(gene_info)
    return genes

In [57]:
all_genes = []
total_feats = 0
for assembly in list_of_assemblies:
    current_genes = []
    assembly_id = os.path.basename(assembly).split('.')[0]
    print(assembly_id)
    current_genes = parse_genbank(assembly)
    num_feats = len(current_genes)
    print(num_feats)
    total_feats += num_feats
    all_genes.append(current_genes)
all_genes_flat = [feature for features in all_genes for feature in features]

URI87H
3009
URI34H
2667
URI88H
3081
URI33H
2621
UCT110H
2763
URI39H
3161
URI91H
3225
UCT35H
2633
UWI247P
2677
URI120H
3167
URI107H
3035
UWI263P
2821
URI89H
3129
URI42H
2921
URI44H
2759
UCT109H
2953
URI40H
3017
URI117H
2671
URI47H
3031
URI86H
2735
URI36H
2923
UNY208P
2829
ESI26H
2503
UCT31H
3105
URI56H
2451
UCT30H
2847
URI103H
2925
UCT29H
3173
URI112H
2809
UWI248P
2751
UNY203P
2655
UCT96H
2979
UCT32H
2995
UNY193P
2793
UCT113H
2455
URI93H
2929
UNY169P
2685
UWI283P
2631
URI102H
2863
URI41H
2951
UNY172P
2673
UNY149P
2855
UCT92H
3101
URI118H
2711
UCT50H
2647
URI101H
3105
URI46H
2803
URI48H
2723
URI111H
3103
URI47
2665
URI103
2571
UCT31
2625
UCT92
2649
URI46
2633
UWI247
2481
UCT30
2597
URI44
2591
URI87
2673
UCT50
2611
UWI283
2511
URI107
2679
UCT110
2659
URI48
2665
UNY203
2513
UWI263
2465
URI117
2443
UCT35
2571
URI111
2723
URI33
2601
URI86
2567
UNY149
2617
UCT96
2499
URI56
2579
URI118
2383
UNY172
2499
URI112
2567
UNY208
2485
URI89
2751
URI102
2659
URI36
2623
UWI248
2487
ESI26
2581
URI40
2715


In [58]:
all_feats = len(all_genes_flat)
print(f'list: {all_feats}')
print(f'actual: {total_feats}')

list: 268857
actual: 268857


In [59]:
def create_database(db_name):
    conn = sqlite3.connect(db_name)
    cursor = conn.cursor()
    cursor.execute('''CREATE TABLE IF NOT EXISTS genes (
                      id INTEGER PRIMARY KEY AUTOINCREMENT,
                      feature_type TEXT,
                      gene_id TEXT,
                      gene TEXT,
                      gene_name TEXT,
                      gene_family TEXT,
                      protein_id TEXT,
                      product TEXT,
                      sequence TEXT,
                      contig TEXT,
                      replicon_name TEXT,
                      start INTEGER,
                      end INTEGER,
                      strand INTEGER,
                      dbxrefs TEXT,
                      assembly TEXT,
                      refseq_id TEXT,
                      gene_seq TEXT)''')
    conn.commit()
    conn.close()
    
def populate_database(db_name, genes):
    conn = sqlite3.connect(db_name)
    cursor = conn.cursor()
    for gene in genes:
        cursor.execute('''INSERT INTO genes
                          (feature_type, gene_id, gene, gene_name, gene_family, protein_id, product, sequence, contig, replicon_name, start, end, strand, dbxrefs, assembly, refseq_id, gene_seq)
                          VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)''',
                       (gene["feature_type"], gene["gene_id"], gene["gene"], gene["gene_name"], gene["gene_family"],
                        gene["protein_id"], gene["product"], gene["sequence"], gene["contig"],
                        gene["replicon_name"], gene["start"], gene["end"], gene["strand"], gene["dbxrefs"],
                        gene["assembly"], gene["refseq_id"], gene["gene_seq"]))
    conn.commit()
    conn.close()

In [60]:
# Example usage
def query_database(db_name, **kwargs):
    conn = sqlite3.connect(db_name)
    cursor = conn.cursor()
    
    query = "SELECT * FROM genes WHERE "
    conditions = []
    values = []
    
    for key, value in kwargs.items():
        conditions.append(f"{key} = ?")
        values.append(value)
    
    query += " AND ".join(conditions)
    cursor.execute(query, values)
    results = cursor.fetchall()
    conn.close()
    return results

In [61]:
create_database("genes.db")
populate_database("genes.db",all_genes_flat)

ProgrammingError: Error binding parameter 17: type 'Seq' is not supported