In [1]:
import os
import pathlib
import sqlite3
import pickle
import pandas
from tqdm.notebook import tqdm
from Bio import SeqIO
from collections import defaultdict

In [2]:
def pickle_dict(input_object, output_file):
    with open(output_file, 'wb') as jar:
        pickle.dump(input_object, jar)
    print(f"Successfully Pickled to {output_file}")

######################################################################################################
#::::::::::::::::::::::::::::::::::GENBANK PARSER AND DICT CREATOR:::::::::::::::::::::::::::::::::::#
######################################################################################################
# Lets set up our parsing function. This is just the barebones function.
# I'll expand it with additional helpers once I reach that point :)
# primarily to include the plasmid name for the replicon.and more?
def parse_genbank(file_path, assembly_id, acc2name, contig2plasmid):
    set_of_keys = set()
    ref_flag = True if assembly_id == "REF" else False
    assemblies = SeqIO.parse(file_path, "genbank")
    genes = []
    for record in assemblies:
        if ref_flag == False and assembly_id not in contig2plasmid.keys():
            break
        for feature in record.features:
            if feature.type in ["gene", "CDS", "mRNA", "tRNA", "rRNA", "signal_peptide"]:
                if ref_flag:
                    assembly_id = acc_id2name[record.id]['strain']
                    replicon_name = acc_id2name[record.id]['name']
                else:
                    if record.id not in contig2plasmid[assembly_id]:
                        replicon_name = 'NA'
                    else:
                        replicon_name = contig2plasmid[assembly_id][record.id]['plasmid_name']

                locus_tag = feature.qualifiers.get("locus_tag", ['unknown'])[0]
                db_xrefs = feature.qualifiers.get("db_xref", [])
                db_xref_dict = {xref.split(":")[0]: xref.split(":")[1] for xref in db_xrefs}

                gene_info = {
                    "feature_type": feature.type,
                    "gene": feature.qualifiers.get("gene", ["unknown"])[0],
                    "locus_tag": locus_tag,
                    "note": feature.qualifiers.get("note", ["unknown"])[0],
                    "protein_id": feature.qualifiers.get("protein_id", ["unknown"])[0],
                    "product": feature.qualifiers.get("product", ["unknown"])[0],
                    "sequence": str(feature.extract(record.seq)),
                    "replicon": record.id,
                    "replicon_name": replicon_name,
                    "start": feature.location.start,
                    "end": feature.location.end,
                    "strand": feature.location.strand,
                    "assembly": assembly_id,
                    "translation": str(feature.qualifiers.get('translation',[''])),
                    "inference": feature.qualifiers.get("inference",[""])[0],
                    "transl_table": feature.qualifiers.get("transl_table",[""])[0],
                    'db_xrefs': feature.qualifiers.get("db_xrefs",[""])[0],
                    **db_xref_dict  # Unpack db_xref_dict to include db_xrefs as columns
                }
                genes.append(gene_info)
                set_of_keys.update(gene_info.keys())

    # fill out empty dicts with None!
    for gene in genes:
        for key in set_of_keys:
            if key not in gene:
                gene[key] = None

    return genes

######################################################################################################
#::::::::::::::::::::::::::::::HARD SET SCHEMA METHODS (First attempt):::::::::::::::::::::::::::::::#
######################################################################################################
def create_table(db_name, table_name):
    conn = sqlite3.connect(db_name)
    cursor = conn.cursor()
    cursor.execute(f'''CREATE TABLE IF NOT EXISTS {table_name} (
                      id INTEGER PRIMARY KEY AUTOINCREMENT,
                      feature_type TEXT,
                      gene TEXT,
                      locus_tag TEXT,
                      note TEXT,
                      protein_id TEXT,
                      product TEXT,
                      sequence TEXT,
                      replicon TEXT,
                      replicon_name TEXT,
                      start INTEGER,
                      end INTEGER,
                      strand INTEGER,
                      db_xrefs TEXT,
                      assembly TEXT,
                      translation TEXT,
                      inference TEXT,
                      transl_table INT)''')
    conn.commit()
    conn.close()

def populate_table(db_name, table_name, genes):
    conn = sqlite3.connect(db_name)
    cursor = conn.cursor()
    for gene in genes:
        cursor.execute(f'''INSERT INTO {table_name}
                          (feature_type, gene, locus_tag, note, protein_id, product, sequence, replicon, replicon_name, start, end, strand, db_xrefs, assembly, translation, inference, transl_table)
                          VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)''',
                       (gene["feature_type"], gene["gene"], gene["locus_tag"], gene["note"],
                        gene["protein_id"], gene["product"], gene["sequence"], gene["replicon"],
                        gene["replicon_name"], gene["start"], gene["end"], gene["strand"], gene["db_xrefs"],
                        gene["assembly"], gene["translation"], gene["inference"], gene["transl_table"]))
    conn.commit()
    conn.close()
    
######################################################################################################
#::::::::::::::::::::::::::::::AUTOMATED CREATION METHOD (2nd attempt):::::::::::::::::::::::::::::::#
######################################################################################################
# Make table and populate dynamically since we don't know all of the db_xrefs
# (or rather haven't used them in SCHEMA creation)
def create_and_populate_database(genes, db_name):
    df = pandas.DataFrame(genes)
    conn = sqlite3.connect(db_name)
    df.to_sql('annotations', conn, if_exists='replace', index=False)
    conn.close()


#####################################################################################################
#:::::::::::::::::::::::::::::::THESE HELPERS WORK WITH BOTH METHODS::::::::::::::::::::::::::::::::#
#####################################################################################################
def get_tables(db_name):
    conn = sqlite3.connect(db_name)
    tables = pandas.read_sql_query("SELECT name FROM sqlite_master WHERE type='table';", conn)
    conn.close()
    return tables

def get_columns(db_name, table):
    # Describe the structure of the 'genes' table
    conn = sqlite3.connect(db_name)
    columns = pandas.read_sql_query(f"PRAGMA table_info({table});", conn)
    conn.close()
    return columns

def table_head(db_name, table):
    conn = sqlite3.connect(db_name)
    table_head = pandas.read_sql_query(f"SELECT * FROM {table} LIMIT 10;", conn)
    conn.close()
    return table_head

def get_all_of_gene(db_name, table, gene):
    conn = sqlite3.connect(db_name)
    all_genes = pandas.read_sql_query(f"SELECT * FROM {table} WHERE feature_type = 'CDS' AND gene='{gene}'", conn)
    conn.close()
    return all_genes

def get_all_of_product(db_name, table, product):
    conn = sqlite3.connect(db_name)
    all_products = pandas.read_sql_query(f"SELECT * FROM {table} WHERE feature_type = 'CDS' AND product='{product}'", conn)
    conn.close()
    return all_products

def dump_table_to_df(db_name, table):
    conn = sqlite3.connect(db_name)
    dataframe = pandas.read_sql_query(f"SELECT * FROM {table};", conn)
    conn.close()
    return dataframe

def connect_to_db(db_or_conn):
    # is our input a db path or a connection to a db?
    if isinstance(db_or_conn, str):
        # it is string, it must be a db path. Create a new connection
        conn = sqlite3.connect(db_or_conn)
        close_conn = True
    else:
        # Ain't a string so must be connection. Use the existing connection
        conn = db_or_conn
        close_conn = False
    return conn, close_conn

def get_gene_from_baktaID(db_or_conn, table, bakta_id):
    """
    example:
    print(get_gene_from_baktaID(db, 'assemblies', 'CKDGNL_04300'))
    """
    # check input db_or_conn and handle it.
    conn, close_conn = connect_to_db(db_or_conn)
    all_genes = pandas.read_sql_query(f"SELECT * FROM {table} WHERE locus_tag='{bakta_id}'", conn)
    # Pack it in pack it out!
    if close_conn:
        conn.close()
    return all_genes

# SQL should never be written more than once and here is my function.
def query_term_vs_multi_column(connection, table, term, columns):
    # Construct the SQL query
    query = f"""
    SELECT *
    FROM {table}
    WHERE {' OR '.join([f"{col} = ?" for col in columns])}
    """
    # try to find the term in each column.
    try:
        result_df = pandas.read_sql_query(query, connection, params=[term] * len(columns))
        #print(f"Query for term '{term}' succeeded.")
        return result_df # return our dataframe!
    except Exception as e:
        print(f"Query for term '{term}' failed with error: {e}")
        return pandas.DataFrame() # Empty on error.


In [11]:
db_v = 'v3.1'                  # database_version
ds_v = 'v5'                  # dataset_version
pg_v = 'v8'                  # pangenome_version
pc_v = 'v10.2'                  # plasmid calls version
pg_t = f'{pg_v}_filtered_sp' # pangenome type [('split paralogs', '*_sp'), ('no split paralogs', '*_nsp')]
db_n = f'Bbss_db_{db_v}.db' # database filename

# Set core paths
analysis_dir = pathlib.Path('/home/mf019/longread_pangenome/expanded_dataset_analysis')
assemblies_dir = analysis_dir.joinpath('assemblies', f'dataset_{ds_v}')
ref_dir = analysis_dir.joinpath('ref')
db = ref_dir.joinpath('asm_db', db_n) # database path definition!
roary_results = analysis_dir.joinpath('results', pg_v, pg_t , f'roary_{pg_t}/')

# get input assemblies
list_of_assemblies = list(assemblies_dir.glob('*/*.gbff')) # Bakta genbanks for our assemblies!

# Plasmid Calls
plasmid_calls_csv = analysis_dir.joinpath('genotyping', 'replicons', 'calls_v10', 'best_hits_1000bp_v10.2.csv') # replicon ids parsed from output of plasmid_caller.py :) # MUST USE ALL HITS TO FULLY PARSE EACH ASSEMBLY
blast_parsing_pkl = ref_dir.joinpath('replicons', 'wp', 'wp_v2_info.pkl')
# define roary output paths
clustered_proteins_file = roary_results.joinpath('clustered_proteins')
roary_gene_presence_absence_v4 = roary_results.joinpath('gene_presence_absence.Rtab')

# Define file names for pickles
contig2plasmid_pickle = ref_dir.joinpath('asm_db', f'dataset_{ds_v}_contig2plasmid_dict_1kb_{pc_v}.pkl') # dict to map from contig_id to plasmid_id
assemblies_genbank_pickle = ref_dir.joinpath('asm_db', f'dataset_{ds_v}_assembly_genbank_dict_{pc_v}.pkl') # dict with all of our parsed genbanks within.

In [12]:
# let's parse our plasmid parsing dictionary using ole (somewhat)reliable.
# REMINDER: Structure is: {NCBI_ID : {'ID', 'name', 'length', 'strain'}}
# was renamed FROM: 'blast_parsing_dict.pickle' TO: 'wp_v2_info.pkl'
# {{TODO: RENAME FILE IN PLASMID CALLER CONTAINER/UPDATE SCRIPTS!}} <- done for v6

with open(blast_parsing_pkl, 'rb') as infile:
    acc2name = pickle.load(infile)

In [13]:
###########################################
# Parse best_hits for contig2plasmid dict #
###########################################
# Let's pull the plasmid calls in and  make us a dataframe
contig2plasmid_df = pandas.read_csv(plasmid_calls_csv, delimiter=',')
# subset that to just what we care about (for now)
contig2plasmid_df = contig2plasmid_df[[ 'assembly_id', 'contig_id', 'plasmid_name' ]] # dropped 'completeness'
contig2plasmid_df['contig_header'] = contig2plasmid_df['contig_id']
contig2plasmid_df['contig_id'] = contig2plasmid_df['contig_id'].apply(lambda x: x.split(' ')[0])
# and now let's flip it to a dict so we can use key lookups to simply rename them.
contig2plasmid = defaultdict(lambda: defaultdict(dict))
for _, row in contig2plasmid_df.iterrows():
    contig2plasmid[row['assembly_id']][row['contig_id']] = {
        'contig_header' : row ['contig_header'],
        'plasmid_name': row['plasmid_name'],
    }
contig2plasmid = {name: dict(contigs) for name, contigs in contig2plasmid.items()}

In [14]:
####################
# Parse assemblies #
####################
all_assembly_genes = []
total_feats = 0
for assembly in tqdm(list_of_assemblies, desc="Parsing assembly genbanks!"):
    current_genes = []
    assembly_id = '.'.join(os.path.basename(assembly).split('.')[0:-1])
    print(assembly_id)
    current_genes = parse_genbank(assembly, assembly_id, acc2name, contig2plasmid)
    num_feats = len(current_genes)
    total_feats += num_feats
    all_assembly_genes.append(current_genes)
all_assembly_genes_flat = [feature for features in all_assembly_genes for feature in features]
all_feats = len(all_assembly_genes_flat)
print("Making sure features match between input and final object")
print(f'list: {all_feats}')
print(f'actual: {total_feats}')
pickle_dict(all_assembly_genes_flat, assemblies_genbank_pickle)

Parsing assembly genbanks!:   0%|          | 0/82 [00:00<?, ?it/s]

GCF_040790765.1_ASM4079076v1_genomic
B418P
GCF_002151465.1_ASM215146v1_genomic
URI87H
URI34H
URI88H
URI33H
UCT110H
URI39H
URI91H
UCT35H
ESI361H
UWI247P
URI120H
URI107H
GCF_002151505.1_ASM215150v1_genomic
UWI263P
GCF_040790795.1_ASM4079079v1_genomic
GCF_040790785.1_ASM4079078v1_genomic
URI89H
URI42H
URI44H
UCT109H
URI40H
B500P
URI117H
URI47H
URI86H
GCF_040790715.1_ASM4079071v1_genomic
GCF_040790755.1_ASM4079075v1_genomic
URI36H
UNY208P
GCF_040790745.1_ASM4079074v1_genomic
ESI26H
UCT31H
GCF_040790805.1_ASM4079080v1_genomic
ESI403H
URI56H
XYZ459H
UCT30H
GCF_040790735.1_ASM4079073v1_genomic
GCF_040819585.1_PFhe_I_PB_Ill_cons_genomic
URI103H
UCT29H
UNY1128P
URI112H
UNY1032P
UWI248P
UNY203P
UCT96H
GCF_019134655.1_ASM1913465v1_genomic
ESI425H
UCT32H
UNY990P
UNY193P
UCT113H
URI93H
GCF_024662155.1_ASM2466215v1_genomic
UNY169P
UWI283P
UNY1090P
UNY1083P
B331P
URI102H
URI41H
UNY172P
GCF_040790775.1_ASM4079077v1_genomic
UNY149P
GCF_024662195.1_ASM2466219v1_genomic
UCT92H
URI118H
UCT50H
URI101H
UNY1

In [15]:
#create_table(db, "assemblies")
#populate_table(db,"assemblies", all_assembly_genes_flat)
create_and_populate_database(all_assembly_genes_flat, db)

In [16]:
get_tables(db)
get_columns(db, 'annotations')
table_head(db, 'annotations')

Unnamed: 0,feature_type,gene,locus_tag,note,protein_id,product,sequence,replicon,replicon_name,start,...,UniRef,UniParc,RFAM,GO,BlastRules,KEGG,PFAM,EC,COG,NCBIProtein
0,gene,unknown,BFHFJM_00001,unknown,unknown,unknown,ATATATAATTTAATAGTATATATATATAATTTAATAGATAAAAAAT...,NZ_CP161107.1,chromosome,2,...,,,,,,,,,,
1,CDS,unknown,BFHFJM_00001,(5' truncated),gnl|Bakta|BFHFJM_00001,hypothetical protein,ATATATAATTTAATAGTATATATATATAATTTAATAGATAAAAAAT...,NZ_CP161107.1,chromosome,2,...,,,,,,,,,,
2,gene,unknown,BFHFJM_00002,unknown,unknown,unknown,ATGAAATATAGTGCTATTTTATTAATATGTAGCGTTAATTTATTTT...,NZ_CP161107.1,chromosome,102,...,,,,,,,,,,
3,CDS,unknown,BFHFJM_00002,unknown,gnl|Bakta|BFHFJM_00002,Uncharacterized protein BB_0001,ATGAAATATAGTGCTATTTTATTAATATGTAGCGTTAATTTATTTT...,NZ_CP161107.1,chromosome,102,...,UniRef90_O51035,,,,,,,,,
4,gene,unknown,BFHFJM_00003,unknown,unknown,unknown,ATGGACTTTTTAAAAACCTTTTCTTTTTTGTTTTTTAGCTTTTTTT...,NZ_CP161107.1,chromosome,782,...,,,,,,,,,,
5,CDS,unknown,BFHFJM_00003,unknown,gnl|Bakta|BFHFJM_00003,Glycoside hydrolase family 3 N-terminal domain...,ATGGACTTTTTAAAAACCTTTTCTTTTTTGTTTTTTAGCTTTTTTT...,NZ_CP161107.1,chromosome,782,...,UniRef90_O54536,UPI000D02875C,,,,,,,,
6,gene,unknown,BFHFJM_00004,unknown,unknown,unknown,ATGAATGTTAAAGTTGATAAAATTTTTTCTGAAATGATACTTGAAA...,NZ_CP161107.1,chromosome,1798,...,,,,,,,,,,
7,CDS,unknown,BFHFJM_00004,unknown,gnl|Bakta|BFHFJM_00004,UTP--glucose-1-phosphate uridylyltransferase,ATGAATGTTAAAGTTGATAAAATTTTTTCTGAAATGATACTTGAAA...,NZ_CP161107.1,chromosome,1798,...,UniRef90_A0A0H3C280,UPI00016C372B,,,,,,,,
8,gene,manB,BFHFJM_00005,unknown,unknown,unknown,ATGCTTAAACAATATTCACTTAACATGAAAAATTTTAAAAAAGCTT...,NZ_CP161107.1,chromosome,3410,...,,,,,,,,,,
9,CDS,manB,BFHFJM_00005,unknown,gnl|Bakta|BFHFJM_00005,Phosphomannomutase,ATGCTTAAACAATATTCACTTAACATGAAAAATTTTAAAAAAGCTT...,NZ_CP161107.1,chromosome,3410,...,UniRef90_O51892,UPI00016C4D34,,,,,,,G,
