In [171]:
import sqlite3
from Bio import SeqIO, SeqFeature, SeqRecord
from collections import defaultdict
import pickle
import glob
import os
import pandas

In [172]:
with open('../../plasmid_id/blast/blast_parsing_dict.pickle', 'rb') as infile:
    id_parsing_dict = pickle.load(infile)
print(id_parsing_dict)

{'CP124100.1': {'ID': 'CP124100.1', 'name': 'chromosome', 'length': 910422, 'strain': 'NE_5261'}, 'CP124098.1': {'ID': 'CP124098.1', 'name': 'lp17', 'length': 17222, 'strain': 'NE_5261'}, 'CP124097.1': {'ID': 'CP124097.1', 'name': 'cp26', 'length': 26454, 'strain': 'NE_5261'}, 'CP124099.1': {'ID': 'CP124099.1', 'name': 'lp54', 'length': 53730, 'strain': 'NE_5261'}, 'CP019844.1': {'ID': 'CP019844.1', 'name': 'chromosome', 'length': 909921, 'strain': 'PAli'}, 'CP019851.1': {'ID': 'CP019851.1', 'name': 'lp17', 'length': 16820, 'strain': 'PAli'}, 'CP019845.1': {'ID': 'CP019845.1', 'name': 'cp26', 'length': 26498, 'strain': 'PAli'}, 'CP019846.1': {'ID': 'CP019846.1', 'name': 'cp32-1', 'length': 30726, 'strain': 'PAli'}, 'CP019847.1': {'ID': 'CP019847.1', 'name': 'cp32-3', 'length': 30418, 'strain': 'PAli'}, 'CP019848.1': {'ID': 'CP019848.1', 'name': 'cp32-4', 'length': 30299, 'strain': 'PAli'}, 'CP019849.1': {'ID': 'CP019849.1', 'name': 'cp32-5', 'length': 30674, 'strain': 'PAli'}, 'CP01985

In [173]:

plasmid_calls_csv = '/Users/mf019/bioinformatics/longread_pangenome/plasmid_id/output/plasmid_calls_V6/best_matches_v6_1000kb.tsv'
contig2plasmid_df = pandas.read_csv(plasmid_calls_csv, delimiter='\t')
contig2plasmid_df = contig2plasmid_df[['name', 'contig', 'best_hit', 'completeness']]
contig2plasmid_df['new_contig'] = contig2plasmid_df.groupby('name').cumcount() + 1
contig2plasmid_df['new_contig'] = contig2plasmid_df['new_contig'].apply(lambda x: f'contig_{x}')
contig2plasmid_df


Unnamed: 0,name,contig,best_hit,completeness,new_contig
0,URI47,NODE_1_length_463767_cov_77.611941,chromosome,incomplete,contig_1
1,URI47,NODE_2_length_221463_cov_78.896272,chromosome,incomplete,contig_2
2,URI47,NODE_3_length_213568_cov_76.158203,chromosome,incomplete,contig_3
3,URI47,NODE_4_length_53880_cov_109.176347,lp54,presumed,contig_4
4,URI47,NODE_5_length_39870_cov_168.736612,lp36,presumed,contig_5
...,...,...,...,...,...
16824,URI93H,contig000025,,incomplete,contig_25
16825,URI93H,contig000026,,incomplete,contig_26
16826,URI93H,contig000027,,incomplete,contig_27
16827,URI93H,contig000028,,incomplete,contig_28


In [174]:

contig2plasmid_dict = defaultdict(lambda: defaultdict(dict))
for _, row in contig2plasmid_df.iterrows():
    contig2plasmid_dict[row['name']][row['new_contig']] = {
        'contig' : row ['contig'],
        'best_hit': row['best_hit'],
        'completeness': row['completeness']
    }
    contig2plasmid_dict[row['name']][row['contig']] = {
        'contig' : row ['new_contig'],
        'best_hit': row['best_hit'],
        'completeness': row['completeness']
    }
contig2plasmid_dict = {name: dict(contigs) for name, contigs in contig2plasmid_dict.items()}

In [175]:
print(contig2plasmid_df)

         name                              contig    best_hit completeness  \
0       URI47  NODE_1_length_463767_cov_77.611941  chromosome   incomplete   
1       URI47  NODE_2_length_221463_cov_78.896272  chromosome   incomplete   
2       URI47  NODE_3_length_213568_cov_76.158203  chromosome   incomplete   
3       URI47  NODE_4_length_53880_cov_109.176347        lp54     presumed   
4       URI47  NODE_5_length_39870_cov_168.736612        lp36     presumed   
...       ...                                 ...         ...          ...   
16824  URI93H                        contig000025         NaN   incomplete   
16825  URI93H                        contig000026         NaN   incomplete   
16826  URI93H                        contig000027         NaN   incomplete   
16827  URI93H                        contig000028         NaN   incomplete   
16828  URI93H                        contig000029         NaN   incomplete   

      new_contig  
0       contig_1  
1       contig_2  
2     

In [176]:
# Lets set up our parsing function. This is just the barebones function. I'll expand it with additional helpers
# once I reach that point :)
# primarily to include the plasmid name for the replicon.
# and to
def parse_genbank(file_path, assembly_id):
    set_of_keys = set()
    ref_flag = True if assembly_id == "REF" else False
    assemblies = SeqIO.parse(file_path, "genbank")
    genes = []
    for record in assemblies:
        if ref_flag == False and assembly_id not in contig2plasmid_dict.keys():
            break
        for feature in record.features:
            if feature.type in ["gene", "CDS", "mRNA", "tRNA", "rRNA", "signal_peptide"]:
                if ref_flag:
                    assembly_id = id_parsing_dict[record.id]['strain']
                    replicon_name = id_parsing_dict[record.id]['name']
                else:
                    replicon_name = contig2plasmid_dict[assembly_id][record.id]['best_hit']

                locus_tag = feature.qualifiers.get("locus_tag", ['unknown'])[0]
                db_xrefs = feature.qualifiers.get("db_xref", [])
                db_xref_dict = {xref.split(":")[0]: xref.split(":")[1] for xref in db_xrefs}

                gene_info = {
                    "feature_type": feature.type,
                    "gene": feature.qualifiers.get("gene", ["unknown"])[0],
                    "locus_tag": locus_tag,
                    "note": feature.qualifiers.get("note", ["unknown"])[0],
                    "protein_id": feature.qualifiers.get("protein_id", ["unknown"])[0],
                    "product": feature.qualifiers.get("product", ["unknown"])[0],
                    "sequence": str(feature.extract(record.seq)),
                    "replicon": record.id,
                    "replicon_name": replicon_name,
                    "start": feature.location.start,
                    "end": feature.location.end,
                    "strand": feature.location.strand,
                    "assembly": assembly_id,
                    "translation": str(feature.qualifiers.get('translation',[''])),
                    "inference": feature.qualifiers.get("inference",[""])[0],
                    "transl_table": feature.qualifiers.get("transl_table",[""])[0],
                    **db_xref_dict  # Unpack db_xref_dict to include db_xrefs as columns
                }
                genes.append(gene_info)
                set_of_keys.update(gene_info.keys())

    # fill out empty dicts with None!
    for gene in genes:
        for key in set_of_keys:
            if key not in gene:
                gene[key] = None

    return genes

def create_and_populate_database(genes, db_name):
    df = pandas.DataFrame(genes)
    conn = sqlite3.connect(db_name)
    df.to_sql('annotations', conn, if_exists='replace', index=False)
    conn.close()

In [230]:
#def create_table(db_name, table_name):
#    conn = sqlite3.connect(db_name)
#    cursor = conn.cursor()
#    cursor.execute(f'''CREATE TABLE IF NOT EXISTS {table_name} (
#                      id INTEGER PRIMARY KEY AUTOINCREMENT,
#                      feature_type TEXT,
#                      gene TEXT,
#                      locus_tag TEXT,
#                      note TEXT,
#                      protein_id TEXT,
#                      product TEXT,
#                      sequence TEXT,
#                      replicon TEXT,
#                      replicon_name TEXT,
#                      start INTEGER,
#                      end INTEGER,
#                      strand INTEGER,
#                      db_xrefs TEXT,
#                      assembly TEXT,
#                      translation TEXT,
#                      inference TEXT,
#                      transl_table INT)''')
#    conn.commit()
#    conn.close()
#
#def populate_table(db_name, table_name, genes):
#    conn = sqlite3.connect(db_name)
#    cursor = conn.cursor()
#    for gene in genes:
#        cursor.execute(f'''INSERT INTO {table_name}
#                          (feature_type, gene, locus_tag, note, protein_id, product, sequence, replicon, replicon_name, start, end, strand, db_xrefs, assembly, translation, inference, transl_table)
#                          VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)''',
#                       (gene["feature_type"], gene["gene"], gene["locus_tag"], gene["note"],
#                        gene["protein_id"], gene["product"], gene["sequence"], gene["replicon"],
#                        gene["replicon_name"], gene["start"], gene["end"], gene["strand"], gene["db_xrefs"],
#                        gene["assembly"], gene["translation"], gene["inference"], gene["transl_table"]))
#    conn.commit()
#    conn.close()

def get_tables(db_name):
    conn = sqlite3.connect(db_name)
    tables = pandas.read_sql_query("SELECT name FROM sqlite_master WHERE type='table';", conn)
    conn.close()
    return tables

def get_columns(db_name, table):
    # Describe the structure of the 'genes' table
    conn = sqlite3.connect(db_name)
    columns = pandas.read_sql_query(f"PRAGMA table_info({table});", conn)
    conn.close()
    return columns

def table_head(db_name, table):
    conn = sqlite3.connect(db_name)
    table_head = pandas.read_sql_query(f"SELECT * FROM {table} LIMIT 10;", conn)
    conn.close()
    return table_head

def get_all_of_gene(db_name, table, gene):
    conn = sqlite3.connect(db_name)
    all_genes = pandas.read_sql_query(f"SELECT * FROM {table} WHERE feature_type = 'CDS' AND gene='{gene}'", conn)
    conn.close()
    return all_genes

def get_all_of_product(db_name, table, product):
    conn = sqlite3.connect(db_name)
    all_products = pandas.read_sql_query(f"SELECT * FROM {table} WHERE feature_type = 'CDS' AND product='{product}'", conn)
    conn.close()
    return all_products

def dump_table_to_df(db_name, table):
    conn = sqlite3.connect(db_name)
    dataframe = pandas.read_sql_query(f"SELECT * FROM {table};", conn)
    conn.close()
    return dataframe

def get_gene_from_baktaID(db_or_conn, table, bakta_id):
    # is arg1 a string or a connection?
    if isinstance(db_or_conn, str):
        # it is string, thus db path. Create a new connection
        conn = sqlite3.connect(db_or_conn)
        close_conn = True
    else:
        # Ain't a string so must be connection. Use the existing connection
        conn = db_or_conn
        close_conn = False
    all_genes = pandas.read_sql_query(f"SELECT * FROM {table} WHERE locus_tag='{bakta_id}'", conn)
    # Pack it in pack it out!
    if close_conn:
        conn.close()
    return all_genes

In [178]:
list_of_assemblies = glob.glob('/Users/mf019/bioinformatics/longread_pangenome/assemblies/paired_assemblies/paired_only/*/annotation/*.gbff')
print(len(list_of_assemblies))
bakta_reference_genbank = '/Users/mf019/bioinformatics/longread_pangenome/ref/all_plasmids_bbss/Bbss.gbff'
list_of_ncbi_ref_genbanks = glob.glob('/Users/mf019/bioinformatics/longread_pangenome/plasmid_id/plasmid_seqs/raw_gbs/*.gb')
print(len(list_of_ncbi_ref_genbanks))

105
317


In [179]:
all_assembly_genes = []
total_feats = 0
for assembly in list_of_assemblies:
    current_genes = []
    assembly_id = os.path.basename(assembly).split('.')[0]
    print(assembly_id)
    current_genes = parse_genbank(assembly, assembly_id)
    num_feats = len(current_genes)
    #print(assembly_id)
    #print(num_feats)
    total_feats += num_feats
    all_assembly_genes.append(current_genes)
all_assembly_genes_flat = [feature for features in all_assembly_genes for feature in features]
all_feats = len(all_assembly_genes_flat)
print(f'list: {all_feats}')
print(f'actual: {total_feats}')

ESI26
URI112
UCT110
URI86
UCT29
URI44
URI87
URI91
UCT32
UWI263
URI48
URI33
URI118
URI102
URI34


Exception ignored in: <function tqdm.__del__ at 0x320110b80>
Traceback (most recent call last):
  File "/opt/homebrew/lib/python3.11/site-packages/tqdm/std.py", line 1147, in __del__
    def __del__(self):

KeyboardInterrupt: 


UNY172
UWI248
UCT35
UNY149
URI103
UCT96
URI42
URI39
UNY169
URI120
URI41
UNY193
URI36
URI101
URI117
UNY203
UWI247
URI40
URI56
UCT31
URI88
UNY208
URI47
UWI283
UCT50
URI93
UCT113
URI46
UCT92
UCT109
URI107
URI111
URI89
UCT30
URI101H
ESI26H
URI118H
URI89H
URI33H
UNY1032P
URI103H
URI107H
URI56H
UNY203P
URI93H
URI102H
URI91H
UNY1090P
URI88H
URI48H
UNY1128P
URI34H
URI120H
UWI248P
URI36H
UNY1083P
UCT30H
URI42H
UNY193P
UCT29H
UCT32H
URI40H
URI111H
UCT113H
URI44H
UNY1085P
UNY208P
URI86H
URI117H
URI46H
UCT92H
UNY1038P
UNY149P
URI39H
URI41H
UCT31H
UWI263P
URI112H
UWI247P
UCT50H
UCT109H
UNY990P
UWI283P
UCT35H
URI47H
UCT110H
UNY172P
URI87H
UNY169P
UCT96H
list: 268857
actual: 268857


In [None]:
bakta_ref_genes = parse_genbank(bakta_reference_genbank, "REF")
print(len(bakta_ref_genes))

0


In [None]:
all_ref_ncbi_genes = []
total_feats = 0
for plasmid in list_of_ncbi_ref_genbanks:
    current_genes = []
    current_genes = parse_genbank(plasmid, "REF")
    num_feats = len(current_genes)
    total_feats += num_feats
    all_ref_ncbi_genes.append(current_genes)
all_ref_ncbi_genes_flat = [feature for features in all_ref_ncbi_genes for feature in features]
all_feats = len(all_ref_ncbi_genes_flat)
print(f'list: {all_feats}')
print(f'actual: {total_feats}')

list: 58093
actual: 58093


In [None]:
db = "newgenes.db"
create_table(db, "assemblies")
populate_table(db,"assemblies", all_assembly_genes_flat)

create_table(db, "reference")
populate_table(db,"reference", bakta_ref_genes)

create_table(db, "ncbi_gb")
populate_table(db, "ncbi_gb", all_ref_ncbi_genes_flat)
#

KeyError: 'dbxrefs'

In [None]:
new_db = "newannotations.db"
create_and_populate_database(all_assembly_genes_flat, new_db)

In [None]:
# List all tables
print(get_tables(new_db))

          name
0  annotations


In [None]:
# Describe the structure of the 'genes' table
print(get_columns(new_db, "annotations"))
# Query some data from the 'genes' table
print(table_head(new_db, "annotations"))

    cid           name     type  notnull dflt_value  pk
0     0   feature_type     TEXT        0       None   0
1     1           gene     TEXT        0       None   0
2     2      locus_tag     TEXT        0       None   0
3     3           note     TEXT        0       None   0
4     4     protein_id     TEXT        0       None   0
5     5        product     TEXT        0       None   0
6     6       sequence     TEXT        0       None   0
7     7       replicon     TEXT        0       None   0
8     8  replicon_name     TEXT        0       None   0
9     9          start  INTEGER        0       None   0
10   10            end  INTEGER        0       None   0
11   11         strand  INTEGER        0       None   0
12   12       assembly     TEXT        0       None   0
13   13    translation     TEXT        0       None   0
14   14      inference     TEXT        0       None   0
15   15   transl_table     TEXT        0       None   0
16   16     BlastRules     TEXT        0       N

In [None]:
# Describe the structure of the 'reference' table
print(get_columns(db, "reference"))
# Query some data from the 'reference' table
print(table_head(db, "reference"))


    cid           name     type  notnull dflt_value  pk
0     0             id  INTEGER        0       None   1
1     1   feature_type     TEXT        0       None   0
2     2           gene     TEXT        0       None   0
3     3      locus_tag     TEXT        0       None   0
4     4           note     TEXT        0       None   0
5     5     protein_id     TEXT        0       None   0
6     6        product     TEXT        0       None   0
7     7       sequence     TEXT        0       None   0
8     8       replicon     TEXT        0       None   0
9     9  replicon_name     TEXT        0       None   0
10   10          start  INTEGER        0       None   0
11   11            end  INTEGER        0       None   0
12   12         strand  INTEGER        0       None   0
13   13        dbxrefs     TEXT        0       None   0
14   14       assembly     TEXT        0       None   0
15   15    translation     TEXT        0       None   0
16   16      inference     TEXT        0       N

In [None]:
# Describe the structure of the 'reference' table
print(get_columns(db, "ncbi_gb"))
# Query some data from the 'reference' table
print(table_head(db, "ncbi_gb"))

    cid           name     type  notnull dflt_value  pk
0     0             id  INTEGER        0       None   1
1     1   feature_type     TEXT        0       None   0
2     2           gene     TEXT        0       None   0
3     3      locus_tag     TEXT        0       None   0
4     4           note     TEXT        0       None   0
5     5     protein_id     TEXT        0       None   0
6     6        product     TEXT        0       None   0
7     7       sequence     TEXT        0       None   0
8     8       replicon     TEXT        0       None   0
9     9  replicon_name     TEXT        0       None   0
10   10          start  INTEGER        0       None   0
11   11            end  INTEGER        0       None   0
12   12         strand  INTEGER        0       None   0
13   13        dbxrefs     TEXT        0       None   0
14   14       assembly     TEXT        0       None   0
15   15    translation     TEXT        0       None   0
16   16      inference     TEXT        0       N

In [None]:
specific_genes = ['ospC', 'cspZ', 'cspA']
#all_cspA_df = get_all_of_gene(db, "assemblies", "cspA") # IT IS NOT ANNOTATED AS CSPA SO THANKS REFSEQ!
all_cspa_df = get_all_of_product(db, "assemblies", "complement regulator-acquiring protein")
all_cspZ_df = get_all_of_gene(db, "assemblies", "cspZ")
all_ospC_df = get_all_of_gene(db, "assemblies", "ospC")

In [None]:
print(all_cspA_df)
print(all_cspZ_df)
print(all_ospC_df)

Empty DataFrame
Columns: [id, feature_type, gene, locus_tag, note, protein_id, product, sequence, replicon, replicon_name, start, end, strand, dbxrefs, assembly, translation, inference, transl_table]
Index: []
         id feature_type  gene     locus_tag     note              protein_id  \
0     12699          CDS  cspZ  DNBOAP_05820  unknown  gnl|Bakta|DNBOAP_05820   
1     17993          CDS  cspZ  NCOCKI_05785  unknown  gnl|Bakta|NCOCKI_05785   
2     20630          CDS  cspZ  DMHAMM_05695  unknown  gnl|Bakta|DMHAMM_05695   
3     25598          CDS  cspZ  EMEHFM_04770  unknown  gnl|Bakta|EMEHFM_04770   
4     36062          CDS  cspZ  BOHBDJ_05645  unknown  gnl|Bakta|BOHBDJ_05645   
..      ...          ...   ...           ...      ...                     ...   
124  829355          CDS  cspZ  FMOGGM_06175  unknown  gnl|Bakta|FMOGGM_06175   
125  834861          CDS  cspZ  PCFIDF_05865  unknown  gnl|Bakta|PCFIDF_05865   
126  840673          CDS  cspZ  AIHKLI_07085  unknown  gnl|Ba

In [None]:
conn = sqlite3.connect(db)
all_genes = pandas.read_sql_query(f"SELECT * FROM ncbi_gb WHERE feature_type = 'CDS' AND UPPER(gene) LIKE 'cspA'", conn)
conn.close()

print(all_genes)

        id feature_type  gene    locus_tag     note  protein_id  \
0     2938          CDS  cspA   BbuN40_A68  unknown  ACS94810.1   
1     7979          CDS  cspA  Bbu156a_A60  unknown  ACL33808.1   
2    34597          CDS  cspA       BB_A68  unknown  AAC66286.1   
3    36640          CDS  cspA   BbuZS7_A59  unknown  ACK74237.1   
4    61031          CDS  cspA   BbuN40_A68  unknown  ACS94810.1   
5    66072          CDS  cspA  Bbu156a_A60  unknown  ACL33808.1   
6    92690          CDS  cspA       BB_A68  unknown  AAC66286.1   
7    94733          CDS  cspA   BbuZS7_A59  unknown  ACK74237.1   
8   119124          CDS  cspA   BbuN40_A68  unknown  ACS94810.1   
9   124165          CDS  cspA  Bbu156a_A60  unknown  ACL33808.1   
10  150783          CDS  cspA       BB_A68  unknown  AAC66286.1   
11  152826          CDS  cspA   BbuZS7_A59  unknown  ACK74237.1   

                                              product  \
0   complement regulator acquiring protein 1; puta...   
1   complement

In [None]:
print(get_gene_from_baktaID(db, 'assemblies', 'CKDGNL_04300'))

       id feature_type     gene     locus_tag     note  \
0    1716         gene  unknown  CKDGNL_04300  unknown   
1    1717          CDS  unknown  CKDGNL_04300  unknown   
2  290342         gene  unknown  CKDGNL_04300  unknown   
3  290343          CDS  unknown  CKDGNL_04300  unknown   
4  578968         gene  unknown  CKDGNL_04300  unknown   
5  578969          CDS  unknown  CKDGNL_04300  unknown   

               protein_id                                 product  \
0                 unknown                                 unknown   
1  gnl|Bakta|CKDGNL_04300  complement regulator-acquiring protein   
2                 unknown                                 unknown   
3  gnl|Bakta|CKDGNL_04300  complement regulator-acquiring protein   
4                 unknown                                 unknown   
5  gnl|Bakta|CKDGNL_04300  complement regulator-acquiring protein   

                                            sequence  replicon replicon_name  \
0  TTGAAAAATAATAAATTAATTGCAAT

In [181]:
# SQL should never be written more than once and here is my function.
def search_terms_in_db(connection, table, term, columns):
    # Construct the SQL query
    query = f"""
    SELECT *
    FROM {table}
    WHERE {' OR '.join([f"{col} = ?" for col in columns])}
    """
    # try to find the term in each column.
    try:
        result_df = pandas.read_sql_query(query, connection, params=[term] * len(columns))
        #print(f"Query for term '{term}' succeeded.")
        return result_df # return our dataframe!
    except Exception as e:
        print(f"Query for term '{term}' failed with error: {e}")
        return pandas.DataFrame() # Empty on error.

In [187]:
pangenome_df  = pandas.read_csv('/Users/mf019/bioinformatics/longread_pangenome/longread_analysis/v4/longread_paired_v4/gene_presence_absence.Rtab', delimiter='\t')
clustered_proteins_file = '/Users/mf019/bioinformatics/longread_pangenome/longread_analysis/v4/longread_paired_v4/clustered_proteins'

In [183]:
pangene_groups = pangenome_df['Gene'].to_list()

In [184]:
from tqdm.notebook import tqdm

list_of_unresolved_groups = []
group_dfs = []
searched_groups = []
columns_to_check = ['product', 'locus_tag', 'gene']

# Establish a connection to the database
connection = sqlite3.connect(new_db)
cursor = connection.cursor()

# ok lets start this loop now.
for group in tqdm(pangene_groups, desc="Searching for those groups!"):
    if "group" in group: # pull out the obvious groups that won't be found.
        list_of_unresolved_groups.append(group) # put em in a list that we're gonna do differently! using protein_clusters above!
    else:
        group_df = search_terms_in_db(connection, "annotations", group, columns_to_check)
        group_dfs.append(group_df)
        searched_groups.append(group)

# Close the connection
connection.close()
print(list_of_unresolved_groups)

Searching for those groups!:   0%|          | 0/3921 [00:00<?, ?it/s]

['group_1108', 'group_1198', 'group_1199', 'group_1206', 'group_1208', 'group_1217', 'group_1228', 'group_1253', 'group_1299', 'group_1382', 'group_1387', 'group_1390', 'group_1391', 'group_1392', 'group_1406', 'group_1407', 'group_1408', 'group_1415', 'group_1418', 'group_1434', 'group_1438', 'group_1439', 'group_1471', 'group_1472', 'group_1482', 'group_2521', 'group_2532', 'group_2541', 'group_2544', 'group_2555', 'group_2556', 'group_2557', 'group_2558', 'group_2596', 'group_2598', 'group_2602', 'group_2667', 'group_2668', 'group_2693', 'group_2701', 'group_2711', 'group_2773', 'group_2778', 'group_2782', 'group_2784', 'group_2810', 'group_2811', 'group_2854', 'group_2894', 'group_2909', 'group_2911', 'group_2935', 'group_2952', 'group_2968', 'group_2990', 'group_2992', 'group_3022', 'group_3075', 'group_3099', 'group_3110', 'group_3116', 'group_3169', 'group_3172', 'group_3179', 'group_3181', 'group_3222', 'group_3224', 'group_3253', 'group_3263', 'group_3265', 'group_3281', 'grou

In [221]:
groups_to_genes = defaultdict(list)
with open(clustered_proteins_file, 'r') as infile:
    for line in infile.readlines():
        splitline = line.split('\t')
        group_id = splitline[0].split(':')[0]
        gene1 = [splitline[0].split(':')[1].strip(" ").strip('\n')]
        rest_of_genes = [gene.strip('\n') for gene in splitline[1::]] if len(splitline) > 1 else []
        groups_to_genes[group_id] = gene1
        groups_to_genes[group_id].extend(rest_of_genes)
print(len(groups_to_genes.keys()))


3921


In [231]:
# Establish a connection to the database
connection = sqlite3.connect(new_db)
cursor = connection.cursor()

all_groups_df = []
# ok lets start this loop now.
for group in tqdm(groups_to_genes.items(), desc="Searching for those groups!"):
    group_df = get_gene_from_baktaID(connection, "annotations", item)
    group_dfs.append(group_df)
    # Concatenate the DataFrames for this group
    if group_dfs:
        concatenated_group_df = pandas.concat(group_dfs, ignore_index=True)
        all_groups_df.append(concatenated_group_df)

# Close the connection
connection.close()

Searching for those groups!:   0%|          | 0/3921 [00:00<?, ?it/s]

: 