In [1]:
import os
import pandas as pd
import getpass
import sqlalchemy
from sqlmodel import SQLModel
import tqdm

In [2]:
# Database URL: taken from the LIMBLESS_DB_URL environment variable when it is set,
# so real credentials never have to be written into the notebook. The fallback is
# the original local development URL and should only be relied on for a throwaway
# local database.
db_url = os.environ.get(
    "LIMBLESS_DB_URL",
    "postgresql://postgres:password@127.0.0.1:5432/limbless_db",
)

# NOTE: despite its name, `conn` is a SQLAlchemy Engine, not a single connection.
conn = sqlalchemy.create_engine(db_url)
conn

Engine(postgresql://postgres:***@127.0.0.1:5432/limbless_db)

In [3]:
q = """
SELECT * FROM pg_extension WHERE extname = 'pg_trgm';
"""

# Make sure the trigram extension is installed. IF NOT EXISTS lets the database
# decide atomically, so the cell can be re-run without a separate lookup first.
# An explicit transaction plus text() is used because passing a raw string to
# Engine.execute() is deprecated in SQLAlchemy 1.4 and removed in 2.0.
with conn.begin() as connection:
    connection.execute(sqlalchemy.text('CREATE EXTENSION IF NOT EXISTS pg_trgm;'))

# Show the catalog row as confirmation that the extension is present.
pd.read_sql(q, conn)

Unnamed: 0,oid,extname,extowner,extnamespace,extrelocatable,extversion,extconfig,extcondition
0,16385,pg_trgm,10,2200,True,1.6,,


In [4]:
from limbless import db, models, categories

In [5]:
# Seed the index kit row with id 0 named 'empty'.
# ON CONFLICT DO NOTHING keeps the cell re-runnable: if the row is already there
# the statement is a no-op instead of a uniqueness error.
# An explicit transaction plus text() replaces the raw-string Engine.execute(),
# which is deprecated in SQLAlchemy 1.4 and removed in 2.0.
with conn.begin() as connection:
    connection.execute(sqlalchemy.text("""
INSERT INTO indexkit (id, name)
VALUES (0, 'empty')
ON CONFLICT DO NOTHING;
"""))

<sqlalchemy.engine.cursor.LegacyCursorResult at 0x7efd7c372c50>

In [6]:
# Seed the sequence index row with id 0: all text fields empty, attached to
# index kit 0 (which must already exist).
# NOTE(review): presumably a placeholder for entries without a real index - confirm.
# ON CONFLICT DO NOTHING keeps the cell re-runnable; an explicit transaction plus
# text() replaces the raw-string Engine.execute(), which is deprecated in
# SQLAlchemy 1.4 and removed in 2.0.
with conn.begin() as connection:
    connection.execute(sqlalchemy.text("""
INSERT INTO seqindex (id, sequence, type, adapter, index_kit_id)
VALUES (0, '', '', '', 0)
ON CONFLICT DO NOTHING;
"""))

<sqlalchemy.engine.cursor.LegacyCursorResult at 0x7efd7a1dda90>

In [7]:
# Print every table registered on the SQLModel metadata, each followed by an
# indented list of its column names.
for table_name, table_def in SQLModel.metadata.tables.items():
    print(table_name)
    for col in table_def.columns:
        print(f" - {col.name}")

job
 - id
 - name
 - slurm_id
 - status
project
 - id
 - name
 - description
 - owner_id
libraryseqrequestlink
 - library_id
 - seq_request_id
librarysamplelink
 - library_id
 - sample_id
 - seq_index_id
runlibrarylink
 - run_id
 - library_id
indexkitlibrarytype
 - index_kit_id
 - library_type_id
sample
 - id
 - name
 - organism_id
 - project_id
 - owner_id
run
 - id
 - lane
 - r1_cycles
 - r2_cycles
 - i1_cycles
 - i2_cycles
 - experiment_id
user
 - id
 - first_name
 - last_name
 - email
 - password
 - role
experiment
 - timestamp
 - id
 - flowcell
library
 - id
 - name
 - library_type_id
 - index_kit_id
 - owner_id
organism
 - tax_id
 - scientific_name
 - common_name
 - category
seqindex
 - id
 - sequence
 - type
 - adapter
 - index_kit_id
indexkit
 - id
 - name
seqrequest
 - id
 - name
 - description
 - status
 - requestor_id
 - person_contact_id
 - billing_contact_id
 - bioinformatician_contact_id
contact
 - id
 - name
 - organization
 - email
 - phone
 - billing_code
 - address


In [8]:
# Ask for the password interactively so it never appears in the notebook source.
# The same value is reused by the other seed accounts created further down.
password = getpass.getpass("Password: ")

# Seed account with the ADMIN role.
admin_fields = dict(
    email="admin@email.com",
    first_name="CeMM",
    last_name="Admin",
    password=password,
    role=categories.UserRole.ADMIN,
)
admin = db.db_handler.create_user(**admin_fields)
admin

User(first_name='CeMM', last_name='Admin', password='$2b$12$5BEBfhH6Zv8W9vT96EnGvuZT.FT04rZt1KZ6Ht.IVzonUnyvOLg3e', id=1, email='admin@email.com', role=1, samples=[], projects=[], libraries=[], requests=[])

In [9]:
# Seed account with the CLIENT role; reuses the password entered earlier.
client_fields = dict(
    email="client@email.com",
    first_name="CeMM",
    last_name="Client",
    password=password,
    role=categories.UserRole.CLIENT,
)
client = db.db_handler.create_user(**client_fields)
client

User(first_name='CeMM', last_name='Client', password='$2b$12$tubDJavV43quhoKOca4x3epF15JB0vmlP9.W7/vBkJUfJlUWADgYi', id=2, email='client@email.com', role=4, samples=[], projects=[], libraries=[], requests=[])

In [10]:
# Seed account with the BIOINFORMATICIAN role; reuses the password entered earlier.
bio_fields = dict(
    email="bio@email.com",
    first_name="CeMM",
    last_name="Bioinformatician",
    password=password,
    role=categories.UserRole.BIOINFORMATICIAN,
)
bio = db.db_handler.create_user(**bio_fields)
bio

User(first_name='CeMM', last_name='Bioinformatician', password='$2b$12$7Z5U.UKN2Hl2vy/PQY7Lo.mIILo6UtNcQTU50mosn.T0Gk0C3u2B6', id=3, email='bio@email.com', role=2, samples=[], projects=[], libraries=[], requests=[])

In [11]:
# Seed account with the TECHNICIAN role; reuses the password entered earlier.
tech_fields = dict(
    email="tech@email.com",
    first_name="CeMM",
    last_name="Technician",
    password=password,
    role=categories.UserRole.TECHNICIAN,
)
tech = db.db_handler.create_user(**tech_fields)
tech

User(first_name='CeMM', last_name='Technician', password='$2b$12$6sm19hb04/qYDHXBZtOy8uGGNumxxfyoGO45zwUe01sPsfGEtU/.a', id=4, email='tech@email.com', role=3, samples=[], projects=[], libraries=[], requests=[])

In [12]:
# Table name -> text columns that each receive their own trigram search index.
# "organism" is deliberately not listed: its scientific_name and common_name
# columns are covered by a single combined index that is created separately.
label_search_columns: dict[str, list[str]] = dict(
    project=["name"],
    experiment=["flowcell"],
    library=["name"],
    seqindex=["sequence", "adapter"],
    indexkit=["name"],
)

In [13]:
# Create one GIN trigram index per (table, column) pair in label_search_columns.
# Each index is built on lower(<column>), so only queries that filter on the same
# lower(<column>) expression can make use of it.
# IF NOT EXISTS keeps the cell re-runnable. An explicit transaction plus text()
# replaces the raw-string Engine.execute(), which is deprecated in SQLAlchemy 1.4
# and removed in 2.0.
# The identifiers are interpolated from the fixed dictionary defined in this
# notebook, never from user input.
with conn.begin() as connection:
    for table, columns in label_search_columns.items():
        for column in columns:
            connection.execute(sqlalchemy.text(f"""
                CREATE INDEX IF NOT EXISTS
                    trgm_{table}_{column}_idx
                ON
                    {table}
                USING
                    gin (lower({column}) gin_trgm_ops);
            """))

In [14]:
# Organism gets a single two-column GIN trigram index covering lower(common_name)
# and lower(scientific_name), instead of one index per column.
# IF NOT EXISTS keeps the cell re-runnable. The statement has no placeholders, so
# it is a plain string rather than an f-string. An explicit transaction plus text()
# replaces the raw-string Engine.execute(), which is deprecated in SQLAlchemy 1.4
# and removed in 2.0.
with conn.begin() as connection:
    connection.execute(sqlalchemy.text("""
        CREATE INDEX IF NOT EXISTS
            trgm_organism_name_idx
        ON
            organism
        USING
            gin (lower(common_name) gin_trgm_ops, lower(scientific_name) gin_trgm_ops);
    """))

<sqlalchemy.engine.cursor.LegacyCursorResult at 0x7efd7a1dde10>

In [15]:
from limbless.index_kits import add_index_kits
# Hand the project's database handler to the index kit loader, pointing it at the
# local "data" directory.
index_kit_datadir = "data"
add_index_kits(db.db_handler, datadir=index_kit_datadir)

In [16]:
# Species table; the first CSV column (tax_id) becomes the row index.
species_csv = "data/species.csv"
df = pd.read_csv(species_csv, index_col=0)
df

Unnamed: 0_level_0,scientific name,genbank common name,common name,type
tax_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
7,Azorhizobium caulinodans,,,B
9,Buchnera aphidicola,,,B
11,Cellulomonas gilvus,,,B
14,Dictyoglomus thermophilum,,,B
17,Methylophilus methylotrophus,,,B
...,...,...,...,...
3071318,Vibrio phage XacF13,,,V
3071372,Oikopleura sp. OKI2018,,,E
3071373,Oikopleura sp. OSKA2016,,,E
3071397,Cyphostemma cornigerum,,,E


In [17]:
def _count_first_column(path):
    """Count occurrences of each value in the first column of a headerless table file.

    Returns a Series indexed by the distinct values, holding their row counts,
    sorted from most to least frequent.
    """
    first_column = pd.read_table(path, header=None, usecols=[0])
    return first_column.groupby(0).size().sort_values(ascending=False)


# One count table per domain file, loaded in the same order as before.
vstats = _count_first_column("data/stats/Viruses.ids")
bstats = _count_first_column("data/stats/Bacteria.ids")
astats = _count_first_column("data/stats/Archaea.ids")
estats = _count_first_column("data/stats/Eukaryota.ids")

In [18]:
# Stack the four per-domain count Series end to end (row-wise is pandas' default axis).
stats = pd.concat((vstats, bstats, astats, estats))

In [19]:
# Single-letter "type" codes from the species table and the organism category each
# one maps to; any code not listed here falls back to OTHER.
category_by_code = {
    "A": categories.OrganismCategory.ARCHAEA,
    "B": categories.OrganismCategory.BACTERIA,
    "E": categories.OrganismCategory.EUKARYOTA,
    "V": categories.OrganismCategory.VIRUSES,
    "U": categories.OrganismCategory.UNCLASSIFIED,
}

for tax_id, row in tqdm.tqdm(df.iterrows(), total=len(df)):
    # Skip taxa that have no entry in the stats, or whose count is below 2.
    if tax_id not in stats.index or stats[tax_id] < 2:
        continue

    organism_category = category_by_code.get(
        row["type"], categories.OrganismCategory.OTHER
    )

    # Prefer the GenBank common name, then the plain common name, otherwise none.
    genbank_name = row["genbank common name"]
    plain_name = row["common name"]
    if pd.notna(genbank_name):
        common_name = genbank_name
    elif pd.notna(plain_name):
        common_name = plain_name
    else:
        common_name = None

    scientific_name = row["scientific name"]

    assert scientific_name is not None
    assert tax_id is not None

    # Names longer than 128 characters are cut to their first 125 plus "...".
    if len(scientific_name) > 128:
        scientific_name = scientific_name[:125] + "..."

    db.db_handler.create_organism(
        tax_id=tax_id,
        scientific_name=scientific_name,
        common_name=common_name,
        category=organism_category,
    )

100%|██████████| 2314792/2314792 [01:06<00:00, 34839.68it/s]


In [20]:
# Rank every index kit by trigram similarity between its lower-cased name and the
# lower-cased search word, best match first. The word is passed as a bound
# parameter, not interpolated into the SQL text.
q = """
SELECT
    *,
    similarity(lower(name), lower(%(word)s)) as sml
FROM
    indexkit
ORDER BY
    sml DESC;
"""
search_params = {"word": "TTseq"}
pd.read_sql(q, conn, params=search_params)

Unnamed: 0,id,name,sml
0,4,10x Dual Index Kit TT Seq A,0.133333
1,3,10x Dual Index Kit TN Seq A,0.096774
2,6,10x Single Index Kit T Seq A,0.096774
3,5,10x Single Index Kit N Seq A,0.0625
4,0,empty,0.0
5,1,10x Dual Index Kit NN Set A,0.0
6,2,10x Dual Index Kit NT Set A,0.0


In [21]:
%%time
# Fuzzy search on one seqindex column, best match first.
# The trigram indexes in this notebook are built on lower(<column>), so the filter
# uses that same expression; a filter on the bare column cannot be served by the
# expression index. This also matches the lower(...) form of the index kit query.
# NOTE(review): pg_trgm matching is expected to be case-insensitive already, so the
# result rows should be unchanged - confirm against the installed build.
# `%%` is the escaped pg_trgm `%` operator; {col} is filled from a fixed column
# name below, while the search word is passed as a bound parameter.
q = """
SELECT * FROM seqindex WHERE lower({col}) %% lower(%(word)s)
ORDER BY similarity(lower({col}), lower(%(word)s)) DESC;
"""
pd.read_sql(q.format(col="adapter"), conn, params={
    "word": "si ga g2"
})

CPU times: user 0 ns, sys: 3.74 ms, total: 3.74 ms
Wall time: 7.7 ms


Unnamed: 0,id,sequence,type,adapter,index_kit_id
0,1829,CAGCCACT,single_index_3,SI-GA-G2,6
1,1828,ACTAGGAG,single_index_2,SI-GA-G2,6
2,1827,TGATTCTA,single_index_1,SI-GA-G2,6
3,1830,GTCGATGC,single_index_4,SI-GA-G2,6
4,1825,CCAGGAGC,single_index_3,SI-GA-G1,6
...,...,...,...,...,...
523,1336,CATTACAC,single_index_2,SI-NA-G6,5
524,1337,GTGCGACA,single_index_3,SI-NA-G6,5
525,1338,TCAGTGTT,single_index_4,SI-NA-G6,5
526,1367,ATTACCGG,single_index_1,SI-NA-G7,5


In [22]:
%%time
# Fuzzy organism search over both name columns: keep rows where either name
# matches the word, score each row by the better of the two similarities, and
# return the top 100.
# The organism trigram index is built on lower(common_name) and
# lower(scientific_name), so the filter uses those same expressions; a filter on
# the bare columns cannot be served by the expression index.
# NOTE(review): pg_trgm matching is expected to be case-insensitive already, so the
# result rows should be unchanged - confirm against the installed build.
# `%%` is the escaped pg_trgm `%` operator; {table} is filled from a fixed name
# below, while the search word is passed as a bound parameter.
q = """
SELECT
    *,
    greatest(
        similarity(lower(common_name), lower(%(word)s)),
        similarity(lower(scientific_name), lower(%(word)s))
    ) AS score
FROM
    {table}
WHERE
    lower(common_name) %% lower(%(word)s)
OR
    lower(scientific_name) %% lower(%(word)s)
ORDER BY
    score DESC
LIMIT 100;
"""
pd.read_sql(q.format(table="organism"), conn, params={
    "word": "msculus"
})

CPU times: user 0 ns, sys: 1.78 ms, total: 1.78 ms
Wall time: 11.1 ms


Unnamed: 0,tax_id,scientific_name,common_name,category,score
0,10090,Mus musculus,house mouse,3,0.545455
1,51337,Jaculus jaculus,lesser Egyptian jerboa,3,0.333333
