In [1]:
%load_ext autoreload
%autoreload 2

import os
import pandas as pd
import getpass
import sqlalchemy
from sqlmodel import SQLModel, func
import tqdm

In [2]:
conn = sqlalchemy.create_engine('postgresql://postgres:password@127.0.0.1:5432/limbless_db')
conn

Engine(postgresql://postgres:***@127.0.0.1:5432/limbless_db)

In [3]:
q = """
SELECT * FROM pg_extension WHERE extname = 'pg_trgm';
"""

if len(pd.read_sql(q, conn)) == 0:
    conn.execute('CREATE EXTENSION pg_trgm;')

pd.read_sql(q, conn)

Unnamed: 0,oid,extname,extowner,extnamespace,extrelocatable,extversion,extconfig,extcondition
0,16385,pg_trgm,10,2200,True,1.6,,


In [4]:
from limbless import db, models, categories

In [5]:
for table in SQLModel.metadata.tables.items():
    print(table[0])
    for column in table[1].columns:
        print(f" - {column.name}")

project
 - id
 - name
 - description
 - num_samples
 - owner_id
sample
 - id
 - name
 - num_libraries
 - organism_id
 - project_id
 - owner_id
seqrequestlibrarylink
 - seq_request_id
 - library_id
samplelibrarylink
 - sample_id
 - library_id
 - cmo_id
librarypoollink
 - pool_id
 - library_id
experimentpoollink
 - experiment_id
 - pool_id
 - lane
seqrequestexperimentlink
 - seq_request_id
 - experiment_id
pool
 - id
 - name
 - num_libraries
 - index_kit_id
 - owner_id
 - contact_name
 - contact_email
 - contact_phone
user
 - id
 - first_name
 - last_name
 - email
 - password
 - role
 - num_projects
 - num_pools
 - num_samples
 - num_seq_requests
experiment
 - timestamp
 - id
 - flowcell
 - r1_cycles
 - r2_cycles
 - i1_cycles
 - i2_cycles
 - sequencing_person_id
 - num_lanes
 - num_pools
 - status_id
 - sequencer_id
library
 - id
 - name
 - type_id
 - submitted
 - volume
 - dna_concentration
 - total_size
 - num_pools
 - num_samples
 - num_seq_requests
 - sample_id
 - owner_id
 - index_1

In [6]:
if os.getenv("LIMBLESS_DEBUG") == "1":
    password = "password"
else:
    password = getpass.getpass("Password: ")

In [7]:
admin = db.db_handler.create_user(
    email="admin@email.com",
    first_name="CeMM",
    last_name="Admin",
    password=password,
    role=categories.UserRole.ADMIN,
)
admin

User(id=1, email='admin@email.com', role=1, num_pools=0, num_seq_requests=0, first_name='CeMM', last_name='Admin', password='$2b$12$fH.4G5z5K2F2hkmBW6MSGegLWqKGAjtjcXaPKUUPB.C3C1i1S3yO6', num_projects=0, num_samples=0)

In [8]:
client = db.db_handler.create_user(
    email="client@email.com",
    first_name="CeMM",
    last_name="Client",
    password=password,
    role=categories.UserRole.CLIENT,
)
client

User(id=2, email='client@email.com', role=4, num_pools=0, num_seq_requests=0, first_name='CeMM', last_name='Client', password='$2b$12$dOEAGW7Em.ezHBDAk5UB9ejy/FH1FPAjoDQPOencsqkjhFs9gMSZe', num_projects=0, num_samples=0)

In [9]:
bio = db.db_handler.create_user(
    email="bio@email.com",
    first_name="CeMM",
    last_name="Bioinformatician",
    password=password,
    role=categories.UserRole.BIOINFORMATICIAN,
)
bio

User(id=3, email='bio@email.com', role=2, num_pools=0, num_seq_requests=0, first_name='CeMM', last_name='Bioinformatician', password='$2b$12$PugcrmAHngXdoWJ3IeCHkuG.au5/w4etEvsnxhig4VTIEbG9X8uIS', num_projects=0, num_samples=0)

In [10]:
tech = db.db_handler.create_user(
    email="tech@email.com",
    first_name="CeMM",
    last_name="Technician",
    password=password,
    role=categories.UserRole.TECHNICIAN,
)
tech

User(id=4, email='tech@email.com', role=3, num_pools=0, num_seq_requests=0, first_name='CeMM', last_name='Technician', password='$2b$12$ykGNNYrGZkZmgGMAunXTyufvZOSMRJ7PFefMjLks8OmWoC2uFrtYu', num_projects=0, num_samples=0)

In [11]:
sequencer = db.db_handler.create_sequencer(
    "Novaseq 6000", "192.168.0.1"
)

In [12]:
label_search_columns: dict[str, list[str]] = {
    "project": ["name"],
    "experiment": ["flowcell"],
    "pool": ["name"],
    "organism": ["scientific_name", "common_name"],
    "barcode": ["sequence", "adapter"],
    "indexkit": ["name"],
    "user": ["email", "last_name", "first_name"]
}

In [13]:
for table, columns in label_search_columns.items():
    for column in columns:
        conn.execute(f"""
            CREATE INDEX
                trgm_{table}_{column}_idx
            ON
                "{table}"
            USING
                gin (lower({column}) gin_trgm_ops);
        """)

In [14]:
conn.execute(f"""
    CREATE INDEX
        trgm_user_full_name_idx
    ON
        "user"
    USING
        gin ((first_name || ' ' || last_name) gin_trgm_ops);
""")

<sqlalchemy.engine.cursor.LegacyCursorResult at 0x13696f790>

In [15]:
conn.execute(f"""
    SELECT
        *
    FROM
        "user"
    WHERE
        first_name || ' ' || last_name ILIKE '%%cemm%%';
""").all()

[(1, 'CeMM', 'Admin', 'admin@email.com', '$2b$12$fH.4G5z5K2F2hkmBW6MSGegLWqKGAjtjcXaPKUUPB.C3C1i1S3yO6', 1, 0, 0, 0, 0),
 (2, 'CeMM', 'Client', 'client@email.com', '$2b$12$dOEAGW7Em.ezHBDAk5UB9ejy/FH1FPAjoDQPOencsqkjhFs9gMSZe', 4, 0, 0, 0, 0),
 (3, 'CeMM', 'Bioinformatician', 'bio@email.com', '$2b$12$PugcrmAHngXdoWJ3IeCHkuG.au5/w4etEvsnxhig4VTIEbG9X8uIS', 2, 0, 0, 0, 0),
 (4, 'CeMM', 'Technician', 'tech@email.com', '$2b$12$ykGNNYrGZkZmgGMAunXTyufvZOSMRJ7PFefMjLks8OmWoC2uFrtYu', 3, 0, 0, 0, 0)]

In [16]:
# conn.execute(f"""
#     CREATE INDEX
#         trgm_organism_name_idx
#     ON
#         organism
#     USING
#         gin (lower(common_name) gin_trgm_ops, lower(scientific_name) gin_trgm_ops);
# """)

In [17]:
# from limbless.categories import LibraryType

# for library_type in LibraryType:
#     db.db_handler.create_library_type(library_type)

In [18]:
# from limbless.index_kits import add_index_kits
# add_index_kits(db.db_handler, datadir="data")

In [19]:
# df = pd.read_csv("data/species.csv", index_col=0)
# df

In [20]:
# vstats = pd.read_table("data/stats/Viruses.ids", header=None, usecols=[0])
# vstats = vstats.groupby(0).size().sort_values(ascending=False)

# bstats = pd.read_table("data/stats/Bacteria.ids", header=None, usecols=[0])
# bstats = bstats.groupby(0).size().sort_values(ascending=False)


# astats = pd.read_table("data/stats/Archaea.ids", header=None, usecols=[0])
# astats = astats.groupby(0).size().sort_values(ascending=False)

# estats = pd.read_table("data/stats/Eukaryota.ids", header=None, usecols=[0])
# estats = estats.groupby(0).size().sort_values(ascending=False)

In [21]:
# stats = pd.concat([vstats, bstats, astats, estats], axis=0)

In [22]:
# for tax_id, row in tqdm.tqdm(df.iterrows(), total=len(df)):
#     if tax_id not in stats.index:
#         continue
    
#     if stats[tax_id] < 5:
#         continue
        
#     cat = row["type"]
#     if cat == "A":
#         _cat = categories.OrganismCategory.ARCHAEA
#     elif cat == "B":
#         _cat = categories.OrganismCategory.BACTERIA
#     elif cat == "E":
#         _cat = categories.OrganismCategory.EUKARYOTA
#     elif cat == "V":
#         _cat = categories.OrganismCategory.VIRUSES
#     elif cat == "U":
#         _cat = categories.OrganismCategory.UNCLASSIFIED
#     else:
#         _cat = categories.OrganismCategory.OTHER
    
#     if not pd.isna(row["genbank common name"]):
#         common_name = row["genbank common name"]
#     elif not pd.isna(row["common name"]):
#         common_name = row["common name"]
#     else:
#         common_name = None

#     scientific_name = row["scientific name"]

#     assert scientific_name is not None 
#     assert tax_id is not None

#     if len(scientific_name) > 128:
#         scientific_name = scientific_name[:125] + "..."

#     if db.db_handler.get_organism(tax_id) is None:
#         db.db_handler.create_organism(
#             tax_id=tax_id,
#             scientific_name=scientific_name,
#             common_name=common_name,
#             category=_cat
#         )

In [23]:
if db.db_handler.get_organism(10090) is None:
    db.db_handler.create_organism(
        tax_id=10090,
        scientific_name="Mus musculus",
        common_name="House mouse",
        category=categories.OrganismCategory.EUKARYOTA,
    )

if db.db_handler.get_organism(9606) is None:
    db.db_handler.create_organism(
        tax_id=9606,
        scientific_name="Homo sapiens",
        common_name="Human",
        category=categories.OrganismCategory.EUKARYOTA,
    )

if db.db_handler.get_organism(4932) is None:
    db.db_handler.create_organism(
        tax_id=4932,
        scientific_name="Saccharomyces cerevisiae",
        common_name="Baker's yeast",
        category=categories.OrganismCategory.EUKARYOTA,
    )

if db.db_handler.get_organism(1773) is None:
    db.db_handler.create_organism(
        tax_id=1773,
        scientific_name="Mycobacterium tuberculosis",
        common_name=None,
        category=categories.OrganismCategory.BACTERIA,
    )

if db.db_handler.get_organism(5833) is None:
    db.db_handler.create_organism(
        tax_id=5833,
        scientific_name="Plasmodium falciparum",
        common_name=None,
        category=categories.OrganismCategory.EUKARYOTA,
    )

In [24]:
q = f"""
SELECT
    *,
    similarity(lower(name), lower(%(word)s)) as sml
FROM
    indexkit
ORDER BY
    sml DESC;
"""
pd.read_sql(q, conn, params={"word": "TTseq"})

Unnamed: 0,id,name,num_indices_per_adapter,sml


In [25]:
from limbless.core.DBSession import DBSession
from sqlmodel import func

In [26]:
%%time
q = """
SELECT
    *,
    greatest(similarity(common_name, %(word)s), similarity(scientific_name, %(word)s)) AS score
FROM
    {table}
WHERE
    common_name %% %(word)s
OR
    scientific_name %% %(word)s
ORDER BY
    score DESC
LIMIT 100;
"""
pd.read_sql(q.format(table="organism"), conn, params={
    "word": "msculus"
})

CPU times: user 1.33 ms, sys: 1.03 ms, total: 2.36 ms
Wall time: 37.1 ms


Unnamed: 0,tax_id,scientific_name,common_name,category,score
0,10090,Mus musculus,House mouse,3,0.545455


In [27]:
with DBSession(db.db_handler) as session:
    query = session._session.query(models.Organism)
    query = query.order_by(
        func.greatest(
            func.similarity(models.Organism.scientific_name, "mouse"),
            func.similarity(models.Organism.common_name, "mouse"),
        ).desc()
    )
    res = query.limit(10).all()

res

[Organism(category=3, scientific_name='Mus musculus', tax_id=10090, common_name='House mouse'),
 Organism(category=2, scientific_name='Mycobacterium tuberculosis', tax_id=1773, common_name=None),
 Organism(category=3, scientific_name='Homo sapiens', tax_id=9606, common_name='Human'),
 Organism(category=3, scientific_name='Saccharomyces cerevisiae', tax_id=4932, common_name="Baker's yeast"),
 Organism(category=3, scientific_name='Plasmodium falciparum', tax_id=5833, common_name=None)]

In [28]:
# from limbless.testing_dataset import create_sample_data
# create_sample_data(db.db_handler)

In [29]:
def add_indices_from_kit(df: pd.DataFrame, kit_name: str):
    num_indices_per_adapter = None
    if "single index" in kit_name.lower():
        num_indices_per_adapter = 4
    elif "dual index" in kit_name.lower():
        num_indices_per_adapter = 2

    assert num_indices_per_adapter is not None
        
    kit = db.db_handler.create_index_kit(
        name=kit_name, num_indices_per_adapter=num_indices_per_adapter
    )
    
    for _, row in df.iterrows():
        index_1 = db.db_handler.create_barcode(
            sequence=row["index_1"],
            adapter=row["index_name"],
            index_kit_id=kit.id,
            barcode_type=categories.BarcodeType.INDEX_1
        )
        index_2, index_3, index_4 = None, None, None
        if "index_2" in row:
            index_2 = db.db_handler.create_barcode(
                sequence=row["index_2"],
                adapter=row["index_name"],
                index_kit_id=kit.id,
                barcode_type=categories.BarcodeType.INDEX_2
            )
        if "index_3" in row:
            index_3 = db.db_handler.create_barcode(
                sequence=row["index_3"],
                adapter=row["index_name"],
                index_kit_id=kit.id,
                barcode_type=categories.BarcodeType.INDEX_3
            )
        if "index_4" in row:
            index_4 = db.db_handler.create_barcode(
                sequence=row["index_4"],
                adapter=row["index_name"],
                index_kit_id=kit.id,
                barcode_type=categories.BarcodeType.INDEX_4
            )

        adapter = db.db_handler.create_adapter(
            name=row["index_name"],
            index_kit_id=kit.id,
            barcode_1_id=index_1.id,
            barcode_2_id=index_2.id if index_2 is not None else None,
            barcode_3_id=index_3.id if index_3 is not None else None,
            barcode_4_id=index_4.id if index_4 is not None else None,
        )

In [30]:
add_indices_from_kit(pd.read_csv("data/10x_kits/Dual_Index_Kit_NN_Set_A.csv"), "10x Dual Index Kit NN Set A")
add_indices_from_kit(pd.read_csv("data/10x_kits/Dual_Index_Kit_NT_Set_A.csv"), "10x Dual Index Kit NT Set A")
add_indices_from_kit(pd.read_csv("data/10x_kits/Dual_Index_Kit_TN_Set_A.csv"), "10x Dual Index Kit TN Set A")
add_indices_from_kit(pd.read_csv("data/10x_kits/Dual_Index_Kit_TT_Set_A.csv"), "10x Dual Index Kit TT Set A")
add_indices_from_kit(pd.read_csv("data/10x_kits/Single_Index_Kit_N_Set_A.csv"), "10x Single Index Kit N Set A")
add_indices_from_kit(pd.read_csv("data/10x_kits/Single_Index_Kit_T_Set_A.csv"), "10x Single Index Kit T Set A")