In [1]:
%load_ext autoreload
%autoreload 2

import os
import pandas as pd
import getpass
import sqlalchemy
from sqlmodel import SQLModel, func
import tqdm

In [2]:
conn = sqlalchemy.create_engine('postgresql://postgres:password@127.0.0.1:5432/limbless_db')
conn

Engine(postgresql://postgres:***@127.0.0.1:5432/limbless_db)

In [3]:
q = """
SELECT * FROM pg_extension WHERE extname = 'pg_trgm';
"""

if len(pd.read_sql(q, conn)) == 0:
    conn.execute('CREATE EXTENSION pg_trgm;')

pd.read_sql(q, conn)

Unnamed: 0,oid,extname,extowner,extnamespace,extrelocatable,extversion,extconfig,extcondition
0,16385,pg_trgm,10,2200,True,1.6,,


In [4]:
from limbless import db, models, categories

In [5]:
for table in SQLModel.metadata.tables.items():
    print(table[0])
    for column in table[1].columns:
        print(f" - {column.name}")

project
 - id
 - name
 - description
 - num_samples
 - owner_id
sample
 - id
 - name
 - num_libraries
 - organism_id
 - project_id
 - owner_id
seqrequestlibrarylink
 - seq_request_id
 - library_id
samplelibrarylink
 - sample_id
 - library_id
 - cmo_id
librarypoollink
 - pool_id
 - library_id
experimentlibrarylink
 - experiment_id
 - library_id
 - lane
librarybarcodelink
 - library_id
 - barcode_id
pool
 - id
 - name
 - num_libraries
 - owner_id
 - contact_name
 - contact_email
 - contact_phone
user
 - id
 - first_name
 - last_name
 - email
 - password
 - role
 - num_projects
 - num_pools
 - num_samples
 - num_seq_requests
experiment
 - timestamp
 - id
 - flowcell
 - r1_cycles
 - r2_cycles
 - i1_cycles
 - i2_cycles
 - sequencing_person_id
 - num_lanes
 - num_libraries
 - status_id
 - sequencer_id
library
 - id
 - type_id
 - num_pools
 - num_samples
 - num_seq_requests
 - submitted
 - kit
 - volume
 - dna_concentration
 - total_size
 - owner_id
organism
 - tax_id
 - scientific_name
 - co

In [6]:
if os.getenv("LIMBLESS_DEBUG") == "1":
    password = "password"
else:
    password = getpass.getpass("Password: ")

In [7]:
admin = db.db_handler.create_user(
    email="admin@email.com",
    first_name="CeMM",
    last_name="Admin",
    password=password,
    role=categories.UserRole.ADMIN,
)
admin

User(last_name='Admin', email='admin@email.com', role=1, num_pools=0, num_seq_requests=0, id=1, first_name='CeMM', password='$2b$12$EExr2T4S6A525CKG3oQLCOTiR5xXTyJIB/xrVvGsaMlWtBDr3CrUC', num_projects=0, num_samples=0, requests=[], projects=[], samples=[], libraries=[], pools=[])

In [8]:
client = db.db_handler.create_user(
    email="client@email.com",
    first_name="CeMM",
    last_name="Client",
    password=password,
    role=categories.UserRole.CLIENT,
)
client

User(last_name='Client', email='client@email.com', role=4, num_pools=0, num_seq_requests=0, id=2, first_name='CeMM', password='$2b$12$nnn450xeCpQRHqqzv9n.NOI3/RZkyIrDi7N4eaUC.yBN3YCVdVwY2', num_projects=0, num_samples=0, requests=[], projects=[], samples=[], libraries=[], pools=[])

In [9]:
bio = db.db_handler.create_user(
    email="bio@email.com",
    first_name="CeMM",
    last_name="Bioinformatician",
    password=password,
    role=categories.UserRole.BIOINFORMATICIAN,
)
bio

User(last_name='Bioinformatician', email='bio@email.com', role=2, num_pools=0, num_seq_requests=0, id=3, first_name='CeMM', password='$2b$12$sp9Jd2KPbs5fWQQ6nyKuvu/iLjylt4rNDekl3jed/bHuUXrvyIxrq', num_projects=0, num_samples=0, requests=[], projects=[], samples=[], libraries=[], pools=[])

In [10]:
tech = db.db_handler.create_user(
    email="tech@email.com",
    first_name="CeMM",
    last_name="Technician",
    password=password,
    role=categories.UserRole.TECHNICIAN,
)
tech

User(last_name='Technician', email='tech@email.com', role=3, num_pools=0, num_seq_requests=0, id=4, first_name='CeMM', password='$2b$12$.D8V3DzpCQQtnuCOfwKrZO/b9rKaQ/2u52rUskSVq4U9ZUvlOnTse', num_projects=0, num_samples=0, requests=[], projects=[], samples=[], libraries=[], pools=[])

In [11]:
label_search_columns: dict[str, list[str]] = {
    "project": ["name"],
    "experiment": ["flowcell"],
    "pool": ["name"],
    "organism": ["scientific_name", "common_name"],
    "barcode": ["sequence", "adapter"],
    "indexkit": ["name"],
    "user": ["email", "last_name", "first_name"]
}

In [12]:
for table, columns in label_search_columns.items():
    for column in columns:
        conn.execute(f"""
            CREATE INDEX
                trgm_{table}_{column}_idx
            ON
                "{table}"
            USING
                gin (lower({column}) gin_trgm_ops);
        """)

In [13]:
conn.execute(f"""
    CREATE INDEX
        trgm_user_full_name_idx
    ON
        "user"
    USING
        gin ((first_name || ' ' || last_name) gin_trgm_ops);
""")

<sqlalchemy.engine.cursor.LegacyCursorResult at 0x14067eb90>

In [14]:
conn.execute(f"""
    SELECT
        *
    FROM
        "user"
    WHERE
        first_name || ' ' || last_name ILIKE '%%cemm%%';
""").all()

[(1, 'CeMM', 'Admin', 'admin@email.com', '$2b$12$EExr2T4S6A525CKG3oQLCOTiR5xXTyJIB/xrVvGsaMlWtBDr3CrUC', 1, 0, 0, 0, 0),
 (2, 'CeMM', 'Client', 'client@email.com', '$2b$12$nnn450xeCpQRHqqzv9n.NOI3/RZkyIrDi7N4eaUC.yBN3YCVdVwY2', 4, 0, 0, 0, 0),
 (3, 'CeMM', 'Bioinformatician', 'bio@email.com', '$2b$12$sp9Jd2KPbs5fWQQ6nyKuvu/iLjylt4rNDekl3jed/bHuUXrvyIxrq', 2, 0, 0, 0, 0),
 (4, 'CeMM', 'Technician', 'tech@email.com', '$2b$12$.D8V3DzpCQQtnuCOfwKrZO/b9rKaQ/2u52rUskSVq4U9ZUvlOnTse', 3, 0, 0, 0, 0)]

In [15]:
# conn.execute(f"""
#     CREATE INDEX
#         trgm_organism_name_idx
#     ON
#         organism
#     USING
#         gin (lower(common_name) gin_trgm_ops, lower(scientific_name) gin_trgm_ops);
# """)

In [16]:
# from limbless.categories import LibraryType

# for library_type in LibraryType:
#     db.db_handler.create_library_type(library_type)

In [17]:
from limbless.index_kits import add_index_kits
add_index_kits(db.db_handler, datadir="data")

In [18]:
df = pd.read_csv("data/species.csv", index_col=0)
df

Unnamed: 0_level_0,scientific name,genbank common name,common name,type
tax_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
7,Azorhizobium caulinodans,,,B
9,Buchnera aphidicola,,,B
11,Cellulomonas gilvus,,,B
14,Dictyoglomus thermophilum,,,B
17,Methylophilus methylotrophus,,,B
...,...,...,...,...
3071318,Vibrio phage XacF13,,,V
3071372,Oikopleura sp. OKI2018,,,E
3071373,Oikopleura sp. OSKA2016,,,E
3071397,Cyphostemma cornigerum,,,E


In [19]:
vstats = pd.read_table("data/stats/Viruses.ids", header=None, usecols=[0])
vstats = vstats.groupby(0).size().sort_values(ascending=False)

bstats = pd.read_table("data/stats/Bacteria.ids", header=None, usecols=[0])
bstats = bstats.groupby(0).size().sort_values(ascending=False)


astats = pd.read_table("data/stats/Archaea.ids", header=None, usecols=[0])
astats = astats.groupby(0).size().sort_values(ascending=False)

estats = pd.read_table("data/stats/Eukaryota.ids", header=None, usecols=[0])
estats = estats.groupby(0).size().sort_values(ascending=False)

In [20]:
stats = pd.concat([vstats, bstats, astats, estats], axis=0)

In [21]:
for tax_id, row in tqdm.tqdm(df.iterrows(), total=len(df)):
    if tax_id not in stats.index:
        continue
    
    if stats[tax_id] < 5:
        continue
        
    cat = row["type"]
    if cat == "A":
        _cat = categories.OrganismCategory.ARCHAEA
    elif cat == "B":
        _cat = categories.OrganismCategory.BACTERIA
    elif cat == "E":
        _cat = categories.OrganismCategory.EUKARYOTA
    elif cat == "V":
        _cat = categories.OrganismCategory.VIRUSES
    elif cat == "U":
        _cat = categories.OrganismCategory.UNCLASSIFIED
    else:
        _cat = categories.OrganismCategory.OTHER
    
    if not pd.isna(row["genbank common name"]):
        common_name = row["genbank common name"]
    elif not pd.isna(row["common name"]):
        common_name = row["common name"]
    else:
        common_name = None

    scientific_name = row["scientific name"]

    assert scientific_name is not None 
    assert tax_id is not None

    if len(scientific_name) > 128:
        scientific_name = scientific_name[:125] + "..."

    if db.db_handler.get_organism(tax_id) is None:
        db.db_handler.create_organism(
            tax_id=tax_id,
            scientific_name=scientific_name,
            common_name=common_name,
            category=_cat
        )

100%|██████████| 2314792/2314792 [00:37<00:00, 61219.16it/s]


In [22]:
if db.db_handler.get_organism(4932) is None:
    db.db_handler.create_organism(
        tax_id=4932,
        scientific_name="Saccharomyces cerevisiae",
        common_name="Baker's yeast",
        category=categories.OrganismCategory.EUKARYOTA,
    )

if db.db_handler.get_organism(1773) is None:
    db.db_handler.create_organism(
        tax_id=1773,
        scientific_name="Mycobacterium tuberculosis",
        common_name=None,
        category=categories.OrganismCategory.BACTERIA,
    )

if db.db_handler.get_organism(5833) is None:
    db.db_handler.create_organism(
        tax_id=5833,
        scientific_name="Plasmodium falciparum",
        common_name=None,
        category=categories.OrganismCategory.EUKARYOTA,
    )



In [23]:
q = f"""
SELECT
    *,
    similarity(lower(name), lower(%(word)s)) as sml
FROM
    indexkit
ORDER BY
    sml DESC;
"""
pd.read_sql(q, conn, params={"word": "TTseq"})

Unnamed: 0,id,name,sml
0,4,10x Dual Index Kit TT Seq A,0.133333
1,3,10x Dual Index Kit TN Seq A,0.096774
2,6,10x Single Index Kit T Seq A,0.096774
3,5,10x Single Index Kit N Seq A,0.0625
4,1,10x Dual Index Kit NN Set A,0.0
5,2,10x Dual Index Kit NT Set A,0.0


In [24]:
from limbless.core.DBSession import DBSession
from sqlmodel import func

In [25]:
%%time
q = """
SELECT
    *,
    greatest(similarity(common_name, %(word)s), similarity(scientific_name, %(word)s)) AS score
FROM
    {table}
WHERE
    common_name %% %(word)s
OR
    scientific_name %% %(word)s
ORDER BY
    score DESC
LIMIT 100;
"""
pd.read_sql(q.format(table="organism"), conn, params={
    "word": "msculus"
})

CPU times: user 1.12 ms, sys: 742 µs, total: 1.87 ms
Wall time: 33.1 ms


Unnamed: 0,tax_id,scientific_name,common_name,category,score
0,10090,Mus musculus,house mouse,3,0.545455
1,51337,Jaculus jaculus,lesser Egyptian jerboa,3,0.333333


In [26]:
with DBSession(db.db_handler) as session:
    query = session._session.query(models.Organism)
    query = query.order_by(
        func.greatest(
            func.similarity(models.Organism.scientific_name, "mouse"),
            func.similarity(models.Organism.common_name, "mouse"),
        ).desc()
    )
    res = query.limit(10).all()

res

[Organism(common_name='house mouse', tax_id=10090, category=3, scientific_name='Mus musculus'),
 Organism(common_name='shrew mouse', tax_id=10093, category=3, scientific_name='Mus pahari'),
 Organism(common_name='Ryukyu mouse', tax_id=10089, category=3, scientific_name='Mus caroli'),
 Organism(common_name='gray mouse lemur', tax_id=30608, category=3, scientific_name='Microcebus murinus'),
 Organism(common_name='prairie deer mouse', tax_id=230844, category=3, scientific_name='Peromyscus maniculatus bairdii'),
 Organism(common_name='golden spiny mouse', tax_id=60746, category=3, scientific_name='Acomys russatus'),
 Organism(common_name='white-footed mouse', tax_id=10041, category=3, scientific_name='Peromyscus leucopus'),
 Organism(common_name='Pacific pocket mouse', tax_id=214514, category=3, scientific_name='Perognathus longimembris pacificus'),
 Organism(common_name='Agile Gracile Mouse Opossum', tax_id=191870, category=3, scientific_name='Gracilinanus agilis'),
 Organism(common_name=

In [27]:
from limbless.testing_dataset import create_sample_data
create_sample_data(db.db_handler)

In [35]:
db.db_handler.open_session()
assert db.db_handler._session is not None
s = db.db_handler.get_sample(1)
if (l := db.db_handler.get_library(1)) is None:
    l = db.db_handler.create_library(
        library_type=categories.LibraryType.GENE_EXPRESSION,
        owner_id=1
    )
    link = db.db_handler.link_sample_library(s.id, l.id)
else:
    links, _ = db.db_handler.get_sample_library_links(s.id, l.id)
    link = links[0]
print(s)
print(l)

print(link)
print(link.library)
db.db_handler.close_session()

Sample(id: 1, name:Sample 1, organism:Homo sapiens [9606] (human))
id=1 num_samples=1 submitted=False volume=None total_size=None type_id=1 num_pools=0 num_seq_requests=0 kit='custom' dna_concentration=None owner_id=1 owner=User(last_name='Admin', email='admin@email.com', role=1, num_pools=0, num_seq_requests=0, id=1, first_name='CeMM', password='$2b$12$EExr2T4S6A525CKG3oQLCOTiR5xXTyJIB/xrVvGsaMlWtBDr3CrUC', num_projects=0, num_samples=0, requests=[], projects=[], samples=[], pools=[]) cmos=[] experiments=[] sample_links=[SampleLibraryLink(library_id=1, cmo_id=None, sample_id=1)] pools=[] seq_requests=[] barcodes=[]
SampleLibraryLink(sample_id: 1, library_id: 1, cmo_id: None)
id=1 num_samples=1 submitted=False volume=None total_size=None type_id=1 num_pools=0 num_seq_requests=0 kit='custom' dna_concentration=None owner_id=1 owner=User(last_name='Admin', email='admin@email.com', role=1, num_pools=0, num_seq_requests=0, id=1, first_name='CeMM', password='$2b$12$EExr2T4S6A525CKG3oQLCOTiR5

DetachedInstanceError: Parent instance <SampleLibraryLink at 0x2a2e96b10> is not bound to a Session; lazy load operation of attribute 'sample' cannot proceed (Background on this error at: https://sqlalche.me/e/14/bhk3)