In [1]:
%load_ext autoreload
%autoreload 2

import os
import pandas as pd
import getpass
import sqlalchemy
from sqlmodel import SQLModel, func
import tqdm

In [2]:
conn = sqlalchemy.create_engine('postgresql://postgres:password@127.0.0.1:5432/limbless_db')
conn

Engine(postgresql://postgres:***@127.0.0.1:5432/limbless_db)

In [3]:
q = """
SELECT * FROM pg_extension WHERE extname = 'pg_trgm';
"""

if len(pd.read_sql(q, conn)) == 0:
    conn.execute('CREATE EXTENSION pg_trgm;')

pd.read_sql(q, conn)

Unnamed: 0,oid,extname,extowner,extnamespace,extrelocatable,extversion,extconfig,extcondition
0,16385,pg_trgm,10,2200,True,1.6,,


In [4]:
from limbless import db, models, categories

In [5]:
db.db_handler.open_session()

q = db.db_handler._session.query(models.User.last_name).order_by(
    func.similarity(models.User.first_name + ' ' + models.User.last_name, 'cemm').desc()
)

res = q.all()

db.db_handler.close_session()

print(q)
res

SELECT "user".last_name AS user_last_name 
FROM "user" ORDER BY similarity("user".first_name || %(first_name_1)s || "user".last_name, %(similarity_1)s) DESC


[]

In [6]:
conn.execute(
"""
INSERT INTO indexkit (id, name, num_adapters)
VALUES (0, 'empty', 0);
""")

<sqlalchemy.engine.cursor.LegacyCursorResult at 0x17ada21d0>

In [7]:
conn.execute(
"""
INSERT INTO seqadapter (id, name, index_kit_id, num_indices)
VALUES (0, '', 0, 0);
""")

<sqlalchemy.engine.cursor.LegacyCursorResult at 0x17adb0290>

In [8]:
conn.execute(
"""
INSERT INTO seqindex (id, sequence, type, adapter_id, index_kit_id)
VALUES (0, '', '', 0, 0);
""")

<sqlalchemy.engine.cursor.LegacyCursorResult at 0x17adb0e50>

In [9]:
for table in SQLModel.metadata.tables.items():
    print(table[0])
    for column in table[1].columns:
        print(f" - {column.name}")

job
 - id
 - name
 - slurm_id
 - status
project
 - id
 - name
 - description
 - num_samples
 - owner_id
libraryseqrequestlink
 - library_id
 - seq_request_id
librarysamplelink
 - library_id
 - sample_id
 - seq_index_id
experimentlibrarylink
 - experiment_id
 - library_id
 - lane
indexkitlibrarytype
 - index_kit_id
 - library_type_id
sample
 - id
 - name
 - organism_id
 - num_libraries
 - project_id
 - owner_id
user
 - id
 - first_name
 - last_name
 - email
 - password
 - role
 - num_projects
 - num_libraries
 - num_samples
 - num_seq_requests
experiment
 - timestamp
 - id
 - flowcell
 - r1_cycles
 - r2_cycles
 - i1_cycles
 - i2_cycles
 - sequencing_person_id
 - num_lanes
 - num_libraries
 - status
 - sequencer_id
librarytypeid
 - id
library
 - id
 - name
 - library_type_id
 - num_samples
 - num_experiments
 - num_seq_requests
 - contact_id
 - index_kit_id
 - owner_id
organism
 - tax_id
 - scientific_name
 - common_name
 - category
seqindex
 - id
 - sequence
 - workflow
 - type
 - adapt

In [10]:
password = getpass.getpass("Password: ")
admin = db.db_handler.create_user(
    email="admin@email.com",
    first_name="CeMM",
    last_name="Admin",
    password=password,
    role=categories.UserRole.ADMIN,
)
admin

User(first_name='CeMM', last_name='Admin', password='$2b$12$ClN8ZYjU9c4oA8EL0CrJU.t2qMpUCDzO7k2Pq24jj8Es2WSnow5F.', num_projects=0, num_libraries=0, num_seq_requests=0, id=1, email='admin@email.com', role=1, num_samples=0, requests=[], projects=[], libraries=[])

In [11]:
client = db.db_handler.create_user(
    email="client@email.com",
    first_name="CeMM",
    last_name="Client",
    password=password,
    role=categories.UserRole.CLIENT,
)
client

User(first_name='CeMM', last_name='Client', password='$2b$12$8B/gdtbYh7kE2/lA5Ht/ducC8TOleQ.CRmeAspZ3hW9pe1vQdo2mu', num_projects=0, num_libraries=0, num_seq_requests=0, id=2, email='client@email.com', role=4, num_samples=0, requests=[], projects=[], libraries=[])

In [12]:
bio = db.db_handler.create_user(
    email="bio@email.com",
    first_name="CeMM",
    last_name="Bioinformatician",
    password=password,
    role=categories.UserRole.BIOINFORMATICIAN,
)
bio

User(first_name='CeMM', last_name='Bioinformatician', password='$2b$12$HzsQBylSF6Oq91W11T4Rm.5icjY7eAnYG9BOaFAWb2ml83Z3un2Bi', num_projects=0, num_libraries=0, num_seq_requests=0, id=3, email='bio@email.com', role=2, num_samples=0, requests=[], projects=[], libraries=[])

In [13]:
tech = db.db_handler.create_user(
    email="tech@email.com",
    first_name="CeMM",
    last_name="Technician",
    password=password,
    role=categories.UserRole.TECHNICIAN,
)
tech

User(first_name='CeMM', last_name='Technician', password='$2b$12$DGI6EvFmC5j5iRdToWls9uZueeY4ricGQrhTvfd0bJH3o0U86SrsK', num_projects=0, num_libraries=0, num_seq_requests=0, id=4, email='tech@email.com', role=3, num_samples=0, requests=[], projects=[], libraries=[])

In [14]:
label_search_columns: dict[str, list[str]] = {
    "project": ["name"],
    "experiment": ["flowcell"],
    "library": ["name"],
    "organism": ["scientific_name", "common_name"],
    "seqindex": ["sequence"],
    "seqadapter": ["name"],
    "indexkit": ["name"],
    "user": ["email", "last_name", "first_name"]
}

In [15]:
for table, columns in label_search_columns.items():
    for column in columns:
        conn.execute(f"""
            CREATE INDEX
                trgm_{table}_{column}_idx
            ON
                "{table}"
            USING
                gin (lower({column}) gin_trgm_ops);
        """)

In [16]:
conn.execute(f"""
    CREATE INDEX
        trgm_user_full_name_idx
    ON
        "user"
    USING
        gin ((first_name || ' ' || last_name) gin_trgm_ops);
""")

<sqlalchemy.engine.cursor.LegacyCursorResult at 0x17adc7450>

In [17]:
conn.execute(f"""
    SELECT
        *
    FROM
        "user"
    WHERE
        first_name || ' ' || last_name ILIKE '%%cemm%%';
""").all()

[(1, 'CeMM', 'Admin', 'admin@email.com', '$2b$12$ClN8ZYjU9c4oA8EL0CrJU.t2qMpUCDzO7k2Pq24jj8Es2WSnow5F.', 1, 0, 0, 0, 0),
 (2, 'CeMM', 'Client', 'client@email.com', '$2b$12$8B/gdtbYh7kE2/lA5Ht/ducC8TOleQ.CRmeAspZ3hW9pe1vQdo2mu', 4, 0, 0, 0, 0),
 (3, 'CeMM', 'Bioinformatician', 'bio@email.com', '$2b$12$HzsQBylSF6Oq91W11T4Rm.5icjY7eAnYG9BOaFAWb2ml83Z3un2Bi', 2, 0, 0, 0, 0),
 (4, 'CeMM', 'Technician', 'tech@email.com', '$2b$12$DGI6EvFmC5j5iRdToWls9uZueeY4ricGQrhTvfd0bJH3o0U86SrsK', 3, 0, 0, 0, 0)]

In [18]:
# conn.execute(f"""
#     CREATE INDEX
#         trgm_organism_name_idx
#     ON
#         organism
#     USING
#         gin (lower(common_name) gin_trgm_ops, lower(scientific_name) gin_trgm_ops);
# """)

In [19]:
from limbless.categories import LibraryType

for library_type in LibraryType:
    db.db_handler.create_library_type(library_type)

In [20]:
from limbless.index_kits import add_index_kits
add_index_kits(db.db_handler, datadir="data")

In [21]:
df = pd.read_csv("data/species.csv", index_col=0)
df

Unnamed: 0_level_0,scientific name,genbank common name,common name,type
tax_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
7,Azorhizobium caulinodans,,,B
9,Buchnera aphidicola,,,B
11,Cellulomonas gilvus,,,B
14,Dictyoglomus thermophilum,,,B
17,Methylophilus methylotrophus,,,B
...,...,...,...,...
3071318,Vibrio phage XacF13,,,V
3071372,Oikopleura sp. OKI2018,,,E
3071373,Oikopleura sp. OSKA2016,,,E
3071397,Cyphostemma cornigerum,,,E


In [22]:
vstats = pd.read_table("data/stats/Viruses.ids", header=None, usecols=[0])
vstats = vstats.groupby(0).size().sort_values(ascending=False)

bstats = pd.read_table("data/stats/Bacteria.ids", header=None, usecols=[0])
bstats = bstats.groupby(0).size().sort_values(ascending=False)


astats = pd.read_table("data/stats/Archaea.ids", header=None, usecols=[0])
astats = astats.groupby(0).size().sort_values(ascending=False)

estats = pd.read_table("data/stats/Eukaryota.ids", header=None, usecols=[0])
estats = estats.groupby(0).size().sort_values(ascending=False)

In [23]:
stats = pd.concat([vstats, bstats, astats, estats], axis=0)

In [24]:
for tax_id, row in tqdm.tqdm(df.iterrows(), total=len(df)):
    if tax_id not in stats.index:
        continue
    
    if stats[tax_id] < 2:
        continue
        
    cat = row["type"]
    if cat == "A":
        _cat = categories.OrganismCategory.ARCHAEA
    elif cat == "B":
        _cat = categories.OrganismCategory.BACTERIA
    elif cat == "E":
        _cat = categories.OrganismCategory.EUKARYOTA
    elif cat == "V":
        _cat = categories.OrganismCategory.VIRUSES
    elif cat == "U":
        _cat = categories.OrganismCategory.UNCLASSIFIED
    else:
        _cat = categories.OrganismCategory.OTHER
    
    if not pd.isna(row["genbank common name"]):
        common_name = row["genbank common name"]
    elif not pd.isna(row["common name"]):
        common_name = row["common name"]
    else:
        common_name = None

    scientific_name = row["scientific name"]

    assert scientific_name is not None 
    assert tax_id is not None

    if len(scientific_name) > 128:
        scientific_name = scientific_name[:125] + "..."

    if db.db_handler.get_organism(tax_id) is None:
        db.db_handler.create_organism(
            tax_id=tax_id,
            scientific_name=scientific_name,
            common_name=common_name,
            category=_cat
        )

100%|██████████| 2314792/2314792 [00:49<00:00, 46588.11it/s]


In [25]:
if db.db_handler.get_organism(4932) is None:
    db.db_handler.create_organism(
        tax_id=4932,
        scientific_name="Saccharomyces cerevisiae",
        common_name="Baker's yeast",
        category=categories.OrganismCategory.EUKARYOTA,
    )

if db.db_handler.get_organism(1773) is None:
    db.db_handler.create_organism(
        tax_id=1773,
        scientific_name="Mycobacterium tuberculosis",
        common_name=None,
        category=categories.OrganismCategory.BACTERIA,
    )

if db.db_handler.get_organism(5833) is None:
    db.db_handler.create_organism(
        tax_id=5833,
        scientific_name="Plasmodium falciparum",
        common_name=None,
        category=categories.OrganismCategory.EUKARYOTA,
    )



In [26]:
q = f"""
SELECT
    *,
    similarity(lower(name), lower(%(word)s)) as sml
FROM
    indexkit
ORDER BY
    sml DESC;
"""
pd.read_sql(q, conn, params={"word": "TTseq"})

Unnamed: 0,id,name,num_adapters,sml
0,4,10x Dual Index Kit TT Seq A,0,0.133333
1,3,10x Dual Index Kit TN Seq A,0,0.096774
2,6,10x Single Index Kit T Seq A,0,0.096774
3,5,10x Single Index Kit N Seq A,0,0.0625
4,0,empty,0,0.0
5,1,10x Dual Index Kit NN Set A,0,0.0
6,2,10x Dual Index Kit NT Set A,0,0.0


In [27]:
from limbless.core.DBSession import DBSession
from sqlmodel import func

In [28]:
with DBSession(db.db_handler) as session:
    res = session._session.query(models.SeqAdapter).order_by(
        func.similarity(models.SeqAdapter.name, "si ga g2").desc()
    ).limit(10).all()

res

[SeqAdapter(name='SI-GA-G2', num_indices=0, id=554, index_kit_id=6, indices=[CAGCCACT [Index 3], ACTAGGAG [Index 2], GTCGATGC [Index 4], TGATTCTA [Index 1]], index_kit=IndexKit(num_adapters=0, name='10x Single Index Kit T Seq A', id=6, library_type_ids=[LibraryTypeId(id=3)])),
 SeqAdapter(name='SI-GA-G1', num_indices=0, id=553, index_kit_id=6, indices=[TGCTCGTA [Index 4], ATGAATCT [Index 1], GATCTCAG [Index 2], CCAGGAGC [Index 3]], index_kit=IndexKit(num_adapters=0, name='10x Single Index Kit T Seq A', id=6, library_type_ids=[LibraryTypeId(id=3)])),
 SeqAdapter(name='SI-GA-G9', num_indices=0, id=561, index_kit_id=6, indices=[CCTTGTAG [Index 4], TAGGACGT [Index 1], ATCCCACA [Index 2], GGAATGTC [Index 3]], index_kit=IndexKit(num_adapters=0, name='10x Single Index Kit T Seq A', id=6, library_type_ids=[LibraryTypeId(id=3)])),
 SeqAdapter(name='SI-GA-G8', num_indices=0, id=560, index_kit_id=6, indices=[GGCTGTTG [Index 4], TATGAGCT [Index 1], CCGATAGC [Index 2], ATACCCAA [Index 3]], index_ki

In [29]:
%%time
q = """
SELECT
    *,
    similarity(lower(name), lower(%(word)s)) AS sml
FROM
    seqadapter
ORDER BY
    sml DESC
LIMIT 10;
"""
pd.read_sql(q, conn, params={
    "word": "si ga g2"
})

CPU times: user 1.17 ms, sys: 913 µs, total: 2.09 ms
Wall time: 14.2 ms


Unnamed: 0,id,name,num_indices,index_kit_id,sml
0,554,SI-GA-G2,0,6,1.0
1,561,SI-GA-G9,0,6,0.6
2,553,SI-GA-G1,0,6,0.6
3,557,SI-GA-G5,0,6,0.6
4,559,SI-GA-G7,0,6,0.6
5,560,SI-GA-G8,0,6,0.6
6,555,SI-GA-G3,0,6,0.6
7,558,SI-GA-G6,0,6,0.6
8,556,SI-GA-G4,0,6,0.6
9,483,SI-GA-A3,0,6,0.545455


In [30]:
%%time
q = """
SELECT
    *,
    greatest(similarity(common_name, %(word)s), similarity(scientific_name, %(word)s)) AS score
FROM
    {table}
WHERE
    common_name %% %(word)s
OR
    scientific_name %% %(word)s
ORDER BY
    score DESC
LIMIT 100;
"""
pd.read_sql(q.format(table="organism"), conn, params={
    "word": "msculus"
})

CPU times: user 1.21 ms, sys: 866 µs, total: 2.08 ms
Wall time: 56.8 ms


Unnamed: 0,tax_id,scientific_name,common_name,category,score
0,10090,Mus musculus,house mouse,3,0.545455
1,51337,Jaculus jaculus,lesser Egyptian jerboa,3,0.333333


In [31]:
with DBSession(db.db_handler) as session:
    query = session._session.query(models.Organism)
    query = query.order_by(
        func.greatest(
            func.similarity(models.Organism.scientific_name, "mouse"),
            func.similarity(models.Organism.common_name, "mouse"),
        ).desc()
    )
    res = query.limit(10).all()

res

[Organism(common_name='house mouse', category=3, tax_id=10090, scientific_name='Mus musculus'),
 Organism(common_name='shrew mouse', category=3, tax_id=10093, scientific_name='Mus pahari'),
 Organism(common_name='Ryukyu mouse', category=3, tax_id=10089, scientific_name='Mus caroli'),
 Organism(common_name='gray mouse lemur', category=3, tax_id=30608, scientific_name='Microcebus murinus'),
 Organism(common_name='white-footed mouse', category=3, tax_id=10041, scientific_name='Peromyscus leucopus'),
 Organism(common_name='prairie deer mouse', category=3, tax_id=230844, scientific_name='Peromyscus maniculatus bairdii'),
 Organism(common_name='golden spiny mouse', category=3, tax_id=60746, scientific_name='Acomys russatus'),
 Organism(common_name='Pacific pocket mouse', category=3, tax_id=214514, scientific_name='Perognathus longimembris pacificus'),
 Organism(common_name="Coquerel's mouse lemur", category=3, tax_id=47180, scientific_name='Mirza coquereli'),
 Organism(common_name='Agile Gra

In [32]:
from limbless.testing_dataset import create_sample_data
create_sample_data(db.db_handler)