In [1]:
import pandas as pd
import numpy as np

import os
import io
import difflib
import glob
from datetime import datetime
import pytz
from pathlib import Path
import sqlalchemy as sa
from sqlalchemy.orm import Mapped, relationship, DeclarativeBase, mapped_column, Session

from limbless_db import categories, DBHandler, models, DBSession

from loguru import logger

import scipy
import string

In [2]:
# Connection settings can be overridden through environment variables so real
# credentials do not have to be stored in the notebook. The fallbacks are the
# previously hard-coded values, so behaviour is unchanged when nothing is set.
db = DBHandler(
    os.environ.get("LIMBLESS_DB_USER", "admin"),
    os.environ.get("LIMBLESS_DB_PASSWORD", "password"),
    os.environ.get("LIMBLESS_DB_HOST", "localhost"),
)

In [3]:
def titlecase_with_acronyms(val: str) -> str:
    """Upper-case the first character of every space-separated word in ``val``.

    Only the first character of each word is changed; the rest of the word is
    kept as-is, so upper-case acronyms such as "NN" keep their casing.

    Empty words (empty input, or leading/trailing/consecutive spaces) are kept
    unchanged instead of raising ``IndexError``.

    Args:
        val: Text whose words are separated by single space characters.

    Returns:
        ``val`` with the first character of each word upper-cased.
    """
    # word[:1] is "" for an empty word, whereas word[0] would raise IndexError.
    return " ".join(word[:1].upper() + word[1:] for word in val.split(" "))

def add_indices_from_kit(db_handler: DBHandler, path: str):
    """Read an index kit from a CSV file and register it through ``db_handler``.

    NOTE: a later cell of this notebook defines ``add_indices_from_kit`` again.
    When the cells run in order, that later definition replaces this one. This
    version takes ``plate_well`` from the text after the last "_" in
    ``index_name``; the later one splits on "-".

    The kit name is the file name up to its first ".", with "_" replaced by
    spaces and the first letter of each word upper-cased. A name containing
    "single index" gets 4 indices per adapter, "dual index" gets 2.

    For every CSV row one barcode is created from ``index_1``, plus one barcode
    for each of ``index_2``/``index_3``/``index_4`` that is present and not
    empty, and finally one adapter referencing the created barcodes.

    Args:
        db_handler: Handler used for all lookups and inserts.
        path: Path to the kit CSV; needs ``index_name`` and ``index_1`` columns.

    Raises:
        ValueError: If the kit name contains neither "single index" nor
            "dual index".
    """
    df = pd.read_csv(path)
    kit_name = titlecase_with_acronyms(os.path.basename(path).split(".")[0].replace("_", " "))

    kit_name_lower = kit_name.lower()
    if "single index" in kit_name_lower:
        num_indices_per_adapter = 4
    elif "dual index" in kit_name_lower:
        num_indices_per_adapter = 2
    else:
        # Explicit error instead of a bare assert: asserts are removed under
        # `python -O` and give no hint about what went wrong.
        raise ValueError(
            f"Cannot infer the number of indices per adapter from kit name '{kit_name}': "
            "expected it to contain 'single index' or 'dual index'."
        )

    if db_handler.get_index_kit_by_name(kit_name) is not None:
        print(f"Index kit {kit_name} is already present in the DB.")
        return

    kit = db_handler.create_index_kit(
        name=kit_name, num_indices_per_adapter=num_indices_per_adapter
    )

    # Columns that may be absent from the CSV, or empty for a given row.
    optional_barcode_columns = (
        ("index_2", categories.BarcodeType.INDEX_2),
        ("index_3", categories.BarcodeType.INDEX_3),
        ("index_4", categories.BarcodeType.INDEX_4),
    )

    for _, row in df.iterrows():
        adapter_name = row["index_name"]
        index_1 = db_handler.create_barcode(
            sequence=row["index_1"],
            adapter=adapter_name,
            index_kit_id=kit.id,
            barcode_type=categories.BarcodeType.INDEX_1,
        )

        optional_barcode_ids = {}
        for column, barcode_type in optional_barcode_columns:
            # An empty CSV cell is read as NaN; skip it rather than storing
            # NaN as a barcode sequence.
            if column not in row or pd.isna(row[column]):
                optional_barcode_ids[column] = None
                continue
            barcode = db_handler.create_barcode(
                sequence=row[column],
                adapter=adapter_name,
                index_kit_id=kit.id,
                barcode_type=barcode_type,
            )
            optional_barcode_ids[column] = barcode.id

        db_handler.create_adapter(
            name=adapter_name,
            index_kit_id=kit.id,
            plate_well=adapter_name.split("_")[-1],
            barcode_1_id=index_1.id,
            barcode_2_id=optional_barcode_ids["index_2"],
            barcode_3_id=optional_barcode_ids["index_3"],
            barcode_4_id=optional_barcode_ids["index_4"],
        )

In [4]:
def add_features_from_kit(db_handler: DBHandler, path: str, feature_type: categories.FeatureTypeEnum):
    """Read a feature kit from a tab-separated file and register its features.

    The kit name is the file name up to its first ".", with "_" replaced by
    spaces and the first letter of each word upper-cased. An existing kit with
    that name is reused (a message is printed); otherwise a new kit is created.
    Lines starting with "#" in the file are treated as comments.

    Features whose name already exists in the kit are skipped with a message.

    Args:
        db_handler: Handler used for all lookups and inserts.
        path: Path to the TSV file describing the features.
        feature_type: Type assigned to a newly created kit and to every feature.

    Raises:
        Exception: If a row has a null ``barcode_id``.
    """
    df = pd.read_csv(path, sep="\t", comment="#")
    file_stem = os.path.basename(path).split(".")[0]
    kit_name = titlecase_with_acronyms(file_stem.replace("_", " "))

    kit = db_handler.get_feature_kit_by_name(kit_name)
    if kit is None:
        kit = db_handler.create_feature_kit(name=kit_name, type=feature_type)
    else:
        print(f"Feature kit {kit_name} is already present in the DB.")

    for _, row in df.iterrows():
        if pd.isnull(row["barcode_id"]):
            print(f"Barcode name is null for row {row}, {kit_name}")
            raise Exception(f"Barcode name is null for row {row}.")

        feature_name = str(row["barcode_id"])
        existing_feature = db_handler.get_feature_from_kit_by_feature_name(
            feature_kit_id=kit.id, feature_name=feature_name
        )
        if existing_feature is not None:
            print(f"Feature {row['barcode_id']} is already present in the DB.")
            continue

        feature_fields = {
            "name": feature_name,
            "feature_kit_id": kit.id,
            "type": feature_type,
            "sequence": row["barcode_sequence"],
            "pattern": row["pattern"],
            "read": row["read"],
        }
        # Missing target columns are stored as None rather than NaN.
        target_name = row["barcode_target_name"]
        feature_fields["target_name"] = target_name if pd.notna(target_name) else None
        target_id = row["barcode_target_id"]
        feature_fields["target_id"] = target_id if pd.notna(target_id) else None

        db_handler.create_feature(**feature_fields)


def add_indices_from_kit(db_handler: DBHandler, path: str):
    """Read an index kit from a CSV file and register it through ``db_handler``.

    NOTE: this redefines the function of the same name from an earlier cell.
    The only difference is that ``plate_well`` is taken from the text after the
    last "-" in ``index_name`` here; the earlier version splits on "_".

    The kit name is the file name up to its first ".", with "_" replaced by
    spaces and the first letter of each word upper-cased. A name containing
    "single index" gets 4 indices per adapter, "dual index" gets 2.

    For every CSV row one barcode is created from ``index_1``, plus one barcode
    for each of ``index_2``/``index_3``/``index_4`` that is present and not
    empty, and finally one adapter referencing the created barcodes.

    Args:
        db_handler: Handler used for all lookups and inserts.
        path: Path to the kit CSV; needs ``index_name`` and ``index_1`` columns.

    Raises:
        ValueError: If the kit name contains neither "single index" nor
            "dual index".
    """
    df = pd.read_csv(path)
    kit_name = titlecase_with_acronyms(os.path.basename(path).split(".")[0].replace("_", " "))

    kit_name_lower = kit_name.lower()
    if "single index" in kit_name_lower:
        num_indices_per_adapter = 4
    elif "dual index" in kit_name_lower:
        num_indices_per_adapter = 2
    else:
        # Explicit error instead of a bare assert: asserts are removed under
        # `python -O` and give no hint about what went wrong.
        raise ValueError(
            f"Cannot infer the number of indices per adapter from kit name '{kit_name}': "
            "expected it to contain 'single index' or 'dual index'."
        )

    if db_handler.get_index_kit_by_name(kit_name) is not None:
        print(f"Index kit {kit_name} is already present in the DB.")
        return

    kit = db_handler.create_index_kit(
        name=kit_name, num_indices_per_adapter=num_indices_per_adapter
    )

    # Columns that may be absent from the CSV, or empty for a given row.
    optional_barcode_columns = (
        ("index_2", categories.BarcodeType.INDEX_2),
        ("index_3", categories.BarcodeType.INDEX_3),
        ("index_4", categories.BarcodeType.INDEX_4),
    )

    for _, row in df.iterrows():
        adapter_name = row["index_name"]
        index_1 = db_handler.create_barcode(
            sequence=row["index_1"],
            adapter=adapter_name,
            index_kit_id=kit.id,
            barcode_type=categories.BarcodeType.INDEX_1,
        )

        optional_barcode_ids = {}
        for column, barcode_type in optional_barcode_columns:
            # An empty CSV cell is read as NaN; skip it rather than storing
            # NaN as a barcode sequence.
            if column not in row or pd.isna(row[column]):
                optional_barcode_ids[column] = None
                continue
            barcode = db_handler.create_barcode(
                sequence=row[column],
                adapter=adapter_name,
                index_kit_id=kit.id,
                barcode_type=barcode_type,
            )
            optional_barcode_ids[column] = barcode.id

        db_handler.create_adapter(
            name=adapter_name,
            plate_well=adapter_name.split("-")[-1],
            index_kit_id=kit.id,
            barcode_1_id=index_1.id,
            barcode_2_id=optional_barcode_ids["index_2"],
            barcode_3_id=optional_barcode_ids["index_3"],
            barcode_4_id=optional_barcode_ids["index_4"],
        )

In [5]:
# Register every 10x index kit, one CSV file per kit, in this fixed order.
index_kit_csvs = [
    "data/index-kits/10x_kits/Dual_Index_Kit_NN_Set_A.csv",
    "data/index-kits/10x_kits/Dual_Index_Kit_NT_Set_A.csv",
    "data/index-kits/10x_kits/Dual_Index_Kit_TN_Set_A.csv",
    "data/index-kits/10x_kits/Dual_Index_Kit_TT_Set_A.csv",
    "data/index-kits/10x_kits/Dual_Index_Kit_TS_Set_A.csv",
    "data/index-kits/10x_kits/Single_Index_Kit_N_Set_A.csv",
    "data/index-kits/10x_kits/Single_Index_Kit_T_Set_A.csv",
]
for index_kit_csv in index_kit_csvs:
    add_indices_from_kit(db, index_kit_csv)

In [6]:
# Each entry pairs a feature-kit TSV file with the feature type assigned to the
# kit and all of its features. Kits are registered in the order listed.
feature_kit_sources = [
    ("data/feature-kits/CAR_CRISPR_EP_reverse.tsv", categories.FeatureType.CRISPR_CAPTURE),
    ("data/feature-kits/CMO_hastags_florian.tsv", categories.FeatureType.CMO),
    ("data/feature-kits/CMO_multiome_hashtags.tsv", categories.FeatureType.CMO),
    ("data/feature-kits/LCMV_gene_barcode.tsv", categories.FeatureType.GENE_CAPTURE),
    ("data/feature-kits/LCMV_primer_barcode.tsv", categories.FeatureType.PRIMER_CAPTURE),
    ("data/feature-kits/LMO_multiome_hashtags.tsv", categories.FeatureType.CMO),
    ("data/feature-kits/LMO_multiome_hashtags_EWS.tsv", categories.FeatureType.CMO),
    ("data/feature-kits/MF_AK_smallCROPSeq.tsv", categories.FeatureType.CUSTOM),
    ("data/feature-kits/MF_AK_smallCROPSeq_reverse.tsv", categories.FeatureType.CUSTOM),
    ("data/feature-kits/MultiSeq_LMO.tsv", categories.FeatureType.CMO),
    ("data/feature-kits/PT_CRISPR_PT129_5p_MM10_GRCH38.tsv", categories.FeatureType.CRISPR_CAPTURE),
    ("data/feature-kits/PT_CRISPR_PT129_MM10_GRCH38.tsv", categories.FeatureType.CRISPR_CAPTURE),
    ("data/feature-kits/PT_CRISPR_V1_MM10.tsv", categories.FeatureType.CRISPR_CAPTURE),
    ("data/feature-kits/PT_CRISPR_V1_MM10_ALT.tsv", categories.FeatureType.CRISPR_CAPTURE),
    ("data/feature-kits/PT_CRISPR_V2_MM10.tsv", categories.FeatureType.CRISPR_CAPTURE),
    ("data/feature-kits/PT_CRISPR_V2_MM10_GRCH38.tsv", categories.FeatureType.CRISPR_CAPTURE),
    ("data/feature-kits/PT_CRISPR_V3_MM10.tsv", categories.FeatureType.CRISPR_CAPTURE),
    ("data/feature-kits/PT_CRISPR_V3_MM10_reverse.tsv", categories.FeatureType.CRISPR_CAPTURE),
    ("data/feature-kits/PT_CRISPR_V4_MM10_reverse.tsv", categories.FeatureType.CRISPR_CAPTURE),
    ("data/feature-kits/TotalSeqA_Antibody.tsv", categories.FeatureType.ANTIBODY),
    ("data/feature-kits/TotalSeqA_Antibody_Multiplex.tsv", categories.FeatureType.CMO),
    ("data/feature-kits/TotalSeqB_Antibody.tsv", categories.FeatureType.ANTIBODY),
    ("data/feature-kits/TotalSeqC_Antibody.tsv", categories.FeatureType.ANTIBODY),
    ("data/feature-kits/TotalSeqC_Antibody_Multiplex.tsv", categories.FeatureType.CMO),
]
for feature_kit_tsv, feature_kit_type in feature_kit_sources:
    add_features_from_kit(db, feature_kit_tsv, feature_kit_type)