In [None]:
import toml
import re
import urllib
import pygsheets
import benchlingapi
import requests_html

In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
import paulssonlab.api as api
from paulssonlab.api.google.drive import get_drive_modified_time
import paulssonlab.cloning.workflow as workflow

# Setup

In [None]:
# Load notebook settings from config.toml; later cells read its "benchling"
# and "ecommons" sections.
config = toml.load("config.toml")

In [None]:
# Open a Benchling API session with the API key stored in the config file.
session = benchlingapi.Session(config["benchling"]["api_key"])

In [None]:
# Authorize the pygsheets client with the service-account key file
# credentials.json (resolved relative to the working directory).
gc = pygsheets.authorize(service_account_file="credentials.json")

In [None]:
# Look up the "LIB" strain collection. The result is indexed below with the
# keys "strains", "plasmids" and "plasmid_maps".
col = workflow.get_strain_collection_sheets(gc.drive.service, "LIB")
col

In [None]:
# Open the default worksheet of the strain and plasmid spreadsheets, in that order.
strain_sheet, plasmid_sheet = (
    gc.open_by_key(col[sheet_key]).worksheet()
    for sheet_key in ("strains", "plasmids")
)

# Lab Ops ordering site

In [None]:
import paulssonlab.api.labops as labops

In [None]:
# Log in to the Lab Ops site with the eCommons username/password from the
# config. The returned browser object is passed to labops.get_orders below.
browser = await labops.login_harvard(
    config["ecommons"]["username"],
    config["ecommons"]["password"],
    service="https://sysbiolabops.hms.harvard.edu/casservice",
)

In [None]:
# Fetch the orders for the vendor entry "Addgene (26)". The result is used
# below as a table with "Name" and "Catalog #" columns.
addgene_orders = await labops.get_orders(browser, vendor="Addgene (26)")

In [None]:
# Keep only the orders whose "Name" is one of the two people listed here.
placed_by_us = addgene_orders["Name"].isin(["Jacob Shenker", "Noah Olsman"])
my_addgene = addgene_orders[placed_by_us]

In [None]:
def _parse_catalog_number(catalog_entry):
    """Return the integer formed by the digits at the end of a "Catalog #" value.

    Raises
    ------
    ValueError
        If the entry is not a string or does not end in digits. The message
        names the offending entry, so a malformed order row is easy to find
        (previously this surfaced as an AttributeError on ``None``).
    """
    match = (
        re.search(r"(\d+)$", catalog_entry) if isinstance(catalog_entry, str) else None
    )
    if match is None:
        raise ValueError(
            f"Cannot parse an Addgene catalog number from {catalog_entry!r}"
        )
    return int(match.group(1))


# Array of integer catalog numbers, one per order in my_addgene.
addgene_catalog = my_addgene["Catalog #"].apply(_parse_catalog_number).values

# 3G/JUMP/Marionette/Addgene

In [None]:
# Addgene pages for the two kits that are imported as a whole.
threeg_kit = "https://www.addgene.org/1000000161/"
marionette_kit = "https://www.addgene.org/1000000137/"

# Individual JUMP plasmid pages, kept in sorted (lexicographic) URL order.
jump_plasmids = sorted(
    [
        "https://www.addgene.org/126956/",
        "https://www.addgene.org/126959/",
        "https://www.addgene.org/126960/",
        "https://www.addgene.org/126961/",
        "https://www.addgene.org/126962/",
        "https://www.addgene.org/126963/",
        "https://www.addgene.org/126964/",
        "https://www.addgene.org/126965/",
        "https://www.addgene.org/126966/",
        "https://www.addgene.org/126967/",
        "https://www.addgene.org/126973/",
        "https://www.addgene.org/126974/",
        "https://www.addgene.org/126975/",
        "https://www.addgene.org/126976/",
        "https://www.addgene.org/126991/",
        "https://www.addgene.org/126996/",
        "https://www.addgene.org/127015/",
        "https://www.addgene.org/127047/",
        "https://www.addgene.org/127051/",
        "https://www.addgene.org/127025/",
        "https://www.addgene.org/127000/",
        "https://www.addgene.org/126983/",
    ]
)

In [None]:
# The Densmore kit (catalog 1000000059) is left out here; it is imported in its
# own section further down with a per-well filter.
excluded_catalog_numbers = (1000000059,)
addgene_plasmids = [
    f"https://www.addgene.org/{catalog}/"
    for catalog in addgene_catalog
    if catalog not in excluded_catalog_numbers
]

In [None]:
# Import order: 3G kit, JUMP plasmids, Marionette kit, then individually ordered plasmids.
all_addgene = [threeg_kit] + jump_plasmids + [marionette_kit] + addgene_plasmids

In [None]:
# Drop repeated URLs, keeping the first occurrence of each. Dict keys preserve
# insertion order, so the original ordering is retained.
all_addgene_deduplicated = [*dict.fromkeys(all_addgene)]

In [None]:
# Number of URLs before de-duplication.
len(all_addgene)

In [None]:
# Number of distinct URLs that will actually be imported.
len(all_addgene_deduplicated)

In [None]:
# Import every de-duplicated Addgene URL, passing the strain sheet, the plasmid
# sheet and the collection's "plasmid_maps" entry to the workflow helper.
# NOTE(review): presumably this writes rows to both sheets — confirm before re-running.
addgene_data = workflow.import_addgene(
    all_addgene_deduplicated, strain_sheet, plasmid_sheet, col["plasmid_maps"]
)

# Densmore

In [None]:
# Addgene page of the Densmore kit.
densmore_kit = "https://www.addgene.org/1000000059/"

# Wells of the kit to import, grouped by plate row; all other wells are skipped.
densmore_wells = [
    "A1", "A5", "A9",
    "B1", "B5", "B9",
    "C1", "C5", "C9",
    "D1", "D5", "D6", "D7", "D8", "D9", "D10", "D11", "D12",
    "E1", "E2", "E3", "E4", "E5", "E6", "E7",
]

In [None]:
def cb(entry, data):
    """Import callback that keeps only the selected Densmore wells.

    Returns ``entry`` unchanged when ``data["well"]`` is listed in
    ``densmore_wells`` and ``False`` otherwise.
    NOTE(review): presumably ``False`` makes import_addgene skip the entry — confirm.
    """
    return entry if data["well"] in densmore_wells else False


# Import the Densmore kit, filtering its entries through the cb callback.
# NOTE(review): this rebinds addgene_data, discarding the result of the earlier
# bulk import cell — confirm nothing downstream still needs that value.
addgene_data = workflow.import_addgene(
    densmore_kit, strain_sheet, plasmid_sheet, col["plasmid_maps"], callback=cb
)

# Alias syncing

In [None]:
def sync_duplicate_info(columns=None):
    """Placeholder for syncing duplicated column values (aliases, tags).

    Not implemented yet: the function does nothing and returns ``None``.

    Parameters
    ----------
    columns : dict, optional
        Maps each column name to the separator used to split its cell values.
        When omitted, the default is ``"Tags"`` split on whitespace (``r"\\s"``),
        ``"Aliases*"`` split on commas, and ``"Description"`` with no separator.
    """
    if columns is None:
        # Built per call so the default mapping is never shared between calls.
        # NOTE(review): the original literal listed "Description" without a
        # value (a syntax error); None is a placeholder for "no separator" —
        # confirm the intended handling of this column.
        columns = {"Tags": r"\s", "Aliases*": ",", "Description": None}
    # TODO: sync aliases and tags
    return None

In [None]:
# allow batching edits using unlink/link, compare performance

In [None]:
def sync_columns(
    sheet1,
    sheet2,
    sync_column,
    join_column="Plasmid(s)*",
    join_separator=r"(?:\s*,\s*|\s+)",
    strategy="newest",
    separator=None,
):
    """Placeholder for syncing one column between two sheets; not implemented.

    Currently does nothing and returns ``None``. A later cell in this notebook
    defines ``sync_columns`` again with a different signature; once that cell
    runs, it replaces this definition.

    Parameters
    ----------
    sheet1, sheet2
        The two worksheets to sync.
    sync_column : str
        Name of the column whose values are to be synced.
    join_column : str
        Name of the column used to match rows between the sheets.
    join_separator : str
        Regular expression matching a comma (with optional surrounding
        whitespace) or a run of whitespace; presumably the delimiter between
        multiple entries in ``join_column``.
    strategy : str
        Name of the conflict-resolution strategy (default ``"newest"``).
    separator : str or None
        Presumably the delimiter inside ``sync_column`` cells — TODO confirm.
    """
    # The regex default is a raw string so that "\s" is not read as an
    # (invalid) string escape; the pattern itself is unchanged.
    return None


def sync_colletion_sheets(
    strain_sheet,
    plasmid_sheet,
    part_sheet,
    sync_column,
    join_column="Plasmid(s)*",
    join_separator=r"(?:\s*,\s*|\s+)",
    strategy="newest",
    separator=None,
):
    """Work-in-progress sync of one column across the strain, plasmid and part sheets.

    So far this only looks up the Drive modification time of each sheet's
    spreadsheet; the actual column sync is not implemented and the function
    returns ``None``. (The misspelled name is kept so existing references
    keep working.)

    Parameters
    ----------
    strain_sheet, plasmid_sheet, part_sheet
        Worksheets exposing ``.client.drive.service`` and ``.spreadsheet.id``.
    sync_column : str
        Name of the column whose values are to be synced.
    join_column : str
        Name of the column used to match rows between the sheets.
    join_separator : str
        Regular expression matching a comma (with optional surrounding
        whitespace) or a run of whitespace.
    strategy : str
        Name of the conflict-resolution strategy (default ``"newest"``).
    separator : str or None
        Presumably the delimiter inside ``sync_column`` cells — TODO confirm.
    """
    # The body previously referenced sheet1/sheet2, which are not parameters of
    # this function and raised NameError; use the three sheets actually passed in.
    sheets = {"strain": strain_sheet, "plasmid": plasmid_sheet, "part": part_sheet}
    mtimes = {
        kind: get_drive_modified_time(sheet.client.drive.service, sheet.spreadsheet.id)
        for kind, sheet in sheets.items()
    }
    # TODO: get columns
    # get columns

In [None]:
def sync_columns(
    sheets,
    sync_column,
    join_column="Plasmid(s)*",
    join_separator=r"(?:\s*,\s*|\s+)",
    strategy="newest",
    separator=None,
):
    """Work-in-progress sync of one column across several sheets.

    So far this only looks up the Drive modification time of each sheet's
    spreadsheet; the actual sync is not implemented and the function returns
    ``None``. This definition replaces the earlier two-sheet ``sync_columns``
    stub defined above in this notebook.

    Parameters
    ----------
    sheets : dict or iterable
        Worksheets exposing ``.client.drive.service`` and ``.spreadsheet.id``.
        A dict is read as ``{label: sheet}``; any other iterable is labelled
        by position.
    sync_column : str
        Name of the column whose values are to be synced.
    join_column : str
        Name of the column used to match rows between the sheets.
    join_separator : str
        Regular expression matching a comma (with optional surrounding
        whitespace) or a run of whitespace.
    strategy : str
        Name of the conflict-resolution strategy (default ``"newest"``).
    separator : str or None
        Presumably the delimiter inside ``sync_column`` cells — TODO confirm.
    """
    # The body previously referenced sheet1/sheet2, which do not exist in this
    # signature and raised NameError; iterate over the sheets argument instead.
    labelled_sheets = sheets.items() if isinstance(sheets, dict) else enumerate(sheets)
    mtimes = {
        label: get_drive_modified_time(sheet.client.drive.service, sheet.spreadsheet.id)
        for label, sheet in labelled_sheets
    }
    # TODO: use mtimes to pick the winning value for each joined row.

In [None]:
def join_sheets(sheets, join_column):
    """Placeholder for joining ``sheets`` on ``join_column``; not implemented.

    Does nothing and returns ``None``.
    """
    return None


## join
# row_nums: {"strain": X, "plasmid": X, "part": X}
# ids: {"strain": "LIB2", "plasmid": "pLIB3", "part": "ALIAS"}

## for each column:
# values: {"strain": X, "plasmid": X, "part": X}
# mtimes: {"strain": X, "plasmid": X, "part": X}
# sync_strategy -> sync_func

In [None]:
# Header names come from the first row of the strain sheet.
col_names = strain_sheet.get_row(1)
# +1 turns each 0-based list position into the 1-based column number passed to
# get_col / get_values in the next cell.
id_colidx, sync_colidx, join_colidx = (
    col_names.index(header) + 1 for header in ("ID*", "Aliases*", "Plasmid(s)*")
)

In [None]:
# The ID column, read without trailing empty cells, defines how many rows to read.
id_column = strain_sheet.get_col(id_colidx, include_tailing_empty=False)
num_rows = len(id_column)

# Read the sync and join columns over rows 1..num_rows, keeping trailing empty
# rows in the result.
sync_column, join_column = (
    strain_sheet.get_values(
        (1, colidx),
        (num_rows, colidx),
        majdim="COLUMNS",
        include_tailing_empty_rows=True,
    )[0]
    for colidx in (sync_colidx, join_colidx)
)

In [None]:
# Sanity check: the three columns were read over the same row range, so their
# lengths are expected to match.
(len(id_column), len(sync_column), len(join_column))

In [None]:
# Show the last five entries of each column to check that the rows line up.
(id_column[-5:], sync_column[-5:], join_column[-5:])

In [None]:
# NOTE(review): exploratory call with empty start/end tuples; looks like
# leftover experimentation — confirm whether this cell is still needed.
strain_sheet.get_values((), ())

In [None]:
# Show when the plasmid spreadsheet was last modified in Drive.
# This cell used to query files().get(fields="modifiedTime") by hand and parse
# the result with from_rfc3339, a name this notebook never imports (NameError on
# a fresh kernel). The helper imported at the top takes the same
# (drive service, file id) pair and is used instead.
# NOTE(review): assumes get_drive_modified_time returns the parsed modifiedTime — confirm.
get_drive_modified_time(
    plasmid_sheet.client.drive.service, plasmid_sheet.spreadsheet.id
)

# Ingest parts

In [None]:
# Read every row of the plasmid sheet as a record; later cells index each
# record by column header (e.g. "Source*", "Aliases*", "Description").
rows = plasmid_sheet.get_all_records()

In [None]:
# Peek at the first record to see which columns are available.
rows[0]

In [None]:
def base_url(url):
    """Normalise a URL for comparison.

    Strips surrounding whitespace, removes a leading ``http://`` or
    ``https://`` (in any letter case), drops all trailing slashes and
    lower-cases the rest.

    Parameters
    ----------
    url : str
        URL with or without a scheme and trailing slash.

    Returns
    -------
    str
        For example ``"www.addgene.org/126956"`` for
        ``"https://www.addgene.org/126956/"``.

    Raises
    ------
    ValueError
        If nothing is left after normalisation (empty string, bare scheme,
        or only slashes).
    """
    # The scheme is matched case-insensitively because the result is lower-cased
    # anyway; otherwise "HTTPS://..." would keep its scheme in the output.
    without_scheme = re.sub(r"^https?://", "", url.strip(), flags=re.IGNORECASE)
    normalized = without_scheme.rstrip("/").lower()
    if not normalized:
        raise ValueError(f"Cannot extract a base URL from {url!r}")
    return normalized

In [None]:
def import_threeg_part(plasmid):
    """Placeholder for importing a part from a 3G kit plasmid; returns ``None``."""
    return None


def import_densmore_part(plasmid):
    """Placeholder for importing a part from a Densmore kit plasmid; returns ``None``."""
    return None


def import_jump_part(plasmid):
    """Placeholder for importing a part from a JUMP plasmid; returns ``None``."""
    return None


# accept extra columns via overrides={"Tags": "foo"}
# pass through tags from plasmid


def get_part():
    """Return a blank part record.

    The result is a new dict on every call, with one key per part column (in
    the order listed below) and every value set to the empty string.
    """
    # "Author*" and "Date*" used to be assigned twice; each column is now listed
    # once, in the same resulting key order.
    part_columns = (
        "Name*",
        "Author*",
        "Date*",
        "Tags",
        "Plasmid/Oligos (Cutter)*",
        "Upstream overhang*",
        "Downstream overhang*",
        "Sequence*",
        "Organism/codon usage*",
        "Description",
    )
    return dict.fromkeys(part_columns, "")


# Normalised source URLs, computed once here instead of once per row inside
# the predicates.
_threeg_source = base_url(threeg_kit)
_densmore_source = base_url(densmore_kit)
_jump_sources = [base_url(url) for url in jump_plasmids]

# Ordered (predicate, importer) pairs. Each predicate takes one plasmid-sheet
# record; the loop in the next cell uses the first pair whose predicate is true.
part_rules = [
    (
        # 3G kit plasmids, except those whose alias starts with "V".
        # startswith (rather than indexing [0]) keeps an empty alias cell from
        # raising IndexError; an empty alias counts as "does not start with V".
        lambda x: (_threeg_source in x["Source*"])
        and not x["Aliases*"].startswith("V"),
        import_threeg_part,
    ),
    # Any plasmid sourced from the Densmore kit.
    (lambda x: _densmore_source in x["Source*"], import_densmore_part),
    (
        # JUMP plasmids, except those described as an empty backbone.
        lambda x: (any(source in x["Source*"] for source in _jump_sources))
        and ("(Empty Backbone)" not in x["Description"]),
        import_jump_part,
    ),
]

In [None]:
# Dry run: for each plasmid record, report the first matching rule's importer
# together with the record's aliases. Records matching no rule print nothing.
for row in rows:
    matched_importer = next(
        (importer for matches, importer in part_rules if matches(row)), None
    )
    if matched_importer is not None:
        print(matched_importer, row["Aliases*"])