# load env

In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
from dotenv import load_dotenv

load_dotenv("../../.env.localhost")

In [None]:
import warnings

import dagster

warnings.filterwarnings("ignore", category=dagster.ExperimentalWarning)

# init db and client

In [None]:
from nmdc_runtime.util import nmdc_jsonschema, nmdc_jsonschema_validate

from nmdc_runtime.site.repository import run_config_frozen__normal_env
from nmdc_runtime.site.resources import (
    get_mongo,
    get_runtime_api_site_client,
)


# Build the Mongo resource from the frozen "normal env" run config and keep a
# short handle to its database for the cells below.
mongo = get_mongo(run_config_frozen__normal_env)
mdb = mongo.db


# Runtime API site client, built from the same run config.
client = get_runtime_api_site_client(run_config_frozen__normal_env)

# register api user/site

In [None]:
from nmdc_runtime.api.models.user import UserInDB
from nmdc_runtime.api.models.site import SiteInDB
from nmdc_runtime.api.core.auth import get_password_hash

# Credentials for the new API user and the id of the site it administers.
# Fill these in before running the cell; they are left blank in the notebook.
username = ""
password = ""
site_id = ""

# Refuse to write placeholder records: running the cell with the blanks above
# would otherwise insert a user with an empty name and the hash of an empty
# password, plus a site with an empty id.
_blank = [
    label
    for label, value in (
        ("username", username),
        ("password", password),
        ("site_id", site_id),
    )
    if not value
]
if _blank:
    raise ValueError(
        f"Fill in {', '.join(_blank)} before registering the user/site."
    )

# Insert the user (storing only the password hash) as an admin of the site,
# then insert the site itself.
mdb.users.insert_one(
    UserInDB(
        username=username,
        hashed_password=get_password_hash(password),
        site_admin=[site_id],
    ).dict(exclude_unset=True)
)
mdb.sites.insert_one(SiteInDB(id=site_id).dict(exclude_unset=True))

# activities and data objects

In [None]:
from pathlib import Path

# Text file holding one metadata-file URL per line.
filepath = (
    Path.home()
    / "Dropbox"
    / "diary"
    / "21"
    / "09"
    / "2021-09-15-scanon-nmdc-metadata-file-urls.txt"
)
# Strip each line and keep only the non-empty ones.
with open(filepath) as f:
    urls = [entry for entry in map(str.strip, f) if entry]

### Register DrsObjects for existing URLs

In [None]:
import os
import json
from pathlib import Path
import re
from tempfile import TemporaryDirectory

import requests
from tqdm.notebook import tqdm

from nmdc_runtime.util import drs_metadata_for
from nmdc_runtime.api.models.object import DrsObjectIn

# Splits an http(s) URL into its host ("domain") and everything after the
# first slash that follows the host ("path").
pattern = re.compile(r"https?://(?P<domain>[^/]+)/(?P<path>.+)")


def url_to_name(url):
    """Build a flat name from an http(s) URL.

    The host labels are reversed (``portal.nersc.gov`` -> ``gov.nersc.portal``)
    and joined to the path with ``__``; slashes in the path become dots.

    Args:
        url: an ``http://`` or ``https://`` URL with a non-empty path.

    Returns:
        The derived name as a string.

    Raises:
        ValueError: if ``url`` does not match ``pattern``.
    """
    m = pattern.match(url)
    if m is None:
        # Without this guard the failure is an AttributeError on None that
        # does not say which URL was at fault.
        raise ValueError(f"not an http(s) URL with a path: {url!r}")
    reversed_domain = ".".join(reversed(m.group("domain").split(".")))
    flat_path = m.group("path").replace("/", ".")
    return f"{reversed_domain}__{flat_path}"


def fetch_url(url, timeout=30):
    """GET ``url`` with ``requests`` and hand back the response object.

    ``timeout`` is forwarded unchanged to ``requests.get``.
    """
    response = requests.get(url, timeout=timeout)
    return response


class HttpResponseNotOk(Exception):
    """Raised when a response's status code is not 200."""


class HttpResponseNotJson(Exception):
    """Raised when a response body cannot be decoded as JSON."""


def response_to_json(response):
    """Return the decoded JSON body of ``response``.

    Args:
        response: an object exposing ``status_code`` and ``json()``.

    Returns:
        Whatever ``response.json()`` returns.

    Raises:
        HttpResponseNotOk: if ``response.status_code`` is not 200.
        HttpResponseNotJson: if ``response.json()`` raises ``ValueError``
            (the base class of the JSON decode errors); the original error
            is chained as ``__cause__``.
    """
    if response.status_code != 200:
        raise HttpResponseNotOk()
    try:
        return response.json()
    except ValueError as err:
        # Only decoding failures are translated; KeyboardInterrupt and other
        # unrelated exceptions propagate untouched.
        raise HttpResponseNotJson() from err


def json_data_from_url_to_file(json_data, url, save_dir):
    """Write ``json_data`` as JSON into ``save_dir`` and return the file path.

    The file name is ``url_to_name(url)``; the returned path is a string.
    """
    target = os.path.join(save_dir, url_to_name(url))
    with open(target, "w") as out:
        json.dump(json_data, out)
    return target

def json_clean(d, model, exclude_unset=False):
    """Round-trip ``d`` through ``model`` and return the parsed JSON.

    ``model`` is called as ``model(**d)``; the instance's
    ``.json(exclude_unset=...)`` string is parsed back with ``json.loads``.
    """
    instance = model(**d)
    serialized = instance.json(exclude_unset=exclude_unset)
    return json.loads(serialized)

In [None]:
# For every URL: fetch it, require a 200 JSON response, save the JSON to a
# temporary file and build a DrsObjectIn from that file's DRS metadata.
# Each URL ends up in `result` as either {"result": ...} or {"error": ...}.
result = {}
with TemporaryDirectory() as save_dir:
    for url in tqdm(urls):
        try:
            response = fetch_url(url)
        except requests.RequestException as err:
            # A timeout or connection failure on one URL is recorded like the
            # other per-URL errors instead of aborting the whole batch.
            result[url] = {"error": type(err).__name__}
            continue
        try:
            json_data = response_to_json(response)
        except HttpResponseNotOk:
            result[url] = {"error": "HttpResponseNotOk"}
            continue
        except HttpResponseNotJson:
            result[url] = {"error": "HttpResponseNotJson"}
            continue
        filepath = json_data_from_url_to_file(json_data, url, save_dir)
        drs_object_in = DrsObjectIn(
            **drs_metadata_for(
                filepath,
                {
                    "access_methods": [{"access_url": {"url": url}}],
                    # Colons are replaced with dashes in the object name.
                    "name": Path(filepath).name.replace(":", "-"),
                },
            )
        )
        result[url] = {"result": drs_object_in}

# Cell output: True only if no URL produced an error.
all("result" in v for v in result.values())

In [None]:
# Create an object through the API for every URL that was fetched without
# error, recording the HTTP status code of each call.
response = {}
for url, doc in tqdm(list(result.items())):
    if "error" not in doc:
        drs_object_in = doc["result"]
        payload = json.loads(drs_object_in.json(exclude_unset=True))
        rv = client.create_object(payload)
        response[url] = rv.status_code

# Cell output: True only if every call returned status 201.
all(status == 201 for status in response.values())

### validate and tag

In [None]:
# Map "nmdc:<name>" -- where <name> is the last path segment of a "*_set"
# property's items "$ref" -- to that property's (collection) name.
type_collections = dict(
    ("nmdc:" + spec["items"]["$ref"].rsplit("/", 1)[-1], name)
    for name, spec in nmdc_jsonschema["properties"].items()
    if name.endswith("_set")
)

def specialize_activity_set_docs(docs):
    """Move docs from the generic ``activity_set`` into their typed collections.

    Each doc under ``docs["activity_set"]`` is appended to the collection that
    ``type_collections`` names for the doc's ``"type"``, creating that list if
    needed. Docs whose type maps back to ``"activity_set"`` stay in a fresh
    ``"activity_set"`` list.

    Args:
        docs: mapping of collection name to list of docs; mutated in place.

    Returns:
        The same ``docs`` object. It is returned untouched when it has no
        ``"activity_set"`` key.

    Raises:
        KeyError: if a doc has no ``"type"`` or the type is missing from
            ``type_collections``; ``docs`` is left unmodified in that case.
    """
    if "activity_set" not in docs:
        return docs

    activity_docs = docs["activity_set"]
    # Resolve every target first so an unknown type fails before any mutation.
    targets = [type_collections[doc["type"]] for doc in activity_docs]

    # Detach the source list before redistributing. Appending to it while
    # iterating (when a type maps to "activity_set") would never terminate.
    del docs["activity_set"]
    for collection_name, doc in zip(targets, activity_docs):
        docs.setdefault(collection_name, []).append(doc)
    return docs

In [None]:
# Look up the id of the object registered for each URL (matched on the
# access-method URL), keyed by URL.
drs_object_id = {}
for url in tqdm(urls):
    doc = mdb.objects.find_one({"access_methods.access_url.url": url}, ["id"])
    if doc is None:
        # No matching document: name the URL instead of failing with an
        # opaque TypeError on doc["id"].
        raise LookupError(f"no object registered for URL: {url}")
    drs_object_id[url] = doc["id"]

In [None]:
# For each registered object: download its JSON, move activity docs to their
# typed collections, validate against the NMDC schema, then ensure the object
# carries the "schema#/definitions/Database" tag.
response = {}
for drs_id in tqdm(list(drs_object_id.values())):
    docs = specialize_activity_set_docs(client.get_object_bytes(drs_id).json())
    _ = nmdc_jsonschema_validate(docs)
    response[drs_id] = client.ensure_object_tag(drs_id, "schema#/definitions/Database")

# Cell output: True when every recorded value is None or has status 200.
all(not (v is not None and v.status_code != 200) for v in response.values())

### validate and ingest

This should trigger creation of `portal-etl-1.0.0` jobs:

In [None]:
# Same download / specialize / validate pass as the schema tagging, but this
# time ensuring the "metadata-in" tag on each object.
response = {}
object_ids = list(drs_object_id.values())
for drs_id in tqdm(object_ids):
    raw_docs = client.get_object_bytes(drs_id).json()
    docs = specialize_activity_set_docs(raw_docs)
    _ = nmdc_jsonschema_validate(docs)
    tag_response = client.ensure_object_tag(drs_id, "metadata-in")
    response[drs_id] = tag_response

# Cell output: True when every recorded value is None or has status 200.
all(v is None or v.status_code == 200 for v in response.values())

In [None]:
mdb.jobs.count_documents({
    "workflow.id": "portal-etl-1.0.0",
    "config.object_id": {"$in": list(drs_object_id.values())}
}) == len(drs_object_id.values())

Now, the nmdc-runtime site client should claim these jobs.

In [None]:
from nmdc_runtime.api.models.util import ListRequest

# Page through the API's job listing for the "portal-etl-1.0.0" jobs whose
# configured object is one of ours, accumulating them in `jobs`.
max_page_size = 1000
job_filter = {
    "workflow.id": "portal-etl-1.0.0",
    "config.object_id": {"$in": list(drs_object_id.values())}
}
lr = ListRequest(filter=json.dumps(job_filter), max_page_size=max_page_size)
jobs = []
while True:
    rv = client.list_jobs(lr.dict()).json()
    page = rv["resources"]
    jobs.extend(page)
    print("got", len(page), "jobs")

    # No token means this was the last page.
    if "next_page_token" not in rv:
        break
    lr.page_token = rv["next_page_token"]

    # Safety escape: stop once exactly one job per object has been collected.
    if len(jobs) == len(drs_object_id.values()):
        break

In [None]:
# Claim every listed job, keeping the raw response of each claim call.
claimed_job_ops = [client.claim_job(job["id"]) for job in tqdm(jobs)]

In [None]:
job_ops = [rv.json() for rv in claimed_job_ops]

In [None]:
job_ops

Now, do the jobs and mark the job ops as completed, giving appropriate metadata and results.

In [None]:
from toolz import dissoc

def mongo_add_docs_result_as_dict(rv):
    """Summarize per-collection bulk-write results as plain dicts.

    ``rv`` maps collection names to objects exposing ``bulk_api_result``
    (presumably pymongo bulk-write results -- confirm). Each value in the
    returned dict is that ``bulk_api_result`` without its "upserted" key.
    """
    summary = {}
    for collection_name, bulk_write_result in rv.items():
        summary[collection_name] = dissoc(
            bulk_write_result.bulk_api_result, "upserted"
        )
    return summary

In [None]:
from nmdc_runtime.api.models.job import JobOperationMetadata
from nmdc_runtime.api.models.operation import Operation
from nmdc_runtime.api.models.util import ResultT

# Do each claimed job: parse the operation, download the object its job
# points at, specialize the activity docs, and add them to Mongo with
# validation on. The add_docs results are kept per operation id.
op_result = {}
JobOperation = Operation[ResultT, JobOperationMetadata]

for doc in tqdm(job_ops):
    op = JobOperation(**doc)
    object_id = op.metadata.job.config["object_id"]
    docs = specialize_activity_set_docs(client.get_object_bytes(object_id).json())
    op_result[op.id] = mongo.add_docs(docs, validate=True)

In [None]:
from datetime import datetime, timezone

from nmdc_runtime.api.models.operation import UpdateOperationRequest

# One UTC timestamp (second precision) shared by every operation patched here.
now = datetime.now(timezone.utc).isoformat(timespec="seconds")

# Mark each operation as done, attaching the summarized add_docs result.
# Operations the API already reports as done are left alone.
op_patch_result = {}
for op_id, rv in tqdm(list(op_result.items())):
    if client.operation_is_done(op_id):
        print("op", op_id, "marked as done already. Skipping...")
    else:
        op_patch = UpdateOperationRequest(
            done=True,
            result=mongo_add_docs_result_as_dict(rv),
            metadata={"done_at": now},
        )
        patched = client.update_operation(op_id, op_patch)
        op_patch_result[op_id] = patched.json()

In [None]:
op_patch_result

TODO: some logic to help your site client if it doesn't want to re-claim jobs it has already claimed, it has already done, another site has already claimed, or another site has already done. Perhaps add `done` field to `JobClaim` so site client can inspect the nature and status of all job claims.

## annotations

In [None]:
from pathlib import Path

# Text file holding one annotation URL per line.
filepath = Path.home() / "Dropbox" / "diary" / "21" / "09" / "scanon-annotations-urls.txt"
# Strip each line and keep only the non-empty ones.
with open(filepath) as f:
    anno_additions_urls = [entry for entry in map(str.strip, f) if entry]

Get `anno_fixes_urls`:
 - from https://portal.nersc.gov/project/m3408/meta/anno2/?C=M;O=D
 - `*.json` files last modified on or after 2021-09-10
 
 Fetch the HTML and use Beautiful Soup to extract the URLs?

In [None]:
from bs4 import BeautifulSoup

from datetime import date

# Directory listing to scrape. The "?C=M;O=D" query is presumably the
# listing's sort-by-modification-time option -- confirm.
urlpath = "https://portal.nersc.gov/project/m3408/meta/anno2/"
rv = requests.get(f"{urlpath}?C=M;O=D", timeout=30)
# Fail loudly on an error page rather than silently scraping zero rows.
rv.raise_for_status()

soup = BeautifulSoup(rv.text, 'html.parser')

# Keep files last modified on or after this date. A "2021-09" prefix test
# would wrongly include 09-01..09-09 and drop everything from October onward.
modified_since = date(2021, 9, 10)

anno_fixes_urls = []

for tr in soup.find_all("tr"):
    tds = tr.find_all("td")
    # Only five-cell rows are unpacked below; any other row is skipped.
    if len(tds) != 5:
        continue

    _, td_name, td_last_modified, _, _ = tds
    try:
        # The cell is assumed to start with an ISO "YYYY-MM-DD" date.
        last_modified = date.fromisoformat(td_last_modified.text.strip()[:10])
    except ValueError:
        # Rows without a parseable date are skipped.
        continue
    if last_modified < modified_since:
        continue

    name = td_name.a.text
    if name.endswith(".json"):
        anno_fixes_urls.append(f"{urlpath}{name}")

In [None]:
anno_additions_urls, anno_fixes_urls

### Register DrsObjects for existing URLs

In [None]:
url = "https://portal.nersc.gov/project/m3408/meta/stegen_metaP_activities.json"

In [None]:
from nmdc_runtime.site.drsobjects.registration import (
    fetch_url,
    response_to_json,
    json_data_from_url_to_file,
    drs_object_in_for,
)

drs_object_in = drs_object_in_for(url)

In [None]:
drs_object_in["result"].dict(exclude_unset=True)

In [None]:
import json

doc = {
  "aliases": None,
  "description": "fix biosamples INSDC ID Mongo update",
  "mime_type": "application/json",
  "name": "fix_biosample_insdc_ids.json",
  "access_methods": [
    {
      "access_url": {
        "url": "https://portal.nersc.gov/project/m3408/meta/fix_biosample_insdc_ids.json"
      },
      "region": None,
      "type": "https"
    }
  ],
  "checksums": [
    {
      "checksum": "8aca72ffe32265e2c2a6a4de9ae47a53",
      "type": "md5"
    }
  ],
  "created_time": "2021-10-13T23:34:13.740Z",
  "size": 47968,
  "updated_time": "2021-10-13T23:34:13.740Z",
  "version": None
}

In [None]:
from nmdc_runtime.api.models.object import DrsObjectIn

DrsObjectIn(**doc)

In [None]:
# NOTE(review): by this point `drs_object_in_for` is the function imported
# from nmdc_runtime.site.drsobjects.registration (and called as one a few
# cells up), so `.values()` raises AttributeError. This looks like a leftover
# from when the name held a url -> {"result" | "error": ...} mapping.
# TODO confirm which URLs were intended and rebuild that mapping.
all("result" in v for v in drs_object_in_for.values())

In [None]:
# Create an object through the API for every entry that has no "error",
# recording the HTTP status code per URL.
# NOTE(review): `drs_object_in_for` is the function imported from
# nmdc_runtime.site.drsobjects.registration in an earlier cell, so `.items()`
# raises AttributeError as written. The loop expects a mapping of
# url -> {"result" | "error": ...}. TODO confirm the intended URLs and build
# that mapping first.
create_drs_object_response = {}
for url, doc in tqdm(list(drs_object_in_for.items())):
    if "error" in doc:
        continue
    
    drs_object_in = doc["result"]
    rv = client.create_object(
        json.loads(drs_object_in.json(exclude_unset=True)))
    create_drs_object_response[url] = rv.status_code

In [None]:
all(v == 201 for v in create_drs_object_response.values())

### Stage ~80GB of annotations metadata locally

`https://data.microbiomedata.org/data/` maps to `/project/projectdirs/m3408/ficus/pipeline_products/` on NERSC CFS. 

`https://portal.nersc.gov/project/m3408/` maps to `/project/projectdirs/m3408/www/` on NERSC CFS.

Goal: transfer all of the files behind `anno_additions_urls + anno_fixes_urls` to my local system via Globus.

1. cp relevant files to a common folder on NERSC CFS
2. gzip all of the files
3. initiate xfer

In [None]:
us = [u.replace("https://data.microbiomedata.org/data/","/project/projectdirs/m3408/ficus/pipeline_products/") for u in anno_additions_urls]

In [None]:
"cp --parents " + " ".join(us) + " /project/projectdirs/m3408/xfer-staging-area/"

In [None]:
us = [u.replace("https://portal.nersc.gov/project/m3408/","/project/projectdirs/m3408/www/") for u in anno_fixes_urls]

In [None]:
"cp --parents " + " ".join(us) + " /project/projectdirs/m3408/xfer-staging-area/"

Downloaded all to `/Users/dwinston/nmdc_files/2021-09-scanon-meta/`.

In [None]:
import json

# URL prefix -> local directory holding the downloaded copy of that tree.
prefixes_url_to_local = {
    "https://data.microbiomedata.org/data/": "/Users/dwinston/nmdc_files/2021-09-scanon-meta/ficus/pipeline_products/",
    "https://portal.nersc.gov/project/m3408/": "/Users/dwinston/nmdc_files/2021-09-scanon-meta/www/",
}


def load_local_json(url):
    """Load the locally staged JSON copy of ``url``.

    The first key of ``prefixes_url_to_local`` that ``url`` starts with is
    swapped for its local directory. A ``url`` matching no prefix is used as
    a filesystem path unchanged.

    Args:
        url: a URL under one of the known prefixes, or a plain path.

    Returns:
        The parsed JSON content of the file.

    Raises:
        OSError: if the resulting path cannot be opened.
        json.JSONDecodeError: if the file is not valid JSON.
    """
    path = url
    for before, after in prefixes_url_to_local.items():
        # Rewrite only a leading prefix; str.replace would also rewrite the
        # same text appearing later in the string.
        if url.startswith(before):
            path = after + url[len(before):]
            break
    # Read as UTF-8 explicitly rather than relying on the locale encoding.
    with open(path, encoding="utf-8") as f:
        return json.load(f)

### validate and tag

In [None]:
# Look up the id of the object registered for each URL (matched on the
# access-method URL), keyed by URL.
drs_object_id = {}
for url in tqdm(urls):
    doc = mdb.objects.find_one({"access_methods.access_url.url": url}, ["id"])
    if doc is None:
        # No matching document: name the URL instead of failing with an
        # opaque TypeError on doc["id"].
        raise LookupError(f"no object registered for URL: {url}")
    drs_object_id[url] = doc["id"]

In [None]:
# Validate each locally staged file and ensure both tags on its object.
# Responses are keyed by (drs_id, tag) so the final check covers BOTH tag
# calls; keying by drs_id alone lets the second call overwrite the first.
response = {}

# Resume marker: every URL up to and including this one is skipped, and
# processing starts with the entry after it. If the marker is not among the
# keys of drs_object_id, nothing is processed.
resume_after_url = "https://portal.nersc.gov/project/m3408/meta/anno2/503568_186507_features.json"

skip = True
for url, drs_id in tqdm(list(drs_object_id.items())):
    if url == resume_after_url:
        skip = False
        print("skipping", url, "...")
        continue
    if skip:
        continue

    print("loading bytes for", url, "...")
    docs = load_local_json(url)
    print(docs.keys())
    print("loaded. validating...")
    _ = nmdc_jsonschema_validate(docs)
    print("validated. ensuring tags...")
    for tag in ("schema#/definitions/Database", "metadata-in"):
        response[(drs_id, tag)] = client.ensure_object_tag(drs_id, tag)
    print("done with", url)

# Cell output: True when every recorded value is None or has status 200.
all(v is None or v.status_code == 200 for v in response.values())

In [None]:
# Print the id recorded for this one annotation URL, if it is present.
target_url = "https://portal.nersc.gov/project/m3408/meta/anno2/503568_186507_features.json"
if target_url in drs_object_id:
    print(drs_object_id[target_url])

In [None]:
drs_object_ids_to_ingest = list(set(drs_object_id.values()) - {"sys07d2q49"})

Claim the jobs.

In [None]:
with open("drs_object_ids_to_ingest.json","w") as f:
    json.dump(drs_object_ids_to_ingest, f)

In [None]:
with open("drs_object_ids_to_ingest.json") as f:
    drs_object_ids_to_ingest = json.load(f)

In [None]:
mdb.jobs.count_documents({
    "workflow.id": "portal-etl-1.0.0",
    "config.object_id": {"$in": drs_object_ids_to_ingest}
}) == len(drs_object_ids_to_ingest)

In [None]:
from nmdc_runtime.api.models.util import ListRequest

# Page through the API's job listing for the "portal-etl-1.0.0" jobs whose
# configured object is in the ingest list, accumulating them in `jobs`.
max_page_size = 1000
job_filter = {
    "workflow.id": "portal-etl-1.0.0",
    "config.object_id": {"$in": drs_object_ids_to_ingest}
}
lr = ListRequest(filter=json.dumps(job_filter), max_page_size=max_page_size)
jobs = []
while True:
    rv = client.list_jobs(lr.dict()).json()
    page = rv["resources"]
    jobs.extend(page)
    print("got", len(page), "jobs")

    # No token means this was the last page.
    if "next_page_token" not in rv:
        break
    lr.page_token = rv["next_page_token"]

    # Safety escape: stop once exactly one job per object has been collected.
    if len(jobs) == len(drs_object_ids_to_ingest):
        break

# Claim every listed job, then decode each claim response.
claimed_job_ops = [client.claim_job(job["id"]) for job in tqdm(jobs)]
job_ops = [claim.json() for claim in claimed_job_ops]

Do the jobs, and mark the job ops as done, giving appropriate metadata and results.

In [None]:
# Read the not-yet-done "portal-etl-1.0.0" operations for our objects
# straight from the operations collection.
pending_ops_query = {
    "metadata.job.workflow.id": "portal-etl-1.0.0",
    "metadata.job.config.object_id": {"$in": drs_object_ids_to_ingest},
    "done": False
}
job_ops = [op_doc for op_doc in mdb.operations.find(pending_ops_query)]

len(job_ops)

In [None]:
from datetime import datetime, timezone
import gc

from nmdc_runtime.api.models.job import JobOperationMetadata
from nmdc_runtime.api.models.operation import Operation
from nmdc_runtime.api.models.util import ResultT
from nmdc_runtime.api.models.operation import UpdateOperationRequest

# For each pending job operation: load the locally staged copy of its object,
# add the docs to Mongo, then mark the operation as done through the API.
op_result = {}
op_patch_result = {}

for doc in tqdm(job_ops):
    op = Operation[ResultT, JobOperationMetadata](**doc)
    # Resolve the object's first access URL; load_local_json maps it to the
    # downloaded file on disk.
    object_info = client.get_object_info(op.metadata.job.config["object_id"]).json()
    url = object_info["access_methods"][0]["access_url"]["url"]
    docs = load_local_json(url)
    # NOTE(review): validation is off here, presumably because these files
    # were validated in the tagging cell -- confirm.
    op_result[op.id] = mongo.add_docs(docs, validate=False, replace=False)
    # Drop the parsed docs and force a collection before the next file is
    # loaded (presumably to keep peak memory down; the files are large).
    del docs
    gc.collect()
    
    # NOTE(review): the done-check runs after add_docs, so docs are added even
    # for an operation already marked done -- confirm this is intended.
    if client.operation_is_done(op.id):
        print("op", op.id, "marked as done already. Skipping...")
    else:
        # The timestamp is taken per operation, at the moment it is patched.
        op_patch = UpdateOperationRequest(
            done=True,
            result=mongo_add_docs_result_as_dict(op_result[op.id]),
            metadata={"done_at": datetime.now(timezone.utc).isoformat(timespec="seconds")}
        )
        op_patch_result[op.id] = client.update_operation(op.id, op_patch).json()
        print("op", op.id, "marked as done.")

## omics processing docs

- `omics_processing_set` docs from GOLD
- many with no `has_output` relationships
- so check `was_informed_by` in `read_QC_analysis_activity_set` for `omics_processing_set` IDs
- for matches to above, add the read QC activity's `has_input` IDs to the omics processing doc's `has_output` set.

In [None]:
mdb_staging = mdb.client["nmdc_etl_staging"]

In [None]:
from toolz import dissoc

omics_processing_docs = [dissoc(d, "_id") for d in mdb_staging["gold.omics_processing_set"].find()]

In [None]:
assert len(omics_processing_docs) == 889
assert all(len(d.get("part_of", [])) <= 1 for d in omics_processing_docs)

In [None]:
omics_processing_docs_for_spruce = [d for d in omics_processing_docs if "gold:Gs0110138" in d.get("part_of" ,[])]

In [None]:
len(omics_processing_docs_for_spruce)

In [None]:
from toolz import assoc
from tqdm.notebook import tqdm

# For each omics processing doc, find the read QC activities informed by it
# and emit one copy of the doc per activity, with "has_output" set to that
# activity's single input id.
docs_to_add = {"omics_processing_set": []}
ompro_with_output = docs_to_add["omics_processing_set"]

for ompro_doc in tqdm(omics_processing_docs_for_spruce):
    project_id = ompro_doc["id"]
    cursor = mdb.read_QC_analysis_activity_set.find({"was_informed_by": project_id})
    activity_docs = [dissoc(d, "_id") for d in cursor]
    for adoc in activity_docs:
        # Each activity is expected to have exactly one input.
        assert len(adoc.get("has_input", [])) == 1
        data_object_id = adoc.get("has_input")[0]
        updated_doc = assoc(ompro_doc, "has_output", [data_object_id])
        ompro_with_output.append(updated_doc)

In [None]:
from nmdc_runtime.util import nmdc_jsonschema_validate

_ = nmdc_jsonschema_validate(docs_to_add)

In [None]:
rv = mongo.add_docs(docs_to_add)

In [None]:
rv

In [None]:
rv['omics_processing_set'].upserted_count

In [None]:
ompro_ids_not_added = {d["id"] for d in omics_processing_docs_for_spruce} - {d["id"] for d in docs_to_add["omics_processing_set"]}

In [None]:
len(ompro_ids_not_added)

In [None]:
for id_ in ompro_ids_not_added:
    print(id_)

In [None]:
import requests

rv = requests.get("https://portal.nersc.gov/project/m3408/meta/spruce-mg-mapping.txt")

In [None]:
lines = rv.text.split("\n")

In [None]:
ompro_ids = {line.split(",")[0] for line in lines if line.strip()}

In [None]:
ompro_ids & ompro_ids_not_added

In [None]:
(mdb.omics_processing_set.count_documents({"has_output.0": {"$exists": True}})
 ==
 mdb.omics_processing_set.count_documents({})
)

In [None]:
mdb.omics_processing_set.count_documents({})

# 2021-10-11 reingest of metaP data objects metadata

In [None]:
import json
from pathlib import Path

# Load the data object docs from the JSON file under the home directory.
with open(
    Path("~/Dropbox/diary/21/10/2021-09-14-stegen_emsl_analysis_data_objects.json").expanduser()
) as f:
    docs = json.loads(f.read())

In [None]:
{"data_object_set": docs}

In [None]:
[d["id"] for d in docs]

In [None]:
from bson.objectid import ObjectId

In [None]:
[ObjectId(d["id"]).generation_time for d in docs]

# 2021-10-15 registration of mongoexports

In [None]:
import json
import os

with open(os.path.expanduser("~/mongoexport/2021-10-14/drs_objects_in.json")) as f:
    drs_objects_in = json.load(f)

In [None]:
# Create one object per loaded definition, keeping each response by name.
rvs = {}
for o in drs_objects_in:
    created = client.create_object(o)
    rvs[o["name"]] = created

In [None]:
rvs

In [None]:
len(rvs)

In [None]:
# Print the "id" field of each create-object response.
for rv in rvs.values():
    created_id = rv.json()["id"]
    print(created_id)

In [None]:
names = [n for n in mdb.list_collection_names() if n.endswith("_set") and mdb[n].estimated_document_count() > 0]

In [None]:
len(names)

In [None]:
# `pprint` is not imported anywhere else in this notebook, so bring it in
# here; without it the call below raises NameError.
from pprint import pprint

from nmdc_runtime.site.backup.nmdcdb_mongoexport import collection_stats

# Pretty-print what collection_stats reports for the database.
pprint(collection_stats(mdb))