In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
from dotenv import load_dotenv

load_dotenv("../../.env.localhost")

True

# S3 init

In [3]:
from functools import lru_cache
import os

import boto3

API_SITE_BUCKET = os.getenv("API_SITE_ID")


@lru_cache
def get_s3_client():
    """Return the shared S3 client for the object store.

    Region, endpoint URL and access keys are read from the ``DO_*``
    environment variables. ``lru_cache`` memoizes the result, so the client
    is built on the first call and the same object is returned afterwards.
    """
    connection_settings = dict(
        region_name=os.getenv("DO_REGION_NAME"),
        endpoint_url=os.getenv("DO_ENDPOINT_URL"),
        aws_access_key_id=os.getenv("DO_SPACES_KEY"),
        aws_secret_access_key=os.getenv("DO_SPACES_SECRET"),
    )
    return boto3.session.Session().client("s3", **connection_settings)


def presigned_url_to_put(
    key, client=None, mime_type=None, bucket=API_SITE_BUCKET, expires_in=300
):
    """Return a presigned URL for uploading one object with HTTP PUT.

    Args:
        key: Object key within ``bucket``.
        client: S3 client used for signing. Defaults to the cached client
            from ``get_s3_client()``.
        mime_type: Content type the upload must declare. When ``None``, no
            ``ContentType`` is included in the signed parameters.
        bucket: Target bucket name.
        expires_in: Lifetime of the URL in seconds.

    Returns:
        The URL produced by ``client.generate_presigned_url``.
    """
    # ``client`` used to be dereferenced unconditionally, so relying on its
    # default of ``None`` raised AttributeError.
    if client is None:
        client = get_s3_client()
    params = {"Bucket": bucket, "Key": key}
    # botocore validates parameter types and rejects a ``None`` ContentType,
    # so only sign that header when a MIME type was actually supplied.
    if mime_type is not None:
        params["ContentType"] = mime_type
    return client.generate_presigned_url(
        ClientMethod="put_object",
        Params=params,
        ExpiresIn=expires_in,
    )

In [4]:
s3client = get_s3_client()

In [5]:
# List every bucket ("space") returned for these credentials.
buckets_response = s3client.list_buckets()
for space in buckets_response['Buckets']:
    print(space['Name'])
print(f"\nusing Bucket {API_SITE_BUCKET}\n")
objects_response = s3client.list_objects(Bucket=API_SITE_BUCKET)
# The response for an empty bucket has no 'Contents' key; default to an empty
# list instead of raising KeyError.
for obj in objects_response.get('Contents', []):
    print(obj['Key'])
# A single list_objects call returns at most one page of keys; say so rather
# than silently showing a partial listing.
if objects_response.get('IsTruncated'):
    print("\n(listing truncated; more objects exist)")

nmdc-runtime
polyneme

using Bucket nmdc-runtime

do/03c3-5kqv-57
do/08j73-wn755
do/22e0-wqt8-69
do/4kr3-xa85-27
do/5acs-zdnm-73
do/5f0b-pbs6-30
do/720v-9dag-24
do/85k8-agkq-63
do/8a8w-s0qw-11
do/8e7t-nd51-05
do/acvg-nm6k-61
do/aea1-qvxa-25
do/azs9-t5dd-49
do/c4n1-9a3d-06
do/ewfm-fh80-44
do/ges8-mq5d-58
do/gygb-zv5w-13
do/j9cm-9hx6-69
do/pjwn-8yhy-54
do/qzna-7hxj-86
do/s50g-0rq0-58
do/veh6-4naq-38
do/y2s2-p1ag-52
do/yqm2-ehg2-11
gold_etl/nmdc_database.json.zip
test/test.txt


# Site Client Session

In [6]:
from datetime import timedelta
import os
import time

import requests
from toolz import merge

from nmdc_runtime.api.core.util import expiry_dt_from_now, has_passed
from nmdc_runtime.api.models.operation import ListOperationsRequest, ListOperationsResponse

class SiteClientSession:
    """HTTP client that talks to the runtime API on behalf of a site.

    The session obtains a bearer token with the ``client_credentials`` grant
    when it is constructed and re-fetches it, via ``ensure_token``, before any
    request made after ``refresh_token_after``.

    Note that the constructor defaults are read from the environment once,
    when this class statement executes, not on every instantiation.
    """

    def __init__(
        self, base_url=os.getenv("API_HOST"), site_id=os.getenv("API_SITE_ID"),
        client_id=os.getenv("API_SITE_CLIENT_ID"),
        client_secret=os.getenv("API_SITE_CLIENT_SECRET")
    ):
        """Store the connection settings and fetch the first token.

        Args:
            base_url: API root; request paths are appended to it verbatim.
            site_id: Site identifier used in the ``/sites/{site_id}:...`` paths.
            client_id: Client id sent to ``/token``.
            client_secret: Client secret sent to ``/token``.

        Raises:
            Exception: If the token response has no ``access_token``.
        """
        self.base_url = base_url
        self.site_id = site_id
        self.client_id = client_id
        self.client_secret = client_secret
        self.headers = {}
        self.token_response = None
        self.refresh_token_after = None
        self.get_token()

    def _request(self, method, url_path, params_or_json_data=None):
        """Send an authenticated request and return the ``requests`` response.

        ``params_or_json_data`` is sent as query parameters for GET and as a
        JSON body for every other method.
        """
        self.ensure_token()
        kwargs = {
            "url": self.base_url + url_path,
            "headers": self.headers
        }
        if method.upper() == "GET":
            kwargs["params"] = params_or_json_data
        else:
            kwargs["json"] = params_or_json_data
        return requests.request(method, **kwargs)

    def get_token(self):
        """Fetch a new access token and update the ``Authorization`` header.

        Raises:
            Exception: If the response body has no ``access_token`` key.
        """
        rv = requests.post(self.base_url + "/token", data={
            "grant_type": "client_credentials",
            "client_id": self.client_id,
            "client_secret": self.client_secret,
        })
        self.token_response = rv.json()
        if "access_token" not in self.token_response:
            raise Exception(f"Getting token failed: {self.token_response}")

        self.headers["Authorization"] = f'Bearer {self.token_response["access_token"]}'
        # Schedule the refresh 5 seconds ahead of the computed expiry so a
        # token is not used right at its deadline.
        self.refresh_token_after = expiry_dt_from_now(**self.token_response["expires"]) - timedelta(seconds=5)

    def ensure_token(self):
        """Fetch a new token if ``refresh_token_after`` has passed."""
        if has_passed(self.refresh_token_after):
            self.get_token()

    def put_object_in_site(self, object_in):
        """POST ``object_in`` to the site's ``:putObject`` endpoint."""
        return self._request("POST", f"/sites/{self.site_id}:putObject", object_in)

    def get_site_object_link(self, access_method):
        """POST ``access_method`` to the site's ``:getObjectLink`` endpoint."""
        return self._request("POST", f"/sites/{self.site_id}:getObjectLink", access_method)

    def update_operation(self, op_id, op_patch):
        """PATCH the operation ``op_id`` with the JSON document ``op_patch``."""
        return self._request("PATCH", f"/operations/{op_id}", op_patch)

    def list_operations(self, req):
        """Return the resources from every page of ``GET /operations``.

        Args:
            req: Query parameters for the first page (for example a
                ``"filter"``). It is not modified.

        Returns:
            A list with the ``resources`` of all pages, in page order.

        Pages are followed with a loop rather than recursion, so a result set
        spanning many pages cannot hit the interpreter's recursion limit and
        the combined list is not re-copied for every page.
        """
        resources = []
        page_req = req
        while True:
            rv = self._request("GET", "/operations", page_req)
            lor = ListOperationsResponse(**rv.json())
            resources.extend(lor.resources)
            if not lor.next_page_token:
                return resources
            page_req = merge(req, {"page_token": lor.next_page_token})

In [7]:
session = SiteClientSession()

In [8]:
import json

# Fetch the operations that are not done yet and convert each one to a plain dict.
not_done_filter = json.dumps({"done": False})
pending_ops = session.list_operations({"filter": not_done_filter})
op_docs = [op.dict() for op in pending_ops]

In [9]:
len(op_docs)

198

In [10]:
from pprint import pprint

pprint(op_docs[0])

{'done': False,
 'expire_time': datetime.datetime(2021, 7, 23, 16, 25, 29, 753000),
 'id': '04q2-2jj2-15',
 'metadata': {'expires_in_seconds': 300,
              'model': 'nmdc_runtime.api.models.operation.ObjectPutMetadata',
              'object_id': 'hhjw-88ex-68',
              'site_id': 'nmdc-runtime',
              'url': 'https://nyc3.digitaloceanspaces.com/nmdc-runtime/do/hhjw-88ex-68?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Credential=3YNLWOW2XZ2PLSB56PR2%2F20210623%2Fnyc3%2Fs3%2Faws4_request&X-Amz-Date=20210623T162029Z&X-Amz-Expires=300&X-Amz-SignedHeaders=content-type%3Bhost&X-Amz-Signature=ffeec8cbb2d3d72ac0e2d631aed48fde3daf51ddcbe44de441ecb4ede5c4f104'},
 'result': None}


In [13]:
!echo '{"hello": "kjiersten"}' > test.json

In [14]:
from datetime import datetime, timezone
import mimetypes
import os
from pathlib import Path

from nmdc_runtime.api.core.util import sha256hash_from

def drs_metadata_for(path, base=None):
    """Build DRS metadata for the file at ``path``.

    Required by DRS: size, created_time, and at least one checksum.

    Args:
        path: Path of an existing file.
        base: Optional dict of metadata that is already known. Keys present
            in ``base`` are kept as they are; only missing ones are computed.
            ``base`` itself is not modified.

    Returns:
        A new dict containing everything in ``base`` plus ``size``,
        ``created_time``, ``checksums``, ``mime_type`` and ``name``.
    """
    # Work on a copy: the caller's dict used to be filled in place, which
    # leaked computed values into any dict that was reused between calls.
    metadata = {} if base is None else dict(base)
    if "size" not in metadata:
        metadata["size"] = os.path.getsize(path)
    if "created_time" not in metadata:
        # NOTE: getctime is the creation time on Windows but the last
        # metadata-change time on Unix.
        metadata["created_time"] = datetime.fromtimestamp(os.path.getctime(path), tz=timezone.utc)
    if "checksums" not in metadata:
        metadata["checksums"] = [{"type": "sha-256", "checksum": sha256hash_from(path)}]
    if "mime_type" not in metadata:
        # guess_type returns None for the type when the extension is unknown.
        metadata["mime_type"] = mimetypes.guess_type(path)[0]
    if "name" not in metadata:
        metadata["name"] = Path(path).name
    return metadata

In [15]:
# Issue 100 putObject requests for the site; `rv` ends up holding only the
# response to the last one.
put_request = {"mime_type": "application/json", "name": "test.json"}
for _ in range(100):
    rv = session.put_object_in_site(put_request)

In [16]:
op = rv.json()
op

{'id': 'qzbe-h9hb-53',
 'done': False,
 'expire_time': '2021-07-23T17:00:11.175664+00:00',
 'result': None,
 'metadata': {'model': 'nmdc_runtime.api.models.operation.ObjectPutMetadata',
  'object_id': '2wem-dzg1-84',
  'site_id': 'nmdc-runtime',
  'url': 'https://nyc3.digitaloceanspaces.com/nmdc-runtime/do/2wem-dzg1-84?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Credential=3YNLWOW2XZ2PLSB56PR2%2F20210623%2Fnyc3%2Fs3%2Faws4_request&X-Amz-Date=20210623T165511Z&X-Amz-Expires=300&X-Amz-SignedHeaders=content-type%3Bhost&X-Amz-Signature=b09996591df81ed1c5bc2823f39bcac123f78dc34ed47166ff5ebdf2620c8dd7',
  'expires_in_seconds': 300}}

In [17]:
import mimetypes
import requests

def put_object(filepath, url, mime_type=None):
    """Upload a local file to ``url`` with an HTTP PUT.

    Args:
        filepath: Path of the file to upload.
        url: Destination URL (for example a presigned put-object URL).
        mime_type: Value for the ``Content-Type`` header. When ``None`` it is
            guessed from the file name.

    Returns:
        The ``requests`` response of the PUT.
    """
    if mime_type is None:
        mime_type = mimetypes.guess_type(filepath)[0]
    # Binary mode: the bytes on disk are streamed unchanged. Text mode would
    # decode the content first, which fails or corrupts non-text files.
    with open(filepath, "rb") as f:
        return requests.put(url, data=f, headers={"Content-Type": mime_type})

In [18]:
rv = put_object("test.json", op["metadata"]["url"])
rv

<Response [200]>

In [19]:
rv = session.get_site_object_link({"access_id": op["metadata"]["object_id"]})

In [20]:
rv.json()

{'headers': None,
 'url': 'https://nyc3.digitaloceanspaces.com/nmdc-runtime/do/2wem-dzg1-84?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Credential=3YNLWOW2XZ2PLSB56PR2%2F20210623%2Fnyc3%2Fs3%2Faws4_request&X-Amz-Date=20210623T165613Z&X-Amz-Expires=300&X-Amz-SignedHeaders=host&X-Amz-Signature=f142908283febdbaf9d567df74f039fcc11846c490b3bbf619ef5ae2498df5e5'}

In [21]:
# Download the object through the "url" of the link response held in `rv`.
# NOTE(review): this rebinds `rv` to the download response, so re-running this
# cell on its own would read "url" from the downloaded content instead.
rv = requests.get(rv.json()["url"])

In [22]:
rv.content

b'{"hello": "kjiersten"}\n'

In [23]:
rv.json()

{'hello': 'kjiersten'}

In [None]:
# TODO: Add a SiteClientSession method that uses /objects/{object_id}/access/{access_id},
#   ensuring the object has an 'nmdc-runtime' access_id.

In [24]:
from nmdc_runtime.api.models.object import DrsObjectIn


# Compose the access id as "<site_id>:<object_id>" from the put-object operation.
op_metadata = op["metadata"]
access_id = f'{op_metadata["site_id"]}:{op_metadata["object_id"]}'
# Seed the metadata with the access method; the remaining fields are computed from test.json.
known_metadata = {"access_methods": [{"access_id": access_id}]}
drs_obj_in = DrsObjectIn(**drs_metadata_for("test.json", known_metadata))

In [25]:
from pprint import pprint

pprint(drs_obj_in.dict(exclude_unset=True))

{'access_methods': [{'access_id': 'nmdc-runtime:2wem-dzg1-84'}],
 'checksums': [{'checksum': '1dd4fe72fa34e037d051de94fb7e13ddf8517867a322d00899a4a2c89367bbf6',
                'type': 'sha-256'}],
 'created_time': datetime.datetime(2021, 6, 23, 16, 54, 44, 854174, tzinfo=datetime.timezone.utc),
 'mime_type': 'application/json',
 'name': 'test.json',
 'size': 23}


In [26]:
import json
from pprint import pprint

op_patch = {"done": True, "result": json.loads(drs_obj_in.json(exclude_unset=True))}
pprint(op_patch)

{'done': True,
 'result': {'access_methods': [{'access_id': 'nmdc-runtime:2wem-dzg1-84'}],
            'checksums': [{'checksum': '1dd4fe72fa34e037d051de94fb7e13ddf8517867a322d00899a4a2c89367bbf6',
                           'type': 'sha-256'}],
            'created_time': '2021-06-23T16:54:44.854174+00:00',
            'mime_type': 'application/json',
            'name': 'test.json',
            'size': 23}}


In [27]:
rv = session.update_operation(op["id"], op_patch)

In [28]:
pprint(rv.json())

{'done': True,
 'expire_time': '2021-07-23T17:00:11.175000',
 'id': 'qzbe-h9hb-53',
 'metadata': {'expires_in_seconds': 300,
              'model': 'nmdc_runtime.api.models.operation.ObjectPutMetadata',
              'object_id': '2wem-dzg1-84',
              'site_id': 'nmdc-runtime',
              'url': 'https://nyc3.digitaloceanspaces.com/nmdc-runtime/do/2wem-dzg1-84?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Credential=3YNLWOW2XZ2PLSB56PR2%2F20210623%2Fnyc3%2Fs3%2Faws4_request&X-Amz-Date=20210623T165511Z&X-Amz-Expires=300&X-Amz-SignedHeaders=content-type%3Bhost&X-Amz-Signature=b09996591df81ed1c5bc2823f39bcac123f78dc34ed47166ff5ebdf2620c8dd7'},
 'result': {'access_methods': [{'access_id': 'nmdc-runtime:2wem-dzg1-84'}],
            'checksums': [{'checksum': '1dd4fe72fa34e037d051de94fb7e13ddf8517867a322d00899a4a2c89367bbf6',
                           'type': 'sha-256'}],
            'created_time': '2021-06-23T16:54:44.854174+00:00',
            'mime_type': 'application/json',
       

In [None]:
import json

from nmdc_runtime.api.core.util import dotted_path_for
from nmdc_runtime.api.models.operation import ObjectPutMetadata

# Select operations that are done and whose metadata model is ObjectPutMetadata.
done_put_filter = json.dumps({
    "done": True,
    "metadata.model": dotted_path_for(ObjectPutMetadata),
})
done_put_ops = session.list_operations({"filter": done_put_filter})
op_docs = [op.dict() for op in done_put_ops]

In [None]:
pprint(op_docs[0])

In [None]:
# TODO: Add a dagster sensor that checks for done:true operations with metadata.model:ObjectPutMetadata.
# - The sensor can check each op_id to see whether it has already been processed.
# - The sensor issues list_operations with a filter and can page through the results.
# - The sensor assumes that expired operations are periodically removed from the underlying
#   resource store (so that list_operations doesn't return too much).

# User Client Session

In [None]:
import os

import requests

from nmdc_runtime.api.core.util import expiry_dt_from_now, has_passed

class UserClientSession:
    """HTTP client that talks to the runtime API on behalf of a user.

    The session obtains a bearer token with the ``password`` grant when it is
    constructed and re-fetches it, via ``ensure_token``, before any request
    made after ``refresh_token_after``.

    Note that the constructor defaults are read from the environment once,
    when this class statement executes, not on every instantiation.
    """

    def __init__(
        self, base_url=os.getenv("API_HOST"),
        username=os.getenv("API_ADMIN_USER"),
        password=os.getenv("API_ADMIN_PASS")
    ):
        """Store the connection settings and fetch the first token.

        Args:
            base_url: API root; request paths are appended to it verbatim.
            username: User name sent to ``/token``.
            password: Password sent to ``/token``.

        Raises:
            Exception: If the token response has no ``access_token``.
        """
        self.base_url = base_url
        self.username = username
        self.password = password
        self.headers = {}
        self.token_response = None
        self.refresh_token_after = None
        self.get_token()

    def _post(self, url_path, json_body=None):
        """POST ``json_body`` as JSON to ``url_path`` with the bearer token."""
        self.ensure_token()
        return requests.post(self.base_url + url_path, json=json_body, headers=self.headers)

    def get_token(self):
        """Fetch a new access token and update the ``Authorization`` header.

        Raises:
            Exception: If the response body has no ``access_token`` key.
        """
        # Imported here so this cell does not rely on an earlier cell's import.
        from datetime import timedelta

        rv = requests.post(self.base_url + "/token", data={
            "grant_type": "password",
            "username": self.username,
            "password": self.password,
        })
        self.token_response = rv.json()
        if "access_token" not in self.token_response:
            raise Exception(f"Getting token failed: {self.token_response}")

        self.headers["Authorization"] = f'Bearer {self.token_response["access_token"]}'
        # Same 5-second safety margin as SiteClientSession, so a token is not
        # used right at its expiry deadline.
        self.refresh_token_after = (
            expiry_dt_from_now(**self.token_response["expires"]) - timedelta(seconds=5)
        )

    def ensure_token(self):
        """Fetch a new token if ``refresh_token_after`` has passed."""
        if has_passed(self.refresh_token_after):
            self.get_token()

    def create_object(self, object_in):
        """POST ``object_in`` to ``/objects`` and return the response."""
        return self._post("/objects", object_in)

In [None]:
user = UserClientSession()

In [None]:
drs_obj = drs_metadata_for("test.json")

In [None]:
drs_obj

In [None]:
# `site_id` and `object_id` are not defined as bare names anywhere in this
# notebook; take them from the put-object operation `op`, the same way the
# DrsObjectIn cell builds its access id.
drs_obj["access_methods"] = [
    {"access_id": f'{op["metadata"]["site_id"]}:{op["metadata"]["object_id"]}'}
]

In [None]:
drs_obj

In [None]:
# NOTE(review): this posts an empty JSON object, not the `drs_obj` built in the
# cells above — confirm whether that is intentional.
rv = user.create_object({})
rv

In [None]:
rv.json()

# Mongo init

In [None]:
from functools import lru_cache
import os

import pymongo.database
from pymongo import MongoClient


@lru_cache
def get_mongo_db() -> pymongo.database.Database:
    """Return the database handle used by this notebook.

    Host and credentials come from the ``MONGO_*`` environment variables and
    the database is selected by ``MONGO_DBNAME``. ``lru_cache`` memoizes the
    result, so the client is created on the first call only.
    """
    connection_settings = dict(
        host=os.getenv("MONGO_HOST"),
        username=os.getenv("MONGO_USERNAME"),
        password=os.getenv("MONGO_PASSWORD"),
    )
    connection = MongoClient(**connection_settings)
    db_name = os.getenv("MONGO_DBNAME")
    return connection[db_name]

In [None]:
mdb = get_mongo_db()

In [None]:
mdb.list_collection_names()

In [None]:
mdb.operations.find_one({"id": "r1ew-5n6n-92"})

# GSP schema  / Cordra stuff

Each result in the upload payload needs an "id", e.g.:
```
"results": [
    {
      "id": "test/activity",
      "type": "Schema",
      "content": {
        "name": "Activity",
        "schema": collschemas["activity_set"]
      }
    }
  ]
```

In [None]:
from time import time
import os

tic = time()

from dotenv import load_dotenv
load_dotenv(os.path.expanduser("~/.nmdc_mongo.env"))

In [None]:
# Resolve the schema file under the current user's home directory instead of
# hardcoding one developer's absolute path.
os.environ["NMDC_JSON_SCHEMA_FILE"] = os.path.expanduser("~/Desktop/nmdc.schema.gsp.json")

In [None]:
import json
import re
from toolz import assoc_in, dissoc
from zipfile import ZipFile

from mongospawn.schema import collschemas_for

from nmdc_mongo import (
    add_to_db,
    correct_metaP_doc,
    dbschema,
    fetch_and_validate_json,
    fetch_conform_and_persist_from_manifest,
    fetch_json,
    get_db,
    reset_database,
    snake_case_set_name
)

In [None]:
###########################
# Adjustments for GSP below
###########################

# Names of every object type defined in the schema.
defined_object_names = set(dbschema["definitions"])

# Map each object type name to the top-level set (property) whose items reference it.
set_for_object_name = {
    spec["items"]["$ref"].split("#/definitions/")[-1]: set_name
    for set_name, spec in dbschema["properties"].items()
}

existing_set_names = set(dbschema["properties"])

# Give every defined object type that has no set of its own a new array
# property, unless a property with the proposed name already exists.
for object_without_set in (defined_object_names - set(set_for_object_name.keys())):
    proposed_set_name = snake_case_set_name(object_without_set)
    if proposed_set_name not in existing_set_names:
        dbschema["properties"][proposed_set_name] = {
            "description": (f"This property links a database object to the set of"
                            f" {object_without_set} objects within it."),
            "items": {"$ref": f"#/definitions/{object_without_set}"},
            "type": "array",
        }

# Force ControlledTermValue.term to a plain string and drop its "$ref".
dbschema = assoc_in(dbschema, ["definitions", "ControlledTermValue", "properties", "term", "type"], "string")
# pop() rather than del: once the "$ref" has been removed, re-running this cell
# would otherwise fail with KeyError.
dbschema["definitions"]["ControlledTermValue"]["properties"]["term"].pop("$ref", None)

# 'k' not capitalized upstream perhaps. should conform!
#dbschema = assoc_in(dbschema, ["definitions", "MetagenomeAssembly", "properties", "scaf_l_gt50k", "type"], "number")

In [None]:
# Derive the per-collection schemas from the database schema.
collschemas = collschemas_for(dbschema)

# Rebuild the object-name -> set-name lookup from the current schema properties.
set_for_object_name = dict(
    (spec["items"]["$ref"].split("#/definitions/")[-1], set_name)
    for set_name, spec in dbschema["properties"].items()
)

In [None]:
sorted(collschemas.keys())

In [None]:
collschemas["biosample_set"]

In [None]:
import requests

In [None]:
import os

# Credentials for the service on localhost:8080 can be supplied through the
# CORDRA_USERNAME / CORDRA_PASSWORD environment variables; the fallbacks are
# the values this notebook previously hardcoded.
# NOTE(review): the fallback password is still committed in source — remove it
# once the environment variables are in place.
rv = requests.post(
    "http://localhost:8080/auth/token",
    data={
        "grant_type": "password",
        "username": os.getenv("CORDRA_USERNAME", "admin"),
        "password": os.getenv("CORDRA_PASSWORD", "nmdcrulez"),
    },
)

In [None]:
rv.json()

In [None]:
auth_header = {"Authorization": f'Bearer {rv.json()["access_token"]}'}

In [None]:
auth_header

In [None]:
# Upload the Biosample collection schema as a "Schema" object.
# Unlike the example payload in the notes at the top of this section, this
# result carries no "id".
biosample_schema_upload = {
    "results": [
        {
            "type": "Schema",
            "content": {
                "name": "Biosample",
                "schema": collschemas["biosample_set"],
            },
        }
    ]
}
rv = requests.post(
    "http://localhost:8080/uploadObjects",
    json=biosample_schema_upload,
    headers=auth_header,
)

In [None]:
rv

In [None]:
import json
import os

# Write to the current user's Desktop instead of one developer's absolute
# home-directory path.
upload_path = os.path.expanduser("~/Desktop/cordra-upload.json")
activity_schema_upload = {
    "results": [
        {
            "id": "test/activity",
            "type": "Schema",
            "content": {
                "name": "Activity",
                "schema": collschemas["activity_set"],
            },
        }
    ]
}
# The payload is built before the file is opened, so a failed schema lookup
# does not leave a truncated file behind.
with open(upload_path, "w") as f:
    json.dump(activity_schema_upload, f, indent=2)

In [None]:
import json
import os

# Write to the current user's Desktop instead of one developer's absolute
# home-directory path. This replaces any existing cordra-upload.json there.
upload_path = os.path.expanduser("~/Desktop/cordra-upload.json")
study_schema_upload = {
    "results": [
        {
            "id": "test/study",
            "type": "Schema",
            "content": {
                "name": "Study",
                "schema": collschemas["study_set"],
            },
        }
    ]
}
# The payload is built before the file is opened, so a failed schema lookup
# does not leave a truncated file behind.
with open(upload_path, "w") as f:
    json.dump(study_schema_upload, f, indent=2)

In [None]:
rv = requests.get("http://localhost:8080/search?query=type:%22Schema%22", headers=auth_header)

In [None]:
from pprint import pprint

# Start from the JSON body held in `rv` (expected to be the schema search
# response from the cell above — re-run that cell first if `rv` was rebound).
template = rv.json()
# Drop these three top-level keys from the response.
del template["pageNum"]
del template["pageSize"]
del template["size"]
# Remove the first two results; the original third result becomes results[0].
# Raises IndexError if there are fewer than two results to remove.
del template["results"][0]
del template["results"][0]
# Rewrite what is now the first result: new id, new name, new schema title,
# and no "identifier" in its content.
template["results"][0]["id"] = "test/abcd1234"
template["results"][0]["content"]["name"] = "Document2"
template["results"][0]["content"]["schema"]["title"] = "Document2"
del template["results"][0]["content"]["identifier"]

pprint(template)

In [None]:
import json
import os

# Write to the current user's Desktop instead of one developer's absolute
# home-directory path.
with open(os.path.expanduser("~/Desktop/cordra-upload.json"), "w") as f:
    json.dump(template, f, indent=2)

In [None]:
!cat /Users/dwinston/Desktop/cordra-upload.json

In [None]:
rv = requests.post("http://localhost:8080/uploadObjects", json=template, headers=auth_header)
rv