In [151]:
import os

from dotenv import load_dotenv
load_dotenv(os.path.expanduser("~/.nmdc_mongo.env"))

from nmdc_mongo import get_db

# Database handles obtained from nmdc_mongo.get_db, keyed by database name.
# db_share is the database the rest of this notebook reads from and writes to.
db_share = get_db("dwinston_share")
# NOTE(review): db_scratch is not referenced anywhere else in the visible notebook.
db_scratch = get_db("dwinston_scratch")

In [2]:
import pickle
import os.path
from googleapiclient.discovery import build
from google_auth_oauthlib.flow import InstalledAppFlow
from google.auth.transport.requests import Request

# Pickle file in the user's home directory where get_gcloud_api_creds caches
# the authorized credentials between runs.
TOKEN_FILE = os.path.expanduser("~/token.nmdc-gcloud-api.pickle")
# Client-secrets JSON passed to InstalledAppFlow when a new authorization is needed.
CREDENTIALS_FILE = os.path.expanduser('~/nmdc-gcloud-api-credentials.json')

# OAuth scopes requested by default: read-only Sheets and read-only Drive.
# If modifying these scopes, delete the token*.pickle file.
SCOPES = [
    'https://www.googleapis.com/auth/spreadsheets.readonly',
    'https://www.googleapis.com/auth/drive.readonly'
]

def get_gcloud_api_creds(scopes=SCOPES, token_file=TOKEN_FILE, credentials_file=CREDENTIALS_FILE):
    """Return Google API credentials, reusing the cached token when possible.

    Args:
        scopes: OAuth scopes requested when a new authorization flow is run.
        token_file: Path of the pickle file used to cache the credentials.
        credentials_file: Path of the client-secrets JSON for the OAuth flow.

    Returns:
        The credentials object: the cached one if it is still valid, otherwise
        a refreshed or newly authorized one (which is then written back to
        ``token_file``).
    """
    cached = None
    if os.path.exists(token_file):
        # NOTE: pickle.load can execute arbitrary code; token_file must be a
        # file this notebook wrote itself.
        with open(token_file, 'rb') as fh:
            cached = pickle.load(fh)

    # Fast path: a usable token is already on disk.
    if cached and cached.valid:
        return cached

    if cached and cached.expired and cached.refresh_token:
        # Expired but refreshable: renew without user interaction.
        cached.refresh(Request())
        creds = cached
    else:
        # Nothing usable cached: let the user log in through a local server flow.
        creds = InstalledAppFlow.from_client_secrets_file(
            credentials_file, scopes).run_local_server(port=0)

    # Persist the credentials for the next run.
    with open(token_file, 'wb') as fh:
        pickle.dump(creds, fh)
    return creds

def get_sheet_values(sheet_id, sheet_range):
    """Fetch the cell values of a Google Sheets range.

    Args:
        sheet_id: Spreadsheet id passed as ``spreadsheetId``.
        sheet_range: Range expression passed as ``range`` (e.g. ``'Sheet1!A:E'``).

    Returns:
        A new list holding the rows under the response's ``'values'`` key, or
        an empty list (after printing a notice) when there are none.
    """
    service = build('sheets', 'v4', credentials=get_gcloud_api_creds())
    response = (
        service.spreadsheets()
        .values()
        .get(spreadsheetId=sheet_id, range=sheet_range)
        .execute()
    )
    values = response.get('values', [])

    if values:
        # Shallow copy, so callers never hold the response's own list.
        return list(values)

    print('No data found.')
    return []

In [3]:
# Columns A-E of Sheet1; a later cell treats row 0 as the header and reads the
# remaining rows by position (omics id, omics type, sample name, sample id, new flag).
rows = get_sheet_values(sheet_id='1nZOJYiC2QN0hOn5nDj9y9mteWeGyzQQls17zH5mESww', sheet_range='Sheet1!A:E')

In [6]:
import io

from googleapiclient.http import MediaIoBaseDownload


def get_file(file_id):
    """Download a Google Drive file into memory.

    Args:
        file_id: Drive file id passed as ``fileId`` to ``files().get_media``.

    Returns:
        An ``io.BytesIO`` holding the downloaded bytes. The stream position is
        wherever the downloader left it, so read the content with
        ``getvalue()`` (or seek to 0 first).
    """
    drive = build('drive', 'v3', credentials=get_gcloud_api_creds())
    media_request = drive.files().get_media(fileId=file_id)

    buffer = io.BytesIO()
    downloader = MediaIoBaseDownload(buffer, media_request)

    # Pull chunks until the downloader reports completion, echoing progress.
    while True:
        status, done = downloader.next_chunk()
        print("Download %d%%." % int(status.progress() * 100))
        if done is not False:
            break

    return buffer


In [24]:
import json
from pprint import pprint

# Download the sample template from Drive and decode it as UTF-8 text.
# Leaving the with-block closes the in-memory buffer.
with get_file('1XoSHcImd9LRlZb2nYNWucGtTzgqmdMd0') as f:
    s = f.getvalue().decode("utf-8")

try:
    stegen_sample_template = json.loads(s)
except json.JSONDecodeError:
    # The text did not parse as JSON: apply these textual repairs, in order,
    # and parse again (newlines removed, bare $BIOSAMPLE_ID quoted, curly
    # opening quotes turned into straight quotes).
    repaired = s
    for broken, fixed in (
        ('\n', ''),
        ("$BIOSAMPLE_ID", '"$BIOSAMPLE_ID"'),
        ("“", '"'),
    ):
        repaired = repaired.replace(broken, fixed)
    stegen_sample_template = json.loads(repaired)

Download 100%.


In [64]:
import re

# Identifiers of the form "Gb" followed by one or more digits.
gold_pattern = re.compile(r"Gb\d+")

def prefix_sample_id(s):
    """Return ``s`` with a namespace prefix, unless it already has one.

    A string containing ":" is returned unchanged. Otherwise it gets "gold:"
    when the whole string matches ``gold_pattern``, and "emsl:" in every
    other case.
    """
    if ":" in s:
        return s
    namespace = "gold:" if gold_pattern.fullmatch(s) else "emsl:"
    return namespace + s

In [184]:
# One record per data row of the sheet; columns are read by position.
omics = []
for i, row in enumerate(rows):
    if i > 0:  # index 0 is the header row
        record = {
            "omics_id": row[0],
            "omics_type": row[1],
            "sample_name": row[2],
            "sample_id": prefix_sample_id(row[3]),
        }
        # The fifth column is optional; only the literal "TRUE" marks a new sample.
        record["new"] = len(row) > 4 and row[4] == "TRUE"
        omics.append(record)

In [94]:
# Ids of biosample_set documents whose id appears among the sheet's sample ids.
sheet_sample_ids = [o["sample_id"] for o in omics]
existing_cursor = db_share.biosample_set.find({"id": {"$in": sheet_sample_ids}}, ["id"])
existing_ids = [d["id"] for d in existing_cursor]

In [95]:
# Sanity check: the sample ids flagged "new" in the sheet must be exactly the
# sheet's sample ids that were not found in biosample_set.
assert {o["sample_id"] for o in omics if o["new"]} == {o["sample_id"] for o in omics} - set(existing_ids)

In [96]:
from toolz import assoc_in, get_in

def transform_in(doc, keys, fn):
    """Return the result of ``assoc_in`` with the value at ``keys`` replaced by ``fn(value)``.

    Args:
        doc: Nested mapping to read from.
        keys: Key path handed to ``get_in`` / ``assoc_in``.
        fn: One-argument callable applied to the current value at ``keys``
            (whatever ``get_in`` returns for that path).
    """
    return assoc_in(doc, keys, fn(get_in(keys, doc)))

In [97]:
def fill_template(template, sample_id, sample_name):
    """Instantiate the sample template for one sample.

    Sets ``id`` to ``sample_id`` and replaces every ``$BIOSAMPLE_NAME``
    placeholder in ``identifier.has_raw_value`` and in ``name`` with
    ``sample_name``.

    Returns:
        The filled-in document produced by ``assoc_in`` / ``transform_in``.
    """
    def substitute_name(text):
        return text.replace("$BIOSAMPLE_NAME", sample_name)

    doc = assoc_in(template, ["id"], sample_id)
    for path in (["identifier", "has_raw_value"], ["name"]):
        doc = transform_in(doc, path, substitute_name)
    return doc

In [114]:
def term_subdocs_to_id_strings(doc):
    """Replace each ``{"term": {"id": ...}}`` sub-document's term with its id string.

    For every top-level key whose value is a dict holding a ``"term"`` dict
    with an ``"id"`` key, the ``"term"`` value becomes that id. Everything
    else is left as it is.

    A ``"term"`` that is not a dict (for example one that is already an id
    string) is skipped. Without that check, ``"id" in term`` is a substring
    test on strings, so a term such as ``"ENVO:acid"`` would be selected and
    then fail on ``term["id"]``.

    Args:
        doc: Mapping to convert; it is not mutated.

    Returns:
        ``doc`` itself when nothing needs converting, otherwise a new dict
        with shallow-copied, converted sub-documents.
    """
    flattened = {
        key: {**sub, "term": sub["term"]["id"]}
        for key, sub in doc.items()
        if isinstance(sub, dict)
        and isinstance(sub.get("term"), dict)
        and "id" in sub["term"]
    }
    if not flattened:
        return doc
    # Every key in `flattened` already exists in `doc`, so key order is preserved.
    return {**doc, **flattened}

In [123]:
# sample_id -> sample_name for every sheet row flagged as new
# (a repeated sample_id keeps the name from its last row).
new_samples = {}
for o in omics:
    if not o["new"]:
        continue
    new_samples[o["sample_id"]] = o["sample_name"]

# One biosample document per new sample: fill the template, then collapse
# term sub-documents to their id strings.
docs = []
for sample_id, sample_name in new_samples.items():
    doc = term_subdocs_to_id_strings(
        fill_template(stegen_sample_template, sample_id, sample_name)
    )
    docs.append(doc)

In [144]:
def compare_doc_to_mongo_collection_validator(mongo_collection, doc):
    """Print each field of ``doc`` next to the collection validator's spec for it.

    Fields are reported in sorted key order. Each report shows the field's
    value from ``doc``, then the matching entry of
    ``validator_for(mongo_collection)['properties']``, or a ``NO SPEC FOR``
    line when the validator has no entry for that key.

    Args:
        mongo_collection: Collection handed to ``validator_for``.
        doc: Mapping with string keys to compare against the validator.

    Returns:
        None; the comparison is written to stdout.
    """
    properties = validator_for(mongo_collection)['properties']
    for k in sorted(doc):
        print("\n#####"+k+"\n")
        print("doc value:")
        # Show the field's value; printing the key again would repeat the header.
        pprint(doc[k])
        print()
        print("validator spec:")
        if k in properties:
            pprint(properties[k])
        else:
            print(f"NO SPEC FOR {k}")

The Mongo collection schema needs to be updated to account for the updated fields in nmdc.schema.json (here, `ControlledTermValue.term` becomes a plain string):

In [160]:
from nmdc_mongo import dbschema, validator_for, collschemas_for
from nmdc_mongo.admin import admin_client, reset_database_schema

# ControlledTermValue.term is stored as a plain id string in this database, so
# declare it as a string in the schema used for validation.
dbschema = assoc_in(dbschema, ["definitions", "ControlledTermValue", "properties", "term", "type"], "string")
# Drop the "$ref" that previously described `term`. pop(..., None) rather than
# `del`, because `dbschema` is rebound above: on a re-run of this cell the key
# is already gone and `del` would raise KeyError.
# assoc_in builds new dicts along the path, so this edits the copy, not the
# schema object imported from nmdc_mongo.
dbschema["definitions"]["ControlledTermValue"]["properties"]["term"].pop("$ref", None)
collschemas = collschemas_for(dbschema)

# Apply the regenerated collection schemas to biosample_set in the shared database.
targetdb_as_admin = admin_client["dwinston_share"]
target_collection_names = ["biosample_set"]
reset_database_schema(targetdb_as_admin, target_collection_names, collschemas)

updating biosample_set


In [161]:
import jsonschema

# Validate all new biosample documents against the locally modified schema
# before writing them; jsonschema.validate raises if they do not conform.
jsonschema.validate({"biosample_set": docs}, schema=dbschema)

In [162]:
from pymongo import ReplaceOne

# Write every new biosample in one bulk call: replace the document with the
# same "id" if one exists, otherwise insert it (upsert=True).
rv = db_share.biosample_set.bulk_write([ReplaceOne({"id": doc["id"]}, doc, upsert=True) for doc in docs])

In [163]:
# Number of documents the bulk write inserted through upsert.
rv.upserted_count

35

# Second checklist item of GH Issue 252

In [185]:
def _with_emsl_prefix(s):
    """Return ``s`` unchanged if it contains ":", otherwise prefixed with "emsl:"."""
    return s if ":" in s else "emsl:"+s

# Namespace every omics id; already-prefixed ids pass through, so re-running
# this cell does not prefix them twice.
omics = [transform_in(record, ["omics_id"], _with_emsl_prefix) for record in omics]

In [186]:
omics_ids = [o["omics_id"] for o in omics]

# Ids of omics_processing_set documents matching the sheet's omics ids;
# only the "id" field is requested.
found_cursor = db_share.omics_processing_set.find({"id": {"$in": omics_ids}}, ["id"])
found_omics_ids = [d["id"] for d in found_cursor]

In [187]:
# Every omics id from the sheet must already exist in omics_processing_set.
assert set(omics_ids) == set(found_omics_ids)

In [191]:
# Index the sheet records by omics id for lookup while updating documents.
# If an omics id occurs more than once, the last record wins.
omics_updates = {}
for o in omics:
    omics_updates[o["omics_id"]] = o

In [198]:
from toolz import dissoc

# omics id -> {"from": stored type, "to": sheet type} for each document whose
# stored omics type differs from the sheet; kept for inspection only.
replacing_omics_type = {}

# One ReplaceOne operation per matched omics_processing_set document.
requests = []

for doc in db_share.omics_processing_set.find({"id": {"$in": omics_ids}}):
    omics_type = get_in(["omics_type", "has_raw_value"], doc)
    updates = omics_updates[doc["id"]]
    if omics_type != updates["omics_type"]:
        replacing_omics_type[doc["id"]] = {"from": omics_type, "to": updates["omics_type"]}
    # Set the omics type from the sheet and point has_input at the sheet's sample id.
    doc = assoc_in(doc, ["omics_type", "has_raw_value"], updates["omics_type"])
    doc = assoc_in(doc, ["has_input"], [updates["sample_id"]])
    # The replacement is matched on "id" and sent without the "_id" field.
    requests.append(ReplaceOne({"id": doc["id"]}, dissoc(doc, "_id")))

In [199]:
# Inspect which omics types would change; an empty dict means none differ.
replacing_omics_type

{}

In [201]:
# Number of replacement operations queued for the bulk write.
len(requests)

434

In [202]:
# Apply all queued replacements to omics_processing_set in one bulk call.
rv = db_share.omics_processing_set.bulk_write(requests)

In [203]:
# Number of documents the bulk write modified; compare with len(requests).
rv.modified_count

434