In [1]:
import os
view_server = os.environ.get("VIEW_SERVER","view-server")
url = os.environ.get("EGERIA_VIEW_SERVER_URL","https://host.docker.internal:9443")
user_id = os.environ.get("EGERIA_USER", "peterprofile")
user_pwd = os.environ.get("EGERIA_USER_PASSWORD")

from pyegeria import AutomatedCuration, ServerOps
import asyncio
import nest_asyncio
nest_asyncio.apply()

ucServerTemplateGUID="dcca9788-b30f-4007-b1ac-ec634aff6879"  
ucCatalogTemplateGUID="5ee006aa-a6d6-411b-9b8d-5f720c079cae" 
ucSchemaTemplateGUID="5bf92b0f-3970-41ea-b0a3-aacfbf6fd92e"  
ucVolumeTemplateGUID="92d2d2dc-0798-41f0-9512-b10548d312b7"  
ucTableTemplateGUID="6cc1e5f5-4c1e-4290-a80e-e06643ffb13d"   
ucFunctionTemplateGUID="a490ba65-6104-4213-9be9-524e16fed8aa"

s_client = ServerOps("integration-daemon",url,'erinoverview')


![Egeria Logo](https://raw.githubusercontent.com/odpi/egeria/main/assets/img/ODPi_Egeria_Logo_color.png)

### Egeria and Unity Catalog demo

# Onboarding data safely into Unity Catalog (UC)

## Introduction

Whether Callie can process the data and for what type of processing requires more than access control.  She has access to the data (provided by UC) because she is on the project.  There are additional legal obligations - such as who it can be shared with, when it must be deleted, the type of data protection mechanisms that must be in place.  These are partly her responsibility, and partly the responsibility of others - and this coordination across different professionals/departments/tools  is difficult in a busy organization.

## Clinical Trials

Clinical trials are used to test that new treatments are both safe and effective.  They involve taking measurements from various patients both before and after they start the treatment.  Callie has to analysis these measurements as part of the package to submit to the regulators.

The data comes in from a variety of hospitals.  It is personally sensitive to the patients, of importance to the business and subject to regulatory control and so care is needed that:

* it has been collected correctly
* it is protected at all times
* the correct data sharing agreements are in place to provide legal cover, both for the hospitals and Coco Pharmaceuticals.

In this example, there are three hospitals supplying data:

![Onboarding Process](unity-catalog-onboarding-process.png)

## Setting up the clinical trail

The first step is to create the processes that will be used by the staff during the clinical trial.  
It uses generic process steps and creates processes for the clinical trial that are initialized with all of the correct values.  
This reduces the chance that someone will use the wrong value by accident.

---

In [1]:
setUpClinicalTrialName="ClinicalTrials@CocoPharmaceuticals:set-up-clinical-trial"

setUpDataLakeGATName="ClinicalTrials@CocoPharmaceuticals:set-up-data-lake"
certifyHospitalGATName = "ClinicalTrials@CocoPharmaceuticals:certify-hospital"
onboardHospitalGATName = "ClinicalTrials@CocoPharmaceuticals:onboard-hospital"

projectGUID="a2915132-9d9a-4449-846f-43a871b5a6a0"


action_targets = [{
      "class" : "NewActionTarget",
      "actionTargetName": "clinicalTrialProject",
      "actionTargetGUID": projectGUID
    },
    {
      "class" : "NewActionTarget",
      "actionTargetName": "landingAreaConnector",
      "actionTargetGUID": "1b98cdac-dd0a-4621-93db-99ef5a1098bc"
    },
    {
      "class" : "NewActionTarget",
      "actionTargetName": "lastUpdateConnector",
      "actionTargetGUID": "fd26f07c-ae44-4bc5-b457-37b43112224f"
    }]

requestParameters = {
    "dataLakeFileTemplateGUID" : "b2ec7c9d-3462-488a-897d-8e873658dded",
    "landingAreaDirectoryTemplateGUID" : "fbdd8efd-1b69-474c-bb6d-0a304b394146",
    "landingAreaFileTemplateGUID" : "5e5ffc97-237d-46c6-95c3-49405035dedc",
}

egeria_client.initiate_gov_action_type(setUpClinicalTrialName, None, action_targets, None, requestParameters, None)


NameError: name 'ucVolumeTemplateGUID' is not defined

---

## Setting up the data lake resources in Unity Catalog (UC)

The first step in the demo is to create the schema and volume in the data lake as the destination for the files from the hospital.

----

In [3]:
egeria_client = AutomatedCuration(view_server, url, user_id, user_pwd)

token = egeria_client.create_egeria_bearer_token()

In [4]:
setUpDataLakeProcessName="ClinicalTrials:PROJ-CT-TBDF:set-up-data-lake"

dataLakeDirectoryPathName="/deployments/data/coco-data-lake/research/clinical-trials/drop-foot/weekly-measurements"
catalogGUID=xxx

action_targets = [{
      "class" : "NewActionTarget",
      "actionTargetName": "clinicalTrialProject",
      "actionTargetGUID": projectGUID
    },
    {
      "class" : "NewActionTarget",
      "actionTargetName": "dataLakeCatalog",
      "actionTargetGUID": catalogGUID
    },
    {
      "class" : "NewActionTarget",
      "actionTargetName": "lastUpdateConnector",
      "actionTargetGUID": "fd26f07c-ae44-4bc5-b457-37b43112224f"
    }]

requestParameters = {
    "dataLakeSchemaTemplateGUID" : ucSchemaTemplateGUID,
    "dataLakeSchemaName" : "teddy_bear_drop_foot,
    "dataLakeVolumeTemplateGUID" : ucVolumeTemplateGUID,
    "dataLakeVolumeDirectoryPathName" : dataLakeDirectoryPathName
}

egeria_client.initiate_gov_action_type(setUpDataLakeName, None, action_targets, None, requestParameters, None)

'163a358e-edac-4078-bbcf-22c3e32e3293'

In [None]:
s_client.refresh_integration_connectors(None)

----

This process creates a description of the volume required in Egeria.  When the Integration Connectors next refresh, the volume is pushed into Unity Catalog (UC).

## Creating the onboarding pipelines

Once the volume is in place, the next step is to create the pipelines for the three hospitals.

----

In [5]:

onboardHospitalName = "ClinicalTrials@CocoPharmaceuticals:onboard-hospital"
newFileProcessName="Coco:GovernanceActionProcess:ClinicalTrials:WeeklyMeasurements:Onboarding"
genericOnboardingProcessGUID="508d3878-8eae-47e5-8507-ee936f33b418"

oakDeneHospitalGUID="7905f803-7b7e-47c4-8b35-d0a0cfa47469"
oakDeneContactPerson="80bf48b0-5ef2-4294-950d-0c6fd568a1b2"
oldMarketHospitalGUID="fe8f4065-6664-4739-9438-3330909e6b98"
oldMarketContactPerson="fabc88d6-d28e-4e2d-9086-6affc8c45a7a"
hamptonHospitalGUID="c596f5c4-0aee-4fdc-969b-69fa26b72529"
hamptonContactPerson="e2bcf56b-f822-47d8-82fa-94cd2a5a772c"

landingAreaRootDirectoryName="landing-area"
oakDeneLandingAreaDirectoryName="landing-area/hospitals/oak-dene/clinical-trials/drop-foot"
oldMarketLandingAreaDirectoryName="landing-area/hospitals/old-market/clinical-trials/drop-foot"
hamptonLandingAreaDirectoryName="landing-area/hospitals/hampton/clinical-trials/drop-foot"



In [6]:
actionTargets = [{
      "class" : "NewActionTarget",
      "actionTargetName": "hospital",
      "actionTargetGUID": oakDeneHospitalGUID
    },
    {
      "class" : "NewActionTarget",
      "actionTargetName": "hospitalContactPerson",
      "actionTargetGUID": oakDeneContactPerson
    }]

requestParameters = {
       "landingAreaDirectoryPathName" : oakDeneLandingAreaDirectoryName,
       "destinationDirectory" : dataLakeDirectoryPathName,
       "newFileProcessName" : newFileProcessName
    }

egeria_client.initiate_gov_action_type(onboardHospitalName, None, actionTargets, None, requestParameters, None)

'ecf7abe2-272d-4625-b61d-5361ccf6a617'

In [7]:
actionTargets = [{
      "class" : "NewActionTarget",
      "actionTargetName": "hospital",
      "actionTargetGUID": oldMarketHospitalGUID
    },
    {
      "class" : "NewActionTarget",
      "actionTargetName": "hospitalContactPerson",
      "actionTargetGUID": oldMarketContactPerson
    }]

requestParameters = {
       "landingAreaDirectoryPathName" : oldMarketLandingAreaDirectoryName,
    }

egeria_client.initiate_gov_action_type(onboardHospitalName, None, actionTargets, None, requestParameters, None)

'3f75cfd1-ad5a-41c6-aa96-2daca67b9eea'

In [8]:
actionTargets2 = [
    {
      "class" : "NewActionTarget",
      "actionTargetName": "clinicalTrialProject",
      "actionTargetGUID": projectGUID
    },
    {
      "class" : "NewActionTarget",
      "actionTargetName": "hospital",
      "actionTargetGUID": hamptonHospitalGUID
    },
    {
      "class" : "NewActionTarget",
      "actionTargetName": "landingAreaConnector",
      "actionTargetGUID": "1b98cdac-dd0a-4621-93db-99ef5a1098bc"
    },
    {
      "class" : "NewActionTarget",
      "actionTargetName": "hospitalContactPerson",
      "actionTargetGUID": hamptonContactPerson
    }]

requestParameters2 = {
       "landingAreaDirectoryTemplateGUID" : landingAreaFolderTemplateGUID,
       "landingAreaDirectoryPathName" : hamptonLandingAreaDirectoryName,
       "landingAreaFileTemplateGUID" : landingAreaTemplateGUID,
       "dataLakeFileTemplateGUID" : dataLakeTemplateGUID,
       "destinationDirectory" : dataLakeDirectoryPathName,
       "newFileProcessName" : newFileProcessName
    }

egeria_client.initiate_gov_action_type(onboardHospitalName, None, actionTargets2, None, requestParameters2, None)

'f1d37353-2f39-4db5-a77c-badf290637e4'

In [None]:
s_client.refresh_integration_connectors(None)