# Replicator

## 1. Context
This notebook uses the same logic as the AWS Lambda function that powers the [Replicator UI](https://ckanrepl.utl.intra.dev-toronto.ca/). It makes it easier to replicate packages between environments in bulk (rather than one by one via the UI).

In [1]:
from urllib.parse import urljoin

import requests
import ckanapi

## 2. Initialize CKAN

The `envs` dict is structured in the same manner as the `ckan_credentials_secret` variable in Airflow: each entry holds the keyword arguments (`address` and `apikey`) needed to construct a `ckanapi.RemoteCKAN` object.

In [4]:
# Connection settings per environment. Fill in "apikey" before running;
# each inner dict is passed as keyword arguments to ckanapi.RemoteCKAN.
envs = {
    "dev": {"address": "https://ckanadmin0.intra.dev-toronto.ca/", "apikey": ""},
    "qa": {"address": "https://ckanadmin0.intra.qa-toronto.ca/", "apikey": ""},
    "prod": {"address": "https://ckanadmin0.intra.prod-toronto.ca/", "apikey": ""},
}

# Swap every settings dict for a ready-to-use client, so that envs["dev"],
# envs["qa"] and envs["prod"] are RemoteCKAN objects from here on.
for env in list(envs):
    args = envs[env]
    envs[env] = ckanapi.RemoteCKAN(**args)

## 3. Replicate functionality

1. Read package
2. Prepare package (need to remove automated fields, such as date created)
3. Create (if it doesn't exist) or patch (if it does) package and resources from one CKAN env to the other
4. Clean the package from the origin CKAN. **Note**: this is not done automatically in this notebook to prevent mistakes, but the `clean` function is available.

In [8]:
def prep_package(ckan, package_name_or_id):
    """
    Read a package from `ckan` and reduce it to the fields that can be replicated.

    Returns the package, and its underlying resources, stripped of every field
    that is not whitelisted below (i.e. the automated ones) so the result can be
    used to create/patch the package in the destination CKAN.

    - ckan: object exposing `action.package_show` (e.g. ckanapi.RemoteCKAN)
    - package_name_or_id: forwarded to `package_show` as `id`

    In the returned dict:
    - every whitelisted package field is present (None if the origin lacked it)
    - "resources" is a list of dicts holding every whitelisted resource field
      (None if missing), sorted by "position"; resources without a position
      come last, in their original relative order
    - "tags" is a comma-separated string of tag names, or None if the origin
      package had no "tags" value
    """
    raw = ckan.action.package_show(id=package_name_or_id)

    package_fields = (
        "name",
        "title",
        "resources",
        "notes",
        "collection_method",
        "excerpt",
        "limitations",
        "dataset_category",
        "image_url",
        "information_url",
        "is_retired",
        "refresh_rate",
        "tags",
        "civic_issues",
        "topics",
        "owner_division",
        "owner_section",
        "owner_unit",
        "owner_email",
    )

    resource_fields = (
        "id",
        "name",
        "description",
        "datastore_active",
        "url",
        "url_type",
        "extract_job",
        "format",
        "is_preview",
        "position",
    )

    # Whitelist in one pass: unknown keys are dropped, missing ones become None.
    package = {k: raw.get(k) for k in package_fields}

    # A package without a "resources" value is treated as having no resources.
    resources = [
        {k: r.get(k) for k in resource_fields} for r in package["resources"] or []
    ]

    # None cannot be ordered against ints (or other Nones) in Python 3, so
    # resources without a position are pushed to the end instead of raising
    # TypeError; sorted() is stable, which keeps their relative order.
    package["resources"] = sorted(
        resources,
        key=lambda r: (
            r["position"] is None,
            0 if r["position"] is None else r["position"],
        ),
    )

    if package["tags"] is not None:
        package["tags"] = ",".join(x["name"] for x in package["tags"])

    return package


def _find_target_resource_id(target, name):
    """Return the id of the resource called `name` in `target["resources"]`, or None."""
    for existing in target["resources"]:
        if existing["name"] == name:
            return existing["id"]
    return None


def _fetch_datastore_records(ckan, resource_id, total):
    """
    Read `total` datastore rows of `resource_id` from `ckan`, page by page.

    A single datastore_search call is capped server-side (CKAN setting
    `ckan.datastore.search.rows_max`), so asking for `limit=total` in one call
    can silently return only part of a large table. Paging with `offset` keeps
    requesting the remainder until everything has been read.
    """
    records = []
    while len(records) < total:
        page = ckan.action.datastore_search(
            resource_id=resource_id,
            limit=total - len(records),
            offset=len(records),
            # explicit ordering so consecutive pages never overlap or skip rows
            sort="_id",
            include_total=False,
        )
        if not page["records"]:
            # fewer rows than announced (e.g. table changed meanwhile): stop
            break
        records.extend(page["records"])
    return records


def replicate(from_ckan, to_ckan, package_name_or_id, mode):
    """
    Copy a package, its resources and their data from one CKAN to another.

    - from_ckan (ckanapi.RemoteCKAN object): origin CKAN
    - to_ckan (ckanapi.RemoteCKAN object): destination CKAN
    - package_name_or_id: usually use package name, e.g. covid-19-cases-in-toronto
    - mode: this is primarily of use for the UI but since it's the same logic
        - "update": if the package already exists
        - "create": if the package is new

    Resources are matched between origin and destination by name:
    - datastore resources: the destination table is dropped (if the resource
      exists) and recreated from the origin's fields and records
    - uploaded files: downloaded from the origin and uploaded to the destination
    - anything else: the resource is created/patched pointing at the same url
    Destination resources whose name is not in the origin package are deleted.

    Returns the URL of the package page in the destination CKAN.

    Raises ValueError if `mode` is not "create" or "update", and
    requests.HTTPError if downloading or uploading a file resource fails.
    """
    # Fail before touching either CKAN; otherwise an unknown mode only surfaces
    # later as an UnboundLocalError on `target`.
    if mode not in ("create", "update"):
        raise ValueError(f"mode must be 'create' or 'update', got {mode!r}")

    package = prep_package(from_ckan, package_name_or_id)
    resources = package.pop("resources")

    package["license_id"] = "open-government-licence-toronto"

    if "tags" in package:
        # prep_package flattens tags into "a,b,c" (or None). Blank names are
        # skipped because "".split(",") yields [""], i.e. one empty tag.
        tag_names = (package["tags"] or "").split(",")
        package["tags"] = [{"name": t} for t in tag_names if t]

    if mode == "create":
        package["owner_org"] = "city-of-toronto"

        target = to_ckan.action.package_create(**package)
    else:
        package["id"] = package["name"]

        target = to_ckan.action.package_patch(**package)

    for r in resources:
        r["package_id"] = target["id"]

        # These describe the origin resource and must not be sent as-is.
        r_id = r.pop("id")
        r_url = r.pop("url")
        r_url_type = r.pop("url_type")
        is_datastore = r.pop("datastore_active")

        # None means the destination has no resource with this name yet.
        existing_id = _find_target_resource_id(target, r["name"])

        if is_datastore:
            # limit=0 returns only the field definitions and the row count
            context = from_ckan.action.datastore_search(
                resource_id=r_id, limit=0, include_total=True
            )
            records = _fetch_datastore_records(from_ckan, r_id, context["total"])

            # "_id" is dropped from both fields and rows before recreating
            p = {
                "fields": [f for f in context["fields"] if f["id"] != "_id"],
                "records": [
                    {k: v for k, v in x.items() if k != "_id"} for x in records
                ],
            }

            if existing_id is not None:
                p["resource_id"] = existing_id

                # the origin data is already in memory, so dropping is safe now
                to_ckan.action.datastore_delete(id=existing_id)
            else:
                # no resource yet: let datastore_create make it from metadata
                p["resource"] = r

            to_ckan.action.datastore_create(**p)
        elif r_url_type == "upload":
            download = requests.get(r_url)
            # never upload an HTTP error page in place of the actual file
            download.raise_for_status()

            func = "resource_create"
            if existing_id is not None:
                r["id"] = existing_id
                func = "resource_patch"

            # file extension taken from the url path, ignoring any query string
            fmt_actual = r_url.split("?")[0].split("/")[-1].split(".")[-1]
            response = requests.post(
                urljoin(to_ckan.address, "api/3/action/{0}".format(func)),
                data=r,
                headers={"Authorization": to_ckan.apikey},
                files={"upload": (r["name"] + "." + fmt_actual, download.content)},
            )
            # surface a rejected upload instead of silently moving on
            response.raise_for_status()
        else:
            r["url"] = r_url

            if existing_id is None:
                to_ckan.action.resource_create(**r)
            else:
                r["id"] = existing_id
                to_ckan.action.resource_patch(**r)

    # Remove destination resources that no longer exist in the origin package.
    kept_resources = {r["name"] for r in resources}
    for r in target["resources"]:
        if r["name"] not in kept_resources:
            to_ckan.action.resource_delete(id=r["id"])

    return urljoin(to_ckan.address, "dataset/{0}".format(target["name"]))


def clean(ckan, package_name_or_id, level='package'):
    """
    cleans either the entire package or the underlying resources in a package

    - ckan (ckanapi.RemoteCKAN object): CKAN to clean
    - package_name_or_id: package to clean
    - level:
        - "package": calls dataset_purge on the package
        - "resource": calls resource_delete on every resource of the package,
          leaving the package itself in place

    Raises ValueError for any other level, so a typo (e.g. "resources") is
    reported instead of silently doing nothing.
    """
    if level not in ('package', 'resource'):
        raise ValueError(f"level must be 'package' or 'resource', got {level!r}")

    if level == 'package':
        ckan.action.dataset_purge(id=package_name_or_id)
        return

    package = ckan.action.package_show(id=package_name_or_id)

    for r in package['resources']:
        ckan.action.resource_delete(id=r['id'])

## 4. Replicate packages

In [12]:
# Names of the packages to copy, processed in the order listed.
packages_to_replicate = [
    "motor-vehicle-collisions-involving-killed-or-seriously-injured-persons",
    "covid-19-cases-in-toronto",
    "bodysafe",
]

# Copy each package from prod to dev in "update" mode, printing a
# "position/total: name" progress line before each one.
for n, p in enumerate(packages_to_replicate):
    print(f"{n+1}/{len(packages_to_replicate)}: {p}")
    replicate(
        from_ckan=envs["prod"],
        to_ckan=envs["dev"],
        package_name_or_id=p,
        mode="update",
    )

1/3: motor-vehicle-collisions-involving-killed-or-seriously-injured-persons
2/3: covid-19-cases-in-toronto
3/3: bodysafe
