## Setup

*You must run the cells in this section each time you connect to a new runtime. For example, when you return to the notebook after an idle timeout, when the runtime crashes, or when you restart or factory reset the runtime.*

Install requirements (*Note: ocdskingfishercolab installs google-colab, which expects specific versions of pandas and numpy*):


In [None]:
# Upgrade pip itself first; its standard output is written to pip.log (overwriting any previous log).
! pip install --upgrade pip > pip.log
# Install the notebook's dependencies, appending the output to the same pip.log.
! pip install --upgrade 'ocdskingfishercolab>=0.4,<0.5' ipywidgets psycopg2-binary >> pip.log

In [None]:
# @title Import packages and load extensions { display-mode: "form" }

import gzip
import json
import os
import shutil
import tempfile
from collections import Counter
from datetime import datetime, timezone
from pathlib import Path

import numpy as np
import pandas as pd
from dateutil.relativedelta import relativedelta
from google.colab.data_table import DataTable
from google.colab.files import download
from ipywidgets import widgets
from ocdskingfishercolab import (
    authenticate_gspread,
    calculate_coverage,
    download_dataframe_as_csv,
    format_thousands,
    render_json,
    save_dataframe_to_sheet,
    save_dataframe_to_spreadsheet,
    set_dark_mode,
    set_light_mode,
)

# Load the ipython-sql extension (https://pypi.org/project/ipython-sql/); it is configured
# through the SqlMagic options set in the next cell.
%load_ext sql
# Load Colab's interactive data table extension
# (https://colab.research.google.com/notebooks/data_table.ipynb).
%load_ext google.colab.data_table

In [None]:
# @title Configure the notebook environment { display-mode: "form" }

# Increase max columns so that Pandas DataFrames with many columns (up to 50) are still rendered as data tables.
DataTable.max_columns = 50
# Remove the index from data tables for easier copy-pasting to Google Docs.
DataTable.include_index = False

# Return Pandas DataFrames instead of regular result sets from SQL queries.
%config SqlMagic.autopandas = True
# Don't print the number of rows affected.
%config SqlMagic.feedback = False

# If you set Tools > Settings > Site > Theme to dark, uncomment this line.
# set_dark_mode()
# If you are creating plots to copy-paste into reports, uncomment this line.
# set_light_mode()

## Set up downloading data from the Data Registry

In [None]:
# @title Data registry functions { display-mode: "form" }
import requests

# Root URL of the Data Registry. It ends with a slash so that paths can be appended directly.
DATA_REGISTRY_BASE_URL = "https://data.open-contracting.org/en/"
# URL of the JSON document listing the publications; fetched by get_publications().
PUBLICATIONS_URL = f"{DATA_REGISTRY_BASE_URL}publications.json"


def get_publications():
    """
    Fetch the list of publications from the Data Registry.

    Each publication is given an extra ``label`` key of the form ``"<country> - <title>"``,
    which the select boxes in this notebook use to identify it.

    :returns: the decoded JSON from ``PUBLICATIONS_URL``, with ``label`` added to each entry
    :raises requests.HTTPError: if the registry responds with an error status code
    """
    response = requests.get(PUBLICATIONS_URL, timeout=10)
    # Fail early with a clear HTTP error, instead of a confusing JSON decoding error or
    # KeyError when the registry returns an error page.
    response.raise_for_status()
    publications = response.json()
    for publication in publications:
        publication["label"] = f"{publication['country']} - {publication['title']}"
    return publications


def get_publication_select_box():
    """
    Build a dropdown widget for choosing a publication.

    :returns: a ``widgets.Dropdown`` whose options are the sorted labels of all publications
    """
    labels = [publication["label"] for publication in get_publications()]
    labels.sort()
    return widgets.Dropdown(options=labels, description="Publication:", disabled=False)


def get_available_years(publication):
    """
    List the download options for a publication.

    The first option is always the string ``"full"``. If both ``date_from`` and ``date_to``
    are set, it is followed by every year (as an integer) from the year of ``date_from`` to
    the year of ``date_to``, inclusive. The year is read from the first four characters of
    each date.

    :param publication: a mapping with ``date_from`` and ``date_to`` keys
    :returns: a list starting with ``"full"``, followed by the available years, if any
    """
    # Without both bounds, only the full dataset can be offered.
    if not (publication["date_from"] and publication["date_to"]):
        return ["full"]

    first_year, last_year = (int(publication[key][:4]) for key in ("date_from", "date_to"))
    return ["full", *range(first_year, last_year + 1)]


def get_years_select_box(publication_select_box):
    """
    Build a dropdown widget for choosing a year of the selected publication.

    The publications are fetched again, and the first one whose label equals the value of
    ``publication_select_box`` is used.

    :param publication_select_box: the widget returned by ``get_publication_select_box()``
    :returns: a tuple of the year ``widgets.Dropdown`` and the selected publication
    :raises StopIteration: if no publication has the selected label
    """
    matching_publications = (
        publication for publication in get_publications() if publication["label"] == publication_select_box.value
    )
    selected_publication = next(matching_publications)

    years_select_box = widgets.Dropdown(
        options=get_available_years(selected_publication),
        description="Year:",
        disabled=False,
    )
    return years_select_box, selected_publication


def download_file(selected_publication, selected_year):
    """
    Download a gzipped JSON Lines file from the Data Registry and decompress it.

    The decompressed file is written to ``<source_id>-<selected_year>.jsonl``, relative to
    the current working directory.

    :param selected_publication: a publication with ``id`` and ``source_id`` keys
    :param selected_year: a value from ``get_available_years()``, i.e. ``"full"`` or a year
    :returns: the name of the decompressed file
    :raises requests.HTTPError: if the registry responds with an error status code
    """
    file_name = f"{selected_publication['source_id']}-{selected_year}.jsonl"
    download_url = (
        f'{DATA_REGISTRY_BASE_URL}publication/{selected_publication["id"]}/download?name={selected_year}.jsonl.gz'
    )
    response = requests.get(download_url, timeout=10)
    # Don't try to decompress an error page: fail with a clear HTTP error instead.
    response.raise_for_status()
    with tempfile.NamedTemporaryFile() as gz_file:
        gz_file.write(response.content)
        # The file is re-opened by name below, so the buffered bytes must reach the disk
        # first. Otherwise, gzip can read a truncated (or empty) file.
        gz_file.flush()
        with gzip.open(gz_file.name) as compressed, Path(file_name).open("wb") as decompressed:
            shutil.copyfileobj(compressed, decompressed)
    return file_name

## Get the fields used by all OCDS publications in the Registry

Use this notebook to get the list of fields implemented by all the publishers in the Data Registry. For example, you can use it to check which publishers are publishing specific fields.

### Aggregate the lists of fields across all publications

In [None]:
# Build one small DataFrame per publication, then concatenate them all at once. Growing a
# DataFrame with pd.concat inside the loop re-copies all previous rows on every iteration.
coverage_frames = []

for publication in get_publications():
    # Treat a missing or null "coverage" as "no fields", so that the publication is skipped
    # rather than crashing the whole aggregation.
    coverage = publication.get("coverage") or {}
    coverage_frames.append(
        # The coverage mapping's keys become the "path" column and its values the "count" column.
        pd.DataFrame.from_dict(coverage, orient="index", columns=["count"])
        .reset_index()
        .rename(columns={"index": "path"})
        .assign(publisher=publication["source_id"])
    )

# pd.concat raises a ValueError on an empty list, so fall back to an empty DataFrame.
# ignore_index gives each row a unique index label, instead of restarting at 0 for each publication.
final_dataset = pd.concat(coverage_frames, ignore_index=True) if coverage_frames else pd.DataFrame()

In [None]:
# Display the aggregated table: one row per coverage entry of each publication (path, count, publisher).
final_dataset

Export the results as CSV

In [None]:
# Write the table to a CSV file in the current working directory, without the DataFrame index.
final_dataset.to_csv("ocds_fields_from_all_publishers.csv", index=False)