In [None]:
# default_exp portalcatalog

# portalcatalog
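> Load an open data portal catalog (a gzip-compressed JSON Lines dump) into `PortalCatalog`, `Dataset` and `Resource` objects.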


### Imports

In [None]:
#export
import gzip
import json
import urllib
import urllib.request
from functools import partial
from html import escape

import pandas as pd

### Utility Code

In [None]:
#export

def read_gz(url: str) -> str:
    """
    Download a gzip-compressed resource and return its content as text.

    Args:
        url: Location of the gzip file; anything `urllib.request.urlopen`
            accepts (e.g. `http://`, `https://` or `file://` URLs).

    Returns:
        The decompressed payload decoded as UTF-8.

    Raises:
        urllib.error.URLError: if the resource cannot be opened.
        OSError: if the payload is not valid gzip data.
        UnicodeDecodeError: if the decompressed bytes are not valid UTF-8.
    """
    # Use the response as a context manager so the connection is always
    # released, even when reading fails part-way through.
    with urllib.request.urlopen(url) as response:
        compressed_bytes = response.read()
    return gzip.decompress(compressed_bytes).decode('utf-8')

def get_jsonlines(jsonl_content: str):
    """
    Parse JSON Lines text into a list of Python objects, one per line.

    Args:
        jsonl_content: Text holding one JSON document per line.

    Returns:
        A list with the decoded value of every non-blank line, in order.

    Raises:
        json.JSONDecodeError: if a non-blank line is not valid JSON.
    """
    # Skip whitespace-only lines too (e.g. the '\r' left behind by CRLF blank
    # lines); json.loads would raise on them. Splitting stays on '\n' only,
    # because str.splitlines() also breaks on characters that may legally
    # appear unescaped inside JSON strings.
    return [json.loads(jline) for jline in jsonl_content.split('\n') if jline.strip()]

### Portal Catalog Classes

In [None]:
#export
def get_value_for_language(lang: str, value_dict: dict):
    """
    Look up the entry for one language in a per-language mapping.

    Args:
        lang: Language key to look up (the records in this notebook use
            keys such as 'en' and 'fr').
        value_dict: Mapping from language key to value. May be None or empty.

    Returns:
        The value stored under `lang`, or None when the mapping is missing,
        empty, or has no entry for `lang`.
    """
    # Callers pass the result of record.get(...), which is None when the
    # field is absent; treat that the same as "no translation available".
    if not value_dict:
        return None
    return value_dict.get(lang)
    
def get_datasets(jsonl_content: str):
    """
    Parse JSON Lines text into `Dataset` objects, one per line.

    Args:
        jsonl_content: Text holding one JSON dataset record per line.

    Returns:
        A list of `Dataset` instances in the order the records appear.

    Raises:
        json.JSONDecodeError: if a non-blank line is not valid JSON.
    """
    # Whitespace-only lines (e.g. '\r' from CRLF blank lines) carry no record
    # and would make json.loads raise, so they are skipped like empty lines.
    return [Dataset(json.loads(jline)) for jline in jsonl_content.split('\n') if jline.strip()]
    
class Dataset:
    """
    A dataset

    Wraps one parsed catalog record (a dict). For every key whose value is a
    non-empty dict, an accessor named `get_<key>` is attached to the instance;
    calling it with a language key returns that language's entry (or None).
    """
    def __init__(self, record=None, language:str = 'en'):
        """
        Args:
            record: Parsed dataset record. `None` is treated as an empty record.
            language: Default language key used by the `get_*` helpers.
        """
        # The signature advertises record=None, so make that usable instead of
        # failing on the first subscript.
        record = {} if record is None else record
        self.record = record
        self.language = language
        # A record without a 'resources' entry (or with a null one) simply has
        # no resources.
        self.resources = [Resource(resource_record, language)
                          for resource_record in (record.get('resources') or [])]
        for key, value in record.items():
            # Only non-empty dict fields get a per-language accessor.
            if value and isinstance(value, dict):
                setattr(self, f'get_{key}', partial(get_value_for_language, value_dict=value))

    def get_org(self):
        """Return the publishing organization's title in the default language.

        Falls back to 'Organization' when the record has no usable title.
        """
        if hasattr(self, 'get_org_title_at_publication'):
            title = self.get_org_title_at_publication(self.language)
            # The entry may be missing or an empty string for this language.
            if title:
                return title
        return 'Organization'

    def get_name(self):
        """Return the data series name in the default language.

        Falls back to '<organization> Dataset' when the name is absent or
        empty (records can carry {'en': '', 'fr': ''}).
        """
        if hasattr(self, 'get_data_series_name'):
            series_name = self.get_data_series_name(self.language)
            if series_name:
                return series_name
        return f'{self.get_org()} Dataset'

    def get_notes(self, language:str = None):
        """Return the translated notes for `language` (default: the dataset's language).

        Returns None when the record has no `notes_translated` content or no
        entry for the requested language.
        """
        language = language or self.language
        # The accessor only exists when 'notes_translated' was a non-empty dict.
        if not hasattr(self, 'get_notes_translated'):
            return None
        return self.get_notes_translated(lang=language)

    def resources_as_html(self):
        """Return an HTML table listing the name and state of every resource."""
        rows = [{'name': resource.get_name(), 'state': resource.get_state()}
                for resource in self.resources]
        # Explicit columns keep the header present even when there are no resources.
        df = pd.DataFrame(rows, columns=['name', 'state'])
        return df.to_html(index=False)

    def __repr__(self):
        return f'Dataset {self.get_name()}'

    def _repr_html_(self):
        """Return the rich notebook representation: title, notes, publisher, resources."""
        # Catalog text comes from an external source, so escape it before
        # embedding it in markup (DataFrame.to_html already escapes cell values).
        title = escape(str(self.get_name()))
        notes = escape(str(self.get_notes() or ''))
        publisher_table = pd.DataFrame([{'Publisher': self.get_org()}]).to_html(index=False, justify='left')
        parts = [
            f'<h3>{title}</h3>',
            f'<span>{notes}</span>',
            publisher_table,
            '<h3>Resources</h3>',
            self.resources_as_html(),
        ]
        return ''.join(parts)
    
class Resource:
    """
    A resource. Belongs to a dataset, which will have 1 or more resources
    """
    def __init__(self, record, language:str = 'en'):
        """
        Args:
            record: Parsed resource record (a dict).
            language: Language key used when looking up the translated name.
        """
        self.record = record
        self.id = record.get('id')
        self.language = language

    def get_name(self):
        """Return the resource name in this resource's language.

        Uses the `name_translated` entry when it is present and non-empty,
        otherwise falls back to the record's plain `name` field if it has one.
        """
        # `name_translated` may be absent or null; look up in an empty mapping
        # instead of handing None to the helper.
        translated = get_value_for_language(self.language, self.record.get('name_translated') or {})
        if translated:
            return translated
        return self.record.get('name', translated)

    def get_state(self):
        """Return the record's `state` value, or '' when it has none."""
        return self.record.get('state', '')

    def get_url(self):
        """Return the record's `url` value, or '' when it has none."""
        return self.record.get('url', '')

    def __repr__(self):
        return f'{self.get_name()}'

    def _repr_html_(self):
        """Return the rich notebook representation: the name as a heading."""
        # The name comes from external catalog data, so escape it for HTML.
        heading = escape(str(self.get_name()))
        return f'<h3>{heading}</h3>'
    
    
class PortalCatalog:
    """
    A portal catalog: the datasets parsed from one JSON Lines dump.
    """
    def __init__(self, jsonl_content: str):
        """Parse `jsonl_content` and keep the resulting datasets on `self.datasets`."""
        parsed_datasets = get_datasets(jsonl_content)
        self.datasets = parsed_datasets

    @classmethod
    def from_jsonl(cls, jsonl_content: str):
        """Alternate constructor: build a catalog from JSON Lines text."""
        catalog = cls(jsonl_content)
        return catalog

## Tests

In [None]:
# Download the catalog dump and parse it into one Python object per JSON line.
# NOTE(review): this cell needs network access and re-downloads the whole file on every run.
url = 'http://open.canada.ca/static/od-do-canada.jl.gz'
text = read_gz(url)
lines = get_jsonlines(text)

In [None]:
# Build the catalog from the raw JSON Lines text
# (get_datasets(text) would return the bare list of Dataset objects instead).
catalog = PortalCatalog.from_jsonl(text)

In [None]:
# Rich display of a single dataset (rendered through Dataset._repr_html_).
catalog.datasets[3000]

Publisher
Statistics Canada

name,state
Dataset,active


In [None]:
# Rich display of the first resource of one dataset (rendered through Resource._repr_html_).
catalog.datasets[560].resources[0]

In [None]:
# Raw parsed record at index 3000 -- the same position as catalog.datasets[3000].
lines[3000]

{'association_type': [],
 'audience': [],
 'author': None,
 'author_email': 'open-ouvert@tbs-sct.gc.ca',
 'collection': 'primary',
 'contributor': {},
 'creator_user_id': 'aa584ab4-544c-4c5c-81da-d1cff9bd96fa',
 'data_series_issue_identification': {'en': '', 'fr': ''},
 'data_series_name': {'en': '', 'fr': ''},
 'date_published': '2003-02-11 00:00:00',
 'digital_object_identifier': '',
 'display_flags': [],
 'frequency': 'unknown',
 'geographic_region': [],
 'groups': [],
 'id': '0955056e-2ee3-4bed-92a0-9f4d946e0194',
 'imso_approval': 'true',
 'isopen': False,
 'jurisdiction': 'federal',
 'keywords': {'en': ['census of population'],
  'fr': ['recensement de la population']},
 'license_id': 'ca-ogl-lgo',
 'license_title': 'Open Government Licence - Canada',
 'license_url': 'http://open.canada.ca/en/open-government-licence-canada',
 'maintainer': None,
 'maintainer_email': 'open-ouvert@tbs-sct.gc.ca',
 'metadata_contact': {},
 'metadata_created': '2016-09-24T01:25:25.743264',
 'metadata

In [None]:
# A Dataset can also be built directly from a single parsed record.
ds: Dataset = Dataset(lines[1020])
ds

Publisher
Statistics Canada

name,state
Dataset,active
Dataset,active
Dataset,active
Supporting Document,active
Supporting Document,active


In [None]:
# NOTE(review): pandas is already imported in the Imports cell; this re-import is redundant.
import pandas as pd
# Allow up to 30 columns in the rendered table.
pd.options.display.max_columns = 30
# Tabulate the raw resource records of the first catalog entry.
pd.DataFrame(lines[0]['resources'])

Unnamed: 0,cache_last_updated,cache_url,created,data_quality,datastore_active,description,format,hash,id,language,last_modified,mimetype,mimetype_inner,name,name_translated,package_id,position,resource_type,revision_id,state,url,url_type
0,,,2017-01-25T11:50:41.625040,[],False,,other,,62d71256-c58c-4351-a527-d79fed951c49,"[en, fr]",,,,Link to Canada Land Survey Records - English,{'en': 'Link to Canada Land Survey Records - E...,00000d1c-2567-4f51-a08b-d11c3413f829,0,dataset,be3ee0f1-5d67-42af-b29d-de4ae7cb271e,active,http://clss.nrcan.gc.ca/plan-eng.php?id=71069+...,
1,,,2017-01-25T11:50:41.625066,[],False,,other,,77359319-8d18-4b3a-b95f-614465b98c61,"[en, fr]",,,,Link to Canada Land Survey Records - French,{'en': 'Link to Canada Land Survey Records - F...,00000d1c-2567-4f51-a08b-d11c3413f829,1,dataset,be3ee0f1-5d67-42af-b29d-de4ae7cb271e,active,http://satc.rncan.gc.ca/plan-fra.php?id=71069+...,
