In [1]:
import datetime
import json
import os
import re

from datetime import datetime as dt
from time import sleep

import ckanapi
import geopandas as gpd
import nltk
import numpy as np
import pandas as pd
import petk
import requests

from nltk.corpus import wordnet
from shapely.geometry import mapping, shape, Point, LineString, Polygon
from sklearn.preprocessing import MinMaxScaler

# nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /home/dottyz/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [2]:
# Package that holds the scoring framework; its resources store the scoring
# methods (JSON file) and the scorecard (datastore table).
PACKAGE_FRAMEWORK = 'catalogue-quality-scores'

RESOURCE_METHODS = 'scoring-methods'
RESOURCE_SCORES = 'catalogue-scorecard'

# Key under which the current model is recorded in the scoring-methods file
# and in the `model` column of every scorecard row.
MODEL_VERSION = 'v0.0.2'

DIMENSIONS = ['interpretability', 'usability', 'metadata', 'freshness', 'granularity', 'completeness', 'accessibility'] # Ordered by importance

# Package-level fields counted as "filled" by score_metadata.
METADATA_FIELDS = ['collection_method', 'limitations', 'civic_issues', 'topics', 'owner_division', 'owner_email']

DATA_CKAN = {
    'address': 'https://ckan0.cf.opendata.inter.prod-toronto.ca'
#     'apikey': ''
}

# The API key is a credential and must not live in the notebook: it is read
# from the STORAGE_CKAN_APIKEY environment variable. When the variable is not
# set the value is None and requests are sent without a key.
STORAGE_CKAN = {
    'address': 'https://ckanadmin0.intra.dev-toronto.ca',
    'apikey': os.environ.get('STORAGE_CKAN_APIKEY')
}

In [3]:
def get_model(ckan, pid=PACKAGE_FRAMEWORK):
    """Look up the framework package and index its resources by name.

    Args:
        ckan: client exposing ``action.package_show``.
        pid: id or name of the package to look up.

    Returns:
        dict mapping each resource's ``name`` to the resource dict.

    Raises:
        Exception: when the package is not found or the caller is not
            authorized to see it.
    """
    try:
        package = ckan.action.package_show(id=pid)
    except ckanapi.NotAuthorized:
        raise Exception('Permission required to search for the framework package')
    except ckanapi.NotFound:
        raise Exception('Framework package not found')

    resources_by_name = {}
    for resource in package.pop('resources'):
        # A later resource with the same name replaces an earlier one.
        resources_by_name[resource['name']] = resource

    return resources_by_name

In [4]:
def read_datastore(ckan, rid, rows=10000):
    """Read every record of a datastore resource into a DataFrame.

    Pages through ``datastore_search`` ``rows`` records at a time until the
    reported total has been fetched.

    Args:
        ckan: client exposing ``action.datastore_search``.
        rid: id of the datastore resource.
        rows: page size used for each request.

    Returns:
        Tuple ``(df, fields)``: ``df`` holds the records without the ``_id``
        column (a GeoDataFrame when a ``geometry`` column is present, with
        each value parsed from its JSON text), and ``fields`` is the field
        list from the last response without the ``_id`` entry.
    """
    records = []

    while True:
        result = ckan.action.datastore_search(id=rid, limit=rows, offset=len(records))

        page = result['records']
        records += page

        # Stop once everything is fetched. Also stop on an empty page: if the
        # reported total overstates what the server returns, the offset would
        # never advance and the loop would not terminate.
        if not page or len(records) >= result['total']:
            break

    # An empty resource yields a frame without an `_id` column, so its absence
    # must not raise.
    df = pd.DataFrame(records).drop(columns='_id', errors='ignore')

    if 'geometry' in df.columns:
        df['geometry'] = df['geometry'].apply(lambda x: shape(json.loads(x)))

        df = gpd.GeoDataFrame(df, crs={'init': 'epsg:4326'})

    return df, [x for x in result['fields'] if x['id'] != '_id']

In [10]:
def score_usability(package, columns, data):
    """Score how usable a resource's columns are, between 0 and 1.

    The score is the mean of two ratios:
      * the share of columns whose name is made of recognisable words (more
        than 80% of the name's tokens have at least one WordNet synset), and
        which is higher when names are meaningful;
      * one minus the share of columns that are constant (a single distinct
        value) or are the ``geometry`` column.

    Args:
        package: package dict; currently not used.
        columns: list of field dicts, each with an ``id`` key.
        data: DataFrame holding one column per field id.

    Returns:
        The score as a float; 0.0 when ``columns`` is empty.
    """
    # TODO: once non-tabular data are passed through - measure the level of nested fields

    # Nothing to rate, and both ratios below would divide by zero.
    if not columns:
        return 0.0

    columns_meaningful = 0
    columns_constant = 0

    for f in columns:
        # Raw string: a bare backslash-s in a normal literal is an invalid escape.
        # Column names are tokenised on whitespace, underscores and hyphens.
        eng_words = [wordnet.synsets(x) for x in re.split(r'\s|_|-', f['id'])]

        if len([x for x in eng_words if len(x)]) / len(eng_words) > 0.8:
            columns_meaningful += 1

        # `geometry` is checked first, so nunique() is never called on it.
        if f['id'] == 'geometry' or data[f['id']].nunique() == 1:
            columns_constant += 1

    return np.mean([columns_meaningful / len(columns), 1 - columns_constant / len(columns)])

In [5]:
def score_metadata(package, columns, metadata_fields=None):
    """Score how well a package and its columns are described, between 0 and 1.

    The score is the mean of two ratios: the share of metadata fields that are
    present on the package with a non-None value, and the share of columns
    that carry non-empty ``info.notes``.

    Args:
        package: package dict.
        columns: list of field dicts; a description, when present, lives
            under ``info.notes``.
        metadata_fields: names of the package fields to check. Defaults to
            the module-level ``METADATA_FIELDS``.

    Returns:
        The score as a float. A ratio whose denominator is empty (no columns
        or no metadata fields) counts as 0.
    """
    if metadata_fields is None:
        metadata_fields = METADATA_FIELDS

    # TODO: measure the quality of the description
    fields_filled = sum(1 for field in metadata_fields if package.get(field) is not None)

    columns_described = 0
    for f in columns:
        # `info` may be absent or None, and may lack `notes` or hold None for
        # it; none of these should raise.
        notes = (f.get('info') or {}).get('notes')
        if notes:
            columns_described += 1

    fields_ratio = fields_filled / len(metadata_fields) if metadata_fields else 0.0
    columns_ratio = columns_described / len(columns) if columns else 0.0

    return np.mean([fields_ratio, columns_ratio])

In [13]:
def score_freshness(package):
    """Placeholder for the freshness score; not implemented yet.

    Args:
        package: package dict; currently not used.

    Returns:
        None, always.
    """
    # TODO: measure the difference between last refreshed and current date
    return None

In [8]:
def calculate_weights(dimensions, data, method='sr'):
    """Aggregate metrics into dimensions and derive rank-based weights.

    Side effect: every row of ``data`` gets an empty ``dimensions`` dict and,
    for each dimension in the module-level ``METRICS``, a key named after the
    dimension holding the average of that dimension's metric scores.

    Args:
        dimensions: dimension names ordered from most to least important;
            the position in the list is the rank.
        data: iterable of row dicts, each with a ``metrics`` dict.
        method: weighting scheme - 'sr' (sum and reciprocal), 'rs' (rank
            sum), 'rr' (reciprocal rank) or 're' (rank exponent, 0.2).

    Returns:
        List with one weight per dimension, normalised by their sum.

    Raises:
        Exception: when ``method`` is not one of the supported schemes.
    """
    # Metrics -> dimension: plain average of the metrics listed per dimension.
    for row in data:
        row['dimensions'] = {}
        for dim, fields in METRICS.items():
            values = [row['metrics'][name] for name in fields]
            row[dim] = sum(values) / len(values)

    N = len(dimensions)
    ranks = range(1, N + 1)

    # Unnormalised weight of each rank under the chosen scheme.
    if method == 'sr':
        raw = [(1 / rank) + ((N + 1 - rank) / N) for rank in ranks]
    elif method == 'rs':
        raw = [N + 1 - rank for rank in ranks]
    elif method == 'rr':
        raw = [1 / rank for rank in ranks]
    elif method == 're':
        exp = 0.2
        raw = [(N + 1 - rank) ** exp for rank in ranks]
    else:
        raise Exception('Invalid weighting method provided')

    denom = np.array(raw).sum()

    return [value / denom for value in raw]

In [9]:
def update_model(ckan, model, storage):
    """Record ``model`` under MODEL_VERSION in the scoring-methods JSON file.

    Creates the scoring-methods resource (with an empty JSON object) when
    ``storage`` does not have it yet, downloads the current file, sets the
    entry for MODEL_VERSION and uploads the result back to the resource.

    Args:
        ckan: client exposing ``address`` and ``apikey``.
        model: JSON-serialisable description of the scoring model.
        storage: dict of framework resources keyed by name; the newly created
            resource is added to it when missing.

    Raises:
        requests.HTTPError: when any of the HTTP calls returns an error
            status. Without this check an error body could be parsed as the
            methods file and uploaded back, or a failed upload go unnoticed.
    """
    auth = {'Authorization': ckan.apikey}
    filename = '{0}.json'.format(RESOURCE_METHODS)

    if RESOURCE_METHODS not in storage:
        r = requests.post(
            '{0}/api/3/action/resource_create'.format(ckan.address),
            data={
                'package_id': PACKAGE_FRAMEWORK,
                'name': RESOURCE_METHODS,
                'format': 'json',
                'is_preview': False
            },
            headers=auth,
            files={
                'upload': (filename, json.dumps({}))
            }
        )
        r.raise_for_status()

        storage[RESOURCE_METHODS] = json.loads(r.content)['result']

    r = requests.get(
        storage[RESOURCE_METHODS]['url'],
        headers=auth
    )
    r.raise_for_status()

    # Existing versions are kept; only this version's entry is set/replaced.
    scoring_methods = json.loads(r.content)
    scoring_methods[MODEL_VERSION] = model

    r = requests.post(
        '{0}/api/3/action/resource_patch'.format(ckan.address),
        data={
            'id': storage[RESOURCE_METHODS]['id']
        },
        headers=auth,
        files={
            'upload': (filename, json.dumps(scoring_methods))
        }
    )
    r.raise_for_status()

In [10]:
def update_score(ckan, data, weight, dimensions, storage):
    """Compute package scores and grades and append them to the scorecard.

    Each row's dimension values are multiplied by the dimension weights and
    summed into ``score``; ``score_norm`` is the min-max scaled score. Rows
    are then averaged per package, graded (D/C/B/A), stamped with the time
    and MODEL_VERSION, and inserted into the scorecard datastore resource,
    which is created when ``storage`` does not have it yet.

    Args:
        ckan: client exposing ``action.datastore_create`` and
            ``action.datastore_upsert``.
        data: records convertible to a DataFrame with ``package`` and
            ``resource`` columns plus one column per dimension.
        weight: list of weights, in the same order as ``dimensions``.
        dimensions: dimension names matching the columns to weight.
        storage: dict of framework resources keyed by name; the newly created
            scorecard resource is added to it when missing.
    """
    df = pd.DataFrame(data).set_index(['package', 'resource'])

    # Weight only the dimension columns so that other columns (for example
    # the raw metrics dicts) never take part in the arithmetic. A dimension
    # missing from the data becomes NaN and is skipped by the sum.
    weights = pd.Series(weight, index=dimensions)
    scores = df.reindex(columns=dimensions).multiply(weights, axis='columns')

    df['score'] = scores.sum(axis=1)
    df['score_norm'] = MinMaxScaler().fit_transform(df[['score']])

    # Average resources per package; non-numeric columns cannot be averaged.
    df = df.groupby('package').mean(numeric_only=True)

    df['grade'] = pd.cut(df['score'], bins=[-1, .3, .5, .8, 1], labels=['D','C','B','A'])
    df['grade_norm'] = pd.cut(df['score_norm'], bins=[-1, .3, .5, .8, 1], labels=['D','C','B','A'])

    # The timestamp is serialised with a 'Z' suffix below, so it has to be
    # UTC; kept naive so the column stays a plain datetime column.
    df['recorded_at'] = dt.now(datetime.timezone.utc).replace(tzinfo=None)
    df['model'] = MODEL_VERSION

    df = df.reset_index()

    report = petk.DataReport(df).describe(as_dict=True)

    ckan_fields = {
        'STRING': 'text',
        'NUMERIC': 'float',
        'DATE': 'timestamp'
    }

    fields = [
        {
            'id': k,
            # Content types without a mapping are stored as text.
            'type': ckan_fields.get(v, 'text')
        } for k, v in report['content_type'].items()
    ]

    if RESOURCE_SCORES not in storage:
        storage[RESOURCE_SCORES] = ckan.action.datastore_create(
            resource={
                'package_id': PACKAGE_FRAMEWORK,
                'name': RESOURCE_SCORES,
                'format': 'csv',
                'is_preview': False
            },
            fields=fields,
            records=[]
        )

    df['recorded_at'] = df['recorded_at'].apply(lambda x: x.strftime('%Y-%m-%dT%H:%M:%SZ'))

    # A resource just created by datastore_create is identified by
    # 'resource_id'; one that already existed in `storage` is a package
    # resource dict identified by 'id'.
    target = storage[RESOURCE_SCORES]
    resource_id = target['resource_id'] if 'resource_id' in target else target['id']

    ckan.action.datastore_upsert(
        method='insert',
        resource_id=resource_id,
        records=df.to_dict(orient='records')
    )

In [18]:
# Two CKAN clients: `source` is configured from DATA_CKAN and `ckan` from
# STORAGE_CKAN (the one carrying an API key).
source = ckanapi.RemoteCKAN(**DATA_CKAN)
ckan = ckanapi.RemoteCKAN(**STORAGE_CKAN)

In [19]:
# Resources of the framework package, keyed by resource name.
storage = get_model(ckan)

In [20]:
# Score every datastore-backed resource of every package (first 500 packages).
# NOTE(review): packages and records are read through `ckan` (STORAGE_CKAN),
# not `source` (DATA_CKAN) - confirm this is the intended instance.
packages = ckan.action.current_package_list_with_resources(limit=500)

results = []
for p in packages:
    for r in p['resources']:
        # Only resources loaded into the datastore can be read and scored.
        if not r.get('datastore_active'):
            continue

        data, fields = read_datastore(ckan, r['id'])

        results.append(dict(
            package=p['name'],
            resource=r['name'],
            metrics=dict(
                # interpretability=1,
                usability=score_usability(p, fields, data),
                metadata=score_metadata(p, fields),
                # freshness=score_freshness(p),
                # granularity=1,
                # Completeness: one minus the share of missing cells.
                completeness=1 - (np.sum(len(data) - data.count()) / np.prod(data.shape)),
                accessibility=1,
            ),
        ))

  'cv': series.std() / series.mean(),


In [21]:
# Keep only the dimensions that have metrics, preserving the importance order.
# NOTE(review): METRICS is not defined in any visible cell - confirm where it
# is supposed to come from.
dimensions = [ x for x in DIMENSIONS if x in METRICS ]

# calculate_weights iterates over score rows that carry a 'metrics' dict, i.e.
# the `results` list; `data` only holds the last resource's DataFrame.
weights = calculate_weights(dimensions, results)

In [27]:
# Description of the scoring model stored alongside the scores: how values
# are aggregated, and each dimension's rank, weight and metrics.
model = {
    'aggregation_methods': {
        'metrics_to_dimension': 'avg',
        'dimensions_to_score': 'sum_and_reciprocal'
    },
    'dimensions': [
        {
            'name': dim,
            'rank': i + 1,
            'weight': wgt,
            'metrics': METRICS[dim]
        } for i, (dim, wgt) in enumerate(zip(dimensions, weights))
    ]
}

update_model(ckan, model, storage)

# update_score needs the per-resource score rows ('package', 'resource' and
# the dimension values), i.e. `results`; `data` only holds the last resource's
# DataFrame.
update_score(ckan, results, weights, dimensions, storage)

In [88]:
# Load one resource from the data catalogue for inspection. read_datastore
# does the paging, drops `_id` and builds the GeoDataFrame when there is a
# `geometry` column; only the frame (first element of its result) is kept.
df = read_datastore(source, 'b9214fd7-60d1-45f3-8463-a6bd9828f8bf', rows=5000)[0]