In [1]:
import json
import re

import ckanapi
import geopandas as gpd
import nltk
import petk
import numpy as np
import pandas as pd
import requests

from nltk.corpus import wordnet
from shapely.geometry import mapping, shape
from sklearn.preprocessing import MinMaxScaler

from datetime import datetime as dt
from time import sleep

# nltk.download('wordnet')

In [77]:
PACKAGE_FRAMEWORK = 'catalogue-quality-scores'

RESOURCE_MODEL = 'scoring-methods'
RESOURCE_SCORES = 'catalogue-scorecard'

MODEL_VERSION = 'v0.0.5'

# DIMENSIONS = ['interpretability', 'usability', 'metadata', 'freshness', 'granularity', 'completeness', 'accessibility'] # Ordered by importance
DIMENSIONS = ['usability', 'metadata', 'freshness', 'completeness', 'accessibility'] # Ordered by importance

DATA_CKAN = {
    'address': 'https://ckan0.cf.opendata.inter.prod-toronto.ca'
#     'apikey': ''
}

STORAGE_CKAN = {
    'address': 'https://ckanadmin0.intra.dev-toronto.ca',
    'apikey': '784f11cc-b170-4377-83a3-38ba28662b16'
}

In [3]:
def get_model(ckan, pid=PACKAGE_FRAMEWORK):
    try:
        model = ckan.action.package_show(id=pid)
    except ckanapi.NotAuthorized:
        raise Exception('Permission required to search for the framework package')
    except ckanapi.NotFound:
        raise Exception('Framework package not found')
    
    return {
        r['name']: r for r in model.pop('resources')
    }

In [4]:
def read_datastore(ckan, rid, rows=10000):
    records = []
    
    is_geospatial = False
    
    has_more = True
    while has_more:
        result = ckan.action.datastore_search(id=rid, limit=rows, offset=len(records))
        
        records += result['records']
        has_more = len(records) < result['total']
    
    df = pd.DataFrame(records).drop('_id', axis=1)
    
    if 'geometry' in df.columns:
        df['geometry'] = df['geometry'].apply(lambda x: shape(json.loads(x)))
        
        df = gpd.GeoDataFrame(df, crs={'init': 'epsg:4326'})
    
    return df, [x for x in result['fields'] if x['id'] != '_id']

In [5]:
def score_usability(columns, data):
    '''
        How easy is it to use the data given how it is organized/structured?
        
        TODO's: 
            * level of nested fields?
            * long vs. wide?
            * if ID columns given, are these ID's common across datasets?
    '''
    
    def parse_col_name(s):
        camel_to_snake = re.sub(
            '([a-z0-9])([A-Z])', 
            r'\1_\2', 
            re.sub('(.)([A-Z][a-z]+)', r'\1_\2', s)
        ).lower()

        return camel_to_snake == s, re.split('-|_|\s', camel_to_snake)

    metrics = {
        'col_names': 0, # Are the column names easy to understand?
        'col_constant': 0 # Are there columns where all values are constant?
    }
    
    for f in columns:
        is_camel, words = parse_col_name(f['id'])
        eng_words = [ wordnet.synsets(w) for w in words if len(w) ]

        if len(eng_words) / len(words) > 0.8:
            metrics['col_names'] += (1 if not is_camel else 0.5) / len(columns)
        
        if not f['id'] == 'geometry' and data[f['id']].nunique() == 1:
            metrics['col_constant'] += 1 / len(columns)
    
    if isinstance(data, gpd.GeoDataFrame):
        counts = data['geometry'].is_valid.value_counts()
        
        metrics['geo_validity'] = 1 - (counts[False] / (len(data) * 0.05)) if False in counts else 1
    
    return np.mean(list(metrics.values()))

In [6]:
METADATA_FIELDS = ['collection_method', 'limitations', 'civic_issues', 'topics', 'owner_division', 'owner_email']

def score_metadata(package, columns):
    '''
        How easy is it to understand the context of the data?
        
        TODO's: 
            * Measure the quality of the metadata as well
    '''
    
    metrics = {
        'desc_dataset': 0, # Does the metadata describe the dataset well?
        'desc_columns': 0 # Does the metadata describe the data well?
    }
    
    for field in METADATA_FIELDS:
        if field in package and not package[field] is None and len(package[field]):
            metrics['desc_dataset'] += 1 / len(METADATA_FIELDS)
            
    for f in columns:
        if 'info' in f and len(f['info']['notes']):
            metrics['desc_columns'] += 1 / len(columns)

    return np.mean(list(metrics.values()))

In [78]:
TIME_MAP = {
    'read-time': 1,
    'daily': 1,
    'weekly': 7,
    'monthly': 30,
    'quarterly': 52 * 7 / 4,
    'semi-annually': 52 * 7 / 2,
    'annually': 365
}

def score_freshness(package):
    '''
        How up to date is the data?
    '''
    
    metrics = {}
    
    rr = package['refresh_rate'].lower()
    if rr in TIME_MAP and 'last_refreshed' in package and package['last_refreshed']: 
        days = (dt.now() - dt.strptime(package['last_refreshed'], '%Y-%m-%dT%H:%M:%S.%f')).days
        
        # Greater than 2 periods have a score of 0
        metrics['elapse_periods'] = max(0, 1 - math.ceil(days / TIME_MAP[rr]) / 2)
        
        # Decrease the score starting from ~0.5 years to ~3 years
        metrics['elapse_days'] = 1 - (1 / (1 + math.exp(4 * (2.25 * 365 - x))))
        
        return np.mean(list(metrics.values()))
    
    return 0

In [8]:
def calculate_weights(dimensions, method='rs'):
    N = len(dimensions)
    
    if method == 'sr':
        denom = np.array([ ((1 / (i + 1)) + ((N + 1 - (i + 1)) / N)) for i, x in enumerate(dimensions) ]).sum()
        weights = [ ((1 / (i + 1)) + ((N + 1 - (i + 1)) / N)) / denom for i, x in enumerate(dimensions) ]
    elif method == 'rs':
        denom = np.array([ (N + 1 - (i + 1)) for i, x in enumerate(dimensions)]).sum()
        weights = [ (N + 1 - (i + 1)) / denom for i, x in enumerate(dimensions) ]
    elif method == 'rr':
        denom = np.array([ 1 / (i + 1) for i, x in enumerate(dimensions) ]).sum()
        weights = [ (1 / (i + 1)) / denom for i, x in enumerate(dimensions) ]
    elif method == 're':
        exp = 0.2
        denom = np.array([ (N + 1 - (i + 1)) ** exp for i, x in enumerate(dimensions) ]).sum()
        weights = [ (N + 1 - (i + 1)) ** exp / denom for i, x in enumerate(dimensions) ]
    else:
        raise Exception('Invalid weighting method provided')
    
    return weights

In [31]:
def update_model(ckan, model, storage):
    if not RESOURCE_MODEL in storage:
        r = requests.post(
            '{0}/api/3/action/resource_create'.format(ckan.address),
            data={
                'package_id': PACKAGE_FRAMEWORK,
                'name': RESOURCE_MODEL,
                'format': 'json',
                'is_preview': False,
                'is_zipped': False
            },
            headers={
                'Authorization': ckan.apikey
            },
            files={
                'upload': ('{0}.json'.format(RESOURCE_MODEL), json.dumps({}))
            }
        )
        
        storage[RESOURCE_MODEL] = json.loads(r.content)['result']

    r = requests.get(
        storage[RESOURCE_MODEL]['url'],
        headers={
            'Authorization': ckan.apikey
        }
    )

    scoring_methods = json.loads(r.content)
    scoring_methods[MODEL_VERSION] = model

    r = requests.post(
        '{0}/api/3/action/resource_patch'.format(ckan.address),
        data={
            'id': storage[RESOURCE_MODEL]['id']
        },
        headers={
            'Authorization': ckan.apikey
        },
        files={
            'upload': ('{0}.json'.format(RESOURCE_MODEL), json.dumps(scoring_methods))
        }
    )

In [79]:
def update_score(ckan, data, weight, dimensions, storage):
    df = pd.DataFrame(data).set_index(['package', 'resource'])

    scores = pd.DataFrame([weight] * len(df.index))
    scores.index = df.index
    scores.columns = dimensions

    scores = df.multiply(scores)

    df['score'] = scores.sum(axis=1)
    df['score_norm'] = MinMaxScaler().fit_transform(df[['score']])

    df = df.groupby('package').mean()

    df['grade'] = pd.cut(df['score'], bins=[-1, .3, .5, .8, 1], labels=['D','C','B','A'])
    df['grade_norm'] = pd.cut(df['score_norm'], bins=[-1, .3, .5, .8, 1], labels=['D','C','B','A'])

    df['recorded_at'] = dt.now().strftime('%Y-%m-%dT%H:%M:%SZ')
    df['model'] = MODEL_VERSION

    df = df.reset_index()

    if not RESOURCE_SCORES in storage:
        storage[RESOURCE_SCORES] = ckan.action.datastore_create(
            resource={
                'package_id': PACKAGE_FRAMEWORK,
                'name': RESOURCE_SCORES,
                'format': 'csv',
                'is_preview': False,
                'is_zipped': True
            },
            records=df.to_dict(orient='row')
        )
    else:
        ckan.action.datastore_upsert(
            method='insert',
            resource_id=storage[RESOURCE_SCORES]['resource_id'],
            records=df.to_dict(orient='row')
        )

In [80]:
source = ckanapi.RemoteCKAN(**DATA_CKAN)
ckan = ckanapi.RemoteCKAN(**STORAGE_CKAN)

In [81]:
packages = source.action.current_package_list_with_resources(limit=500)

data = []
for p in packages:
    for r in p['resources']:
        if not 'datastore_active' in r or not r['datastore_active']:
            continue
        
        content, fields = read_datastore(source, r['id'])
        
        data.append({
            'package': p['name'],
            'resource': r['name'],
#             'interpretability': 1,
            'usability': score_usability(fields, content),
            'metadata': score_metadata(p, fields),
            'freshness': score_freshness(p),
#             'granularity': 1,
            'completeness': 1 - (np.sum(len(content) - content.count()) / np.prod(content.shape)),
            'accessibility': 1 if 'extract_job' in r and r['extract_job'] else 0.5
        })

ProxyError: HTTPSConnectionPool(host='ckan0.cf.opendata.inter.prod-toronto.ca', port=443): Max retries exceeded with url: /api/action/current_package_list_with_resources (Caused by ProxyError('Cannot connect to proxy.', OSError('Tunnel connection failed: 407 Proxy Authentication Required')))

In [13]:
weights = calculate_weights(DIMENSIONS)

In [14]:
df = pd.DataFrame(data).set_index(['package', 'resource'])

scores = pd.DataFrame([weights] * len(df.index))
scores.index = df.index
scores.columns = DIMENSIONS

scores = df.multiply(scores)

df['score'] = scores.sum(axis=1)
df['score_norm'] = MinMaxScaler().fit_transform(df[['score']])

df = df.groupby('package').mean()

df['grade'] = pd.cut(df['score'], bins=[-1, .3, .5, .8, 1], labels=['D','C','B','A'])
df['grade_norm'] = pd.cut(df['score_norm'], bins=[-1, .3, .5, .8, 1], labels=['D','C','B','A'])

df['recorded_at'] = dt.now()
df['model'] = MODEL_VERSION

df = df.reset_index()

In [53]:
model = {
    'aggregation_methods': {
        'metrics_to_dimension': 'avg',
        'dimensions_to_score': 'sum_and_reciprocal'
    },
    'dimensions': [
        {
            'name': dim,
            'rank': i + 1,
            'weight': wgt,
#             'metrics': DIMENSIONS[dim]
        } for i, (dim, wgt) in enumerate(zip(DIMENSIONS, weights))
    ]
}

storage = get_model(ckan)

In [54]:
update_model(ckan, model, storage)

In [55]:
update_score(ckan, data, weights, DIMENSIONS, storage)

In [70]:
df.groupby('grade_norm').count()

Unnamed: 0_level_0,package,accessibility,completeness,freshness,metadata,usability,score,score_norm,grade,recorded_at,model,grade_norm_num
grade_norm,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
D,6,6,6,6,6,6,6,6,6,6,6,6
C,16,16,16,16,16,16,16,16,16,16,16,16
B,31,31,31,31,31,31,31,31,31,31,31,31
A,17,17,17,17,17,17,17,17,17,17,17,17
