In [3]:
import math
import os

import ckanapi
import pandas as pd
import requests

In [4]:
# Short keys used in the source feed -> descriptive column names used in the datastore.
# Each key appears exactly once: a repeated key in a dict literal silently keeps only
# the last value, which previously hid a dead "v": "variance" entry behind
# "v": "measure_value" (and a repeated "c": "category").
mapping = {
    "id": "measure_id",
    "m": "measure_name",
    "it": "interval_type",
    "vt": "value_type",
    "ytd": "year_to_date_ind",
    "ht": "has_target_ind",
    "v": "measure_value",
    "yv": "year_to_date_variance",
    "bv": "budget_variance",
    "da": "decimal_accuracy", # should this be precision?
    "dd": "desired_direction",
    "c": "category",
    "ds": "data_source_notes",
    "cp": "city_perspective_note",
    "y": "year",
    "p": "period_number_in_year",
    "target": "target",
    "notes": "notes",
    "kw": "keywords",
}

def get_category_measures(measures, category):
    """Return the measures whose single category equals `category`, ignoring case.

    Each measure is expected to hold its categories as a list under the "c" key;
    an AssertionError is raised as soon as a measure with a different number of
    categories than one is encountered.
    """

    def _only_category(measure):
        # Enforce the one-category rule before reading the category name.
        categories = measure["c"]
        assert len(categories) == 1, f"Measure has more than 1 category: {categories}"
        return categories[0]

    return [
        measure
        for measure in measures
        if _only_category(measure).lower() == category.lower()
    ]


def _is_blank(value):
    """Return True for values that carry no data: None or an empty string/container."""
    return value is None or (
        isinstance(value, (str, bytes, list, tuple, dict, set)) and len(value) == 0
    )


def _as_bool(value):
    """Turn a "true"/"false" string into a bool; real booleans pass through unchanged."""
    if isinstance(value, bool):
        return value
    return value.lower() == "true"


def make_measures_records(measures):
    """Flatten measures into one record per data point, with descriptive column names.

    Each measure is a dict whose "vs" entry lists its data points and whose "c"
    entry is a list holding exactly one category. Every data point is merged with
    its measure's attributes (data point keys win on conflict), blank fields are
    dropped, a few fields are normalised, and the short keys are renamed through
    the module-level `mapping`.

    Blank means None or an empty string/container. Numeric zeros and False are
    kept: a plain truthiness test would silently discard a measure value or
    variance of 0.

    Raises:
        KeyError: if a measure has no "vs" or "c" entry.
        AssertionError: if a measure does not have exactly one category.
    """
    records = []

    for measure in measures:
        item = {**measure}  # shallow copy so the caller's dict is not modified
        data_points = item.pop("vs")

        assert len(item["c"]) == 1, f"Item '{measure['m']}' ({measure['id']}) belongs to more than 1 category: {item['c']}"

        item["c"] = item["c"][0]

        for dp in data_points:
            r = {k: v for k, v in {**item, **dp}.items() if not _is_blank(v)}

            # Every normalisation is guarded: a field that was blank has already
            # been dropped above and must not raise KeyError here.
            if "m" in r:
                r["m"] = r["m"].replace("\n", " ")
            if "ds" in r:
                r["ds"] = r["ds"].replace("&amp;", "&")
            if "ytd" in r:
                r["ytd"] = _as_bool(r["ytd"])
            if "ht" in r:
                r["ht"] = _as_bool(r["ht"])
            if "da" in r:
                r["da"] = int(r["da"])
            if "yv" in r:
                r["yv"] = float(r["yv"])
            if "bv" in r:
                r["bv"] = float(r["bv"])

            # Rename the short keys to their descriptive column names.
            for original, updated in mapping.items():
                if original in r:
                    r[updated] = r.pop(original)

            records.append(r)

    return records

def join_narratives(records, narratives):
    """Return copies of `records` with a "notes" entry taken from `narratives`.

    `narratives` is looked up by the string form of each record's measure id
    (the column named by mapping["id"]). A narrative is attached only when it
    exists and is longer than one character; "<br />" tags become newlines and
    surrounding whitespace is trimmed. The input records are not modified.
    """

    def _with_note(record):
        enriched = dict(record)
        measure_key = enriched[mapping["id"]]
        narrative = narratives.get(f"{measure_key}")
        if narrative is None or len(narrative) <= 1:
            return enriched
        enriched["notes"] = narrative.replace("<br />", "\n").strip("\n").strip()
        return enriched

    return [_with_note(record) for record in records]


def build_data_dict():
    """Build the field definitions: one {"id", "type"} dict per mapped column.

    Columns follow the order of `mapping.values()`. Any column not listed in the
    type table below is declared as "text".
    """
    columns_by_type = {
        "float": ["measure_id", "year_to_date_variance", "budget_variance", "measure_value", "target"],
        "boolean": ["year_to_date_ind", "has_target_ind"],
        "int": ["decimal_accuracy", "year", "period_number_in_year"],
    }
    # Invert the table so each column name maps straight to its type.
    type_of = {
        column: type_name
        for type_name, columns in columns_by_type.items()
        for column in columns
    }

    return [
        {"id": column, "type": type_of.get(column, "text")}
        for column in mapping.values()
    ]

In [5]:
# Download the measures and narratives feeds. A timeout keeps the cell from hanging
# forever on an unreachable host, and raise_for_status() turns an HTTP error page
# into an immediate, explicit failure instead of a confusing JSON/KeyError later on.
REQUEST_TIMEOUT = 60  # seconds

measures_response = requests.get(
    "https://contrib.wp.intra.prod-toronto.ca/app_content/tpp_measures/",
    timeout=REQUEST_TIMEOUT,
)
measures_response.raise_for_status()
measures = measures_response.json()

notes_response = requests.get(
    "https://contrib.wp.intra.prod-toronto.ca/app_content/tpp_narratives/",
    timeout=REQUEST_TIMEOUT,
)
notes_response.raise_for_status()
notes = notes_response.json()

# Flatten the measures, attach the narratives, and build the field definitions.
tmp_records = make_measures_records(measures["measures"])
records = join_narratives(tmp_records, notes)
fields = build_data_dict()

In [6]:
# The first element of measures["targets"] maps a measure id to its list of target points.
targets = measures["targets"][0]

# Collect one frame per measure and concatenate once at the end: DataFrame.append
# re-copied the accumulated frame on every iteration and no longer exists in pandas 2.x.
target_frames = []
for k, v in targets.items():
    df = pd.DataFrame(v)
    df["measure_id"] = float(k)  # float so it can be compared with the measure_id used in the merge below
    target_frames.append(df.rename(columns={"v": "target", "p": "period_number_in_year", "y": "year"}))
df_target = pd.concat(target_frames)

df_measure = pd.DataFrame(records)
# Left join: every measure record is kept and gains a target where one exists.
df_measure_target = pd.merge(df_measure, df_target, how='left', on=['measure_id', 'year', 'period_number_in_year'])

In [7]:
# Keep only the measure rows that received a target from the merge. Selecting with
# .loc and adding the flag through .assign() yields an independent frame, so no
# column is ever set on a slice of df_measure_target (SettingWithCopyWarning).
df_measure_with_target = df_measure_target.loc[
    df_measure_target['target'].notna(),
    ['measure_id', 'year', 'period_number_in_year', 'target'],
].assign(matched=True)
print('total target number:', len(df_target), '\nmacthed:', len(df_measure_with_target))

total target number: 774 
macthed: 750


In [7]:
# Left-join every target onto the measure rows that carry a target; targets without a
# counterpart end up with missing values in the columns coming from the right-hand frame.
join_keys = ['measure_id', 'year', 'period_number_in_year']
target_columns = df_target[['measure_id', 'year', 'period_number_in_year', 'target']]
compare_df = target_columns.merge(df_measure_with_target, how='left', on=join_keys)

In [8]:
# Show the targets for which the left join found no matching measure row
unmatched_mask = compare_df['matched'].ne(True)
compare_df[unmatched_mask]

Unnamed: 0,measure_id,year,period_number_in_year,target_x,target_y,matched
399,2.12,2020,10,63526731.0,,
400,2.12,2020,11,67280274.0,,
401,2.12,2020,12,64429309.0,,
537,2.17,2021,4,712895.0,,
538,2.17,2021,5,664891.0,,
539,2.17,2021,6,739639.0,,
540,2.17,2021,7,733744.0,,
541,2.17,2021,8,645728.0,,
542,2.17,2021,9,742224.0,,
543,2.17,2021,10,710018.0,,


In [9]:
# verify some target: look up one measure / year / period combination in the measure records
is_measure = df_measure['measure_id'] == 2.12
is_year = df_measure['year'] == 2020
is_period = df_measure['period_number_in_year'] == 12
df_measure[is_measure & is_year & is_period]

Unnamed: 0,measure_id,measure_name,interval_type,value_type,year_to_date_ind,has_target_ind,measure_value,year_to_date_variance,desired_direction,category,data_source_notes,year,period_number_in_year,keywords,notes,decimal_accuracy,city_perspective_note,budget_variance


In [8]:
# Read the API key from the environment so the secret never has to be pasted into
# (and saved with) the notebook. Falls back to an empty key, as before, when the
# CKAN_API_KEY variable is not set.
ckan = ckanapi.RemoteCKAN(
    address="https://ckanadmin0.intra.qa-toronto.ca/",
    apikey=os.environ.get("CKAN_API_KEY", ""),
)

In [31]:
# find target package and resource and purge content, otherwise create resource
package_name='toronto-progress-portal'
res_name='Toronto progress portal - Key metrics'
res_id=None
package=None  # stays None when the lookup fails, so the create step can be skipped cleanly

try:
    package=ckan.action.package_show(id=package_name)
    resources=package.pop('resources')
    # find datastore resource with same name and delete data
    for r1 in resources:
        if r1['name']==res_name: # and r1['datastore_active']:
            res_id = r1['id']
            ckan.action.datastore_delete(id=res_id, filters={}, force=True)
            ckan.action.resource_patch(id=res_id, datastore_active=True, is_preview=True)
            break
except Exception as e:
    # Report the underlying error instead of hiding it behind a bare except.
    print('Error finding package or purging resource:', e)

if package is None:
    # Without the package there is no package id to attach a new resource to.
    print('Skipping resource creation: package lookup failed')
elif not res_id:
    try:
        r = {
            'package_id': package['id'],
            'format': 'CSV',
            'extract_job': 'Python-progress',
            'name': res_name,
            'datastore_active': True,
            'is_preview': True,
        }
        res = ckan.action.datastore_create(
            resource=r,
            fields=fields,
            primary_key='measure_id,year,period_number_in_year',
            force=True,
        )
        res_id = res['resource_id']
    except Exception as e:
        print('Error creating resource:', e)

In [32]:
# Show the id of the datastore resource that was found or created in the previous cell
res_id

'17274ffd-9f64-4931-82e5-4b31675bf0e3'

In [33]:
# Push the merged records to the datastore in slices of 1000 rows, stopping at the
# first failure. The for/else reports completion only when no slice failed.
for start in range(0, len(df_measure_target), 1000):
    stop = start + 1000
    chunk = df_measure_target[start:stop].to_dict(orient="records")
    # NaN is not equal to itself, so `v == v` leaves missing values out of each record
    rec = [{k: v for k, v in row.items() if v == v} for row in chunk]
    try:
        print('Try:', start, stop)
        progress_insert = ckan.action.datastore_upsert(id=res_id, records=rec, force=True, method='insert')
    except Exception as e:
        print('Something is wrong:', e)
        print('Data loading aborted')
        break
else:
    print('Data loading completed:', len(df_measure_target))

Try: 0 1000
Try: 1000 2000
Try: 2000 3000
Try: 3000 4000
Try: 4000 5000
Try: 5000 6000
Try: 6000 7000
Try: 7000 8000
Try: 8000 9000
Data loading completed: 8286


In [None]:
# data from Pandas DataFrame df
with_nans = df.to_dict(orient="records")

# NaN compares unequal to itself (np.nan != np.nan), so the test `v == v` is False
# for NaN entries and can be used to filter them out of each record.

# ALTERNATIVE 1: create new list of records without NaNs using dictionary comprehension
without_nans = []
for r in with_nans:
    new_record = { k:v for k,v in r.items() if v == v }
    without_nans.append(new_record)

# ALTERNATIVE 2: can take it a step further by combining with list comprehension
without_nans = [ { k:v for k,v in r.items() if v == v } for r in with_nans ]

# ALTERNATIVE 3: can put it all in one line
without_nans = [ { k:v for k,v in r.items() if v == v } for r in df.to_dict(orient="records") ]

In [30]:
# ckan.action.datastore_delete(id=res_id,filters={},force=True)

In [29]:
# ckan.action.resource_patch(id=res_id, datastore_active=True, is_preview=True)