In [246]:
import pandas as pd
import requests
import ckanapi
import math
import re

In [280]:
mapping = {
    "id": "measure_id",
    "m": "measure_name",
    "it": "interval_type",
    "vt": "value_type",
    # "ytd": "year_to_date_ind",
    # "ht": "has_target_ind",
    "v": "variance",
    "yv": "year_to_date_variance",
    "bv": "budget_variance",
    "da": "decimal_accuracy", # should this be precision?
    "dd": "desired_direction",
    "c": "category",
    "ds":"data_source_notes",
    "cp": "city_perspective_note",
    "y": "year",
    "p": "period_number_in_year",
    "v": "measure_value",
    "target":"target",
    "notes":"notes",
    "c": "category",
    # "kw": "keywords",
}

def get_category_measures(measures, category):
    subset = []
    for m in measures:
        assert len(m["c"]) == 1, f"Measure has more than 1 category: {m['c']}"
        if m["c"][0].lower() == category.lower():
            subset.append(m)
            
    return subset


def make_measures_records(measures):
    records = []
    
    for i in measures:
        item = { **i }
        data_points = item.pop("vs")
        
        assert len(i["c"]) == 1, f"Item '{i['m']}' ({i['id']}) belongs to more than 1 category: {item['c']}"
        
        item["c"] = item["c"][0]
        
        for dp in data_points:
            r = { k: v for k, v in {**item, **dp}.items() if v }
            r["m"] = r["m"].replace("\n", " ")
            r["ds"] = r["ds"].replace("&amp;", "&")
            r.pop("ytd")
            r.pop("ht")
            r.pop("kw")
            if "da" in r:
                r["da"] = int(r["da"])
            if "yv" in r:
                r["yv"] = float(r["yv"])
            if "bv" in r:
                r["bv"] = float(r["bv"])
            
            for original,updated in mapping.items():
                if original in r:
                    r[updated] = r.pop(original)

            records.append(r)
            
    return records

def join_narratives(records, narratives):
    new_records = []
    
    for record in records:
        r = {**record}
        
        n = narratives.get(f'{r[mapping["id"]]}')
        if n is not None and len(n)>1:
            r["notes"] = n.replace("<br />", "\n").strip("\n").strip()
            
        new_records.append(r)
    
    return new_records

def build_data_dict():
    data_dict = []
    
    for m in mapping.values():
        data_dict.append({
            "id": m,
            "type": "text",
        })
        
    for c in data_dict:
        if c["id"] in ["measure_id", "year_to_date_variance", "budget_variance", "measure_value","target"]:
            c["type"] = "float"
        # elif c["id"] in ["year_to_date_ind", "has_target_ind"]:
        #     c["type"] = "boolean"
        elif c["id"] in ["decimal_accuracy", "year", "period_number_in_year"]:
            c["type"] = "int"

        
    return  data_dict

def string_to_dict(string, pattern):
    regex = re.sub(r'{(.+?)}', r'(?P<_\1>.+)', pattern)
    values = list(re.search(regex, string).groups())
    keys = re.findall(r'{(.+?)}', pattern)
    _dict = dict(zip(keys, values))
    return _dict

def build_narratives_df(notes):
    p_map = {
        "January": 1,
        "February":2,
        "March":3,
        "April":4,
        "May":5,
        "June":6,
        "July":7,
        "August":8,
        "September":9,
        "October":10,
        "November":11,
        "December":12,
        "Spring":1,
        "Summer":2,
        "Fall":3,
        "Winter":4,
    }

    pattern1 = {"a":"^\[Quarter {period_number_in_year} {year}\]{note}$", "b":"\[Quarter \d \d{4}].*"}
    pattern2 = {"a":"^\[Annual {year}\]{note}$","b":"\[Annual \d{4}].*"}
    pattern3 = {"a":"^\[{period_number_in_year} {year}\]{note}$","b":"\[\w{3,15} \d{4}].*"}

    narratives=[]
    for k,v in notes.items():
        if len(v) > 10:
            for n in v.split('<br /><br />'):
                note = None
                nn = n.replace("<br />", "").strip()
                if re.fullmatch(pattern1["b"], nn, flags=0):
                    note = string_to_dict(nn,pattern1["a"])
                elif re.fullmatch(pattern2["b"], nn, flags=0):
                    note = string_to_dict(nn,pattern2["a"])
                    note["period_number_in_year"] = note["year"]
                elif re.fullmatch(pattern3["b"], nn, flags=0):
                    note = string_to_dict(nn,pattern3["a"])
                    note['period_number_in_year'] = p_map[note['period_number_in_year']]
                else:
                    None
                    # print("note does not match pattern:", n)

                if note:
                    note["year"] = int(note["year"])
                    note["period_number_in_year"] = int(note["period_number_in_year"])
                    note["measure_id"] = float(k)
                    narratives.append(note)

    return pd.DataFrame(naratives)

In [45]:
measures = requests.get("https://contrib.wp.intra.prod-toronto.ca/app_content/tpp_measures/").json()
notes = requests.get("https://contrib.wp.intra.prod-toronto.ca/app_content/tpp_narratives/").json()

tmp_records = make_measures_records(measures["measures"])
# records = join_narratives(tmp_records, notes)
fields = build_data_dict()

In [46]:
targets=measures["targets"][0]
df_target = pd.DataFrame()
for k, v in targets.items():
    df = pd.DataFrame(v)
    df["measure_id"] = float(k)
    df_target = df_target.append (df.rename(columns={"v":"target", "p":"period_number_in_year", "y":"year"}))
df_measure = pd.DataFrame(records)
df_measure_target = pd.merge(df_measure,df_target, how='left', on=['measure_id', 'year', 'period_number_in_year'])

In [47]:
df_measure_with_target = df_measure_target[df_measure_target['target'] == df_measure_target['target']][['measure_id', 'year', 'period_number_in_year','target']]
df_measure_with_target['matched']=True
print('total target number:', len(df_target), '\nmacthed:', len(df_measure_with_target))

total target number: 774 
macthed: 750


In [48]:
compare_df = pd.merge(df_target[['measure_id', 'year', 'period_number_in_year','target']], df_measure_with_target, how='left', on=['measure_id', 'year', 'period_number_in_year'])
df_target_wo_measure = compare_df[compare_df['matched'] != True][['measure_id','year','period_number_in_year','target_x']].rename(columns={"target_x":"target"})


In [49]:
df_target_wo_measure

Unnamed: 0,measure_id,year,period_number_in_year,target
399,2.12,2020,10,63526731.0
400,2.12,2020,11,67280274.0
401,2.12,2020,12,64429309.0
537,2.17,2021,4,712895.0
538,2.17,2021,5,664891.0
539,2.17,2021,6,739639.0
540,2.17,2021,7,733744.0
541,2.17,2021,8,645728.0
542,2.17,2021,9,742224.0
543,2.17,2021,10,710018.0


In [70]:
df_measure_wo_vs = df_measure_target.drop(columns=['year','period_number_in_year','measure_value','target']).drop_duplicates(keep='last')
df_measure_wo_vs['measure_value']=None


In [96]:
df_measure_wo_vs


Unnamed: 0,measure_id,measure_name,interval_type,value_type,year_to_date_variance,desired_direction,category,data_source_notes,notes,decimal_accuracy,city_perspective_note,budget_variance,measure_value
170,1.130,Number of Personal Bankruptcies (Ontario),m,n,0.05,Down,Community Vulnerability,Source: Industry Canada,[November 2020]\nDecrease in line with nationa...,,,,
341,1.120,Number of Business Bankruptcies (Ontario),m,n,0.05,Down,Economy,Source: Industry Canada,[November 2020]\nDecrease in line with nationa...,,,,
512,1.040,Employed City of Toronto Residents (000's),m,n,0.01,Up,Economy,Seasonally Adjusted 3-month moving average (S...,[January 2021]\nStatistics Canada rebased thei...,,,,
683,1.060,Percentage who are Self-Employed (Toronto Resi...,m,p,,Up,Economy,3 month moving average (Source: Statistics Ca...,,1.0,Colour scheme is based on the city 's perspect...,,
854,1.020,Unemployment Rate (Toronto Residents),m,p,,Down,Economy,Seasonally Adjusted 3-month moving average (S...,,1.0,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
8145,2.350,Sports and Recreation -Permit Activity - Numbe...,s,n,,Up,Services,"Source: City of Toronto, Parks, Forestry & Rec...",[Fall 2019]\nAt Riverlea Greenhouse the number...,,,,
8197,2.380,Free Centres – Registered Program Enrolment (#),s,n,,,Services,"Source: City of Toronto, Parks, Forestry & Rec...",,,,,
8249,2.390,Free Centres – Drop-in Attendance (#),s,n,,,Services,"Source: City of Toronto, Parks, Forestry & Rec...",,,,,
8267,2.041,TTC Annual Passenger Rides Peak (000s),y,n,,Up,Services,Source: Toronto Transit Commission,[Annual 2020]\nYear-end Note: Ridership for 20...,,,,


In [72]:
df_target_wo_vs = pd.merge(df_target_wo_measure,df_measure_wo_vs, how='left', on=['measure_id'])

In [226]:
# df_target_wo_vs

In [227]:
# df_target_wo_vs[df_measure_target.columns]

In [74]:
df = pd.concat([df_measure_target, df_target_wo_vs[df_measure_target.columns]])

In [250]:
df = df.drop(columns=["notes"])

In [289]:
len(df)

8310

In [281]:
df_narrative = build_narratives_df(notes)

In [291]:
len(df_narrative)

150

In [283]:
df_w_note = pd.merge(df,df_narrative, how='left', on=['measure_id', 'year', 'period_number_in_year'])

In [288]:
df_w_note[['measure_id', 'year', 'period_number_in_year','measure_value', 'target', 'note']][df_w_note['note']==df_w_note['note']]

Unnamed: 0,measure_id,year,period_number_in_year,measure_value,target,note
166,1.130,2020,11,747.0,,Decrease in line with national trend where Can...
337,1.120,2020,11,36.0,,Decrease in line with national trend where Can...
502,1.040,2020,5,1273.6,,Province wide emergency orders starting March ...
510,1.040,2021,1,1350.3,,Statistics Canada rebased their figures based ...
1362,1.110,2020,10,86940.0,,The Government of Canada made temporary change...
...,...,...,...,...,...,...
8041,2.330,2019,4,1348565.0,,Much of the decrease between 2018 and 2019 Q1 ...
8144,2.350,2019,3,733258.0,,At Riverlea Greenhouse the number of indoor pl...
8145,2.350,2019,4,895844.0,,Much of the increase between 2018 and 2019 Q1 ...
8267,2.041,2020,2020,96750.0,,Year-end Note: Ridership for 2020 was negative...


In [56]:
#verify some target 
df_measure[(df_measure['measure_id']==2.12) & (df_measure['year']==2020) & (df_measure['period_number_in_year']==12)]

Unnamed: 0,measure_id,measure_name,interval_type,value_type,measure_value,year_to_date_variance,desired_direction,category,data_source_notes,year,period_number_in_year,notes,decimal_accuracy,city_perspective_note,budget_variance


In [57]:
# df_measure_target[(df_measure_target['measure_id']==2.12) & (df_measure_target['year']==2020) & (df_measure_target['period_number_in_year']> 8)]

In [66]:
ckan = ckanapi.RemoteCKAN(
    address="https://ckanadmin0.intra.qa-toronto.ca/",
    apikey='2a38b27f-84e7-4669-87d4-b7a2afda87e3',
)

In [79]:
# find target package and resource and purge content, otherwise create resource
package_name='toronto-progress-portal'
res_name='Toronto progress portal - Key metrics'
res_id=None

try:
    package=ckan.action.package_show(id=package_name)
    resources=package.pop('resources')
    # find datastore resource with same name and delete data
    for r1 in resources:
        if r1['name']==res_name: # and r1['datastore_active']:
            res_id = r1['id']
            ckan.action.datastore_delete(id=res_id, filters={}, force=True)
            ckan.action.resource_patch(id=res_id, datastore_active=True, is_preview=True)
            break
except:
    print('Error finding package')

if not res_id:
    try:
        r = {'package_id': package['id'], 'format': 'CSV', 'extract_job': 'Python-progress','name': res_name, 'datastore_active': True,'is_preview': True}
        res = ckan.action.datastore_create( resource=r, fields=fields, primary_key='measure_id,year,period_number_in_year', force=True)
        res_id = res['resource_id']
    except:
        print('Error creating resource')

In [82]:
res_id

'12ae3f37-f786-435f-bfec-0914a2b521ef'

In [80]:
i=0
while i < len(df):
    rec=[ { k:v for k,v in r.items() if (v == v) and (v is not None) } for r in df[i:i+1000].to_dict(orient="records") ]
    try:
        print('Try:',i,i+1000)
        progress_insert=ckan.action.datastore_upsert(id=res_id, records=rec, force=True, method='upsert')
    except Exception as e:
        print('Something is wrong:', e)
        break
    i=i+1000

if i >= len(df):
    print('Data loading completed:', len(df))
else:
    print('Data loading aborted')

Try: 0 1000
Try: 1000 2000
Try: 2000 3000
Try: 3000 4000
Try: 4000 5000
Try: 5000 6000
Try: 6000 7000
Try: 7000 8000
Try: 8000 9000
Something is wrong: ['https://ckanadmin0.intra.qa-toronto.ca/api/action/datastore_upsert', 500, '{"help": "https://ckanadmin0.intra.qa-toronto.ca/api/3/action/help_show?name=datastore_upsert", "success": false, "error": {"message": "Internal Server Error", "__type": "Internal Server Error"}}']
Data loading aborted


In [None]:
# data from Pandas DataFrame df
with_nans = df.to_dict(orient="records")
​
# since np.nan != np.nan, can use it to filter. Essentially, knowing that the type is NOT a number doesn't mean both types ARE THE SAME non-number type.
​
# ALTERNATIVE 1: create new list of records without NaNs using dictionary comprehension
without_nans = []
for r in with_nans:
	new_record = { k:v for k,v in r.items() if v == v }
    without_nans.append(new_record)
    
# ALTERNATIVE 2: can take it a step further by combining with list comprehension
without_nans = [ { k:v for k,v in r.items() if v == v } for r in with_nans ]
​
# ALTERNATIVE 3: can put it all in one line
without_nans = [ { k:v for k,v in r.items() if v == v } for r in df.to_dict(orient="records") ]

In [30]:
# ckan.action.datastore_delete(id=res_id,filters={},force=True)

In [29]:
# ckan.action.resource_patch(id=res_id, datastore_active=True, is_preview=True)

In [244]:
# 

In [119]:
# note = "[January 2021]<br />Statistics Canada rebased their figures based on latest population estimate<br /><br />[May 2020]<br />Province wide emergency orders starting March as a result of COVID-19<br /><br />"
# note = note.split('<br /><br />')
# note = [v.replace("<br />", "") for v in notes if len(v)>1 ]

In [180]:
import re  

In [223]:
def string_to_dict(string, pattern):
    regex = re.sub(r'{(.+?)}', r'(?P<_\1>.+)', pattern)
    values = list(re.search(regex, string).groups())
    keys = re.findall(r'{(.+?)}', pattern)
    _dict = dict(zip(keys, values))
    return _dict

p_map = {
    "January": 1,
    "February":2,
    "March":3,
    "April":4,
    "May":5,
    "June":6,
    "July":7,
    "August":8,
    "September":9,
    "October":10,
    "November":11,
    "December":12,
    "Spring":1,
    "Summer":2,
    "Fall":3,
    "Winter":4,
}

pattern1 = {"a":"^\[Quarter {period_number_in_year} {year}\]{note}$", "b":"\[Quarter \d \d{4}].*"}
pattern2 = {"a":"^\[Annual {year}\]{note}$","b":"\[Annual \d{4}].*"}
pattern3 = {"a":"^\[{period_number_in_year} {year}\]{note}$","b":"\[\w{3,15} \d{4}].*"}



In [276]:
narratives=[]
for k,v in notes.items():
    if len(v) > 10:
        for n in v.split('<br /><br />'):
            note = None
            nn = n.replace("<br />", "").strip()
            if re.fullmatch(pattern1["b"], nn, flags=0):
                note = string_to_dict(nn,pattern1["a"])
            elif re.fullmatch(pattern2["b"], nn, flags=0):
                note = string_to_dict(nn,pattern2["a"])
                note["period_number_in_year"] = note["year"]
            elif re.fullmatch(pattern3["b"], nn, flags=0):
                note = string_to_dict(nn,pattern3["a"])
                note['period_number_in_year'] = p_map[note['period_number_in_year']]
            else:
                print("note does not match pattern:", n)
            if note:
                note["year"] = int(note["year"])
                note["period_number_in_year"] = int(note["period_number_in_year"])
                note["measure_id"] = float(k)
                narratives.append(note)
df_narrative=pd.DataFrame(narratives)
df_narrative

note does not match pattern: 
note does not match pattern: 
note does not match pattern: 
note does not match pattern: 
note does not match pattern: 
note does not match pattern: 
note does not match pattern: 
note does not match pattern: 
note does not match pattern: 
note does not match pattern: 
note does not match pattern: 
note does not match pattern: 
note does not match pattern: 
note does not match pattern: 
note does not match pattern: 
note does not match pattern: 
note does not match pattern: 
note does not match pattern: 
note does not match pattern: 
note does not match pattern: 
note does not match pattern: 
note does not match pattern: 
note does not match pattern: 
note does not match pattern: 
note does not match pattern: 
note does not match pattern: 
note does not match pattern: 
note does not match pattern: 
note does not match pattern: 
note does not match pattern: 
note does not match pattern: 
note does not match pattern: 
note does not match pattern: 
note does 

Unnamed: 0,period_number_in_year,year,note,measure_id
0,1,2021,Statistics Canada rebased their figures based ...,1.040
1,5,2020,Province wide emergency orders starting March ...,1.040
2,4,2019,Average Actual Hours at Main Job could fluctua...,1.051
3,12,2020,Renewed lockdown measures and restrictions shu...,1.070
4,11,2020,Reduced exports of Machinery/Vehicles/Equipmen...,1.100
...,...,...,...,...
145,12,2020,Crime trends in 2020 have been impacted by the...,3.030
146,12,2020,Crime trends in 2020 have been impacted by the...,3.040
147,12,2020,Crime trends in 2020 have been impacted by the...,3.050
148,12,2020,Crime trends in 2020 have been impacted by the...,3.060


In [290]:
len(narratives)

150

In [149]:
string_to_dict(narrative,pattern2)

{'period_number_in_year': 'January',
 'year': '2021',
 'note': 'Statistics Canada rebased their figures based on latest population estimate'}

In [266]:
records[0]

{'measure_id': 1.13,
 'measure_name': 'Number of Personal Bankruptcies (Ontario)',
 'interval_type': 'm',
 'value_type': 'n',
 'measure_value': 2307,
 'year_to_date_variance': 0.05,
 'desired_direction': 'Down',
 'category': 'Community Vulnerability',
 'data_source_notes': 'Source: Industry Canada',
 'year': 2007,
 'period_number_in_year': 1,
 'notes': '[November 2020]\nDecrease in line with national trend where Canada saw a significant decrease in bankruptcies. Government support programs in place.'}