In [88]:
import pickle
from IPython.display import Markdown as md
from time import gmtime, strftime
from platform import platform
import pandas as pd

In [106]:
def success(message: str) -> str:
    """Format ``message`` as a passed check.

    Returns ``message`` followed by an HTML ``<span>`` coloured green that
    holds a check mark.
    """
    passed_template = '%s<span style="color:green;"> ... ✓ </span>'
    return passed_template % message

def failure(message: str) -> str:
    """Format ``message`` as a failed check.

    Returns ``message`` followed by an HTML ``<span>`` coloured red that
    holds an ``X``.
    """
    failed_template = '%s<span style="color:red;"> ... X</span>'
    return failed_template % message

def check_conditional(key: str, value: str) -> str:
    """Render ``value`` as a passed or failed check, depending on ``key``.

    A ``key`` containing the substring ``'_SUCCESS'`` is formatted with
    ``success``; every other key is formatted with ``failure``.
    """
    formatter = success if '_SUCCESS' in key else failure
    return formatter(value)

In [107]:
# NOTE: unpickling can execute arbitrary code, so only load a report.pickle
# that was written by a trusted run of the pipeline.
# The context manager closes the file handle as soon as the report is read.
with open("../tmp/report.pickle", "rb") as report_file:
    report = pickle.load(report_file)

# WHO PHSM Cleaning Technical Report

Generated by **WHO_PHSM_Cleaning**. More information available on [GitHub](https://github.com/lshtm-gis/WHO_PHSM_Cleaning).

For more details on the processing routine, please see the [project documentation](https://lshtm-gis.github.io/WHO_PHSM_Cleaning/html/).

*See a problem with the data or processing routine? Please [open an issue](https://github.com/lshtm-gis/WHO_PHSM_Cleaning/issues/new).*

## Session Details
***


In [108]:
# gmtime() gives the current time in UTC; format it as YYYY-MM-DD HH:MM:SS.
generated_at = strftime("%Y-%m-%d %H:%M:%S", gmtime())
md('<b>Report generated:</b> ' + generated_at)

<b>Report generated:</b> 2020-10-21 17:30:19

In [109]:
# platform() returns one string describing the machine that built the report.
platform_description = platform()
md('<b>Platform:</b> ' + platform_description)

<b>Platform:</b> Darwin-19.6.0-x86_64-i386-64bit

## Preprocessing
***

### Description

Ingests provider data and checks the input data format.

In [110]:
# Tabulate the preprocessing report entries, discarding rows with no 'key'.
preprocess = (
    pd.DataFrame(report['preprocess'])
    .dropna(subset=['key'])
)

### Summary

In [111]:
def _summary_line(records, key: str, label: str) -> str:
    """Build one ``'<label> = <value>'`` line of the summary.

    Args:
        records: Iterable of report entries; assumed to be dicts carrying
            ``'key'`` and ``'value'`` items (TODO confirm against the code
            that writes the report).
        key: Entry key to look up, e.g. ``'INPUT_RECORDS'``.
        label: Human-readable text placed before the value.

    Returns:
        ``label`` joined to the value of the first entry whose key equals
        ``key``, or to ``'not reported'`` when no such entry has a value.
    """
    # .get() tolerates entries that lack a 'key' or 'value' item.
    matches = (entry.get('value') for entry in records if entry.get('key') == key)
    value = next(matches, None)
    if value is None:
        return label + ' = not reported'
    # str() keeps the concatenation valid when the value is not a string.
    return label + ' = ' + str(value)


tot = _summary_line(report['preprocess'], 'INPUT_RECORDS', 'Total input records')
jh = _summary_line(report['preprocess'], 'JH_HIT_RECORDS', 'JH_HIT Records')
cdc = _summary_line(report['preprocess'], 'CDC_ITF_RECORDS', 'CDC_ITF Records')
acaps = _summary_line(report['preprocess'], 'ACAPS_RECORDS', 'ACAPS Records')

md('<br>'.join([tot, jh, cdc, acaps]))

Total input records = 38170<br>JH_HIT Records = 11762<br>CDC_ITF Records = 7714<br>ACAPS Records = 18694

### Checks

In [114]:
# Keep only the rows whose key mentions 'INPUT_CHECK'.
input_check_rows = preprocess.loc[['INPUT_CHECK' in key for key in preprocess['key']], :]

# Format every selected row as a passed/failed line.
s = [
    check_conditional(row['key'], row['value'])
    for _, row in input_check_rows.iterrows()
]

md('<br>'.join(s))

<span style="color:green;">JH_HIT input columns OK.</span><br><span style="color:green;">CDC_ITF input columns OK.</span><br><span style="color:green;">ACAPS input columns OK.</span>

## Processing
***
  
### Description

Transforms individual records. Converts column names, parses date formats, applies custom changes for each dataset.

In [117]:
# Tabulate the processing report entries, discarding rows with no 'key'.
process = (
    pd.DataFrame(report['process'])
    .dropna(subset=['key'])
)

### Summary
*TODO: report the number of records for each dataset here.*

### Checks

In [119]:
# Keep only the rows whose key mentions 'OUTPUT_CHECK'.
output_check_rows = process.loc[['OUTPUT_CHECK' in key for key in process['key']], :]

# Format every selected row as a passed/failed line.
s = [
    check_conditional(row['key'], row['value'])
    for _, row in output_check_rows.iterrows()
]

md('<br>'.join(s))

<span style="color:green;">Column names agree.</span><br><span style="color:green;">No Duplicate who_id.</span><br><span style="color:green;">No Duplicate prev_measure_number.</span><br><span style="color:green;">No Duplicate following_measure_number.</span><br><span style="color:green;">No unexpected values in admin_level.</span><br><span style="color:green;">No unexpected values in enforcement.</span><br><span style="color:green;">No unexpected values in keep.</span><br><span style="color:green;">No unexpected values in link_eng.</span><br><span style="color:red;">Unexpected values in measure_stage: Lift, Ease, Introduction / extension of measures, Strengthen, Impose, Pause, Phase-out measure.</span><br><span style="color:red;">Unexpected values in non_compliance_penalty: nan, Up to detention, Other, No, Up to Detention, Arrest/Detention, Other (add in comments), Not applicable, Yes, Legal Action, Legal action, Refusal to enter the country, Not Applicable, Refusal to Enter the Country, Fines, Yes , yes, Not available , No', Not Available, Not available.</span><br><span style="color:green;">No unexpected values in processed.</span><br><span style="color:green;">No unexpected values in reason_ended.</span><br><span style="color:green;">No unknown values in iso_3166_1_numeric.</span><br><span style="color:green;">No unknown values in who_region.</span><br><span style="color:green;">No unknown values in country_territory_area.</span><br><span style="color:green;">No unknown values in who_code.</span><br><span style="color:green;">No unknown values in who_measure.</span><br><span style="color:green;">No unknown values in who_subcategory.</span><br><span style="color:green;">No unknown values in who_category.</span>

## Postprocessing
***

### Description
  
Applies dataset-level changes. Applies custom changes that affect multiple records from a data provider. 

In [120]:
# Tabulate the postprocessing report entries, discarding rows with no 'key'.
postprocess = (
    pd.DataFrame(report['postprocess'])
    .dropna(subset=['key'])
)

### Summary
*TODO: report the number of records for each dataset here.*

### Checks

In [None]:
# Keep only the rows whose key mentions 'OUTPUT_CHECK'.
output_check_rows = postprocess.loc[['OUTPUT_CHECK' in key for key in postprocess['key']], :]

# Format every selected row as a passed/failed line.
s = [
    check_conditional(row['key'], row['value'])
    for _, row in output_check_rows.iterrows()
]

md('<br>'.join(s))

## Combination
***

### Description
Combines update data with cleansed data.

In [None]:
# Tabulate the combine report entries, discarding rows with no 'key'.
# NOTE(review): this rebinds `postprocess`, the name the Postprocessing
# section assigns from report['postprocess']; from here on it holds the
# combine entries. Consider a dedicated name together with the checks cell
# that reads it.
postprocess = (
    pd.DataFrame(report['combine'])
    .dropna(subset=['key'])
)

### Checks

In [None]:
# NOTE(review): `postprocess` is assigned from report['combine'] in the
# preceding cell, so these are the combination checks despite the name.
# Keep only the rows whose key mentions 'OUTPUT_CHECK'.
output_check_rows = postprocess.loc[['OUTPUT_CHECK' in key for key in postprocess['key']], :]

# Format every selected row as a passed/failed line.
s = [
    check_conditional(row['key'], row['value'])
    for _, row in output_check_rows.iterrows()
]

md('<br>'.join(s))