In [3]:
import json

import numpy as np
import pandas as pd
from pandas.io.json import json_normalize 
import logging

from functools import reduce

import itertools
import random
import psycopg2
from psycopg2.extras import Json
from sqlalchemy import create_engine, MetaData
import io
import pandas_profiling

## Get input JSON SChema

In [27]:
with open("schema/input/aqmSchema.complete.json") as f:
    d = json.load(f)

schema_df = pd.DataFrame()
schema_df = json_normalize(d)
schema_df

Unnamed: 0,$id,$schema,items.properties.__version__.pattern,items.properties.__version__.title,items.properties.__version__.type,items.properties._attachments.title,items.properties._attachments.type,items.properties._bamboo_dataset_id.pattern,items.properties._bamboo_dataset_id.title,items.properties._bamboo_dataset_id.type,...,items.properties.pregnancy/why_not_delivr_health_centr.pattern,items.properties.pregnancy/why_not_delivr_health_centr.title,items.properties.pregnancy/why_not_delivr_health_centr.type,items.properties.start.pattern,items.properties.start.title,items.properties.start.type,items.title,items.type,title,type
0,http://example.com/root.json,http://json-schema.org/draft-07/schema#,^(.*)$,The __version__ Schema,string,The _attachments Schema,array,^(.*)$,The _bamboo_dataset_id Schema,string,...,^(.*)$,The Pregnancy/why_not_delivr_health_centr Schema,string,^(.*)$,The Start Schema,string,The Items Schema,object,The Root Schema,array


In [801]:
list(schema_df)

['$id',
 '$schema',
 'items.properties.__version__.pattern',
 'items.properties.__version__.title',
 'items.properties.__version__.type',
 'items.properties._attachments.title',
 'items.properties._attachments.type',
 'items.properties._bamboo_dataset_id.pattern',
 'items.properties._bamboo_dataset_id.title',
 'items.properties._bamboo_dataset_id.type',
 'items.properties._geolocation.items.title',
 'items.properties._geolocation.items.type',
 'items.properties._geolocation.title',
 'items.properties._geolocation.type',
 'items.properties._id.title',
 'items.properties._id.type',
 'items.properties._notes.title',
 'items.properties._notes.type',
 'items.properties._status.pattern',
 'items.properties._status.title',
 'items.properties._status.type',
 'items.properties._submission_time.pattern',
 'items.properties._submission_time.title',
 'items.properties._submission_time.type',
 'items.properties._submitted_by.title',
 'items.properties._submitted_by.type',
 'items.properties._tags.t

## Get valid index List only

In [28]:
validIndexLists = []
for indexList in list(schema_df):
    if "._" not in indexList:
        if len(indexList.split(sep='.')) > 2:
            if indexList.split(sep='.')[-1] == 'type' or indexList.split(sep='.')[-1] == 'title':
                validIndexLists.append(indexList)
validIndexLists[:5]

['items.properties.dental_health/family_dental_problems.title',
 'items.properties.dental_health/family_dental_problems.type',
 'items.properties.dental_health/last_6_months_dental_fam_probl.title',
 'items.properties.dental_health/last_6_months_dental_fam_probl.type',
 'items.properties.dental_health/owning_toothbrush.title']

In [819]:
schema_df[['items.properties.dental_health/owning_toothbrush.title','items.properties.dental_health/owning_toothbrush.type']]

Unnamed: 0,items.properties.dental_health/owning_toothbrush.title,items.properties.dental_health/owning_toothbrush.type
0,The Dental_health/owning_toothbrush Schema,string


In [29]:
required_field_df = pd.DataFrame()
required_field_df = schema_df[validIndexLists]
required_field_df.T.head(4)

Unnamed: 0,0
items.properties.dental_health/family_dental_problems.title,The Dental_health/family_dental_problems Schema
items.properties.dental_health/family_dental_problems.type,string
items.properties.dental_health/last_6_months_dental_fam_probl.title,The Dental_health/last_6_months_dental_fam_pro...
items.properties.dental_health/last_6_months_dental_fam_probl.type,string


#### snippet code

In [856]:
required_field_df.T.head(4)[0]

items.properties.demographics.properties.active.title                           The Active Schema
items.properties.demographics.properties.active.type                                      boolean
items.properties.demographics.properties.address.properties.add_date.title    The Add_date Schema
items.properties.demographics.properties.address.properties.add_date.type                  string
Name: 0, dtype: object

## Clean values

In [34]:
def clean_value(x):
    return x.lower().replace("/", ".").replace("the", "").replace("schema", "").strip()



def clean_index(x):
    return x.lower().replace("/", ".").replace(".properties", "").replace("items.", "").strip()

newSchema_df = pd.DataFrame()
newSchema_df['value'] = required_field_df.T[0].apply(clean_value)
#newSchema_df['value'] = newSchema_df['value'].T.head(4)[0].apply(clean_value)

newSchema_df.reset_index(level=0, inplace=True)

newSchema_df.head(4)

Unnamed: 0,index,value
0,items.properties.dental_health/family_dental_p...,dental_health.family_dental_problems
1,items.properties.dental_health/family_dental_p...,string
2,items.properties.dental_health/last_6_months_d...,dental_health.last_6_months_dental_fam_probl
3,items.properties.dental_health/last_6_months_d...,string


In [37]:
newSchema_df['index'] = newSchema_df['index'].apply(clean_index)
newSchema_df.head(5)

Unnamed: 0,index,value
0,dental_health.family_dental_problems.title,dental_health.family_dental_problems
1,dental_health.family_dental_problems.type,string
2,dental_health.last_6_months_dental_fam_probl.t...,dental_health.last_6_months_dental_fam_probl
3,dental_health.last_6_months_dental_fam_probl.type,string
4,dental_health.owning_toothbrush.title,dental_health.owning_toothbrush


In [40]:
valueSchema_df = pd.DataFrame()

number = len(newSchema_df)
index = 0
counter = 0
counter1 = 1

array_key = []
array_type = []

while (counter < number):
    array_key.append(newSchema_df.iloc[counter]['index'].replace('.title',''))
    counter += 2
        
while (counter1 < number):
    array_type.append(newSchema_df.iloc[counter1]['value'])
    counter1 += 2

valueSchema_df['source_key'] = array_key
valueSchema_df['source_type'] = array_type


valueSchema_df.head(10)

Unnamed: 0,source_key,source_type
0,dental_health.family_dental_problems,string
1,dental_health.last_6_months_dental_fam_probl,string
2,dental_health.owning_toothbrush,string
3,dental_health.times_brush_teeth_daily,string
4,dental_health.used_for_cleaning_teeth,string
5,disability.types_of_disability,string
6,do_you_consent_awh,string
7,end,string
8,formhub.uuid,string
9,health_information.alcohol_within_week,string


## Get input data

In [48]:
with open("data/aqmData.partial.json") as f:
    d = json.load(f)

input_data = json_normalize(d)
input_data = input_data.fillna('no answer')
input_data

Unnamed: 0,__version__,_attachments,_bamboo_dataset_id,_geolocation,_id,_notes,_status,_submission_time,_submitted_by,_tags,...,identity_consent/town,meta/instanceID,personal_info/address_line_1,personal_info/civil_status,personal_info/date_of_birth,personal_info/first_name,personal_info/gender,personal_info/last_name,personal_info/type_of_registration,start
0,vEo56thCPymxS2RDBK2its,[],,"[None, None]",4296,[],submitted_via_web,2019-03-26T03:36:06,no answer,[],...,_,uuid:cfdfbff0-c6b5-4007-b20a-5f4794391ce9,Siam Reap,SI,2019-03-27,Houn,F,Hen,S,2019-03-26T11:38:36.149+08:00


## Get Mapping 

In [43]:
mapping_df = pd.read_csv('schema/map/map.csv', skiprows=0)
mapping_df

Unnamed: 0,source_key,destination_key
0,personal_info.date_of_birth,demographics.birthdate
1,personal_info.civil_status,demographics.civil_st


In [44]:
source_mapping_fields_l = list(mapping_df['source_key'])
source_mapping_fields_l

['personal_info.date_of_birth', 'personal_info.civil_status']

## Get values input data that are included mapping

In [983]:
input_data[['personal_info.date_of_birth', 'personal_info.civil_status']]

KeyError: "['personal_info.date_of_birth' 'personal_info.civil_status'] not in index"