In [2]:
import json

import numpy as np
import pandas as pd
from pandas.io.json import json_normalize 
import logging

from functools import reduce

import itertools
import random
import psycopg2
from psycopg2.extras import Json
from sqlalchemy import create_engine, MetaData
import io
import pandas_profiling

## Get output JSON SChema

In [16]:
with open("schema/output/elasticSchema.json") as f:
    d = json.load(f)

schema_df = pd.DataFrame()
schema_df = json_normalize(d)
schema_df

Unnamed: 0,demographics.properties.active.type,demographics.properties.address.properties.add_date.type,demographics.properties.address.properties.commnty.type,demographics.properties.address.properties.country.type,demographics.properties.address.properties.province.type,demographics.properties.address.properties.zip.type,demographics.properties.address.type,demographics.properties.are_you_currently_earning.type,demographics.properties.awh_id.type,demographics.properties.birth_date.format,...,demographics.properties.employed.properties.m_income.type,demographics.properties.employed.properties.nature.type,demographics.properties.employed.type,demographics.properties.org.type,demographics.properties.religion.type,demographics.properties.sex.type,demographics.properties.version.properties.date.type,demographics.properties.version.properties.number.type,demographics.properties.version.type,demographics.properties.where_income_from.type
0,boolean,date,keyword,keyword,keyword,keyword,object,keyword,keyword,MM/dd/yyyy || yyyy-MM-dd,...,float,keyword,object,keyword,keyword,keyword,date,integer,object,keyword


In [4]:
list(schema_df)

['$id',
 '$schema',
 'items.properties.__version__.pattern',
 'items.properties.__version__.title',
 'items.properties.__version__.type',
 'items.properties._attachments.title',
 'items.properties._attachments.type',
 'items.properties._bamboo_dataset_id.pattern',
 'items.properties._bamboo_dataset_id.title',
 'items.properties._bamboo_dataset_id.type',
 'items.properties._geolocation.items.title',
 'items.properties._geolocation.items.type',
 'items.properties._geolocation.title',
 'items.properties._geolocation.type',
 'items.properties._id.title',
 'items.properties._id.type',
 'items.properties._notes.title',
 'items.properties._notes.type',
 'items.properties._status.pattern',
 'items.properties._status.title',
 'items.properties._status.type',
 'items.properties._submission_time.pattern',
 'items.properties._submission_time.title',
 'items.properties._submission_time.type',
 'items.properties._submitted_by.title',
 'items.properties._submitted_by.type',
 'items.properties._tags.t

## Get valid index List only

In [27]:
validIndexLists = []
for indexList in list(schema_df):
    if "._" not in indexList:
        if len(indexList.split(sep='.')) > 2:
            if indexList.split(sep='.')[-1] == 'type' or indexList.split(sep='.')[-1] == 'title':
                validIndexLists.append(indexList)
validIndexLists[:5]

['demographics.properties.active.type',
 'demographics.properties.address.properties.add_date.type',
 'demographics.properties.address.properties.commnty.type',
 'demographics.properties.address.properties.country.type',
 'demographics.properties.address.properties.province.type']

In [6]:
schema_df[['items.properties.dental_health/owning_toothbrush.title','items.properties.dental_health/owning_toothbrush.type']]

Unnamed: 0,items.properties.dental_health/owning_toothbrush.title,items.properties.dental_health/owning_toothbrush.type
0,The Dental_health/owning_toothbrush Schema,string


In [23]:
required_field_df = pd.DataFrame()
required_field_df = schema_df[validIndexLists]
required_field_df.T

Unnamed: 0,0
demographics.properties.active.type,boolean
demographics.properties.address.properties.add_date.type,date
demographics.properties.address.properties.commnty.type,keyword
demographics.properties.address.properties.country.type,keyword
demographics.properties.address.properties.province.type,keyword
demographics.properties.address.properties.zip.type,keyword
demographics.properties.address.type,object
demographics.properties.are_you_currently_earning.type,keyword
demographics.properties.awh_id.type,keyword
demographics.properties.birth_date.type,date


#### snippet code

## Clean values

In [24]:
def clean_value(x):
    return x.lower().replace("/", ".").replace("the", "").replace("schema", "").strip()



def clean_index(x):
    return x.lower().replace("/", ".").replace(".properties", "").replace("items.", "").strip()

newSchema_df = pd.DataFrame()
newSchema_df['value'] = required_field_df.T[0].apply(clean_value)
#newSchema_df['value'] = newSchema_df['value'].T.head(4)[0].apply(clean_value)

newSchema_df.reset_index(level=0, inplace=True)

newSchema_df.head(4)

Unnamed: 0,index,value
0,demographics.properties.active.type,boolean
1,demographics.properties.address.properties.add...,date
2,demographics.properties.address.properties.com...,keyword
3,demographics.properties.address.properties.cou...,keyword


In [26]:
newSchema_df['index'] = newSchema_df['index'].apply(clean_index)
newSchema_df

Unnamed: 0,index,value
0,demographics.active.type,boolean
1,demographics.address.add_date.type,date
2,demographics.address.commnty.type,keyword
3,demographics.address.country.type,keyword
4,demographics.address.province.type,keyword
5,demographics.address.zip.type,keyword
6,demographics.address.type,object
7,demographics.are_you_currently_earning.type,keyword
8,demographics.awh_id.type,keyword
9,demographics.birth_date.type,date


In [21]:
valueSchema_df = pd.DataFrame()

number = len(newSchema_df)
index = 0
counter = 0
counter1 = 1

array_key = []
array_type = []

while (counter < number):
    array_key.append(newSchema_df.iloc[counter]['index'].replace('.title',''))
    counter += 2
        
while (counter1 < number):
    array_type.append(newSchema_df.iloc[counter1]['value'])
    counter1 += 2

valueSchema_df['source_key'] = array_key
valueSchema_df['source_type'] = array_type


valueSchema_df.head(10)

ValueError: Length of values does not match length of index

## Get input data

In [12]:
with open("data/aqmData.partial.json") as f:
    d = json.load(f)

input_data = json_normalize(d)
input_data = input_data.fillna('no answer')
input_data

Unnamed: 0,__version__,_attachments,_bamboo_dataset_id,_geolocation,_id,_notes,_status,_submission_time,_submitted_by,_tags,...,identity_consent/town,meta/instanceID,personal_info/address_line_1,personal_info/civil_status,personal_info/date_of_birth,personal_info/first_name,personal_info/gender,personal_info/last_name,personal_info/type_of_registration,start
0,vEo56thCPymxS2RDBK2its,[],,"[None, None]",4296,[],submitted_via_web,2019-03-26T03:36:06,no answer,[],...,_,uuid:cfdfbff0-c6b5-4007-b20a-5f4794391ce9,Siam Reap,SI,2019-03-27,Houn,F,Hen,S,2019-03-26T11:38:36.149+08:00


## Get Mapping 

In [13]:
mapping_df = pd.read_csv('schema/map/map.csv', skiprows=0)
mapping_df

Unnamed: 0,source_key,destination_key
0,personal_info.date_of_birth,demographics.birthdate
1,personal_info.civil_status,demographics.civil_st


In [14]:
source_mapping_fields_l = list(mapping_df['source_key'])
source_mapping_fields_l

['personal_info.date_of_birth', 'personal_info.civil_status']

## Get values input data that are included mapping

In [15]:
input_data[['personal_info.date_of_birth', 'personal_info.civil_status']]

KeyError: "['personal_info.date_of_birth' 'personal_info.civil_status'] not in index"