In [1366]:
import numpy as np
import pandas as pd
import json
from pandas.io.json import json_normalize 
import logging
from functools import reduce

## Configuration 

In [1367]:
#etl = 'kobo2elastic'
etl = 'curis2elastic'
#etl = 'oldcuris2newcuris'

input_schema_file = ''
input_data_file = ''
mapping_file = ''
    
if etl == 'curis2elastic':
    #old curis to elasticsearch
    input_schema_file = 'schema/input/curisSchema.1-item.json'
    input_data_file = 'data/curisData.1-items.json'
    mapping_file = 'schema/map/couchbase2elastic.map.csv'
elif etl == 'kobo2elastic':
    #kobo to elasticsearch
    input_schema_file = 'schema/input/aqmSchema.complete.json'
    input_data_file = 'data/aqmData.2-items.json'
    mapping_file = 'schema/map/kobo2elastic.map.csv'
elif etl == 'oldcuris2newcuris':
    #kobo to elasticsearch
    input_schema_file = 'schema/input/curisData.1-Schema.avro.json'
    input_data_file = 'data/curisData.1-items.json'
    mapping_file = 'schema/map/kobo2elastic.map.csv'

In [1368]:

def flatten_json(nested_json):
    """
        Flatten json object with nested keys into a single level.
        Args:
            nested_json: A nested json object.
        Returns:
            The flattened json object if successful, None otherwise.
    """
    out = {}

    def flatten(x, name=''):
        if type(x) is dict:
            for a in x:
                flatten(x[a], name + a + '.')
        elif type(x) is list:
            i = 0
            for a in x:
                flatten(a, name + '[' + str(i) + '].')
                i += 1
        else:
            out[name[:-1]] = x

    flatten(nested_json)
    return out

In [1369]:
def flattenDict(d, result=None):
    if result is None:
        result = {}
    for key in d:
        value = d[key]
        if isinstance(value, str):
            if "\n" in value:
                print('----------------',value) 
                value = value.replace("\n", ' and ')
                print('----------------',value) 

        if isinstance(value, dict):
        #if d['type'] =='object':
            value1 = {}
            for keyIn in value:
                value1[".".join([key,keyIn])]=value[keyIn]
            flattenDict(value1, result)
        elif isinstance(value, (list, tuple)):   
            for indexB, element in enumerate(value):
                if isinstance(element, dict):
                    value1 = {}
                    index = 0
                    for keyIn in element:
                        newkey = ".".join([key,keyIn])        
                        value1[".".join([key,keyIn])]=value[indexB][keyIn]
                        index += 1
                    for keyA in value1:
                        flattenDict(value1, result)   
        else:
            result[key]=value
    return result

## Get input JSON SChema

In [1370]:
d = {}
with open(input_schema_file) as f:
    d = json.load(f)

#d['items']['fields']
schema_df = pd.DataFrame()
schema_df = json_normalize(d)
schema_df.T

Unnamed: 0,0
$id,http://example.com/root.json
$schema,http://json-schema.org/draft-07/schema#
properties.address.$id,#/properties/address
properties.address.items.$id,#/properties/address/items
properties.address.items.properties.barangay.$id,#/properties/address/items/properties/barangay
properties.address.items.properties.barangay.pattern,^(.*)$
properties.address.items.properties.barangay.title,The Barangay Schema
properties.address.items.properties.barangay.type,string
properties.address.items.properties.country.$id,#/properties/address/items/properties/country
properties.address.items.properties.country.pattern,^(.*)$


In [1371]:
#flattenDict(d)
flatten_json(d)

{'$schema': 'http://json-schema.org/draft-07/schema#',
 '$id': 'http://example.com/root.json',
 'type': 'object',
 'title': 'The Root Schema',
 'properties.id.$id': '#/properties/id',
 'properties.id.type': 'string',
 'properties.id.title': 'The Id Schema',
 'properties.id.pattern': '^(.*)$',
 'properties.address.$id': '#/properties/address',
 'properties.address.type': 'array',
 'properties.address.title': 'The Address Schema',
 'properties.address.items.$id': '#/properties/address/items',
 'properties.address.items.type': 'object',
 'properties.address.items.title': 'The Items Schema',
 'properties.address.items.properties.barangay.$id': '#/properties/address/items/properties/barangay',
 'properties.address.items.properties.barangay.type': 'string',
 'properties.address.items.properties.barangay.title': 'The Barangay Schema',
 'properties.address.items.properties.barangay.pattern': '^(.*)$',
 'properties.address.items.properties.country.$id': '#/properties/address/items/properties/co

## Get valid index List only (for JSON Schema Only)

In [1372]:
validIndexLists = []
for indexList in list(schema_df):
    if "._" not in indexList:
        if len(indexList.split(sep='.')) > 2:
            if indexList.split(sep='.')[-1] == 'type' or indexList.split(sep='.')[-1] == 'title':
                validIndexLists.append(indexList)

In [1373]:
required_field_df = pd.DataFrame()
required_field_df = schema_df[validIndexLists]
required_field_df.T

Unnamed: 0,0
properties.address.items.properties.barangay.title,The Barangay Schema
properties.address.items.properties.barangay.type,string
properties.address.items.properties.country.title,The Country Schema
properties.address.items.properties.country.type,string
properties.address.items.properties.fax_number.items.properties.country_code.title,The Country_code Schema
properties.address.items.properties.fax_number.items.properties.country_code.type,string
properties.address.items.properties.fax_number.items.properties.number.title,The Number Schema
properties.address.items.properties.fax_number.items.properties.number.type,string
properties.address.items.properties.fax_number.items.title,The Items Schema
properties.address.items.properties.fax_number.items.type,object


## Clean index

In [1383]:
def clean_value(x):
    return x.lower().replace("/", ".").replace("the", "").replace("schema", "").strip()

def clean_index(x):
    return x.lower().replace("/", ".").replace("properties.", "").replace("items.", "").strip()

newSchema_df = pd.DataFrame()
newSchema_df['value'] = required_field_df.T[0].apply(clean_value)
newSchema_df.reset_index(level=0, inplace=True)
newSchema_df['index'] = newSchema_df['index'].apply(clean_index)
newSchema_df.sort_values(['index'])
newSchema_df.head(40)

Unnamed: 0,index,value
0,address.barangay.title,barangay
1,address.barangay.type,string
2,address.country.title,country
3,address.country.type,string
4,address.fax_number.country_code.title,country_code
5,address.fax_number.country_code.type,string
6,address.fax_number.number.title,number
7,address.fax_number.number.type,string
8,address.fax_number.title,items
9,address.fax_number.type,object


In [1382]:
valueSchema_df = pd.DataFrame()

number = len(newSchema_df)
index = 0
counter = 0
counter1 = 1

array_key = []
array_type = []

while (counter < number):
    array_key.append(newSchema_df.iloc[counter]['index'].replace('.title',''))
    counter += 2
        
while (counter1 < number):
    array_type.append(newSchema_df.iloc[counter1]['value'])
    counter1 += 2

valueSchema_df['source_key'] = array_key
valueSchema_df['source_type'] = array_type


valueSchema_df = valueSchema_df.sort_values(['source_key']).reset_index(drop=True)
valueSchema_df.loc[valueSchema_df['source_type'] == 'array']

Unnamed: 0,source_key,source_type
0,address,array
5,address.fax_number,array
9,address.landline_number,array
14,address.mobile_number,array
22,family_members,array
26,health_informations,array
43,health_informations.diagnosed,array
46,health_informations.family_history,array
55,households,array
58,households.amenities_present_in_house,array


In [1376]:
valueSchema_df.loc[valueSchema_df['source_type'] == 'object']

Unnamed: 0,source_key,source_type
1,address,object
4,address.fax_number,object
8,address.landline_number,object
13,address.mobile_number,object
25,health_informations,object
28,health_informations.blood_pressure,object
29,health_informations.blood_pressure.first_reading,object
32,health_informations.blood_pressure.second_reading,object
35,health_informations.blood_pressure.third_reading,object
56,households,object


In [1377]:
valueSchema_df.loc[valueSchema_df['source_type'] == 'string']

Unnamed: 0,source_key,source_type
2,address.barangay,string
3,address.country,string
6,address.fax_number.country_code,string
7,address.fax_number.number,string
10,address.landline_number.country_code,string
11,address.landline_number.number,string
12,address.lot_or_house_number,string
15,address.mobile_number.country_code,string
16,address.mobile_number.number,string
17,address.postal_code,string


In [1378]:
d = []
with open(input_data_file) as f:
    #d = json.load(f)
    d = json.load(f)
    print('----')
    print(d)
    print('---type--')
print(type(d))

----
{'id': '0003ff38-28fb-4005-9437-d276cbb9da4d', 'address': [{'barangay': 'naganacan', 'country': 'Philippines', 'lot_or_house_number': '', 'postal_code': '', 'province': 'santa maria isabela', 'fax_number': [{'country_code': '+63', 'number': '9914293423'}], 'mobile_number': [{'country_code': '+63', 'number': '9914293423'}], 'landline_number': [{'country_code': '+03', 'number': '64752233'}, {'country_code': '+03', 'number': '12345686'}, {'country_code': '+02', 'number': '34223212'}]}, {'barangay': 'siam', 'country': 'Reap', 'lot_or_house_number': '', 'postal_code': '', 'province': 'santa maria isabela', 'contact_number': [{'fax_number': [{'country_code': '+63', 'number': '9914293423'}]}, {'mobile_number': [{'country_code': '+63', 'number': '9914293423'}]}, {'landline_number': [{'country_code': '+22', 'number': '22222'}, {'country_code': '+22', 'number': '333333'}]}]}], 'birthdate': '09/19/1951', 'email_address': None, 'family_members': ['5289d20e-c80f-4c9e-9e79-7cd3cc2a3e90', '00021

## Get input data

In [1379]:

flat_json_0 = {}
flat_json_1 = {}
flat_json_2 = {}

flat_json_0 = flattenDict(d[0])
print('--flatten_dict_type--')
print(type(flat_json_0))
print('--flatten_dict_data--')
flat_json_0

print('--normalize flatten_dict--')
json_flat_norm_0 = json_normalize(flat_json_0)


flat_json_1 = flattenDict(d[1])
json_flat_norm_1 = json_normalize(flat_json_1)

flat_json_2 = flattenDict(d[2])
json_flat_norm_2 = json_normalize(flat_json_2)



KeyError: 0

In [None]:
json_flat_norm_0

In [None]:
json_flat_norm_1

In [None]:
json_flat_norm_2

## TESTING: flattendict with nested array of objects

In [None]:
dx = []
dx = {}
dx_list = []
dx_dict = {}
dx_normalize_df = pd.DataFrame()

input_data_file_test = 'data/curisData.1-items.json'
with open(input_data_file_test) as f:
    dx_list = json.load(f)
    #dx = json.load(f)

type(dx_list)

In [None]:
dx_flatten = flatten_json(dx_list[0])
dx_flatten

In [None]:
dx_normalize_df = json_normalize(dx_flatten)

dx_normalize_df.T.to_csv('processed_data.csv',sep=',')
type(dx_normalize_df)

In [None]:
list(dx_normalize_df.columns)

In [None]:
dx_normalize_df

## TODO: ITERATE OVER LIST and MERGE normalize headers

In [None]:
input_data_df = pd.DataFrame()
input_data_df = json_flat_norm_0
input_data_df = input_data_df.append(json_flat_norm_1, sort=True)
input_data_df = input_data_df.append(json_flat_norm_2, sort=True)
input_data_df.sort_values(['demographics.awh_id']).reset_index(drop=True)
#input_data_df.sort_values(p'd)

In [None]:
type(input_data_df)

## Clean input data column header

In [None]:
input_data_df.columns = input_data_df.columns.str.lower().str.replace('/','.')
input_data_df

## Get Mapping 

In [None]:
mapping_df = pd.read_csv(mapping_file, skiprows=0)
mapping_df

## Get values input data that are included mapping

In [None]:
selected_data_df = pd.DataFrame()
selected_data_df = input_data_df[list(mapping_df['source_key'])]
selected_data_df

## Renamce source header fields into destination header fields

In [None]:
selected_data_df.columns = list(mapping_df['destination_key'])
selected_data_df

## Encode to as Filesystem (HDFS) or S3 (avro) format)

## Decode for cleaning

## Decode to for computation

## Decode to for analytics transformation

## Transform flat file into output schema format

In [None]:
flat_json = ''
flat_json = selected_data_df.to_json(orient='records')
json_json = json.loads(flat_json)
json_json

## Convert dot notated fields into nested json

In [None]:
input_file = 'data/data.json'
with open(input_file) as f:
    d = (f)
 


In [None]:

def dot_to_json(a):
    output = {}
    for key, value in a.items():
        path = key.split('.')[1:]  # ignore the json. prefix
        #path = key
        target = reduce(lambda d, k: d.setdefault(k, {}), path[:-1], output)
        target[path[-1]] = value
    return output
 
data = {'json.message.status.time':50, 'json.message.code.response':80, 'json.time':100}
type(data)


data

## data

In [None]:
data

## dot data

In [None]:
dict_json = dot_to_json(data)
dict_json

## dictionary to json

In [None]:
def _dict2json(results):
    counter = 0
    data = []

    for row in results: 
        data.append(json.dumps(row))
        counter += 1
    
    return data

json_dict = _dict2json(dict_json) 
json_dict

In [None]:
iris = pd.DataFrame()
iris = pd.read_csv('data/sql.csv')
iris
#iris.to_json(orient='records')
list(iris['0'])

In [None]:
iris