In [1974]:
import numpy as np
import pandas as pd
import json
from pandas.io.json import json_normalize 
from pandas import read_csv
import logging
from functools import reduce
import csv

## Configuration 

In [1756]:
#etl = 'kobo2elastic'
etl = 'curis2elastic'
#etl = 'oldcuris2newcuris'

input_schema_file = ''
input_data_file = ''
mapping_file = ''
    
if etl == 'curis2elastic':
    #old curis to elasticsearch
    input_schema_file = 'schema/input/curisSchema.1-item.json'
    input_data_file = 'data/curisData.1-items.json'
    mapping_file = 'schema/map/couchbase2elastic.map.csv'
elif etl == 'kobo2elastic':
    #kobo to elasticsearch
    input_schema_file = 'schema/input/aqmSchema.complete.json'
    input_data_file = 'data/aqmData.2-items.json'
    mapping_file = 'schema/map/kobo2elastic.map.csv'
elif etl == 'oldcuris2newcuris':
    #kobo to elasticsearch
    input_schema_file = 'schema/input/curisData.1-Schema.avro.json'
    input_data_file = 'data/curisData.1-items.json'
    mapping_file = 'schema/map/kobo2elastic.map.csv'

In [1757]:

def flatten_json(nested_json):
    """
        Flatten json object with nested keys into a single level.
        Args:
            nested_json: A nested json object.
        Returns:
            The flattened json object if successful, None otherwise.
    """
    out = {}

    def flatten(x, name=''):
        if type(x) is dict:
            for a in x:
                flatten(x[a], name + a + '.')
        elif type(x) is list:
            i = 0
            for a in x:
                flatten(a, name + '[' + str(i) + '].')
                i += 1
        else:
            out[name[:-1]] = x

    flatten(nested_json)
    return out

In [1758]:
def flattenDict(d, result=None):
    if result is None:
        result = {}
    for key in d:
        value = d[key]
        if isinstance(value, str):
            if "\n" in value:
                print('----------------',value) 
                value = value.replace("\n", ' and ')
                print('----------------',value) 

        if isinstance(value, dict):
        #if d['type'] =='object':
            value1 = {}
            for keyIn in value:
                value1[".".join([key,keyIn])]=value[keyIn]
            flattenDict(value1, result)
        elif isinstance(value, (list, tuple)):   
            for indexB, element in enumerate(value):
                if isinstance(element, dict):
                    value1 = {}
                    index = 0
                    for keyIn in element:
                        newkey = ".".join([key,keyIn])        
                        value1[".".join([key,keyIn])]=value[indexB][keyIn]
                        index += 1
                    for keyA in value1:
                        flattenDict(value1, result)   
        else:
            result[key]=value
    return result

## Get input JSON SChema

In [1759]:
d = {}
with open(input_schema_file) as f:
    d = json.load(f)

#d['items']['fields']
schema_df = pd.DataFrame()
schema_df = json_normalize(d)
schema_df.T

Unnamed: 0,0
$id,http://example.com/root.json
$schema,http://json-schema.org/draft-07/schema#
properties.address.$id,#/properties/address
properties.address.items.$id,#/properties/address/items
properties.address.items.properties.barangay.$id,#/properties/address/items/properties/barangay
properties.address.items.properties.barangay.pattern,^(.*)$
properties.address.items.properties.barangay.title,The Barangay Schema
properties.address.items.properties.barangay.type,string
properties.address.items.properties.country.$id,#/properties/address/items/properties/country
properties.address.items.properties.country.pattern,^(.*)$


In [1760]:
#flattenDict(d)
flatten_json(d)

{'$schema': 'http://json-schema.org/draft-07/schema#',
 '$id': 'http://example.com/root.json',
 'type': 'object',
 'title': 'The Root Schema',
 'properties.id.$id': '#/properties/id',
 'properties.id.type': 'string',
 'properties.id.title': 'The Id Schema',
 'properties.id.pattern': '^(.*)$',
 'properties.address.$id': '#/properties/address',
 'properties.address.type': 'array',
 'properties.address.title': 'The Address Schema',
 'properties.address.items.$id': '#/properties/address/items',
 'properties.address.items.type': 'object',
 'properties.address.items.title': 'The Items Schema',
 'properties.address.items.properties.barangay.$id': '#/properties/address/items/properties/barangay',
 'properties.address.items.properties.barangay.type': 'string',
 'properties.address.items.properties.barangay.title': 'The Barangay Schema',
 'properties.address.items.properties.barangay.pattern': '^(.*)$',
 'properties.address.items.properties.country.$id': '#/properties/address/items/properties/co

## Get valid index List only (for JSON Schema Only)

In [1761]:
validIndexLists = []
for indexList in list(schema_df):
    if "._" not in indexList:
        if len(indexList.split(sep='.')) > 2:
            if indexList.split(sep='.')[-1] == 'type' or indexList.split(sep='.')[-1] == 'title':
                validIndexLists.append(indexList)

In [1762]:
required_field_df = pd.DataFrame()
required_field_df = schema_df[validIndexLists]
required_field_df.T

Unnamed: 0,0
properties.address.items.properties.barangay.title,The Barangay Schema
properties.address.items.properties.barangay.type,string
properties.address.items.properties.country.title,The Country Schema
properties.address.items.properties.country.type,string
properties.address.items.properties.fax_number.items.properties.country_code.title,The Country_code Schema
properties.address.items.properties.fax_number.items.properties.country_code.type,string
properties.address.items.properties.fax_number.items.properties.number.title,The Number Schema
properties.address.items.properties.fax_number.items.properties.number.type,string
properties.address.items.properties.fax_number.items.title,The Items Schema
properties.address.items.properties.fax_number.items.type,object


## Clean index

In [1763]:
def clean_value(x):
    return x.lower().replace("/", ".").replace("the", "").replace("schema", "").strip()

def clean_index(x):
    return x.lower().replace("/", ".").replace("properties.", "").replace("items.", "").strip()

newSchema_df = pd.DataFrame()
newSchema_df['value'] = required_field_df.T[0].apply(clean_value)
newSchema_df.reset_index(level=0, inplace=True)
newSchema_df['index'] = newSchema_df['index'].apply(clean_index)
newSchema_df.sort_values(['index'])
newSchema_df.head(60)

Unnamed: 0,index,value
0,address.barangay.title,barangay
1,address.barangay.type,string
2,address.country.title,country
3,address.country.type,string
4,address.fax_number.country_code.title,country_code
5,address.fax_number.country_code.type,string
6,address.fax_number.number.title,number
7,address.fax_number.number.type,string
8,address.fax_number.title,items
9,address.fax_number.type,object


## Create separate CSV files based on array types

In [1945]:
valueSchema_df = pd.DataFrame()

number = len(newSchema_df)
index = 0
counter = 0
counter1 = 1

array_key = []
array_type = []

while (counter < number):
    array_key.append(newSchema_df.iloc[counter]['index'].replace('.title',''))
    counter += 2
        
while (counter1 < number):
    array_type.append(newSchema_df.iloc[counter1]['value'])
    counter1 += 2

valueSchema_df['source_key'] = array_key
valueSchema_df['source_type'] = array_type

valueSchema_df = valueSchema_df.sort_values(['source_key']).reset_index(drop=True)
valueSchema_df.loc[valueSchema_df['source_type'] == 'array']

csv_filename_df = pd.DataFrame()
csv_filename_df = valueSchema_df.loc[valueSchema_df['source_type'] == 'array']
#list(csv_filename_df['source_key'])
csv_filename_df = csv_filename_df.reset_index(drop=True)
csv_filename_df

Unnamed: 0,source_key,source_type
0,address,array
1,address.fax_number,array
2,address.landline_number,array
3,address.mobile_number,array
4,family_members,array
5,health_informations,array
6,health_informations.diagnosed,array
7,health_informations.family_history,array
8,households,array
9,households.amenities_present_in_house,array


## Create all filenames

In [2233]:
for i in list(csv_filename_df['source_key']):
    with open('file/' + i + '.csv', 'w'):
        pass

### have default csv filename (e.g. main, resident)

In [2234]:
with open('file/resident.csv', 'w'):
    pass

## Skip all object types

In [2215]:
csv_header_name_object_df = pd.DataFrame()
csv_header_name_object_df = valueSchema_df.loc[valueSchema_df['source_type'] == 'object']
list(csv_header_name_object_df['source_key'])
csv_header_name_object_df

Unnamed: 0,source_key,source_type
1,address,object
4,address.fax_number,object
8,address.landline_number,object
13,address.mobile_number,object
25,health_informations,object
28,health_informations.blood_pressure,object
29,health_informations.blood_pressure.first_reading,object
32,health_informations.blood_pressure.second_reading,object
35,health_informations.blood_pressure.third_reading,object
56,households,object


In [1771]:
csv_header_name_df = pd.DataFrame()
csv_header_name_df = valueSchema_df.loc[valueSchema_df['source_type'] == 'string']
csv_header_name_df

Unnamed: 0,source_key,source_type
2,address.barangay,string
3,address.country,string
6,address.fax_number.country_code,string
7,address.fax_number.number,string
10,address.landline_number.country_code,string
11,address.landline_number.number,string
12,address.lot_or_house_number,string
15,address.mobile_number.country_code,string
16,address.mobile_number.number,string
17,address.postal_code,string


In [2235]:
csv_filename_list = []
csv_header_name_list = []
csv_filename_list = list(csv_filename_df['source_key']) #array type
csv_header_name_list = list(csv_header_name_df['source_key']) #string type

main_header = []
array_header = []
object_header = []



        
for header in csv_header_name_list:
    for filename in csv_filename_list:
        tmp_header = header.split(sep='.')
        #print(tmp_header)
        if(len(tmp_header)) > 1:
            #pass
            #print(tmp_header[:-1])
            #tmp_string = '.'.join(tmp_header[:-1])
            tmp_string = header
            
            #print(tmp_string)
            
            
            #print('============')
            if tmp_string == filename: #for array list
                #print('---array_list---')
                #print(header)
                #print('--header/field: ' + tmp_string + ' == --filename: ' + filename)
                array_header.append(header)
        else:
            main_header.append(tmp_header[0])

            


## List CSV

In [2236]:
array_csv = []
array_csv = array_header
array_csv

['health_informations.diagnosed',
 'health_informations.family_history',
 'households.amenities_present_in_house',
 'households.sanitary_type']

In [2237]:
for header in array_csv:
    #print(header)
    array_csv_df  = pd.DataFrame(columns=['id', header])
    array_csv_df.to_csv('file/' + header + '.csv', encoding='utf-8', mode='a', index=False)
array_csv_df

Unnamed: 0,id,households.sanitary_type


## Array of object CSV

In [2238]:
object_array_csv = set()
object_array_csv = set(csv_header_name_list) - set(csv_filename_list) - set(main_header)
object_array_csv_list = list(object_array_csv)
object_array_csv

{'address.barangay',
 'address.country',
 'address.fax_number.country_code',
 'address.fax_number.number',
 'address.landline_number.country_code',
 'address.landline_number.number',
 'address.lot_or_house_number',
 'address.mobile_number.country_code',
 'address.mobile_number.number',
 'address.postal_code',
 'address.province',
 'health_informations.allergies',
 'health_informations.blood_pressure.first_reading.diastole',
 'health_informations.blood_pressure.first_reading.systole',
 'health_informations.blood_pressure.second_reading.diastole',
 'health_informations.blood_pressure.second_reading.systole',
 'health_informations.blood_pressure.third_reading.diastole',
 'health_informations.blood_pressure.third_reading.systole',
 'health_informations.blood_sign',
 'health_informations.blood_type',
 'health_informations.date_updated',
 'health_informations.exercise_in_a_week',
 'health_informations.fruits_in_a_week',
 'health_informations.high_cost_medicine',
 'health_informations.mainten

## object with filenames

In [1936]:
tmp = object_array_csv_list[0]
tmp

'health_informations.blood_type'

In [2184]:

other_header_object = []
for name in object_array_csv_list:
    str1 = name.split('.')
    str2 = '.'.join(str1[0:-1])
    if str2 in csv_filename_list:
        
        other_header_object.append(name)
other_header_object


['health_informations.blood_type',
 'health_informations.smoking_habit',
 'health_informations.fruits_in_a_week',
 'address.mobile_number.number',
 'health_informations.vegetables_in_a_week',
 'health_informations.maintenance_drugs',
 'address.barangay',
 'health_informations.blood_sign',
 'health_informations.date_updated',
 'households.neighborhood_description',
 'address.landline_number.country_code',
 'health_informations.exercise_in_a_week',
 'households.house_ownership',
 'households.date_updated',
 'households.type_of_house',
 'address.landline_number.number',
 'profiles.religion',
 'address.country',
 'address.postal_code',
 'address.mobile_number.country_code',
 'address.province',
 'address.fax_number.country_code',
 'health_informations.allergies',
 'health_informations.high_cost_medicine',
 'profiles.date_updated',
 'address.lot_or_house_number',
 'profiles.education',
 'address.fax_number.number',
 'households.sanitary_ownership',
 'profiles.civil_status']

In [2243]:

from collections import defaultdict

philip = defaultdict(list)

for header in other_header_object:
    #bars[filenames] = []
    
    filenames = header.split(sep=".")[0:-1]
    filenames = '.'.join(filenames)
    philip[filenames].append(header)
    #print('headername: ', header)
    #print('filenames: ', filenames)

philip['profiles']

['profiles.religion',
 'profiles.date_updated',
 'profiles.education',
 'profiles.civil_status']

In [2213]:
bars = { 
    'health_informations': ['health_informations.house_ownership', 'health_informations.type_of_house'],
    'profiles': ['profiles.eduction', 'profiles.religion'],
    'address.mobile_number': ['address.mobile_number.number','address.mobile_number.code']
    }
bars

{'health_informations': ['health_informations.house_ownership',
  'health_informations.type_of_house'],
 'profiles': ['profiles.eduction', 'profiles.religion'],
 'address.mobile_number': ['address.mobile_number.number',
  'address.mobile_number.code']}

In [2244]:
for header in philip:
    print(philip[header])
    tmp = list(philip[header])
    print(str(tmp))
    object_array_csv_df  = pd.DataFrame(columns= tmp)
    object_array_csv_df.to_csv('file/' + header + '.csv', encoding='utf-8', mode='a', index=False)
object_array_csv_df

['health_informations.blood_type', 'health_informations.smoking_habit', 'health_informations.fruits_in_a_week', 'health_informations.vegetables_in_a_week', 'health_informations.maintenance_drugs', 'health_informations.blood_sign', 'health_informations.date_updated', 'health_informations.exercise_in_a_week', 'health_informations.allergies', 'health_informations.high_cost_medicine']
['health_informations.blood_type', 'health_informations.smoking_habit', 'health_informations.fruits_in_a_week', 'health_informations.vegetables_in_a_week', 'health_informations.maintenance_drugs', 'health_informations.blood_sign', 'health_informations.date_updated', 'health_informations.exercise_in_a_week', 'health_informations.allergies', 'health_informations.high_cost_medicine']
['address.mobile_number.number', 'address.mobile_number.country_code']
['address.mobile_number.number', 'address.mobile_number.country_code']
['address.barangay', 'address.country', 'address.postal_code', 'address.province', 'addres

Unnamed: 0,address.fax_number.country_code,address.fax_number.number


## object without filenames

In [2239]:
main_header_object = []
for name in object_array_csv_list:
    #print(name)
    if name.split(sep='.')[0] not in csv_filename_list:
        main_header_object.append(name)
main_header_object

['identification.id2.type',
 'user-cam.id',
 'user-cam.owner',
 'profile_picture.path',
 'identification.id3.type',
 'identification.id1.type',
 'profile_picture.name']

## Object CSV

In [2240]:
object_csv = set()
object_csv = set(csv_header_name_object_list) - set(csv_filename_list)
object_csv

{'health_informations.blood_pressure',
 'health_informations.blood_pressure.first_reading',
 'health_informations.blood_pressure.second_reading',
 'health_informations.blood_pressure.third_reading',
 'identification',
 'identification.id1',
 'identification.id2',
 'identification.id3',
 'profile_picture',
 'profiles.employment',
 'user-cam'}

## Main (e.g. resident) csv file

In [2241]:
main_header_list = list(dict.fromkeys(main_header))
main_header_list = main_header_list  + main_header_object
main_header_list

['birthdate',
 'family_members',
 'first_name',
 'gender',
 'id',
 'last_name',
 'last_name_suffix',
 'middle_name',
 'nhid',
 'organization',
 'registered_at',
 'type',
 'identification.id2.type',
 'user-cam.id',
 'user-cam.owner',
 'profile_picture.path',
 'identification.id3.type',
 'identification.id1.type',
 'profile_picture.name']

In [2242]:
main_resident_df  = pd.DataFrame(columns = main_header_list)
#main_resident_df = main_resident_df.iloc[0:0]
#main_resident_df
#main_resident_df.columns = main_header

#TODO: check if header/columns already exist
main_resident_df.to_csv('file/resident.csv', encoding='utf-8', mode='a', index=False)
main_resident_df

Unnamed: 0,birthdate,family_members,first_name,gender,id,last_name,last_name_suffix,middle_name,nhid,organization,registered_at,type,identification.id2.type,user-cam.id,user-cam.owner,profile_picture.path,identification.id3.type,identification.id1.type,profile_picture.name


## split ang get first string, 
### if string is equal to csv filename, include as csv header in same csv filename
### if string not equal, include as csv header in main csv

In [None]:
headers = []
headers =  list(csv_header_name_df['source_key'])

empty_df = pd.DataFrame()
empty_df.columns = ["Sequence", "Start", "End", "Coverage"]
#empty_df.to_csv('file/resident.csv', encoding='utf-8', mode='a', header=None, index=False)

In [None]:


#df.to_csv('file/resident.csv', encoding='utf-8', mode='w', header=header, index=False)


In [None]:
d = []
with open(input_data_file) as f:
    #d = json.load(f)
    d = json.load(f)
    print('----')
    print(d)
    print('---type--')
print(type(d))

## Get input data

In [None]:

flat_json_0 = {}
flat_json_1 = {}
flat_json_2 = {}

flat_json_0 = flattenDict(d[0])
print('--flatten_dict_type--')
print(type(flat_json_0))
print('--flatten_dict_data--')
flat_json_0

print('--normalize flatten_dict--')
json_flat_norm_0 = json_normalize(flat_json_0)


flat_json_1 = flattenDict(d[1])
json_flat_norm_1 = json_normalize(flat_json_1)

flat_json_2 = flattenDict(d[2])
json_flat_norm_2 = json_normalize(flat_json_2)



In [None]:
json_flat_norm_0

In [None]:
json_flat_norm_1

In [None]:
json_flat_norm_2

## TESTING: flattendict with nested array of objects

In [None]:
dx = []
dx = {}
dx_list = []
dx_dict = {}
dx_normalize_df = pd.DataFrame()

input_data_file_test = 'data/curisData.1-items.json'
with open(input_data_file_test) as f:
    dx_list = json.load(f)
    #dx = json.load(f)

type(dx_list)

In [None]:
dx_flatten = flatten_json(dx_list[0])
dx_flatten

In [None]:
dx_normalize_df = json_normalize(dx_flatten)

dx_normalize_df.T.to_csv('processed_data.csv',sep=',')
type(dx_normalize_df)

In [None]:
list(dx_normalize_df.columns)

In [None]:
dx_normalize_df

## TODO: ITERATE OVER LIST and MERGE normalize headers

In [None]:
input_data_df = pd.DataFrame()
input_data_df = json_flat_norm_0
input_data_df = input_data_df.append(json_flat_norm_1, sort=True)
input_data_df = input_data_df.append(json_flat_norm_2, sort=True)
input_data_df.sort_values(['demographics.awh_id']).reset_index(drop=True)
#input_data_df.sort_values(p'd)

In [None]:
type(input_data_df)

## Clean input data column header

In [None]:
input_data_df.columns = input_data_df.columns.str.lower().str.replace('/','.')
input_data_df

## Get Mapping 

In [None]:
mapping_df = pd.read_csv(mapping_file, skiprows=0)
mapping_df

## Get values input data that are included mapping

In [None]:
selected_data_df = pd.DataFrame()
selected_data_df = input_data_df[list(mapping_df['source_key'])]
selected_data_df

## Renamce source header fields into destination header fields

In [None]:
selected_data_df.columns = list(mapping_df['destination_key'])
selected_data_df

## Encode to as Filesystem (HDFS) or S3 (avro) format)

## Decode for cleaning

## Decode to for computation

## Decode to for analytics transformation

## Transform flat file into output schema format

In [None]:
flat_json = ''
flat_json = selected_data_df.to_json(orient='records')
json_json = json.loads(flat_json)
json_json

## Convert dot notated fields into nested json

In [None]:
input_file = 'data/data.json'
with open(input_file) as f:
    d = (f)
 


In [None]:

def dot_to_json(a):
    output = {}
    for key, value in a.items():
        path = key.split('.')[1:]  # ignore the json. prefix
        #path = key
        target = reduce(lambda d, k: d.setdefault(k, {}), path[:-1], output)
        target[path[-1]] = value
    return output
 
data = {'json.message.status.time':50, 'json.message.code.response':80, 'json.time':100}
type(data)


data

## data

In [None]:
data

## dot data

In [None]:
dict_json = dot_to_json(data)
dict_json

## dictionary to json

In [None]:
def _dict2json(results):
    counter = 0
    data = []

    for row in results: 
        data.append(json.dumps(row))
        counter += 1
    
    return data

json_dict = _dict2json(dict_json) 
json_dict

In [None]:
iris = pd.DataFrame()
iris = pd.read_csv('data/sql.csv')
iris
#iris.to_json(orient='records')
list(iris['0'])

In [None]:
iris