In [45]:
import numpy as np
import pandas as pd
import json
from pandas.io.json import json_normalize 
from pandas import read_csv
import logging
from functools import reduce
import csv

## Configuration 

In [46]:
etl = 'curis2elastic'

input_schema_file = ''
input_data_file = ''
mapping_file = ''
    
if etl == 'curis2elastic':
    #old curis to elasticsearch
    input_schema_file = 'schema/input/curisSchema.1-item.json'
    #input_data_file = 'data/curisData.2-actual-items.json'
    input_data_file = 'data/source.53-items.json'
    
    mapping_file = 'schema/map/couchbase2elastic.map.csv'
elif etl == 'kobo2elastic':
    #kobo to elasticsearch
    input_schema_file = 'schema/input/aqmSchema.complete.json'
    input_data_file = 'data/aqmData.2-items.json'
    mapping_file = 'schema/map/kobo2elastic.map.csv'
elif etl == 'oldcuris2newcuris':
    #kobo to elasticsearch
    input_schema_file = 'schema/input/curisData.1-Schema.avro.json'
    input_data_file = 'data/curisData.1-items.json'
    mapping_file = 'schema/map/kobo2elastic.map.csv'

## Flatten json

In [47]:
def _flatten_json(nested_json):
    out = {}

    def flatten(x, name=''):
        if type(x) is dict:
            for a in x:
                flatten(x[a], name + a + '.')
        elif type(x) is list:
            i = 0
            for a in x:
                flatten(a, name + '' + str(i) + '.')
                i += 1
        else:
            out[name[:-1]] = x

    flatten(nested_json)
    return out

## header filters

In [48]:
## OPTIMIZE TO ACCEPT LIST NOT DATAFRAME
def filter_key(x):
    lists = format_key(x)
    lists = exclude_digit(lists)
    lists = list2string(lists)
    return lists

def filter_index_map(x):
    lists = format_key(x)
    lists = include_digit(lists)
    lists = list2string(lists)
    return lists

def format_key(items):
    #return list(map(lambda x:x.lower().split(sep='.'), items ))
    return items.lower().split(sep='.')

def include_digit(items):
    return [item for item in items if item.isdigit()]

def exclude_digit(items):
    return [item for item in items if not item.isdigit()]

def list2string(lists):
    return '.'.join(lists)

## File loader

In [49]:
def open_file():
    with open(input_data_file) as f:
        return json.load(f)
    
def _dict_to_dataframe(dict_object):
    return pd.DataFrame.from_dict({'value': dict_object})

def _add_custom_colums(dataframe_object):
    dataframe_object['key'] = list(dataframe_object.index)
    dataframe_object['key'] = dataframe_object['key'].apply(filter_key)

    dataframe_object['_index_map'] = list(dataframe_object.index)
    dataframe_object['_index_map'] = dataframe_object['_index_map'].apply(filter_index_map)

    dataframe_object['_id'] = dataframe_object[dataframe_object['key'] == 'id']['value'].values[0]

    dataframe_object = dataframe_object.reset_index(drop=True)

    return dataframe_object

In [50]:
def _get_csv_headers(filename):
    file_dir = 'file/'
    return pd.read_csv( file_dir + filename + '.csv',nrows=0) #get header only

In [51]:
def _get_required_data(input_data_df,csv_header_df):
    required_header_list = [] 

    for data in input_data_df['key']:
        if data in csv_header_df.columns:
            required_header_list.append(data)
             
    return input_data_df[input_data_df['key'].isin(required_header_list)]

def _rename_index(data_df):
    if 'key' in data_df.columns:
        data_df.index = list(data_df['key'])
        data_df = data_df.drop('key', axis=1)
    return data_df 

def _columnar_to_row(input_data_df,csv_header_df):
    
    for header in list(input_data_df.index):
        if header:
            csv_header_df.at['',header] = input_data_df.loc[header]['value'] 
    

        csv_header_df.at['','_id'] = input_data_df['_id'][header] 
        csv_header_df.at['','_index_map'] = input_data_df['_index_map'][header] 
    return csv_header_df
    
def _write_to_csv(data_csv_df, filename='resident'):
    file_dir = 'file/'
    #if data_csv_df:
    return data_csv_df.to_csv( file_dir + filename + '.csv', encoding='utf-8', mode='a', header=False,index=False)

## Load Data

In [52]:
data_list = []
data_list = open_file()

## Iterate

In [53]:
'''

data_flat_dict = {}
data_flat_dict = _flatten_json(data_list[0])

data_flat_df = pd.DataFrame()
data_flat_df = _dict_to_dataframe(data_flat_dict)
data_flat_df = _add_custom_colums(data_flat_df)
'''

'\n\ndata_flat_dict = {}\ndata_flat_dict = _flatten_json(data_list[0])\n\ndata_flat_df = pd.DataFrame()\ndata_flat_df = _dict_to_dataframe(data_flat_dict)\ndata_flat_df = _add_custom_colums(data_flat_df)\n'

## Iterate functions for raw data

## Filter index_map

In [54]:
'''

filename = 'health_informations.diagnosed'
root_data_df = data_flat_df[data_flat_df['_index_map'] == '0' ]

root_csv_df = pd.DataFrame()
root_csv_df = _get_csv_headers(filename)

new_data_df = pd.DataFrame()
new_data_df = _get_required_data(root_data_df, root_csv_df)
new_data_df = _rename_index(new_data_df)
new_data_df = _columnar_to_row(new_data_df, root_csv_df)
new_data_df = _write_to_csv(new_data_df, filename)

new_data_df
'''

"\n\nfilename = 'health_informations.diagnosed'\nroot_data_df = data_flat_df[data_flat_df['_index_map'] == '0' ]\n\nroot_csv_df = pd.DataFrame()\nroot_csv_df = _get_csv_headers(filename)\n\nnew_data_df = pd.DataFrame()\nnew_data_df = _get_required_data(root_data_df, root_csv_df)\nnew_data_df = _rename_index(new_data_df)\nnew_data_df = _columnar_to_row(new_data_df, root_csv_df)\nnew_data_df = _write_to_csv(new_data_df, filename)\n\nnew_data_df\n"

In [55]:
schema_csv = pd.read_csv('file/schema.csv', skiprows=0)
filenames_list = schema_csv['file_name'].values[0]
filenames_list = filenames_list.split(sep=",")
filenames_list

['date_visits',
 'family_members',
 'health_informations.diagnosed',
 'health_informations.family_history',
 'households.amenities_present_in_house',
 'households.sanitary_type',
 'profiles',
 'health_informations',
 'households',
 'resident']

In [56]:
def _main(data_flat_df):

    index_map_list = [] 
    index_map_list = list(data_flat_df['_index_map'].unique())
    print(index_map_list)
    
    schema_csv = pd.read_csv('file/schema.csv', skiprows=0)
    filenames_list = schema_csv['file_name'].values[0]
    filenames_list = filenames_list.split(sep=",")
    filenames_list

    for index in index_map_list:
        
        for filename in filenames_list:
            print('INDEX: ',index)
            print('Filename: ',filename)
            
            
            root_data_df = pd.DataFrame()
            root_data_df = data_flat_df[data_flat_df['_index_map'] == index ]
            #print(root_data_df)
            #root_data_df
            
            root_csv_df = pd.DataFrame()
            root_csv_df = _get_csv_headers(filename)
        
            new_data_df = pd.DataFrame()

            new_data_df = _get_required_data(root_data_df, root_csv_df)
            new_data_df = _rename_index(new_data_df)
            new_data_df = _columnar_to_row(new_data_df, root_csv_df)
            new_data_df = _write_to_csv(new_data_df, filename)
            

In [57]:
def init(data_list):
    data_flat_dict = {}
    
    #print(data_list)
    
    for datum in data_list:
        print(datum)
        data_flat_dict = _flatten_json(datum)

        data_flat_df = pd.DataFrame()
        data_flat_df = _dict_to_dataframe(data_flat_dict)
        data_flat_df = _add_custom_colums(data_flat_df)
        
        _main(data_flat_df)
        print('----')

In [58]:
init(data_list)

{'address': {'barangay': 'burgos gengos', 'country': 'Philippines', 'lot_or_house_number': '', 'postal_code': '5022', 'province': 'iloilo'}, 'email_address': None, 'family_members': ['9c4de471-a33a-4bfa-899c-3fd63613042e', 'e8fc7444-8316-4e26-9d67-b66b475b269b', '73c49cac-16f7-4b93-908b-4617d0bca91c', 'fd90bfb0-0d06-4e99-840a-60cee0b28cff', '0004d599-f8c8-46c4-9476-b51debae7271', '9497f7af-642e-4c8e-b61d-aa9372cc6f5d'], 'health_informations': [], 'households': [], 'id': '0004d599-f8c8-46c4-9476-b51debae7271', 'identification': {'id1': {'identifier': None, 'type': '4Ps'}, 'id2': {'identifier': None, 'type': '4Ps'}, 'id3': {'identifier': None, 'type': '4Ps'}}, 'organization': 'Guimbal RHU', 'profile_picture': {'name': '6e74d789-00f3-4800-8528-ab22cbea7ba4', 'path': '/data/data/com.awh.health.curis/app_images/images_resident'}, 'profiles': [], 'registered_at': '05/17/2018 at 09:23:07 am GMT+08:00', 'type': 'user-resident', 'user_cam': {'id': 'burgos-gengos@gmail.com', 'owner': 'burgos-gen

## =================================================

## Get Index_map unique values

## Get filenames