In [3]:
import pandas as pd
import json
import re

# Locations of the two raw JSON report exports parsed by this notebook.
transaction_list_path = "C:\\Users\\paulcassidy\\OneDrive - Archdiocese of Chicago\\Documents\\JSON_Parsing\\RawData\\TransactionList.json"
general_ledger_path   = "C:\\Users\\paulcassidy\\OneDrive - Archdiocese of Chicago\\Documents\\JSON_Parsing\\RawData\\GeneralLedger.json"

# JSON is UTF-8 by specification. Without an explicit encoding, open() falls
# back to the locale code page (typically cp1252 on Windows), which can
# mis-decode or reject non-ASCII characters in names and memos.
with open(transaction_list_path, encoding="utf-8") as f:
    transaction_list = json.load(f)

with open(general_ledger_path, encoding="utf-8") as f:
    general_ledger = json.load(f)


In [2]:
def parse_record_gl(data):
    """
    Classify one General Ledger ColData / Header.ColData frame and parse it.

    Parameters
    ----------
    data : pandas.DataFrame
        One row per report cell, with a 'value' column and, when at least
        one cell carried an id, an 'id' column.

    Returns
    -------
    dict or None
        * first value is a YYYY-MM-DD date: the transaction detail fields,
          with empty or missing entries replaced by "NA";
        * first value is "Beginning Balance": {'BeginningBalance': ...};
        * anything else: {'AccountName': ..., 'AccountID': ...};
        * None when data has no rows.

    Raises
    ------
    KeyError
        If 'value' is absent or the record has fewer cells than the
        positions read below.
    """

    # regular expression matching date format
    rex = re.compile(r"^[0-9]{4}-[0-9]{2}-[0-9]{2}$")

    def _is_missing(v):
        # NaN (what pandas stores for an absent key) is truthy, so the
        # plain falsy test alone would let it through
        return (isinstance(v, float) and v != v) or not v

    def _id_at(pos):
        # the 'id' column does not exist at all when no cell in the record has an id
        return data['id'][pos] if 'id' in data else "NA"

    # an empty frame has nothing to classify
    if data.shape[0] == 0:
        return None

    values = data['value']
    first = values[0]

    # transaction detail data frame
    if rex.search(first):
        vals = {
            'TransactionDate' : first,
            'TransactionType' : values[1],
            'TransactionTypeID' : _id_at(1),
            'DocumentNumber' : values[2],
            'VendorName' : values[3],
            'VendorID' : _id_at(3),
            'Memo' : values[4],
            'Split' : values[5],
            'SplitID' : _id_at(5),
            'TransactionAmount' : values[6],
            'NewAccountBalance' : values[7]
        }

        # replace missing values
        return {k: ("NA" if _is_missing(v) else v) for k, v in vals.items()}

    # beginning balance data frame
    if first == "Beginning Balance":
        return {'BeginningBalance' : values[7]}

    # name data frame
    aid = _id_at(0)
    return {
        'AccountName' : first,
        'AccountID' : "NA" if _is_missing(aid) else aid
    }

def json_crawler_gl(data):
    """
    Recursively collect parsed General Ledger records from a report.

    Parameters
    ----------
    data : dict or pandas.DataFrame
        The dictionary returned by json.load(), or a frame produced by
        pd.json_normalize() on one nesting level of it.

    Returns
    -------
    list of dict
        Whatever parse_record_gl() returned for every 'ColData' and
        'Header.ColData' cell found, in crawl order. Records the parser
        could not classify (None) are left out.
    """
    # If data is dictionary returned by json.load(), then flatten it to initial pandas data frame
    if isinstance(data, dict):
        return json_crawler_gl(pd.json_normalize(data))

    # Crawl through the frame, normalizing by different keys to find account data
    lst = []

    for row in range(data.shape[0]):
        # Normalize by 'ColData', then by 'Header.ColData'
        for column in ('ColData', 'Header.ColData'):
            try:
                record = parse_record_gl(pd.json_normalize(data[column][row]))
            except Exception:
                # best effort: column absent at this level, cell empty (NaN)
                # or record malformed -- skip it, but let Ctrl-C through
                continue
            if record is not None:
                lst.append(record)

        # Normalize by 'Rows.Row' and descend into the nested rows
        try:
            row_data = pd.json_normalize(data['Rows.Row'][row])
        except Exception:
            continue
        lst.extend(json_crawler_gl(row_data))

    return lst

def list_df_csv(lst, prefix, filename):
    """Write the records in lst to '<prefix>/<filename>.csv', omitting the index column."""
    destination = f"{prefix}/{filename}.csv"
    frame = pd.DataFrame(lst)
    frame.to_csv(destination, index=False)

def parse_record_tl(data):
    """
    Parse one Transaction List ColData / Header.ColData frame.

    Parameters
    ----------
    data : pandas.DataFrame
        One row per report cell, with a 'value' column and, when at least
        one cell carried an id, an 'id' column.

    Returns
    -------
    dict or None
        The transaction detail fields (empty or missing entries replaced by
        "NA") when the first value is a YYYY-MM-DD date; None for an empty
        frame or any other kind of record.

    Raises
    ------
    KeyError
        If 'value' is absent or a transaction record has fewer than eight cells.
    """

    # regular expression matching date format
    rex = re.compile(r"^[0-9]{4}-[0-9]{2}-[0-9]{2}$")

    def _is_missing(v):
        # NaN (what pandas stores for an absent key) is truthy, so the
        # plain falsy test alone would let it through
        return (isinstance(v, float) and v != v) or not v

    def _id_at(pos):
        # the 'id' column does not exist at all when no cell in the record has an id
        return data['id'][pos] if 'id' in data else "NA"

    # an empty frame has nothing to parse
    if data.shape[0] == 0:
        return None

    values = data['value']

    # only transaction detail records are of interest here
    if not rex.search(values[0]):
        return None

    vals = {
        'TransactionDate' : values[0],
        'TransactionType' : values[1],
        'TransactionTypeID' : _id_at(1),
        'DocumentNumber' : values[2],
        'Posting' : values[3],
        'VendorName' : values[4],
        'VendorID' : _id_at(4),
        'Memo' : values[5],
        'Split' : values[6],
        'SplitID' : _id_at(6),
        'TransactionAmount' : values[7]
    }

    # replace missing values
    return {k: ("NA" if _is_missing(v) else v) for k, v in vals.items()}

def json_crawler_tl(data):
    """
    Recursively collect parsed Transaction List records from a report.

    Parameters
    ----------
    data : dict or pandas.DataFrame
        The dictionary returned by json.load(), or a frame produced by
        pd.json_normalize() on one nesting level of it.

    Returns
    -------
    list of dict
        Whatever parse_record_tl() returned for every 'ColData' and
        'Header.ColData' cell found, in crawl order. Records for which the
        parser returned None are left out so the list can be handed
        straight to pd.DataFrame().
    """
    # If data is dictionary returned by json.load(), then flatten it to initial pandas data frame
    if isinstance(data, dict):
        return json_crawler_tl(pd.json_normalize(data))

    # Crawl through the frame, normalizing by different keys to find transaction data
    lst = []

    for row in range(data.shape[0]):
        # Normalize by 'ColData', then by 'Header.ColData'
        for column in ('ColData', 'Header.ColData'):
            try:
                record = parse_record_tl(pd.json_normalize(data[column][row]))
            except Exception:
                # best effort: column absent at this level, cell empty (NaN)
                # or record malformed -- skip it, but let Ctrl-C through
                continue
            if record is not None:
                lst.append(record)

        # Normalize by 'Rows.Row' and descend into the nested rows
        try:
            row_data = pd.json_normalize(data['Rows.Row'][row])
        except Exception:
            continue
        lst.extend(json_crawler_tl(row_data))

    return lst

def list_df_csv(lst, prefix, filename):
    """
    Build a data frame from lst and save it as a CSV file.

    The file is written to '<prefix>/<filename>.csv' and the data frame
    index is not included in the output.
    """
    pd.DataFrame(lst).to_csv(f"{prefix}/{filename}.csv", index=False)

# We need a way to store AccountName and AccountID and add them to the same object that the account detail and beginning balance go in.
# We also need to impute AccountCategory.

In [5]:
# Module-level tally; the json_crawler_gl variant that declares `global count`
# adds 1 to it for every 'Header.ColData' record it parses.
count = 0

In [None]:
def parse_record_gl(data):
    """
    Classify one General Ledger ColData / Header.ColData frame and parse it.

    Parameters
    ----------
    data : pandas.DataFrame
        One row per report cell, with a 'value' column and, when at least
        one cell carried an id, an 'id' column.

    Returns
    -------
    dict or None
        * first value is a YYYY-MM-DD date: the transaction detail fields,
          with empty or missing entries replaced by "NA";
        * first value is "Beginning Balance": {'BeginningBalance': ...};
        * anything else: {'AccountName': ..., 'AccountID': ...};
        * None when data has no rows.

    Raises
    ------
    KeyError
        If 'value' is absent or the record has fewer cells than the
        positions read below.
    """

    # regular expression matching date format
    rex = re.compile(r"^[0-9]{4}-[0-9]{2}-[0-9]{2}$")

    def _is_missing(v):
        # NaN (what pandas stores for an absent key) is truthy, so the
        # plain falsy test alone would let it through
        return (isinstance(v, float) and v != v) or not v

    def _id_at(pos):
        # the 'id' column does not exist at all when no cell in the record has an id
        return data['id'][pos] if 'id' in data else "NA"

    # an empty frame has nothing to classify
    if data.shape[0] == 0:
        return None

    values = data['value']
    first = values[0]

    # transaction detail data frame
    if rex.search(first):
        vals = {
            'TransactionDate' : first,
            'TransactionType' : values[1],
            'TransactionTypeID' : _id_at(1),
            'DocumentNumber' : values[2],
            'VendorName' : values[3],
            'VendorID' : _id_at(3),
            'Memo' : values[4],
            'Split' : values[5],
            'SplitID' : _id_at(5),
            'TransactionAmount' : values[6],
            'NewAccountBalance' : values[7]
        }

        # replace missing values
        return {k: ("NA" if _is_missing(v) else v) for k, v in vals.items()}

    # beginning balance data frame
    if first == "Beginning Balance":
        return {'BeginningBalance' : values[7]}

    # name data frame
    aid = _id_at(0)
    return {
        'AccountName' : first,
        'AccountID' : "NA" if _is_missing(aid) else aid
    }

def json_crawler_gl(data):
    """
    Recursively collect parsed General Ledger records and count headers.

    Parameters
    ----------
    data : dict or pandas.DataFrame
        The dictionary returned by json.load(), or a frame produced by
        pd.json_normalize() on one nesting level of it.

    Returns
    -------
    list of dict
        Whatever parse_record_gl() returned for every 'ColData' and
        'Header.ColData' cell found, in crawl order. Records the parser
        could not classify (None) are left out.

    Side effects
    ------------
    Adds 1 to the module-level `count` for every 'Header.ColData' record
    that is appended; `count` must already be defined.
    """
    # COUNT lives at module level so the tally survives the recursion
    global count

    # If data is dictionary returned by json.load(), then flatten it to initial pandas data frame
    if isinstance(data, dict):
        return json_crawler_gl(pd.json_normalize(data))

    # Crawl through the frame, normalizing by different keys to find account data
    lst = []

    for row in range(data.shape[0]):
        # Normalize by 'ColData'
        try:
            record = parse_record_gl(pd.json_normalize(data['ColData'][row]))
        except Exception:
            # best effort: column absent at this level, cell empty (NaN)
            # or record malformed -- skip it, but let Ctrl-C through
            record = None
        if record is not None:
            lst.append(record)

        # Normalize by 'Header.ColData'
        try:
            header = parse_record_gl(pd.json_normalize(data['Header.ColData'][row]))
        except Exception:
            header = None
        if header is not None:
            lst.append(header)
            count += 1

        # Normalize by 'Rows.Row' and descend into the nested rows
        try:
            row_data = pd.json_normalize(data['Rows.Row'][row])
        except Exception:
            continue
        lst.extend(json_crawler_gl(row_data))

    return lst

Account Names and IDs only come in the `Header.ColData` object. We can leverage that.

### Can you append to a dictionary within a list??

In [8]:
# Experiment: a dict stored inside a list can be extended in place.
new_obj = {'boom' : 'pow'}
lst = [dict(foo='bar', fizz='buzz', par='birdie')]

# update() mutates the dict that lives in the list -- no reassignment needed
lst[0].update(new_obj)
print(lst)

[{'foo': 'bar', 'fizz': 'buzz', 'par': 'birdie', 'boom': 'pow'}]


Yes, you can.
The new change is on line 89 of the cell below.

In [39]:
def parse_record_gl(data):
    """
    Classify one General Ledger ColData / Header.ColData frame and parse it.

    Parameters
    ----------
    data : pandas.DataFrame
        One row per report cell, with a 'value' column and, when at least
        one cell carried an id, an 'id' column.

    Returns
    -------
    dict or None
        * first value is a YYYY-MM-DD date: the transaction detail fields,
          with empty or missing entries replaced by "NA";
        * first value is "Beginning Balance": {'BeginningBalance': ...};
        * anything else: {'AccountName': ..., 'AccountID': ...};
        * None when data has no rows.

    Raises
    ------
    KeyError
        If 'value' is absent or the record has fewer cells than the
        positions read below.
    """

    # regular expression matching date format
    date_rex = re.compile(r"^[0-9]{4}-[0-9]{2}-[0-9]{2}$")

    def _is_missing(v):
        # NaN (what pandas stores for an absent key) is truthy, so the
        # plain falsy test alone would let it through
        return (isinstance(v, float) and v != v) or not v

    def _id_at(pos):
        # the 'id' column does not exist at all when no cell in the record has an id
        return data['id'][pos] if 'id' in data else "NA"

    # an empty frame has nothing to classify
    if data.shape[0] == 0:
        return None

    values = data['value']
    first = values[0]

    # transaction detail data frame
    if date_rex.search(first):
        vals = {
            'TransactionDate' : first,
            'TransactionType' : values[1],
            'TransactionTypeID' : _id_at(1),
            'DocumentNumber' : values[2],
            'VendorName' : values[3],
            'VendorID' : _id_at(3),
            'Memo' : values[4],
            'Split' : values[5],
            'SplitID' : _id_at(5),
            'TransactionAmount' : values[6],
            'NewAccountBalance' : values[7]
        }

        # replace missing values
        return {k: ("NA" if _is_missing(v) else v) for k, v in vals.items()}

    # beginning balance data frame
    if first == "Beginning Balance":
        return {'BeginningBalance' : values[7]}

    # name data frame
    aid = _id_at(0)
    return {
        'AccountName' : first,
        'AccountID' : "NA" if _is_missing(aid) else aid
    }

# Most recent 'Header.ColData' record parsed by json_crawler_gl()
acc_name_id = {}

def json_crawler_gl(data):
    """
    Recursively collect parsed General Ledger records from a report.

    Parameters
    ----------
    data : dict or pandas.DataFrame
        The dictionary returned by json.load(), or a frame produced by
        pd.json_normalize() on one nesting level of it.

    Returns
    -------
    list of dict
        Whatever parse_record_gl() returned for every 'Header.ColData' and
        'ColData' cell found, in crawl order. Records the parser could not
        classify (None) are left out.

    Side effects
    ------------
    Rebinds the module-level `acc_name_id` to a copy of each
    'Header.ColData' record as it is parsed.
    """
    global acc_name_id

    # If data is dictionary returned by json.load(), then flatten it to initial pandas data frame
    if isinstance(data, dict):
        return json_crawler_gl(pd.json_normalize(data))

    # Crawl through the frame, normalizing by different keys to find account data
    lst = []

    for row in range(data.shape[0]):
        # Normalize by 'Header.ColData' (parsed once, then shared)
        try:
            header = parse_record_gl(pd.json_normalize(data['Header.ColData'][row]))
        except Exception:
            # best effort: column absent at this level, cell empty (NaN)
            # or record malformed -- skip it, but let Ctrl-C through
            header = None
        if header is not None:
            lst.append(header)
            # separate copy, so later edits to the global cannot alter lst
            acc_name_id = dict(header)

        # Normalize by 'ColData'
        try:
            record = parse_record_gl(pd.json_normalize(data['ColData'][row]))
        except Exception:
            record = None
        if record is not None:
            lst.append(record)

        # Normalize by 'Rows.Row' and descend into the nested rows
        try:
            row_data = pd.json_normalize(data['Rows.Row'][row])
        except Exception:
            continue
        lst.extend(json_crawler_gl(row_data))

    return lst

In [41]:
# Crawl the loaded General Ledger JSON and write the parsed records to
# test_gl_1.csv in the Output folder.
list_df_csv(json_crawler_gl(general_ledger), "C:\\Users\\paulcassidy\\OneDrive - Archdiocese of Chicago\\Documents\\JSON_Parsing\\Output", "test_gl_1")

## Account levels will need special handling  
Planned row structure:

In [None]:
# Sketch of one flattened output row: account type, up to three account
# name/ID levels, the beginning balance, then the transaction detail fields.
# NOTE(review): none of the value names (at, al1, ..., nab) are assigned at
# module level in the visible code, so this cell raises NameError if run
# as-is -- treat it as a template for the target structure.
row = {
    'AccountType' : at,
    'AccountLevel1' : al1,
    'AccountLevel1ID' : al1id,
    'AccountLevel2' : al2,
    'AccountLevel2ID' : al2id,
    'AccountLevel3' : al3,
    'AccountLevel3ID' : al3id,
    'BeginningBalance' : bb,
    'TransactionDate' : td,
    'TransactionType' : tt,
    'TransactionTypeID' : ttid,
    'DocumentNumber' : dn,
    'VendorName' : vn,
    'VendorID' : vid,
    'Memo' : m,
    'Split' : s,
    'SplitID' : sid,
    'TransactionAmount' : ta,
    'NewAccountBalance' : nab
}

In [43]:
# import pandas module for data frame
import pandas as pd

# Sample data: three students, one college each, the same eight subjects apiece
subjectsdata = {
    'Name': [name for name in ('sravan', 'Ojaswi', 'Rohith') for _ in range(8)],
    'college': [college for college in ('VFSTRU', 'VIT', 'IIT-Bhu') for _ in range(8)],
    'subject': ['java', 'dbms', 'dms', 'coa', 'python', 'dld', 'android', 'iot'] * 3,
}

# Build the frame and show the flat student records
df = pd.DataFrame(subjectsdata)
print(df)

# Hierarchical (Name, college) index; drop=False keeps both as ordinary columns too
df = df.set_index(['Name', 'college'], drop=False)

# index=False leaves the hierarchical index out of the file, so only the
# three data columns are written
df.to_csv("C:\\Users\\paulcassidy\\OneDrive - Archdiocese of Chicago\\Documents\\JSON_Parsing\\Output\\hierarchy.csv", index=False)

      Name  college  subject
0   sravan   VFSTRU     java
1   sravan   VFSTRU     dbms
2   sravan   VFSTRU      dms
3   sravan   VFSTRU      coa
4   sravan   VFSTRU   python
5   sravan   VFSTRU      dld
6   sravan   VFSTRU  android
7   sravan   VFSTRU      iot
8   Ojaswi      VIT     java
9   Ojaswi      VIT     dbms
10  Ojaswi      VIT      dms
11  Ojaswi      VIT      coa
12  Ojaswi      VIT   python
13  Ojaswi      VIT      dld
14  Ojaswi      VIT  android
15  Ojaswi      VIT      iot
16  Rohith  IIT-Bhu     java
17  Rohith  IIT-Bhu     dbms
18  Rohith  IIT-Bhu      dms
19  Rohith  IIT-Bhu      coa
20  Rohith  IIT-Bhu   python
21  Rohith  IIT-Bhu      dld
22  Rohith  IIT-Bhu  android
23  Rohith  IIT-Bhu      iot
