In [1]:
import pandas as pd
import json

# Load the exported report into a plain Python structure (nested dicts/lists).
# JSON text is decoded explicitly as UTF-8 so the result does not depend on
# the platform's default encoding.
with open("ProfitAndLoss.json", encoding="utf-8") as f:
    rawdata = json.load(f)

In [2]:
def parse_json(data):
    """Print ``name, id, value`` for the account rows found by a Rows.Row-first walk.

    Parameters
    ----------
    data : dict or pandas.DataFrame
        The raw report (a dict, normalized on entry) or an already normalized
        frame. Any other type is ignored.

    Returns
    -------
    None
        Results are printed, one ``name, id, value`` line per account.

    Notes
    -----
    Exactly ONE column is processed per frame, chosen by column presence in
    the order ``Rows.Row`` > ``ColData`` > ``Header.ColData``. A frame that
    has a ``Rows.Row`` column therefore never prints its own ``ColData`` or
    ``Header.ColData`` cells. Cells that cannot be normalized (e.g. NaN
    placeholders) or that lack an ``id`` / second entry are skipped silently.
    """
    if isinstance(data, dict):
        parse_json(pd.json_normalize(data))
        return
    if not isinstance(data, pd.DataFrame):
        return

    # Column membership is fixed for the whole frame, so choose the branch once
    # instead of re-testing it on every row.
    column = next(
        (name for name in ('Rows.Row', 'ColData', 'Header.ColData') if name in data),
        None,
    )
    if column is None:
        return

    for row in range(data.shape[0]):
        try:
            cell = pd.json_normalize(data[column][row])
            if column == 'Rows.Row':
                # Descend into the nested rows of this section.
                parse_json(cell)
                continue
            if column == 'Header.ColData' and 'id' not in cell:
                # Header without an account id: nothing to report.
                continue
            acc_name = cell['value'][0]
            acc_id = cell['id'][0]
            acc_val = cell['value'][1]
            print(f"{acc_name}, {acc_id}, {acc_val}")
        except Exception:
            # Best-effort walk: skip cells that cannot be normalized or that
            # lack the expected entries. Unlike a bare ``except``, this lets
            # KeyboardInterrupt/SystemExit propagate.
            continue

### Let's run the actual code

In [3]:
# Walk the loaded report with the Rows.Row-first parser defined in the cell above.
parse_json(rawdata)

3013.1 Counting Error, 692, 452.45
3016 Holy Day Collection Envelopes, 281, 5408.00
3017 Holy Day Collect. Loose Ck/Cash, 280, 568.00
3042 Flowers Collection, 558, 1173.00
3043 School Support Collection, 282, 140.00
3045 Almsgiving, 369, 5828.00
3101 Tuition - Current, 698, 3014281.05
3102 Tuition - Prior, 648, 30883.00
3125A 3rd Party Rev., 658, 216.00
3125B 3rd Party Exp, 612, 5440.00
3301 School Meal Program, 37, 413.00
3351 Facility Lease, 595, 45900.00
3352 Facility Rental, 630, 200.00
3407.01 Tax Credit Scholarship, 358, 65526.50
3461.03 SJS Annual Fund Drive, 678, 1515.00
3462.01 SFXS Annual School Appeal, 650, 2000.00
3501 Interest-Arch Regular Savings, 41, 1646.24
3502 Interest-Arch Cert of Deposit, 42, 671.46
3503 Interest-Arch Building Fund, 43, 2146.75
3504 Invest. Gains/Losses Arch Endow, 44, 548814.09
3505 Invest. Gains/Loss Arch Spc Inv, 147, 23768.66
3506 Interest-Checking Accounts, 148, 46.03
3507 Other Interest Account, 45, 212.75
3603.02 SFX Men's Club, 460, 23300.38

Oh my stars! It actually does something right! It's not all the data though... let's figure out what it got and what it missed.

<span style="color:blue">*(another note: the code did exactly what I thought it would do based on stepping through it up above. so that's a partial win!)*</span>

So what it does is:
- It keeps normalizing whenever it finds a `Rows.Row` column in a data frame, and only prints the column data once it has reached the last `Rows.Row`. That makes sense from the code.
- It also doesn't seem to go "back up to the top" and then do it again. I'll try to show what I mean.

### Troubleshooting

What do we want it to do? Do we want it to:
- go down the rabbit hole first, then come back out, then go down a different rabbit hole?
- or just get everything as we go deeper and deeper?

Let's try the latter option first.   
Let's switch the order of `ColData` and `Rows.Row`.

In [5]:
def parse_json(data):
    """Print ``name, id, value`` for the account rows found by a ColData-first walk.

    Parameters
    ----------
    data : dict or pandas.DataFrame
        The raw report (a dict, normalized on entry) or an already normalized
        frame. Any other type is ignored.

    Returns
    -------
    None
        Results are printed, one ``name, id, value`` line per account.

    Notes
    -----
    Exactly ONE column is processed per frame, chosen by column presence in
    the order ``ColData`` > ``Rows.Row`` > ``Header.ColData``. A frame that
    has a ``ColData`` column prints only those cells and never descends into
    the ``Rows.Row`` cells of its other rows. Cells that cannot be normalized
    (e.g. NaN placeholders) or that lack an ``id`` / second entry are skipped
    silently.
    """
    if isinstance(data, dict):
        parse_json(pd.json_normalize(data))
        return
    if not isinstance(data, pd.DataFrame):
        return

    # Column membership is fixed for the whole frame, so choose the branch once
    # instead of re-testing it on every row.
    column = next(
        (name for name in ('ColData', 'Rows.Row', 'Header.ColData') if name in data),
        None,
    )
    if column is None:
        return

    for row in range(data.shape[0]):
        try:
            cell = pd.json_normalize(data[column][row])
            if column == 'Rows.Row':
                # Descend into the nested rows of this section.
                parse_json(cell)
                continue
            if column == 'Header.ColData' and 'id' not in cell:
                # Header without an account id: nothing to report.
                continue
            acc_name = cell['value'][0]
            acc_id = cell['id'][0]
            acc_val = cell['value'][1]
            print(f"{acc_name}, {acc_id}, {acc_val}")
        except Exception:
            # Best-effort walk: skip cells that cannot be normalized or that
            # lack the expected entries. Unlike a bare ``except``, this lets
            # KeyboardInterrupt/SystemExit propagate.
            continue

In [6]:
# Re-run on the same report with the ColData-first version from the previous cell.
parse_json(rawdata)

3020 Christmas Collection, 110, 350328.00
3030 Easter Collection, 111, 750.00
4030 Health Insurance-Employer Paid, 163, 524972.00
4040 Employer FICA, 164, 226912.54
4400 Telephone, 60, 10050.32
4410 Heating Fuel, 61, 32037.55
4420 Electricity, 62, 50452.71
4760 PRMAA Assessment, 145, 62225.82
4770 OCS Assessment, 146, 26055.00
4780 Property/Casualty Insurance, 390, 153969.53
4790 Auto Insur-Priest Owned Vehicle, 72, 3855.00
5001 Casualty Claims, 220, 9560.00
5011.1 St. VIncent de Paul, 516, 45309.00
5012.02 St. Ignatius, 424, 14002.00
5013.01 St. Thomas of Canterbury, 421, 46138.00
5017 Haiti-Rodlin, 248, 4202.95
5031 Latin America, 81, 1401.00
5032 Seminaries, 87, 3035.00
5033 Annual Catholic Appeal, 82, -1132.00
5034 Holy Land, 83, 120.00
5035 Aid Churches Centl/East Europe, 86, 1785.00
5036 Catholic Charities, 84, 2060.00
5037 Peter's Pence, 85, 755.00
5039 World Mission Sunday, 88, 2148.00
5040 Cath Campaign for Human Develop, 89, 5240.00
5041 Religious Retirement, 90, 5066.00
5042

**Hmmm**, we get a lot less data that way. The first two lines of output are the `ColData` from the third data frame:

In [10]:
# Peel the report apart by hand, one Rows.Row level at a time.
df1 = pd.json_normalize(rawdata)
df2 = pd.json_normalize(df1.loc[0, 'Rows.Row'])
df3 = pd.json_normalize(df2.loc[0, 'Rows.Row'])
# Normalize the ColData cells of df3 rows 1 and 2 for inspection.
df3_c1, df3_c2 = (pd.json_normalize(df3.loc[idx, 'ColData']) for idx in (1, 2))

In [11]:
# Display the third-level frame (first Rows.Row cell of df2, normalized).
df3

Unnamed: 0,type,Header.ColData,Rows.Row,Summary.ColData,ColData
0,Section,[{'value': '3000 Sunday and Holy Day Collectio...,[{'Header': {'ColData': [{'value': '3011 Sunda...,[{'value': 'Total 3000 Sunday and Holy Day Col...,
1,Data,,,,"[{'value': '3020 Christmas Collection', 'id': ..."
2,Data,,,,"[{'value': '3030 Easter Collection', 'id': '11..."
3,Section,"[{'value': '3040 Other Collections', 'id': '11...",[{'ColData': [{'value': '3042 Flowers Collecti...,"[{'value': 'Total 3040 Other Collections'}, {'...",
4,Section,"[{'value': '3100 Tuition', 'id': '392'}, {'val...",[{'ColData': [{'value': '3101 Tuition - Curren...,"[{'value': 'Total 3100 Tuition'}, {'value': '3...",
5,Section,"[{'value': '3110 Fees', 'id': '28'}, {'value':...",[{'ColData': [{'value': '3111 Registration Fee...,"[{'value': 'Total 3110 Fees'}, {'value': '4753...",
6,Section,"[{'value': '3300 Food Services', 'id': '36'}, ...",[{'ColData': [{'value': '3301 School Meal Prog...,"[{'value': 'Total 3300 Food Services'}, {'valu...",
7,Section,"[{'value': '3350 Lease and/or Rental Income', ...","[{'ColData': [{'value': '3351 Facility Lease',...",[{'value': 'Total 3350 Lease and/or Rental Inc...,
8,Section,"[{'value': '3400 Outside Funding Sources', 'id...",[{'Header': {'ColData': [{'value': '3407 Other...,[{'value': 'Total 3400 Outside Funding Sources...,
9,Section,"[{'value': '3450 Fund Raising Net Income', 'id...",[{'Header': {'ColData': [{'value': '3461 St. J...,[{'value': 'Total 3450 Fund Raising Net Income...,


In [12]:
# Display the normalized ColData cell of df3 row 1.
df3_c1

Unnamed: 0,value,id
0,3020 Christmas Collection,110.0
1,350328.00,


In [13]:
# Display the normalized ColData cell of df3 row 2.
df3_c2

Unnamed: 0,value,id
0,3030 Easter Collection,111.0
1,750.00,


Where does "4030 Health Insurance-Employer Paid, 163, 524972.00" come from?

Instead of trying to figure this out every time, I think I should write some print statements to show where we are. I'll code that now

In [44]:
# Module-level counter kept for the cell's namespace; parse_json below does not
# use it. (A ``global`` statement at module level has no effect and was dropped.)
count = 0


def parse_json(data):
    """ColData-first walk that announces each branch before printing account data.

    Parameters
    ----------
    data : dict or pandas.DataFrame
        The raw report (a dict, normalized on entry) or an already normalized
        frame. Any other type is ignored.

    Returns
    -------
    None
        Trace lines (``Norm by ...``) and ``**Data`` lines are printed.

    Notes
    -----
    Exactly ONE column is processed per frame, chosen by column presence in
    the order ``ColData`` > ``Rows.Row`` > ``Header.ColData``. A trace line is
    printed only after the cell was normalized successfully, so rows whose
    cell cannot be normalized (e.g. NaN placeholders) leave no trace at all.
    """
    if isinstance(data, dict):
        data = pd.json_normalize(data)
        print("Initialize Dataframe\n")
        parse_json(data)
        return
    if not isinstance(data, pd.DataFrame):
        return

    n_rows = data.shape[0]
    # The column test is loop-invariant, so each branch owns its own row loop.
    if 'ColData' in data:
        for row in range(n_rows):
            try:
                col_data = pd.json_normalize(data['ColData'][row])
                print(f"Norm by ColData, row {row}")
                acc_name = col_data['value'][0]
                acc_id = col_data['id'][0]
                acc_val = col_data['value'][1]
                print(f"**Data: {acc_name}, {acc_id}, {acc_val}\n")
            except Exception:
                # Best-effort: skip cells that cannot be normalized or lack
                # the expected entries (KeyboardInterrupt still propagates).
                continue
    elif 'Rows.Row' in data:
        for row in range(n_rows):
            try:
                row_data = pd.json_normalize(data['Rows.Row'][row])
                print(f"Norm by Rows.Row, row {row}\n")
                parse_json(row_data)
            except Exception:
                continue
    elif 'Header.ColData' in data:
        for row in range(n_rows):
            try:
                head_data = pd.json_normalize(data['Header.ColData'][row])
                print(f"Norm by Header.ColData, row {row}")
                if 'id' not in head_data:
                    # Header without an account id: nothing to report.
                    continue
                acc_name = head_data['value'][0]
                acc_id = head_data['id'][0]
                acc_val = head_data['value'][1]
                print(f"**Data:{acc_name}, {acc_id}, {acc_val}")
            except Exception:
                continue

In [45]:
# Run the traced version from the previous cell; each branch prints a "Norm by ..." line.
parse_json(rawdata)

Initialize Dataframe

Norm by Rows.Row, row 0

Norm by Rows.Row, row 0

Norm by ColData, row 1
**Data: 3020 Christmas Collection, 110, 350328.00

Norm by ColData, row 2
**Data: 3030 Easter Collection, 111, 750.00

Norm by Rows.Row, row 2

Norm by ColData, row 1
**Data: 4030 Health Insurance-Employer Paid, 163, 524972.00

Norm by ColData, row 2
**Data: 4040 Employer FICA, 164, 226912.54

Norm by ColData, row 9
**Data: 4400 Telephone, 60, 10050.32

Norm by ColData, row 10
**Data: 4410 Heating Fuel, 61, 32037.55

Norm by ColData, row 11
**Data: 4420 Electricity, 62, 50452.71

Norm by ColData, row 19
**Data: 4760 PRMAA Assessment, 145, 62225.82

Norm by ColData, row 20
**Data: 4770 OCS Assessment, 146, 26055.00

Norm by ColData, row 21
**Data: 4780 Property/Casualty Insurance, 390, 153969.53

Norm by ColData, row 22
**Data: 4790 Auto Insur-Priest Owned Vehicle, 72, 3855.00

Norm by Rows.Row, row 4

Norm by Rows.Row, row 0

Norm by ColData, row 0
**Data: 5001 Casualty Claims, 220, 9560.00



Let's see what happens if I put `Header.ColData` before `Rows.Row`.

In [46]:
def parse_json(data):
    """Traced walk that checks ``Header.ColData`` before ``Rows.Row``.

    Parameters
    ----------
    data : dict or pandas.DataFrame
        The raw report (a dict, normalized on entry) or an already normalized
        frame. Any other type is ignored.

    Returns
    -------
    None
        Trace lines (``Norm by ...``) and ``**Data`` lines are printed.

    Notes
    -----
    Exactly ONE column is processed per frame, chosen by column presence in
    the order ``ColData`` > ``Header.ColData`` > ``Rows.Row``. A frame with a
    ``Header.ColData`` column therefore never descends into its ``Rows.Row``
    cells. Cells that cannot be normalized are skipped without a trace line.
    """
    if isinstance(data, dict):
        data = pd.json_normalize(data)
        print("Initialize Dataframe\n")
        parse_json(data)
        return
    if not isinstance(data, pd.DataFrame):
        return

    n_rows = data.shape[0]
    # The column test is loop-invariant, so each branch owns its own row loop.
    if 'ColData' in data:
        for row in range(n_rows):
            try:
                col_data = pd.json_normalize(data['ColData'][row])
                print(f"Norm by ColData, row {row}")
                acc_name = col_data['value'][0]
                acc_id = col_data['id'][0]
                acc_val = col_data['value'][1]
                print(f"**Data: {acc_name}, {acc_id}, {acc_val}\n")
            except Exception:
                # Best-effort: skip cells that cannot be normalized or lack
                # the expected entries (KeyboardInterrupt still propagates).
                continue
    elif 'Header.ColData' in data:
        for row in range(n_rows):
            try:
                head_data = pd.json_normalize(data['Header.ColData'][row])
                print(f"Norm by Header.ColData, row {row}")
                if 'id' not in head_data:
                    # Header without an account id: nothing to report.
                    continue
                acc_name = head_data['value'][0]
                acc_id = head_data['id'][0]
                acc_val = head_data['value'][1]
                print(f"**Data:{acc_name}, {acc_id}, {acc_val}")
            except Exception:
                continue
    elif 'Rows.Row' in data:
        for row in range(n_rows):
            try:
                row_data = pd.json_normalize(data['Rows.Row'][row])
                print(f"Norm by Rows.Row, row {row}\n")
                parse_json(row_data)
            except Exception:
                continue

In [47]:
# Run the version from the previous cell, which tests Header.ColData before Rows.Row.
parse_json(rawdata)

Initialize Dataframe

Norm by Rows.Row, row 0

Norm by Header.ColData, row 0
Norm by Header.ColData, row 2
Norm by Header.ColData, row 4
Norm by Header.ColData, row 5


Two of the branches never recurse, and I wonder where I could add a recursive call: the `Header.ColData` block only iterates over rows, and the `ColData` block doesn't recurse either.  
I'm going to add `parse_json(col_data)` and `parse_json(head_data)`.

In [54]:
def parse_json(data):
    """Traced walk that also recurses into each normalized ColData / header cell.

    Parameters
    ----------
    data : dict or pandas.DataFrame
        The raw report (a dict, normalized on entry) or an already normalized
        frame. Any other type is ignored.

    Returns
    -------
    None
        Trace lines (``Norm by ...``) and ``**Data`` lines are printed.

    Notes
    -----
    Exactly ONE column is processed per frame, chosen by column presence in
    the order ``ColData`` > ``Header.ColData`` > ``Rows.Row``. After printing
    an account line the function calls itself on the normalized cell; that
    call only does something if the cell frame itself contains a ``ColData``,
    ``Header.ColData`` or ``Rows.Row`` column.
    """
    if isinstance(data, dict):
        data = pd.json_normalize(data)
        print("Initialize Dataframe\n")
        parse_json(data)
        return
    if not isinstance(data, pd.DataFrame):
        return

    n_rows = data.shape[0]
    # The column test is loop-invariant, so each branch owns its own row loop.
    if 'ColData' in data:
        for row in range(n_rows):
            try:
                col_data = pd.json_normalize(data['ColData'][row])
                print(f"Norm by ColData, row {row}")
                acc_name = col_data['value'][0]
                acc_id = col_data['id'][0]
                acc_val = col_data['value'][1]
                print(f"**Data: {acc_name}, {acc_id}, {acc_val}\n")
                # NOTE(review): col_data presumably holds only value/id
                # columns, in which case this call finds nothing to process.
                parse_json(col_data)
            except Exception:
                # Best-effort: skip cells that cannot be normalized or lack
                # the expected entries (KeyboardInterrupt still propagates).
                continue
    elif 'Header.ColData' in data:
        for row in range(n_rows):
            try:
                head_data = pd.json_normalize(data['Header.ColData'][row])
                print(f"Norm by Header.ColData, row {row}")
                if 'id' not in head_data:
                    # Header without an account id: nothing to report.
                    continue
                acc_name = head_data['value'][0]
                acc_id = head_data['id'][0]
                acc_val = head_data['value'][1]
                print(f"**Data:{acc_name}, {acc_id}, {acc_val}")
                # NOTE(review): same as above, likely a no-op on value/id frames.
                parse_json(head_data)
            except Exception:
                continue
    elif 'Rows.Row' in data:
        for row in range(n_rows):
            try:
                row_data = pd.json_normalize(data['Rows.Row'][row])
                print(f"Norm by Rows.Row, row {row}\n")
                parse_json(row_data)
            except Exception:
                continue

In [55]:
# Run the version from the previous cell, which adds recursive calls on col_data and head_data.
parse_json(rawdata)

Initialize Dataframe

Norm by Rows.Row, row 0

Norm by Header.ColData, row 0
Norm by Header.ColData, row 2
Norm by Header.ColData, row 4
Norm by Header.ColData, row 5


Nothing changed...

Next, add a recursive call in the branch for headers without an `id`, replacing its `continue`:

```python
if isinstance(head_data, pd.core.frame.DataFrame) and 'id' not in head_data:
    continue
```

In [106]:
def parse_json(data):
    """Traced walk that reports header rows it has to bypass.

    Parameters
    ----------
    data : dict or pandas.DataFrame
        The raw report (a dict, normalized on entry) or an already normalized
        frame. Any other type is ignored.

    Returns
    -------
    None
        Trace lines, ``Bypassed row N`` notices and ``**Data`` lines are printed.

    Notes
    -----
    Exactly ONE column is processed per frame, chosen by column presence in
    the order ``ColData`` > ``Header.ColData`` > ``Rows.Row``.

    In the ``Header.ColData`` branch a cell that cannot be normalized is
    reported as ``Bypassed row N``. ``pd.json_normalize`` raises ``TypeError``
    for a NaN cell on older pandas releases and ``NotImplementedError`` on
    current ones, so both are handled; catching only ``TypeError`` would let
    the error escape and silently abandon the remaining rows of the frame.
    """
    if isinstance(data, dict):
        data = pd.json_normalize(data)
        print("Initialize Dataframe\n")
        parse_json(data)
        return
    if not isinstance(data, pd.DataFrame):
        return

    n_rows = data.shape[0]
    # The column test is loop-invariant, so each branch owns its own row loop.
    if 'ColData' in data:
        for row in range(n_rows):
            try:
                col_data = pd.json_normalize(data['ColData'][row])
                print(f"Norm by ColData, row {row}")
                acc_name = col_data['value'][0]
                acc_id = col_data['id'][0]
                acc_val = col_data['value'][1]
                print(f"**Data: {acc_name}, {acc_id}, {acc_val}\n")
                parse_json(col_data)
            except Exception:
                # Best-effort: skip cells that cannot be normalized or lack
                # the expected entries (KeyboardInterrupt still propagates).
                continue
    elif 'Header.ColData' in data:
        for row in range(n_rows):
            try:
                head_data = pd.json_normalize(data['Header.ColData'][row])
                print(f"Norm by Header.ColData, row {row}")
                if 'id' not in head_data:
                    # No account id on this header; only recurse into the cell.
                    parse_json(head_data)
                else:
                    print("got to the assignment block")
                    acc_name = head_data['value'][0]
                    acc_id = head_data['id'][0]
                    acc_val = head_data['value'][1]
                    print(f"**Data:{acc_name}, {acc_id}, {acc_val}")
                    parse_json(head_data)
            except (TypeError, NotImplementedError):
                # Show when a NaN value got bypassed.
                print(f"Bypassed row {row}")
    elif 'Rows.Row' in data:
        for row in range(n_rows):
            try:
                row_data = pd.json_normalize(data['Rows.Row'][row])
                print(f"Norm by Rows.Row, row {row}\n")
                parse_json(row_data)
            except Exception:
                continue

In [107]:
# Run the version from the previous cell, which prints "Bypassed row N" for header cells it cannot normalize.
parse_json(rawdata)

Initialize Dataframe

Norm by Rows.Row, row 0

Norm by Header.ColData, row 0
Bypassed row 1
Norm by Header.ColData, row 2
Bypassed row 3
Norm by Header.ColData, row 4
Norm by Header.ColData, row 5
Bypassed row 6
Bypassed row 7


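Next attempt: count the data frames as we go, and when a header has no `id`, normalize that row's `Rows.Row` and recurse into it.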

In [138]:
# Running counter used in the trace output of parse_json below. It is bumped
# once per normalized dict and once per row visited in the Rows.Row branch.
df_count = 0


def parse_json(data):
    """Traced walk that descends into ``Rows.Row`` for headers without an id.

    Parameters
    ----------
    data : dict or pandas.DataFrame
        The raw report (a dict, normalized on entry) or an already normalized
        frame. Any other type is ignored.

    Returns
    -------
    None
        Trace lines, ``Bypassed row N`` notices and ``**Data`` lines are
        printed. The module-level ``df_count`` is incremented as a side effect.

    Notes
    -----
    Exactly ONE column is processed per frame, chosen by column presence in
    the order ``ColData`` > ``Header.ColData`` > ``Rows.Row``.

    In the ``Header.ColData`` branch, a header without an ``id`` column leads
    to normalizing that row's ``Rows.Row`` cell and recursing into it; a
    header with an ``id`` is printed and its children are NOT visited.

    ``pd.json_normalize`` raises ``TypeError`` for a NaN cell on older pandas
    releases and ``NotImplementedError`` on current ones, so the header branch
    handles both when reporting a bypassed row.
    """
    global df_count
    if isinstance(data, dict):
        data = pd.json_normalize(data)
        df_count += 1
        print(f"Initializing DataFrame No. {df_count}...\n")
        print("RECURSION")
        parse_json(data)
        return
    if not isinstance(data, pd.DataFrame):
        return

    n_rows = data.shape[0]
    # The column test is loop-invariant, so each branch owns its own row loop.
    if 'ColData' in data:
        for row in range(n_rows):
            try:
                col_data = pd.json_normalize(data['ColData'][row])
                print(f"ColData, df {df_count}, row {row}")
                acc_name = col_data['value'][0]
                acc_id = col_data['id'][0]
                acc_val = col_data['value'][1]
                print(f"**Data: {acc_name}, {acc_id}, {acc_val}\n")
            except Exception:
                # Show when a NaN value (or an entry without id) got bypassed.
                print(f"ColData: Bypassed row {row}")
    elif 'Header.ColData' in data:
        for row in range(n_rows):
            try:
                head_data = pd.json_normalize(data['Header.ColData'][row])
                print(f"Header.ColData, df {df_count}, row {row}")
                if 'id' not in head_data:
                    # No account id on this header: walk the rows nested
                    # under it instead.
                    nested_rows = pd.json_normalize(data['Rows.Row'][row])
                    print("RECURSION")
                    parse_json(nested_rows)
                else:
                    print("got to the assignment block")
                    acc_name = head_data['value'][0]
                    acc_id = head_data['id'][0]
                    acc_val = head_data['value'][1]
                    print(f"**Data:{acc_name}, {acc_id}, {acc_val}")
                    print("RECURSION")
                    parse_json(head_data)
            except (TypeError, NotImplementedError):
                # Show when a NaN value got bypassed.
                print(f"Header.ColData: Bypassed row {row}")
    elif 'Rows.Row' in data:
        for row in range(n_rows):
            # Counted before normalizing, so bypassed rows advance it too.
            df_count += 1
            try:
                row_data = pd.json_normalize(data['Rows.Row'][row])
                # Same "df N, row M" format as the other branches (a stray
                # "r" after the counter was removed).
                print(f"Rows.Row, df {df_count}, row {row}\n")
                print("RECURSION")
                parse_json(row_data)
            except Exception:
                continue

In [139]:
# Run the df_count-instrumented version from the previous cell on the loaded report.
parse_json(rawdata)

Initializing DataFrame No. 1...

RECURSION
Rows.Row, df 2, row 0

RECURSION
Header.ColData, df 2, row 0
RECURSION
ColData: Bypassed row 0
ColData, df 2, row 1
**Data: 3020 Christmas Collection, 110, 350328.00

ColData, df 2, row 2
**Data: 3030 Easter Collection, 111, 750.00

ColData: Bypassed row 3
ColData: Bypassed row 4
ColData: Bypassed row 5
ColData: Bypassed row 6
ColData: Bypassed row 7
ColData: Bypassed row 8
ColData: Bypassed row 9
ColData: Bypassed row 10
ColData: Bypassed row 11
ColData: Bypassed row 12
Header.ColData: Bypassed row 1
Header.ColData, df 2, row 2
RECURSION
ColData: Bypassed row 0
ColData, df 2, row 1
**Data: 4030 Health Insurance-Employer Paid, 163, 524972.00

ColData, df 2, row 2
**Data: 4040 Employer FICA, 164, 226912.54

ColData: Bypassed row 3
ColData: Bypassed row 4
ColData: Bypassed row 5
ColData: Bypassed row 6
ColData: Bypassed row 7
ColData: Bypassed row 8
ColData, df 2, row 9
**Data: 4400 Telephone, 60, 10050.32

ColData, df 2, row 10
**Data: 4410 Hea

this is getting too complicated

In [None]:
def parse_json(data):
    """Print every account line of the report, depth-first and in document order.

    This replaces the unfinished draft of this cell, which did not compile
    (a ``try`` without a handler), used ``df_count`` without declaring it
    global, iterated one row past the end of the frame and rebound ``data``
    inside its own loop.

    Instead of choosing one column for the whole frame, every row is examined
    on its own:

    1. a ``Header.ColData`` list with an account id is printed,
    2. a ``ColData`` list with an account id is printed,
    3. a ``Rows.Row`` list is normalized and walked recursively.

    Parameters
    ----------
    data : dict or pandas.DataFrame
        The raw report (a dict, normalized on entry) or an already normalized
        frame. Any other type is ignored.

    Returns
    -------
    None
        One ``**Data`` line is printed per account found. Header lines use
        the format ``**Data:name, id, value``; data lines use
        ``**Data: name, id, value`` followed by a blank line.

    Notes
    -----
    Cells that are not lists (the NaN placeholders of rows that lack the
    column) and entries without an ``id`` or without a second ``value`` are
    skipped silently. ``Summary.ColData`` cells are not printed.
    """
    if isinstance(data, dict):
        parse_json(pd.json_normalize(data))
        return
    if not isinstance(data, pd.DataFrame):
        return

    # (column, output template) pairs for the two kinds of account line.
    account_columns = (
        ('Header.ColData', "**Data:{}, {}, {}"),
        ('ColData', "**Data: {}, {}, {}\n"),
    )

    for row in range(data.shape[0]):
        for column, template in account_columns:
            cells = data[column][row] if column in data else None
            if not isinstance(cells, list):
                # Column absent, or NaN placeholder for this row.
                continue
            entries = pd.json_normalize(cells)
            if 'id' not in entries or 'value' not in entries or len(entries) < 2:
                # Not an account line (e.g. a header without an id).
                continue
            print(template.format(entries['value'][0], entries['id'][0], entries['value'][1]))

        # Descend after printing so a section's own line precedes its children.
        children = data['Rows.Row'][row] if 'Rows.Row' in data else None
        if isinstance(children, list) and children:
            parse_json(pd.json_normalize(children))