**Purpose:** The purpose of this notebook is to ingest the patient_profiles in json format to CSV without losing file integrity.

In [1]:
import pandas as pd
import os
import json
from pandas.io.json import json_normalize

In [2]:
pd.set_option('display.max_rows',1000)
pd.set_option('display.max_columns',1000)

### Load patient_profiles.json

In [3]:
file = 'patient_profiles.json'
filepath = os.getcwd()+'/input/'+file
#df = pd.read_json(filepath)

with open(filepath) as jsonfile:
    data = json.load(jsonfile)
df = pd.DataFrame.from_dict(json_normalize(data), orient='columns')

In [4]:
df

Unnamed: 0,cohort_id,institution,patient_profiles
0,14562556998,Saint Penelope Medical Center,"[{'patient_id': '102bb8fae', 'demographics': {..."
1,14562556998,"BioLab, Inc.","[{'patient_id': '100688fb9', 'demographics': {..."
2,14562556998,University Hospital System,"[{'patient_id': '1002df1d3', 'demographics': {..."
3,14562556998,Goodfellow Research Institute,"[{'patient_id': '104fc5e3c', 'demographics': {..."
4,14562556998,Montague Hospital,"[{'patient_id': '1010441f', 'demographics': {'..."
5,14562556998,Johnson & Bloom Hospitals,"[{'patient_id': '103278b88', 'demographics': {..."
6,14562556998,Medical Information Exchange,"[{'patient_id': '1002cb1e8', 'demographics': {..."


In [5]:
df['patient_profiles'][0]

[{'patient_id': '102bb8fae',
  'demographics': {'gender': 'Female', 'age': 68},
  'status': {'disease_sub_type': 'A',
   'comorbidity_index': 0,
   'cohort_qualifier': True,
   'smoking_status': 'never',
   'months_since_diagnosis': 1}},
 {'patient_id': '10e32947f',
  'demographics': {'gender': 'Female', 'age': 66, 'race': 'White'},
  'status': {'disease_sub_type': 'A',
   'comorbidity_index': None,
   'cohort_qualifier': True,
   'smoking_status': 'former',
   'months_since_diagnosis': 0}},
 {'patient_id': '11156e14a',
  'demographics': {'gender': 'Male', 'age': 61, 'race': 'White'},
  'status': {'disease_sub_type': 'A',
   'comorbidity_index': 1,
   'cohort_qualifier': True,
   'smoking_status': 'never',
   'months_since_diagnosis': 16}},
 {'patient_id': '113d8066d',
  'demographics': {'gender': 'Male', 'age': 62},
  'status': {'disease_sub_type': 'B',
   'comorbidity_index': None,
   'cohort_qualifier': True,
   'smoking_status': 'current',
   'months_since_diagnosis': 0}},
 {'patie

In [5]:
#flattens nested json
def flatten_json(nested_json, exclude=['']):
    out = {}
    def flatten(x, name='', exclude=exclude):
        if type(x) is dict:
            for a in x:
                if a not in exclude: flatten(x[a], name + a + '_')
        elif type(x) is list:
            i = 0
            for a in x:
                flatten(a, name + str(i) + '_')
                i += 1
        else:
            out[name[:-1]] = x
    flatten(nested_json)
    return out

#run above in quick loop to create final dataframe & CSV
i = 0
df_comp = pd.DataFrame(columns=['demographics_age','demographics_gender','demographics_race','patient_id','status_cohort_qualifier','status_comorbidity_index','status_disease_sub_type','status_months_since_diagnosis','status_smoking_status','institution'])
inst_list = list(df['institution'].unique())
while i < len(df):
    df_temp = pd.DataFrame([flatten_json(x) for x in df['patient_profiles'][i]])
    df_temp['institution'] = inst_list[i]
    df_comp = pd.concat([df_comp,df_temp])
    i+=1 

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.




In [6]:
file = 'patient_profiles.csv'
df_comp.to_csv(os.getcwd()+'/output/'+file, index=False)
df_comp.head()

Unnamed: 0,demographics_age,demographics_gender,demographics_race,institution,patient_id,status_alcohol_usage,status_bmi_level,status_cohort_qualifier,status_comorbidity_index,status_days_since_diagnosis,status_disease_sub_type,status_exercise_frequency,status_months_since_diagnosis,status_smoking_status
0,68,Female,,Saint Penelope Medical Center,102bb8fae,,,True,0.0,,A,,1,never
1,66,Female,White,Saint Penelope Medical Center,10e32947f,,,True,,,A,,0,former
2,61,Male,White,Saint Penelope Medical Center,11156e14a,,,True,1.0,,A,,16,never
3,62,Male,,Saint Penelope Medical Center,113d8066d,,,True,,,B,,0,current
4,59,Male,,Saint Penelope Medical Center,113ec3f1,,,True,,,A,,9,former
