In [1]:
# import built-in libraries
import json

#import 3rd party libraries
import pandas as pd
import numpy as np

In [2]:
def process_csv(file):
    """Load the match-file .csv and normalise its address columns.

    The frame is re-indexed to start at 1, the ``street``, ``street_2`` and
    ``city`` columns are title-cased through ``capitalize`` and all-lowercase
    ``state`` values are upper-cased, so the text columns are standardized
    as per the parsed json file.

    Arguments:
    Input - file (str): path of the .csv file to read; it must contain the
            columns ``street``, ``street_2``, ``city`` and ``state``
    Output - Dataframe (Pandas)

    Returns a pandas dataframe. Missing (NaN) cells and any other
    non-string cells are left untouched.
    """

    raw_df = pd.read_csv(file)
    # Shift the default 0-based index so rows are numbered from 1
    # (data_df builds the json frame with the same 1-based index).
    raw_df.index = raw_df.index + 1

    # Only genuine strings are transformed. An identity test against np.nan
    # is not enough: a column with no values at all is read as float64 and
    # its NaNs are not the np.nan singleton, so they would reach .title().
    for column in ('street', 'street_2', 'city'):
        raw_df[column] = raw_df[column].map(
            lambda value: capitalize(value) if isinstance(value, str) else value)

    # Upper-case only fully lower-case states ('ny' -> 'NY'); mixed-case
    # values such as 'Ny' are deliberately left as they are.
    raw_df['state'] = raw_df['state'].map(
        lambda value: value.upper()
        if isinstance(value, str) and value.islower() else value)

    return raw_df

def collect_data(file):

    """
    Load a line-delimited json file in a list

    Arguments:
    Input - file (str): path of a file holding one json document per line
    Output - data ([List])

    Returns one parsed object per non-blank line, in file order.
    Raises json.JSONDecodeError when a non-blank line is not valid json.
    """

    # Open the path that was passed in (the argument used to be ignored in
    # favour of a hard-coded file name). json text is UTF-8 by specification,
    # so do not depend on the platform's default encoding.
    with open(file, encoding='utf-8') as f:
        # Blank lines (e.g. a trailing newline) carry no record; skipping them
        # avoids a JSONDecodeError on empty input.
        return [json.loads(line) for line in f if line.strip()]


def data_df(data):

    """
    Converts json data list to a pandas dataframe by concatenating doctors
    and practices

    Arguments:
    Input - data ([List]): records that each hold a 'doctor' dict and,
            optionally, a 'practices' list of dicts
    Output - Dataframe (Pandas)

    Returns a pandas dataframe indexed from 1 with one row per record: the
    doctor fields followed by the fields of the record's first practice,
    without the 'lat' and 'lon' columns. A record without any practice
    keeps its row and simply has no practice values.
    """

    doctors = [record['doctor'] for record in data]

    # Only the first practice of each doctor is used. An empty dict stands in
    # when a record has none, so doctor and practice rows stay aligned
    # instead of failing with an IndexError.
    first_practices = []
    for record in data:
        practices = record.get('practices') or []
        first_practices.append(practices[0] if practices else {})

    # Both frames share the same 1-based labels so concat lines them up
    # row by row.
    row_labels = range(1, len(data) + 1)
    doctor_df = pd.DataFrame(doctors, index=row_labels)
    practices_df = pd.DataFrame(first_practices, index=row_labels)

    combined = pd.concat([doctor_df, practices_df], axis=1)
    # errors='ignore': the coordinates may be absent (e.g. no record has a
    # practice); that must not abort the conversion.
    return combined.drop(columns=['lat', 'lon'], errors='ignore')

def capitalize(strs):

    """
    Capitalizes first character of every word

    Arguments:
    Input - strs (str, or a missing/non-string value)
    Output - string (str)

    Returns the title-cased string when given a string; any other value
    (np.nan, float('nan'), None, numbers) is returned unchanged.
    """

    # Test for "is a string" rather than "is not np.nan": the identity test
    # only recognises the np.nan singleton, so None or a NaN produced
    # elsewhere would reach .title() and raise AttributeError.
    if not isinstance(strs, str):
        return strs

    # NOTE: str.title() upper-cases the letter after any non-letter as well
    # (e.g. '1st' -> '1St'); kept as is so matching results do not change.
    return strs.title()


In [7]:
if __name__ == '__main__':

    # Column sets used as join keys for the match counts below
    name_keys = ['first_name', 'last_name']
    address_keys = ['street', 'street_2', 'state', 'city', 'zip']

    # Load and pre-process the .csv match file
    raw_df = process_csv('match_file.csv')

    # Load the json records and flatten them to a frame that has exactly
    # the columns of the .csv frame, in the same order
    data = collect_data('source_data.json')
    json_df = data_df(data)
    json_df = json_df[raw_df.columns.tolist()]

    print('No of documents scanned - 1 (Only one document in shared folder)')

    # Each count is the number of rows produced by an inner join of the
    # two frames on the given key columns
    npi_matches = raw_df.merge(json_df, how='inner', on=['npi'])
    print('No of Doctors matched with NPI - {}'.format(len(npi_matches)))

    doctor_matches = raw_df.merge(
        json_df, how='inner', on=name_keys + address_keys)
    print('No of Doctors matched with names and full address - {}'.format(
        len(doctor_matches)))

    practice_matches = raw_df.merge(json_df, how='inner', on=address_keys)
    print('No of Practice matched with address - {}'.format(
        len(practice_matches)))

    print('No of documents not matched - 0 (Only one document in shared folder)')

No of documents scanned - 1 (Only one document in shared folder)
No of Doctors matched with NPI - 864
No of Doctors matched with names and full address - 550
No of Practice matched with address - 550
No of documents not matched - 0 (Only one document in shared folder)
