## Reading the JSON

source_data.json has clean normalized data used as the source of the matching.

We need to load it and then convert it into a pandas DataFrame, as this will make all later data manipulation much easier. This is a fixed cost that we pay once, before running any of the matching functions.

In [1]:
import json
from pprint import pprint

def extract_source_data(source_file):
    '''Form a list of dictionaries from a file with one JSON document per line.

    Args:
        source_file: Path of a text file holding one JSON document per line.

    Returns:
        A list with one decoded object per non-blank line, in file order.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    '''
    # JSON text is UTF-8 by specification, so do not depend on the platform's
    # default encoding.
    with open(source_file, encoding="utf-8") as f:
        # Blank lines (e.g. an extra newline at the end of the file) carry no
        # document and would make json.loads raise, so they are skipped.
        return [json.loads(line) for line in f if line.strip()]

# Load every source record from the line-delimited JSON file.
source_data = extract_source_data("source_data.json")
# Sanity check: this notebook expects exactly 11231 records in the file.
assert(len(source_data) == 11231)
# Show the first two records to illustrate the doctor / practices structure.
pprint(source_data[:2])

[{'doctor': {'first_name': 'Dean',
             'last_name': 'Israel',
             'npi': '85103080143784778415'},
  'practices': [{'city': 'Port Demetris',
                 'lat': '-79.8757664338564',
                 'lon': '84.31253504872467',
                 'state': 'LA',
                 'street': '271 Annabelle Fort',
                 'street_2': 'Apt. 404',
                 'zip': '53549'}]},
 {'doctor': {'first_name': 'Quinton',
             'last_name': 'Mollie',
             'npi': '36233383542350521233'},
  'practices': [{'city': 'Nealville',
                 'lat': '81.37417480720865',
                 'lon': '-95.33450729432164',
                 'state': 'OR',
                 'street': '8496 Kennedi Inlet',
                 'street_2': 'Suite 815',
                 'zip': '52665-6811'},
                {'city': 'Rashadborough',
                 'lat': '69.84837521604314',
                 'lon': '87.36942972635728',
                 'state': 'UT',
                 'st

### Unwinding the json

We want to unwind the practices in order to:

- Count the doctors from match_file.csv whose first name, last name and full address match the source_data.
- Count the practices from match_file.csv whose full address matches the source_data.

In [2]:
import pandas as pd
from pandas.io.json import json_normalize

def transform_source_data(source_data):
    '''Flatten the source records into a DataFrame with one row per practice.

    Each entry of a record's "practices" list becomes a row; the doctor's
    first_name, last_name and npi are repeated on every row of that doctor.
    The doctor columns are renamed to drop the "doctor." prefix so they
    follow the same naming convention as the csv.
    '''
    doctor_fields = ("first_name", "last_name", "npi")
    # json_normalize names each meta column "<parent>.<child>".
    meta_paths = [["doctor", name] for name in doctor_fields]
    column_names = {"doctor.{}".format(name): name for name in doctor_fields}
    unwound = json_normalize(source_data, ["practices"], meta_paths)
    return unwound.rename(columns=column_names)

# One row per practice, with the doctor's fields repeated on each row.
source_unwinded_df = transform_source_data(source_data)
# Sanity check: 22443 practice rows and 10 columns are expected.
assert(source_unwinded_df.shape == (22443, 10))
source_unwinded_df

Unnamed: 0,city,lat,lon,state,street,street_2,zip,npi,first_name,last_name
0,Port Demetris,-79.8757664338564,84.31253504872467,LA,271 Annabelle Fort,Apt. 404,53549,85103080143784778415,Dean,Israel
1,Nealville,81.37417480720865,-95.33450729432164,OR,8496 Kennedi Inlet,Suite 815,52665-6811,36233383542350521233,Quinton,Mollie
2,Rashadborough,69.84837521604314,87.36942972635728,UT,29483 Nader Wall,Apt. 748,46006-3437,36233383542350521233,Quinton,Mollie
3,South Daronland,84.90377842497296,177.28706015725533,AK,2122 Wintheiser Valleys,Suite 855,99372,36233383542350521233,Quinton,Mollie
4,West Lonnieberg,52.12502086274685,109.12414094328233,GA,210 Walsh Island,Suite 839,59104,68951826121607537145,Vincent,Abbie
5,Port Angieborough,89.41473074638557,-38.22151510102702,KY,460 Ortiz Points,Suite 609,60776-9928,68951826121607537145,Vincent,Abbie
6,Nyasiaburgh,0.7514069044332956,93.56993517086102,NH,13810 Pfannerstill Pike,Apt. 165,71167-1710,68951826121607537145,Vincent,Abbie
7,Grantborough,78.53231427000821,12.229188372184922,MN,1262 O'Keefe Ford,Apt. 790,39283,92442805782715742535,Gerardo,Piper
8,East Ozella,25.541057391873352,-32.342152333557465,PA,591 Gretchen Fields,Apt. 523,15472,92442805782715742535,Gerardo,Piper
9,New Fredy,-4.541598251928605,-41.46795232079714,IL,98764 Mante Trafficway,Suite 356,43570,83029151715578341587,Dean,Francesco


We can also write a slightly more complicated version that is more generic, allowing us to specify several fields to unwind at once.

In [3]:
import pandas as pd
from pandas.io.json import json_normalize

def get_formatted_not_unwinded_fields(source_series, fields_to_unwind):
    '''Form the meta paths and the column renaming map for json_normalize.

    Every top-level field of source_series that is not listed in
    fields_to_unwind is described by:
    - a path (list of keys) that json_normalize accepts as a meta entry, and
    - for nested fields, a mapping from the dotted column name produced by
      json_normalize ("parent.child") to the bare subfield name.

    A field whose value is a dict contributes one path per subfield. A field
    whose value is not a dict contributes the single path [field] and needs
    no renaming.

    Example: {"doctor": {"last_name": "Doe", "first_name": "John"}} will return
    -not_unwinded_fields_path = [["doctor", "last_name"], ["doctor", "first_name"]]
    -not_unwinded_naming_map = {"doctor.last_name": "last_name", "doctor.first_name": "first_name"}

    Args:
        source_series: One source record (a dict) used as the template for
            the field layout.
        fields_to_unwind: Names of the top-level fields that will be unwound
            and therefore must not be treated as meta fields.

    Returns:
        A tuple (not_unwinded_fields_path, not_unwinded_naming_map).
    '''
    not_unwinded_fields_path = []
    not_unwinded_naming_map = {}
    for field, value in source_series.items():
        if field in fields_to_unwind:
            continue
        if isinstance(value, dict):
            for subfield in value:
                not_unwinded_fields_path.append([field, subfield])
                # NOTE: two parents sharing a subfield name would be renamed
                # to the same column; the source data shown here has a single
                # nested parent ("doctor").
                not_unwinded_naming_map["{}.{}".format(field, subfield)] = subfield
        else:
            # Scalar values have no subfields; iterating them would either
            # fail or, for strings, yield one bogus path per character.
            not_unwinded_fields_path.append([field])
    return not_unwinded_fields_path, not_unwinded_naming_map

def transform_source_data(source_data, fields_to_unwind):
    '''Flatten a list of records into a DataFrame, unwinding the given fields.

    The first record is used as the template to discover which fields are
    kept as per-row metadata. Their columns are renamed to the bare subfield
    names so they follow the same naming convention as the csv.

    Returns None when source_data is empty, since there is no record to take
    the field layout from.
    '''
    if len(source_data) == 0:
        return None

    template_record = source_data[0]
    meta_paths, column_names = get_formatted_not_unwinded_fields(
        template_record, fields_to_unwind)
    flattened = json_normalize(source_data, fields_to_unwind, meta_paths)
    return flattened.rename(columns=column_names)

# Same unwinding as before, now naming the field to unwind explicitly.
source_unwinded_df = transform_source_data(source_data, fields_to_unwind=["practices"])
# Sanity check: the generic version must yield the same 22443 x 10 frame.
assert(source_unwinded_df.shape == (22443, 10))
source_unwinded_df

Unnamed: 0,city,lat,lon,state,street,street_2,zip,npi,first_name,last_name
0,Port Demetris,-79.8757664338564,84.31253504872467,LA,271 Annabelle Fort,Apt. 404,53549,85103080143784778415,Dean,Israel
1,Nealville,81.37417480720865,-95.33450729432164,OR,8496 Kennedi Inlet,Suite 815,52665-6811,36233383542350521233,Quinton,Mollie
2,Rashadborough,69.84837521604314,87.36942972635728,UT,29483 Nader Wall,Apt. 748,46006-3437,36233383542350521233,Quinton,Mollie
3,South Daronland,84.90377842497296,177.28706015725533,AK,2122 Wintheiser Valleys,Suite 855,99372,36233383542350521233,Quinton,Mollie
4,West Lonnieberg,52.12502086274685,109.12414094328233,GA,210 Walsh Island,Suite 839,59104,68951826121607537145,Vincent,Abbie
5,Port Angieborough,89.41473074638557,-38.22151510102702,KY,460 Ortiz Points,Suite 609,60776-9928,68951826121607537145,Vincent,Abbie
6,Nyasiaburgh,0.7514069044332956,93.56993517086102,NH,13810 Pfannerstill Pike,Apt. 165,71167-1710,68951826121607537145,Vincent,Abbie
7,Grantborough,78.53231427000821,12.229188372184922,MN,1262 O'Keefe Ford,Apt. 790,39283,92442805782715742535,Gerardo,Piper
8,East Ozella,25.541057391873352,-32.342152333557465,PA,591 Gretchen Fields,Apt. 523,15472,92442805782715742535,Gerardo,Piper
9,New Fredy,-4.541598251928605,-41.46795232079714,IL,98764 Mante Trafficway,Suite 356,43570,83029151715578341587,Dean,Francesco


### Without unwinding

If we want to count the doctors from match_file.csv whose npi matches the source_data, we cannot use the frame in which the practices have been unwound.

If we use the unwinded dataframe and N is the length of the list of practices for a doctor where we match the npi, we would have N matches. 

In other use cases that are not part of the assignment, we have to be careful not to count a doctor more than once when using the unwound version.

In [4]:
# One row per doctor: the nested "doctor" dict is flattened into
# "doctor.<field>" columns while "practices" stays a single column of lists.
# The columns are renamed by name rather than by position, so the labels
# stay correct whatever order json_normalize emits the columns in.
source_not_unwinded_df = json_normalize(source_data).rename(
    columns={'doctor.first_name': 'first_name',
             'doctor.last_name': 'last_name',
             'doctor.npi': 'npi'})

source_not_unwinded_df

Unnamed: 0,first_name,last_name,npi,practices
0,Dean,Israel,85103080143784778415,"[{'city': 'Port Demetris', 'lon': '84.31253504..."
1,Quinton,Mollie,36233383542350521233,"[{'city': 'Nealville', 'lon': '-95.33450729432..."
2,Vincent,Abbie,68951826121607537145,"[{'city': 'West Lonnieberg', 'lon': '109.12414..."
3,Gerardo,Piper,92442805782715742535,"[{'city': 'Grantborough', 'lon': '12.229188372..."
4,Dean,Francesco,83029151715578341587,"[{'city': 'New Fredy', 'lon': '-41.46795232079..."
5,Marshall,Cole,18233577393219566041,"[{'city': 'Lake Sheila', 'lon': '-71.549823565..."
6,Lawson,Lilliana,78792788275411915642,"[{'city': 'North Daija', 'lon': '-78.533601299..."
7,Coty,Brad,50391514247237749255,"[{'city': 'West Calistaside', 'lon': '-90.3310..."
8,Billy,Gennaro,10032670447666263763,"[{'city': 'West Penelope', 'lon': '-142.565539..."
9,Deion,Mae,36556623055822736995,"[{'city': 'Streichchester', 'lon': '-17.361548..."


## Reading the csv

match_file.csv contains raw source data that needs to be parsed and normalized


In [5]:
import pandas as pd

# match_file.csv: Raw source data that needs to be parsed and normalized.
# The join keys are read explicitly as text: source_data.json stores npi and
# zip as strings, and letting pandas infer a numeric dtype for them would
# make the later merges compare mismatched types (and drop leading zeros).
# Missing values are still read as NaN.
raw_data_df = pd.read_csv("match_file.csv", dtype={"npi": str, "zip": str})

raw_data_df

Unnamed: 0,first_name,last_name,npi,street,street_2,city,state,zip
0,Ruthe,Laverne,44843147983186317848,569 glenda islands,suite 163,willport,nj,23453
1,Marshall,Cole,18233577393219566041,59944 adaline harbor,apt. 862,keelingstad,al,94189-5965
2,Lawson,Lilliana,78792788275411915642,36175 amina mount,apt. 256,north daija,de,30997-4476
3,Martine,Kiana,23583155472740817761,188 walsh flat,apt. 891,yasmeenstad,nv,83568
4,Leatha,Freida,,43796 gutmann plains,suite 341,vonmouth,fl,10500
5,Justyn,Abbie,78362387662864903554,,,,,
6,Granville,Benton,17871640342222098849,95496 dare rue,suite 203,octaviastad,il,45294-0751
7,Brenda,Lenna,88137148807320232511,361 justyn meadow,suite 635,steuberhaven,la,71148-1931
8,Juliana,Benedict,,798 katarina street,apt. 817,north florida,ri,10547-0556
9,Marjory,Ulices,13251241236387155567,5356 hane mountains,suite 254,elmiraborough,ny,80179-1235


We can see that there are values that are NaN, so we need to be careful.

In addition, some string fields don't have the appropriate capitalization. Let's fix that. 

In [6]:
def transform_match_file(df, fields_to_title_case, fields_to_upper_case):
    '''Normalize the capitalization of string columns of a DataFrame in place.

    Args:
        df: DataFrame to modify; the listed columns are overwritten.
        fields_to_title_case: Column names whose string values are converted
            with str.title() (e.g. "north daija" -> "North Daija").
        fields_to_upper_case: Column names whose string values are converted
            with str.upper() (e.g. "nj" -> "NJ").

    Returns:
        None. Non-string values (such as NaN for missing data) are left
        untouched.
    '''
    for field in fields_to_title_case:
        df[field] = df[field].apply(lambda x: x.title() if isinstance(x, str) else x)

    for field in fields_to_upper_case:
        df[field] = df[field].apply(lambda x: x.upper() if isinstance(x, str) else x)


# Street and city names are title-cased and state codes are upper-cased so
# that they compare equal to the values stored in source_data.json.
fields_to_title_case = ["street", "street_2", "city"]
fields_to_upper_case = ["state"]

# Normalize raw_data_df in place: after this call street, street_2 and city
# are title-cased and state is upper-cased, as the output below shows.
transform_match_file(raw_data_df, fields_to_title_case, fields_to_upper_case)

raw_data_df

Unnamed: 0,first_name,last_name,npi,street,street_2,city,state,zip
0,Ruthe,Laverne,44843147983186317848,569 Glenda Islands,Suite 163,Willport,NJ,23453
1,Marshall,Cole,18233577393219566041,59944 Adaline Harbor,Apt. 862,Keelingstad,AL,94189-5965
2,Lawson,Lilliana,78792788275411915642,36175 Amina Mount,Apt. 256,North Daija,DE,30997-4476
3,Martine,Kiana,23583155472740817761,188 Walsh Flat,Apt. 891,Yasmeenstad,NV,83568
4,Leatha,Freida,,43796 Gutmann Plains,Suite 341,Vonmouth,FL,10500
5,Justyn,Abbie,78362387662864903554,,,,,
6,Granville,Benton,17871640342222098849,95496 Dare Rue,Suite 203,Octaviastad,IL,45294-0751
7,Brenda,Lenna,88137148807320232511,361 Justyn Meadow,Suite 635,Steuberhaven,LA,71148-1931
8,Juliana,Benedict,,798 Katarina Street,Apt. 817,North Florida,RI,10547-0556
9,Marjory,Ulices,13251241236387155567,5356 Hane Mountains,Suite 254,Elmiraborough,NY,80179-1235


## Doctor Match by NPI

In [7]:
def df_merge_by_npi(left_df, right_df):
    '''Form a df with the rows of both dfs that share the same npi.

    This is an inner join on the "npi" column: only npi values present in
    both frames are kept.
    '''
    return left_df.merge(right_df, on="npi", how="inner")

def number_of_doctor_matches_by_npi(left_df, right_df):
    '''Return the number of rows in the inner npi join of the two dfs.'''
    matches = df_merge_by_npi(left_df, right_df)
    return len(matches)
    
# The doctor-level frame is used here, so each source doctor contributes one row.
print("Number of matches: {}".format(number_of_doctor_matches_by_npi(source_not_unwinded_df, raw_data_df)))

Number of matches: 864


In [8]:
import timeit
# number=10: the value returned is the total time in seconds for 10 calls.
timeit.timeit('number_of_doctor_matches_by_npi(source_not_unwinded_df, raw_data_df)', setup="from __main__ import number_of_doctor_matches_by_npi, source_not_unwinded_df, raw_data_df", number=10)

0.05324934199961717

## Doctor Match by first name, last name and full address

In [9]:
def df_merge_by_name_and_full_address(left_df, right_df):
    '''Form a df by merging two dfs on doctor name and full address.

    This is an inner join: a row is kept only when first name, last name and
    all five address fields are equal in both frames.
    '''
    name_keys = ["first_name", "last_name"]
    address_keys = ["street", "street_2", "city", "state", "zip"]
    return left_df.merge(right_df, on=name_keys + address_keys, how="inner")

def number_of_doctor_matches_by_name_and_full_address(left_df, right_df):
    '''Return the number of rows that match by doctor name and full address.'''
    matches = df_merge_by_name_and_full_address(left_df, right_df)
    return len(matches)


# The practice-level (unwound) frame is needed here because the address is part of the key.
print("Number of matches: {}".format(number_of_doctor_matches_by_name_and_full_address(source_unwinded_df, raw_data_df)))

Number of matches: 912


In [10]:
import timeit
# number=10: the value returned is the total time in seconds for 10 calls.
timeit.timeit('number_of_doctor_matches_by_name_and_full_address(source_unwinded_df, raw_data_df)', setup="from __main__ import number_of_doctor_matches_by_name_and_full_address, source_unwinded_df, raw_data_df", number=10)

0.24497802299993054

## Practice Match by full address

In [11]:
def df_merge_by_full_address(left_df, right_df):
    '''Form a df by merging two dfs on the full address.

    This is an inner join: a row is kept only when street, street_2, city,
    state and zip are all equal in both frames.
    '''
    address_keys = ["street", "street_2", "city", "state", "zip"]
    return left_df.merge(right_df, on=address_keys, how="inner")

def number_of_practices_by_full_address(left_df, right_df):
    '''Return the number of rows that match by full address.'''
    matches = df_merge_by_full_address(left_df, right_df)
    return len(matches)
    
# Practices are compared by address only, so the unwound frame is used.
print("Number of matches: {}".format(number_of_practices_by_full_address(source_unwinded_df, raw_data_df)))

Number of matches: 912


In [12]:
import timeit
# number=10: the value returned is the total time in seconds for 10 calls.
timeit.timeit('number_of_practices_by_full_address(source_unwinded_df, raw_data_df)', setup="from __main__ import number_of_practices_by_full_address, source_unwinded_df, raw_data_df", number=10)

0.23703593099980935

## Number of documents that could not be matched

The assumption is that the problem statement refers to the number of rows from match_file.csv that could not be matched in source_data.json by any of the criteria above.

In [16]:
source_unwinded_df

Unnamed: 0,city,lat,lon,state,street,street_2,zip,npi,first_name,last_name
0,Port Demetris,-79.8757664338564,84.31253504872467,LA,271 Annabelle Fort,Apt. 404,53549,85103080143784778415,Dean,Israel
1,Nealville,81.37417480720865,-95.33450729432164,OR,8496 Kennedi Inlet,Suite 815,52665-6811,36233383542350521233,Quinton,Mollie
2,Rashadborough,69.84837521604314,87.36942972635728,UT,29483 Nader Wall,Apt. 748,46006-3437,36233383542350521233,Quinton,Mollie
3,South Daronland,84.90377842497296,177.28706015725533,AK,2122 Wintheiser Valleys,Suite 855,99372,36233383542350521233,Quinton,Mollie
4,West Lonnieberg,52.12502086274685,109.12414094328233,GA,210 Walsh Island,Suite 839,59104,68951826121607537145,Vincent,Abbie
5,Port Angieborough,89.41473074638557,-38.22151510102702,KY,460 Ortiz Points,Suite 609,60776-9928,68951826121607537145,Vincent,Abbie
6,Nyasiaburgh,0.7514069044332956,93.56993517086102,NH,13810 Pfannerstill Pike,Apt. 165,71167-1710,68951826121607537145,Vincent,Abbie
7,Grantborough,78.53231427000821,12.229188372184922,MN,1262 O'Keefe Ford,Apt. 790,39283,92442805782715742535,Gerardo,Piper
8,East Ozella,25.541057391873352,-32.342152333557465,PA,591 Gretchen Fields,Apt. 523,15472,92442805782715742535,Gerardo,Piper
9,New Fredy,-4.541598251928605,-41.46795232079714,IL,98764 Mante Trafficway,Suite 356,43570,83029151715578341587,Dean,Francesco


In [17]:
raw_data_df

Unnamed: 0,first_name,last_name,npi,street,street_2,city,state,zip
0,Ruthe,Laverne,44843147983186317848,569 Glenda Islands,Suite 163,Willport,NJ,23453
1,Marshall,Cole,18233577393219566041,59944 Adaline Harbor,Apt. 862,Keelingstad,AL,94189-5965
2,Lawson,Lilliana,78792788275411915642,36175 Amina Mount,Apt. 256,North Daija,DE,30997-4476
3,Martine,Kiana,23583155472740817761,188 Walsh Flat,Apt. 891,Yasmeenstad,NV,83568
4,Leatha,Freida,,43796 Gutmann Plains,Suite 341,Vonmouth,FL,10500
5,Justyn,Abbie,78362387662864903554,,,,,
6,Granville,Benton,17871640342222098849,95496 Dare Rue,Suite 203,Octaviastad,IL,45294-0751
7,Brenda,Lenna,88137148807320232511,361 Justyn Meadow,Suite 635,Steuberhaven,LA,71148-1931
8,Juliana,Benedict,,798 Katarina Street,Apt. 817,North Florida,RI,10547-0556
9,Marjory,Ulices,13251241236387155567,5356 Hane Mountains,Suite 254,Elmiraborough,NY,80179-1235


In [19]:
# Exploratory run of the three joins. The results are not assigned to
# anything; the output shown below is the npi join (the last expression).
df_merge_by_full_address(source_unwinded_df, raw_data_df)
df_merge_by_name_and_full_address(source_unwinded_df, raw_data_df)
df_merge_by_npi(source_not_unwinded_df, raw_data_df)



Unnamed: 0,first_name_x,last_name_x,npi,practices,first_name_y,last_name_y,street,street_2,city,state,zip
0,Marshall,Cole,18233577393219566041,"[{'city': 'Lake Sheila', 'lon': '-71.549823565...",Marshall,Cole,59944 Adaline Harbor,Apt. 862,Keelingstad,AL,94189-5965
1,Lawson,Lilliana,78792788275411915642,"[{'city': 'North Daija', 'lon': '-78.533601299...",Lawson,Lilliana,36175 Amina Mount,Apt. 256,North Daija,DE,30997-4476
2,Martine,Kiana,23583155472740817761,"[{'city': 'Jacquesville', 'lon': '105.33018901...",Martine,Kiana,188 Walsh Flat,Apt. 891,Yasmeenstad,NV,83568
3,Justyn,Abbie,78362387662864903554,"[{'city': 'Hortensetown', 'lon': '27.518097739...",Justyn,Abbie,,,,,
4,Granville,Benton,17871640342222098849,"[{'city': 'Octaviastad', 'lon': '86.2622697806...",Granville,Benton,95496 Dare Rue,Suite 203,Octaviastad,IL,45294-0751
5,Brenda,Lenna,88137148807320232511,"[{'city': 'Lisandrofort', 'lon': '-17.81937107...",Brenda,Lenna,361 Justyn Meadow,Suite 635,Steuberhaven,LA,71148-1931
6,Marjory,Ulices,13251241236387155567,"[{'city': 'Elmiraborough', 'lon': '148.3577743...",Marjory,Ulices,5356 Hane Mountains,Suite 254,Elmiraborough,NY,80179-1235
7,Celia,Joany,53517451823105334497,"[{'city': 'West Rick', 'lon': '68.704357880417...",Celia,Joany,,,,,
8,Isabelle,Nils,41196810585374325420,"[{'city': 'Daphneeburgh', 'lon': '74.280696101...",Isabelle,Nils,590 Cummings Union,Apt. 390,Daphneeburgh,IN,56970
9,Madaline,Shirley,34629375530320352466,"[{'city': 'Kundechester', 'lon': '-0.459013440...",Madaline,Shirley,62088 Krajcik Summit,Apt. 534,Kundechester,WA,30826-4983


In [24]:
# Keep the result of each matching criterion for inspection.
df_1 = df_merge_by_full_address(source_unwinded_df, raw_data_df)
df_2 = df_merge_by_name_and_full_address(source_unwinded_df, raw_data_df)
df_3 = df_merge_by_npi(source_not_unwinded_df, raw_data_df)

# Union (outer join on the address) of the address-only matches and the
# name-and-address matches.
# NOTE(review): df_12 is not used again in the cells shown here.
df_12 = pd.merge(df_1, df_2, how="outer",
         on=["street", "street_2", "city", "state", "zip"])

df_3


Unnamed: 0,first_name_x,last_name_x,npi,practices,first_name_y,last_name_y,street,street_2,city,state,zip
0,Marshall,Cole,18233577393219566041,"[{'city': 'Lake Sheila', 'lon': '-71.549823565...",Marshall,Cole,59944 Adaline Harbor,Apt. 862,Keelingstad,AL,94189-5965
1,Lawson,Lilliana,78792788275411915642,"[{'city': 'North Daija', 'lon': '-78.533601299...",Lawson,Lilliana,36175 Amina Mount,Apt. 256,North Daija,DE,30997-4476
2,Martine,Kiana,23583155472740817761,"[{'city': 'Jacquesville', 'lon': '105.33018901...",Martine,Kiana,188 Walsh Flat,Apt. 891,Yasmeenstad,NV,83568
3,Justyn,Abbie,78362387662864903554,"[{'city': 'Hortensetown', 'lon': '27.518097739...",Justyn,Abbie,,,,,
4,Granville,Benton,17871640342222098849,"[{'city': 'Octaviastad', 'lon': '86.2622697806...",Granville,Benton,95496 Dare Rue,Suite 203,Octaviastad,IL,45294-0751
5,Brenda,Lenna,88137148807320232511,"[{'city': 'Lisandrofort', 'lon': '-17.81937107...",Brenda,Lenna,361 Justyn Meadow,Suite 635,Steuberhaven,LA,71148-1931
6,Marjory,Ulices,13251241236387155567,"[{'city': 'Elmiraborough', 'lon': '148.3577743...",Marjory,Ulices,5356 Hane Mountains,Suite 254,Elmiraborough,NY,80179-1235
7,Celia,Joany,53517451823105334497,"[{'city': 'West Rick', 'lon': '68.704357880417...",Celia,Joany,,,,,
8,Isabelle,Nils,41196810585374325420,"[{'city': 'Daphneeburgh', 'lon': '74.280696101...",Isabelle,Nils,590 Cummings Union,Apt. 390,Daphneeburgh,IN,56970
9,Madaline,Shirley,34629375530320352466,"[{'city': 'Kundechester', 'lon': '-0.459013440...",Madaline,Shirley,62088 Krajcik Summit,Apt. 534,Kundechester,WA,30826-4983


In [66]:
# Row-level counters updated by check():
#   c1 - rows whose npi equals the npi of some source doctor
#   c2 - rows whose full address equals some source practice
#   c3 - rows matched by at least one of the two criteria
c1 = 0
c2 = 0
c3 = 0
def check(row):
    '''Compare one match-file row against source_data and update the counters.

    Every counter is incremented at most once per row. All source records are
    scanned until both criteria have been satisfied, so a row whose address
    matches one record and whose npi matches a different record is counted
    in both c1 and c2, whatever the order of the records.

    Args:
        row: Mapping-like row (e.g. a row of raw_data_df) providing "npi",
            "street", "street_2", "city", "state" and "zip".
    '''
    global c1, c2, c3

    full_address = ("street", "street_2", "city", "state", "zip")
    npi_match = False
    practice_match = False

    for record in source_data:
        if not npi_match and record["doctor"]["npi"] == row["npi"]:
            npi_match = True
        if not practice_match:
            practice_match = any(
                all(practice[x] == row[x] for x in full_address)
                for practice in record["practices"])
        if npi_match and practice_match:
            # Nothing left to discover for this row.
            break

    if npi_match:
        c1 += 1
    if practice_match:
        c2 += 1
    if npi_match or practice_match:
        c3 += 1

# Call check() exactly once per CSV row with an explicit loop. check() works
# only through side effects (the global counters), and older pandas releases
# document that DataFrame.apply may call the function twice on the first
# row, which would inflate the counters.
for _, row in raw_data_df.iterrows():
    check(row)
print(c1, c2, c3)

864 912 1091


If the doctor is matched by full name and address, the practice will match by the address (since it is a subset of the conditions).

Therefore, we only need the intersection of the set of documents that don't match by npi and the set of documents that don't match by full address; the size of that intersection is the number of documents that could not be matched by any criterion.



In [86]:
# indicator=True adds a "_merge" column telling which frame each row came from.
merged_by_npi_df = pd.merge(raw_data_df, source_not_unwinded_df, indicator=True, how="outer", on="npi")
# raw_data_df is the left frame, so "left_only" marks the CSV rows whose npi
# has no counterpart in the source data.
not_matched_by_npi_df = merged_by_npi_df[merged_by_npi_df['_merge'] == 'left_only']
print("Number of documents not matched by npi {}".format(not_matched_by_npi_df.shape[0]))

Number of documents not matched by npi 401


In [87]:
# Same anti-join as for npi, but keyed on the five address fields and using
# the practice-level (unwound) source frame.
merged_by_address_df = pd.merge(raw_data_df, source_unwinded_df, indicator=True, how="outer", on=["street", "street_2", "city", "state", "zip"])
# "left_only" marks the CSV rows whose address has no counterpart in the source data.
not_matched_by_address_df = merged_by_address_df[merged_by_address_df['_merge'] == 'left_only']
print("Number of documents not matched by address {}".format(not_matched_by_address_df.shape[0]))

Number of documents not matched by address 353


In [97]:
# Keep only the columns that came from the CSV; the "_x" suffix marks the
# left (CSV) side of the merge and is dropped to restore the original names.
not_matched_by_npi_df[["first_name_x", "last_name_x", "npi", "street", "street_2", "city", "state", "zip"]].rename(columns = {"first_name_x": "first_name", "last_name_x": "last_name"})

Unnamed: 0,first_name,last_name,npi,street,street_2,city,state,zip
0,Ruthe,Laverne,44843147983186317848,569 Glenda Islands,Suite 163,Willport,NJ,23453
4,Leatha,Freida,,43796 Gutmann Plains,Suite 341,Vonmouth,FL,10500
5,Juliana,Benedict,,798 Katarina Street,Apt. 817,North Florida,RI,10547-0556
6,Johnnie,Johnathon,,541 Nora Hill,Apt. 833,South Erwinborough,UT,24212
7,Audra,Imogene,,9228 Rodriguez Knolls,Apt. 544,Jonesside,IN,20864
8,Sylvia,Obie,,,,,,
9,Victoria,Kaleb,,79341 Destin Springs,Apt. 561,Port Norbertohaven,LA,74019
10,Erling,Ellsworth,,72949 Wyman Valley,Suite 721,Shanahanton,MN,41462-4632
11,Floyd,Toney,,133 Kris Corners,Apt. 662,Lake Deon,UT,53630
12,Destiney,Wilson,,191 Jacobs Mill,Apt. 525,Port Sallie,OH,23631


In [98]:
not_matched_by_address_df

Unnamed: 0,first_name_x,last_name_x,npi_x,street,street_2,city,state,zip,lat,lon,npi_y,first_name_y,last_name_y,_merge
0,Ruthe,Laverne,44843147983186317848,569 Glenda Islands,Suite 163,Willport,NJ,23453,,,,,,left_only
5,Justyn,Abbie,78362387662864903554,,,,,,,,,,,left_only
6,Celia,Joany,53517451823105334497,,,,,,,,,,,left_only
7,Heather,Tracy,75216887016624818206,,,,,,,,,,,left_only
8,Sylvia,Obie,,,,,,,,,,,,left_only
9,Pascale,Ryder,75137145868784228122,,,,,,,,,,,left_only
10,Millie,Amani,53407188811357432743,,,,,,,,,,,left_only
11,Elvis,Lenna,14455777372842761255,,,,,,,,,,,left_only
12,Mario,Richard,,,,,,,,,,,,left_only
13,Estell,Harvey,34268632282866767982,,,,,,,,,,,left_only


In [84]:
not_matched_by_address_df

Unnamed: 0,first_name_x,last_name_x,npi_x,street,street_2,city,state,zip,lat,lon,npi_y,first_name_y,last_name_y,_merge
0,Ruthe,Laverne,44843147983186317848,569 Glenda Islands,Suite 163,Willport,NJ,23453,,,,,,left_only
5,Justyn,Abbie,78362387662864903554,,,,,,,,,,,left_only
6,Celia,Joany,53517451823105334497,,,,,,,,,,,left_only
7,Heather,Tracy,75216887016624818206,,,,,,,,,,,left_only
8,Sylvia,Obie,,,,,,,,,,,,left_only
9,Pascale,Ryder,75137145868784228122,,,,,,,,,,,left_only
10,Millie,Amani,53407188811357432743,,,,,,,,,,,left_only
11,Elvis,Lenna,14455777372842761255,,,,,,,,,,,left_only
12,Mario,Richard,,,,,,,,,,,,left_only
13,Estell,Harvey,34268632282866767982,,,,,,,,,,,left_only
