## JSON file

source_data.json has clean normalized data used as the source of the matching.

In [3]:
import json
from pprint import pprint

source_data = []
with open("source_data.json") as f:
    for line in f:
        source_data.append(json.loads(line))
    
print(len(source_data))
pprint(source_data[:2])


11231
[{'doctor': {'first_name': 'Dean',
             'last_name': 'Israel',
             'npi': '85103080143784778415'},
  'practices': [{'city': 'Port Demetris',
                 'lat': '-79.8757664338564',
                 'lon': '84.31253504872467',
                 'state': 'LA',
                 'street': '271 Annabelle Fort',
                 'street_2': 'Apt. 404',
                 'zip': '53549'}]},
 {'doctor': {'first_name': 'Quinton',
             'last_name': 'Mollie',
             'npi': '36233383542350521233'},
  'practices': [{'city': 'Nealville',
                 'lat': '81.37417480720865',
                 'lon': '-95.33450729432164',
                 'state': 'OR',
                 'street': '8496 Kennedi Inlet',
                 'street_2': 'Suite 815',
                 'zip': '52665-6811'},
                {'city': 'Rashadborough',
                 'lat': '69.84837521604314',
                 'lon': '87.36942972635728',
                 'state': 'UT',
              

Each element of this list with 11231 record has a doctor and one or more of its practices. 

- doctor: contains first and last name, and an identifier called npi
- practices: list of the practices associated to the doctor. Each contains location data (address and coordinates).

## Raw source data

match_file.csv contains raw source data that needs to be parsed and normalized

In [4]:
import pandas as pd

# match_file.csv: Raw source data that needs to be parsed and normalized.
raw_data_df = pd.read_csv("match_file.csv")

raw_data_df

Unnamed: 0,first_name,last_name,npi,street,street_2,city,state,zip
0,Ruthe,Laverne,44843147983186317848,569 glenda islands,suite 163,willport,nj,23453
1,Marshall,Cole,18233577393219566041,59944 adaline harbor,apt. 862,keelingstad,al,94189-5965
2,Lawson,Lilliana,78792788275411915642,36175 amina mount,apt. 256,north daija,de,30997-4476
3,Martine,Kiana,23583155472740817761,188 walsh flat,apt. 891,yasmeenstad,nv,83568
4,Leatha,Freida,,43796 gutmann plains,suite 341,vonmouth,fl,10500
5,Justyn,Abbie,78362387662864903554,,,,,
6,Granville,Benton,17871640342222098849,95496 dare rue,suite 203,octaviastad,il,45294-0751
7,Brenda,Lenna,88137148807320232511,361 justyn meadow,suite 635,steuberhaven,la,71148-1931
8,Juliana,Benedict,,798 katarina street,apt. 817,north florida,ri,10547-0556
9,Marjory,Ulices,13251241236387155567,5356 hane mountains,suite 254,elmiraborough,ny,80179-1235


## Match the data based on the following criteria:

### Doctor Match

#### NPI


In [98]:
def match_doctor_by_npi(raw_data_df, npi):
    return raw_data_df["npi"].isin([str(npi)]).any()

def number_of_matches_by_npi(source_data, raw_data_df):
    match_counter = 0
    for doctor_record in source_data:
        match_counter += match_doctor_by_npi(raw_data_df, doctor_record["doctor"]["npi"])
    return match_counter

print("Number of doctors matched with NPI {}".format(number_of_matches_by_npi(source_data, raw_data_df)))
print("Total number of doctors in source_data {}".format(len(source_data)))
print("Total number of doctors in match_df {}".format(len(raw_data_df)))

Number of doctors matched with NPI 864
Total number of doctors in source_data 11231
Total number of doctors in match_df 1265


In [99]:
import timeit
timeit.timeit('number_of_matches_by_npi(source_data, raw_data_df)', setup="from __main__ import number_of_matches_by_npi, source_data, raw_data_df", number=10)

20.970521792005457

This seems a little bit slow due to the use of the for. Let's try with apply.

In [100]:
def number_of_matches_by_npi(source_data, raw_data_df):
    source_npis = [x["doctor"]["npi"] for x in source_data]
    return raw_data_df.apply(lambda x: x["npi"] in source_npis, axis=1).sum()

number_of_matches_by_npi(source_data, raw_data_df)

864

In [101]:
import timeit
timeit.timeit('number_of_matches_by_npi(source_data, raw_data_df)', setup="from __main__ import number_of_matches_by_npi, source_data, raw_data_df", number=10)

2.5114713110015146

Almost 7 times better! And if we choose to do the apply on a pandas series instead of in the whole dataframe, it is even faster.

In [102]:
def number_of_matches_by_npi(source_data, raw_data_df):
    source_npis = [x["doctor"]["npi"] for x in source_data]
    return raw_data_df["npi"].apply(lambda x: x in source_npis).sum()

number_of_matches_by_npi(source_data, raw_data_df)

864

In [103]:
import timeit
timeit.timeit('number_of_matches_by_npi(source_data, raw_data_df)', setup="from __main__ import number_of_matches_by_npi, source_data, raw_data_df", number=10)

2.29575916199974

If instead of a list (with O(N) lookups) we use a set (O(1) lookups), the final function is O(N) instead of O(N^2), and much much faster.

In [106]:
def number_of_matches_by_npi(source_data, raw_data_df):
    source_npis = {x["doctor"]["npi"] for x in source_data}
    return raw_data_df["npi"].apply(lambda x: x in source_npis).sum()

number_of_matches_by_npi(source_data, raw_data_df)

864

In [107]:
import timeit
timeit.timeit('number_of_matches_by_npi(source_data, raw_data_df)', setup="from __main__ import number_of_matches_by_npi, source_data, raw_data_df", number=10)

0.08321979799802648

### first name + last name + full address


In [77]:
source_data

[{'doctor': {'first_name': 'Dean',
   'last_name': 'Israel',
   'npi': '85103080143784778415'},
  'practices': [{'city': 'Port Demetris',
    'lat': '-79.8757664338564',
    'lon': '84.31253504872467',
    'state': 'LA',
    'street': '271 Annabelle Fort',
    'street_2': 'Apt. 404',
    'zip': '53549'}]},
 {'doctor': {'first_name': 'Quinton',
   'last_name': 'Mollie',
   'npi': '36233383542350521233'},
  'practices': [{'city': 'Nealville',
    'lat': '81.37417480720865',
    'lon': '-95.33450729432164',
    'state': 'OR',
    'street': '8496 Kennedi Inlet',
    'street_2': 'Suite 815',
    'zip': '52665-6811'},
   {'city': 'Rashadborough',
    'lat': '69.84837521604314',
    'lon': '87.36942972635728',
    'state': 'UT',
    'street': '29483 Nader Wall',
    'street_2': 'Apt. 748',
    'zip': '46006-3437'},
   {'city': 'South Daronland',
    'lat': '84.90377842497296',
    'lon': '177.28706015725533',
    'state': 'AK',
    'street': '2122 Wintheiser Valleys',
    'street_2': 'Suite 8

In [86]:
def get_formatted_address(practices):
    return [{"street": x["street"]} for x in practices]

def number_of_matches_by_full_name_and_address(source_data, raw_data_df):
    source_cleaned = [{"first_name": x["doctor"]["first_name"], \
                    "last_name": x["doctor"]["last_name"], \
                    "address": get_formatted_address(x["practices"])} for x in source_data]
#     raw_data_df.apply(lambda x: print(x["first_name"]), axis=1)
    return raw_data_df.apply(lambda x: x["first_name"] in source_cleaned, axis=1).sum()


number_of_matches_by_full_name_and_address(source_data, raw_data_df)

# source_data

0

In [97]:
def number_of_matches_by_full_name_and_address(source_data, raw_data_df):
    source_first_name = {x["doctor"]["first_name"] for x in source_data}
    source_last_name = {x["doctor"]["first_name"] for x in source_data}
    source_street = {x["street"] for y in source_data for x in y["practices"]}
#     [x for b in a for x in b]
    for idx, elem in enumerate(source_street):
        print(elem)
        if idx == 5:
            break
        
    return raw_data_df.apply(lambda x: x["first_name"] in source_first_name\
                             and x["last_name"] in source_last_name, axis=1\
                            and x["street"]).sum()


number_of_matches_by_full_name_and_address(source_data, raw_data_df)


4729 Hans Meadow
9083 Borer Mountain
8166 Blair Landing
8634 Windler Court
893 Fritz Island
560 Miller Parkway


1233

In [None]:
def number_of_matches_by_full_name_and_address(source_data, raw_data_df):
    if source_data
number_of_matches_by_full_name_and_address(source_data, raw_data_df)


In [47]:
def get_name_and_full_address_from_source_data_record(doctor_record)
    name_and_full_address = {
        "first_name": doctor_record["doctor"]["first_name"],
        "last_name": doctor_record["doctor"]["last_name"]
        ""
    }
    pd.Series
    for practices in doctor_record["practices"]
    return 

SyntaxError: invalid syntax (<ipython-input-47-a60d5ae0ebc0>, line 1)

In [17]:
counter = 0
for doctor_record in source_data:
    print(doctor_record["doctor"])
    for practice in doctor_record["practices"]: 
        print(practice["street"])
        print(practice["street_2"])
        
    counter += 1
    if counter == 4:
        break

{'npi': '85103080143784778415', 'last_name': 'Israel', 'first_name': 'Dean'}
271 Annabelle Fort
Apt. 404
{'npi': '36233383542350521233', 'last_name': 'Mollie', 'first_name': 'Quinton'}
8496 Kennedi Inlet
Suite 815
29483 Nader Wall
Apt. 748
2122 Wintheiser Valleys
Suite 855
{'npi': '68951826121607537145', 'last_name': 'Abbie', 'first_name': 'Vincent'}
210 Walsh Island
Suite 839
460 Ortiz Points
Suite 609
13810 Pfannerstill Pike
Apt. 165
{'npi': '92442805782715742535', 'last_name': 'Piper', 'first_name': 'Gerardo'}
1262 O'Keefe Ford
Apt. 790
591 Gretchen Fields
Apt. 523


In [18]:
for index, row in raw_data_df.iterrows():
    print(row["first_name"])
    print(row["last_name"])
    print(row["street"])
    print(row["street_2"])
    print(row["city"])
    print(row["state"])
    print(row["zip"])

Ruthe
Laverne
569 glenda islands
suite 163
willport
nj
23453
Marshall
Cole
59944 adaline harbor
apt. 862
keelingstad
al
94189-5965
Lawson
Lilliana
36175 amina mount
apt. 256
north daija
de
30997-4476
Martine
Kiana
188 walsh flat
apt. 891
yasmeenstad
nv
83568
Leatha
Freida
43796 gutmann plains
suite 341
vonmouth
fl
10500
Justyn
Abbie
nan
nan
nan
nan
nan
Granville
Benton
95496 dare rue
suite 203
octaviastad
il
45294-0751
Brenda
Lenna
361 justyn meadow
suite 635
steuberhaven
la
71148-1931
Juliana
Benedict
798 katarina street
apt. 817
north florida
ri
10547-0556
Marjory
Ulices
5356 hane mountains
suite 254
elmiraborough
ny
80179-1235
Celia
Joany
nan
nan
nan
nan
nan
Kathryn
Shany
817 hammes harbor
apt. 701
dickinsonview
ca
97730
Isabelle
Nils
590 cummings union
apt. 390
daphneeburgh
in
56970
Johnnie
Johnathon
541 nora hill
apt. 833
south erwinborough
ut
24212
Madaline
Shirley
62088 krajcik summit
apt. 534
kundechester
wa
30826-4983
Khalid
Crystel
89133 schoen ferry
apt. 830
richieland
sc
73