# Filter, Clean, and Join the NPPES Data with DocGraph Data

### Imports

In [None]:
import graphlab as gl
from pyspark import SparkConf,SparkContext
import os

### Initialize Spark Instance

In [1]:
# Spark settings: executor memory, core cap and application name.
# SparkConf setters return the conf object itself, so the calls are chained.
conf = (
    SparkConf()
    .set("spark.executor.memory", "10g")
    .set("spark.cores.max", "1")
    .setAppName("My App")
)

# Start the SparkContext against the 'local[*]' master with the settings above.
sc = SparkContext(master='local[*]', conf=conf)

A newer version of GraphLab Create (v1.4.0) is available! Your current version is v1.3.0.
New features in 1.4:
- Nearest neighbor classifier
- Label propagation toolkit
- Model visualization in GraphLab Canvas
- New model parameter search API
- New feature engineering transformers (TF-IDF and Numeric Imputation)
- Java and JavaScript client libraries for Predictive Service
- Support deploying scikit-learn models to Predictive Service

Significant speedup and improvements in 1.4:
- Faster SFrame execution with query optimization 
- Pagerank and ConnectedComponents
- K-Means with large K 
- GraphLab Canvas histograms and heat map
- Better cache control for Predictive Service

For detailed release notes please visit:
https://dato.com/products/create/release-notes.htmlYou can use pip to upgrade the graphlab-create package. For more information see https://dato.com/products/create/upgrade.[INFO] Start server at: ipc:///tmp/graphlab_server-87433 - Server binary: /Users/astuckey002/anaconda/l

In [None]:
# Convert the raw comma-separated NPPES export into a tab-delimited file.
# The input is streamed line by line instead of being loaded with readlines(),
# so the whole export never has to be held in memory at once.
# NOTE(review): the later sc.textFile(...) cell reads nppes_fixed.csv from a
# different directory than the one written here -- confirm which is intended.
with open('../../../../data/npidata_20050523-20150208.csv', 'r') as raw_file, \
        open('../../../../data/nppes_fixed.csv', 'w') as fixed_file:
    for raw_line in raw_file:
        # Turn every '","' field separator into a tab, then drop the remaining quotes.
        fixed_file.write(raw_line.replace('","', '\t').replace('"', ''))

In [32]:
# Load the tab-delimited NPPES file as an RDD of text lines, then split each
# line into its list of fields.
nppes_lines = sc.textFile('../../../../Analytics/DocGraph/Data/NPPES_Data_Dissemination_February_2015/nppes_fixed.csv')
raw_data = nppes_lines.map(lambda row: row.split('\t'))

In [33]:
# Take the first record of the RDD; later cells use it as the list of column
# names (presumably the file's header row -- confirm). The record is not
# removed from raw_data here.
header = raw_data.first()

In [35]:
def safe_cast(val, to_type, default=0):
    """Convert ``val`` with ``to_type``, returning ``default`` when it cannot be converted.

    Parameters:
        val: the value to convert.
        to_type: a callable (e.g. ``int``) applied to ``val``.
        default: value returned when the conversion fails (0 if omitted).

    Returns:
        ``to_type(val)`` on success, otherwise ``default``.
    """
    try:
        return to_type(val)
    except (ValueError, TypeError):
        # ValueError covers unparseable text such as '' or 'abc'; TypeError
        # covers inputs of the wrong kind, e.g. None for a missing cell.
        return default

In [36]:
def clean(line):
    """Return a new list holding each item of ``line`` with every double quote removed."""
    return [field.replace('"', '') for field in line]

In [37]:
def filter_npis(line):
    """Tell whether a split NPPES row refers to one of the NPIs in ``npi_dict``.

    The fields at positions 0 and 2 of the row are checked (the NPI and the
    replacement NPI, going by the original variable names).

    Parameters:
        line: list of string fields for one row.

    Returns:
        True when either checked field parses to an integer that is a key of
        ``npi_dict``; False otherwise, including for rows too short to hold
        the field or whose field is not a valid integer.
    """
    # Strip quotes once and reuse the result for both lookups.
    fields = clean(line)
    for position in (0, 2):
        if position >= len(fields):
            # Short or blank rows have nothing to match at this position.
            continue
        # Use None rather than 0 as the failure sentinel so an empty or
        # malformed field can never be mistaken for a real key of npi_dict.
        candidate = safe_cast(fields[position], int, default=None)
        if candidate is not None and candidate in npi_dict:
            return True
    return False

In [17]:
# Read the NPIs of interest into an SFrame; a later cell reads its 'npi' column.
npi_list = gl.SFrame.read_csv('../../../../data/unique_npis.csv', verbose = False)

------------------------------------------------------
Inferred types from first line of file as 
column_type_hints=[str]
If parsing fails due to incorrect types, you can correct
the inferred type list above and pass it to read_csv in
the column_type_hints argument
------------------------------------------------------
PROGRESS: Finished parsing file /Users/astuckey002/Documents/data/unique_npis.csv
PROGRESS: Parsing completed. Parsed 25921 lines in 0.018879 secs.


In [25]:
# Build the lookup of NPIs to keep. Only the keys matter: filter_npis tests
# membership with `in`, and every value is a placeholder 0.
npi_dict = {}
for raw_npi in npi_list['npi']:
    npi = safe_cast(raw_npi, int, default=None)
    if npi is None:
        # Skip entries that are not valid integers. Storing them under the
        # old fallback key 0 would make every NPPES row with an empty or
        # malformed NPI field pass the filter.
        continue
    npi_dict[npi] = 0

In [38]:
# Keep only the rows accepted by filter_npis.
filtered_data = raw_data.filter(filter_npis)

In [39]:
# Convert the filtered Spark RDD into a GraphLab SFrame.
nppes_sf = gl.SFrame.from_rdd(filtered_data)

In [41]:
# Unpack the 'X1' column so each field of a row becomes its own column.
# NOTE(review): the renaming cell below reads the field index from character 3
# onward of each new column name (presumably names like 'X1.<n>') -- confirm.
nppes_sf_u = nppes_sf.unpack('X1')

In [45]:
# Map each zero-based column position to the corresponding entry of `header`.
header_dict = dict(enumerate(header))

In [55]:
# Give every unpacked column the name stored for its position in header_dict.
for unpacked_name in nppes_sf_u.column_names():
    # The text after the first three characters of the name is the position.
    position = int(unpacked_name[3:])
    nppes_sf_u.rename({unpacked_name: header_dict[position]})

In [60]:
for col in nppes_sf_u.column_names():
    counter = 0
    temp_data = nppes_sf_u[col]
    for cell in temp_data:
        if cell != "":
            counter += 1
        if counter > 0:
            break
    
    if counter == 0:
        nppes_sf_u.remove_column(col)
        print "Removing column:", col
    continue

Removing column: Replacement NPI
Removing column: NPI Deactivation Reason Code
Removing column: Other Provider Identifier Issuer_20
Removing column: Other Provider Identifier Issuer_25
Removing column: Other Provider Identifier Issuer_26
Removing column: Other Provider Identifier Issuer_27
Removing column: Other Provider Identifier Issuer_29
Removing column: Other Provider Identifier Issuer_30
Removing column: Other Provider Identifier Issuer_31
Removing column: Other Provider Identifier Issuer_32
Removing column: Other Provider Identifier Issuer_33
Removing column: Other Provider Identifier Issuer_34
Removing column: Other Provider Identifier_35
Removing column: Other Provider Identifier Type Code_35
Removing column: Other Provider Identifier State_35
Removing column: Other Provider Identifier Issuer_35
Removing column: Other Provider Identifier_36
Removing column: Other Provider Identifier Type Code_36
Removing column: Other Provider Identifier State_36
Removing column: Other Provide

In [63]:
# Rename 'NPI' to lower-case 'npi', the column name the join cell below uses as its key.
nppes_sf_u.rename({'NPI':'npi'})

npi,Entity Type Code,Employer Identification Number (EIN) ...,Provider Organization Name (Legal Business ...,Provider Last Name (Legal Name) ...
1871596817,1,,,ABIAD
1679576862,2,<UNAVAIL>,ALGONQUIN FIRE PROTECTION DISTRICT ...,
1790788792,1,,,BUCHAR
1316940430,1,,,FERSTL
1619970613,1,,,WLEKLINSKI
1508869744,1,,,KAHN
1457354656,1,,,BEHM
1306849567,2,<UNAVAIL>,VILLAGE OF ARLINGTON HEIGHTS A ...,
1487657524,1,,,VANROEKEL
1124021266,2,<UNAVAIL>,ALVERNO LAKESIDE CORPORATION ...,

Provider First Name,Provider Middle Name,Provider Name Prefix Text,Provider Name Suffix Text,Provider Credential Text
HOMER,,,,M.D.
,,,,
WILLIAM,L.,DR.,III,D.C.
JOSEPH,F,DR.,,DC DABCO
LEON,ROBERT,DR.,,D.C.
STASIA,E,,,M.D.
FREDERICK,G.,DR.,,MD
,,,,
VIVIAN,S,,,M.D.
,,,,

Provider Other Organization Name ...,Provider Other Organization Name Type ...,Provider Other Last Name,Provider Other First Name,Provider Other Middle Name ...
,,,,
ALGONQUIN LAKE IN THE HILLS FIRE PROTECTION ...,3.0,,,
,,,,
,,,,
,,,,
,,,,
,,,,
,,,,
,,,,
ALVERNO ADVANCED LIFE SUPPORT SERVICE ...,3.0,,,

Provider Other Name Prefix Text ...,Provider Other Name Suffix Text ...,Provider Other Credential Text ...,Provider Other Last Name Type Code ...
,,,
,,,
,,,
,,,
,,,
,,,
,,,
,,,
,,,
,,,

Provider First Line Business Mailing Address ...,Provider Second Line Business Mailing Address ...,Provider Business Mailing Address City Name ...,Provider Business Mailing Address State Name ...
15900 CICERO AVE,,OAK FOREST,IL
PO BOX 457,,WHEELING,IL
1803 WEHRLI RD,,NAPERVILLE,IL
1557 WEATHERSTONE LN,,ELGIN,IL
1420 RENAISSANCE DR,STE 206,PARK RIDGE,IL
327 GUNDERSEN DR,STE C,CAROL STREAM,IL
840 S WOOD ST,"DEPARTMENT OF PATHOLOGY, MC847 ...",CHICAGO,IL
PO BOX 95349,,PALATINE,IL
2900 FOXFIELD RD,,ST CHARLES,IL
20201 CRAWFORD AVE,,OLYMPIA FIELDS,IL

Provider Business Mailing Address Postal Code ...,Provider Business Mailing Address Country Code (If ...,Provider Business Mailing Address Telephone Number ...,Provider Business Mailing Address Fax Number ...
604524006,US,7086334133,7086333029
600900457,US,8475778811,8475779515
605659306,US,6305279100,6305279129
601232019,US,8477413355,8477413597
600681342,US,8472983565,8472983770
601882402,US,6306682416,6306682713
606124325,US,3129963150,3124130156
600950349,US,8475778811,8475779515
601745799,US,6303156790,6303156799
604611010,US,7087474000,7087553392

Provider First Line Business Practice ...,Provider Second Line Business Practice ...,Provider Business Practice Location Add ...,Provider Business Practice Location Add ....1
15900 CICERO AVE,,OAK FOREST,IL
1020 W ALGONQUIN RD,,LAKE IN THE HILLS,IL
1803 WEHRLI RD,,NAPERVILLE,IL
1557 WEATHERSTONE LN,,ELGIN,IL
1420 RENAISSANCE DR,STE 206,PARK RIDGE,IL
327 GUNDERSEN DR,STE C,CAROL STREAM,IL
840 S WOOD ST,"DEPARTMENT OF PATHOLOGY, MC847 ...",CHICAGO,IL
33 S ARLINGTON HEIGHTS RD,,ARLINGTON HEIGHTS,IL
2900 FOXFIELD RD,,ST CHARLES,IL
20201 CRAWFORD AVE,,OLYMPIA FIELDS,IL

Provider Business Practice Location Add ...,Provider Business Practice Location Add ....1,Provider Business Practice Location Add ....2,Provider Business Practice Location Add ....3
604524006,US,7086334133,7086333029
601563500,US,8476588233,8478542609
605659306,US,6305279100,6305279129
601232019,US,8477413355,8477413597
600681342,US,8472983565,8472983770
601882402,US,6306682416,6306682713
606124325,US,3129963150,3124130156
600051403,US,8473685000,8473685995
601745799,US,6303156790,6303156799
604611010,US,7087561200,7084815583

Provider Enumeration Date,Last Update Date,NPI Deactivation Date,NPI Reactivation Date,Provider Gender Code,...
05/23/2005,07/21/2014,,,M,...
05/23/2005,05/19/2008,,,,...
05/23/2005,07/08/2007,,,M,...
05/23/2005,01/18/2011,,,M,...
05/23/2005,10/20/2010,,,M,...
05/23/2005,04/30/2012,,,F,...
05/23/2005,12/05/2011,,,M,...
05/23/2005,12/02/2009,,,,...
05/23/2005,04/07/2014,,,F,...
05/23/2005,01/10/2011,,,,...


## Read in the DG Chicago Nodes

In [64]:
# Load the DG Chicago nodes CSV into an SFrame, with parser progress output turned off.
dg_chicago_nodes = gl.SFrame.read_csv('../../../../data/dg_nodes_chicago.csv', verbose = False)

PROGRESS: Finished parsing file /Users/astuckey002/Documents/data/dg_nodes_chicago.csv
PROGRESS: Parsing completed. Parsed 100 lines in 0.964909 secs.
------------------------------------------------------
Inferred types from first line of file as 
column_type_hints=[str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,int,int,int,float,float,float,float,float,float]
If parsing fails due to incorrect types, you can correct
the inferred type list above and pass it to read_csv in
the column_type_hints argument
------------------------------------------------------
PROGRESS: Finished parsing file /Users/astuckey002/Documents/data/dg_nodes_chicago.csv
PROGRESS: Parsing completed. Parsed 264209 lines in 1.30301 secs.


## Join the data on the NPI

In [65]:
# Join the DG Chicago nodes with the cleaned NPPES columns on the shared 'npi' column.
# NOTE(review): no `how` argument is passed, so SFrame.join's default join type
# applies (presumably an inner join) -- confirm unmatched nodes should be dropped.
joined_data = dg_chicago_nodes.join(nppes_sf_u,on = 'npi')

In [68]:
# Persist the joined SFrame to disk.
# NOTE(review): no format argument is passed; presumably the '.csv' extension
# makes SFrame.save write CSV rather than the binary format -- confirm.
joined_data.save('../../../../data/dg_nppes_chicago_joined.csv')