# Filter, Clean, and Join the NPPES Data with DocGraph Data

### Imports

In [1]:
import graphlab as gl
from pyspark import SparkConf,SparkContext
import os

[INFO] Start server at: ipc:///tmp/graphlab_server-58208 - Server binary: /Users/astuckey002/anaconda/lib/python2.7/site-packages/graphlab/unity_server - Server log: /tmp/graphlab_server_1431704894.log
[INFO] GraphLab Server Version: 1.3.0


### Initialize Spark Instance

In [2]:
# Build the Spark settings as one chained expression (each setter returns the conf)
conf = (
    SparkConf()
    .setAppName("My App")
    .set("spark.executor.memory", "10g")
    .set("spark.cores.max", "1")
)

## Create the SparkContext against the local[*] master using those settings
sc = SparkContext(master='local[*]', conf=conf)

### Helper Functions

In [7]:
# function to type cast variables
def safe_cast(val, to_type, default=0):
    """Convert ``val`` with ``to_type``, falling back to ``default`` on failure.

    Parameters:
        val: the value to convert.
        to_type: a callable (e.g. ``int`` or ``float``) applied to ``val``.
        default: the value returned when the conversion fails (0 by default).

    Returns:
        ``to_type(val)`` when the conversion succeeds, otherwise ``default``.
    """
    try:
        return to_type(val)
    # TypeError covers inputs of the wrong kind (e.g. None), which built-in
    # converters such as int() reject with TypeError rather than ValueError.
    except (ValueError, TypeError):
        return default

In [8]:
# strips every double-quote character out of each field of a row
def clean(line):
    """Return a new list with all double quotes removed from every item.

    ``line`` is an iterable of strings; the input itself is not modified.
    """
    return [field.replace('\"', '') for field in line]

In [9]:
# filters the NPIs to the ones we care about
def filter_npis(line):
    """Return True when a row refers to an NPI present in ``npi_dict``.

    Parameters:
        line: a list of string fields for one row. Index 0 is read as the
            NPI and index 2 as a second NPI (presumably the "Replacement
            NPI" column -- confirm against the file header).

    Returns:
        True if either value parses as an integer that is a key of the
        module-level ``npi_dict``; False otherwise, including for rows with
        fewer than three fields.
    """
    # strip quotes once and reuse the result for both lookups
    fields = clean(line)

    # a blank or truncated row has no index 2; drop it instead of raising
    # IndexError in the middle of the filter
    if len(fields) < 3:
        return False

    # default=None so an unparseable value can never collide with an integer
    # key (the 0 default would match a 0 key if one ever got into npi_dict)
    npi = safe_cast(fields[0], int, default=None)
    rep_npi = safe_cast(fields[2], int, default=None)

    npi_wanted = npi is not None and npi in npi_dict
    rep_npi_wanted = rep_npi is not None and rep_npi in npi_dict
    return npi_wanted or rep_npi_wanted

### Do some preliminary cleaning to the NPPES data

In [10]:
# load every line of the raw NPPES export into a list
with open('data/npidata_20050523-20150208.csv','r') as nppes_file:
    nppes_pre = list(nppes_file)

In [11]:
# write the rows back out tab-delimited: each "," separator (quotes included)
# becomes a tab, then any remaining double quotes are dropped
with open('data/nppes_fixed.csv','w') as fixed_file:
    fixed_file.writelines(
        raw_line.replace('\","',"\t").replace('\"','')
        for raw_line in nppes_pre
    )

### Ingest the pre-cleaned data

In [12]:
# load the tab-delimited file into an RDD and split each text row into its fields
raw_data = (
    sc.textFile('data/nppes_fixed.csv')
      .map(lambda row: row.split('\t'))
)

In [13]:
# get the header for the file
# raw_data holds one list of tab-split fields per text line, so the first
# element is the list of column names from the file's first line
header = raw_data.first()

In [14]:
# get the unique NPI list from our data
# the values are read as strings (see the inferred column_type_hints in the
# output below); verbose = False is passed to quiet the parser
npi_list = gl.SFrame.read_csv('data/unique_npis.csv', verbose = False)

------------------------------------------------------
Inferred types from first line of file as 
column_type_hints=[str]
If parsing fails due to incorrect types, you can correct
the inferred type list above and pass it to read_csv in
the column_type_hints argument
------------------------------------------------------
PROGRESS: Finished parsing file /Users/astuckey002/Documents/Analytics/DocGraph/Dataset Builder/data/unique_npis.csv
PROGRESS: Parsing completed. Parsed 25921 lines in 0.019065 secs.


In [15]:
# put the unique NPI list into a dictionary for quicker python processing speeds
# Only the keys matter (membership tests); every value is a placeholder 0.
# Entries that cannot be parsed as an integer are skipped: storing them under
# the safe_cast default of 0 would create a bogus key 0 that matches every
# NPPES row whose NPI or Replacement NPI is empty or unparseable.
npi_dict = {}
for raw_npi in npi_list['npi']:
    npi = safe_cast(raw_npi, int, default=None)
    if npi is None:
        continue
    npi_dict[npi] = 0

In [16]:
# keep only the rows whose NPI (or second NPI field) is in our unique NPI list
filtered_data = raw_data.filter(filter_npis)

### Convert the RDD to an SFrame in GL

In [17]:
# convert rdd to SFrame
# each RDD element is a list of fields, which arrives as a single column
# (referred to as 'X1' when it is unpacked afterwards)
nppes_sf = gl.SFrame.from_rdd(filtered_data)

In [18]:
# the data is interpreted as one column
# we can use the unpack function in GL to create multiple columns
# NOTE(review): the new columns are presumably named 'X1.<position>' -- the
# rename step later parses the position from the name with col[3:]; confirm
nppes_sf_u = nppes_sf.unpack('X1')

In [19]:
# map each zero-based column position to its name from the header row
header_dict = dict(enumerate(header))

In [20]:
# give every unpacked column its real name from the header dictionary
for unpacked_name in nppes_sf_u.column_names():
    # characters from index 3 onward of the unpacked name hold the column position
    position = int(unpacked_name[3:])
    nppes_sf_u.rename({unpacked_name: header_dict[position]})

In [21]:
# remove columns that have zero data
for col in nppes_sf_u.column_names():
    counter = 0
    temp_data = nppes_sf_u[col]
    for cell in temp_data:
        if cell != "":
            counter += 1
        if counter > 0:
            break
    
    if counter == 0:
        nppes_sf_u.remove_column(col)
        print "Removing column:", col
    continue

Removing column: Replacement NPI
Removing column: NPI Deactivation Reason Code
Removing column: Other Provider Identifier Issuer_20
Removing column: Other Provider Identifier Issuer_25
Removing column: Other Provider Identifier Issuer_26
Removing column: Other Provider Identifier Issuer_27
Removing column: Other Provider Identifier Issuer_29
Removing column: Other Provider Identifier Issuer_30
Removing column: Other Provider Identifier Issuer_31
Removing column: Other Provider Identifier Issuer_32
Removing column: Other Provider Identifier Issuer_33
Removing column: Other Provider Identifier Issuer_34
Removing column: Other Provider Identifier_35
Removing column: Other Provider Identifier Type Code_35
Removing column: Other Provider Identifier State_35
Removing column: Other Provider Identifier Issuer_35
Removing column: Other Provider Identifier_36
Removing column: Other Provider Identifier Type Code_36
Removing column: Other Provider Identifier State_36
Removing column: Other Provide

In [22]:
# rename the joining column name to match the DG dataset
# ('npi' is the key the join with dg_chicago_nodes is performed on)
nppes_sf_u = nppes_sf_u.rename({'NPI':'npi'})

### Read in the DG Chicago Nodes

In [23]:
# read in the docgraph data
# column types are inferred by the parser (see the column_type_hints list in
# the output below); verbose = False is passed to quiet the parser
dg_chicago_nodes = gl.SFrame.read_csv('data/dg_nodes_chicago.csv', verbose = False)

------------------------------------------------------
Inferred types from first line of file as 
column_type_hints=[str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,int,int,int,float,float,float,float,float,float]
If parsing fails due to incorrect types, you can correct
the inferred type list above and pass it to read_csv in
the column_type_hints argument
------------------------------------------------------
PROGRESS: Finished parsing file /Users/astuckey002/Documents/Analytics/DocGraph/Dataset Builder/data/dg_nodes_chicago.csv
PROGRESS: Parsing completed. Parsed 264209 lines in 1.68642 secs.


### Join the data on the NPI

In [24]:
# use join to combine the two files via the unique identifier (NPI)
# NOTE(review): no `how` argument is passed, so the library's default join
# type applies -- presumably an inner join; confirm. Both 'npi' columns also
# need the same type for the keys to match -- verify.
joined_data = dg_chicago_nodes.join(nppes_sf_u,on = 'npi')

### Save the resulting data

In [25]:
# save the resulting dataset
# NOTE(review): no format argument is given; the output format is presumably
# inferred from the '.csv' extension -- confirm against the SFrame.save docs
joined_data.save('data/dg_nppes_chicago_joined.csv')