In [6]:
from pyspark.sql.functions import *
from pyspark.sql import SparkSession
import numpy as np
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, DateType
from pyspark.sql.functions import udf
from more_pyspark import *

In [2]:
# Obtain the Spark session used by every cell below, registered under the
# application name 'Ops'.
spark = (
    SparkSession
    .builder
    .appName('Ops')
    .getOrCreate()
)

22/11/08 11:54:33 WARN Utils: Your hostname, jt7372wd222 resolves to a loopback address: 127.0.1.1; using 172.23.236.204 instead (on interface eth0)
22/11/08 11:54:33 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


22/11/08 11:54:35 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [3]:
# Locations of the three input CSV files, relative to the notebook.
poverty_data_path = "./Data/PovertyEstimates.csv"
crosswalk_data_path = "./Data/CMS_facility_to_fips_crosswalk.csv"
hospital_data_path = "./Data/Timely_and_Effective_Care-Hospital.csv"

# Columns of the hospital file that are kept for the analysis.
hospital_data_cols = [
    'Facility ID',
    'State',
    'County Name',
    'Condition',
    'Measure Name',
    'Score',
]

# The two measures of interest; each entry is matched against 'Measure Name'.
hospital_data_measure_names = [
    'Left before being seen',
    'Average (median) time patients spent in the emergency department before leaving from the visit A lower number of minutes is better',
]

In [4]:
# First pass over the raw files: take column names from the header row and
# let Spark infer every column type.
poverty_data = spark.read.csv(
    poverty_data_path,
    inferSchema=True,
    header=True,
)
crosswalk_data = spark.read.csv(
    crosswalk_data_path,
    inferSchema=True,
    header=True,
)
hospital_data = spark.read.csv(
    hospital_data_path,
    inferSchema=True,
    header=True,
)

In [5]:
# Print the schema of the hospital DataFrame as a StructType literal.
# pprint_schema is presumably provided by the more_pyspark star import.
hospital_data >> pprint_schema

StructType([StructField('Facility ID', DoubleType(), True),
            StructField('Facility Name', StringType(), True),
            StructField('Address', StringType(), True),
            StructField('City', StringType(), True),
            StructField('State', StringType(), True),
            StructField('ZIP Code', IntegerType(), True),
            StructField('County Name', StringType(), True),
            StructField('Phone Number', StringType(), True),
            StructField('Condition', StringType(), True),
            StructField('Measure ID', StringType(), True),
            StructField('Measure Name', StringType(), True),
            StructField('Score', StringType(), True),
            StructField('Sample', StringType(), True),
            StructField('Footnote', StringType(), True),
            StructField('Start Date', StringType(), True),
            StructField('End Date', StringType(), True)])


In [18]:
# Explicit schema for the hospital CSV. Relative to the inferred schema,
# 'Facility ID' is read as a string rather than a double and the two date
# columns are typed as dates; every field is nullable.
_hospital_field_types = [
    ('Facility ID', StringType()),
    ('Facility Name', StringType()),
    ('Address', StringType()),
    ('City', StringType()),
    ('State', StringType()),
    ('ZIP Code', IntegerType()),
    ('County Name', StringType()),
    ('Phone Number', StringType()),
    ('Condition', StringType()),
    ('Measure ID', StringType()),
    ('Measure Name', StringType()),
    ('Score', StringType()),
    ('Sample', StringType()),
    ('Footnote', StringType()),
    ('Start Date', DateType()),
    ('End Date', DateType()),
]
hospital_schema = StructType(
    [StructField(field_name, field_type, True) for field_name, field_type in _hospital_field_types]
)

# Date pattern for the hospital file's date columns.
hospital_data_dateTime = 'MM/dd/yyyy'

In [8]:
# Print the schema of the crosswalk DataFrame as a StructType literal.
# pprint_schema is presumably provided by the more_pyspark star import.
crosswalk_data >> pprint_schema

StructType([StructField('Final FIPS', IntegerType(), True),
            StructField('Facility ID', DoubleType(), True),
            StructField('State', StringType(), True),
            StructField('County Name', StringType(), True)])


In [9]:
# Explicit schema for the facility-to-FIPS crosswalk CSV. 'Facility ID' is
# declared as a string (the inferred schema typed it as a double); every
# field is nullable.
_crosswalk_field_types = [
    ('Final FIPS', IntegerType()),
    ('Facility ID', StringType()),
    ('State', StringType()),
    ('County Name', StringType()),
]
crosswalk_schema = StructType(
    [StructField(field_name, field_type, True) for field_name, field_type in _crosswalk_field_types]
)

In [10]:
# Print the schema of the poverty DataFrame as a StructType literal.
# pprint_schema is presumably provided by the more_pyspark star import.
poverty_data >> pprint_schema

StructType([StructField('FIPS_code', IntegerType(), True),
            StructField('Stabr', StringType(), True),
            StructField('Area_name', StringType(), True),
            StructField('Rural-urban_Continuum_Code_2003', IntegerType(), True),
            StructField('Urban_Influence_Code_2003', IntegerType(), True),
            StructField('Rural-urban_Continuum_Code_2013', IntegerType(), True),
            StructField('Urban_Influence_Code_2013', IntegerType(), True),
            StructField('POVALL_2020', StringType(), True),
            StructField('CI90LBALL_2020', StringType(), True),
            StructField('CI90UBALL_2020', StringType(), True),
            StructField('PCTPOVALL_2020', DoubleType(), True),
            StructField('CI90LBALLP_2020', DoubleType(), True),
            StructField('CI90UBALLP_2020', DoubleType(), True),
            StructField('POV017_2020', StringType(), True),
            StructField('CI90LB017_2020', StringType(), True),
            Struc

In [19]:
# Second pass: re-read the crosswalk and hospital files with the hand-written
# schemas instead of inferred types. The hospital read also supplies the date
# pattern for its date columns.
crosswalk_data = spark.read.csv(
    crosswalk_data_path,
    schema=crosswalk_schema,
    header=True,
)
hospital_data = spark.read.csv(
    hospital_data_path,
    schema=hospital_schema,
    dateFormat=hospital_data_dateTime,
    header=True,
)

In [12]:
from pyspark.sql.functions import array, explode, struct, lit, col

@pipeable
def spread(val_col, var_col, group_by_col, df):
    """Reshape ``df`` from long to wide form with a grouped pivot.

    Args:
        val_col: Column handed to ``pivot``; its distinct values become the
            headers of the new columns.
        var_col: Column that is summed for each group / pivoted value.
        group_by_col: Column name, or list of column names, passed to
            ``groupBy``.
        df: The DataFrame to reshape.

    Returns:
        The DataFrame produced by ``groupBy(...).pivot(...).sum(...)``.
    """
    grouped = df.groupBy(group_by_col)
    pivoted = grouped.pivot(val_col)
    return pivoted.sum(var_col)

In [36]:
# Keep only the two emergency-department measures of interest, convert their
# scores to floats, and pivot to one row per (Facility ID, State, County Name)
# with one column per measure.
cleaned_hospital_data = ((
    hospital_data
    .select(hospital_data_cols)
    .where(col('Condition') == "Emergency Department")
    .where(col('Measure Name').contains(hospital_data_measure_names[0]) | col('Measure Name').contains(hospital_data_measure_names[1]))
    .drop(col('Condition'))
    # 'Not Available' scores become NaN; everything else is cast to float.
    # np.nan is used instead of the np.NaN alias, which NumPy 2.0 removed
    # (accessing it there raises AttributeError).
    .withColumn("Score", when(col('Score') == 'Not Available', np.nan).otherwise(col('Score').astype('float')))
) >> spread('Measure Name', 'Score', ['Facility ID', 'State', 'County Name']))

In [37]:
# Reduce the poverty table to the county FIPS code and the 2020
# all-ages poverty percentage column.
cleaned_poverty_data = poverty_data.select('FIPS_code', 'PCTPOVALL_2020')

In [67]:
def fix_fips(fips):
    """Return ``fips`` as a zero-padded, five-character string.

    Returns None when ``fips`` is None so that a missing code stays missing.
    """
    # Spark passes SQL NULL to a Python UDF as None; without this guard
    # str(None).zfill(5) would yield the bogus code '0None'.
    if fips is None:
        return None
    return str(fips).zfill(5)


# Spark UDF wrapper around fix_fips, producing a string column.
five_digit_fips = udf(fix_fips, StringType())

joined_data = (
    cleaned_hospital_data
    # Attach each facility's county FIPS code via the crosswalk.
    # NOTE(review): both sides of this join carry 'State' and 'County Name',
    # and only 'Facility ID' is a join key, so the joined frame holds those
    # two columns twice -- confirm which copy should reach the output.
    .join(crosswalk_data, "Facility ID", how='inner')
    .withColumnRenamed('Final FIPS', 'FIPS_code')
    # Match on FIPS_code before it is padded into a string; a left join
    # keeps facilities that have no poverty row.
    .join(cleaned_poverty_data, "FIPS_code", how='left')
    .withColumn('FIPS_code', five_digit_fips(col('FIPS_code')))
    .collect()
) >> to_pandas

                                                                                

In [68]:
# Show the joined result as the cell output.
joined_data

Unnamed: 0,FIPS_code,Facility ID,State,County Name,Average (median) time patients spent in the emergency department before leaving from the visit A lower number of minutes is better,Left before being seen,PCTPOVALL_2020
0,04013,030030,AZ,MARICOPA,164.0,1.0,11.6
1,05119,040036,AR,PULASKI,199.0,2.0,15.3
2,06037,050112,CA,LOS ANGELES,138.0,1.0,13.2
3,06077,050122,CA,SAN JOAQUIN,155.0,3.0,13.9
4,20173,170186,KS,SEDGWICK,,,13.4
...,...,...,...,...,...,...,...
4706,36059,330182,NY,NASSAU,168.0,0.0,5.7
4707,41023,381305,OR,GRANT,142.0,8.0,13.9
4708,39007,360245,OH,ASHTABULA,,,16.5
4709,55081,521305,WI,MONROE,83.0,0.0,10.5


In [69]:
# Write the joined table to disk alongside the input data.
# NOTE(review): to_csv is called with its defaults; if joined_data is a pandas
# DataFrame the row index is written as an unnamed first column -- confirm
# that downstream readers expect it.
joined_csv_path = "./Data/timely_care_w_poverty_percent_spark.csv"
joined_data.to_csv(joined_csv_path)