In [118]:
import pyspark
from pyspark.sql.functions import *
sc.setLogLevel("ERROR")

## Spark application:

In [119]:
# Read the data file into a Spark DataFrame
# https://www.cms.gov/files/zip/medicare-covid-19-data-snapshot-data-file.zip
df1 = spark.read.csv('COVID-19-2021-02-20.csv', header='true')

In [120]:
print(type(df1))
df1.printSchema()

<class 'pyspark.sql.dataframe.DataFrame'>
root
 |-- Claims_Thru_Dt: string (nullable = true)
 |-- Measure_Level: string (nullable = true)
 |-- Measure_Element: string (nullable = true)
 |-- Measure_Unit: string (nullable = true)
 |-- Value: string (nullable = true)



In [121]:
# Produce a count of all rows
total = df1.count()
print("Total Rows " + str(total))

Total Rows 962


In [122]:
df1.createOrReplaceTempView("covid")

In [123]:
# Sample Data
SQL = "SELECT * FROM covid LIMIT 10"
df = spark.sql(SQL)
df.show(10, False)

+--------------+-----------------------+--------------------+-----------------+-------+
|Claims_Thru_Dt|Measure_Level          |Measure_Element     |Measure_Unit     |Value  |
+--------------+-----------------------+--------------------+-----------------+-------+
|02/20/2021    |COVID-19 Cases         |Overall             |Beneficiary Count|3860957|
|02/20/2021    |COVID-19 Cases by State|Alabama             |Beneficiary Count|86496  |
|02/20/2021    |COVID-19 Cases by State|Alaska              |Beneficiary Count|3023   |
|02/20/2021    |COVID-19 Cases by State|Arizona             |Beneficiary Count|100260 |
|02/20/2021    |COVID-19 Cases by State|Arkansas            |Beneficiary Count|45366  |
|02/20/2021    |COVID-19 Cases by State|California          |Beneficiary Count|338346 |
|02/20/2021    |COVID-19 Cases by State|Colorado            |Beneficiary Count|40837  |
|02/20/2021    |COVID-19 Cases by State|Connecticut         |Beneficiary Count|45536  |
|02/20/2021    |COVID-19 Cases b

In [124]:
# Generate and output a small `DataFrame` containing all the "overall case" counts 
# and their corresponding dates
SQL = """
SELECT Claims_Thru_Dt, CAST(SUM(Value) AS INT) overall_case
FROM covid 
WHERE Measure_Level = 'COVID-19 Cases'
  AND Measure_Element = 'Overall'
  AND Measure_Unit = 'Beneficiary Count'
GROUP BY Claims_Thru_Dt
"""

df = spark.sql(SQL)
df.show(10, False)

+--------------+------------+
|Claims_Thru_Dt|overall_case|
+--------------+------------+
|02/20/2021    |3860957     |
+--------------+------------+



In [142]:
# Generate and output a small `DataFrame` containing all the "overall case" counts and their corresponding dates
SQL = """
SELECT to_date(Measure_Element, 'MM/dd/yyyy') AS date, 
        CAST(SUM(Value) AS INT) overall_case
FROM covid 
WHERE Measure_Level LIKE 'COVID-19 Weekly Cases%'
GROUP BY date
ORDER BY date DESC
"""

df = spark.sql(SQL)
df.show(99, False)


+----------+------------+
|date      |overall_case|
+----------+------------+
|2021-02-20|40369       |
|2021-02-13|60966       |
|2021-02-06|91285       |
|2021-01-30|114880      |
|2021-01-23|138264      |
|2021-01-16|162906      |
|2021-01-09|191995      |
|2021-01-02|199866      |
|2020-12-26|157472      |
|2020-12-19|174050      |
|2020-12-12|170625      |
|2020-12-05|179092      |
|2020-11-28|136126      |
|2020-11-21|143263      |
|2020-11-14|128670      |
|2020-11-07|96774       |
|2020-10-31|85426       |
|2020-10-24|69066       |
|2020-10-17|61174       |
|2020-10-10|56131       |
|2020-10-03|50310       |
|2020-09-26|43225       |
|2020-09-19|42491       |
|2020-09-12|37274       |
|2020-09-05|43229       |
|2020-08-29|44347       |
|2020-08-22|46209       |
|2020-08-15|50440       |
|2020-08-08|53508       |
|2020-08-01|67434       |
|2020-07-25|65578       |
|2020-07-18|69609       |
|2020-07-11|65886       |
|2020-07-04|59567       |
|2020-06-27|49271       |
|2020-06-20|

## Analyze a specific date:

In [89]:
# Choose a single `Claims_Thru_Dt` with 
# `Measure_Level` equal to `COVID-19 Cases by State` 
# and `Measure_Unit` equal to `Beneficiary Count`
SQL = """
SELECT Claims_Thru_Dt, Measure_Level, Measure_Unit, Value
FROM covid 
WHERE Claims_Thru_Dt = '02/20/2021' 
  AND Measure_Level = 'COVID-19 Cases by State' 
  AND Measure_Unit = 'Beneficiary Count'
LIMIT 10
"""

df = spark.sql(SQL)
df.show(99, False)

+--------------+-----------------------+-----------------+------+
|Claims_Thru_Dt|Measure_Level          |Measure_Unit     |Value |
+--------------+-----------------------+-----------------+------+
|02/20/2021    |COVID-19 Cases by State|Beneficiary Count|86496 |
|02/20/2021    |COVID-19 Cases by State|Beneficiary Count|3023  |
|02/20/2021    |COVID-19 Cases by State|Beneficiary Count|100260|
|02/20/2021    |COVID-19 Cases by State|Beneficiary Count|45366 |
|02/20/2021    |COVID-19 Cases by State|Beneficiary Count|338346|
|02/20/2021    |COVID-19 Cases by State|Beneficiary Count|40837 |
|02/20/2021    |COVID-19 Cases by State|Beneficiary Count|45536 |
|02/20/2021    |COVID-19 Cases by State|Beneficiary Count|11247 |
|02/20/2021    |COVID-19 Cases by State|Beneficiary Count|6191  |
|02/20/2021    |COVID-19 Cases by State|Beneficiary Count|245845|
+--------------+-----------------------+-----------------+------+



In [90]:
# For that date, retrieve the `Value` for the `Overall` COVID-19 Cases
SQL = """
SELECT Claims_Thru_Dt, Measure_Level, Measure_Element, Measure_Unit, Value
FROM covid 
WHERE Claims_Thru_Dt = '02/20/2021'
  AND Measure_Level = 'COVID-19 Cases' 
  AND Measure_Element = 'Overall'
  AND Measure_Unit = 'Beneficiary Count'
"""

df = spark.sql(SQL)
df.show(10, False)

+--------------+--------------+---------------+-----------------+-------+
|Claims_Thru_Dt|Measure_Level |Measure_Element|Measure_Unit     |Value  |
+--------------+--------------+---------------+-----------------+-------+
|02/20/2021    |COVID-19 Cases|Overall        |Beneficiary Count|3860957|
+--------------+--------------+---------------+-----------------+-------+



In [64]:
SQL = """
SELECT Claims_Thru_Dt, CAST(SUM(Value) AS INT) overall_case
FROM covid 
WHERE 
    Measure_Level = 'COVID-19 Cases' AND 
    Measure_Element = 'Overall'
GROUP BY Claims_Thru_Dt
"""

df2 = spark.sql(SQL)
df2.show(10, False)

+--------------+------------+
|Claims_Thru_Dt|overall_case|
+--------------+------------+
|02/20/2021    |3867101     |
+--------------+------------+



In [69]:
SQL = """
SELECT Measure_Element, CAST(SUM(Value) AS INT) overall_case_by_state
FROM covid 
WHERE Measure_Level = 'COVID-19 Cases by State'
GROUP BY Measure_Element
"""

df2 = spark.sql(SQL)
df2.show(99, False)

+--------------------+---------------------+
|Measure_Element     |overall_case_by_state|
+--------------------+---------------------+
|Utah                |26160                |
|Hawaii              |4213                 |
|Minnesota           |61824                |
|Ohio                |156325               |
|Arkansas            |52380                |
|Oregon              |19342                |
|District Of Columbia|12788                |
|Texas               |339923               |
|North Dakota        |16307                |
|Pennsylvania        |173228               |
|Connecticut         |52124                |
|Nebraska            |27925                |
|Vermont             |3481                 |
|Nevada              |34742                |
|Puerto Rico         |45311                |
|Washington          |39197                |
|Illinois            |163114               |
|Oklahoma            |66603                |
|Missing Data        |1088                 |
|Virgin Is