## COVID-19 Analysis
This notebook processes the [New York Times COVID-19 dataset](https://github.com/nytimes/covid-19-data) and performs a quick analysis of it. The data is refreshed regularly under `/databricks-datasets/COVID/covid-19-data/`, so you can read it directly from that location.

In [2]:
%sh pip install altair

In [3]:
%sh pip install vega_datasets

In [4]:
# Standard Libraries
import io

# External Libraries
import requests
import numpy as np
import pandas as pd
import altair as alt
from vega_datasets import data

# topographical
# URLs of the boundary files used by the map cells. topo_usa and topo_tx are
# assigned again, with the same values, in the "Obtain Topography" cell below.
topo_usa = 'https://vega.github.io/vega-datasets/data/us-10m.json'
topo_tx = 'https://raw.githubusercontent.com/deldersveld/topojson/master/countries/us-states/tx-48-texas-counties.json'
# NOTE(review): topo_king is not referenced anywhere else in the visible notebook.
topo_king = 'https://raw.githubusercontent.com/johan/world.geo.json/master/countries/USA/WA/King.geo.json'

### Download Mapping County FIPS to lat, long_

In [6]:
%sh
# Stage the county FIPS -> lat/long lookup CSV on DBFS, then list the directory.
target_dir=/dbfs/tmp/dennylee/COVID/map_fips
mkdir -p "$target_dir/" \
  && wget -O "$target_dir/countyfips_lat_long.csv" https://raw.githubusercontent.com/dennyglee/tech-talks/master/datasets/countyfips_lat_long.csv \
  && ls -al "$target_dir/"

In [7]:
# Load the county FIPS lookup and normalise its column names so the SQL cells
# can refer to state / county / lat / long_.
fips_csv_path = "/tmp/dennylee/COVID/map_fips/countyfips_lat_long.csv"
column_renames = {
    "STATE": "state",
    "COUNTYNAME": "county",
    "LAT": "lat",
    "LON": "long_",
}

map_fips = spark.read.csv(fips_csv_path, header=True, inferSchema=True)
for old_name, new_name in column_renames.items():
    map_fips = map_fips.withColumnRenamed(old_name, new_name)

# Expose the renamed frame to SQL as `map_fips`
map_fips.createOrReplaceTempView("map_fips")

In [8]:
# Collapse map_fips to exactly one row per fips value. Each of state, county,
# long_ and lat is reduced with its own min(), so when a fips has several rows
# the surviving values are not guaranteed to come from the same source row.
# Presumably this exists so the later joins on fips cannot duplicate rows.
map_fips_dedup = spark.sql("""select fips, min(state) as state, min(county) as county, min(long_) as long_, min(lat) as lat from map_fips group by fips""")
# Register the result for use from SQL as `map_fips_dedup`
map_fips_dedup.createOrReplaceTempView("map_fips_dedup")

### Get 2019 Population Estimates

In [10]:
%sh
# Stage the 2019 county population estimates CSV on DBFS, then list the directory.
target_dir=/dbfs/tmp/dennylee/COVID/population_estimates_by_county
mkdir -p "$target_dir/" \
  && wget -O "$target_dir/co-est2019-alldata.csv" https://raw.githubusercontent.com/databricks/tech-talks/master/datasets/co-est2019-alldata.csv \
  && ls -al "$target_dir/"

In [11]:
# Read the downloaded population-estimates CSV (header row, inferred schema)
# and expose it to SQL as `map_popest_county`.
map_popest_county = spark.read.option("header", True).option("inferSchema", True).csv("/tmp/dennylee/COVID/population_estimates_by_county/co-est2019-alldata.csv")
map_popest_county.createOrReplaceTempView("map_popest_county")
# Build a combined fips key from the State and County columns:
#   cast(1000 + County as string) turns e.g. County 61 into "1061", and
#   substring(..., 2, 3) keeps the last three characters ("061"), i.e. the
#   county code zero-padded to three digits (assumes County < 1000).
#   That text is then added to State * 1000, relying on Spark's implicit
#   string -> numeric cast; the resulting fips is presumably a floating-point
#   column rather than an integer — TODO confirm.
fips_popest_county = spark.sql("select State * 1000 + substring(cast(1000 + County as string), 2, 3) as fips, STNAME, CTYNAME, census2010pop, POPESTIMATE2019 from map_popest_county")
fips_popest_county.createOrReplaceTempView("fips_popest_county")

## Specify `nyt_daily` table
* Source: `/databricks-datasets/COVID/covid-19-data/`
* Contains the COVID-19 daily reports

In [13]:
# Load the county-level daily reports (header row, inferred column types),
# register them for SQL as `nyt_daily`, and preview the frame.
nyt_daily = spark.read.csv(
    "/databricks-datasets/COVID/covid-19-data/us-counties.csv",
    header=True,
    inferSchema=True,
)
nyt_daily.createOrReplaceTempView("nyt_daily")
display(nyt_daily)

date,county,state,fips,cases,deaths
2020-01-21,Snohomish,Washington,53061.0,1,0
2020-01-22,Snohomish,Washington,53061.0,1,0
2020-01-23,Snohomish,Washington,53061.0,1,0
2020-01-24,Cook,Illinois,17031.0,1,0
2020-01-24,Snohomish,Washington,53061.0,1,0
2020-01-25,Orange,California,6059.0,1,0
2020-01-25,Cook,Illinois,17031.0,1,0
2020-01-25,Snohomish,Washington,53061.0,1,0
2020-01-26,Maricopa,Arizona,4013.0,1,0
2020-01-26,Los Angeles,California,6037.0,1,0


# COVID-19 Cases and Deaths for Specific Counties
* Top 10 Texas counties (3/26/2020)
* Top 10 New York counties (3/18/2020)
The queries are using the US Census Population Estimates for 2019

In [15]:
# Each query below produces one row per (date, county) for a single state with:
#   day_num              100 + number of days between the row's date and a reference date
#   cases / deaths       summed from nyt_daily
#   *_per_100Kpop        the same figures scaled by the 2019 population estimate
#
# NOTE(review): later %sql cells query a view named `ny_state_window`, which is
# not created in this cell.

# TX window: 2020-03-26 .. 2020-07-27, day_num 100 == 2020-03-26
tx_state_window = spark.sql("""
SELECT date, 100 + datediff(date, '2020-03-26T00:00:00.000+0000') as day_num, county, fips, cases, deaths, 100000.*cases/population_estimate AS cases_per_100Kpop, 100000.*deaths/population_estimate AS deaths_per_100Kpop
  from (
SELECT CAST(f.date AS date) AS date, f.county, f.fips, SUM(f.cases) AS cases, SUM(f.deaths) AS deaths, MAX(p.POPESTIMATE2019) AS population_estimate 
  FROM nyt_daily f 
    JOIN fips_popest_county p
      ON p.fips = f.fips
 WHERE f.state = 'Texas' 
   AND date BETWEEN '2020-03-26T00:00:00.000+0000' AND '2020-07-27T00:00:00.000+0000'
 GROUP BY f.date, f.county, f.fips
) a""")
tx_state_window.createOrReplaceTempView("tx_state_window")

# FL window: 2020-06-22 .. 2020-07-27, day_num 100 == 2020-06-29
# (so the first day of the window is nominally day_num 93).
# The join is on the row's own fips. The previous coalesce(f.fips, 36061) was
# carried over from the New York query and attached New York County's FIPS and
# population to Florida rows that have no fips; such rows now drop out of the
# inner join instead.
fl_state_window = spark.sql("""
SELECT date, 100 + datediff(date, '2020-06-29T00:00:00.000+0000') as day_num, county, fips, cases, deaths, 100000.*cases/population_estimate AS cases_per_100Kpop, 100000.*deaths/population_estimate AS deaths_per_100Kpop
  FROM (
SELECT CAST(f.date AS date) AS date, f.county, p.fips, SUM(f.cases) as cases, SUM(f.deaths) as deaths, MAX(p.POPESTIMATE2019) AS population_estimate  
  FROM nyt_daily f 
    JOIN fips_popest_county p
      ON p.fips = f.fips
 WHERE f.state = 'Florida' 
   AND date BETWEEN '2020-06-22T00:00:00.000+0000' AND '2020-07-27T00:00:00.000+0000'
 GROUP BY f.date, f.county, p.fips
) a""")
fl_state_window.createOrReplaceTempView("fl_state_window")

# NY State 2 week window (-1 week): 2020-03-06 .. 2020-03-20, day_num 100 == 2020-03-06
# New York rows without a fips are mapped to 36061 (presumably because the NYT
# data reports New York City without a county FIPS — see the dataset README).
ny_state_window_m1 = spark.sql("""
SELECT date, 100 + datediff(date, '2020-03-06T00:00:00.000+0000') as day_num, county, fips, cases, deaths, 100000.*cases/population_estimate AS cases_per_100Kpop, 100000.*deaths/population_estimate AS deaths_per_100Kpop
  FROM (
SELECT CAST(f.date AS date) AS date, f.county, p.fips, SUM(f.cases) as cases, SUM(f.deaths) as deaths, MAX(p.POPESTIMATE2019) AS population_estimate  
  FROM nyt_daily f 
    JOIN fips_popest_county p
      ON p.fips = coalesce(f.fips, 36061)
 WHERE f.state = 'New York' 
   AND date BETWEEN '2020-03-06T00:00:00.000+0000' AND '2020-03-20T00:00:00.000+0000'
 GROUP BY f.date, f.county, p.fips
) a""")
ny_state_window_m1.createOrReplaceTempView("ny_state_window_m1")

## COVID-19 Cases for TX and NY Counties

In [17]:
%sql
-- Total cases per date across every row of tx_state_window, oldest date first.
SELECT date,
       sum(cases)
FROM tx_state_window
GROUP BY date
ORDER BY date

date,sum(cases)
2020-03-27,1971
2020-03-28,2381
2020-03-29,2722
2020-03-30,3065
2020-03-31,3588
2020-04-01,4402
2020-04-02,4952
2020-04-03,5773
2020-04-04,6566
2020-04-05,7117


In [18]:
%sql
-- Per-date case counts, restricted to the 15 counties whose cases summed over
-- the whole window are largest.
SELECT f.date,
       f.county,
       f.cases
FROM tx_state_window AS f
INNER JOIN (
    SELECT county,
           sum(cases) AS Cases
    FROM tx_state_window
    GROUP BY County
    ORDER BY cases DESC
    LIMIT 15
) AS x
  ON x.county = f.county

date,county,cases
2020-04-26,Galveston,541
2020-06-16,Montgomery,1359
2020-06-23,Cameron,1647
2020-07-01,Bexar,12516
2020-07-07,El Paso,7642
2020-05-16,Galveston,701
2020-06-05,Denton,1467
2020-05-21,Galveston,732
2020-05-29,Potter,2317
2020-07-12,Bexar,19660


In [19]:
%sql
-- Per-date case counts for the 10 counties with the largest summed cases.
-- NOTE(review): the view `ny_state_window` is not created anywhere in the
-- visible notebook (only `ny_state_window_m1` is), so this cell fails on a
-- fresh run unless that view is defined elsewhere — confirm and restore it.
SELECT f.date, f.county, f.cases 
  FROM ny_state_window f
  JOIN (
      SELECT county, sum(cases) as cases FROM ny_state_window GROUP BY county ORDER BY cases DESC LIMIT 10
    ) x ON x.county = f.county

In [20]:
%sql
-- Per-date case counts from the NY (-1 week) window, restricted to the 10
-- counties whose cases summed over the window are largest.
SELECT f.date,
       f.county,
       f.cases
FROM ny_state_window_m1 AS f
INNER JOIN (
    SELECT county,
           sum(cases) AS cases
    FROM ny_state_window_m1
    GROUP BY county
    ORDER BY cases DESC
    LIMIT 10
) AS x
  ON x.county = f.county

## COVID-19 Cases per 100K people for TX and NY Counties
The concern with the above graphs is that, while they show the total number of cases, it is hard to compare TX state and NY state because their population densities differ.  While not perfect, a better way to look at these numbers is to review them as a proportion of the population estimates.

Let's look at these values by a percentage of the population; the numbers used are the 2019 US Census estimates of county populations.

*Note, reviewing the top 10 counties by case (vs. % of cases)*

In [22]:
%sql
-- Cases per 100K residents by date. Counties are still chosen by their summed
-- absolute case counts (top 15), not by the per-100K rate.
SELECT f.date,
       f.county,
       f.cases_per_100Kpop
FROM tx_state_window AS f
INNER JOIN (
    SELECT county,
           sum(cases) AS cases
    FROM tx_state_window
    GROUP BY county
    ORDER BY cases DESC
    LIMIT 15
) AS x
  ON x.county = f.county

date,county,cases_per_100Kpop
2020-04-26,Galveston,158.122868191
2020-06-16,Montgomery,223.74384869055
2020-06-23,Cameron,389.21172219688
2020-07-01,Bexar,624.6899260015
2020-07-07,El Paso,910.58793810576
2020-05-16,Galveston,204.88748724933
2020-06-05,Denton,165.35036355664
2020-03-27,Dallas,13.9251668364
2020-03-29,Montgomery,10.53686998984
2020-04-22,Galveston,143.50892473527


In [23]:
%sql
-- Cases per 100K residents by date for Bexar county only. Rows are returned
-- only while Bexar is among the 15 counties with the largest summed cases.
SELECT f.date,
       f.county,
       f.cases_per_100Kpop
FROM tx_state_window AS f
INNER JOIN (
    SELECT county,
           sum(cases) AS cases
    FROM tx_state_window
    GROUP BY county
    ORDER BY cases DESC
    LIMIT 15
) AS x
  ON x.county = f.county
WHERE x.county = "Bexar"

In [24]:
%sql
-- Cases per 100K residents by date for the 10 counties with the largest summed cases.
-- NOTE(review): the view `ny_state_window` is not created anywhere in the
-- visible notebook (only `ny_state_window_m1` is), so this cell fails on a
-- fresh run unless that view is defined elsewhere — confirm and restore it.
SELECT f.date, f.county, f.cases_per_100Kpop 
  FROM ny_state_window f
  JOIN (
      SELECT county, sum(cases) as cases FROM ny_state_window GROUP BY county ORDER BY cases DESC LIMIT 10
    ) x ON x.county = f.county

In [25]:
%sql
-- Cases per 100K residents by date from the NY (-1 week) window, restricted to
-- the 10 counties whose summed absolute case counts are largest.
SELECT f.date,
       f.county,
       f.cases_per_100Kpop
FROM ny_state_window_m1 AS f
INNER JOIN (
    SELECT county,
           sum(cases) AS cases
    FROM ny_state_window_m1
    GROUP BY county
    ORDER BY cases DESC
    LIMIT 10
) AS x
  ON x.county = f.county

## Visualize Cases by State Choropleth Maps
* Join the data with `map_fips_dedup` to obtain the county centroid lat, long_

In [27]:
# Extract Day Number and county centroid lat, long_
def _daynum_with_centroid(window_view):
    """Join one state window view to map_fips_dedup and return the map-ready frame.

    window_view is the name of a registered temp view that provides fips,
    county, date, day_num, cases, deaths, cases_per_100Kpop and
    deaths_per_100Kpop. The result renames cases to `confirmed`, truncates the
    per-100K rates to int, and adds the county centroid (lat, long_).

    fips is cast to int for every state. nyt_daily infers fips as a
    floating-point column (it displays as 53061.0), and the TX query previously
    passed it through uncast while the FL and NY queries cast it; a float fips
    becomes "48275.0" when the slider function stringifies it.
    """
    return spark.sql(f"""
        select cast(f.fips as int) as fips,
               f.county,
               f.date,
               f.day_num,
               cases as confirmed,
               cast(f.cases_per_100Kpop as int) as confirmed_per100K,
               deaths,
               cast(f.deaths_per_100Kpop as int) as deaths_per100K,
               m.lat,
               m.long_
          from {window_view} f
          join map_fips_dedup m
            on m.fips = f.fips""")


tx_daynum = _daynum_with_centroid("tx_state_window")
tx_daynum.createOrReplaceTempView("tx_daynum")
fl_daynum = _daynum_with_centroid("fl_state_window")
fl_daynum.createOrReplaceTempView("fl_daynum")
ny_daynum_m1 = _daynum_with_centroid("ny_state_window_m1")
ny_daynum_m1.createOrReplaceTempView("ny_daynum_m1")

In [28]:
# Preview the Texas per-day frame (counts, per-100K rates, county centroid).
display(tx_daynum)

fips,county,date,day_num,confirmed,confirmed_per100K,deaths,deaths_per100K,lat,long_
48275,Knox,2020-05-02,137,1,27,0,0,33.6061,-99.7414
48503,Young,2020-05-03,138,4,22,1,5,33.1767,-98.6878
48237,Jack,2020-05-04,139,4,44,0,0,33.2335,-98.1725
48037,Bowie,2020-05-05,140,104,111,10,10,33.4454,-94.4229
48171,Gillespie,2020-05-05,140,4,14,0,0,30.3181,-98.9464
48217,Hill,2020-05-05,140,18,49,1,2,31.9907,-97.1324
48069,Castro,2020-05-06,141,20,265,1,13,34.53,-102.2617
48277,Lamar,2020-05-06,141,71,142,0,0,33.6671,-95.5711
48175,Goliad,2020-05-07,142,7,91,0,0,28.6571,-97.4264
48119,Delta,2020-05-08,143,1,18,0,0,33.3863,-95.6723


In [29]:
# `col` is imported here because this cell uses it; previously it was only
# imported by a later cell, so a fresh top-to-bottom run raised NameError.
from pyspark.sql.functions import col, month, weekofyear

# Tag every Texas row with its week-of-year and month number ...
dfmax = (
    tx_daynum
    .withColumn('week_of_year', weekofyear(tx_daynum.date))
    .withColumn('month', month(tx_daynum.date))
)
# ... and keep only these 15 counties (presumably the top-15 counties by cases
# from the earlier query — verify if the window changes).
dfmax = dfmax.where(col('county').isin({'Bexar', 'Brazoria', 'Cameron', 'Collin', 'Dallas', 'Denton', 'El Paso', 'Fort Bend', 'Galveston', 'Harris', 'Hidalgo', 'Montgomery', 'Potter', 'Tarrant', 'Travis'}))

In [30]:
from pyspark.sql.functions import avg, stddev, col

# Weekly mean / standard deviation of deaths per 100K for each county.
# The aggregate gets its own name instead of overwriting `dfmax`: the monthly
# cell further down still needs dfmax's `month` and `confirmed_per100K`
# columns, which the aggregated frame does not have.
weekly_death_stats = (
    dfmax
    .select("county", "week_of_year", "deaths_per100K")
    .groupBy("county", "week_of_year")
    .agg(
        avg(col("deaths_per100K")).alias('mean'),
        stddev(col("deaths_per100K")).alias('stdev'),
    )
)

# Coefficient of variation (stdev / mean); rows containing a null in any
# column are dropped.
cvdf = weekly_death_stats.withColumn('cov', col("stdev") / col("mean")).na.drop()

In [31]:
from pyspark.sql.functions import asc, desc

# Weekly coefficient-of-variation rows after week 18, earliest week first.
late_weeks = cvdf.filter(cvdf.week_of_year > 18)
display(late_weeks.sort(asc('week_of_year')))

county,week_of_year,mean,stdev,cov
Dallas,19,4.428571428571429,0.5345224838248487,0.1206986253798045
Collin,19,2.0,0.0,0.0
Brazoria,19,1.5714285714285714,0.5345224838248488,0.3401506715249038
Travis,19,4.0,0.0,0.0
Bexar,19,2.0,0.0,0.0
Denton,19,2.0,0.0,0.0
Fort Bend,19,3.571428571428572,0.5345224838248487,0.1496662954709576
Tarrant,19,3.7142857142857135,0.4879500364742665,0.1313711636661486
Cameron,19,4.285714285714286,0.4879500364742665,0.1138550085106621
El Paso,19,2.571428571428572,0.5345224838248487,0.2078698548207744


In [32]:
from pyspark.sql.functions import avg, stddev, col

# Monthly mean / standard deviation of confirmed cases per 100K for each county.
# Reads the `month` and `confirmed_per100K` columns of dfmax.
monthly_input = dfmax.select("county", "month", "confirmed_per100K")
dfmax1 = monthly_input.groupBy("county", "month").agg(
    avg(col("confirmed_per100K")).alias('mean'),
    stddev(col("confirmed_per100K")).alias('stdev'),
)

# Coefficient of variation (stdev / mean); rows containing a null are dropped.
cov_ratio = col("stdev") / col("mean")
cvdf1 = dfmax1.withColumn('cov', cov_ratio).na.drop()

In [33]:
# NOTE(review): this cell looks like an abandoned first attempt at labelling months.
# Every statement starts again from cvdf1, so after the cell runs cvdf2 only
# reflects the last call (the "7" -> "July" replacement), and cvdf2 is
# reassigned by the next cell before anything reads it. Presumably
# DataFrame.replace with string arguments also leaves a numeric `month` column
# untouched — TODO confirm, then consider deleting this cell.
cvdf2 = cvdf1.replace("3","March")
cvdf2 = cvdf1.replace("4","April")
cvdf2 = cvdf1.replace("5","May")
cvdf2 = cvdf1.replace("6","June")
cvdf2 = cvdf1.replace("7","July")

In [34]:
# Import only what this cell uses. A star import of pyspark.sql.functions
# rebinds built-ins such as min, max, sum, abs and round to Spark column
# functions for every cell that runs afterwards.
from pyspark.sql.functions import regexp_replace

# Build a sortable 'MonthName' label ("3 March" ... "7 July") from the numeric
# month. Each step rewrites one digit; the intermediate frames keep their
# original names.
cvdf2 = cvdf1.withColumn('MonthName', regexp_replace('month', '3', '3 March'))
cvdf3 = cvdf2.withColumn('MonthName', regexp_replace('MonthName', '4', '4 April'))
cvdf4 = cvdf3.withColumn('MonthName', regexp_replace('MonthName', '5', '5 May'))
cvdf5 = cvdf4.withColumn('MonthName', regexp_replace('MonthName', '6', '6 June'))
cvdf6 = cvdf5.withColumn('MonthName', regexp_replace('MonthName', '7', '7 July'))

In [35]:
# Show the monthly statistics ordered by the 'MonthName' label; the leading
# digit in labels such as "3 March" keeps these months in calendar order.
# `asc` comes from the import in an earlier cell.
display(cvdf6.sort(asc('MonthName')))

county,month,mean,stdev,cov,MonthName
El Paso,3,4.2,0.8366600265340756,0.1992047682223989,3 March
Brazoria,3,16.0,5.612486080160912,0.350780380010057,3 March
Montgomery,3,9.4,2.5099800796022267,0.2670191574044921,3 March
Cameron,3,4.0,1.224744871391589,0.3061862178478972,3 March
Collin,3,13.0,1.8708286933869709,0.1439098994913054,3 March
Hidalgo,3,2.8,0.8366600265340756,0.2988071523335984,3 March
Dallas,3,17.6,3.911521443121589,0.2222455365409993,3 March
Harris,3,9.8,3.701351104664349,0.377688888231056,3 March
Travis,3,14.0,2.0,0.1428571428571428,3 March
Bexar,3,6.8,1.9235384061671343,0.2828732950245786,3 March


In [36]:
# Obtain Topography
# Boundary file URLs (topo_usa and topo_tx repeat the values assigned in the
# imports cell at the top of the notebook).
topo_usa = 'https://vega.github.io/vega-datasets/data/us-10m.json'
topo_tx = 'https://raw.githubusercontent.com/deldersveld/topojson/master/countries/us-states/tx-48-texas-counties.json'
topo_ny = 'https://raw.githubusercontent.com/deldersveld/topojson/master/countries/us-states/NY-36-new-york-counties.json'
topo_fl = 'https://raw.githubusercontent.com/deldersveld/topojson/master/countries/us-states/FL-12-florida-counties.json'
# Altair feature references; the second argument names the object to extract
# from each file. us_counties is read as a global by map_state and
# map_state_graph; the per-state features are passed to the map functions as
# their `state_counties` argument.
us_counties = alt.topo_feature(topo_usa, 'counties')
tx_counties = alt.topo_feature(topo_tx, 'cb_2015_texas_county_20m')
ny_counties = alt.topo_feature(topo_ny, 'cb_2015_new_york_county_20m')
fl_counties = alt.topo_feature(topo_fl, 'cb_2015_florida_county_20m')

In [37]:
def _split_confirmed_and_deaths(daynum_df):
    """Return (confirmed, deaths) pandas frames for one state's day-number frame.

    confirmed keeps the rows with confirmed > 0 and is keyed by fips; deaths
    keeps the rows with deaths > 0 and is keyed by centroid lat / long_. The
    `date` column of both frames is converted to str.
    """
    confirmed_pdf = (
        daynum_df
        .select("fips", "day_num", "date", "confirmed", "confirmed_per100K", "county")
        .where("confirmed > 0")
        .toPandas()
    )
    confirmed_pdf['date'] = confirmed_pdf['date'].astype(str)

    deaths_pdf = (
        daynum_df
        .select("lat", "long_", "day_num", "date", "deaths", "deaths_per100K", "county")
        .where("deaths > 0")
        .toPandas()
    )
    deaths_pdf['date'] = deaths_pdf['date'].astype(str)
    return confirmed_pdf, deaths_pdf


# NOTE(review): later cells also reference confirmed_ny / deaths_ny, which this
# cell does not create.

# Review TX
confirmed_tx, deaths_tx = _split_confirmed_and_deaths(tx_daynum)

# Review FL
confirmed_fl, deaths_fl = _split_confirmed_and_deaths(fl_daynum)

# Review NY -1 week
confirmed_ny_m1, deaths_ny_m1 = _split_confirmed_and_deaths(ny_daynum_m1)

In [38]:
# State Choropleth Map Visualization Function
def map_state(curr_day_num, state_txt, state_counties, confirmed, confirmed_min, confirmed_max, deaths, deaths_min, deaths_max):
  """Draw one day's confirmed-per-100K choropleth with deaths-per-100K points.

  Parameters
    curr_day_num    day_num value to plot; rows of `confirmed` and `deaths`
                    are filtered to this day.
    state_txt       label interpolated into the chart title.
    state_counties  Altair feature reference for the state's county outlines
                    (drawn as the white base layer).
    confirmed       pandas DataFrame with columns fips, day_num, date,
                    confirmed, confirmed_per100K, county.
    confirmed_min, confirmed_max
                    domain of the log colour scale.
    deaths          pandas DataFrame with columns lat, long_, day_num, date,
                    deaths, deaths_per100K, county.
    deaths_min, deaths_max
                    domain of the symlog point-size scale.

  Returns the layered Altair chart (base outlines + coloured counties + points).

  Reads the global `us_counties` feature reference for the coloured layer.

  The earlier revision looked up the date string of day 101 regardless of
  `curr_day_num`, never used it, and raised when the frame had no day-101 row;
  that lookup has been removed.
  """
  # Rows for the requested day only (and only positive per-100K values)
  confirmed_today = confirmed[(confirmed['confirmed_per100K'] > 0) & (confirmed['day_num'] == curr_day_num)]
  deaths_today = deaths[(deaths['deaths_per100K'] > 0) & (deaths['day_num'] == curr_day_num)]

  # State
  base_state = alt.Chart(state_counties).mark_geoshape(
      fill='white',
      stroke='lightgray',
  ).properties(
      width=800,
      height=600,
  ).project(
      type='mercator'
  )

  # counties: colour each US county whose id matches a fips in confirmed_today
  base_state_counties = alt.Chart(us_counties).mark_geoshape(
  ).transform_lookup(
    lookup='id',
    from_=alt.LookupData(confirmed_today, 'fips', ['confirmed_per100K', 'confirmed', 'county', 'date', 'fips'])
  ).encode(
     color=alt.Color('confirmed_per100K:Q', scale=alt.Scale(type='log', domain=[confirmed_min, confirmed_max]), title='Confirmed per 100K'),
    tooltip=[
      alt.Tooltip('fips:O'),
      alt.Tooltip('confirmed:Q'),
      alt.Tooltip('confirmed_per100K:Q'),
      alt.Tooltip('county:N'),
      alt.Tooltip('date:N'),
    ],
  )

  # deaths by long, latitude
  points = alt.Chart(deaths_today).mark_point(opacity=0.75, filled=True).encode(
    longitude='long_:Q',
    latitude='lat:Q',
    size=alt.Size('sum(deaths_per100K):Q', scale=alt.Scale(type='symlog', domain=[deaths_min, deaths_max]), title='Deaths per 100K'),
    color=alt.value('#BD595D'),
    stroke=alt.value('brown'),
    tooltip=[
      alt.Tooltip('lat'),
      alt.Tooltip('long_'),
      alt.Tooltip('deaths'),
      alt.Tooltip('county:N'),
      alt.Tooltip('date:N'),
    ],
  ).properties(
    # update figure title
    title=f'COVID-19 {state_txt} Confirmed Cases and Deaths per 100K by County'
  )

  return (base_state + base_state_counties + points)

### Texas (cases and deaths per 100K)

| Variables| Date | Cases | Deaths |
| ------- | -- | -- | -- |
| Day 00 | 3/26/2020 | 1,396 | 18 |
| Day 21 | 4/16/2020 | 16,455 | 393 |
| Day 42 | 5/07/2020 | 35,390 | 973 |
| Day 63 | 5/28/2020 | 59,776 | 1,601 |
| Day 84 | 6/18/2020 | 99,851 | 2,105 |
| Day 105 | 7/09/2020 | 230,346 | 2,918 |
| Day 123 | 7/26/2020 | 381,656 | 5,038 |
| Reopening Phase 1 | 5/01/2020 | 29,229 | 816 |
| Reopening Phase 2 | 5/18/2020 | 48,639 | 1,347 |
| Reopening Phase 3 | 6/03/2020 | 68,271 | 1,734 |
| Texas Mask Mandate | 7/02/2020 | 175,977 | 2,525 |

In [41]:
# day_num 214 = 100 + 114 days after 2020-03-26 (the tx_state_window reference date), i.e. 2020-07-18.
map_state(214, 'TX', tx_counties, confirmed_tx, 1, 5000, deaths_tx, 1, 100)

In [42]:
# day_num 198 = 100 + 98 days after 2020-03-26, i.e. 2020-07-02 (listed as the Texas Mask Mandate date in the table above).
map_state(198, 'TX', tx_counties, confirmed_tx, 1, 5000, deaths_tx, 1, 100)

In [43]:
# day_num 214 = 2020-07-18 again.
# NOTE(review): same arguments as the first Texas map cell — possibly a leftover duplicate.
map_state(214, 'TX', tx_counties, confirmed_tx, 1, 5000, deaths_tx, 1, 100)

In [44]:
# day_num 163 = 100 + 63 days after 2020-03-26, i.e. 2020-05-28 ("Day 63" in the table above).
# Uses a narrower colour domain (max 2000) than the later-date Texas maps.
map_state(163, 'TX', tx_counties, confirmed_tx, 1, 2000, deaths_tx, 1, 100)

### Florida and NY State (cases and deaths per 100K)

In [46]:
# day_num 106 = 100 + 6 days after 2020-06-29 (the fl_state_window reference date), i.e. 2020-07-05.
map_state(106, 'FL', fl_counties, confirmed_fl, 1, 2000, deaths_fl, 1, 200)

In [47]:
# day_num 101 = 1 day after 2020-06-29, i.e. 2020-06-30.
map_state(101, 'FL', fl_counties, confirmed_fl, 1, 1500, deaths_fl, 1, 20)

In [48]:
# day_num 101 = 2020-06-30 again.
# NOTE(review): same arguments as the previous cell — possibly a leftover duplicate.
map_state(101, 'FL', fl_counties, confirmed_fl, 1, 1500, deaths_fl, 1, 20)

In [49]:
# NOTE(review): confirmed_ny and deaths_ny are not created anywhere in the
# visible notebook (only confirmed_ny_m1 / deaths_ny_m1 are), so this cell
# raises NameError on a fresh run — confirm which NY frames were intended.
map_state(114, 'NY', ny_counties, confirmed_ny, 1, 1500, deaths_ny, 1, 20)

In [50]:
# day_num 106 = 2020-07-05 again.
# NOTE(review): same arguments as the first Florida map cell — possibly a leftover duplicate.
map_state(106, 'FL', fl_counties, confirmed_fl, 1, 2000, deaths_fl, 1, 200)

## COVID-19 Confirmed Cases and Deaths by TX and NY County Slider

In [52]:
# State Choropleth Map Visualization Function
def map_state_slider(state_txt, state_counties, confirmed, confirmed_min, confirmed_max, deaths, deaths_min, deaths_max, state_fips):
  """Draw confirmed cases (choropleth) and deaths (points) with a day_num slider.

  Parameters
    state_txt       label interpolated into the chart title.
    state_counties  Altair feature reference for the state's county outlines.
    confirmed       pandas DataFrame with columns fips, day_num, confirmed.
    confirmed_min, confirmed_max
                    domain of the symlog colour scale.
    deaths          pandas DataFrame with columns lat, long_, day_num, deaths.
    deaths_min, deaths_max
                    domain of the symlog point-size scale.
    state_fips      numeric state code; counties are kept when
                    floor(county id / 1000) equals it.

  Returns the layered Altair chart (base outlines + coloured counties + points).

  Side effect: disables Altair's max-rows limit for the session. Reads the
  global `topo_usa` URL.
  """
  # Pivot confirmed data by day_num: one row per fips, one column per day
  confirmed_pv = confirmed[['fips', 'day_num', 'confirmed']].copy()
  confirmed_pv['fips'] = confirmed_pv['fips'].astype(str)
  confirmed_pv['day_num'] = confirmed_pv['day_num'].astype(str)
  confirmed_pv['confirmed'] = confirmed_pv['confirmed'].astype('int64')
  confirmed_pv = confirmed_pv.pivot_table(index='fips', columns='day_num', values='confirmed', fill_value=0).reset_index()

  # Pivot deaths data by day_num: one row per centroid, one column per day
  deaths_pv = deaths[['lat', 'long_', 'day_num', 'deaths']].copy()
  deaths_pv['day_num'] = deaths_pv['day_num'].astype(str)
  deaths_pv['deaths'] = deaths_pv['deaths'].astype('int64')
  deaths_pv = deaths_pv.pivot_table(index=['lat', 'long_'], columns='day_num', values='deaths', fill_value=0).reset_index()

  # Extract column names for slider
  column_names = confirmed_pv.columns.tolist()

  # Remove first element (`fips`)
  column_names.pop(0)

  # Day numbers as ints, sorted numerically. The column names are strings and
  # therefore ordered lexicographically ('100' sorts before '93'), so taking
  # the first/last name would give wrong slider bounds when the day numbers do
  # not all have the same number of digits. sorted() is used rather than
  # min()/max() because those built-ins can be shadowed when
  # pyspark.sql.functions is star-imported in the notebook.
  column_values = sorted(int(name) for name in column_names)

  # Disable max_rows to see more data
  alt.data_transformers.disable_max_rows()

  # Topographic information
  us_counties = alt.topo_feature(topo_usa, 'counties')

  # state county boundaries
  base_state = alt.Chart(state_counties).mark_geoshape(
      fill='white',
      stroke='lightgray',
  ).properties(
      width=800,
      height=600,
  ).project(
      type='mercator'
  )

  # Slider choices
  min_day_num = column_values[0]
  max_day_num = column_values[-1]
  slider = alt.binding_range(min=min_day_num, max=max_day_num, step=1)
  slider_selection = alt.selection_single(fields=['day_num'], bind=slider, name="day_num", init={'day_num':min_day_num})


  # Confirmed cases by county
  base_state_counties = alt.Chart(us_counties).mark_geoshape(
      stroke='black',
      strokeWidth=0.05
  ).transform_lookup(
      lookup='id',
      from_=alt.LookupData(confirmed_pv, 'fips', column_names)
  ).transform_fold(
      column_names, as_=['day_num', 'confirmed']
  ).transform_calculate(
      state_id = "(datum.id / 1000)|0",
      day_num = 'parseInt(datum.day_num)',
      confirmed = 'isValid(datum.confirmed) ? datum.confirmed : -1'
  ).encode(
      color = alt.condition(
          'datum.confirmed > 0',
          alt.Color('confirmed:Q', scale=alt.Scale(domain=(confirmed_min, confirmed_max), type='symlog')),
          alt.value('white')
        )
  ).properties(
    # update figure title (uses the caller's state label instead of a fixed 'TX')
    title=f'COVID-19 {state_txt} State Confirmed Cases by County'
  ).transform_filter(
      (alt.datum.state_id)==state_fips
  ).transform_filter(
      slider_selection
  )

  # deaths by long, latitude
  # The fold uses the confirmed frame's day columns; a day that is absent from
  # deaths_pv yields an invalid value, which the calculate step maps to -1.
  points = alt.Chart(
    deaths_pv
  ).mark_point(
    opacity=0.75, filled=True
  ).transform_fold(
    column_names, as_=['day_num', 'deaths']
  ).transform_calculate(
      day_num = 'parseInt(datum.day_num)',
      deaths = 'isValid(datum.deaths) ? datum.deaths : -1'
  ).encode(
    longitude='long_:Q',
    latitude='lat:Q',
    size=alt.Size('deaths:Q', scale=alt.Scale(domain=(deaths_min, deaths_max), type='symlog'), title='deaths'),
    color=alt.value('#BD595D'),
    stroke=alt.value('brown'),
  ).add_selection(
      slider_selection
  ).transform_filter(
      slider_selection
  )

  # confirmed cases (base_counties) and deaths (points)
  return (base_state + base_state_counties + points)

| Factors | WA | NY | 
| ------- | -- | -- | 
| Educational Facilities Closed | 3/13/2020 | 3/18/2020 |
| Day 00 | 3/6/2020 | 3/11/2020 |
| Day 14 | 3/20/2020 | 3/25/2020 | 
| Max Cases | 794 | 20011 |
| Max Deaths | 68 | 280 |
| Max Cases per 100K | 50.55 | 1222.97 | 
| Max Deaths per 100K | 3.27 | 17.11 |

In [54]:
# state_fips must be Texas (48; Texas county FIPS look like 48275). The previous
# value 53 is Washington's code, which filtered every Texas county out of the map.
# NOTE(review): the 800 / 70 scale maxima match the WA column of the table above
# and are probably too small for the Texas counts — tune as needed.
map_state_slider('TX', tx_counties, confirmed_tx, 1, 800, deaths_tx, 1, 70, 48)

In [55]:
# NOTE(review): confirmed_ny and deaths_ny are not created anywhere in the
# visible notebook (only confirmed_ny_m1 / deaths_ny_m1 are), so this cell
# raises NameError on a fresh run — confirm which NY frames were intended.
map_state_slider('NY', ny_counties, confirmed_ny, 1, 21000, deaths_ny, 1, 300, 36)

## COVID-19 Confirmed Cases and Deaths by TX and NY County Map and Graph

In [57]:
# map_state_graph
def map_state_graph(state_txt, state_counties, confirmed, confirmed_min, confirmed_max, deaths, deaths_min, deaths_max, state_fips):
  """Draw a county map linked to a per-date bar chart of confirmed cases per 100K.

  Clicking a bar selects a date; the map's coloured counties and death points
  are filtered by that selection.

  Parameters
    state_txt       accepted for signature parity with the other map
                    functions; not used in the chart.
    state_counties  Altair feature reference for the state's county outlines.
    confirmed       pandas DataFrame with columns fips, date, confirmed,
                    confirmed_per100K.
    confirmed_min, confirmed_max
                    domain of the symlog colour scale.
    deaths          pandas DataFrame with columns lat, long_, date, deaths,
                    county.
    deaths_min, deaths_max
                    domain of the point-size scale.
    state_fips      numeric state code; counties are kept when
                    floor(county id / 1000) equals it.

  Returns the map layers stacked vertically above the bar chart.

  Side effect: disables Altair's max-rows limit for the session. Reads the
  global `us_counties` feature reference.
  """
  # The full confirmed/deaths frames are embedded in the chart, so lift Altair's
  # row limit here instead of relying on map_state_slider having run first.
  alt.data_transformers.disable_max_rows()

  # pivot confirmed cases (by date): one row per fips, one column per date
  confirmed_pv2 = confirmed[['fips', 'date', 'confirmed']].copy()
  confirmed_pv2['fips'] = confirmed_pv2['fips'].astype(str)
  confirmed_pv2['date'] = confirmed_pv2['date'].astype(str)
  confirmed_pv2['confirmed'] = confirmed_pv2['confirmed'].astype('int64')
  confirmed_pv2 = confirmed_pv2.pivot_table(index='fips', columns='date', values='confirmed', fill_value=0).reset_index()

  # (The deaths pivot built by the earlier revision was never read and has been
  # dropped; the points layer below uses the un-pivoted `deaths` frame.)

  # Extract column names for the fold
  column_names2 = confirmed_pv2.columns.tolist()

  # Remove first element (`fips`)
  column_names2.pop(0)

  # date selection
  pts = alt.selection(type="single", encodings=['x'])

  # State
  base_state = alt.Chart(state_counties).mark_geoshape(
      fill='white',
      stroke='lightgray',
  ).properties(
      width=800,
      height=600,
  ).project(
      type='mercator'
  )

  # State Counties
  base_state_counties = alt.Chart(us_counties).mark_geoshape(
    stroke='black',
    strokeWidth=0.05,
  ).transform_lookup(
    lookup='id',
    from_=alt.LookupData(confirmed_pv2, 'fips', column_names2)
  ).transform_fold(
    column_names2, as_=['date', 'confirmed']
  ).transform_calculate(
      state_id = "(datum.id / 1000)|0",
      date = 'datum.date',
      confirmed = 'isValid(datum.confirmed) ? datum.confirmed : -1'
  ).encode(
       color = alt.condition(
          'datum.confirmed > 0',
          alt.Color('confirmed:Q', scale=alt.Scale(domain=(confirmed_min, confirmed_max), type='symlog')),
          alt.value('white')
        )
  ).transform_filter(
    pts
  ).transform_filter(
      (alt.datum.state_id)==state_fips
  )

  # Bar Graph
  bar = alt.Chart(confirmed).mark_bar().encode(
      x='date:N',
      y='confirmed_per100K:Q',
      color=alt.condition(pts, alt.ColorValue("steelblue"), alt.ColorValue("grey"))
  ).properties(
      width=800,
      height=200,
      title='Confirmed Cases per 100K'
  ).add_selection(pts)

  # Deaths
  points = alt.Chart(deaths).mark_point(opacity=0.75, filled=True).encode(
    longitude='long_:Q',
    latitude='lat:Q',
    size=alt.Size('sum(deaths):Q', scale=alt.Scale(domain=[deaths_min, deaths_max]), title='Deaths'),
    color=alt.value('#BD595D'),
    stroke=alt.value('brown'),
    tooltip=[
      alt.Tooltip('lat'),
      alt.Tooltip('long_'),
      alt.Tooltip('deaths'),
      alt.Tooltip('county:N'),
      alt.Tooltip('date:N'),
    ],
  ).properties(
    # update figure title
    title=f'COVID-19 Confirmed Cases and Deaths by County'
  ).transform_filter(
      pts
  )

  return (base_state + base_state_counties + points) & bar

In [58]:
# state_fips must be Texas (48; Texas county FIPS look like 48275). The previous
# value 53 is Washington's code, which filtered every Texas county out of the map.
# NOTE(review): the 800 / 70 scale maxima match the WA column of the table above
# and are probably too small for the Texas counts — tune as needed.
map_state_graph('TX', tx_counties, confirmed_tx, 1, 800, deaths_tx, 1, 70, 48)

In [59]:
# NOTE(review): confirmed_ny and deaths_ny are not created anywhere in the
# visible notebook (only confirmed_ny_m1 / deaths_ny_m1 are), so this cell
# raises NameError on a fresh run — confirm which NY frames were intended.
map_state_graph('NY', ny_counties, confirmed_ny, 1, 21000, deaths_ny, 1, 300, 36)

In [60]:
# NY (-1 week) window: pair the _m1 confirmed frame with the matching _m1 deaths
# frame (the plain deaths_ny name is never defined in this notebook).
map_state_graph('NY', ny_counties, confirmed_ny_m1, 1, 4500, deaths_ny_m1, 1, 70, 36)