In [None]:
# Import libraries and setup
import os
from pathlib import Path
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import scipy.stats as st
from scipy.stats import linregress
import chardet
import gmaps
import warnings
warnings.simplefilter("ignore")

# Import API key
from api_keys import g_key

In [None]:
# Access maps with unique API key
# g_key is imported from the local api_keys module in the import cell; gmaps is configured once
# here, before any gmaps.figure(...) heatmap is created further down
gmaps.configure(api_key = g_key)

# Self-Reported Mask Wearing<br>and COVID-19 Cases and Deaths

### as of July 14, 2020

### Core message of this project:
We aim to determine the correlation between self-reported mask-wearing behavior per U.S. county and COVID-19 case and death rates per county as of July 14, 2020.

### Alternative Hypothesis:
COVID-19 case and death rates will be lower in counties with higher mask-wearing scores.

# Extract

### U.S. Census 2010-2019

In [None]:
# U.S. Census 2010-2019
# Local copy of the census .CSV, and the page that documents it
censusDataFilepath = "Resources/co-est2019-alldata_exp.csv"
censusDataReadMeURL = "https://www.census.gov/data/tables/time-series/demo/popest/2010s-counties-total.html"

# Report the file size in megabytes (bytes / 1024 / 1024, rounded to 2 decimal places)
censusDataSizeMB = round(os.path.getsize(censusDataFilepath)/1024/1024, 2)
print(f"{censusDataFilepath} is {censusDataSizeMB} megabytes (MB).\nMore info here:\n{censusDataReadMeURL}")

# Load the CSV into the censusData DataFrame (the file is decoded as ISO-8859-1)
censusData = pd.read_csv(censusDataFilepath, encoding = "iso-8859-1")

# Extract

### U.S. COVID-19 Cases & Deaths

In [None]:
# COVID-19 cases and deaths
# Local copy of the cases/deaths .CSV, and the README that documents it
caseDataFilepath = "Resources/us-counties.csv"
caseDataReadMeURL = "https://github.com/nytimes/covid-19-data/blob/master/README.md"

# Report the file size in megabytes (bytes / 1024 / 1024, rounded to 2 decimal places)
caseDataSizeMB = round(os.path.getsize(caseDataFilepath)/1024/1024, 2)
print(f"The file at {caseDataFilepath} is {caseDataSizeMB} MB.\nMore info here:\n{caseDataReadMeURL}")

# Load the CSV into the caseData DataFrame (the file is decoded as UTF-8)
caseData = pd.read_csv(caseDataFilepath, encoding = "UTF-8")

# Extract

### U.S. Landmass Data (by County)

In [None]:
# Landmass (and thence population density)
# Local copy of the landmass .CSV, and the page that documents it
landMassDataFilepath = "Resources/counties-by-land-area.csv"
landMassDataReadMeURL = "https://hub.arcgis.com/datasets/48f9af87daa241c4b267c5931ad3b226_0/data?orderBy=FIPS"

# Report the file size in megabytes (bytes / 1024 / 1024, rounded to 2 decimal places)
landMassDataSizeMB = round(os.path.getsize(landMassDataFilepath)/1024/1024, 2)
print(f"{landMassDataFilepath} is {landMassDataSizeMB} MB.\nMore info here:\n{landMassDataReadMeURL}")

# Load the CSV into the landmassData DataFrame (pandas' default encoding)
landmassData = pd.read_csv(landMassDataFilepath)

# Extract

### U.S. County Geographic Centers

In [None]:
# County Centers (by geographic center latitude and longitude)
# Local copy of the county center .CSV, and the page that documents it
countyCenterDataFilepath = "Resources/county_centers.csv"
countyCenterDataReadMeURL = "https://github.com/btskinner/spatial/blob/master/data/county_centers.csv"

# Report the file size in megabytes (bytes / 1024 / 1024, rounded to 2 decimal places)
countyCenterDataSizeMB = round(os.path.getsize(countyCenterDataFilepath)/1024/1024, 2)
print(f"{countyCenterDataFilepath} is {countyCenterDataSizeMB} MB.\nMore info here:\n{countyCenterDataReadMeURL}")

# Load the CSV into the countyCenterData DataFrame (pandas' default encoding)
countyCenterData = pd.read_csv(countyCenterDataFilepath)

# Extract

### U.S. Mask-Wearing Survey (by County)

In [None]:
# Mask-Wearing survey
# Local copy of the mask-wearing survey .CSV, and the page that documents it
maskWearingDataFilepath = "Resources/mask-use-by-county-exp.csv"
maskWearingDataReadMeURL = "https://github.com/nytimes/covid-19-data/tree/master/mask-use"

# Report the file size in megabytes (bytes / 1024 / 1024, rounded to 2 decimal places)
maskWearingDataSizeMB = round(os.path.getsize(maskWearingDataFilepath)/1024/1024, 2)
print(f"{maskWearingDataFilepath} is {maskWearingDataSizeMB} MB.\nMore info here:\n{maskWearingDataReadMeURL}")

# Load the CSV into the maskWearingData DataFrame (pandas' default encoding)
maskWearingData = pd.read_csv(maskWearingDataFilepath)

# Transform

### U.S. COVID-19 Cases & Deaths

In [None]:
# Keep only the rows whose "date" text contains 7/14/2020 (the July 14, 2020 snapshot)
isSnapshotDate = caseData["date"].str.contains("7/14/2020")

# Filter, then renumber the remaining rows from 0 (the old index is discarded)
caseData = caseData[isSnapshotDate].reset_index(drop = True)
caseData

In [None]:
# Drop rows containing NaN values (caseData's "unknown" counties)
# Joplin, MO and Kansas City, MO case numbers added to Jasper and Jackson Counties respectively
#
# Only the defaults are needed (drop a row if ANY value is missing). Passing `how` and `thresh`
# together is rejected with a TypeError by current pandas releases, so neither is spelled out.
# Rebinding instead of inplace = True also avoids mutating the filtered slice from the previous cell.
caseData = caseData.dropna()
caseData

In [None]:
# Cast the caseData "fips" column from float to 64-bit integer, then show the resulting dtypes
caseData["fips"] = caseData["fips"].astype(np.int64)
caseData.dtypes

In [None]:
# Display cleaned DataFrame
# (caseData after the date filter, NaN-row drop and fips integer cast in the cells above)
caseData

# Transform

### Merge with censusData DataFrame

In [None]:
# Outer-join censusData (key "FIPS") with caseData (key "fips") so rows found in only one
# of the two sources are kept; every other merge option is left at its default
DataFrame = censusData.merge(caseData, how = "outer", left_on = "FIPS", right_on = "fips")
DataFrame

In [None]:
# Manually add population to row 3142 (New York City aggregate)
# A single .loc[row, column] assignment is used on purpose: chained indexing
# (DataFrame[column][row] = value) may write to a temporary copy and leave DataFrame unchanged
DataFrame.loc[3142, "POPESTIMATE2019"] = 8336817
DataFrame.tail()

In [None]:
# Remove the census-side identifier/name columns and the 2010 census population,
# leaving the lowercase caseData columns plus POPESTIMATE2019
censusColumnsToDrop = ["FIPS", "STATE", "COUNTY", "STNAME", "CTYNAME", "CENSUS2010POP"]
DataFrame = DataFrame.drop(columns = censusColumnsToDrop)
DataFrame

In [None]:
# Give the remaining columns their presentation names
columnNames = {
    "date": "Date",
    "fips": "FIPS",
    "county": "County",
    "state": "State",
    "POPESTIMATE2019": "PopEst",
    "cases": "Cases",
    "deaths": "Deaths",
}
DataFrame = DataFrame.rename(columns = columnNames)
DataFrame

In [None]:
# Put the columns in reading order (any column not listed here is dropped)
columnOrder = ["Date", "FIPS", "County", "State", "PopEst", "Cases", "Deaths"]
DataFrame = DataFrame[columnOrder]
DataFrame

In [None]:
# Due diligence to check DataFrame for rows with missing data
# count() reports the number of non-null values per column, so unequal counts reveal gaps
DataFrame.count()

In [None]:
# Drop every row that has at least one missing value, then renumber the rows from 0.
# dropna() defaults are used: passing `how` and `thresh` together is rejected with a TypeError
# by current pandas releases.
DataFrame = DataFrame.dropna().reset_index(drop = True)
DataFrame

In [None]:
# Data types
# Inspect column dtypes before the FIPS integer cast in the next cell
DataFrame.dtypes

In [None]:
# Cast the "FIPS" column from float to 64-bit integer, then show the resulting dtypes
DataFrame["FIPS"] = DataFrame["FIPS"].astype(np.int64)
DataFrame.dtypes

In [None]:
# Display updated DataFrame with new PopEst column
# (PopEst is the renamed POPESTIMATE2019 census column)
DataFrame

# Transform

### U.S. Landmass Data (by County)

In [None]:
# Create DataFrame `a`: the landmassData columns of interest, sorted by FIPS code.
# "STATE_NAME" is listed once; selecting it twice produced two identical columns that were
# carried through the merge below.
a = landmassData[["FIPS", "FID", "NAME", "STATE_NAME", "POPULATION", "SQMI"]]

# Renumber rows from 0 after sorting (the old index is discarded rather than added and dropped)
a = a.sort_values(by = "FIPS").reset_index(drop = True)
a

In [None]:
# Create DataFrame `b`: the case/death columns of the primary DataFrame (PopEst is left out),
# sorted by FIPS code and renumbered from 0
caseColumns = ["Date", "FIPS", "County", "State", "Cases", "Deaths"]
b = DataFrame[caseColumns].sort_values(by = "FIPS").reset_index(drop = True)
b

In [None]:
# Cast b's "FIPS" column to integer and show the resulting dtypes
b = b.astype({"FIPS": int})
b.dtypes

In [None]:
# Verify landmassData data types
# (`a` is the FIPS-sorted subset of landmassData; its FIPS dtype should be compatible with b's before merging)
a.dtypes

In [None]:
# Verify last row's index number
# (b was renumbered from 0, so the last index label is len(b) - 1)
b.tail()

In [None]:
# Calculate difference in rows between primary DataFrame and landmassData DataFrames to identify Puerto Rico and
# other "non-U.S. counties" we do not have cases and deaths data for
# (a = landmassData subset, b = cases/deaths subset; a positive result means `a` has more rows)
len(a) - len(b)

In [None]:
# Identify rows whose County value repeats an earlier row's County value.
# Only the County name is compared, so same-named counties in different states are flagged too.
isRepeatedCounty = b.duplicated(subset = "County")
duplicates_df = b.loc[isRepeatedCounty]
duplicates_df

In [None]:
# Pull the cases/deaths row(s) for FIPS 2016 out of b for inspection
c = b.loc[b["FIPS"].eq(2016)]
c

In [None]:
# Pull the landmass row(s) for FIPS 2016 out of a for inspection
d = a.loc[a["FIPS"].eq(2016)]
d

In [None]:
# Left-join the cases/deaths rows (b) with the landmass rows (a) on FIPS; every row of b is kept
# and landmass columns are NaN where a has no matching FIPS
DataFrame = pd.merge(b, a, how = "left", on = "FIPS")

# Count missing values per column to see how many rows found no landmass match
DataFrame.isna().sum()

In [None]:
# Verify merge was successful
# Spot-check using FIPS 2016, the county inspected in `c` and `d` above
check = DataFrame[DataFrame["FIPS"] == 2016]
check

In [None]:
# View DataFrame to verify we have 3085 rows (including New York City aggregate)
# NOTE(review): the row count is only eyeballed here; an assert on len(DataFrame) would make it explicit
DataFrame

In [None]:
# Remove the landmass-side identifier/name columns; a label removes every column carrying it,
# so "STATE_NAME" only needs to be listed once
landmassColumnsToDrop = ["FID", "NAME", "STATE_NAME"]
DataFrame = DataFrame.drop(columns = landmassColumnsToDrop)
DataFrame

In [None]:
# Order the rows by ascending FIPS code (index labels are kept as they are)
DataFrame = DataFrame.sort_values(by = "FIPS")
DataFrame

In [None]:
# Data types
# Inspect dtypes after the landmass merge (POPULATION and SQMI come from landmassData)
DataFrame.dtypes

In [None]:
# Manually add population and landmass data to row 3085 (index label 3084, New York City aggregate) and verify
# A single .loc[row, column] assignment is used on purpose: chained indexing
# (DataFrame[column][row] = value) may write to a temporary copy and leave DataFrame unchanged
DataFrame.loc[3084, "POPULATION"] = 8336817
DataFrame.loc[3084, "SQMI"] = 302.06
DataFrame.tail()

In [None]:
# Rename column
# POPULATION (from landmassData) becomes PopEst, the population column name used by the rate calculations below
DataFrame = DataFrame.rename(columns = {"POPULATION":"PopEst"})
DataFrame

In [None]:
# Population expressed in units of 100,000 people; dividing a count by this gives a per-100,000 rate
perHundredK_divisor = DataFrame["PopEst"] / 100000

# Cases and deaths per 100,000 people
casesPerHundredK = DataFrame["Cases"].div(perHundredK_divisor)
deathsPerHundredK = DataFrame["Deaths"].div(perHundredK_divisor)

# Population density: people per unit of the SQMI column
popDens = DataFrame["PopEst"].div(DataFrame["SQMI"])

In [None]:
# Attach the case rate and death rate (per 100,000 people) and the population density
# computed in the previous cell as new columns
DataFrame = DataFrame.assign(
    CaseRate = casesPerHundredK,
    DeathRate = deathsPerHundredK,
    PopDens = popDens,
)
DataFrame

In [None]:
# Put each rate next to the raw figure it was derived from (any column not listed is dropped)
columnOrder = ["Date", "FIPS", "County", "State", "SQMI", "PopEst", "PopDens",
               "Cases", "CaseRate", "Deaths", "DeathRate"]
DataFrame = DataFrame[columnOrder]
DataFrame

In [None]:
# Cast FIPS and PopEst to 64-bit integers in one step, then show the resulting dtypes
DataFrame = DataFrame.astype({"FIPS": np.int64, "PopEst": np.int64})
DataFrame.dtypes

In [None]:
# Sort on FIPS to restore DataFrame order
# (rows are ordered by ascending FIPS; index labels are not renumbered)
DataFrame = DataFrame.sort_values("FIPS", ascending = True)
DataFrame

# Transform

### Merge with countyCenterData DataFrame

In [None]:
# Left-join countyCenterData (key "fips") onto the primary DataFrame (key "FIPS") to bring in the
# lat/lng columns needed for the heatmaps; every other merge option is left at its default
DataFrame = DataFrame.merge(countyCenterData, how = "left", left_on = "FIPS", right_on = "fips")
DataFrame

In [None]:
# Remove the duplicate lowercase "fips" key and every coordinate column except clon10/clat10
centerColumnsToDrop = ["fips", "clon00", "clat00", "pclon00", "pclat00", "pclon10", "pclat10"]
DataFrame = DataFrame.drop(columns = centerColumnsToDrop)
DataFrame

In [None]:
# Rename the remaining coordinate columns: clon10 -> Lng, clat10 -> Lat
coordinateNames = {"clon10": "Lng", "clat10": "Lat"}
DataFrame = DataFrame.rename(columns = coordinateNames)
DataFrame

In [None]:
# Put the columns in reading order with Lat before Lng at the end (any column not listed is dropped)
columnOrder = ["Date", "FIPS", "County", "State", "SQMI", "PopEst", "PopDens",
               "Cases", "CaseRate", "Deaths", "DeathRate", "Lat", "Lng"]
DataFrame = DataFrame[columnOrder]
DataFrame

In [None]:
# Due diligence to check for missing data
# count() reports non-null values per column; lower Lat/Lng counts mean counties without a center match
DataFrame.count()

In [None]:
# Collect every row that has a missing value in any column (used to find rows lacking lat/lng)
hasMissingValue = DataFrame.isna().any(axis = "columns")
null_df = DataFrame.loc[hasMissingValue]
null_df

In [None]:
# Manually add missing latitude and longitude coordinates.
# Each row is patched with a single .loc[row, columns] assignment: chained indexing
# (DataFrame[column][row] = value) may write to a temporary copy and leave DataFrame unchanged.
# The row labels are the index labels of the rows listed in null_df above.

# Kusilvak Census Area (Alaska)
DataFrame.loc[80, ["Lat", "Lng"]] = [62.0900, -163.5300]

# Oglala Lakota Census Area (South Dakota)
DataFrame.loc[2363, ["Lat", "Lng"]] = [43.3300, -102.5500]

# New York City Aggregate (New York)
DataFrame.loc[3084, ["Lat", "Lng"]] = [40.7420, -73.9073]

DataFrame.tail()

# Transform

### Merge with maskWearingData DataFrame

In [None]:
# Mask-wearing scores run from 0 to `scale`; the `divisions` survey answers are spaced evenly,
# so consecutive answers are `interval` points apart
scale = 10
divisions = 5
interval = scale / (divisions - 1)

print(f"This will use the results of the NYT survey to score each county on a scale from 0 to {scale} where:")
print(f"Never = 0")

# Remaining answers, in increasing order of mask use ("Rarely" is exactly one interval)
print(f"Rarely = {interval}")
for intervalCount, answer in enumerate(["Sometimes", "Frequently", "Always"], start = 2):
    print(f"{answer} = {interval * intervalCount}")

In [None]:
# Count the distinct COUNTYFP codes in the survey data and show the count as a one-cell table
counties = maskWearingData["COUNTYFP"].nunique()
totalCounties = pd.DataFrame({"Total Counties": [counties]})
totalCounties

In [None]:
# Define function to convert percentage values to float
def percentages_to_floats(percentage):
    """Convert a percentage value such as "12.5%" to the float 12.5.

    Parameters
    ----------
    percentage : str or number
        Text with an optional trailing "%" (surrounding whitespace is ignored),
        or a value that is already numeric.

    Returns
    -------
    float
        The numeric value with the percent sign removed; the value is NOT divided by 100.

    Raises
    ------
    ValueError
        If the text left after stripping whitespace and "%" is not a valid number.
    """
    if isinstance(percentage, str):
        # Strip only an actual "%" suffix; blindly slicing off the last character would
        # silently turn "12.5" into 12.0
        return float(percentage.strip().rstrip("%"))
    # Already numeric (including NaN for a missing cell): pass through as float
    return float(percentage)

In [None]:
# Mean of the "NEVER" answers across all counties, after converting each percentage to float
neverValues = maskWearingData["NEVER"].map(percentages_to_floats)
neverMask = neverValues.mean()
neverMask

In [None]:
# Mean of the "RARELY" answers across all counties, after converting each percentage to float
rarelyValues = maskWearingData["RARELY"].map(percentages_to_floats)
rarelyMask = rarelyValues.mean()
rarelyMask

In [None]:
# Mean of the "SOMETIMES" answers across all counties, after converting each percentage to float
sometimesValues = maskWearingData["SOMETIMES"].map(percentages_to_floats)
sometimesMask = sometimesValues.mean()
sometimesMask

In [None]:
# Mean of the "FREQUENTLY" answers across all counties, after converting each percentage to float
frequentlyValues = maskWearingData["FREQUENTLY"].map(percentages_to_floats)
frequentlyMask = frequentlyValues.mean()
frequentlyMask

In [None]:
# Mean of the "ALWAYS" answers across all counties, after converting each percentage to float
alwaysValues = maskWearingData["ALWAYS"].map(percentages_to_floats)
alwaysMask = alwaysValues.mean()
alwaysMask

In [None]:
# One-row table of the mean percentage for each survey answer, in increasing order of mask use
meanByAnswer = {
    "NEVER": neverMask,
    "RARELY": rarelyMask,
    "SOMETIMES": sometimesMask,
    "FREQUENTLY": frequentlyMask,
    "ALWAYS": alwaysMask,
}
maskUsage = pd.DataFrame([meanByAnswer])
maskUsage

In [None]:
# Show the mean percentages with two decimals and a "%" suffix.
# Only this output is formatted (maskUsage itself keeps its float values). Setting the global
# pd.options.display.float_format instead would append "%" to every float in every DataFrame
# displayed later in the notebook (case rates, coordinates, scores, ...).
maskUsage.apply(lambda column: column.map("{:,.2f}%".format))

In [None]:
# Add a "Mask Score" column: each answer's percentage multiplied by its weight, summed per county.
# Weights rise in steps of 2.5 from NEVER (0) to ALWAYS (10); the terms are added in that order.
answerWeights = {"NEVER": 0, "RARELY": 2.5, "SOMETIMES": 5.0, "FREQUENTLY": 7.5, "ALWAYS": 10}
weightedAnswers = [
    maskWearingData[answer].apply(percentages_to_floats) * weight
    for answer, weight in answerWeights.items()
]
maskWearingData["Mask Score"] = sum(weightedAnswers)
maskWearingData

In [None]:
# Left-join maskWearingData (key "COUNTYFP") onto the primary DataFrame (key "FIPS") to bring in
# each county's survey answers and Mask Score; every other merge option is left at its default
DataFrame = DataFrame.merge(maskWearingData, how = "left", left_on = "FIPS", right_on = "COUNTYFP")
DataFrame

In [None]:
# Remove the duplicate county key and the raw survey answers, keeping only "Mask Score"
surveyColumnsToDrop = ["COUNTYFP", "NEVER", "RARELY", "SOMETIMES", "FREQUENTLY", "ALWAYS"]
DataFrame = DataFrame.drop(columns = surveyColumnsToDrop)
DataFrame

In [None]:
# Data types
# Inspect dtypes after the mask-score merge
DataFrame.dtypes

In [None]:
# Manually add mask score to row 3084 (New York City aggregate) and verify
# A single .loc[row, column] assignment is used on purpose: chained indexing
# (DataFrame[column][row] = value) may write to a temporary copy and leave DataFrame unchanged.
# The value is on the same unscaled basis as the computed "Mask Score" column (divided by 100 later on).
DataFrame.loc[3084, "Mask Score"] = 890.85
DataFrame.tail()

In [None]:
# Cast the Cases and Deaths columns to integers in one step and display the result
DataFrame = DataFrame.astype({"Cases": int, "Deaths": int})
DataFrame

In [None]:
# Data types
# Confirm Cases and Deaths are now integer columns
DataFrame.dtypes

In [None]:
# Group by State and calculate the mean of every numeric column.
# numeric_only = True is required: the frame still holds text columns (Date, County), and
# pandas 2.0+ raises a TypeError instead of silently skipping them.
grouped_df = DataFrame.groupby("State")
mean_df = grouped_df.mean(numeric_only = True)

# Turn the State group labels back into a regular column
mean_df = mean_df.reset_index()
mean_df

In [None]:
# Keep only each state's name and its mean Mask Score, as an independent copy of mean_df
stateAVG = mean_df.loc[:, ["State", "Mask Score"]].copy()
stateAVG.head()

In [None]:
# Left-join the per-state means onto every county row. Both frames carry a "Mask Score" column,
# so the merge suffixes them: "Mask Score_x" (county) and "Mask Score_y" (state mean)
DataFrame = DataFrame.merge(stateAVG, how = "left", left_on = "State", right_on = "State")
DataFrame

In [None]:
# Replace the merge suffixes with descriptive names: _x is the county's score, _y the state mean
scoreNames = {"Mask Score_x": "CountyScore", "Mask Score_y": "StateScore"}
DataFrame = DataFrame.rename(columns = scoreNames)
DataFrame

In [None]:
# Order the rows by ascending FIPS code and renumber them from 0 (the old index is discarded)
DataFrame = DataFrame.sort_values("FIPS", ascending = True).reset_index(drop = True)
DataFrame

In [None]:
# Divide both score columns by 100 so they share the same scale
# NOTE: this rescales in place; running the cell a second time divides by 100 again
scoreColumns = ["CountyScore", "StateScore"]
DataFrame[scoreColumns] = DataFrame[scoreColumns] / 100
DataFrame

In [None]:
# Data types
# Final dtype check of the fully merged DataFrame before the visualizations
DataFrame.dtypes

# Visualizations

### U.S. COVID-19 Case Rates per 100,000 People as of July 14, 2020

In [None]:
# Sort on CaseRate to find lowest case rates per 100,000 people
# (ascending: lowest rates first, highest last; the FIPS order is restored a few cells below)
DataFrame = DataFrame.sort_values("CaseRate", ascending = True)

In [None]:
# Display top 5 counties with lowest case rates per 100,000 people
# (first five rows of the CaseRate-ascending sort)
DataFrame.head()

In [None]:
# Display top 5 counties with highest case rates per 100,000 people
# (last five rows of the CaseRate-ascending sort; rows with a NaN CaseRate, if any, sort last and would show up here)
DataFrame.tail()

In [None]:
# Sort on FIPS to restore order
# (undoes the CaseRate sort above; index labels are not renumbered)
DataFrame = DataFrame.sort_values("FIPS", ascending = True)
DataFrame

In [None]:
# Create heatmap for CaseRate

# Heat layer inputs: one point per county (Lat, Lng), weighted by its case rate as float
locations = DataFrame[["Lat", "Lng"]]
caseRate = DataFrame["CaseRate"].astype(float)

# Set max intensity to highest case rate found in the dataset
max_intensity = DataFrame["CaseRate"].max()

# Base map (U.S. geographic center is 39.8333, -98.5855)
fig = gmaps.figure(map_type = "ROADMAP", center = (37.8, -98.6), zoom_level = 4.1)

# Build the heat layer and attach it to the map
heat_layer = gmaps.heatmap_layer(
    locations,
    weights = caseRate,
    max_intensity = max_intensity,
    dissipating = False,
    point_radius = 0.8,
    gradient = ["white", "lime", "green", "yellow", "red", "purple"],
)
fig.add_layer(heat_layer)

# Display figure
fig

### Case Rate per 100,000 People vs. Population Density (by County) as of July 14, 2020

In [None]:
# Do areas of higher population density have higher case rates?

# Retrieve case rate and population density data
countyCaseRate = DataFrame["CaseRate"]
popDens = DataFrame["PopDens"]
n = len(DataFrame)

# Perform a linear regression of case rates (y) on population density (x).
# The intercept is bound to `intercept`, not `int`: assigning to `int` shadows the builtin for
# the rest of the kernel session and breaks any later `.astype(int)` call on re-run.
slope, intercept, r, p, std_err = st.linregress(popDens, countyCaseRate)

# Create equation of line to calculate predicted case rates
fit = slope * popDens + intercept

# Create equation in string formats to print on scatter plot
equation = "y = " + str(round(slope, 2)) + "x + " + str(round(intercept, 2))

# Define scatter plot size
plt.figure(figsize = (21, 14))

# Plot x and y values on scatter plot
plt.scatter(popDens, countyCaseRate, marker = ".", color = "red")

# Plot linear regression line on scatter plot
plt.plot(popDens, fit, "--", color = "black")

# Print the regression equation on the scatter plot (position is in data coordinates)
plt.annotate(equation, (5150, 500), fontsize = 14, color = "red")

# Define plot title, x and y labels, and gridlines
# The y axis shows CaseRate (cases per 100,000 people), so it is labelled as a rate
plt.title("COVID-19 Cases vs. Population Density (by County)\nas of July 14, 2020", fontsize = 18)
plt.xlabel("Population Density", fontsize = 14)
plt.ylabel("Cases per 100,000 People", fontsize = 14)
plt.xlim(0, 6000)
plt.ylim(0, 7000)
plt.grid(axis = "x", linewidth = 0.5)
plt.grid(axis = "y", linewidth = 0.5)
plt.savefig("Images/scatterPlot1.png")

print(f"The r-value is: {r}")

plt.show()

### Question:

### _Is there a correlation between COVID-19 cases per 100,000 people and population density?_

### Answer:

### _Yes. &nbsp;COVID-19 cases per 100,000 people increase in counties with higher population densities._

# Visualizations

### U.S. COVID-19 Death Rate per 100,000 People as of July 14, 2020

In [None]:
# Sort on DeathRate to find lowest death rates per 100,000 people
# (ascending: lowest rates first, highest last; the FIPS order is restored a few cells below)
DataFrame = DataFrame.sort_values("DeathRate", ascending = True)

In [None]:
# Display top 5 counties with lowest death rates per 100,000 people
# (first five rows of the DeathRate-ascending sort)
DataFrame.head()

In [None]:
# Display top 5 counties with highest death rates per 100,000 people
# (last five rows of the DeathRate-ascending sort; rows with a NaN DeathRate, if any, sort last and would show up here)
DataFrame.tail()

In [None]:
# Sort on FIPS to restore order
# (undoes the DeathRate sort above; index labels are not renumbered)
DataFrame = DataFrame.sort_values("FIPS", ascending = True)
DataFrame

In [None]:
# Create heatmap for DeathRate

# Heat layer inputs: one point per county (Lat, Lng), weighted by its death rate as float
locations = DataFrame[["Lat", "Lng"]]
deathRate = DataFrame["DeathRate"].astype(float)

# Set max intensity to highest death rate found in the dataset
max_intensity = DataFrame["DeathRate"].max()

# Base map (U.S. geographic center is 39.8333, -98.5855)
fig = gmaps.figure(map_type = "ROADMAP", center = (37.8, -98.6), zoom_level = 4.1)

# Build the heat layer and attach it to the map
heat_layer = gmaps.heatmap_layer(
    locations,
    weights = deathRate,
    max_intensity = max_intensity,
    dissipating = False,
    point_radius = 0.8,
    gradient = ["white", "lime", "green", "yellow", "red", "purple"],
)
fig.add_layer(heat_layer)

# Display figure
fig

### Death Rate per 100,000 People vs. Population Density (by County) as of July 14, 2020

In [None]:
# Do areas of higher population density have higher death rates?

# Retrieve death rate and population density data
countyDeathRate = DataFrame["DeathRate"]
popDens = DataFrame["PopDens"]
n = len(DataFrame)

# Perform a linear regression of death rates (y) on population density (x).
# The intercept is bound to `intercept`, not `int`: assigning to `int` shadows the builtin for
# the rest of the kernel session and breaks any later `.astype(int)` call on re-run.
slope, intercept, r, p, std_err = st.linregress(popDens, countyDeathRate)

# Create equation of line to calculate predicted death rates
fit = slope * popDens + intercept

# Create equation in string formats to print on scatter plot
equation = "y = " + str(round(slope, 2)) + "x + " + str(round(intercept, 2))

# Define scatter plot size
plt.figure(figsize = (21, 14))

# Plot x and y values on scatter plot
plt.scatter(popDens, countyDeathRate, marker = ".", color = "purple")

# Plot linear regression line on scatter plot
plt.plot(popDens, fit, "--", color = "black")

# Print the regression equation on the scatter plot (position is in data coordinates)
plt.annotate(equation, (5150, 25), fontsize = 14, color = "purple")

# Define plot title, x and y labels, and gridlines
# The y axis shows DeathRate (deaths per 100,000 people), so it is labelled as a rate
plt.title("COVID-19 Deaths vs. Population Density (by County)\nas of July 14, 2020", fontsize = 18)
plt.xlabel("Population Density", fontsize = 14)
plt.ylabel("Deaths per 100,000 People", fontsize = 14)
plt.xlim(0, 6000)
plt.ylim(0, 350)
plt.grid(axis = "x", linewidth = 0.5)
plt.grid(axis = "y", linewidth = 0.5)
plt.savefig("Images/scatterPlot2.png")

print(f"The r-value is: {r}")

plt.show()

### Question:

### _Is there a correlation between COVID-19 deaths per 100,000 people and population density?_

### Answer:

### _Yes. &nbsp;COVID-19 deaths per 100,000 people increase in counties with higher population densities._

# Visualizations

### U.S. Mask-Wearing Scores as of July 14, 2020

### The New York Times survey asked, “How often do you wear a mask in public when you expect to be within six feet of another person?”

In [None]:
# Mask score sorted lowest to highest
# (stateAVG holds one row per state with its mean "Mask Score")
stateAVG = stateAVG.sort_values("Mask Score", ascending = True)
stateAVG

In [None]:
# Convert values for consistency
# Divides by 100, the same rescaling applied to CountyScore/StateScore in the primary DataFrame.
# NOTE: this rescales in place; running the cell a second time divides by 100 again
stateAVG["Mask Score"] = stateAVG["Mask Score"] / 100
stateAVG

In [None]:
# Order the states from highest to lowest mask score and renumber the rows from 0
# (the old index is discarded)
stateAVG = stateAVG.sort_values("Mask Score", ascending = False).reset_index(drop = True)

In [None]:
# Display top 5 states with highest mask scores
# (first five rows of the descending sort)
stateAVG.head()

In [None]:
# Display top 5 states with lowest mask scores
# (last five rows of the descending sort; states with a NaN score, if any, sort last and would show up here)
stateAVG.tail()

In [None]:
# Mask score sorted lowest to highest
# (row order here determines the bar order in the bar plot below)
stateAVG = stateAVG.sort_values("Mask Score", ascending = True)
stateAVG

In [None]:
# Generate horizontal bar plot showing state mask scores

# barh places xValues (state names) along the vertical axis and uses yValues (scores) as bar lengths
xValues = stateAVG["State"]
yValues = stateAVG["Mask Score"]

# Create the figure and draw the bars
fig, ax = plt.subplots(figsize = (21, 14))
ax.barh(xValues, yValues, color = "lightskyblue")

# Title, axis label and tick label sizes
ax.set_title("State Mask Scores\nas of July 14, 2020", fontsize = 18)
plt.xlabel("Mask Score", fontsize = 14)
plt.xticks(fontsize = 12)
plt.yticks(fontsize = 12)

# Save the figure to disk before showing it
plt.savefig("Images/barPlot1.png")
plt.show()

# Visualizations

### U.S. Mask-Wearing Scores as of July 14, 2020 (by County)

In [None]:
# Sort on CountyScore to find highest mask-wearing scores by county
# (descending: highest scores first; the FIPS order is restored a few cells below)
DataFrame = DataFrame.sort_values("CountyScore", ascending = False)

In [None]:
# Display top 5 counties with highest mask scores
# (first five rows of the CountyScore-descending sort)
DataFrame.head()

In [None]:
# Display top 5 counties with lowest mask scores
# (last five rows of the CountyScore-descending sort; rows with a NaN CountyScore, if any, sort last and would show up here)
DataFrame.tail()

In [None]:
# Sort on FIPS to restore order
# (undoes the CountyScore sort above; index labels are not renumbered)
DataFrame = DataFrame.sort_values("FIPS", ascending = True)
DataFrame

In [None]:
# Create heatmap for Mask Score

# Heat layer inputs: one point per county (Lat, Lng), weighted by its mask score as float
locations = DataFrame[["Lat", "Lng"]]
mask_score = DataFrame["CountyScore"].astype(float)

# Set max intensity to one tenth of the highest county mask score in the dataset
# NOTE(review): the /10 is undocumented in the original - confirm it is the intended colour scaling
max_intensity = DataFrame["CountyScore"].max() / 10

# Base map (U.S. geographic center is 39.8333, -98.5855)
fig = gmaps.figure(map_type = "ROADMAP", center = (37.8, -98.6), zoom_level = 4.1)

# Build the heat layer and attach it to the map
heat_layer = gmaps.heatmap_layer(
    locations,
    weights = mask_score,
    max_intensity = max_intensity,
    dissipating = False,
    point_radius = 0.8,
    gradient = ["white", "aqua", "blue", "purple"],
)
fig.add_layer(heat_layer)

# Display figure
fig

### U.S. Mask Score vs. Population Density (by County) as of July 14, 2020

In [None]:
# Do areas of higher population density have higher mask scores?

# Retrieve mask score and population density data
mskScore = DataFrame["CountyScore"]
popDens = DataFrame["PopDens"]
n = len(DataFrame)

# Perform a linear regression of mask scores (y) on population density (x).
# The intercept is bound to `intercept`, not `int`: assigning to `int` shadows the builtin for
# the rest of the kernel session and breaks any later `.astype(int)` call on re-run.
slope, intercept, r, p, std_err = st.linregress(popDens, mskScore)

# Create equation of line to calculate predicted mask scores
fit = slope * popDens + intercept

# Create equation in string formats to print on scatter plot
equation = "y = " + str(round(slope, 2)) + "x + " + str(round(intercept, 2))

# Define scatter plot size
plt.figure(figsize = (21, 14))

# Plot x and y values on scatter plot
plt.scatter(popDens, mskScore, marker = ".", color = "lightskyblue")

# Plot linear regression line on scatter plot
plt.plot(popDens, fit, "--", color = "black")

# Print the regression equation on the scatter plot (position is in data coordinates)
plt.annotate(equation, (5150, 9.5), fontsize = 14, color = "black")

# Define plot title, x and y labels, and gridlines
plt.title("Mask Scores vs. Population Density (by County)\nas of July 14, 2020", fontsize = 18)
plt.xlabel("Population Density", fontsize = 14)
plt.ylabel("Mask Score", fontsize = 14)
plt.xlim(0, 6000)
plt.ylim(2.5, 10)
plt.grid(axis = "x", linewidth = 0.5)
plt.grid(axis = "y", linewidth = 0.5)
plt.savefig("Images/scatterPlot3.png")

print(f"The r-value is: {r}")

plt.show()

### Question:

### _Is there a correlation between mask scores and population density?_

### Answer:

### _Yes. &nbsp;People living in counties with higher population densities tend to wear face coverings more often than those in counties with lower population densities._

### Case Rate per 100,000 People vs. Mask Score (by County) as of July 14, 2020

In [None]:
# Do counties with higher mask scores have higher case rates per 100,000 people?

# Retrieve mask score and case rate data
countyCaseRate = DataFrame["CaseRate"]
countyMask = DataFrame["CountyScore"]
n = len(DataFrame)

# Perform a linear regression of case rates (y) on mask scores (x).
# The intercept is bound to `intercept`, not `int`: assigning to `int` shadows the builtin for
# the rest of the kernel session and breaks any later `.astype(int)` call on re-run.
slope, intercept, r, p, std_err = st.linregress(countyMask, countyCaseRate)

# Create equation of line to calculate predicted case rates
fit = slope * countyMask + intercept

# Create equation in string formats to print on scatter plot
equation = "y = " + str(round(slope, 2)) + "x + " + str(round(intercept, 2))

# Define scatter plot size
plt.figure(figsize = (21, 14))

# Plot x and y values on scatter plot
plt.scatter(countyMask, countyCaseRate, marker = ".", color = "red")

# Plot linear regression line on scatter plot
plt.plot(countyMask, fit, "--", color = "black")

# Print the regression equation on the scatter plot (position is in data coordinates)
plt.annotate(equation, (3.25, 750), fontsize = 14, color = "red")

# Define plot title, x and y labels, and gridlines
# The y axis shows CaseRate (cases per 100,000 people), so it is labelled as a rate
plt.title("COVID-19 Cases vs. Mask Wearing Score (by County)\nas of July 14, 2020", fontsize = 18)
plt.xlabel("Mask Score", fontsize = 14)
plt.ylabel("Cases per 100,000 People", fontsize = 14)
plt.xlim(2.5, 10)
plt.ylim(0, 3750)
plt.grid(axis = "x", linewidth = 0.5)
plt.grid(axis = "y", linewidth = 0.5)
plt.savefig("Images/scatterPlot4.png")

print(f"The r-value is: {r}")

plt.show()

### Question:

### _Do counties with higher mask scores have lower COVID-19 case rates per 100,000 people?_

### Answer:

### _No. &nbsp;Counties with higher mask scores have higher COVID-19 case rates per 100,000 people._

### Death Rate per 100,000 People vs. Mask Score (by County) as of July 14, 2020

In [None]:
# Do counties with higher mask scores have higher death rates per 100,000 people?

# Retrieve mask score and death rate data
countyDeathRate = DataFrame["DeathRate"]
countyMask = DataFrame["CountyScore"]
n = len(DataFrame)

# Perform a linear regression of death rates (y) on mask scores (x).
# The intercept is bound to `intercept`, not `int`: assigning to `int` shadows the builtin for
# the rest of the kernel session and breaks any later `.astype(int)` call on re-run.
slope, intercept, r, p, std_err = st.linregress(countyMask, countyDeathRate)

# Create equation of line to calculate predicted death rates
fit = slope * countyMask + intercept

# Create equation in string formats to print on scatter plot
equation = "y = " + str(round(slope, 2)) + "x + " + str(round(intercept, 2))

# Define scatter plot size
plt.figure(figsize = (21, 14))

# Plot x and y values on scatter plot
plt.scatter(countyMask, countyDeathRate, marker = ".", color = "purple")

# Plot linear regression line on scatter plot
plt.plot(countyMask, fit, "--", color = "black")

# Print the regression equation on the scatter plot (position is in data coordinates)
plt.annotate(equation, (3.25, 75), fontsize = 14, color = "purple")

# Define plot title, x and y labels, and gridlines
# The y axis shows DeathRate (deaths per 100,000 people), so it is labelled as a rate
plt.title("COVID-19 Deaths vs. Mask Wearing Score (by County)\nas of July 14, 2020", fontsize = 18)
plt.xlabel("Mask Score", fontsize = 14)
plt.ylabel("Deaths per 100,000 People", fontsize = 14)
plt.xlim(2.5, 10)
plt.ylim(0, 375)
plt.grid(axis = "x", linewidth = 0.5)
plt.grid(axis = "y", linewidth = 0.5)
plt.savefig("Images/scatterPlot5.png")

print(f"The r-value is: {r}")

plt.show()

### Question:

### _Do counties with higher mask scores have lower COVID-19 death rates per 100,000 people?_

### Answer:

### _No. &nbsp;Counties with higher mask scores have higher COVID-19 death rates per 100,000 people._

# Summary

In [None]:
# Convert values to strings for cleaner formatted display.
# Each listed column is replaced by its formatted text (thousands separators, fixed decimals),
# so after this cell those columns hold strings and can no longer be used for arithmetic.
displayFormats = {
    "SQMI": "{:,.2f}",
    "PopEst": "{:,}",
    "PopDens": "{:,.2f}",
    "Cases": "{:,}",
    "CaseRate": "{:,.2f}",
    "Deaths": "{:,}",
    "DeathRate": "{:,.2f}",
    "Lat": "{:,.4f}",
    "Lng": "{:,.4f}",
    "CountyScore": "{:,.2f}",
    "StateScore": "{:,.2f}",
}
for columnName, formatSpec in displayFormats.items():
    DataFrame[columnName] = DataFrame[columnName].map(formatSpec.format)
DataFrame