# COVID-19 Clusters Data

The Metro Public Health Department tracks COVID-19 clusters. The files `clusters.csv` and `clusters_by_type.csv` contain the tables of clusters as reported by [WSMV](https://www.wsmv.com/news/metro-health-releases-latest-covid-19-clusters/article_ef554e08-1558-11eb-b290-873345e174d7.html) along with the coordinates of the clusters. Can you find any connection between the reported COVID violations and subsequent COVID clusters?

In [None]:
# import statements
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [None]:
# Widen pandas' display limits so wide/long frames are not truncated when shown
display_options = {
    'display.max_rows': 500,
    'display.max_columns': 500,
    'display.width': 1000,
}
for option_name, option_value in display_options.items():
    pd.set_option(option_name, option_value)

### Read in and explore the COVID-19 clusters dataframes

In [None]:
# Load the raw cluster table with plain pandas, then preview the first rows
clusters = pd.read_csv(filepath_or_buffer='../data/clusters.csv')
clusters.head(n=5)

In [None]:
# Print the column names, dtypes and non-null counts of the freshly loaded table
clusters.info()

In [None]:
# Assign short snake_case column names by position (six names, left to right)
clusters.columns = [
    'cluster_name',
    'type',
    'start_date',
    'case_count',
    'lat',
    'long',
]
clusters.head(n=5)

In [None]:
# Re-inspect the frame to confirm the new column names took effect
clusters.info()

In [None]:
# Large clusters without lat/long coordinates
# (isna() already returns a boolean mask, so it is used directly as the row filter)
clusters[clusters['lat'].isna()]

_Of the 62 clusters of 10 or more people named and recorded in the dataset, the 11 listed above have no associated coordinates and cannot be mapped._

In [None]:
# Clean up data types in the clusters dataframe:
# parse start_date into datetimes, raising on any value that cannot be parsed.
clusters['start_date'] = pd.to_datetime(clusters['start_date'], errors='raise')
# NOTE: the long column previously needed its commas stripped and a
# pd.to_numeric conversion; that step is no longer necessary.

In [None]:
# Check that the datatypes look good after the start_date conversion
clusters.info()

In [None]:
# Smallest case count among the named clusters
clusters['case_count'].min()

In [None]:
# Persist the cleaned cluster table.
# NOTE(review): index is left at its default, so the row index is written as an
# extra leading column, unlike the violations exports below which pass index=False.
clusters.to_csv(path_or_buf='../data/clusters_cleaned.csv')

In [None]:
# Total case count per cluster type, largest total first
(
    clusters
    .groupby('type')['case_count']
    .sum()
    .sort_values(ascending=False)
)

In [None]:
# Count how many named (large) clusters fall under each type, as a one-column frame
type_counts = clusters['type'].value_counts()
big_clusters = type_counts.to_frame()
big_clusters

In [None]:
# Move the index (the cluster types) into a regular column, then show the labels
big_clusters = big_clusters.reset_index()
big_clusters.columns

In [None]:
# Name the two columns by position: cluster type first, its count second.
# After value_counts().to_frame().reset_index() the labels depend on the pandas
# version (['index', 'type'] before 2.0, ['type', 'count'] from 2.0 on), so a
# label-based rename would produce the wrong columns on newer pandas and leave
# no 'type' column to merge on.
big_clusters.columns = ['type', 'cluster_count']
big_clusters.columns

In [None]:
# Check to make sure it looks right: expect a 'type' column and a 'cluster_count' column
big_clusters.head()

### Read in the Clusters by Type dataframe

In [None]:
# No coordinates in this file, so a plain pandas dataframe is enough
clusters_by_type = pd.read_csv(filepath_or_buffer='../data/clusters_by_type.csv')
clusters_by_type.head(n=5)

In [None]:
# Print the column names, dtypes and non-null counts of the freshly loaded table
clusters_by_type.info()

In [None]:
# Assign the two column names by position, then confirm the result
clusters_by_type.columns = [
    'type',
    'cluster_count',
]
clusters_by_type.info()

In [None]:
# Print the sorted, de-duplicated cluster types of each dataframe for comparison
print(clusters['type'].sort_values().unique())
print(clusters_by_type['type'].sort_values().unique())

The lists look largely the same, though some cluster types do not appear in the individual `clusters` dataset. Based on my understanding of COVID-19 data aggregation practices and the WSMV statement that only clusters of 10+ individuals are identified, I assume that the `clusters` dataset contains only the clusters with `>= 10 cases`, and that smaller clusters are left unnamed to protect the privacy of individuals diagnosed with COVID-19.

In [None]:
# Take a look at the distribution of clusters by type.
# DataFrame.hist() draws a histogram for each numeric column; the trailing
# semicolon keeps the notebook from echoing the returned axes array.
clusters_by_type.hist();

In [None]:
# Which type of facility is the outlier in terms of number of clusters? LTCF
has_many_clusters = clusters_by_type['cluster_count'].gt(20)
clusters_by_type.loc[has_many_clusters]

In [None]:
# Left-join the large-cluster counts onto the per-type totals.
# Both frames carry a 'cluster_count' column, so the suffixes turn them into
# 'cluster_count_total' (from clusters_by_type) and 'cluster_count_big'.
clusters_by_type = pd.merge(
    clusters_by_type,
    big_clusters,
    how='left',
    on='type',
    suffixes=['_total', '_big'],
)
clusters_by_type

In [None]:
# Types with no match in the left join have NaN large-cluster counts;
# replace those with 0 so the column can be used in arithmetic
clusters_by_type = clusters_by_type.fillna({'cluster_count_big': 0})

In [None]:
# Small clusters = total clusters minus the large (named) clusters, per type
total_counts = clusters_by_type['cluster_count_total']
clusters_by_type['cluster_count_small'] = total_counts.sub(clusters_by_type['cluster_count_big'])

In [None]:
# Cast the two derived count columns to int64, then order the types by
# total cluster count, largest first
clusters_by_type = (
    clusters_by_type
    .astype({'cluster_count_big': 'int64', 'cluster_count_small': 'int64'})
    .sort_values('cluster_count_total', ascending=False)
)

In [None]:
# Preview the merged table, now sorted by total cluster count (descending)
clusters_by_type.head()

In [None]:
# Check to make sure it looks good: the big/small count columns should now be int64
clusters_by_type.info()

In [None]:
# Side-by-side bars per facility type: 10+ case clusters vs. the remaining small clusters
clusters_by_type.plot(
    kind='bar',
    x='type',
    y=['cluster_count_big', 'cluster_count_small'],
    figsize=(20, 10),
)
plt.title('Large and Small COVID-19 Clusters by Facility Type', fontsize=40)
plt.xlabel('')
plt.ylabel('Number of Clusters', fontsize=14)
plt.xticks(rotation=290, fontsize=14)
plt.legend(['10+ COVID Cluster', 'Small COVID Cluster'], fontsize=20)
plt.tight_layout()
# The semicolon sits at the end of the last statement: a line holding only ';'
# is not valid Python and only runs under IPython's ';' auto-quote escape.
plt.savefig('../visualizations/large_and_small_clusters_by_type.png', dpi=150);

In [None]:
# Re-check the clusters frame's columns and dtypes before plotting it
clusters.info()

In [None]:
# Bubble chart of the named clusters: x = start date, y = case count,
# colour = cluster type, bubble size = case count.
plt.figure(figsize=(20, 10))
# Bubble sizes span 10x the smallest case count to 10x the largest
scaled_counts = clusters['case_count'] * 10
bubble_range = (min(scaled_counts), max(scaled_counts))
ax = sns.scatterplot(
    data=clusters,
    x='start_date',
    y='case_count',
    hue='type',
    size='case_count',
    sizes=bubble_range,
    alpha=0.7,
)
# Clamp the x-axis to the first and last start dates
ax.set_xlim(clusters['start_date'].min(), clusters['start_date'].max())
# Axis labels and title
plt.xlabel('Cluster Start Date', size=14)
plt.ylabel('Number of Cases', size=14)
plt.title('Large COVID-19 Case Clusters by Date', size=20)
# Rebuild the legend from entries 1-14 only.
# NOTE(review): presumably these are the cluster-type (hue) entries, with entry 0
# and the size entries left out; the hard-coded slice should be confirmed
# against the number of types and the seaborn version in use.
handles, labels = ax.get_legend_handles_labels()
legend = plt.legend(handles[1:15], labels[1:15], loc='upper right', fontsize=14)
legend.set_title('')
plt.tight_layout()
plt.savefig('../visualizations/large_clusters_over_time.png', dpi=150);

In [None]:
# Are the sizes of big clusters changing over time?
clusters.plot(kind='scatter', x='start_date', y='case_count')
# Trailing semicolon suppresses the echoed return value of xticks; a line
# holding only ';' is not valid Python outside IPython.
plt.xticks(rotation=45);

In [None]:
# Create a line chart of cluster case counts over time.
# Rows are sorted by start date first (stable sort, so same-day clusters keep
# their original order); otherwise the line connects points in file order.
clusters.sort_values('start_date', kind='mergesort').plot(kind='line', x='start_date', y='case_count');

In [None]:
# Persist the per-type table with its big/small cluster counts.
# NOTE(review): index is left at its default, so the row index is written as an
# extra leading column, unlike the violations exports below which pass index=False.
clusters_by_type.to_csv(path_or_buf='../data/clusters_by_type_cleaned.csv')

### Read in the COVID-19 reported violations dataframe

In [None]:
# A couple of fields need updating, so load this as a plain pandas dataframe
violations = pd.read_csv(filepath_or_buffer='../data/covid_violations.csv')
violations.head(n=5)

In [None]:
# Print the column names, dtypes and non-null counts of the freshly loaded table
violations.info()

In [None]:
# New column holding the first 10 characters of each datetime_opened string
# (presumably the date portion of the timestamp; confirm against the raw format)
opened_text = violations['datetime_opened']
violations['date_opened'] = opened_text.str.slice(stop=10)

In [None]:
# Parse the extracted strings into datetimes, raising on anything unparseable
violations = violations.assign(
    date_opened=lambda frame: pd.to_datetime(frame['date_opened'], errors='raise')
)

In [None]:
# Check the datatypes: date_opened should now be a datetime column
violations.info()

In [None]:
# Inspect the one coordinate pair that has 15 violations but only 3 locations
at_shared_coord = violations['coord'].eq('(36.15658331160417, -86.78745279999998)')
violations.loc[at_shared_coord]

_These all appear to be generic 'Broadway' references. Since we're going to be matching locations from the Google API by address, I'm still keeping the addresses in the new dataframe subsets that follow._

In [None]:
# Persist the violations table (with the parsed date_opened column), without the row index
violations.to_csv(path_or_buf='../data/violations_cleaned.csv', index=False)

In [None]:
# Count the non-null entries of every column for each (long, lat, address) group,
# order the groups by their request_no count (largest first), and show the top 50
violations_by_loc = (
    violations
    .groupby(['long', 'lat', 'address'])
    .count()
    .sort_values(['request_no'], ascending=False)
)
violations_by_loc.head(50)

In [None]:
# Drop the count columns that are not needed in the per-location table
unneeded_columns = ['datetime_opened', 'contact_type', 'city', 'zip', 'coord', 'date_opened']
violations_by_loc = violations_by_loc.drop(columns=unneeded_columns)

In [None]:
# Turn the group keys (long, lat, address) back into regular columns and preview
violations_by_loc = violations_by_loc.reset_index()
violations_by_loc.head(n=5)

In [None]:
# Persist the per-location violation counts, without the row index
violations_by_loc.to_csv(path_or_buf='../data/violations_by_loc.csv', index=False)

In [None]:
# Count the non-null entries of every column for each date_opened value,
# order the dates by their request_no count (largest first), and show the top 50
violations_by_date = (
    violations
    .groupby(['date_opened'])
    .count()
    .sort_values(['request_no'], ascending=False)
)
violations_by_date.head(50)

In [None]:
# Restore date_opened as a regular column, then drop the counts that are not needed
unneeded_columns = ['datetime_opened', 'contact_type', 'address', 'city', 'zip', 'lat', 'long']
violations_by_date = violations_by_date.reset_index().drop(columns=unneeded_columns)
violations_by_date.head(n=5)

In [None]:
# Rename the three remaining columns by position.
# NOTE(review): assumes the columns left after the drop are date_opened, the
# request_no count and the coord count, in that order; confirm before relying on it.
violations_by_date.columns = [
    'date_opened',
    'total_violations',
    'total_num_w_coords',
]
violations_by_date.info()

In [None]:
# Persist the per-date violation counts, without the row index
violations_by_date.to_csv(path_or_buf='../data/violations_by_date.csv', index=False)

In [None]:
# Count the non-null entries of every column for each (long, lat, address, date_opened)
# group, order the groups by their request_no count (largest first), and show the top 50
violations_by_loc_and_date = (
    violations
    .groupby(['long', 'lat', 'address', 'date_opened'])
    .count()
    .sort_values(['request_no'], ascending=False)
)
violations_by_loc_and_date.head(50)

In [None]:
# Drop the counts that are not needed, then turn the group keys back into columns
unneeded_columns = ['datetime_opened', 'contact_type', 'city', 'zip', 'coord']
violations_by_loc_and_date = violations_by_loc_and_date.drop(columns=unneeded_columns)
violations_by_loc_and_date = violations_by_loc_and_date.reset_index()
violations_by_loc_and_date.head(n=5)

In [None]:
# Persist the per-location-and-date violation counts, without the row index
violations_by_loc_and_date.to_csv(path_or_buf='../data/violations_by_loc_and_date.csv', index=False)