In [None]:
import pandas as pd
import datetime
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.dates as mdates

#Download the COVID-19 activity CSV (served from query.data.world) into a DataFrame
COVID_19_ACTIVITY_URL = 'https://query.data.world/s/ynb7kpmvt5dw26vogf5ozoqjqpyhb2'
COVID_19_Activity_df = pd.read_csv(COVID_19_ACTIVITY_URL)
COVID_19_Activity_df

In [None]:
#Keep only fully populated rows: a row with any missing value is discarded
COVID_19_Activity_reduced_df = COVID_19_Activity_df.dropna(axis="index", how="any")
COVID_19_Activity_reduced_df

In [None]:
#Count how many complete rows remain for each state; states with unusually
#few rows are the ones that lost data in the dropna step
(
    COVID_19_Activity_reduced_df
    .loc[:, "PROVINCE_STATE_NAME"]
    .value_counts()
)

In [None]:
#Groupby State
#Only the numeric columns are summed. The frame also carries text columns
#(REPORT_DATE is still a string here), and summing those either glues every
#value of a group into one huge string or fails, depending on the pandas
#version. as_index=False keeps PROVINCE_STATE_NAME as a regular column, which
#is what the later sort/plot/merge cells expect.
State_COVID_19_Activity_df = COVID_19_Activity_reduced_df.groupby(
    "PROVINCE_STATE_NAME", as_index=False
).sum(numeric_only=True)
State_COVID_19_Activity_df

In [None]:
#Bar chart of the summed new positive cases per state, ordered from the
#smallest to the largest total (author's note: data as of April 22, 2022)
plt.rcParams["figure.figsize"] = (12,5)
State_COVID_19_Activity_Sort_df = State_COVID_19_Activity_df.sort_values(by="PEOPLE_POSITIVE_NEW_CASES_COUNT")
State_COVID_19_Activity_Sort_df.plot.bar(
    x="PROVINCE_STATE_NAME",
    y="PEOPLE_POSITIVE_NEW_CASES_COUNT",
    align='center',
    width=0.8,
    color='blue',
).set(
    xlabel="US States",
    ylabel="State Total Cases Count",
    title="US States Total COVID-19 Positive Cases Count",
)
plt.savefig('State Total Case Count.png')
plt.show()

In [None]:
#Bar chart of the summed new deaths per state (author's note: data as of April 22, 2022).
#The states keep the order of State_COVID_19_Activity_Sort_df, i.e. they are
#ordered by case total rather than by death total.
plt.rcParams["figure.figsize"] = (12,5)
State_COVID_19_Activity_Sort_df.plot.bar(
    x="PROVINCE_STATE_NAME",
    y="PEOPLE_DEATH_NEW_COUNT",
    align='center',
    width=0.8,
    color='red',
).set(
    xlabel="US States",
    ylabel="State Total Death Count",
    title="US States Total Covid-19 Deaths Count",
)
plt.savefig('State Total Death Count.png')
plt.show()

In [None]:
#Order the cleaned rows by their REPORT_DATE value (ascending)
COVID_19_Activity_reduced_sort_df = COVID_19_Activity_reduced_df.sort_values(
    by="REPORT_DATE", ascending=True
)
COVID_19_Activity_reduced_sort_df

In [None]:
#Number of rows recorded for each report date
(
    COVID_19_Activity_reduced_sort_df
    .loc[:, "REPORT_DATE"]
    .value_counts()
)

In [None]:
#Group by date
#Only the numeric columns are summed. The frame also carries text columns
#(PROVINCE_STATE_NAME, for one), and summing those either glues every value
#of a group into one huge string or fails, depending on the pandas version.
#as_index=False keeps REPORT_DATE as a regular column for the plots below.
Total_COVID_19_Activity_df = COVID_19_Activity_reduced_sort_df.groupby(
    "REPORT_DATE", as_index=False
).sum(numeric_only=True)
Total_COVID_19_Activity_df

In [None]:
#Line chart of PEOPLE_POSITIVE_CASES_COUNT (summed per report date) against time
plt.rcParams["figure.figsize"] = (6,4)
x = pd.to_datetime(Total_COVID_19_Activity_df["REPORT_DATE"])
y = Total_COVID_19_Activity_df["PEOPLE_POSITIVE_CASES_COUNT"]

#One major tick every third month, labelled as abbreviated month and two-digit year
locator = mdates.MonthLocator(interval=3)
fmt = mdates.DateFormatter('%b-%y')

ax = plt.gca()
ax.plot(
    x, y,
    color='blue', linestyle='dashed', marker='o',
    markerfacecolor='blue', markersize=3, label='Positive Case Count',
)
ax.legend(loc="lower right")

X = ax.xaxis
X.set_major_locator(locator)
X.set_major_formatter(fmt)

ax.set_title("Total Covid-19 Positive Cases in the US Over Time")
ax.set_xlabel("Time (month-year)")
plt.xticks(rotation=45)
ax.set_ylabel("Positive Cases Count")
plt.savefig('Total Covid-19 Positive Cases in the US Over Time')
plt.show()

In [None]:
#Line chart of PEOPLE_DEATH_COUNT (summed per report date) against time
plt.rcParams["figure.figsize"] = (6,4)
x = pd.to_datetime(Total_COVID_19_Activity_df["REPORT_DATE"])
y = Total_COVID_19_Activity_df["PEOPLE_DEATH_COUNT"]

#One major tick every third month; %b renders the month as Jan, Feb, ...
locator = mdates.MonthLocator(interval=3)
fmt = mdates.DateFormatter('%b-%y')

ax = plt.gca()
ax.plot(
    x, y,
    color='red', linestyle='dashed', marker='o',
    markerfacecolor='red', markersize=3, label='Death Count',
)
ax.legend(loc="lower right")

X = ax.xaxis
X.set_major_locator(locator)
X.set_major_formatter(fmt)

ax.set_title("Total Covid-19 Deaths in the US Over Time")
ax.set_xlabel("Time (month-year)")
plt.xticks(rotation=45)
ax.set_ylabel("Death Count")
plt.savefig('Total Death in the US Over Time')
plt.show()

In [None]:
#Load the PCR testing time series (CSV kept in the Data_Indranil folder),
#rename its columns to the names used by the activity dataset, and drop the
#columns that the rest of the notebook does not use
Covid_Testing_df = pd.read_csv(
    'Data_Indranil/COVID-19_Diagnostic_Laboratory_Testing__PCR_Testing__Time_Series.csv'
).rename(
    columns={
        'state_name': 'PROVINCE_STATE_NAME',
        'date': 'REPORT_DATE',
        'new_results_reported': 'NEW_TESTS',
        'total_results_reported': 'TOTAL_TESTS',
    }
)
Covid_Testing_drop_df = Covid_Testing_df.drop(
    columns=['state', 'state_fips', 'fema_region', 'overall_outcome', 'geocoded_state']
)
Covid_Testing_drop_df

In [None]:
#Number of testing rows available for each state
(
    Covid_Testing_drop_df
    .loc[:, "PROVINCE_STATE_NAME"]
    .value_counts()
)

In [None]:
#Total the test counts for each state.
#Only numeric columns are summed: REPORT_DATE is still text at this point, and
#summing it either glues all dates of a state into one huge string or fails,
#depending on the pandas version.
Covid_Testing_drop_df = Covid_Testing_drop_df.groupby("PROVINCE_STATE_NAME").sum(numeric_only=True)
#NOTE(review): rows are ordered by the summed TOTAL_TESTS column, while the bar
#chart that follows plots NEW_TESTS - confirm this ordering is intended.
Covid_Testing_sort_df = Covid_Testing_drop_df.sort_values("TOTAL_TESTS").reset_index()
Covid_Testing_sort_df

In [None]:
#Bar chart of the summed NEW_TESTS for each state, in the row order of Covid_Testing_sort_df
plt.rcParams["figure.figsize"] = (12,5)
Covid_Testing_sort_df.plot.bar(
    x="PROVINCE_STATE_NAME",
    y="NEW_TESTS",
    align='center',
    width=0.8,
    color='green',
).set(
    xlabel="US States",
    ylabel="State Total Test Count",
    title="US States Total Covid-19 Tests Count",
)
plt.savefig('State Total Test Count.png')
plt.show()

In [None]:
#Combine the per-state case/death totals with the per-state test totals
#(inner join on the state name, so only states present in both frames remain)
Covid_Activity_Testing_df = State_COVID_19_Activity_df.merge(
    Covid_Testing_sort_df, on='PROVINCE_STATE_NAME'
)
Covid_Activity_Testing_df

In [None]:
#scatter plot: covid tests vs covid cases
#One point per state: summed new cases on x, summed new tests on y, with a
#least-squares regression line drawn on top.
import scipy.stats as st
plt.rcParams["figure.figsize"] = (6,4)

x = Covid_Activity_Testing_df['PEOPLE_POSITIVE_NEW_CASES_COUNT']
y = Covid_Activity_Testing_df['NEW_TESTS']

plt.title("US State Total Covid-19 Cases vs COVID-19 Tests")
plt.ylabel("Total Tests")
plt.xlabel("Total Cases")
plt.scatter(x,y,s = 10)

#linregress returns the correlation coefficient r (third value), not r-squared
slope, intercept, r, p, std_err = st.linregress(x,y)
line_eq = "y = " + str(round(slope,2)) + "x + " + str(round(intercept,2))

plt.plot(x,slope*x+intercept, color = 'b')
#The annotation is placed at fixed data coordinates (x=4e6 cases, y=5e6 tests)
plt.text(4000000, 5000000, f"r = {round(r,2)}")
#r-squared is the square of r; the previous version printed r under this label
print(f"The r-squared is: {round(r**2,2)}")
print(line_eq)
plt.savefig('US State Covid-19 Cases vs COVID-19 Tests.png')
plt.show()

In [None]:
#Load the US state shapes used for the case and death maps
#(shapefile kept in the Data_Indranil folder) and expose the state name as a
#lower-case 'name' column so it can serve as the join key
import geopandas as gpd
states = gpd.read_file(
    'Data_Indranil/USA_States_Generalized/USA_States_Generalized.shp'
).rename(columns={'STATE_NAME': 'name'})
states['name'] = states['name'].map(str.lower)
states

In [None]:
#Give the per-state totals the same lower-case 'name' key as the states frame
State_COVID_19_Activity_df = (
    State_COVID_19_Activity_df
    .rename(columns={'PROVINCE_STATE_NAME': 'name'})
    .assign(name=lambda frame: frame['name'].map(str.lower))
)
State_COVID_19_Activity_df

In [None]:
#Attach each state's shape to its COVID-19 totals; the inner join on 'name'
#keeps only the states that appear in both frames
mergedStuff = State_COVID_19_Activity_df.merge(states, how="inner", on='name')
mergedStuff

In [None]:
#Create US map with total covid cases
#Every state shape is drawn in grey, then the states in mergedStuff are
#painted over it in a colour that depends on their summed new-case count.
from shapely.geometry import Point, Polygon
#"EPSG:4326" WGS84 Latitude/Longitude, used in GPS.
#Passed as an authority string: the {'init': 'epsg:4326'} dict form is deprecated.
crs = "EPSG:4326"
#Rebuild a GeoDataFrame from the merged frame using its 'geometry' column
geometry = list(mergedStuff["geometry"])
geo_df = gpd.GeoDataFrame(mergedStuff, crs=crs, geometry=geometry)

cases = geo_df['PEOPLE_POSITIVE_NEW_CASES_COUNT']
#(row selector, fill colour, label); each label matches the bounds it describes
case_bins = [
    (cases < 1000000, 'green', "<1000000"),
    ((cases >= 1000000) & (cases < 2000000), 'yellow', "1000000-2000000"),
    ((cases >= 2000000) & (cases < 5000000), 'orange', "2000000-5000000"),
    ((cases >= 5000000) & (cases <= 10000000), 'red', "5000000-10000000"),
]

fig, ax = plt.subplots(figsize =(15,15))
states.plot(ax=ax, alpha=0.4, color='grey')
#Everything is drawn on `ax`. The earlier stand-alone geo_df.plot(aspect=1)
#call opened a second figure, which then became the one plt.savefig wrote to
#disk instead of this map, so it is gone and the figure is saved explicitly.
for in_bin, bin_color, bin_label in case_bins:
    selected = geo_df[in_bin]
    if selected.empty:
        #nothing to draw for this range
        continue
    selected.plot(ax=ax, markersize=20, color=bin_color, marker="o", label=bin_label)

fig.savefig('Covid-19 Cases US map_all States.png')
plt.show()

In [None]:
#Create US map with total covid deaths
#Every state shape is drawn in grey, then the states in geo_df are painted
#over it in a colour that depends on their summed new-death count.
deaths = geo_df['PEOPLE_DEATH_NEW_COUNT']
#(row selector, fill colour, label). Ranges are half-open except the last one,
#so a state with exactly 50000 deaths lands in a single bin (it used to match
#both the orange and the red range).
death_bins = [
    (deaths < 10000, 'green', "<10000"),
    ((deaths >= 10000) & (deaths < 20000), 'yellow', "10000-20000"),
    ((deaths >= 20000) & (deaths < 50000), 'orange', "20000-50000"),
    ((deaths >= 50000) & (deaths <= 100000), 'red', "50000-100000"),
]

fig, ax = plt.subplots(figsize =(15,15))
states.plot(ax=ax, alpha=0.4, color='grey')
#Everything is drawn on `ax`. The earlier stand-alone geo_df.plot(aspect=1)
#call opened a second figure, which then became the one plt.savefig wrote to
#disk instead of this map, so it is gone and the figure is saved explicitly.
for in_bin, bin_color, bin_label in death_bins:
    selected = geo_df[in_bin]
    if selected.empty:
        #nothing to draw for this range
        continue
    selected.plot(ax=ax, markersize=20, color=bin_color, marker="o", label=bin_label)

fig.savefig('Covid-19 Deaths US map_all States.png')
plt.show()

In [None]:
#Load the US population table (CSV kept in the Data_Indranil folder) and
#normalise it for joining: the STATE column becomes PROVINCE_STATE_NAME with
#lower-case names and no dots, and POPULATION loses its thousands separators
#(it is still text after this cell)
US_Population_df = pd.read_csv('Data_Indranil/NST-EST2021-POP.csv').rename(
    columns={'STATE': 'PROVINCE_STATE_NAME'}
)
US_Population_df['PROVINCE_STATE_NAME'] = US_Population_df['PROVINCE_STATE_NAME'].map(
    lambda state: state.lower().replace(".", "")
)
US_Population_df['POPULATION'] = US_Population_df['POPULATION'].map(
    lambda count: count.replace(",", "")
)
US_Population_df

In [None]:
#Lower-case the state names of the testing totals so they match the keys of the population table
Covid_Testing_sort_df['PROVINCE_STATE_NAME'] = Covid_Testing_sort_df['PROVINCE_STATE_NAME'].map(str.lower)
Covid_Testing_sort_df

In [None]:
#Join the per-state test totals with the population table on the state name,
#then turn POPULATION into integers so it can be used in arithmetic
Covid_Test_Pop_df = Covid_Testing_sort_df.merge(US_Population_df, on='PROVINCE_STATE_NAME')
Covid_Test_Pop_df["POPULATION"] = Covid_Test_Pop_df["POPULATION"].astype("int")
Covid_Test_Pop_df.dtypes

In [None]:
#%Test = summed NEW_TESTS expressed as a percentage of the state's POPULATION;
#the rows are then ordered by that percentage
Covid_Test_Pop_df = Covid_Test_Pop_df.assign(
    **{"%Test": Covid_Test_Pop_df["NEW_TESTS"].div(Covid_Test_Pop_df["POPULATION"]).mul(100)}
).sort_values(by="%Test")
Covid_Test_Pop_df

In [None]:
#Bar chart of each state's %Test value, in the row order of Covid_Test_Pop_df
plt.rcParams["figure.figsize"] = (12,5)
Covid_Test_Pop_df.plot.bar(
    x="PROVINCE_STATE_NAME",
    y="%Test",
    align='center',
    width=0.8,
    color='green',
).set(
    xlabel="US States",
    ylabel="State %Test Count",
    title="US States %Test Count",
)
plt.savefig('State %Test Count.png')
plt.show()