# Aim: - Initial exploration of the Covid-19 dataset

In [2]:
!pipenv install -U numpy
!pipenv install seaborn

[32m[1mInstalling -U...[0m
[?25lResolving -U[33m...[0m
[2K[32m⠋[0m Installing...
[1A[2KTraceback (most recent call last):
  File "/opt/homebrew/lib/python3.10/site-packages/pipenv/patched/pip/_vendor/packaging/requirements.py", line 102, in __init__
    req = REQUIREMENT.parseString(requirement_string)
  File "/opt/homebrew/lib/python3.10/site-packages/pipenv/patched/pip/_vendor/pyparsing/core.py", line 1141, in parse_string
    raise exc.with_traceback(None)
pipenv.patched.pip._vendor.pyparsing.exceptions.ParseException: Expected W:(0-9A-Za-z), found '-'  (at char 0), (line:1, col:1)

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/opt/homebrew/lib/python3.10/site-packages/pipenv/patched/pip/_vendor/pkg_resources/__init__.py", line 3101, in __init__
    super(Requirement, self).__init__(requirement_string)
  File "/opt/homebrew/lib/python3.10/site-packages/pipenv/patched/pip/_vendor/packaging/requirements.py",

In [3]:
# Importing the packages needed
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib

In [30]:
# Data 
counties_2020=pd.read_csv("../covid-19-data/us-counties-2020.csv")
counties_2021=pd.read_csv("../covid-19-data/us-counties-2021.csv")
counties_2022=pd.read_csv("../covid-19-data/us-counties-2022.csv")
counties_2023=pd.read_csv("../covid-19-data/us-counties-2023.csv")

In [5]:
counties_2020

Unnamed: 0,date,county,state,fips,cases,deaths
0,2020-01-21,Snohomish,Washington,53061.0,1,0.0
1,2020-01-22,Snohomish,Washington,53061.0,1,0.0
2,2020-01-23,Snohomish,Washington,53061.0,1,0.0
3,2020-01-24,Cook,Illinois,17031.0,1,0.0
4,2020-01-24,Snohomish,Washington,53061.0,1,0.0
...,...,...,...,...,...,...
884732,2020-12-31,Sweetwater,Wyoming,56037.0,2966,16.0
884733,2020-12-31,Teton,Wyoming,56039.0,2138,4.0
884734,2020-12-31,Uinta,Wyoming,56041.0,1558,7.0
884735,2020-12-31,Washakie,Wyoming,56043.0,780,19.0


In [33]:
# test_df = pd.DataFrame({
#     'state': ['A', 'B', 'A', 'C', 'B', 'C', 'D', 'D', 'D', 'E'],
#     'cases': [100, 200, 150, 300, 250, 400, 100, 200, 300, 50],
#     'deaths': [10, 20, 15, 30, 25, 40, 10, 20, 30, 5]
# })

In [34]:
def states_by_covid(df):
    # Group by state and calculate sums for cases and deaths
    grouped_df = df.groupby('state', as_index=False)[['cases', 'deaths']].sum()
    
    # Calculate death per case percentage
    grouped_df['death_per_case'] = round((grouped_df['deaths'] / grouped_df['cases']) * 100, 2)
    
    # Sort by cases and death_per_case in descending order
    grouped_df = grouped_df.sort_values(by=['cases', 'death_per_case'], ascending=[False, False])
    
    # Return the top 10 rows
    return grouped_df.head(10)
counties_2020_groupby = states_by_covid(counties_2020)
counties_2021_groupby=states_by_covid(counties_2021)
counties_2022_groupby=states_by_covid(counties_2022)
counties_2023_groupby=states_by_covid(counties_2023)


In [32]:
counties_2023_groupby.head(5)

Unnamed: 0,state,cases,deaths,death_per_case
5,California,985844951,8424694.0,0.85
47,Texas,678812946,7679552.0,1.13
10,Florida,609882039,7020942.0,1.15
34,New York,551095969,6483341.0,1.18
15,Illinois,331615853,3377350.0,1.02


In [6]:
# Lets look at the states and their corresponding deaths
counties_2020_groupby=counties_2020.groupby('state', as_index=False)[['cases','deaths']].sum()
counties_2020_groupby['death_per_case']=round(counties_2020_groupby['deaths']/counties_2020_groupby['cases'],2)*100
counties_2020_groupby=counties_2020_groupby.sort_values(by=['cases', 'death_per_case'], ascending=[False, False])
top_10_states_covid_2020=counties_2020_groupby.head(10)
top_10_states_covid_2020

Unnamed: 0,state,cases,deaths,death_per_case
4,California,174969176,3065113.0,2.0
46,Texas,160159130,2927740.0,2.0
9,Florida,138123133,2632238.0,2.0
33,New York,126307878,8320598.0,7.0
14,Illinois,82118324,2211390.0,3.0
10,Georgia,62230196,1363977.0,2.0
31,New Jersey,57403306,3817150.0,7.0
40,Pennsylvania,47304959,2002629.0,4.0
2,Arizona,47147078,1048821.0,2.0
34,North Carolina,46306384,734456.0,2.0


In [20]:
from us import states
state_to_abbrev={state.name:state.abbr for state in states.STATES}
counties_2020_groupby['state_abbrev']=counties_2020_groupby['state'].map(state_to_abbrev)

In [22]:
counties_2020_groupby.head(5)

Unnamed: 0,state,cases,deaths,death_per_case,state_abbrev
4,California,174969176,3065113.0,2.0,CA
46,Texas,160159130,2927740.0,2.0,TX
9,Florida,138123133,2632238.0,2.0,FL
33,New York,126307878,8320598.0,7.0,NY
14,Illinois,82118324,2211390.0,3.0,IL


In [27]:
import plotly.express as px
fig=px.choropleth(counties_2020_groupby,locations='state_abbrev',locationmode='USA-states',
                  color='death_per_case' ,
                           scope="usa",
                           color_continuous_scale="Viridis",
                           labels={'death_per_case':'death rate'},
                           hover_name='state',
                           hover_data={'state_abbrev':False, 'death_per_case':True}
                          )

fig.update_layout(
    title_text='USA map with death_rate',
    margin={"r":0,"t":0,"l":0,"b":0})
    # margin={"r":0,"t":0,"l":0,"b":0})
fig.show()                          



In [38]:
import plotly.express as px

def plot_death_rate_choropleth(df, state_col, death_rate_col, abbrev_col):
   
    fig = px.choropleth(
        df,
        locations=abbrev_col,
        locationmode='USA-states',
        color=death_rate_col,
        scope="usa",
        color_continuous_scale="Viridis",
        labels={death_rate_col: 'Death Rate'},
        hover_name=state_col,
        hover_data={abbrev_col: False, death_rate_col: True}
    )
    
    fig.update_layout(
        title_text='USA Map with Death Rate',
        margin={"r": 0, "t": 0, "l": 0, "b": 0}
    )
    
    fig.show()

In [39]:
plot_death_rate_choropleth(
    df=counties_2020_groupby,
    state_col='state',
    death_rate_col='death_per_case',
    abbrev_col='state_abbrev'
)

ValueError: Value of 'locations' is not the name of a column in 'data_frame'. Expected one of ['state', 'cases', 'deaths', 'death_per_case'] but received: state_abbrev