In [1]:
# Importing necessary libraries
import pandas as pd
import requests
from requests.auth import HTTPBasicAuth
from zipfile import ZipFile
import io
import urllib3
import plotly.express as px

# Literature
https://agupubs.onlinelibrary.wiley.com/doi/10.1029/2021GH000383
The above link associates the air quality with covid cases in Mumbai, India

In [2]:
# Disables the warning when downloading
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

def zipped_downloader(url):
    """ Downloads the files necessary for the analysis, unzips and creates a pandas dataframe """
    
    response = requests.get(url, auth=HTTPBasicAuth('user', 'pass'), stream=True, verify=False)
    with ZipFile(io.BytesIO(response.content)) as myzip: # Unzips the file
        with myzip.open(myzip.namelist()[0]) as myfile: # Open first file in folder
            df = pd.read_csv(myfile) # Reads csv and creates a pandas dataframe
            return df      

# Download of PM2.5 FRM/FEM Mass (88101) from 'aqs.epa.gov'
PM2_5_Mass_2020 = zipped_downloader('https://aqs.epa.gov/aqsweb/airdata/daily_88101_2020.zip')
PM2_5_Mass_2021 = zipped_downloader('https://aqs.epa.gov/aqsweb/airdata/daily_88101_2021.zip')
# Downloads nytimes covid-19 data
covid19_usa = pd.read_csv('https://raw.githubusercontent.com/nytimes/covid-19-data/master/us-states.csv')

## Note about covid data
The covid 19 file is updated on a daily basis, if the file isn't reshaped or deleted this shouldn't be a problem, but in case it goes wrong, please download the dataset at this [google drive](https://drive.google.com/file/d/1Po0Uk1t_jk_5NGeW_MQAKyES_jv_t-zk/view?usp=sharing).

In [3]:
# Select only the columns of interest
PM2_5_Mass_2021 = PM2_5_Mass_2021[['State Name', 'Date Local', 'AQI']]
PM2_5_Mass_2020 = PM2_5_Mass_2020[['State Name', 'Date Local', 'AQI']]
covid19_usa = covid19_usa.drop(columns=['fips'])

In [4]:
# Set columns to correct data type
PM2_5_Mass_2020['Date Local'] = pd.to_datetime(PM2_5_Mass_2020['Date Local'])
PM2_5_Mass_2021['Date Local'] = pd.to_datetime(PM2_5_Mass_2021['Date Local'])
covid19_usa['date'] = pd.to_datetime(covid19_usa['date'])

In [5]:
# No missing data for the selected columns in the covid dataset
# Both air quality files have several rows with missing data

# The rows with NaN's are dropped
PM2_5_Mass_2020 = PM2_5_Mass_2020.dropna()
PM2_5_Mass_2021 = PM2_5_Mass_2021.dropna()

In [6]:
# The missing data was removed without problem
print(covid19_usa.isnull().sum())
print(PM2_5_Mass_2020.isnull().sum())
print(PM2_5_Mass_2021.isnull().sum())

date      0
state     0
cases     0
deaths    0
dtype: int64
State Name    0
Date Local    0
AQI           0
dtype: int64
State Name    0
Date Local    0
AQI           0
dtype: int64


In [7]:
print(f"Covid, Begin:{covid19_usa['date'].min()}, End:{covid19_usa['date'].max()}")
print(f"PM2.5 Mass 2020, Begin:{PM2_5_Mass_2020['Date Local'].min()}, End:{PM2_5_Mass_2020['Date Local'].max()}")
print(f"PM2.5 Mass 2021, Begin:{PM2_5_Mass_2021['Date Local'].min()}, End:{PM2_5_Mass_2021['Date Local'].max()}")
print('The covid dataset spans from 2020-2022, the PN2.5 mass data is from 2020 until nov 2021')

Covid, Begin:2020-01-21 00:00:00, End:2022-01-17 00:00:00
PM2.5 Mass 2020, Begin:2020-01-01 00:00:00, End:2020-12-31 00:00:00
PM2.5 Mass 2021, Begin:2021-01-01 00:00:00, End:2021-11-10 00:00:00
The covid dataset spans from 2020-2022, the PN2.5 mass data is from 2020 until nov 2021


In [62]:
# Concatenates the 2020 and 2021 files
PM2_5_Mass_20et21 = pd.concat([PM2_5_Mass_2020, PM2_5_Mass_2021], axis=0)
# Renames the column to date so both dataframes have the same name for the same column
PM2_5_Mass_20et21.rename(columns = {'Date Local':'date'}, inplace=True)

# From the covid dataset,only years 2020 and 2021 are necessary
covid19_usa_20et21 = covid19_usa[(covid19_usa['date'].dt.year >= 2020) & (covid19_usa['date'].dt.year <= 2021)]

In [57]:
# Considering the daily data is very 

def quickliner(df, query):
    """ Add doctring later , y = query, which can be 'cases' or deaths """
    fig = px.line(df, x="date", y=query, title=f'Covid {query} in Washington', color='state')
    return fig

quickliner(covid19_usa, "cases")

In [58]:
quickliner(covid19_usa, "deaths")

Looking at the above plot, a few states immediately stand out, such as California, Texas, Florida and New York. Both of these have the largest number of covid cases and deaths. Considering however the number of states in the USA, this line plot might be useful for a simple glance, however the dataset needs to be further analysed to come to concrete conclusions.

In [68]:
# For a better view of the states with most covid cases, the cumulitive number of cases will be observed.
# The groupby outputs a series, which cannot be sorted, thus a pandas dataframe will be generated 
covid19_usa.groupby(['state'])['cases'].sum().sort_values(ascending=False)

state
California                  1777486805
Texas                       1441352729
Florida                     1191384086
New York                     971508016
Illinois                     651117603
Pennsylvania                 542957505
Georgia                      542737098
Ohio                         524773214
North Carolina               483511502
New Jersey                   459154868
Michigan                     440429487
Arizona                      422906338
Tennessee                    417645340
Indiana                      358611083
Wisconsin                    325962032
Massachusetts                323231091
Virginia                     317960238
Missouri                     303524282
South Carolina               289758485
Minnesota                    281495342
Alabama                      276558334
Louisiana                    256624144
Colorado                     254284202
Kentucky                     231346068
Washington                   223688896
Oklahoma           

In [75]:
PM2_5_Mass_20et21['State Name'].unique()

array(['Alabama', 'Alaska', 'Arizona', 'Arkansas', 'California',
       'Colorado', 'Connecticut', 'Delaware', 'District Of Columbia',
       'Florida', 'Georgia', 'Hawaii', 'Idaho', 'Illinois', 'Indiana',
       'Iowa', 'Kansas', 'Kentucky', 'Louisiana', 'Maine', 'Maryland',
       'Massachusetts', 'Michigan', 'Minnesota', 'Mississippi',
       'Missouri', 'Montana', 'Nebraska', 'Nevada', 'New Hampshire',
       'New Jersey', 'New Mexico', 'New York', 'North Carolina',
       'North Dakota', 'Ohio', 'Oklahoma', 'Oregon', 'Pennsylvania',
       'Rhode Island', 'South Carolina', 'South Dakota', 'Tennessee',
       'Texas', 'Utah', 'Vermont', 'Virginia', 'Washington',
       'West Virginia', 'Wisconsin', 'Wyoming', 'Puerto Rico',
       'Virgin Islands', 'Country Of Mexico'], dtype=object)

Not on mainland nor in both datasets: American Samoa

# To-do
* Unify the PM2.5 file - OK
* Get cases per month and average AQI values per month
* Select on only available dates (2020 Jan - 2021 Nov) - OK
* Select most and less polluted states
* Find statistical test to observe difference between states of high and low pollution
* Plot line chart with pollution and covid cases over the 2 year period for the main states