In [1]:
# Importing necessary libraries
import pandas as pd
import requests
from requests.auth import HTTPBasicAuth
from zipfile import ZipFile
import io
import urllib3

# Literature
https://agupubs.onlinelibrary.wiley.com/doi/10.1029/2021GH000383
The paper linked above examines the association between air quality and COVID-19 cases in Mumbai, India.

In [2]:
# The download helper below requests with verify=False, which makes urllib3
# emit an InsecureRequestWarning per request; silence that warning category
urllib3.disable_warnings(category=urllib3.exceptions.InsecureRequestWarning)

def zipped_downloader(url, auth=None, verify=False, timeout=60):
    """Download a zip archive and load its first member as a pandas dataframe.

    Parameters
    ----------
    url : str
        Address of the zip archive to download.
    auth : optional
        Forwarded to ``requests.get``. Defaults to ``None`` so no credentials
        are sent; pass e.g. an ``HTTPBasicAuth`` instance if a server needs one.
    verify : bool or str, optional
        Forwarded to ``requests.get``. Defaults to ``False`` to keep the
        behaviour the existing calls rely on.
        NOTE(review): ``False`` disables TLS certificate verification; pass
        ``True`` (or a CA bundle path) once the server certificate validates.
    timeout : float or tuple, optional
        Forwarded to ``requests.get`` so a stalled connection raises instead
        of blocking the notebook indefinitely.

    Returns
    -------
    pandas.DataFrame
        The first file listed in the archive, parsed with ``pd.read_csv``.

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx/5xx status.
    """

    response = requests.get(url, auth=auth, verify=verify, timeout=timeout)
    # Fail on HTTP errors here; otherwise an error page would reach ZipFile
    # and surface as a misleading "not a zip file" error
    response.raise_for_status()

    with ZipFile(io.BytesIO(response.content)) as archive:  # Unzip in memory
        first_member = archive.namelist()[0]  # Only the first file is read
        with archive.open(first_member) as csv_file:
            return pd.read_csv(csv_file)

# PM2.5 FRM/FEM Mass (88101) daily archives from 'aqs.epa.gov', one per year
PM2_5_Mass_2020, PM2_5_Mass_2021 = (
    zipped_downloader(archive_url)
    for archive_url in (
        'https://aqs.epa.gov/aqsweb/airdata/daily_88101_2020.zip',
        'https://aqs.epa.gov/aqsweb/airdata/daily_88101_2021.zip',
    )
)
# nytimes covid-19 data, read straight from the csv published on GitHub
covid19_usa = pd.read_csv(
    'https://raw.githubusercontent.com/nytimes/covid-19-data/master/us-states.csv'
)

In [3]:
# Keep just the state, date and AQI columns of both air quality frames
aqi_columns = ['State Name', 'Date Local', 'AQI']
PM2_5_Mass_2020 = PM2_5_Mass_2020[aqi_columns]
PM2_5_Mass_2021 = PM2_5_Mass_2021[aqi_columns]
# The covid frame keeps everything except the 'fips' column
covid19_usa = covid19_usa.drop('fips', axis='columns')

In [4]:
# Parse the date column of every frame into pandas datetimes
for frame, date_column in (
    (PM2_5_Mass_2020, 'Date Local'),
    (PM2_5_Mass_2021, 'Date Local'),
    (covid19_usa, 'date'),
):
    frame[date_column] = pd.to_datetime(frame[date_column])

In [5]:
# The covid dataset has no missing values in the selected columns, while
# both air quality files contain several rows with missing data

# Drop every air quality row that holds at least one NaN
PM2_5_Mass_2020 = PM2_5_Mass_2020.dropna(axis='index', how='any')
PM2_5_Mass_2021 = PM2_5_Mass_2021.dropna(axis='index', how='any')

In [6]:
# Confirm the removal worked: print the per-column NaN count of each frame
for frame in (covid19_usa, PM2_5_Mass_2020, PM2_5_Mass_2021):
    print(frame.isna().sum())

date      0
state     0
cases     0
deaths    0
dtype: int64
State Name    0
Date Local    0
AQI           0
dtype: int64
State Name    0
Date Local    0
AQI           0
dtype: int64


In [20]:
# Report the first and last date present in each dataset
date_ranges = (
    ('Covid', covid19_usa['date']),
    ('PM2.5 Mass 2020', PM2_5_Mass_2020['Date Local']),
    ('PM2.5 Mass 2021', PM2_5_Mass_2021['Date Local']),
)
for label, dates in date_ranges:
    print(f"{label}, Begin:{dates.min()}, End:{dates.max()}")
# Hand-written summary of the ranges printed above (typo "PN2.5" corrected)
print('The covid dataset spans from 2020-2022, the PM2.5 mass data is from 2020 until nov 2021')

Covid, Begin:2020-01-21 00:00:00, End:2022-01-13 00:00:00
PM2.5 Mass 2020, Begin:2020-01-01 00:00:00, End:2020-12-31 00:00:00
PM2.5 Mass 2021, Begin:2021-01-01 00:00:00, End:2021-11-10 00:00:00
The covid dataset spans from 2020-2022, the PN2.5 mass data is from 2020 until nov 2021


# To-do
* Unify the two PM2.5 files (2020 and 2021) into a single dataframe
* Get the COVID-19 cases per month and the average AQI value per month
* Select only the dates available in both datasets (Jan 2020 - Nov 2021)
* Select the most and least polluted states
* Find a statistical test to assess the difference between states with high and low pollution
* Plot a line chart of pollution and COVID-19 cases over the 2-year period for the main states