In [1]:
# Import the libraries

import pandas as pd
import datetime as dt
import plotly.express as px

The objective is to retrieve the following data live from NASA: https://earthdata.nasa.gov/learn/pathfinders/covid-19/environmental-impacts
1) Aerosol Optical Depth/Thickness
2) Nitrogen Dioxide
3) Carbon Monoxide
4) Ozone
After the analysis that follows, we identified that we only need to retrieve the Nitrogen Dioxide (NO2) for our application.

In [2]:
# The variables
kpis = ["co", "dew", "no2", "o3", "pm10", "pm25", "so2"]

# We will compare january to may 2019 and january to may 2020 for the city of Paris
start_date_2019 = dt.datetime(2019, 1, 1, 0, 0, 0, 0)
end_date_2019 = dt.datetime(2019, 5, 20, 0, 0, 0, 0)
start_date_2020 = dt.datetime(2020, 1, 1, 0, 0, 0, 0)
end_date_2020 = dt.datetime(2020, 5, 20, 0, 0, 0, 0)

In [5]:
def load_data(period):
    """
    :param
    period - string file ending for the needed period for load
    :returns
    returns raw data frame for the selected period
    """
    data = pd.read_csv(f"Datasets/covid19-data-{period}.csv", encoding='ISO-8859-1')
    data['Date'] = pd.to_datetime(data['Date'])
    return data

def consolidate_data(df_list):
    """
    :param
    df_list - list of pandas dataframes to concatenate
    :returns
    full_data - consolidated pandas dataframe
    """
    full_data = pd.concat(df_list)
    return full_data
    
def clean_data(data, start_date, end_date, kpis):
    """
    data - raw pandas df
    start_data - datetime, start date for filtering
    end_date - datetime, end date for filtering
    kpis - list of variables
    """
    clean_data = data[data.Specie.isin(kpis) &
                    (data['Date'] > start_date) &
                    (data['Date'] < end_date)].copy()
    return clean_data

In [6]:
# Extracting data from files
data2019Q1 = load_data('2019Q1')
data2019Q2 = load_data('2019Q2')
data2020 = load_data('2020')

#Combining data
data2019 = consolidate_data([data2019Q1, data2019Q2])
clean_data2019 = clean_data(data2019, start_date_2019, end_date_2019, kpis)
clean_data2020 = clean_data(data2020, start_date_2020, end_date_2020, kpis)

In [7]:
clean_data2020.head()

Unnamed: 0,Date,Country,City,Specie,count,min,max,median,variance
292,2020-04-18,BE,LiÃ¨ge,o3,70,4.5,34.2,19.1,703.76
293,2020-04-29,BE,LiÃ¨ge,o3,70,11.0,34.2,27.3,435.14
295,2020-03-22,BE,LiÃ¨ge,o3,71,18.3,32.9,26.4,118.68
296,2020-04-07,BE,LiÃ¨ge,o3,69,1.3,39.4,24.8,1423.84
297,2020-04-19,BE,LiÃ¨ge,o3,71,2.1,32.9,17.1,547.19


# Identify interesting variables to compare
We will plot a Radar plot will all the variables for Paris for the years 2019 and 2020 (for the same period of time: January to May)

In [13]:
def radar_plot(data, city):
    data = data[(data["City"]==city)]
    fig = px.line_polar(data, r="median", theta="Specie", color="Year", line_close=True,
                        hover_name="City",
                       color_discrete_sequence=["#3deb34","#eb4034"])
    return fig

clean_data2019['DayMonth'] = clean_data2019["Date"].dt.month_name().str[:3] + ' ' + clean_data2019["Date"].dt.day.astype(str)
clean_data2020['DayMonth'] = clean_data2020["Date"].dt.month_name().str[:3] + ' ' + clean_data2020["Date"].dt.day.astype(str)
clean_data2019['DayOfYear'] = clean_data2019["Date"].dt.dayofyear
clean_data2020['DayOfYear'] = clean_data2020["Date"].dt.dayofyear
clean_data2019['WeekOfYear'] = clean_data2019["Date"].dt.weekofyear
clean_data2020['WeekOfYear'] = clean_data2020["Date"].dt.weekofyear

data2019_grouped = clean_data2019[["Date", "Specie", "City", "median"]].groupby(["Specie", "City"], as_index=False).mean()
data2020_grouped = clean_data2020[["Date", "Specie", "City", "median"]].groupby(["Specie", "City"], as_index=False).mean()
data2019_grouped['Year'] = 'Y2019'
data2020_grouped['Year'] = 'Y2020'

data_grouped_time = pd.concat([data2019_grouped, data2020_grouped])

fig = radar_plot(data_grouped_time, 'Paris')
fig.show()

# Variables

We can identify 5 variables to follow:
1. pm25: Fine particulate matter (PM2.5) is an air pollutant that is a concern for people's health when levels in air are high.

2. pm10: PM10 is particulate matter 10 micrometers or less in diameter that is also a concern for people's health when levels in air are high.
3. o3: Ozone is a highly reactive gas composed of three oxygen atoms. It is both a natural and a man-made product that occurs in the Earth's upper atmosphere ozone molecule(the stratosphere) and lower atmosphere (the troposphere).  Depending on where it is in the atmosphere, ozone affects life on Earth in either good or bad ways.
4. no2: Nitrogen Dioxide (NO2) Pollution primarily gets in the air from the burning of fuel. NO2 forms from emissions from cars, trucks and buses, power plants, and off-road equipment.

# Plot no2 levels for 2019 and 2020 for Paris

In [29]:
def line_plot(data, kpi, city):
    try:
        data = data[(data["Specie"]==kpi) & (data["City"]==city)]
        sub_df = data[data.groupby('DayMonth').DayMonth.transform('count')>1].copy()
        fig = px.line(sub_df.sort_values(["DayOfYear"]), x="DayMonth", y="median", color="Year") #labels={'x':'DayMonth'}
        fig.update_xaxes(tickangle=-45, title_text='Month and Day')
        fig.update_yaxes(title_text='Indicator Median')
    except:
        pass
    return fig

data2019_grouped = clean_data2019[["DayOfYear", "DayMonth", "Specie", "City", "median"]].groupby(["DayOfYear", "DayMonth", "Specie", "City"], as_index=False).mean().sort_values('DayOfYear', ascending=True)
data2020_grouped = clean_data2020[["DayOfYear", "DayMonth", "Specie", "City", "median"]].groupby(["DayOfYear", "DayMonth", "Specie", "City"], as_index=False).mean().sort_values('DayOfYear', ascending=True)
data2019_grouped['Year'] = 'Y2019'
data2020_grouped['Year'] = 'Y2020'

data_grouped_days = pd.concat([data2019_grouped, data2020_grouped])
#data_grouped_days = data_grouped_days[data_grouped_days.groupby('DayMonth').DayMonth.transform('count')>1].copy()
#data_grouped_days = data_grouped_days[(data_grouped_days["Specie"]=="co") & (data_grouped_days["CountryName"]=="Argentina")]

figure = line_plot(data_grouped_days, 'no2', 'Paris')
figure.show()

# Plot o3 levels for 2019 and 2020 for Paris

In [30]:
figure = line_plot(data_grouped_days, 'o3', 'Paris')
figure.show()

# Plot pm25 levels for 2019 and 2020 for Paris

In [32]:
figure = line_plot(data_grouped_days, 'pm25', 'Paris')
figure.show()

# Plot pm10 levels for 2019 and 2020 for Paris

In [33]:
figure = line_plot(data_grouped_days, 'pm10', 'Paris')
figure.show()

# Conclusion
1. We can see a difference in the variable of no2 that is directly impacted by car usage.
2. We see the same patterns for pm25 and pm10
3. We don't see a difference in o3, in fact o3 is higher in 2020 than 2019

In this context, for the city of Paris, we can use three variable to calculate the quality of the air for our application: no2, pm25 and pm10.