In [1]:
import pandas as pd
import requests
import os
import numpy as np

# Download Dataset for Portsmouth

#### We create a directory in which to store the datasets, and use `requests.get` to download the data from the DEFRA UK-AIR website
#### This initial part of the notebook can easily be adapted to other UK locations

In [2]:
# Root directory for the UK-AIR data (relative to the current working directory).
dir_UKAIR = "./data/UKAIR"

In [3]:
# makedirs creates missing parent directories (e.g. "./data") as well, and
# exist_ok=True lets this cell be re-run without raising FileExistsError.
os.makedirs(dir_UKAIR, exist_ok=True)

In [4]:
# Sub-directory that holds the per-year CSV files for the Portsmouth site.
dir_UKAIR_po = dir_UKAIR+"/Portsmouth"
# makedirs + exist_ok=True: creates any missing parents and is safe to re-run.
os.makedirs(dir_UKAIR_po, exist_ok=True)


In [5]:
file_url = "https://uk-air.defra.gov.uk/datastore/data_files/site_data/PMTH_"
suffix = ".csv?v=1"
file_prefix= "PO_"
# Download one CSV per year (2001-2024 inclusive).
# The request is made *before* the output file is opened so that a failed
# download never leaves an empty file behind, and raise_for_status() stops an
# HTTP error page from being saved as if it were data.
for year in range(2001, 2025):
    file_name_to_save = f"{file_prefix}{year}.csv"
    url = f"{file_url}{year}{suffix}"
    print('downloading: ', url)
    # A timeout keeps the cell from hanging forever on a stalled connection.
    r = requests.get(url, timeout=60)
    r.raise_for_status()
    with open(dir_UKAIR_po + "/" + file_name_to_save, "wb") as f:
        f.write(r.content)

downloading:  https://uk-air.defra.gov.uk/datastore/data_files/site_data/PMTH_2001.csv?v=1
downloading:  https://uk-air.defra.gov.uk/datastore/data_files/site_data/PMTH_2002.csv?v=1
downloading:  https://uk-air.defra.gov.uk/datastore/data_files/site_data/PMTH_2003.csv?v=1
downloading:  https://uk-air.defra.gov.uk/datastore/data_files/site_data/PMTH_2004.csv?v=1
downloading:  https://uk-air.defra.gov.uk/datastore/data_files/site_data/PMTH_2005.csv?v=1
downloading:  https://uk-air.defra.gov.uk/datastore/data_files/site_data/PMTH_2006.csv?v=1
downloading:  https://uk-air.defra.gov.uk/datastore/data_files/site_data/PMTH_2007.csv?v=1
downloading:  https://uk-air.defra.gov.uk/datastore/data_files/site_data/PMTH_2008.csv?v=1
downloading:  https://uk-air.defra.gov.uk/datastore/data_files/site_data/PMTH_2009.csv?v=1
downloading:  https://uk-air.defra.gov.uk/datastore/data_files/site_data/PMTH_2010.csv?v=1
downloading:  https://uk-air.defra.gov.uk/datastore/data_files/site_data/PMTH_2011.csv?v=1

# Load the datasets
#### We load each yearly CSV file into a dict keyed by year, and then concatenate them into a single DataFrame

In [6]:
# Empty mapping that will hold one DataFrame per year, keyed by the year.
dfs = dict()


In [8]:
# Read every yearly CSV file and store the resulting DataFrame in `dfs`
# under its year. skiprows=4 makes the parser skip the first four lines
# of each file.
first_year, last_year = 2001, 2024
for year in range(first_year, last_year + 1):
    file_path = "{}/PO_{}.csv".format(dir_UKAIR_po, year)
    dfs[year] = pd.read_csv(file_path, skiprows=4, engine='python')

In [9]:
# Concatenate the datasets
# Reuse the frames already parsed into `dfs` instead of reading and parsing
# every CSV file a second time; the list is built in ascending year order.
df_to_conc = [dfs[year] for year in range(2001, 2025)]

In [10]:
# Stack the yearly frames row-wise into one DataFrame with a fresh 0..n-1 index.
# NOTE(review): the displayed head of the result shows both
# "PM10 particulate matter (Hourly measured)" and
# "PM<sub>10</sub> particulate matter (Hourly measured)" columns - presumably the
# same pollutant under two header spellings; confirm and consider normalising
# the column names before concatenating.
df_port = pd.concat(objs=df_to_conc, axis=0, ignore_index=True)

In [11]:
# Preview the first five rows of the combined dataset.
df_port.head(5)

Unnamed: 0,Date,time,PM10 particulate matter (Hourly measured),status,unit,Nitric oxide,status.1,unit.1,Nitrogen dioxide,status.2,...,unit.8,Volatile PM2.5 (Hourly measured),status.9,unit.9,PM<sub>10</sub> particulate matter (Hourly measured),Non-volatile PM<sub>10</sub> (Hourly measured),Non-volatile PM<sub>2.5</sub> (Hourly measured),PM<sub>2.5</sub> particulate matter (Hourly measured),Volatile PM<sub>10</sub> (Hourly measured),Volatile PM<sub>2.5</sub> (Hourly measured)
0,01-01-2001,01:00,31.0,R,ugm-3 (GRAV EQ),1.0,R,ugm-3,6.0,R,...,,,,,,,,,,
1,01-01-2001,02:00,25.0,R,ugm-3 (GRAV EQ),0.0,R,ugm-3,8.0,R,...,,,,,,,,,,
2,01-01-2001,03:00,25.0,R,ugm-3 (GRAV EQ),0.0,R,ugm-3,6.0,R,...,,,,,,,,,,
3,01-01-2001,04:00,22.0,R,ugm-3 (GRAV EQ),0.0,R,ugm-3,6.0,R,...,,,,,,,,,,
4,01-01-2001,05:00,29.0,R,ugm-3 (GRAV EQ),0.0,R,ugm-3,6.0,R,...,,,,,,,,,,


The dataset has a temporal resolution of one hour.
We can now save it for use in the Digital Twin of Portsmouth Harbour.

In [12]:
# Build the output path with os.path.join: plain string concatenation dropped
# the separator, so the file was written as "<dir_UKAIR>AirQuality_digitaltwin_1h.csv"
# next to the data directory instead of inside it.
df_port.to_csv(os.path.join(dir_UKAIR, "AirQuality_digitaltwin_1h.csv"))