In [1]:
import numpy as np
import scipy
import pandas as pd
import requests
from os import makedirs, path, listdir, remove
from bs4 import BeautifulSoup, SoupStrainer
import zipfile as zpf
from shutil import rmtree
import matplotlib.pyplot as plt
from scipy.spatial.distance import squareform, pdist, cosine
from sklearn.metrics.pairwise import cosine_similarity
from scipy.optimize import minimize
from matplotlib import cm
import re

import httplib2
import geopandas as gpd
from tqdm import tqdm
from graph_utils import *

In [13]:
# Analysis configuration: region label, date range, and the folder holding the LAQN CSV exports.
from os import environ

region = "London"
start_date = "1996-01-01"
end_date = "2021-01-01"
# Set the LAQN_DATA_DIR environment variable to run against a different copy of the data;
# when it is unset or empty, the original author's local path is used unchanged.
data_folder = environ.get("LAQN_DATA_DIR") or (
    "/Users/michellewan/Library/CloudStorage/OneDrive-UniversityofCambridge/Documents/PhD/MEng_Kevin/Data and code"
)

In [21]:
# Load NO2 data

species = "NO2"
data_filename = f"LAQN_{species}_{start_date}_{end_date}.csv"
# Dataset is not defined in this notebook; presumably it comes from the graph_utils star import.
data = Dataset(path.join(data_folder, data_filename))
# NOTE(review): 'D' looks like a daily grouping frequency (the label below says "Daily") — confirm in graph_utils.
grouped_NO2 = data.group('D')
print("Daily data shape:", grouped_NO2.shape)

# Cut off before COVID (keep rows dated strictly before 2020-01-01), then tag every
# column with the species. The suffix is derived from `species` rather than typed a
# second time, so the column labels cannot drift from the file that was loaded.
df_before_2020_NO2 = (
    grouped_NO2
    .loc[grouped_NO2.index < '2020-01-01']
    .add_suffix(f"_{species}")
)

Daily data shape: (9133, 201)


In [22]:
# Load PM10 data

species = "PM10"
data_filename = f"LAQN_{species}_{start_date}_{end_date}.csv"
# Dataset is not defined in this notebook; presumably it comes from the graph_utils star import.
data = Dataset(path.join(data_folder, data_filename))
# NOTE(review): 'D' looks like a daily grouping frequency (the label below says "Daily") — confirm in graph_utils.
grouped_PM10 = data.group('D')
print("Daily data shape:", grouped_PM10.shape)

# Cut off before COVID (keep rows dated strictly before 2020-01-01), then tag every
# column with the species. The suffix is derived from `species` rather than typed a
# second time, so the column labels cannot drift from the file that was loaded.
df_before_2020_PM10 = (
    grouped_PM10
    .loc[grouped_PM10.index < '2020-01-01']
    .add_suffix(f"_{species}")
)


Daily data shape: (9133, 174)


In [27]:
# Place the NO2 and PM10 frames side by side; pd.concat aligns the rows on the index.
pollutant_frames = [df_before_2020_NO2, df_before_2020_PM10]
df_before_2020 = pd.concat(pollutant_frames, axis="columns")

# Report the combined size and the date span actually covered.
first_day = df_before_2020.index.min()
last_day = df_before_2020.index.max()
print("Daily data shape:", df_before_2020.shape)
print("Start date:", first_day)
print("End date:", last_day)

Daily data shape: (8766, 375)
Start date: 1996-01-01 00:00:00
End date: 2019-12-31 00:00:00


In [28]:
# Ask the helper for a complete subset (num_valid_values=500) and the names of the columns it kept.
# NOTE(review): get_complete_subset is not defined in this notebook; presumably it comes from graph_utils.
subset_result = get_complete_subset(df_before_2020, num_valid_values=500)
complete_subset, column_names = subset_result

print("Complete subset shape:", complete_subset.shape)
print("Stations:", column_names)

# First and last index values covered by the subset.
subset_index = complete_subset.index
print(subset_index.min(), subset_index.max())

Complete subset shape: (500, 33)
Stations: Index(['BG1_NO2', 'BY7_NO2', 'EA1_NO2', 'EN4_NO2', 'GB6_NO2', 'GR5_NO2',
       'HK4_NO2', 'HV3_NO2', 'LH2_NO2', 'HS4_NO2', 'IS2_NO2', 'KC2_NO2',
       'KC3_NO2', 'WE0_NO2', 'LB1_NO2', 'LW2_NO2', 'RB1_NO2', 'WA4_NO2',
       'WA2_NO2', 'MY1_NO2', 'BG2_PM10', 'BN2_PM10', 'BX4_PM10', 'CD3_PM10',
       'EA2_PM10', 'GB6_PM10', 'GR5_PM10', 'HR1_PM10', 'LH2_PM10', 'LW2_PM10',
       'SK1_PM10', 'ST4_PM10', 'MY1_PM10'],
      dtype='object')
2003-10-16 00:00:00 2005-02-26 00:00:00


In [33]:
# Normalise data between 0 and 1

def normalise_data(data):
    """Min-max scale ``data`` into the range [0, 1].

    For a DataFrame the minimum and maximum are taken per column, so every
    column is rescaled independently. For a Series (or anything else whose
    ``min()``/``max()`` return scalars) a single global range is used.

    A column whose values are all identical has a zero range; dividing by it
    would produce NaN (0/0). Such columns are mapped to 0.0 instead. Missing
    values are left as NaN.

    Parameters
    ----------
    data : pandas.DataFrame or pandas.Series
        Numeric values to rescale. The input is not modified.

    Returns
    -------
    Same type as ``data``
        A new object holding the rescaled values.
    """
    lowest = data.min()
    value_range = data.max() - lowest

    # Swap a zero range for 1 so that constant columns become 0.0 rather than NaN.
    if isinstance(value_range, pd.Series):
        value_range = value_range.where(value_range != 0, 1)
    elif value_range == 0:
        value_range = 1

    return (data - lowest) / value_range

# Rescale the complete subset; the result is stored under a new name, so complete_subset keeps the raw values.
normalised_data = normalise_data(complete_subset)

In [35]:
# Preview the first rows of the normalised frame.
normalised_data.head()

Unnamed: 0_level_0,BG1_NO2,BY7_NO2,EA1_NO2,EN4_NO2,GB6_NO2,GR5_NO2,HK4_NO2,HV3_NO2,LH2_NO2,HS4_NO2,...,CD3_PM10,EA2_PM10,GB6_PM10,GR5_PM10,HR1_PM10,LH2_PM10,LW2_PM10,SK1_PM10,ST4_PM10,MY1_PM10
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2003-10-16,0.320757,0.35052,0.350898,0.445417,0.459489,0.411725,0.234698,0.393964,0.477767,0.340343,...,0.490679,0.089686,0.373671,0.416636,0.304685,0.452281,0.367191,0.385596,0.411724,0.349094
2003-10-17,0.369123,0.402029,0.39283,0.471828,0.545091,0.451161,0.231407,0.433025,0.549947,0.325944,...,0.531101,0.091808,0.386798,0.438739,0.334739,0.435826,0.383405,0.394999,0.478716,0.336824
2003-10-18,0.330719,0.398023,0.349562,0.348346,0.490055,0.366843,0.190798,0.375249,0.447245,0.293276,...,0.520483,0.099709,0.401768,0.422546,0.392701,0.456593,0.353839,0.43302,0.488486,0.33744
2003-10-19,0.097021,0.124506,0.130572,0.105829,0.238291,0.180762,0.053199,0.147272,0.366998,0.120264,...,0.257014,0.04633,0.197603,0.251016,0.221268,0.261518,0.183596,0.218724,0.255408,0.1766
2003-10-20,0.2131,0.360042,0.208154,0.300621,0.38082,0.337914,0.14932,0.25435,0.307008,0.186218,...,0.405611,0.048398,0.209592,0.228359,0.235529,0.354063,0.196471,0.230649,0.25471,0.137225


In [36]:
# Positions (within column_names) of the columns whose label carries the PM10 suffix
PM10_indices = [
    position
    for position, label in enumerate(column_names)
    if label.endswith('_PM10')
]

In [37]:
# Display the positions found for the PM10 columns.
PM10_indices

[20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32]