# Approximate Graph Propagation v2

In [None]:
import numpy as np
import scipy
import pandas as pd
import requests
from os import makedirs, path, listdir, remove
from bs4 import BeautifulSoup, SoupStrainer
import zipfile as zpf
from shutil import rmtree
import matplotlib.pyplot as plt
from scipy.spatial.distance import squareform, pdist, cosine
from sklearn.metrics.pairwise import cosine_similarity
from scipy.optimize import minimize
import matplotlib
from matplotlib import cm
from datetime import datetime

import httplib2
import geopandas as gpd
from tqdm import tqdm

## Get LAQN Data

In [None]:
species = "SO2"
region = "London"
start_date = "1996-01-01"
end_date = "2021-01-01"
folder='tmp'

In [None]:
# Load LAQN data.
# `parse_dates=True` parses the "date" index into datetimes. The previous
# `infer_datetime_format=True` is deprecated in pandas 2.x and had no effect
# without `parse_dates`, so the index used to stay as plain strings.
laqn_csv = path.join(folder, f"LAQN_{species}_{start_date}_{end_date}.csv")
laqn_df = pd.read_csv(laqn_csv, index_col="date", parse_dates=True)
print(laqn_df.shape)

In [None]:
# Load map file
london_boroughs_gdf = gpd.read_file(path.join(folder, "london_boroughs_coordinates.shp"))
london_gdf = london_boroughs_gdf.dissolve()
print(london_boroughs_gdf.shape)
london_boroughs_gdf.plot()
plt.show()

In [None]:
# Load LAQN metadata
london_sites_gdf = gpd.read_file(path.join(folder, "LAQN_sites.shp"))
print(london_sites_gdf.shape)

In [None]:
london_sites_gdf

In [None]:
def plot_on_map(data_geodataframe, map_geodataframe,
                data_column=None, map_column=None,
                data_cmap=None, map_cmap=None,
                data_color=None, map_color="whitesmoke",
                data_markersize=0.1,
                map_edge_color="black",
                colorbar=False,
                title="Greater London",
                fontsize="25",
                figsize=(20,10),
                axis="off",
                mark=None):
    """Draw `data_geodataframe` on top of `map_geodataframe` and show the figure.

    Parameters:
    * data_geodataframe / map_geodataframe - frames whose `.plot()` is called;
      the map is drawn first and the data is drawn on the axes it returns
    * data_column / map_column - column passed to the respective `.plot()`
    * data_cmap / map_cmap, data_color / map_color - colouring of each layer
    * data_markersize - marker size of the data layer
    * map_edge_color - edge colour of the map layer
    * colorbar - when true, add a colour bar spanning the min..max of
      `data_column` (so `data_column` must be given in that case)
    * title, fontsize - figure super-title and its size
    * figsize - figure size handed to the map plot
    * axis - forwarded to `plt.axis`
    * mark - optional '@SiteCode' value; matching rows are re-drawn as a black 'x'
    """
    # The map is plotted first so the data layer can reuse its axes.
    map_axes = map_geodataframe.plot(column=map_column,
                                     figsize=figsize,
                                     color=map_color,
                                     edgecolor=map_edge_color,
                                     cmap=map_cmap)
    base = data_geodataframe.plot(column=data_column, ax=map_axes,
                                  color=data_color, cmap=data_cmap,
                                  markersize=data_markersize)

    if colorbar:
        values = data_geodataframe[data_column]
        norm = plt.Normalize(values.min(), values.max())
        mappable = plt.cm.ScalarMappable(cmap=data_cmap, norm=norm)
        # The mappable is not attached to any artist, so the axes to take the
        # colour-bar space from must be given explicitly.
        plt.colorbar(mappable, ax=base).set_label(data_column)

    if mark:
        marked = data_geodataframe[data_geodataframe['@SiteCode'] == mark]
        marked.plot(ax=base, marker='x', color='black', markersize=15)

    plt.suptitle(title, fontsize=fontsize)
    plt.xlabel('Longitude', fontsize=14)
    plt.ylabel('Latitude', fontsize=14)
    plt.xticks(fontsize=12)
    plt.yticks(fontsize=12)
    plt.axis(axis)
    plt.show()

## Graph Propagation

In [None]:
class Dataset():
    """Readings loaded from a CSV that contains a `date` column.

    Attributes:
    * orig - untouched copy of the CSV as read
    * df - working frame; its `date` column is converted to datetimes
    """

    def __init__(self, csv_file):
        raw = pd.read_csv(csv_file)
        self.orig = raw.copy()
        self.df = raw
        self.df['date'] = pd.to_datetime(self.df.date)

    def drop_null(self, nan_percent):
        """Return a copy of `df` without the columns that have too many NaNs.

        A column is kept when it has at least
        int((100 - nan_percent) / 100 * n_rows + 1) non-NaN values.
        """
        n_rows = self.df.shape[0]
        keep_fraction = (100-nan_percent)/100
        min_count = int(keep_fraction*n_rows + 1)
        return self.df.dropna(axis=1, thresh=min_count)

    def fill_mean(self):
        """Return `df` with NaNs replaced by the mean of their column."""
        column_means = self.df.mean()
        return self.df.fillna(column_means)

    def group(self, freq):
        """Average the readings over periods of `freq`; gaps are NOT filled."""
        by_period = pd.Grouper(key="date", freq=freq)
        return self.df.groupby(by_period).mean()

    def group_and_fill(self, freq):
        """Like `group`, then forward-fill and back-fill the remaining NaNs."""
        return self.group(freq).ffill().bfill()

    def fill(self):
        """Fill NaNs with the mean of the same column in the same calendar month.

        Whatever is still missing afterwards is forward- then back-filled.
        """
        filled = self.df.copy()
        # The date column is never modified, so the (year, month) keys are fixed.
        period_keys = [filled.date.dt.year, filled.date.dt.month]
        for col in filled.columns.drop('date'):
            period_mean = filled.groupby(period_keys)[col].transform('mean')
            filled[col] = filled[col].fillna(period_mean)
        return filled.ffill().bfill()

In [None]:
class ComputeAM():
    """Helpers that build column-by-column (station-by-station) matrices."""

    def __init__(self, df):
        # Zero matrix with one row and one column per column of `df`.
        am_shape = (df.shape[1], df.shape[1])
        self.am = pd.DataFrame(np.zeros(shape=am_shape), columns=df.columns, index=df.columns)

    def euclidean_dist(self, df):
        """Return the pairwise Euclidean distance between the columns of `df`."""
        dist_arr = squareform(pdist(df.transpose()))
        return pd.DataFrame(dist_arr, columns=df.columns.unique(), index=df.columns.unique())

    def cosine_dist(self, df):
        """Return the pairwise cosine *similarity* between columns, diagonal set to 0.

        Despite the method name, the values come from `cosine_similarity`.
        """
        dist_arr = cosine_similarity(df.transpose())
        np.fill_diagonal(dist_arr, 0)
        return pd.DataFrame(dist_arr, columns=df.columns.unique(), index=df.columns.unique())

    def threshold_euclidean(self, df, threshold):
        """Return a 0/1 matrix: 1 where the entry is below `threshold`, else 0.

        The diagonal is always 0. The input frame is left untouched: the result
        is built on a fresh array instead of writing through `df.values`, which
        is not guaranteed to be a writable view of the frame.
        """
        adjacency = np.where(df.to_numpy() >= threshold, 0, 1)
        np.fill_diagonal(adjacency, 0)
        return pd.DataFrame(adjacency, index=df.index, columns=df.columns)

    def diagonal_degree(self, df):
        """Return a diagonal matrix holding the column sums of `df`."""
        diag_series = np.diag(df.sum())
        degree_mat = pd.DataFrame(diag_series, columns=df.columns.unique(), index=df.columns.unique())
        return degree_mat

## (a) Test Dataset

In [None]:
data = Dataset("tmp/LAQN_NO2_1996-01-01_2021-01-01.csv")

In [None]:
grouped = data.group('D')

In [None]:
grouped.mean().mean()

In [None]:
grouped

In [None]:
# Fraction of missing entries; the denominator follows the frame's actual size
# instead of a hard-coded 9133 * 201.
grouped.isna().sum().sum() / grouped.size

In [None]:
def get_test_set(df, num_valid_values=500):
    """Pick the window of `df` in which the most columns are gap-free.

    Start rows 0, 5, 10, ... are tried. For each start, a column qualifies when
    its run of non-NaN values beginning at that row is longer than
    `num_valid_values`. The start with the most qualifying columns wins
    (the earliest one on ties).

    Returns:
    * test_set - the qualifying columns, `num_valid_values` rows from the start
    * max_cols - labels of the qualifying columns
    """
    def _leading_valid_run(start):
        # Length of the NaN-free run at the top of each column of df.iloc[start:].
        null_mask = df.iloc[start:].isnull().to_numpy()
        n_rows = null_mask.shape[0]
        if n_rows == 0:
            first_nan = np.zeros(null_mask.shape[1], dtype=int)
        else:
            # argmax alone reports 0 for a column without any NaN, which is
            # indistinguishable from "NaN in the first row"; such columns are
            # valid all the way to the end, so their run is the full length.
            first_nan = np.where(null_mask.any(axis=0), null_mask.argmax(axis=0), n_rows)
        return pd.Series(first_nan, index=df.columns)

    max_size = 0
    max_index = 0
    for i in range(0, df.shape[0], 5):
        runs = _leading_valid_run(i)
        size = int((runs > num_valid_values).sum())
        if size > max_size:
            max_size = size
            max_index = i

    runs = _leading_valid_run(max_index)
    max_cols = runs[runs > num_valid_values].index
    test_set = df[max_cols].iloc[max_index:max_index+num_valid_values]
    return test_set, max_cols

In [None]:
test_set, max_cols = get_test_set(grouped)

In [None]:
test_set

In [None]:
def force_gaps(test_set, num_gaps=2000, seed=0):
    """Blank out `num_gaps` randomly chosen entries of a copy of `test_set`.

    Parameters:
    * test_set - complete frame to punch holes into (not modified)
    * num_gaps - number of distinct entries to replace with NaN
    * seed - seed for NumPy's global random generator (previously ignored:
      the generator was always seeded with 0)

    Returns:
    * nan_entries - (row, column) positions of the forced gaps
    * initial - the original values at those positions, in the same order
    * testing - copy of `test_set` with the gaps set to NaN
    """
    np.random.seed(seed)
    testing = test_set.copy()

    # Draw flat positions without replacement, then unfold to (row, column).
    n_cols = test_set.shape[1]
    nan_indices = np.random.choice(np.arange(test_set.size), num_gaps, replace=False)
    nan_entries = [(num // n_cols, num % n_cols) for num in nan_indices]

    initial = []
    for entry in nan_entries:
        initial.append(testing.iloc[entry])
        testing.iloc[entry] = np.nan
    return nan_entries, initial, testing

## Graph Propagation Implementation

In [None]:
class GraphPropagation():
    """Builds the adjacency and degree matrices used by the propagation."""

    def __init__(self):
        pass

    def threshold_am(self, df, threshold):
        """Return a 0/1 adjacency matrix from a similarity matrix.

        An entry becomes 1 when it is >= `threshold` and 0 otherwise; the
        diagonal is always 1. The input frame is not modified. The result is
        assembled on a fresh array: writing the diagonal through
        `result.values` only works while pandas happens to hand back a
        writable view of the frame's data.
        """
        adjacency = np.where(df.to_numpy() >= threshold, 1, 0)
        np.fill_diagonal(adjacency, 1)
        return pd.DataFrame(adjacency, index=df.index, columns=df.columns)

    def diagonal_degree(self, df):
        """Return a diagonal matrix holding the column sums of `df`."""
        diag_series = np.diag(df.sum())
        result = pd.DataFrame(diag_series, columns=df.columns.unique(), index=df.columns.unique())
        return result

In [None]:
# GRAPH PROPAGATION ALGORITHM

def D_pow(mat, power):
    """Return the square matrix `mat` raised to the (possibly fractional) `power`."""
    # Imported explicitly: a bare `import scipy` does not guarantee that the
    # `scipy.linalg` submodule has been loaded.
    from scipy.linalg import fractional_matrix_power
    return fractional_matrix_power(mat, power)

# w = [0.8, 0.2]

def basic_graph_propagation(X, A, w, L, a=0.5, b=0.5):
    """Propagate the signal X over graph A for L hops and return the weighted sum.

    Parameters:
    * X - array of signals; its last axis must match the size of A, because
      each step computes prop_matrix.dot(r.T).T
    * A - square adjacency matrix
    * w - per-hop weights; w[0] .. w[L] are read, so len(w) must exceed L
    * L - number of propagation steps
    * a, b - exponents of the degree normalisation D^-a A D^-b

    Returns an array shaped like X: the accumulated estimate `pi`.
    """
    D_list = np.sum(A, axis=1) # row sums of A (node degrees)
    w = np.array(w) 
    prop_matrix = np.diag(D_list**-a).dot(A).dot(np.diag(D_list**-b)) # D^-a A D^-b
    prop_matrix = np.nan_to_num(prop_matrix) # NaNs (e.g. from zero-degree rows) become 0
    
    # NOTE(review): pi takes X's dtype, so an integer X would presumably make
    # the in-place `pi += q` below fail - callers here pass float data.
    pi = np.zeros_like(X)
    r = X
    for i in range(L):
        # Remaining weight mass from hop i onwards, and from hop i+1 onwards.
        Y_i = w[i:].sum()
        Y_iplus = w[i+1:].sum()
        
        # update pi estimate
        q = (w[i]/Y_i) * r
        pi += q
        
        # update r
        # NOTE(review): if w[i+1:] sums to 0 this divides by zero and the
        # result turns into inf/NaN. Also, published AGP formulations (as far
        # as I recall) scale the residue by Y_{i+1}/Y_i, whereas this uses
        # Y_i/Y_{i+1} - confirm this is intentional before changing it, since
        # the tuned parameters in this notebook depend on it.
        r = (Y_i/Y_iplus) * prop_matrix.dot(r.T).T
        
    # Contribution of the final hop L.
    q = w[L]/w[L:].sum() * r
    pi += q
    return pi

In [None]:
def fill_and_refactor(gap_data):
    """Fill the gaps in `gap_data` and derive a column similarity matrix.

    Returns:
    * filled_data - `gap_data` forward-filled, then back-filled
    * refactored - (mean pairwise Euclidean distance) / (pairwise distance)
      between the filled columns, with the diagonal set to 0; larger values
      mean more similar columns
    """
    filled_data = gap_data.ffill().bfill()
    euclidean_am = ComputeAM(filled_data).euclidean_dist(filled_data)

    mean = euclidean_am.mean().mean()
    # The zero self-distances turn into inf here, so the diagonal is reset.
    # This is done on an explicit copy: writing through `DataFrame.values` is
    # only effective while pandas returns a writable view of the frame.
    similarity = (mean / euclidean_am).to_numpy(copy=True)
    np.fill_diagonal(similarity, 0)
    refactored = pd.DataFrame(similarity, index=euclidean_am.index, columns=euclidean_am.columns)
    return filled_data, refactored

In [None]:
def get_L(matrix, max_hops=10):
    """Return the number of hops needed for every node to reach every node.

    This is the smallest i such that matrix + matrix^2 + ... + matrix^i has no
    zero entry, capped at `max_hops` (10 by default, as before). An empty
    matrix gives 0.

    Reachability is tracked with booleans instead of summing integer matrix
    powers, so large graphs cannot overflow and each step costs one product.
    This assumes the entries of `matrix` are non-negative (e.g. a 0/1
    adjacency matrix).
    """
    adjacency = np.asarray(matrix) != 0
    adjacency_int = adjacency.astype(np.int64)

    walks = adjacency                    # pairs linked by a walk of exactly `hops` edges
    reached = np.zeros_like(adjacency)   # pairs linked by a walk of 1..hops edges
    hops = 0
    while hops < max_hops and not reached.all():
        hops += 1
        if hops > 1:
            walks = (walks.astype(np.int64) @ adjacency_int) > 0
        reached |= walks
    return hops

In [None]:
def compute_progation_matrix(data, euclideans, threshold, L=None, alpha=None, w=np.array([1, 0, 0, 0])):
    """Threshold the similarity matrix and run the propagation on `data`.

    Parameters:
    * data - frame of signals (converted with `to_numpy()`)
    * euclideans - similarity matrix, thresholded into the adjacency matrix
    * threshold - cut-off handed to `GraphPropagation.threshold_am`
    * L - number of hops; when falsy it is derived with `get_L`
    * alpha - when truthy, the hop weights are alpha * (1 - alpha)^i
    * w - explicit hop weights, used only when `alpha` is not given

    Returns:
    * Z - propagated data as an array
    * A - the thresholded adjacency matrix
    """
    prop = GraphPropagation()
    A = prop.threshold_am(euclideans, threshold)

    if not L:
        L = get_L(A)
    if alpha:
        # The propagation reads w[0] .. w[L]; always generate enough terms
        # (a fixed 10 used to fail with an IndexError once L reached 10).
        n_weights = max(10, L + 1)
        w = [alpha*(1-alpha)**i for i in range(n_weights)]

    # Apply algorithm
    array_data = data.to_numpy()
    Z = basic_graph_propagation(array_data, A, w, L)
    return Z, A

In [None]:
alpha = 0.2
[alpha*(1-alpha)**i for i in range(10)]

### Compute Error - Individual

Parameters:
* alpha: determines weight of each hop
* threshold: determine which edges are 1s and 0s
* L: # of hops

Variables:
* testing - test dataset with forced gaps
* initial - list of initial readings for gaps
* nan_entries - indices of forced gap entries
* filled_data - test dataset forward and backward filled
* euclidean - similarity matrix formed from euclidean distance metric
* A - adjacency matrix formed from thresholding euclidean

In [None]:
def rmse_error(initial, final):
    """Root-mean-square error between two equally long sequences."""
    residual = np.array(initial) - np.array(final)
    n_samples = len(initial)
    return np.linalg.norm(residual) / n_samples**0.5

def absolute_error(initial, final):
    """Mean absolute difference between two sequences."""
    deviation = np.absolute(np.array(initial) - np.array(final))
    return np.mean(deviation)

def smape_error(initial, final):
    """Symmetric mean absolute percentage error, as a fraction in [0, 2].

    Each term is |initial - final| / ((|initial| + |final|) / 2). A pair where
    both values are 0 counts as a perfect match (0) instead of producing NaN
    through 0/0.
    """
    initial = np.asarray(initial, dtype=float)
    final = np.asarray(final, dtype=float)
    num = np.absolute(initial - final)
    den = (np.absolute(initial) + np.absolute(final)) / 2
    # Only divide where the denominator is non-zero; the rest stays at 0.
    elems = np.divide(num, den, out=np.zeros_like(num), where=den != 0)
    return np.sum(elems) / elems.size

def _propagation_error(alpha, threshold, L, initial, nan_entries, data, euclideans, error_type):
    """Run the propagation once and score the values restored at `nan_entries`."""
    metrics = {'rmse': rmse_error, 'absolute': absolute_error, 'smape': smape_error}
    if error_type not in metrics:
        # Previously an unknown name surfaced as an UnboundLocalError at return.
        raise ValueError(f"unknown error_type {error_type!r}; expected one of {sorted(metrics)}")

    A = GraphPropagation().threshold_am(euclideans, threshold)
    # The propagation reads w[0] .. w[L], so at least L + 1 weights are needed.
    w = [alpha*(1-alpha)**i for i in range(max(10, L + 1))]

    # Apply algorithm
    Z = basic_graph_propagation(data.to_numpy(), A, w, L)

    final = [Z[entry] for entry in nan_entries]
    return metrics[error_type](initial, final)


def compute_alpha_error(alpha, threshold, L, initial, nan_entries, data, euclideans, error_type='rmse'):
    """Error of the propagation, with `alpha` first so it can be optimised.

    Parameters:
    * alpha - hop weights are alpha * (1 - alpha)^i
    * threshold - cut-off that turns `euclideans` into the adjacency matrix
    * L - number of hops
    * initial - true values at the forced gaps
    * nan_entries - (row, column) positions of the forced gaps
    * data - filled frame the propagation runs on
    * euclideans - similarity matrix
    * error_type - 'rmse', 'absolute' or 'smape'
    """
    return _propagation_error(alpha, threshold, L, initial, nan_entries, data, euclideans, error_type)


def compute_threshold_error(threshold, alpha, L, initial, nan_entries, data, euclideans, error_type='rmse'):
    """Same as `compute_alpha_error`, with `threshold` as the leading argument."""
    return _propagation_error(alpha, threshold, L, initial, nan_entries, data, euclideans, error_type)


def compute_hop_error(L, alpha, threshold, initial, nan_entries, data, euclideans, error_type='rmse'):
    """Same as `compute_alpha_error`, with `L` first; `L` is rounded to an integer."""
    L = int(round(L))
    return _propagation_error(alpha, threshold, L, initial, nan_entries, data, euclideans, error_type)

In [None]:
nan_entries, initial, testing = force_gaps(test_set, num_gaps=2000, seed=0)

In [None]:
filled_data, euclidean = fill_and_refactor(testing)

In [None]:
euclidean

In [None]:
testing.mean().mean()

# -----------------------------------------
# RMSE Metric
# -----------------------------------------

In [None]:
# Optimise alpha

res_alpha = minimize(compute_alpha_error, 0.5, args=(1.1, 1, initial, nan_entries, filled_data, euclidean))
print(res_alpha)

In [None]:
# compute_alpha_error(0.2124, 1.5, 2, initial, nan_entries, filled_data, euclidean)
compute_threshold_error(1.06, 0.2218, 2, initial, nan_entries, filled_data, euclidean)

In [None]:
# t_alpha = 0.3792
# t_hop = 1
# t_threshold = 1.07
# # error - 409.517

t_alpha = 0.2218
t_hop = 2
t_threshold = 1.06
# error  - 8.96

# # HOURLY
# t_alpha = 0.216
# t_hop = 2
# t_threshold = 1.66
# # error - 8.3583

In [None]:
# One-dimensional sweeps: vary a single hyper-parameter while the other two
# stay at their tuned values (t_alpha, t_hop, t_threshold).

def _plot_rmse_curve(figure_number, xs, errors, xlabel):
    """Plot one RMSE-against-parameter curve on its own numbered figure."""
    plt.figure(figure_number)
    plt.plot(xs, errors)
    plt.title('RMSE Error', fontsize=18)
    plt.xlabel(xlabel, fontsize=14)
    plt.ylabel('Error', fontsize=14)


alpha_range = np.linspace(0.0, 0.6, 101)
alpha_err = [
    compute_alpha_error(a, t_threshold, t_hop, initial, nan_entries, filled_data, euclidean)
    for a in alpha_range
]
_plot_rmse_curve(1, alpha_range, alpha_err, 'Alpha')

hop_range = np.arange(1, 6)
hop_err = [
    compute_hop_error(h, t_alpha, t_threshold, initial, nan_entries, filled_data, euclidean)
    for h in hop_range
]
_plot_rmse_curve(2, hop_range, hop_err, 'Hops')

threshold_range = np.linspace(1.0, 2.0, 101)
threshold_err = [
    compute_threshold_error(t, t_alpha, t_hop, initial, nan_entries, filled_data, euclidean)
    for t in threshold_range
]
_plot_rmse_curve(3, threshold_range, threshold_err, 'Threshold')

# alpha = 0 yields NaN errors; treat them as +inf so they never win the minimum.
alpha_err = np.nan_to_num(alpha_err, nan=np.inf)
print('Alpha error: ', min(alpha_err), alpha_range[np.argmin(alpha_err)])
print('Hops error: ', min(hop_err), hop_range[np.argmin(hop_err)])
print('Threshold error: ', min(threshold_err), threshold_range[np.argmin(threshold_err)])
print(min(alpha_err))

In [None]:
# Evaluate the tuned parameters:
# t_alpha = 0.2218
# t_hop = 2
# t_threshold = 1.06

Z, A = compute_progation_matrix(filled_data, euclidean, threshold=t_threshold, L=t_hop, alpha=t_alpha)
final = [Z[entry] for entry in nan_entries]

x = np.arange(100)
plt.figure(figsize=(5, 5))
plt.scatter(initial, final)
plt.plot(x, x, color='black')
# The RMSE in the title is computed from the plotted points rather than typed
# in by hand; the unit is micrograms per cubic metre, as on the other plots.
plt.title(f'Algorithm evaluation (RMSE = {rmse_error(initial, final):.2f})')
plt.xlabel(r'True NO$_2$ concentration ($\mu g/m^3$)')
plt.ylabel(r'Propagated NO$_2$ concentration ($\mu g/m^3$)')

compute_alpha_error(0.222, 1.06, 2, initial, nan_entries, filled_data, euclidean)

In [None]:
# Alternative parameter set:
# t_alpha = 0.3792
# t_hop = 1
# t_threshold = 1.07

Z2, A2 = compute_progation_matrix(filled_data, euclidean, threshold=1.07, L=1, alpha=0.3792)
final2 = [Z2[entry] for entry in nan_entries]

x = np.arange(100)
plt.figure(figsize=(5, 5))
plt.scatter(initial, final2)
plt.plot(x, x, color='black')
# The RMSE in the title is computed from the plotted points rather than typed
# in by hand; the unit is micrograms per cubic metre, as on the other plots.
plt.title(f'Algorithm evaluation (RMSE = {rmse_error(initial, final2):.2f})')
plt.xlabel(r'True NO$_2$ concentration ($\mu g/m^3$)')
plt.ylabel(r'Propagated NO$_2$ concentration ($\mu g/m^3$)')

compute_alpha_error(0.3792, 1.07, 1, initial, nan_entries, filled_data, euclidean)

In [None]:
# Alternative parameter set:
# t_alpha = 0.12212779
# t_hop = 3
# t_threshold = 1.06

Z2, A2 = compute_progation_matrix(filled_data, euclidean, threshold=1.06, L=3, alpha=0.122)
final2 = [Z2[entry] for entry in nan_entries]

x = np.arange(100)
plt.figure(figsize=(5, 5))
plt.scatter(initial, final2)
plt.plot(x, x, color='black')
# The RMSE in the title is computed from the plotted points rather than typed
# in by hand; the unit is micrograms per cubic metre, as on the other plots.
plt.title(f'Algorithm evaluation (RMSE = {rmse_error(initial, final2):.2f})')
plt.xlabel(r'True NO$_2$ concentration ($\mu g/m^3$)')
plt.ylabel(r'Propagated NO$_2$ concentration ($\mu g/m^3$)')

compute_alpha_error(0.122, 1.06, 3, initial, nan_entries, filled_data, euclidean)

In [None]:
# Print each true value next to its propagated estimate.
for truth, estimate in zip(initial, final):
    print(f'{truth} -> {estimate}')

### Compute Error - Total

In [None]:
def compute_error(params, initial, nan_entries, data, euclideans, error_type='rmse'):
    """Error of the propagation for a packed parameter vector.

    Parameters:
    * params - [alpha, threshold, L]; L is truncated with int()
    * initial - true values at the forced gaps
    * nan_entries - (row, column) positions of the forced gaps
    * data - filled frame the propagation runs on
    * euclideans - similarity matrix, thresholded into the adjacency matrix
    * error_type - 'rmse', 'absolute' or 'smape'

    Raises ValueError for any other `error_type` (this used to surface as an
    UnboundLocalError at the return statement).
    """
    metrics = {'rmse': rmse_error, 'absolute': absolute_error, 'smape': smape_error}
    if error_type not in metrics:
        raise ValueError(f"unknown error_type {error_type!r}; expected one of {sorted(metrics)}")

    alpha = params[0]
    threshold = params[1]
    L = int(params[2])

    A = GraphPropagation().threshold_am(euclideans, threshold)
    # The propagation reads w[0] .. w[L], so at least L + 1 weights are needed.
    w = [alpha*(1-alpha)**i for i in range(max(10, L + 1))]

    # Apply algorithm
    Z = basic_graph_propagation(data.to_numpy(), A, w, L)

    final = [Z[entry] for entry in nan_entries]
    return metrics[error_type](initial, final)

In [None]:
compute_error([0.2, 1.5, 2], initial, nan_entries, filled_data, euclidean)

In [None]:
# Jointly optimise [alpha, threshold, L], starting from [0.5, 1.06, 3].
# NOTE(review): compute_error truncates the third parameter with int(), and the
# threshold only acts through a >= comparison, so the objective is presumably
# piecewise constant in both - a default gradient-based minimiser may not move
# them away from the start values. Confirm against the printed result.
res = minimize(compute_error, [0.5, 1.06, 3], args=(initial, nan_entries, filled_data, euclidean))
print(res)

## Comparison Plots

TEST:
* seed = 0
* alpha = 0.2218
* threshold = 1.06
* L = 2

In [None]:
# Plot error against alpha and threshold.
# (A stray `plt.figure(1)` used to open an empty extra figure here; the surface
# is drawn on the figure created below.)

alpha_range = np.linspace(0.1, 0.5, 50)
threshold_range = np.linspace(0.5, 2.0, 16)
L_range = np.arange(1, 6)

# Hops are held fixed for this surface. Note that this rebinds the
# notebook-level t_hop, exactly as before.
t_hop = 3
loss = np.zeros((len(alpha_range), len(threshold_range)))
for i, val1 in enumerate(alpha_range):
    for j, val2 in enumerate(threshold_range):
        # Parameters are evaluated at two-decimal precision.
        loss[i][j] = compute_alpha_error(round(val1, 2), round(val2, 2), t_hop,
                                         initial, nan_entries, filled_data, euclidean)

X, Y = np.meshgrid(threshold_range, alpha_range)

fig = plt.figure(figsize=(10, 10))
ax = fig.add_subplot(111, projection='3d')

surf = ax.plot_surface(X[:50], Y[:50], loss[:50], cmap='plasma', linewidth=2)
fig.colorbar(surf, shrink=0.5, aspect=5)
ax.set_xlabel('threshold', fontsize=12)
ax.set_ylabel('alpha', fontsize=12)
ax.set_zlabel('error', fontsize=12)

# Grid position of the smallest error.
shape = np.unravel_index(loss.argmin(), loss.shape)
print(f'Threshold: {X[shape]}')
print(f'Alpha: {Y[shape]}')
print(f'Error: {np.min(loss)}')

In [None]:
# Plot loss against alpha and L (hops)

alpha_range = np.linspace(0.1, 0.5, 50)
threshold_range = np.linspace(0.5, 2.0, 16)
L_range = np.arange(1, 6)

# Threshold is held fixed for this surface (this rebinds the notebook-level t_threshold).
t_threshold = 1.1
loss = np.zeros((len(alpha_range), len(L_range)))
for i, alpha_value in enumerate(alpha_range):
    for j, hop_value in enumerate(L_range):
        # Parameters are evaluated at two-decimal precision.
        loss[i, j] = compute_alpha_error(round(alpha_value, 2), t_threshold, round(hop_value, 2),
                                         initial, nan_entries, filled_data, euclidean)

X, Y = np.meshgrid(L_range, alpha_range)

fig = plt.figure(figsize=(10, 10))
ax = fig.add_subplot(111, projection='3d')

# Only the first `lim2` hop columns are drawn; the minimum below covers the whole grid.
lim1 = 50
lim2 = 4
window = (slice(None, lim1), slice(None, lim2))
surf = ax.plot_surface(X[window], Y[window], loss[window], cmap='plasma', linewidth=2)
fig.colorbar(surf, shrink=0.5, aspect=5)
ax.set_xlabel('L (hops)', fontsize=12)
ax.set_ylabel('alpha', fontsize=12)
ax.set_zlabel('error', fontsize=12)

# Grid position of the smallest error.
shape = np.unravel_index(loss.argmin(), loss.shape)
print(f'Hops: {X[shape]}')
print(f'Alpha: {Y[shape]}')
print(f'Error: {np.min(loss)}')

In [None]:
# Plot loss against threshold and L (hops)

alpha_range = np.linspace(0.1, 1.0, 10)
threshold_range = np.linspace(0.5, 2.0, 160)
L_range = np.arange(1, 6)

# Alpha is held fixed for this surface (this rebinds the notebook-level t_alpha).
t_alpha = 0.222
loss = np.zeros((len(threshold_range), len(L_range)))
for i, threshold_value in enumerate(threshold_range):
    for j, hop_value in enumerate(L_range):
        # Parameters are evaluated at two-decimal precision.
        loss[i, j] = compute_alpha_error(t_alpha, round(threshold_value, 2), round(hop_value, 2),
                                         initial, nan_entries, filled_data, euclidean)

X, Y = np.meshgrid(L_range, threshold_range)

fig = plt.figure(figsize=(10, 10))
ax = fig.add_subplot(111, projection='3d')

# The whole grid is drawn.
lim1 = len(threshold_range)
lim2 = len(L_range)
window = (slice(None, lim1), slice(None, lim2))
surf = ax.plot_surface(X[window], Y[window], loss[window], cmap='plasma', linewidth=2)
fig.colorbar(surf, shrink=0.5, aspect=5)
ax.invert_xaxis()
ax.set_xlabel('L (hops)')
ax.set_ylabel('threshold')
ax.set_zlabel('error')

# Grid position of the smallest error.
shape = np.unravel_index(loss.argmin(), loss.shape)
print(f'Hops: {X[shape]}')
print(f'Threshold: {Y[shape]}')
print(f'Error: {np.min(loss)}')

In [None]:
np.mean(initial)

In [None]:
print(compute_error([0.12, 1.3, 3], initial, nan_entries, filled_data, euclidean, error_type='rmse'))
print(compute_error([0.22, 1.3, 2], initial, nan_entries, filled_data, euclidean, error_type='rmse'))
print(compute_error([0.22, 1.1, 2], initial, nan_entries, filled_data, euclidean, error_type='rmse'))

In [None]:
rmse_err = compute_error([0.2218, 1.06, 2], initial, nan_entries, filled_data, euclidean, error_type='rmse')
smape_err = compute_error([0.2218, 1.06, 2], initial, nan_entries, filled_data, euclidean, error_type='smape')
print(f'RMSE Error: {rmse_err}')
print(f'SMAPE Error: {smape_err}')

## Scale to full dataset

In [None]:
full_data, similarity = fill_and_refactor(grouped)

In [None]:
# full_data
am = ComputeAM(full_data)
euclidean_am = am.euclidean_dist(full_data)
euclidean_am

In [None]:
'''
t_alpha = 0.2218
t_hop = 2
t_threshold = 1.06
'''

Z, A = compute_progation_matrix(full_data, similarity, threshold=1.06, L=2, alpha=0.2218)
# Z2 = compute_progation_matrix(full_data, similarity, threshold=0.7, L=1, alpha=0.4)

# HOURLY
# Z, A = compute_progation_matrix(full_data, similarity, threshold=1.66, L=2, alpha=0.216)

In [None]:
corrected = np.copy(Z)

In [None]:
# Keep every observed reading: only the gaps (NaNs in `grouped`) retain the
# propagated estimate. One masked assignment replaces the element-by-element
# Python loop over the whole table.
observed = grouped.to_numpy()
observed_mask = ~np.isnan(observed)
corrected[observed_mask] = observed[observed_mask]

In [None]:
corrected == Z
# print(grouped.isna().sum().sum())
# print(grouped.size)

In [None]:
#Get similarity matrix from propagated data

corrected_df = pd.DataFrame(corrected, columns=grouped.columns.unique(), index=grouped.index.unique())
fd1, similarity = fill_and_refactor(corrected_df)

In [None]:
propagated_df = pd.DataFrame(corrected, columns = grouped.columns.unique(), index = grouped.index.unique())
propagated_df

In [None]:
def group_dataframe(df, freq='M'):
    """Average `df` over periods of `freq`.

    `df` must be indexed by an index level named 'date'; that level is moved
    into a column, parsed as datetimes and used as the grouping key. The input
    is not modified.
    """
    frame = df.copy().reset_index(level=0)
    frame['date'] = pd.to_datetime(frame['date'])
    by_period = pd.Grouper(key="date", freq=freq)
    return frame.groupby(by_period).mean()

In [None]:
# propagated_df.to_csv('complete_NO2.csv')

In [None]:
inds = [0, 25, 50, 75, 100, 125, 150, 175, 200]
vals = [np.array(similarity.columns)[i] for i in inds] 
vals

In [None]:
plt.figure(figsize=(10, 10))
# Highlight the diagonal on a copy: `np.fill_diagonal` works in place and
# returns None, so the shared `similarity` matrix used to be altered while
# `sim_heat` ended up as None.
sim_heat = similarity.to_numpy(copy=True)
np.fill_diagonal(sim_heat, 3)
plt.imshow(sim_heat, cmap='magma', interpolation='nearest', vmin=0, vmax=3)
plt.title('Similarity Heat Map')
# Put the station labels at the positions they were sampled from (`inds`)
# instead of relying on where the automatic ticks happen to fall.
plt.xticks(inds, labels=vals, fontsize=9)
plt.yticks(inds, labels=vals, fontsize=9)
plt.tick_params(top=True, labeltop=True)
plt.tick_params(bottom=False, labelbottom=False)
plt.colorbar()
plt.show()

In [None]:
plt.figure(figsize=(10, 10))
# Highlight the diagonal on a copy: `np.fill_diagonal` works in place and
# returns None, so the shared `similarity` matrix used to be altered while
# `sim_heat` ended up as None.
sim_heat = similarity.to_numpy(copy=True)
np.fill_diagonal(sim_heat, 3)
# Same map as above, rescaled to the 0..1 range.
plt.imshow(sim_heat/3, cmap='magma', interpolation='nearest', vmin=0, vmax=1)
plt.title('Similarity Heat Map')
# Put the station labels at the positions they were sampled from (`inds`)
# instead of relying on where the automatic ticks happen to fall.
plt.xticks(inds, labels=vals, fontsize=9)
plt.yticks(inds, labels=vals, fontsize=9)
plt.tick_params(top=True, labeltop=True)
plt.tick_params(bottom=False, labelbottom=False)
plt.colorbar()
plt.show()

## Setup

In [None]:
similarity

In [None]:
species = "NO2"
region = "London"
start_date = "1996-01-01"
end_date = "2021-01-01"

In [None]:
# Get LAQN site codes
url_sites = f"http://api.erg.kcl.ac.uk/AirQuality/Information/MonitoringSites/GroupName={region}/Json"

# A timeout keeps the cell from hanging on an unresponsive server, and an HTTP
# error is reported as such instead of as a confusing JSON/key error below.
london_sites = requests.get(url_sites, timeout=30)
london_sites.raise_for_status()
sites_df = pd.DataFrame(london_sites.json()['Sites']['Site'])
site_codes = sites_df["@SiteCode"].tolist()
print(len(site_codes))

In [None]:
# Get sites for each local authority
site_map = {} # local authority code -> list of site codes in that authority
location_map = {} # local authority code -> local authority name
# The local authority codes are the strings '1' .. '33'.
for authority_number in range(1, 34):
    code = str(authority_number)
    authority_rows = sites_df[sites_df['@LocalAuthorityCode'] == code]
    location_map[code] = authority_rows['@LocalAuthorityName'].unique()[0]
    site_map[code] = authority_rows['@SiteCode'].tolist()

In [None]:
propagated_df

## Time Series Plots (grouped by day)

In [None]:
# Daily series: propagated (dotted) against observed (solid), one figure per station.
dates = propagated_df.index.values
# Start from a hand-picked set of stations and pad it to 10 with random ones.
stations = {'TD0', 'EN5', 'BN1', 'SK2', 'KC2', 'HI2'}
while len(stations) < 10:
    # [1:] skips the first column of data.df - presumably 'date'; verify.
    sample = np.random.choice(data.df.columns.values[1:], 1)[0]
    stations.add(sample)
    
for index, station in enumerate(stations):
    # Figures are numbered 0..9, so re-running a plotting cell reuses them.
    plt.figure(index, figsize=(12, 4))
    plt.plot(dates, propagated_df[station].values, color='black', linestyle='dotted')
    plt.plot(dates, grouped[station].values, color='black')
    plt.title(f'Station: {station}')
    plt.xlabel('date', fontsize=10)
    plt.ylabel('NO$_{2}$ Concentrations (µg/m$^3$)', fontsize=10)

In [None]:
# Daily series restricted to a window of rows: propagated (dotted) against observed (solid).
dates = propagated_df.index.values
# Start from a hand-picked set of stations and pad it to 10 with random ones.
stations = {'TD0', 'EN5', 'BN1', 'SK2', 'KC2', 'HI2', 'ME1'}
while len(stations) < 10:
    # [1:] skips the first column of data.df - presumably 'date'; verify.
    sample = np.random.choice(data.df.columns.values[1:], 1)[0]
    stations.add(sample)

# Window: `add` rows starting at row position `start`.
start = 2000
add = 300
for index, station in enumerate(stations):
    plt.figure(index, figsize=(12, 4))
    plt.plot(dates[start:start+add], propagated_df[station].values[start:start+add], color='black', linestyle='dotted')
    plt.plot(dates[start:start+add], grouped[station].values[start:start+add], color='black')
    plt.title(f'Station: {station}')
    plt.xlabel('date', fontsize=10)
    plt.ylabel('NO$_{2}$ Concentrations (µg/m$^3$)', fontsize=10)

## Time Series Plots (grouped by week)

In [None]:
grouped_W = group_dataframe(grouped, 'W')
propagated_df_W = group_dataframe(propagated_df, 'W')
# grouped_W
propagated_df_W

In [None]:
# Weekly averages: propagated (dotted) against observed (solid), one figure per station.
dates = propagated_df_W.index.values
# Start from a hand-picked set of stations and pad it to 10 with random ones.
stations = {'TD0', 'EN5', 'BN1', 'SK2', 'KC2', 'HI2'}
while len(stations) < 10:
    # [1:] skips the first column of data.df - presumably 'date'; verify.
    sample = np.random.choice(data.df.columns.values[1:], 1)[0]
    stations.add(sample)
    
for index, station in enumerate(stations):
    plt.figure(index, figsize=(12, 4))
    plt.plot(dates, propagated_df_W[station].values, color='black', linestyle='dotted')
    plt.plot(dates, grouped_W[station].values, color='black')
    plt.title(f'Station: {station}')
    plt.xlabel('date', fontsize=10)
    plt.ylabel('NO$_{2}$ Concentrations (µg/m$^3$)', fontsize=10)

## Time Series Plots (grouped by month)

In [None]:
grouped_M = group_dataframe(grouped, 'M')
propagated_df_M = group_dataframe(propagated_df, 'M')
grouped_M

In [None]:
dates = propagated_df_M.index.values

### Find dates where months have missing data
missing_dates = grouped.loc[pd.isna(grouped['TD0']), :].index

In [None]:
# GET MISSING DATA PER MONTH

# Kept for any cell that still refers to them; missing_data_count no longer
# depends on these hard-coded ranges.
years = np.arange(1996, 2021)
months = np.arange(1, 13)

def missing_data_count(df, freq='M'):
    """Count the missing readings of every column per period of `freq`.

    `df` must be indexed by an index level named 'date'. The result has one row
    per period (monthly by default) and one column per column of `df`, each
    cell holding the number of NaNs in that period. Periods come from the dates
    themselves, so the counts stay aligned whatever range the data covers.
    The previous version filled rows by position for the hard-coded years
    1996-2020 and left any later period empty.
    """
    flags = df.isna().reset_index(level=0)
    flags['date'] = pd.to_datetime(flags['date'])
    return flags.groupby(pd.Grouper(key="date", freq=freq)).sum()

def get_missing_dates(df, station):
    """Return the monthly dates flagged as having missing data for ``station``.

    Keeps only the rows where the station has no reading, replaces them with
    the flag value 1.0, re-aggregates the flags with
    ``group_dataframe(..., 'M')`` and returns the 'date' of every aggregated
    row whose value is exactly 1.0.

    NOTE(review): which months end up equal to 1.0 depends on the aggregation
    performed by group_dataframe (defined elsewhere) — confirm.

    Args:
        df: frame with one column per station, indexed by date.
        station: column name of the station to inspect.

    Returns:
        list of 'date' values taken from the aggregated frame.
    """
    series = df[station]
    gaps = series[series.isnull()]
    gaps.iloc[:] = 1.0
    monthly_flags = group_dataframe(gaps, 'M').reset_index(level=0)
    is_flagged = monthly_flags[station] == 1.0
    return monthly_flags.loc[is_flagged, 'date'].tolist()

In [None]:
# Per-month missing-value counts for every station of the observed data.
miss_count = missing_data_count(grouped)

# Propagated monthly TD0 values at the dates flagged as missing for TD0.
missing_dates = get_missing_dates(grouped, 'TD0')
propagated_df_M.loc[missing_dates, 'TD0'].values

In [None]:
dates

In [None]:
# x-axis values shared by every monthly plot in this cell.
dates = propagated_df_M.index.values
# Always plot these stations, then top the set up to 20 entries with random draws.
stations = {'TD0', 'EN5', 'BN1', 'SK2', 'KC2', 'HI2', 'RI2', 'WM6', 'EA1'}
while len(stations) < 20:
    # [1:] skips the first column of data.df (presumably not a station — TODO confirm).
    sample = np.random.choice(data.df.columns.values[1:], 1)[0]
    stations.add(sample)
    
# One new figure per station: dotted line = propagated_df_M, solid line = grouped_M.
for index, station in enumerate(stations):
    plt.figure(figsize=(12, 4))
    plt.plot(dates, propagated_df_M[station].values, color='black', linestyle='dotted')
    plt.plot(dates, grouped_M[station].values, color='black')
    plt.title(f'Station: {station}')
    plt.xlabel('date', fontsize=12)
    # NOTE(review): the label hard-codes NO2 while the notebook's `species`
    # setting is "SO2" — confirm which pollutant is actually loaded.
    plt.ylabel('NO$_{2}$ Concentrations (µg/m$^3$)', fontsize=12)
    plt.xticks(fontsize=11)
    plt.yticks(fontsize=11)

In [None]:
# x-axis values shared by every monthly plot in this cell.
dates = propagated_df_M.index.values
# Always plot these stations, then top the set up to 10 entries with random draws.
stations = {'TD0', 'EN5', 'BN1', 'SK2', 'KC2', 'HI2', 'RI2'}
while len(stations) < 10:
    # [1:] skips the first column of data.df (presumably not a station — TODO confirm).
    sample = np.random.choice(data.df.columns.values[1:], 1)[0]
    stations.add(sample)

# One new figure per station showing only the observed monthly series.
for index, station in enumerate(stations):
    plt.figure(figsize=(12, 4))
#     plt.plot(dates, propagated_df_M[station].values, color='black', linestyle='dotted')
    plt.plot(dates, grouped_M[station].values, color='black')
    # Fully transparent (alpha=0) line at the station mean, drawn over every
    # date; presumably there to keep the x-axis spanning the whole period even
    # where the observed series is NaN — TODO confirm.  Its length is derived
    # from `dates` (previously a hard-coded 301) so the cell keeps working when
    # the date range changes.
    station_mean = np.nanmean(grouped_M[station].values)
    plt.plot(dates, [station_mean] * len(dates), color='red', alpha=0.0)
    plt.title(f'Station: {station}')
    plt.xlabel('Date', fontsize=12)
    plt.ylabel('NO$_{2}$ Concentrations (µg/m$^3$)', fontsize=12)
    plt.xticks(fontsize=11)
    plt.yticks(fontsize=11)
#     plt.xlim(date(1996, 1, 1), date(2021, 1, 1))

In [None]:
# x-axis values shared by every monthly plot in this cell.
dates = propagated_df_M.index.values
# Always plot these stations, then top the set up to 20 entries with random draws.
stations = {'TD0', 'EN5', 'BN1', 'SK2', 'KC2', 'HI2'}
while len(stations) < 20:
    # [1:] skips the first column of data.df (presumably not a station — TODO confirm).
    sample = np.random.choice(data.df.columns.values[1:], 1)[0]
    stations.add(sample)
    
# Two figures per station, numbered 2*index and 2*index+1.
for index, station in enumerate(stations):
    # Figure A: propagated (dotted) vs observed (solid) monthly series.
    plt.figure(2*index, figsize=(12, 4))
    plt.plot(dates, propagated_df_M[station].values, color='black', linestyle='dotted')
    plt.plot(dates, grouped_M[station].values, color='black')
    
    # Red dots: propagated values at the dates get_missing_dates flags for this station.
    missing_dates = get_missing_dates(grouped, station)
    plt.scatter(missing_dates, propagated_df_M[station][missing_dates].values, marker='o', color='r', s = 3.0)
    
    plt.title(f'Station: {station}')
    plt.xlabel('date', fontsize=10)
    plt.ylabel('NO$_{2}$ Concentrations (µg/m$^3$)', fontsize=10)
    
    # Figure B: propagated values coloured by the per-month missing-value count.
    plt.figure(2*index+1, figsize=(12, 4))
    plt.scatter(dates, propagated_df_M[station].values, c=miss_count[station].values, marker='o', s=5.0, cmap='viridis')
    plt.title(f'Station: {station}')
    plt.xlabel('date', fontsize=10)
    plt.ylabel('NO$_{2}$ Concentrations (µg/m$^3$)', fontsize=10)
    plt.colorbar()

### Borough Plots

In [None]:
grouped

In [None]:
# Colours of matplotlib's current default property cycle, in cycle order.
prop_cycle = plt.rcParams['axes.prop_cycle'].by_key()['color']

In [None]:
# x-axis values shared by every borough plot in this cell.
dates = propagated_df_M.index.values
# Codes '1'..'33' key both site_map and location_map (defined elsewhere);
# presumably one code per London borough — TODO confirm.
for i in range(1, 34):
    code = str(i)
    # Keep only the sites of this code that exist in the monthly propagated frame.
    cols = [site for site in site_map[code] if site in propagated_df_M.columns]
#     for col in cols
    plt.figure(figsize=(10, 4))
    for j, col in enumerate(cols):
        # Same colour for both lines of a site; wraps around when there are
        # more sites than colours in the cycle.
        color = prop_cycle[j % len(prop_cycle)]
        # Solid = observed monthly series (labelled), dashed = propagated series.
        plt.plot(dates, grouped_M[col].values, color=color, label=f'{col}', linewidth=1)
        plt.plot(dates, propagated_df_M[col].values, color=color, linestyle='dashed', linewidth=1)
    plt.title(f'{location_map[code]}', fontsize=13)
    plt.ylabel("NO$_{2}$ Concentrations (µg/m$^3$)", fontsize=11)
    plt.xlabel("Date", fontsize=11)
    plt.legend()
    
    #     ax = propagated_df_M[cols].plot(figsize=(10, 4), title=f'{location_map[code]}', fontsize=5)
#     ax.set_title(f'{location_map[code]}', fontsize=13)
#     ax.set_ylabel("NO$_{2}$ Concentrations (µg/m$^3$)", fontsize=11)
#     ax.set_xlabel("Date", fontsize=11)
#     ax.tick_params(axis='both', which='major', labelsize=9.5)

In [None]:
# x-axis values shared by every borough plot in this cell.
dates = propagated_df_M.index.values
# Codes '1'..'33' key both site_map and location_map (defined elsewhere);
# presumably one code per London borough — TODO confirm.
for i in range(1, 34):
    code = str(i)
    # Keep only the sites of this code that exist in the monthly propagated frame.
    cols = [site for site in site_map[code] if site in propagated_df_M.columns]
#     for col in cols
    plt.figure(figsize=(10, 4))
    for j, col in enumerate(cols):
        # Wrap around when there are more sites than colours in the cycle.
        color = prop_cycle[j % len(prop_cycle)]
        # Observed monthly series only; the propagated line is disabled below.
        plt.plot(dates, grouped_M[col].values, color=color, label=f'{col}', linewidth=1)
#         plt.plot(dates, propagated_df_M[col].values, color=color, linestyle='dashed', linewidth=1)
    plt.title(f'{location_map[code]}', fontsize=13)
    plt.ylabel("NO$_{2}$ Concentrations (µg/m$^3$)", fontsize=12)
    plt.xlabel("Date", fontsize=12)
    plt.xticks(fontsize=11)
    plt.yticks(fontsize=11)
    plt.legend()

### Similar Station Plots

In [None]:
# For every column of A, the labels of the five highest entries in that
# station's column of `similarity`, highest first.
similar_stations = {
    station: similarity[station].sort_values(ascending=False).index[:5].tolist()
    for station in A.columns
}

In [None]:
# Colours of matplotlib's default property cycle (also rebinds prop_cycle).
colour_cycle = prop_cycle = [x['color'] for x in plt.rcParams['axes.prop_cycle']]
# Stations visited by the loop; not read anywhere in this cell.
plotted_stations = set()

# `i` is the figure number; each station consumes two figures.
i = 0
for station, similars in similar_stations.items():
    plotted_stations.add(station)
    
    # Figure i: propagated monthly series of the station and of its similar stations.
    plt.figure(i, figsize=(12, 4))
    plt.plot(dates, propagated_df_M[station].values, label=station, color=colour_cycle[0])
    for j, similar in enumerate(similars):
        # Colour 0 is reserved for the station itself, hence j+1.
        plt.plot(dates, propagated_df_M[similar].values, label=similar, color=colour_cycle[j+1])
    plt.title(f'Similar stations to {station}')
    plt.xlabel('date', fontsize=10)
    plt.ylabel('NO$_{2}$ Concentrations (µg/m$^3$)', fontsize=10)
    plt.legend()
    
    # Figure i+1: the same comparison on the observed monthly series (no title set).
    plt.figure(i+1, figsize=(12, 4))
    plt.plot(dates, grouped_M[station].values, label=station, color=colour_cycle[0])
    for j, similar in enumerate(similars):
        plt.plot(dates, grouped_M[similar].values, label=similar, color=colour_cycle[j+1])
    plt.xlabel('date', fontsize=10)
    plt.ylabel('NO$_{2}$ Concentrations (µg/m$^3$)', fontsize=10)
    plt.legend()
    
    # Stop after 20 stations (40 figures).
    i += 2
    if i == 40:
        break

In [None]:
# Colours of matplotlib's default property cycle (also rebinds prop_cycle).
colour_cycle = prop_cycle = [x['color'] for x in plt.rcParams['axes.prop_cycle']]
# Stations visited by the loop; not read anywhere in this cell.
plotted_stations = set()

# `i` is the figure number; each station consumes two figure numbers.
i = 0
for station, similars in similar_stations.items():
    plotted_stations.add(station)
    
    # Figure i: observed (solid, labelled) and propagated (dashed) monthly
    # series of each similar station; the station's own lines are disabled.
    plt.figure(i, figsize=(12, 4))
#     plt.plot(dates, grouped_M[station].values, color=colour_cycle[0], label=f'{station}', linewidth=1)
#     plt.plot(dates, propagated_df_M[station].values, color=colour_cycle[0], linestyle='dashed', linewidth=1)
    for j, similar in enumerate(similars):
        # j+1 keeps colour 0 free for the (currently disabled) station line.
        plt.plot(dates, grouped_M[similar].values, color=colour_cycle[j+1], label=f'{similar}', linewidth=1)
        plt.plot(dates, propagated_df_M[similar].values, color=colour_cycle[j+1], linestyle='dashed', linewidth=1)
#         plt.plot(dates, propagated_df_M[similar].values, label=similar, color=colour_cycle[j+1])
#     plt.title(f"Stations similar to: '{station}'")
    plt.xlabel('date', fontsize=10)
    plt.ylabel('NO$_{2}$ Concentrations (µg/m$^3$)', fontsize=10)
    plt.legend()
    
    # Figure i+1 is opened before the map is drawn.
    # NOTE(review): plot_on_map receives its own figsize, so it may create a
    # separate figure and leave this one empty — confirm.
    plt.figure(i+1)
    similarity_list = similarity[station]
    # Work on a copy so the site metadata frame is not modified.
    london_sites_gdf_sim = london_sites_gdf.copy()
    london_sites_gdf_sim['Similarity'] = np.nan
    # Attach each similarity value to the site row(s) with the matching '@SiteCode'.
    for index, sim_val in similarity_list.items():
        london_sites_gdf_sim.loc[london_sites_gdf_sim['@SiteCode'] == index, 'Similarity'] = sim_val
    # Drop sites that received no similarity value.
    london_sites_gdf_sim = london_sites_gdf_sim[~london_sites_gdf_sim['Similarity'].isna()]

    plot_on_map(london_sites_gdf_sim, london_gdf, data_column='Similarity', colorbar=True,
                title=f"Similarity map: '{station}'", 
                data_markersize=5, fontsize=15,
                map_edge_color="gray", figsize=(15,7), axis="on", mark=station)
    
    
    # Stop after 10 stations.
    i += 2
    if i == 20:
        break

## Plot similar stations on land use map 

In [None]:
# Load the land-use polygon shapefile (gis_osm_landuse_a_free_1.shp) that
# serves as the base layer for the land-use maps below.
london_landuse = gpd.read_file(path.join(folder, "gis_osm_landuse_a_free_1.shp"))
print(london_landuse.shape)

In [None]:
london_landuse

In [None]:
# Draw the land-use polygons coloured by their 'fclass' value using the default
# colormap; the legend is anchored outside the right edge of the axes.
london_landuse.plot(figsize=(10,10), column='fclass', legend=True, legend_kwds={'loc': 'center right', 'bbox_to_anchor':(1.3,0.5)})
plt.title('Greater London Land-use Map')

In [None]:
# One colour per land-use class name, listed alphabetically.  Related classes
# share a colour (e.g. all green spaces are 'green').
land_palette = {
    'allotments': '#002fff',
    'cemetery': 'gray',
    'commercial': 'orange',
    'farmland': '#002fff',
    'farmyard': '#002fff',
    'forest': 'green',
    'grass': 'green',
    'heath': 'green',
    'industrial': '#4B0092',
    'meadow': 'green',
    'military': '#4B0092',
    'nature_reserve': 'green',
    'orchard': 'pink',
    'park': 'green',
    'quarry': 'gray',
    'recreation_ground': 'green',
    'residential': '#E3E3E3',
    'retail': 'orange',
    'scrub': 'green',
}
# The colormap keeps only the colours, in the dict's insertion order.
# NOTE(review): this presumably relies on the 'fclass' categories of
# london_landuse matching these keys in the same order — confirm.
cmap = matplotlib.colors.ListedColormap(list(land_palette.values()))

london_landuse.plot(
    figsize=(10,10),
    column='fclass',
    legend=True,
    cmap=cmap,
    legend_kwds={'loc': 'center right', 'bbox_to_anchor':(1.3,0.5)},
)
plt.title('Greater London Land-use Map')

In [None]:
def plot_on_osm_map(data_geodataframe, map_geodataframe, cmap, figsize=(20,10),
                    colorbar=False, data_column='Similarity',
                    title='LAQN Monitoring Station Distribution',
                    mark=None, similars=None):
    """Plot stations as black crosses on top of a land-use base map.

    The figure is saved to ``images/{mark}_similarity.png`` (so
    ``images/None_similarity.png`` when ``mark`` is not given; the ``images``
    directory must already exist) and then shown.

    Args:
        data_geodataframe: stations to draw; must have an '@SiteCode' column
            when ``mark`` is used and ``data_column`` when ``colorbar`` is set.
        map_geodataframe: base-map polygons with an 'fclass' column.
        cmap: colormap applied to the 'fclass' classes of the base map.
        figsize: size of the figure created for the base map.
        colorbar: when true, add a colorbar spanning the min..max of
            ``data_column``.
        data_column: column used for the colorbar range and label.
        title: figure title.
        mark: site code to highlight with a filled circle; also used in the
            output file name.
        similars: optional collection of similar station codes appended to the
            title when ``mark`` is given.
    """
    land_use_ax = map_geodataframe.plot(figsize=figsize,
                                        column='fclass',
                                        legend=False,
                                        cmap=cmap,
                                        alpha=0.5,
                                        legend_kwds={'loc': 'center right', 'bbox_to_anchor':(1.3,0.5)})
    base = data_geodataframe.plot(ax=land_use_ax,
                                  color='black', marker='x', markersize=75, linewidths=3)

    if colorbar:
        norm = plt.Normalize(data_geodataframe[data_column].min(),
                             data_geodataframe[data_column].max())
        # The mappable is not attached to any axes, so name the axes
        # explicitly; recent matplotlib refuses to guess where to put the bar.
        plt.colorbar(plt.cm.ScalarMappable(cmap=None, norm=norm),
                     ax=base).set_label(data_column)

    if mark:
        marked = data_geodataframe[data_geodataframe['@SiteCode'] == mark]
        marked.plot(ax=base, marker='o', color='black', markersize=100)

    # Use the `similars` argument (the original read an unrelated global named
    # `similar`).  len() is used rather than truthiness because `similars` may
    # be a NumPy array, whose truth value is ambiguous.
    if mark and similars is not None and len(similars) > 0:
        title = f'{title}\n Similar stations: {similars}'

    plt.suptitle(title, fontsize=20)
    plt.xlabel('Longitude', fontsize=14)
    plt.ylabel('Latitude', fontsize=14)
    plt.xticks(fontsize=12)
    plt.yticks(fontsize=12)
    plt.axis("on")
    plt.savefig(f'images/{mark}_similarity.png')
    plt.show()
    

In [None]:
similarity.max().max()

In [None]:
# # Get similarity maps

# for station, similars in similar_stations.items():
#     similarity_list = similarity[station]
#     london_sites_gdf_sim = london_sites_gdf.copy()
#     london_sites_gdf_sim['Similarity'] = np.nan
#     for index, sim_val in similarity_list.items():  
#         #ensure current station is most similar
#         if index == station:
#             london_sites_gdf_sim.loc[london_sites_gdf_sim['@SiteCode'] == index, 'Similarity'] = 100
#         else:
#             london_sites_gdf_sim.loc[london_sites_gdf_sim['@SiteCode'] == index, 'Similarity'] = sim_val
        
#     london_sites_gdf_sim = london_sites_gdf_sim[~london_sites_gdf_sim['Similarity'].isna()]
    
#     # MAP N MOST SIMILAR STATIONS
# #     data_count=10
# #     london_sites_gdf_sim = london_sites_gdf_sim.sort_values(by='Similarity', ascending=False)[:data_count]
    
#     # ... OR MAP STATIONS > 0.9*MAX
#     max_similarity = london_sites_gdf_sim.sort_values(by='Similarity', ascending=False).iloc[1]['Similarity']
#     london_sites_gdf_sim = london_sites_gdf_sim.loc[(london_sites_gdf_sim['Similarity'] >= 0.9*max_similarity)]
      
#     similars = london_sites_gdf_sim['@SiteCode'].values
#     similars = np.setdiff1d(similars, station)
#     plot_on_osm_map(london_sites_gdf_sim[:11], london_landuse, cmap, mark=station, title=f'LAQN NO$_2$ Dataset - Station {station}', similars=similars[:10])

### Tests

In [None]:
# Find stations in common: WA7 == WA8, LB4, KC5, WM6 == WMZ, NB1

# Draw each listed station on its own land-use map, titled with the site code.
# No `mark` is passed, so plot_on_osm_map saves every map to the same file name
# built from mark=None and each iteration overwrites the previous image.
stations = ['CD4', 'CD5', 'WA7', 'WA8', 'WM6', 'WMZ', 'CD9', 'KT3', 'NB1']
for i in stations:
    # Row(s) of the site metadata whose '@SiteCode' equals this code.
    test = london_sites_gdf[london_sites_gdf['@SiteCode'] == i]
    plot_on_osm_map(test, london_landuse, cmap, title = i)

In [None]:
# The 20 stations with the lowest mean similarity, lowest first.
# DataFrame.mean() is used instead of np.mean(similarity): on pandas >= 2.0
# np.mean reduces a DataFrame over both axes to a single scalar, which has no
# sort_values, whereas .mean() always yields one value per column.
similarity.mean().sort_values(ascending=True).head(20)

In [None]:
similar_stations

In [None]:
# count of similar stations over all stations
similar_station_count = {}

for station in A.columns:
    data_count = 10
    similars = similarity[station].sort_values(ascending=False)[:data_count].index.tolist()
    
    if station == 'TD0':
        print(similars)
    
    for i in similars:
        if i not in similar_station_count:
            similar_station_count[i] = 0
        similar_station_count[i] += 1

In [None]:
similar_station_count

In [None]:
grouped

In [None]:
# Write the CD4 and CD5 columns of `grouped` to random.csv in the working directory.
stations = ['CD4', 'CD5']
grouped[stations].to_csv('random.csv')