In [1]:
import numpy as np
import scipy
import pandas as pd
import requests
from os import makedirs, path, listdir, remove
from bs4 import BeautifulSoup, SoupStrainer
import zipfile as zpf
from shutil import rmtree
import matplotlib.pyplot as plt
from scipy.spatial.distance import squareform, pdist, cosine
from sklearn.metrics.pairwise import cosine_similarity
from scipy.optimize import minimize
import matplotlib
from matplotlib import cm
from datetime import datetime

import httplib2
import geopandas as gpd
from tqdm import tqdm

### Map Functions

In [2]:
def plot_on_map(data_geodataframe, map_geodataframe, 
                data_column=None, map_column=None, 
                data_cmap=None, map_cmap=None, 
                data_color=None, map_color="whitesmoke", 
                data_markersize=0.1, 
                map_edge_color="black", 
                colorbar=False, 
                title="Greater London", 
                fontsize="25", 
                figsize=(20,10), 
                axis="off",
                mark=None):
    """Draw ``data_geodataframe`` on top of ``map_geodataframe`` and show the figure.

    Parameters
    ----------
    data_geodataframe, map_geodataframe :
        Objects exposing a geopandas-style ``.plot``; the map is drawn first
        and the data layer is drawn on the same axes.
    data_column, map_column : optional
        Columns passed as ``column=`` to the respective ``.plot`` call.
    data_cmap, map_cmap, data_color, map_color, data_markersize, map_edge_color :
        Styling forwarded to the two ``.plot`` calls.
    colorbar : bool
        When true, add a colorbar spanning the min/max of ``data_column``.
        ``data_column`` must then name a column of ``data_geodataframe``.
    title, fontsize, figsize, axis :
        Figure title, its font size, the figure size, and the argument
        handed to ``plt.axis``.
    mark : optional
        Value matched against the ``'@SiteCode'`` column; matching rows are
        re-drawn as black crosses.
    """
    map_axes = map_geodataframe.plot(column=map_column,
                                     figsize=figsize,
                                     color=map_color,
                                     edgecolor=map_edge_color,
                                     cmap=map_cmap)
    base = data_geodataframe.plot(column=data_column,
                                  ax=map_axes,
                                  color=data_color,
                                  cmap=data_cmap,
                                  markersize=data_markersize)

    if colorbar:
        values = data_geodataframe[data_column]
        norm = plt.Normalize(values.min(), values.max())
        mappable = plt.cm.ScalarMappable(cmap=data_cmap, norm=norm)
        # The mappable is not attached to any axes, so tell Matplotlib
        # explicitly which axes to take the colorbar space from; recent
        # Matplotlib versions raise instead of guessing.
        plt.colorbar(mappable, ax=base).set_label(data_column)

    if mark:
        marked = data_geodataframe[data_geodataframe['@SiteCode'] == mark]
        marked.plot(ax=base, marker='x', color='black', markersize=15)

    plt.suptitle(title, fontsize=fontsize)
    plt.xlabel('Longitude', fontsize=14)
    plt.ylabel('Latitude', fontsize=14)
    plt.xticks(fontsize=12)
    plt.yticks(fontsize=12)
    plt.axis(axis)
    plt.show()

In [1]:
def plot_on_osm_map(data_geodataframe, map_geodataframe, cmap, figsize=(20,10), colorbar=False, data_column='Similarity', title='LAQN Monitoring Station Distribution', mark=None, similars=None):
    """Plot stations as black crosses over a map layer, save the figure, and show it.

    Parameters
    ----------
    data_geodataframe :
        Station layer; must have an ``'@SiteCode'`` column when ``mark`` is
        used and ``data_column`` when ``colorbar`` is true.
    map_geodataframe :
        Background layer; coloured by its ``'fclass'`` column using ``cmap``.
    cmap :
        Colormap for the background layer.
    figsize : tuple
        Figure size passed to the background plot.
    colorbar : bool
        When true, add a colorbar spanning the min/max of ``data_column``
        (drawn with Matplotlib's default colormap).
    data_column : str
        Column used for the colorbar range and label.
    title : str
        Figure title.
    mark : optional
        Site code to highlight with a filled circle. It is also used in the
        output file name ``images/{mark}_similarity.png``.
    similars : optional
        Collection of similar station identifiers; appended to the title when
        ``mark`` is given and ``similars`` is non-empty.
    """
    map_axes = map_geodataframe.plot(figsize=figsize,
                                     column='fclass',
                                     legend=False,
                                     cmap=cmap,
                                     alpha=0.5,
                                     legend_kwds={'loc': 'center right', 'bbox_to_anchor':(1.3,0.5)})
    base = data_geodataframe.plot(ax=map_axes,
                                  color='black', marker='x', markersize=75, linewidths=3)

    if colorbar:
        values = data_geodataframe[data_column]
        norm = plt.Normalize(values.min(), values.max())
        mappable = plt.cm.ScalarMappable(cmap=None, norm=norm)
        # Free-standing mappable: name the axes to steal space from, since
        # recent Matplotlib versions no longer fall back to the current axes.
        plt.colorbar(mappable, ax=base).set_label(data_column)

    if mark:
        marked = data_geodataframe[data_geodataframe['@SiteCode'] == mark]
        marked.plot(ax=base, marker='o', color='black', markersize=100)

    # The original tested an undefined name (`similar`), which raised
    # NameError as soon as `mark` was provided. Use len() rather than
    # truthiness so array-like `similars` values are also accepted.
    has_similars = similars is not None and len(similars) > 0
    if mark and has_similars:
        title = f'{title}\n Similar stations: {similars}'

    plt.suptitle(title, fontsize=20)
    plt.xlabel('Longitude', fontsize=14)
    plt.ylabel('Latitude', fontsize=14)
    plt.xticks(fontsize=12)
    plt.yticks(fontsize=12)
    plt.axis("on")
    # savefig does not create missing directories.
    makedirs('images', exist_ok=True)
    plt.savefig(f'images/{mark}_similarity.png')
    plt.show()

### Graph Adjacency

In [None]:
class Dataset():
    """Wrapper around a CSV time series with a ``date`` column.

    Attributes
    ----------
    df : pandas.DataFrame
        The loaded data with ``date`` converted to datetimes.
    orig : pandas.DataFrame
        Copy of the data exactly as read from the CSV (before the
        ``date`` conversion).
    """

    def __init__(self, csv_file):
        """Read ``csv_file`` and parse its ``date`` column."""
        self.df = pd.read_csv(csv_file)
        self.orig = self.df.copy()
        self.df['date'] = pd.to_datetime(self.df.date)
        
    def drop_null(self, nan_percent):
        """Return the data without columns whose NaN share exceeds ``nan_percent``.

        A column is kept when ``nan_count / n_rows * 100 <= nan_percent``.
        The comparison is done on counts (no division), so a column that
        sits exactly on the limit is kept and ``nan_percent=0`` keeps
        every fully populated column.
        """
        nan_counts = self.df.isna().sum()
        keep = nan_counts * 100 <= nan_percent * self.df.shape[0]
        return self.df.loc[:, keep.to_numpy()]
        
    def fill_mean(self):
        """Return the data with NaNs replaced by each numeric column's mean."""
        # numeric_only keeps the date (and any text) column out of the mean.
        return self.df.fillna(self.df.mean(numeric_only=True))
    
    def group(self, freq):
        """Average the data over periods of ``freq`` (a pandas offset alias).

        Periods without data stay NaN; nothing is filled here.
        """
        return self.df.groupby(pd.Grouper(key="date", freq=freq)).mean()
    
    def group_and_fill(self, freq):
        """Like :meth:`group`, then forward-fill and back-fill remaining NaNs."""
        return self.group(freq).ffill().bfill()
    
    def fill(self):
        """Fill NaNs with the column mean of the same calendar month and year.

        Values still missing afterwards (months with no data at all) are
        forward-filled and then back-filled.
        """
        df = self.df.copy()
        dates = self.df['date']
        # Build the (year, month) grouping once from the untouched data
        # instead of once per column.
        by_month = self.df.groupby([dates.dt.year.rename('year'),
                                    dates.dt.month.rename('month')])
        for col in df.columns.drop('date'):
            df[col] = df[col].fillna(by_month[col].transform('mean'))
        return df.ffill().bfill()

In [None]:
class ComputeAM():
    """Helpers for building adjacency matrices between the columns of a frame."""

    def __init__(self, df):
        """Create ``self.am``: a zero matrix labelled by ``df``'s columns."""
        am_shape = (df.shape[1], df.shape[1])
        self.am = pd.DataFrame(np.zeros(shape=am_shape), columns=df.columns, index=df.columns)
    
    def euclidean_dist(self, df):
        """Return the pairwise Euclidean distance between the columns of ``df``."""
        # np.linalg.norm(complete['TD0'].values - complete['BG3'].values) #test euclidean distance between two columns
        dist_arr = squareform(pdist(df.transpose()))
        return pd.DataFrame(dist_arr, columns=df.columns.unique(), index=df.columns.unique())
    
    def cosine_dist(self, df):
        """Return the pairwise cosine *similarity* between columns, diagonal set to 0.

        Despite the name, the values come from ``cosine_similarity`` and are
        not converted to distances.
        """
        dist_arr = cosine_similarity(df.transpose())
        np.fill_diagonal(dist_arr, 0)
        return pd.DataFrame(dist_arr, columns=df.columns.unique(), index=df.columns.unique())
    
    def threshold_euclidean(self, df, threshold):
        """Binarise a distance matrix in place: 1 where ``value < threshold``, else 0.

        The diagonal is always 0. ``df`` itself is modified and returned,
        as before.
        """
        adjacency = np.where(df.to_numpy() >= threshold, 0, 1)
        # Zero the diagonal on our own array: writing through ``df.values``
        # is lost when pandas returns a copy and fails when it returns a
        # read-only view.
        np.fill_diagonal(adjacency, 0)
        for position, col in enumerate(df.columns):
            df[col] = adjacency[:, position]
        return df
    
    def diagonal_degree(self, df):
        """Return a diagonal matrix holding the column sums of ``df``."""
        diag_series = np.diag(df.sum())
        degree_mat = pd.DataFrame(diag_series, columns=df.columns.unique(), index=df.columns.unique())
        return degree_mat

### Create Control Set

In [None]:
def _leading_valid_lengths(null_mask):
    """Per column, count the non-null rows before the first null in ``null_mask``."""
    n_rows = null_mask.shape[0]
    if n_rows == 0:
        return np.zeros(null_mask.shape[1], dtype=int)
    first_null = null_mask.argmax(axis=0)
    # argmax returns 0 for a column without any null; such a column is
    # valid for the whole window, not "null at row 0".
    first_null[~null_mask.any(axis=0)] = n_rows
    return first_null


def get_test_set(df, num_valid_values=500):
    """Pick a gap-free window of ``num_valid_values`` rows covering as many columns as possible.

    Start offsets 0, 5, 10, ... are tried. For each offset a column
    qualifies when it has more than ``num_valid_values`` consecutive
    non-null rows starting there. The first offset with the most
    qualifying columns wins.

    Returns
    -------
    test_set : pandas.DataFrame
        Rows ``[start, start + num_valid_values)`` of the qualifying columns.
    max_cols : pandas.Index
        Labels of the qualifying columns.
    """
    null_mask = df.isnull().to_numpy()

    max_size = 0
    max_index = 0
    for start in range(0, null_mask.shape[0], 5):
        size = int((_leading_valid_lengths(null_mask[start:]) > num_valid_values).sum())
        if size > max_size:
            max_size = size
            max_index = start

    eligible = _leading_valid_lengths(null_mask[max_index:]) > num_valid_values
    max_cols = df.columns[eligible]
    test_set = df.loc[:, eligible].iloc[max_index:max_index + num_valid_values]
    return test_set, max_cols

In [1]:
def force_gaps(test_set, proportion=0.25, seed=0):
    """Blank out a random share of the cells of ``test_set``.

    The global NumPy random state is seeded with ``seed`` and
    ``int(proportion * test_set.size)`` distinct cells are chosen.

    Returns
    -------
    nan_entries : list of (row, column) tuples
        Integer positions of the blanked cells.
    initial : list
        The values those cells held before being blanked, in the same order.
    testing : pandas.DataFrame
        Copy of ``test_set`` with the chosen cells set to NaN.
    """
    np.random.seed(seed)

    total_cells = test_set.size
    column_count = test_set.shape[1]
    gap_count = int(proportion * total_cells)

    # Draw flat cell numbers, then unravel them into (row, column) positions.
    flat_positions = np.random.choice(np.arange(total_cells), gap_count, replace=False)
    nan_entries = [(flat // column_count, flat % column_count) for flat in flat_positions]

    testing = test_set.copy()
    # Positions are distinct, so every value can be read before any is blanked.
    initial = [testing.iloc[cell] for cell in nan_entries]
    for cell in nan_entries:
        testing.iloc[cell] = np.nan

    return nan_entries, initial, testing

### Graph Propagation

In [None]:
class GraphPropagation():
    """Helpers that turn a similarity matrix into graph matrices."""

    def __init__(self):
        pass
    
    def threshold_am(self, df, threshold):
        """Return a 0/1 adjacency matrix: 1 where ``value >= threshold``, else 0.

        The diagonal is always 1 (self-loops). ``df`` is left untouched; the
        result carries the same index and columns.
        """
        adjacency = np.where(df.to_numpy() >= threshold, 1, 0)
        # Set the diagonal on our own array. Writing through
        # ``DataFrame.values`` is silently lost when pandas hands back a
        # copy and raises when it hands back a read-only view.
        np.fill_diagonal(adjacency, 1)
        return pd.DataFrame(adjacency, index=df.index, columns=df.columns)
    
    def diagonal_degree(self, df):
        """Return a diagonal matrix holding the column sums of ``df``."""
        diag_series = np.diag(df.sum())
        result = pd.DataFrame(diag_series, columns=df.columns.unique(), index=df.columns.unique())
        return result

In [None]:
# GRAPH PROPAGATION ALGORITHM

def D_pow(mat, power):
    """Return ``mat`` raised to the (possibly fractional) ``power``."""
    # Import the submodule explicitly: a bare ``import scipy`` does not
    # guarantee that ``scipy.linalg`` is available as an attribute.
    from scipy.linalg import fractional_matrix_power
    return fractional_matrix_power(mat, power)

def basic_graph_propagation(X, A, w, L, a=0.5, b=0.5):
    """Propagate ``X`` over the graph ``A`` as a weighted sum of hops.

    With ``P = D^-a A D^-b`` (``D`` being the row sums of ``A``) the result is

        pi = sum_{i=0..L} (w[i] / sum(w)) * P^i applied to each row of X

    This is what the residue/reserve recurrence
    ``q_i = w_i / Y_i * r_i``, ``r_{i+1} = (Y_{i+1} / Y_i) * P r_i``
    (``Y_i = sum(w[i:])``, ``r_0 = X``) evaluates to. The previous code
    multiplied the residue by ``Y_i / Y_{i+1}`` instead, which inflated
    later hops and divided by zero whenever trailing weights were 0.
    NOTE(review): the scheme looks like the "basic propagation" of
    Approximate Graph Propagation; confirm against the intended reference.

    Parameters
    ----------
    X : array-like, shape (n_samples, n_nodes) or (n_nodes,)
        Signal to propagate; one column per graph node.
    A : array-like, shape (n_nodes, n_nodes)
        Adjacency matrix (a DataFrame is accepted).
    w : sequence of float
        Hop weights. Entries beyond the end are treated as 0, so ``L`` may
        exceed ``len(w) - 1``.
    L : int
        Highest hop included.
    a, b : float
        Exponents of the left and right degree normalisation.

    Returns
    -------
    numpy.ndarray
        Float array shaped like ``X``. All-NaN when the weights sum to 0
        (the recurrence is undefined in that case).
    """
    features = np.asarray(X, dtype=float)
    adjacency = np.asarray(A, dtype=float)
    weights = np.asarray(w, dtype=float)
    if weights.size == 0:
        raise ValueError("w must contain at least one weight")

    degrees = adjacency.sum(axis=1)
    with np.errstate(divide="ignore", invalid="ignore"):
        left = degrees ** -a
        right = degrees ** -b
    # Isolated nodes (degree 0) neither send nor receive anything.
    left[~np.isfinite(left)] = 0.0
    right[~np.isfinite(right)] = 0.0
    prop_matrix = np.nan_to_num(left[:, None] * adjacency * right[None, :])

    total = weights.sum()
    if total == 0:
        return np.full_like(features, np.nan)

    last_hop = min(int(L), weights.size - 1)
    pi = np.zeros_like(features)
    hop = features  # P^i X for the current i
    for i in range(last_hop + 1):
        if i > 0:
            hop = prop_matrix.dot(hop.T).T
        pi += (weights[i] / total) * hop
    return pi

In [None]:
def fill_and_refactor(gap_data):
    """Fill gaps and derive a column-to-column similarity matrix.

    NaNs in ``gap_data`` are forward-filled then back-filled. Pairwise
    Euclidean distances between the filled columns are turned into
    similarities as ``mean_distance / distance`` (larger means more
    similar), with the diagonal set to 0.

    Returns
    -------
    filled_data : pandas.DataFrame
        ``gap_data`` after filling.
    refactored : pandas.DataFrame
        Similarity matrix labelled like the distance matrix.
    """
    filled_data = gap_data.ffill().bfill()
    # Initially, the larger the value, the more distant and the less similar.
    euclidean_am = ComputeAM(filled_data).euclidean_dist(filled_data)

    mean = euclidean_am.mean().mean()
    # The diagonal (distance 0) divides by zero; it is overwritten below,
    # so silence the warning rather than emit it on every call.
    with np.errstate(divide="ignore", invalid="ignore"):
        similarity = mean / euclidean_am.to_numpy()
    # Work on a plain array: writing the diagonal through
    # ``DataFrame.values`` depends on pandas returning a writable view.
    np.fill_diagonal(similarity, 0)
    refactored = pd.DataFrame(similarity, index=euclidean_am.index, columns=euclidean_am.columns)
    return filled_data, refactored

def get_L(matrix, max_power=10):
    """Return the smallest ``i`` for which ``matrix + ... + matrix**i`` has no zero entry.

    For a non-negative adjacency matrix this is the number of hops needed
    for every node to reach every other node. The search stops at
    ``max_power``, which is returned if full coverage was not reached
    earlier. An empty matrix yields 0.

    Reachability is tracked with booleans instead of accumulating integer
    matrix powers, whose entries can overflow silently on larger graphs.
    NOTE(review): this matches the summed powers only for matrices without
    negative entries (no cancellation).
    """
    adjacency = np.asarray(matrix) != 0
    if adjacency.size == 0:
        return 0
    if adjacency.ndim != 2 or adjacency.shape[0] != adjacency.shape[1]:
        raise np.linalg.LinAlgError("matrix must be square")

    step = adjacency.astype(float)
    walks = adjacency           # pairs joined by a walk of exactly `power` hops
    reached = adjacency.copy()  # pairs joined by a walk of at most `power` hops
    power = 1
    while not reached.all() and power < max_power:
        walks = (walks.astype(float) @ step) > 0
        reached |= walks
        power += 1
    return power

def compute_progation_matrix(data, euclideans, threshold, L=None, alpha=None, w=np.array([1, 0, 0, 0])):
    """Threshold the similarity matrix and propagate ``data`` over the resulting graph.

    Parameters
    ----------
    data : pandas.DataFrame
        Signal to propagate; converted with ``to_numpy()``.
    euclideans : pandas.DataFrame
        Similarity matrix passed to ``GraphPropagation.threshold_am``.
    threshold : float
        Similarity cut-off for an edge.
    L : int, optional
        Number of hops. When ``None`` it is derived with ``get_L``. An
        explicit ``0`` is respected (previously it was treated as unset).
    alpha : float, optional
        When truthy, the weights become ``alpha * (1 - alpha)**i``.
    w : array-like
        Hop weights used when ``alpha`` is not given.

    Returns
    -------
    Z : numpy.ndarray
        Propagated data.
    A : pandas.DataFrame
        The thresholded adjacency matrix.
    """
    A = GraphPropagation().threshold_am(euclideans, threshold)

    if L is None:
        L = get_L(A)

    # The propagation reads w[0..L], so make sure that many weights exist.
    if alpha:
        w = [alpha * (1 - alpha) ** i for i in range(max(10, L + 1))]
    else:
        w = np.asarray(w, dtype=float)
        if w.size < L + 1:
            w = np.pad(w, (0, L + 1 - w.size))

    # Apply algorithm
    Z = basic_graph_propagation(data.to_numpy(), A, w, L)
    return Z, A

### Errors

In [None]:
def rmse_error(initial, final):
    """Return the root-mean-square error between ``initial`` and ``final``.

    The mean is taken over every element, so multi-dimensional inputs are
    normalised by their total size rather than by their first dimension.
    """
    diff = np.asarray(initial, dtype=float) - np.asarray(final, dtype=float)
    return np.sqrt(np.mean(np.square(diff)))

def smape_error(initial, final):
    """Return the symmetric mean absolute percentage error as a fraction in [0, 2].

    Each term is ``|initial - final| / ((|initial| + |final|) / 2)``. A term
    whose two values are both 0 counts as 0 error instead of turning the
    whole score into NaN.
    """
    initial = np.asarray(initial, dtype=float)
    final = np.asarray(final, dtype=float)
    num = np.absolute(initial - final)
    den = (np.absolute(initial) + np.absolute(final)) / 2
    # den == 0 implies num == 0, so leaving those slots at 0 is exact.
    elems = np.divide(num, den, out=np.zeros_like(num), where=den != 0)
    return np.sum(elems) / elems.size

def compute_error(alpha, threshold, L, initial, nan_entries, data, euclideans, error_type='rmse'):
    """Propagate ``data`` and score the values recovered at ``nan_entries``.

    Parameters
    ----------
    alpha : float
        Decay of the hop weights ``alpha * (1 - alpha)**i``.
    threshold : float
        Similarity cut-off used to build the adjacency matrix.
    L : int
        Number of propagation hops.
    initial : sequence
        True values at ``nan_entries``, in the same order.
    nan_entries : sequence of (row, column)
        Positions in the propagated array to compare.
    data : pandas.DataFrame
        Data to propagate.
    euclideans : pandas.DataFrame
        Similarity matrix.
    error_type : {'rmse', 'smape'}
        Error measure to return.

    Raises
    ------
    ValueError
        If ``error_type`` is not one of the supported names (previously
        this surfaced as an ``UnboundLocalError`` after the computation).
    """
    error_functions = {'rmse': rmse_error, 'smape': smape_error}
    if error_type not in error_functions:
        raise ValueError(f"unknown error_type {error_type!r}; expected 'rmse' or 'smape'")

    A = GraphPropagation().threshold_am(euclideans, threshold)
    # The propagation reads w[0..L]; keep the original 10 weights and only
    # extend the list when L needs more.
    w = [alpha*(1-alpha)**i for i in range(max(10, L + 1))]

    # Apply algorithm
    Z = basic_graph_propagation(data.to_numpy(), A, w, L)

    final = [Z[entry] for entry in nan_entries]
    return error_functions[error_type](initial, final)

### Missing Data

In [None]:
def missing_data_count(df):
    """Count missing values per column for every (year, month) pair.

    The result is shaped and labelled like ``group_dataframe2(df, 'M')``;
    row ``i`` receives the NaN counts of the ``i``-th (year, month) pair.

    NOTE(review): relies on ``group_dataframe2``, ``years`` and ``months``
    being defined elsewhere in the notebook, and presumably on the monthly
    frame having exactly one row per (year, month) pair in that order.
    Verify against those definitions.
    """
    grouped_M = group_dataframe2(df, 'M')
    res = pd.DataFrame(index=grouped_M.index, columns=grouped_M.columns)

    # Move the first index level (read back as 'date' below) into a column.
    test = df.reset_index(level=0)
    date_years = test['date'].dt.year
    date_months = test['date'].dt.month

    i = 0
    for year in years:
        for month in months:
            sample_df = test[(date_years == year) & (date_months == month)]
            # [1:] drops the count for the leading 'date' column.
            res.iloc[i] = sample_df.isna().sum().tolist()[1:]
            i += 1

    return res

def get_missing_dates(df, station):
    """Return the grouped dates whose value is 1.0 after flagging missing readings.

    Every missing reading of ``station`` is replaced by the flag 1.0, the
    flags are passed through ``group_dataframe2(..., 'M')``, and the
    ``date`` values of rows still equal to 1.0 are returned as a list.

    NOTE(review): ``group_dataframe2`` is defined elsewhere; presumably it
    averages per month, so a month with any missing reading yields 1.0.
    Confirm against its definition.
    """
    readings = df[station]
    flags = readings[readings.isnull()]
    flags.iloc[:] = 1.0
    monthly = group_dataframe2(flags, 'M').reset_index(level=0)
    return monthly.loc[monthly[station] == 1.0, 'date'].tolist()