In [None]:
import os, sys
import time

# machine learning
import tensorflow as tf
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import TensorBoard

# data manipulation and signal processing
import math
import random
import pandas as pd
import numpy as np
import scipy
from scipy import signal
import scipy.stats as ss

# plots
import seaborn as sns
import matplotlib
import matplotlib.pyplot as plt
import folium


# path = "/content/drive/Shareddrives/covid.eng.pdn.ac.lk/COVID-AI (PG)/spatio_temporal/Covid19_DL_Forecasting_Codes"
# os.chdir(path)
sys.path.insert(0, os.path.join(sys.path[0], '..'))
from utils.plots import bar_metrics, plot_prediction
from utils.functions import split_into_pieces_inorder,split_into_pieces_random,create_dataset_random, distance, convert_lon_lat_to_adjacency_matrix 
from utils.data_loader import load_data, per_million, get_daily
from utils.data_splitter import split_on_region_dimension, split_on_time_dimension
from utils.smoothing_functions import O_LPF,NO_LPF,O_NDA,NO_NDA

# EXTRACTING DATA

In [None]:
# Not referenced anywhere in the part of the notebook visible here.
# NOTE(review): presumably selects daily rather than cumulative counts - confirm against later cells.
daily_data = True
# Dataset name handed to load_data() in the data-loading cell.
# Other values the author lists as options: "Texas", "USA", "Global".
DATASET = "Sri Lanka" # "Texas" "USA" "Global"
# DATASET = "Texas"

Required variables:

*   **region_names** - Names of the unique regions.
*   **confirmed_cases** - 2D array. Each row corresponds to the region at the same index in 'region_names'. Each column represents a day. Columns should be in ascending date order (starting day -> present).
*   **daily_cases** - confirmed_cases.diff()
*   **population** - Population of each region (same order as 'region_names')
*   **features** - Features of the regions. Each column is a certain feature.
*   **START_DATE** - Starting date of the data DD/MM/YYYY
*   **n_regions** - Number of regions



In [None]:
# Load the selected dataset from ../Datasets. The object returned by load_data
# is indexed by string keys; the entries read below are the variables described
# in the "Required variables" list.
d = load_data(DATASET,path="../Datasets")
region_names=d["region_names"] 
confirmed_cases=d["confirmed_cases"] 
daily_cases=d["daily_cases"] 
features=d["features"] 
START_DATE=d["START_DATE"] 
n_regions=d["n_regions"] 

# Per-region population, taken from the "Population" entry of `features`.
population = features["Population"]
# For every region, print the largest value in its confirmed_cases row as a
# percentage of that region's population, followed by the region name.
# NOTE(review): population[i] assumes the integers 0..len-1 are valid keys for
# this object (label vs. positional lookup if it is a pandas Series) - confirm.
for i in range(len(population)):
    print("{:.2f}%".format(confirmed_cases[i,:].max()/population[i]*100), region_names[i])

# Number of columns in confirmed_cases (one column per day, per the list above).
days = confirmed_cases.shape[1]

# One-line summary: total population in millions, region count, day count.
print(f"Total population {population.sum()/1e6:.2f}M, regions:{n_regions}, days:{days}")

# Addressing data imbalances

In [None]:
def get_count(segments, data):
    """Bin `data` into equal-width value segments and report who falls where.

    The values are shifted so that their minimum becomes 0, and the range
    ``[0, max]`` is cut into `segments` equal-width bins whose upper bounds are
    rounded to three decimals. Bin membership is:

    * first bin: everything at or below the first bound,
    * middle bins: ``(previous bound, own bound]``,
    * last bin: everything above the previous bound (open-ended, so the
      rounding of the final bound can never exclude the largest samples).

    With a single segment, that one bin receives every sample.

    Parameters
    ----------
    segments : int
        Number of bins to create.
    data : array_like
        Values to bin. Must be non-empty (``np.amin`` raises ``ValueError``
        on empty input). The caller's array is not modified.

    Returns
    -------
    count : numpy.ndarray of int
        Number of samples in each bin.
    bounds : numpy.ndarray of float64
        Rounded upper bound of each bin, expressed on the shifted
        (minimum-subtracted) scale.
    idx : list of numpy.ndarray
        For each bin, the flattened output of ``np.where`` on its membership
        mask (for 1-D `data` these are the positions of the samples).
    """
    # The shift and the peak do not depend on the bin, so compute them once.
    shifted = data - np.amin(data)
    peak = np.amax(shifted)

    bounds = []
    count = []
    idx = []
    last = segments - 1
    for i in range(segments):
        bounds.append(np.round((i+1)*peak/segments, 3))
        if last == 0:
            # Only one bin: it must hold everything. Comparing against the
            # rounded bound here would drop samples above the rounded maximum.
            ineq = np.ones(np.shape(shifted), dtype=bool)
        elif i == 0:
            ineq = shifted <= bounds[i]
        elif i == last:
            ineq = shifted > bounds[i-1]
        else:
            ineq = (shifted > bounds[i-1]) & (shifted <= bounds[i])
        count.append(np.sum(ineq))
        idx.append(np.reshape(np.array(np.where(ineq)), [-1,]))

    count = np.array(count).astype(int)
    bounds = np.array(bounds).astype(np.float64)
    return count, bounds, idx


In [None]:
input_size = 15   # number of consecutive columns (days) in one sliding window
train_end = 100   # columns [0, train_end) of every region are used below
plot_state = 1    # 1 -> draw the diagnostic plots, anything else -> skip them

# Scale every region's daily series by its own maximum.
dataset = np.copy(daily_cases)
# Allocate a floating-point buffer explicitly: np.zeros_like(dataset) would
# inherit an integer dtype from integer case counts and the normalised values
# would be truncated on assignment. Floating inputs keep their dtype.
dataset_norm = np.zeros(dataset.shape, dtype=np.result_type(dataset.dtype, np.float32))
for i in range(dataset.shape[0]):
    peak = np.amax(dataset[i,:])
    # A region whose maximum is 0 is left as all zeros instead of dividing by
    # zero and filling the row with NaN.
    if peak != 0:
        dataset_norm[i,:] = dataset[i,:]/peak

alldata_train = dataset_norm[:,0:train_end]

# Buffers for the sliding windows (region, window start, position in window)
# and for the mean of each window (region, window start).
samples_all = np.zeros([alldata_train.shape[0], alldata_train.shape[1]-input_size, input_size])
samples_mean = np.zeros([alldata_train.shape[0], alldata_train.shape[1]-input_size])

# evaluating optimal number of segments for each district
segment_array = [2,3,4,5,6,7,8,9,10]
segment_dist = []
if plot_state == 1:
    # One panel per region on a grid that is 5 panels wide. The number of rows
    # follows the number of regions, so more than 25 regions no longer makes
    # plt.subplot raise. Each panel keeps the 6 x 4 inch size of the original
    # 5 x 5 layout.
    n_plot_cols = 5
    n_plot_rows = max(1, int(np.ceil(samples_all.shape[0]/n_plot_cols)))
    plt.figure(figsize=(6*n_plot_cols,4*n_plot_rows))
for i in range(samples_all.shape[0]):
    # Fill the sliding windows of this region and the mean of each window.
    for k in range(samples_all.shape[1]):
        samples_all[i,k,:] = alldata_train[i,k:k+input_size]
        samples_mean[i,k] = np.mean(samples_all[i,k,:])
    all_counts = []
    count_score = []
    # evaluating the count score for each district
    for n in range(len(segment_array)):
        segments = segment_array[n]
        [count, bounds, idx] = get_count(segments, samples_mean[i,:])
        # Samples available if every segment is cut down to the smallest one.
        all_counts.append(np.amin(count)*len(count))
        # Score = that sample count weighted by the candidate's 1-based position
        # in segment_array.
        count_score.append((all_counts[n]**1)*(n+1))
    # Position of the best-scoring candidate (first one on ties).
    best_n = int(np.argmax(count_score))
    if plot_state == 1:
        plt.subplot(n_plot_rows,n_plot_cols,i+1)
        plt.plot(segment_array,all_counts/np.amax(all_counts),linewidth=2)
        plt.plot(segment_array,count_score/np.amax(count_score),linewidth=2)
        plt.legend(['normalised total counts','segment score'])
        plt.title('dist: '+region_names[i]+'  segments: '+str(segment_array[best_n])+'  samples: '+str(all_counts[best_n]))
    segment_dist.append(segment_array[best_n])
segment_dist = np.array(segment_dist).astype(int)
if plot_state == 1:
    plt.show()

print('segments per district= ', segment_dist)

# For every region, draw the same number of window indices from each value
# segment so that all segments are equally represented.
# NOTE: the draw uses the `random` module without a seed being set in this
# cell, so the selected indices differ between runs.
idx_rand_all = []
for i in range(samples_all.shape[0]):
    data = samples_mean[i,:]
    segments = segment_dist[i]
    [count_dist, bounds_dist, idx_dist] = get_count(segments, data)
    # Every segment is cut down to the size of the smallest one.
    n_per_seg = int(np.amin(count_dist))
    # Integer buffer: these are array positions, and the float64 default of
    # np.zeros would store them as floats that cannot be used for indexing.
    idx_rand = np.zeros([segments,n_per_seg], dtype=int)
    for k in range(segments):
        idx_temp = list(idx_dist[k])
        # Sampling without replacement within the segment.
        idx_rand[k,:] = random.sample(idx_temp,n_per_seg)
    idx_rand = np.reshape(idx_rand, [-1,])
    idx_rand_all.append(idx_rand)
print(len(idx_rand_all))

# undersampling using optimal number of segments
# NOTE(review): this loop repeats the per-region steps of the loop that builds
# idx_rand_all (same get_count call, same per-segment random.sample), so it
# draws a second, independent random sample. In the lines visible here that
# sample is only used to print its shape - confirm whether later code is meant
# to consume it or whether idx_rand_all should be reused instead.
for i in range(samples_all.shape[0]):
    data = samples_mean[i,:]
    segments = segment_dist[i]
    [count_dist, bounds_dist, idx_dist] = get_count(segments, data)
    # Size of the smallest segment; every segment contributes this many indices.
    n_per_seg = np.amin(count_dist)
    # Not used in the lines visible here.
    data_new = []
    # np.zeros defaults to float64, so the sampled indices are stored as floats.
    idx_rand = np.zeros([segments,n_per_seg])
    for k in range(segments):
        idx_temp = list(idx_dist[k])
#         print(idx_temp)
        # Draw n_per_seg distinct indices from segment k.
        idx_rand[k,:] = random.sample(idx_temp,n_per_seg)
    # Flatten (segments, n_per_seg) into one 1-D vector of indices.
    idx_rand = np.reshape(idx_rand, [-1,])
    print(region_names[i], idx_rand.shape)