In [None]:
## 08 Density Forecasting
## This notebook computes density forecasts (scored with CRPS) from the saved error values produced by 03Autoregression

In [1]:

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.metrics import mean_squared_error, mean_absolute_percentage_error, mean_absolute_error
import itertools
import statsmodels.api as sm
import os
from joblib import Parallel, delayed
import properscoring as ps
from scipy.stats import norm

In [2]:
from epiweeks import Week, Year
from datetime import date
def create_epiweek(date):
    """Return the ``epiweeks.Week`` that ``Week.fromdate`` builds for ``date``."""
    # NOTE: the parameter name shadows the imported ``datetime.date`` inside
    # this function; it is kept because it is part of the call signature.
    week = Week.fromdate(date)
    return week
def create_epiweekplot(epiweek):
    """Format an epiweek as a ``Y<first four chars>W<remaining chars>`` label.

    ``epiweek`` is converted with ``str`` first, so any object whose string
    form starts with the four-character year works.
    """
    label = str(epiweek)
    year_part, week_part = label[:4], label[4:]
    return F'Y{year_part}W{week_part}'
def filename_to_epiweek(filename):
    """Build a ``Week`` from the first six characters of ``filename``.

    Characters 0-3 are used as the year and characters 4-5 as the week number;
    anything after that (for example the extension) is ignored.
    """
    year_digits = filename[:4]
    week_digits = filename[4:6]
    return Week.fromstring(F'{year_digits}W{week_digits}')
def create_epiweek_fromstr(str):
    """Parse ``str`` into a ``Week`` with ``Week.fromstring``."""
    # NOTE: the parameter name shadows the builtin ``str`` inside this
    # function; it is kept because it is part of the call signature.
    week = Week.fromstring(str)
    return week
def create_epiweek_fromint(int):
    """Convert ``int`` to text and parse that text into a ``Week``."""
    # NOTE: the parameter name shadows the builtin ``int`` inside this
    # function; it is kept because it is part of the call signature.
    week_text = str(int)
    return Week.fromstring(week_text)

In [3]:
def knn_dof(target_var, step_name, epiweek):
    """Return the neighbour count stored for ``epiweek`` in the saved kNN parameters.

    Reads ``<target_var>/params/<step_name>.csv``, indexes it by epiweek and
    looks up the ``knn_n`` cell. For a text cell, every decimal digit found in
    it is concatenated and converted to ``int``; a cell without digits gives 0.

    Parameters
    ----------
    target_var : str
        Directory name of the target variable.
    step_name : str
        Name of the step file (without ``.csv``) inside ``params``.
    epiweek : Week
        Index label to look up.

    Returns
    -------
    int
        The parsed neighbour count, or 0 when none can be extracted.

    Raises
    ------
    KeyError
        If ``epiweek`` is not present in the parameter file.
    """
    knn = pd.read_csv(F'{target_var}/params/{step_name}.csv', parse_dates = [0], dayfirst = True)

    knn['epiweek'] = knn['epiweek'].apply(create_epiweek_fromstr)
    knn = knn.set_index('epiweek')

    # Named ``raw_value`` rather than ``str`` so the builtin is not shadowed.
    raw_value = knn.loc[epiweek, 'knn_n']

    # pandas hands back a number (or NaN) when the cell is not text; iterating
    # over it would raise TypeError, so handle that case explicitly.
    if isinstance(raw_value, (int, float, np.integer, np.floating)):
        return 0 if pd.isna(raw_value) else int(raw_value)

    digits = ''.join(ch for ch in raw_value if ch.isdigit())
    return int(digits) if digits else 0

#knn = knn_dof('Cardiovascular disease', 'L8_S1', Week(2016, 4))
#knn

In [4]:
def lasso_dof(target_var, step_name, epiweek):
    """Look up the ``lasso_edf`` value saved for ``epiweek``.

    The value is read from ``<target_var>/lasso_param/<step_name>.csv`` after
    the ``epiweek`` column has been converted to ``Week`` objects and made the
    index. A missing ``epiweek`` raises ``KeyError``.
    """
    param_path = F'{target_var}/lasso_param/{step_name}.csv'
    params = pd.read_csv(param_path, parse_dates=[0], dayfirst=True)

    weeks = params['epiweek'].apply(create_epiweek_fromstr)
    params_by_week = params.assign(epiweek=weeks).set_index('epiweek')

    return params_by_week.loc[epiweek, 'lasso_edf']

In [5]:
def ridge_dof(target_var, step_name, epiweek):
    """Look up the ``ridge_edf`` value saved for ``epiweek``.

    The value is read from ``<target_var>/ridge_param/<step_name>.csv`` after
    the ``epiweek`` column has been converted to ``Week`` objects and made the
    index. A missing ``epiweek`` raises ``KeyError``.
    """
    param_path = F'{target_var}/ridge_param/{step_name}.csv'
    params = pd.read_csv(param_path, parse_dates=[0], dayfirst=True)

    weeks = params['epiweek'].apply(create_epiweek_fromstr)
    params_by_week = params.assign(epiweek=weeks).set_index('epiweek')

    return params_by_week.loc[epiweek, 'ridge_edf']

In [6]:
## This function returns the degrees of freedom based on which forecast method is currently being calculated
## See the additional helper functions above. 
def dof(model_name, epiweek, target_var, step_name):
    """Return the degrees of freedom used for ``model_name`` in the variance estimate.

    Parameters
    ----------
    model_name : str
        One of ``'gradientboost'``, ``'randomforest'``, ``'naive'``, ``'knn'``,
        ``'lasso'``, ``'ridge'``, ``'elasticnet'`` or ``'linreg'``.
    epiweek : Week
        Week whose saved parameters are looked up (knn, lasso, ridge, elasticnet).
    target_var : str
        Directory name of the target variable.
    step_name : str
        Name of the step whose parameter file is read.

    Returns
    -------
    int or float
        0 for gradientboost, randomforest and naive; 223 for linreg; otherwise
        the value returned by the matching ``*_dof`` helper.

    Raises
    ------
    ValueError
        If ``model_name`` is not recognised. Previously the function fell
        through and returned ``None``, which only failed later with an
        unrelated-looking ``TypeError`` in the variance arithmetic.
    """
    if model_name in ('gradientboost', 'randomforest', 'naive'):
        return 0

    if model_name == 'knn':
        return knn_dof(target_var, step_name, epiweek)

    if model_name == 'lasso':
        # number of non-zero features, i.e. remaining columns used
        # (extracted from the regression coefficients and saved as lasso_edf)
        return lasso_dof(target_var, step_name, epiweek)

    if model_name in ('ridge', 'elasticnet'):
        # ridge edf from the singular value decomposition; elasticnet reuses
        # the ridge value
        return ridge_dof(target_var, step_name, epiweek)

    if model_name == 'linreg':
        # should be 223 (224 -- total number of columns in xtrain,
        # i.e. 28 variables x 8 steps - 1 for ytrain)
        return 223

    raise ValueError(F'Unknown model name for degrees of freedom: {model_name!r}')

In [7]:
def variance(y_train_errors, model_name, epiweek, target_var, step_name):
    """Estimate the error variance as sum of squared errors over residual degrees of freedom.

    Parameters
    ----------
    y_train_errors : array-like supporting ``** 2`` and ``len``
        Training errors of one model (callers in this file pass a pandas Series).
    model_name, epiweek, target_var, step_name
        Forwarded to ``dof`` to obtain the model's degrees of freedom.

    Returns
    -------
    float
        ``sum(errors**2) / (len(errors) - dof)``, or NaN when the denominator
        is zero or negative. In that case the estimate is undefined: the plain
        division would raise ``ZeroDivisionError`` or yield inf or a negative
        "variance".
    """
    residual_dof = len(y_train_errors) - dof(model_name, epiweek, target_var, step_name)
    if residual_dof <= 0:
        return float('nan')
    return sum(y_train_errors**2) / residual_dof

In [8]:
def find_model_variance(target_var, step_name, model_name, model_directory):
    """Collect one variance value per error file found in ``model_directory``.

    Every regular file whose name ends in ``csv`` is read; its second column
    is passed to ``variance`` and the result is stored under the epiweek
    derived from the file name. The returned DataFrame has a single column
    named ``model_name`` and is sorted by its epiweek index. It is empty when
    no matching file exists.
    """
    variance_by_week = pd.DataFrame()

    for entry_name in os.listdir(model_directory):
        entry_path = os.path.join(model_directory, entry_name)
        # Skip sub-directories and anything that is not a csv file.
        if not (os.path.isfile(entry_path) and entry_name.endswith('csv')):
            continue

        errors_table = pd.read_csv(entry_path)
        week = filename_to_epiweek(entry_name)
        variance_by_week.at[week, model_name] = variance(
            errors_table.iloc[:, 1], model_name, week, target_var, step_name
        )

    return variance_by_week.sort_index()
    
#test_df = find_model_variance('Cardiovascular disease', 'L8_S1', 'knn','Cardiovascular disease/errors/L8_S1/knn')
#test_df
    

In [9]:
def run_variance(target_var, errors_directory, variance_directory):
    """Build and save one variance table per step for a single target variable.

    Every sub-directory of ``<target_var>/<errors_directory>`` is treated as a
    step and every sub-directory of a step as a model. The per-model frames
    returned by ``find_model_variance`` are joined side by side and written to
    ``<target_var>/<variance_directory>/<step_name>.csv``.

    Parameters
    ----------
    target_var : str
        Directory of the target variable.
    errors_directory : str
        Name of the sub-directory holding the saved errors.
    variance_directory : str
        Name of the sub-directory the variance tables are written to
        (created if missing).
    """
    top_directory = os.path.join(target_var, errors_directory)

    variance_path = os.path.join(target_var, variance_directory)
    os.makedirs(variance_path, exist_ok=True)

    for step_name in os.listdir(top_directory):
        step_directory = os.path.join(top_directory, step_name)
        # Only directories are steps. Stray files are skipped entirely so that
        # no empty "<file name>.csv" is written for them.
        if not os.path.isdir(step_directory):
            continue
        print(step_directory)

        model_variance_df = pd.DataFrame()
        for model_name in os.listdir(step_directory):
            model_directory = os.path.join(step_directory, model_name)
            if os.path.isdir(model_directory):
                print(F'{step_name}: {model_name}')
                model_variance_df = pd.concat(
                    [model_variance_df,
                     find_model_variance(target_var, step_name, model_name, model_directory)],
                    axis=1,
                )
        model_variance_df.to_csv(os.path.join(variance_path, F'{step_name}.csv'))
        

In [10]:
## This function runs the variance calculation for every target variable; the saved variances are used by the density forecasting further below

def run_full_variance(target_variables_file, errors_directory, variance_directory):
    """Run ``run_variance`` in parallel for every target variable listed in a file.

    Parameters
    ----------
    target_variables_file : str
        Text file with one target variable (directory name) per line.
    errors_directory : str
        Name of the errors sub-directory inside each target directory.
    variance_directory : str
        Name of the output sub-directory inside each target directory.
    """
    with open(target_variables_file, 'r') as file:
        # Remove only the line terminator. Slicing off the last character
        # would truncate the final name when the file does not end with a
        # newline. Blank lines are ignored.
        stripped_lines = (line.rstrip('\r\n') for line in file)
        target_variables = [name for name in stripped_lines if name.strip()]
    print(target_variables)
    Parallel(n_jobs=-2, verbose=51)(
        delayed(run_variance)(target_var, errors_directory, variance_directory)
        for target_var in target_variables
    )
# Compute and save the variance tables for every target variable listed in
# target_variables.txt (reads each target's 'errors' directory, writes 'variance').
run_full_variance('target_variables.txt', 'errors', 'variance')

['Cardiovascular disease', 'Chronic respiratory disease', 'Factors influencing health status and contact with health services', 'Digestive disease', 'Endocrine disorders', 'Malignant neoplasms', 'Diabetes mellitus', 'Genitourinary disorders', 'Musculoskeletal disease', 'Infectious and Parasitic Diseases', 'Ill-defined diseases', 'Neurological and sense disorders', 'Oral Diseases', 'Other neoplasms', 'Respiratory Infection', 'Skin diseases']
[Parallel(n_jobs=-2)]: Using backend LokyBackend with 11 concurrent workers.
[Parallel(n_jobs=-2)]: Done   1 tasks      | elapsed:   28.9s
[Parallel(n_jobs=-2)]: Done   2 out of  16 | elapsed:   29.0s remaining:  3.4min
[Parallel(n_jobs=-2)]: Done   3 out of  16 | elapsed:   29.2s remaining:  2.1min
[Parallel(n_jobs=-2)]: Done   4 out of  16 | elapsed:   29.3s remaining:  1.5min
[Parallel(n_jobs=-2)]: Done   5 out of  16 | elapsed:   29.4s remaining:  1.1min
[Parallel(n_jobs=-2)]: Done   6 out of  16 | elapsed:   29.4s remaining:   49.0s
[Parallel(n

In [16]:
def crps_abs(y_val, y_pred, y_variance, model, target_var):
    """Compute the Gaussian CRPS of one model for every row of ``y_val``.

    For each index label of ``y_val`` the observation ``y_val[target_var]`` is
    scored against a normal forecast with mean ``y_pred[model]`` and standard
    deviation ``sqrt(y_variance[model])`` using ``ps.crps_gaussian``.

    Returns a DataFrame indexed like ``y_val`` with a single column ``model``
    (an empty DataFrame when ``y_val`` has no rows).
    """
    scores = pd.DataFrame()

    for week in y_val.index:
        observed = y_val.loc[week, target_var]
        forecast_mean = y_pred.loc[week, model]
        forecast_sd = y_variance.loc[week, model] ** 0.5
        scores.at[week, model] = ps.crps_gaussian(observed, mu=forecast_mean, sig=forecast_sd)

    return scores
    
def crps_rel(y_val, y_pred, y_variance, model, target_var):
    """Compute the Gaussian CRPS of one model relative to the observed value.

    Same scoring as the absolute CRPS (normal forecast with mean
    ``y_pred[model]`` and standard deviation ``sqrt(y_variance[model])``), but
    each score is divided by the observation ``y_val[target_var]`` of that row.

    Returns a DataFrame indexed like ``y_val`` with a single column ``model``
    (an empty DataFrame when ``y_val`` has no rows).
    """
    scores = pd.DataFrame()

    for week in y_val.index:
        observed = y_val.loc[week, target_var]
        forecast_mean = y_pred.loc[week, model]
        forecast_sd = y_variance.loc[week, model] ** 0.5
        absolute_score = ps.crps_gaussian(observed, mu=forecast_mean, sig=forecast_sd)
        # NOTE(review): an observed value of 0 makes this ratio inf/NaN -- confirm
        # whether zero observations can occur in the data.
        scores.at[week, model] = absolute_score / y_val.loc[week, target_var]

    return scores

In [19]:
def disease_crps(target_var, pred_directory, variance_directory, density_forecast_directory):
    """Score the density forecasts of one target variable and save mean CRPS per model.

    For every file that exists in both ``<target_var>/<pred_directory>`` and
    ``<target_var>/<variance_directory>`` the absolute and relative Gaussian
    CRPS are computed per epiweek for each model, averaged over all epiweeks,
    and written to ``<target_var>/<density_forecast_directory>/<same file name>``
    with one row per model and the columns ``DENSITY_FORECAST_ABS`` and
    ``DENSITY_FORECAST_REL``.

    Parameters
    ----------
    target_var : str
        Directory of the target variable; also the name of the observation
        column in the prediction files.
    pred_directory : str
        Sub-directory with the prediction files.
    variance_directory : str
        Sub-directory with the variance files.
    density_forecast_directory : str
        Output sub-directory (created if missing).
    """
    pred_directory_path = os.path.join(target_var, pred_directory)
    variance_directory_path = os.path.join(target_var, variance_directory)
    density_forecast_directory_path = os.path.join(target_var, density_forecast_directory)
    os.makedirs(density_forecast_directory_path, exist_ok=True)

    # Row order of the output table. Models that are absent from a file are
    # skipped instead of raising KeyError.
    model_order = ['naive', 'linreg', 'ridge', 'lasso', 'elasticnet', 'randomforest', 'gradientboost', 'knn']

    for filename in os.listdir(pred_directory_path):
        pred_file = os.path.join(pred_directory_path, filename)
        variance_file = os.path.join(variance_directory_path, filename)
        # A step is scored only when both its predictions and variances exist.
        if not (os.path.isfile(pred_file) and os.path.isfile(variance_file)):
            continue

        y_pred = pd.read_csv(pred_file, parse_dates = [0], dayfirst = True)
        y_pred['epiweek'] = y_pred['epiweek'].apply(create_epiweek_fromstr)
        y_pred = y_pred.set_index('epiweek')

        # NOTE(review): the variance index is expected to survive read_csv as
        # something whose str() is a valid epiweek string -- confirm that
        # parse_dates does not convert it on the pandas version in use.
        y_variance = pd.read_csv(variance_file, parse_dates = [0], dayfirst = True, index_col = 0)
        y_variance['epiweek'] = y_variance.index
        y_variance['epiweek'] = y_variance['epiweek'].apply(create_epiweek_fromint)
        y_variance = y_variance.set_index('epiweek')

        models = [m for m in model_order if m in y_variance.columns and m in y_pred.columns]

        # Index-only frames; one CRPS column per model is appended below. The
        # columns already carry the model names, so no relabelling is needed.
        abs_density_forecast_df = pd.DataFrame(index=y_pred.index)
        rel_density_forecast_df = pd.DataFrame(index=y_pred.index)

        for model in models:
            abs_crps_col = crps_abs(y_pred[[target_var]], y_pred[[model]], y_variance[[model]], model, target_var)
            abs_density_forecast_df = pd.concat([abs_density_forecast_df, abs_crps_col], axis=1)

            rel_crps_col = crps_rel(y_pred[[target_var]], y_pred[[model]], y_variance[[model]], model, target_var)
            rel_density_forecast_df = pd.concat([rel_density_forecast_df, rel_crps_col], axis=1)

        density_forecast_output = pd.DataFrame()
        for col in abs_density_forecast_df.columns:
            density_forecast_output.at[col, 'DENSITY_FORECAST_ABS'] = abs_density_forecast_df[col].mean()
            density_forecast_output.at[col, 'DENSITY_FORECAST_REL'] = rel_density_forecast_df[col].mean()
        density_forecast_output.to_csv(os.path.join(density_forecast_directory_path, filename))
            
#disease_crps('Cardiovascular disease', 'pred', 'variance', 'density_forecast')

In [20]:
## This function calculates the density forecast based on the output prediction forecast and calculated variance

def run_full_crps(target_variables_file, pred_directory, variance_directory, density_forecast_directory):
    """Run ``disease_crps`` in parallel for every target variable listed in a file.

    Parameters
    ----------
    target_variables_file : str
        Text file with one target variable (directory name) per line.
    pred_directory : str
        Name of the prediction sub-directory inside each target directory.
    variance_directory : str
        Name of the variance sub-directory inside each target directory.
    density_forecast_directory : str
        Name of the output sub-directory inside each target directory.
    """
    with open(target_variables_file, 'r') as file:
        # Remove only the line terminator. Slicing off the last character
        # would truncate the final name when the file does not end with a
        # newline. Blank lines are ignored.
        stripped_lines = (line.rstrip('\r\n') for line in file)
        target_variables = [name for name in stripped_lines if name.strip()]
    print(target_variables)
    Parallel(n_jobs=-2, verbose=51)(
        delayed(disease_crps)(target_var, pred_directory, variance_directory, density_forecast_directory)
        for target_var in target_variables
    )
    
# Score every target variable listed in target_variables.txt: reads each
# target's 'pred' and 'variance' directories and writes 'density_forecast'.
run_full_crps('target_variables.txt', 'pred', 'variance', 'density_forecast')

['Cardiovascular disease', 'Chronic respiratory disease', 'Factors influencing health status and contact with health services', 'Digestive disease', 'Endocrine disorders', 'Malignant neoplasms', 'Diabetes mellitus', 'Genitourinary disorders', 'Musculoskeletal disease', 'Infectious and Parasitic Diseases', 'Ill-defined diseases', 'Neurological and sense disorders', 'Oral Diseases', 'Other neoplasms', 'Respiratory Infection', 'Skin diseases']
[Parallel(n_jobs=-2)]: Using backend LokyBackend with 11 concurrent workers.
[Parallel(n_jobs=-2)]: Done   1 tasks      | elapsed:    8.1s
[Parallel(n_jobs=-2)]: Done   2 out of  16 | elapsed:    8.1s remaining:   56.8s
[Parallel(n_jobs=-2)]: Done   3 out of  16 | elapsed:    8.1s remaining:   35.2s
[Parallel(n_jobs=-2)]: Done   4 out of  16 | elapsed:    8.1s remaining:   24.3s
[Parallel(n_jobs=-2)]: Done   5 out of  16 | elapsed:    8.1s remaining:   17.9s
[Parallel(n_jobs=-2)]: Done   6 out of  16 | elapsed:    8.1s remaining:   13.6s
[Parallel(n