In [1]:
#05 Second Order Regression
## This code takes the prediction forecast output from 03Autoregression and runs a second-order regression.
## In the first-order case (03Autoregression), disease and weather variables were the X values for predicting Y, the target variable.
## In the second-order case (this notebook), the prediction forecasts from the first order (i.e., Naive Forecast, Linear Regression, LASSO, Ridge, etc.)
## are the new X values for predicting Y, the target variable.

## During this second-order regression, we run the same forecast methods (e.g., Linear, LASSO, Ridge, etc.) a second time.
## In the same way that, in first-order regression, the forecast methods help us choose which disease/weather variables best predict the target variable,
## in second-order regression the forecast methods help us choose which forecast methods best predict the target variable.

## The code here is almost identical to 03Autoregression, but it takes the prediction outputs of 03Autoregression as its inputs.
## Additionally, the naive forecast method is omitted from the second order.


In [2]:
# import libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.metrics import mean_squared_error, mean_absolute_percentage_error
import itertools
import statsmodels.api as sm
from IPython.display import clear_output
import os
from joblib import Parallel, delayed

In [3]:
from epiweeks import Week, Year
from datetime import date
def create_epiweek(date):
    """Return the epiweek object for a calendar date.

    The argument is handed unchanged to ``Week.fromdate``.  Note that the
    parameter name shadows the ``date`` class imported at module level, so
    that class is not reachable inside this function.
    """
    epiweek = Week.fromdate(date)
    return epiweek
def create_epiweekplot(epiweek):
    """Build a plot label of the form ``Y<first 4 chars>W<rest>``.

    The argument is converted with ``str()``; its first four characters are
    placed after ``Y`` and everything that follows is placed after ``W``.
    """
    text = str(epiweek)
    year_part, week_part = text[:4], text[4:]
    return F'Y{year_part}W{week_part}'
def create_epiweek_fromstr(str):
    """Parse a textual epiweek by delegating to ``Week.fromstring``.

    The parameter name shadows the built-in ``str`` inside this function;
    it is kept because it is part of the existing signature.
    """
    parsed_week = Week.fromstring(str)
    return parsed_week

In [4]:
def create_X_and_y(dataset, target_var):
    """Split ``dataset`` into predictors and target.

    Works on a copy, so the caller's frame is left untouched.

    Returns
    -------
    tuple
        ``(X, y)`` where ``X`` is the frame without the ``target_var`` column
        and ``y`` is a one-column DataFrame holding only ``target_var``.
    """
    frame = dataset.copy()
    predictors = frame.drop(columns = target_var)
    # Double brackets keep the target as a DataFrame rather than a Series.
    target = frame[[target_var]]
    return predictors, target

In [5]:
def create_window(X, window_perc):
    """Return the first and last index labels of the initial training window.

    The start is the first label of ``X.index``; the end is the label found at
    position ``int(len(X) * window_perc)``.
    """
    first_label = X.index[0]
    end_position = int(len(X) * window_perc)
    last_label = X.index[end_position]
    return first_label, last_label
def create_output_dataset(y, window_end):
    """Return the rows of a copy of ``y`` from label ``window_end + 1`` onwards.

    ``.loc`` slicing is label-based, so the row labelled ``window_end + 1`` is
    the first row of the result.
    """
    duplicate = y.copy()
    first_output_label = window_end + 1
    return duplicate.loc[first_output_label:]

In [6]:
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge, RidgeCV, Lasso, LassoCV, ElasticNet, ElasticNetCV
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import TimeSeriesSplit, GridSearchCV

def regression_without_naive(X_dataset, y_dataset, window_start, window_end, y_pred, y_params, test_length):
    """Fit the second-order models over an expanding training window.

    On every step the models are fitted on the rows labelled
    ``window_start`` .. ``window_end`` (inclusive), the single row labelled
    ``window_end + 1`` is predicted, and the window end is then advanced by
    one.  The loop stops once ``window_end`` equals the last label of
    ``X_dataset.index``.  No naive forecast is produced in the second order.

    Parameters
    ----------
    X_dataset : pandas.DataFrame
        Predictor columns.  Its index labels must support ``label + 1`` and
        ``label.weektuple()`` (both are used below).
    y_dataset : pandas.DataFrame
        Target values on the same index as ``X_dataset``.
    window_start, window_end : index labels
        Bounds of the initial training window.
    y_pred : pandas.DataFrame
        Mutated in place.  One value per predicted row is written to the
        columns ``linreg_2``, ``ridge_2``, ``lasso_2``, ``elasticnet_2``,
        ``randomforest_2``, ``gradientboost_2`` and ``knn_2``.
    y_params : pandas.DataFrame
        Mutated in place.  Receives the selected hyper-parameters in
        ``ridge_2_alpha``, ``lasso_2_alpha``, ``elasticnet_2_alpha``,
        ``elasticnet_2_l1ratio`` and ``knn_2_n``.
    test_length : int
        Only used in the progress message.

    Returns
    -------
    None
    """
    count = 0
    df_end = X_dataset.index[-1]

    # Nothing below modifies the inputs, so a single defensive copy is enough
    # (previously both frames were copied again on every step).
    X = X_dataset.copy()
    y = y_dataset.copy()

    # Loop-invariant configuration.
    tscv = TimeSeriesSplit(n_splits = 5)
    knn_parameters = {'n_neighbors': range(1, 10), 'weights': ['uniform', 'distance']}
    # Same iteration cap for the CV search and for the final refit, so the
    # refit cannot stop earlier than the search that selected its parameters.
    max_iter = 100000
    # Print a progress line whenever the week number is a multiple of this.
    tracking_interval = 5

    while window_end != df_end:
        next_week = window_end + 1

        # Note: .loc is end-inclusive
        X_train = X.loc[window_start:window_end]
        # values.ravel() converts y_train to a 1-D numpy array for the models
        y_train = y.loc[window_start:window_end].values.ravel()
        # double square brackets so X_test is a one-row DataFrame, not a Series
        X_test = X.loc[[next_week]]

        ## Scaling
        # Fit the scaler on the training window only, then apply the stored
        # parameters to the test row so no test information leaks into training.
        scaler = StandardScaler()
        X_train = scaler.fit_transform(X_train)
        X_test = scaler.transform(X_test)

        # Every predict() call below returns a length-1 array because X_test
        # has exactly one row; [0] extracts the scalar so that a plain number
        # (not an array) is written into the output cell.

        ## Linear Regression Model
        linreg_model = LinearRegression()
        linreg_model.fit(X_train, y_train)
        y_pred.at[next_week, 'linreg_2'] = linreg_model.predict(X_test)[0]

        ## Ridge model: choose alpha by time-series CV, then refit with it
        ridge_cv = RidgeCV(cv = tscv)
        ridge_cv.fit(X_train, y_train)

        ridge_model = Ridge(alpha = ridge_cv.alpha_)
        ridge_model.fit(X_train, y_train)
        y_pred.at[next_week, 'ridge_2'] = ridge_model.predict(X_test)[0]
        y_params.at[next_week, 'ridge_2_alpha'] = ridge_cv.alpha_

        ## Lasso Model
        lasso_cv = LassoCV(cv = tscv, random_state = 18, max_iter = max_iter)
        lasso_cv.fit(X_train, y_train)

        # Create the Lasso model with the optimal alpha value
        lasso_model = Lasso(alpha = lasso_cv.alpha_, max_iter = max_iter)
        lasso_model.fit(X_train, y_train)
        y_pred.at[next_week, 'lasso_2'] = lasso_model.predict(X_test)[0]
        y_params.at[next_week, 'lasso_2_alpha'] = lasso_cv.alpha_

        ## ElasticNet Model
        elasticnet_cv = ElasticNetCV(cv = tscv, max_iter = max_iter)
        elasticnet_cv.fit(X_train, y_train)

        # Create the ElasticNet model with the optimal l1 and alpha values
        elasticnet_model = ElasticNet(alpha = elasticnet_cv.alpha_,
                                      l1_ratio = elasticnet_cv.l1_ratio_,
                                      max_iter = max_iter)
        elasticnet_model.fit(X_train, y_train)
        y_pred.at[next_week, 'elasticnet_2'] = elasticnet_model.predict(X_test)[0]
        y_params.at[next_week, 'elasticnet_2_alpha'] = elasticnet_cv.alpha_
        y_params.at[next_week, 'elasticnet_2_l1ratio'] = elasticnet_cv.l1_ratio_

        ## Random Forest
        randomforest_model = RandomForestRegressor(n_estimators = 1000, max_features = 'sqrt', random_state = 18)
        randomforest_model.fit(X_train, y_train)
        y_pred.at[next_week, 'randomforest_2'] = randomforest_model.predict(X_test)[0]

        ## Gradient Boost
        gradientboost_model = GradientBoostingRegressor(n_estimators = 1000, random_state = 18)
        gradientboost_model.fit(X_train, y_train)
        y_pred.at[next_week, 'gradientboost_2'] = gradientboost_model.predict(X_test)[0]

        ## KNN: grid-search neighbours and weighting with the same CV splitter
        knn_cv = GridSearchCV(KNeighborsRegressor(), knn_parameters, cv = tscv)
        knn_cv.fit(X_train, y_train)
        y_pred.at[next_week, 'knn_2'] = knn_cv.predict(X_test)[0]
        # NOTE(review): this stores the whole fitted estimator object, not just
        # the neighbour count the column name suggests.  Left unchanged because
        # it determines what ends up in the saved params file; consider storing
        # knn_cv.best_params_ values instead.
        y_params.at[next_week, 'knn_2_n'] = knn_cv.best_estimator_

        # keep track of model progress, every number of weeks
        if window_end.weektuple()[1] % tracking_interval == 0:
            print(F'done with {next_week}; {count} out of {test_length}')

        ## Implement expanding window
        # (a rolling window would also advance window_start here)
        window_end += 1
        count += 1

    print(F'The last epiweek to be predicted is: {window_end}')
    print(F'The total number of predicted epiweeks is: {count}')

In [7]:
def second_order_regression(dataset, target_var, window_perc):
    """Run the second-order regression for one dataset.

    Splits ``dataset`` into predictors and target, derives the initial
    training window from ``window_perc``, prepares the two output frames and
    runs ``regression_without_naive`` over the remaining rows.  No naive
    forecast and no lagging are applied in the second order.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame containing the target column and the predictor columns.
    target_var : str
        Name of the target column.
    window_perc : float
        Fraction of the rows that determines where the initial training
        window ends (passed to ``create_window``).

    Returns
    -------
    tuple
        ``(y_pred, y_params)``.  Both start out as the target rows after the
        initial window (so both still contain the target column) and are
        filled in place by ``regression_without_naive``.
    """
    X, y = create_X_and_y(dataset, target_var)
    # DataFrame.info() prints its summary itself and returns None, so wrapping
    # it in print() only appended a stray "None" line.
    X.info()
    y.info()

    window_start, window_end = create_window(X, window_perc)
    print(F'The first epiweek to be predicted is: {window_end+1}')

    y_pred = create_output_dataset(y, window_end)
    y_params = create_output_dataset(y, window_end)

    train_length = len(X.loc[window_start:window_end])
    print(F'The initial training dataset length is: {train_length}')
    test_length = len(X.loc[window_end+1:])
    print(F'The initial testing dataset length is: {test_length}')

    regression_without_naive(X, y, window_start, window_end, y_pred, y_params, test_length)
    # Drop the progress messages printed above once the run has finished.
    clear_output(wait=False)
    return y_pred, y_params

In [8]:
def run_second_order_regression(target_var, pred_directory):
    """Run the second-order regression on every prediction file of one target.

    Every regular file inside ``<target_var>/<pred_directory>`` is read as a
    CSV with an ``epiweek`` column, which becomes the index after being
    parsed with ``create_epiweek_fromstr``.  The resulting predictions are
    written to ``<target_var>/pred_2/<filename>`` and the selected
    hyper-parameters to ``<target_var>/params_2/<filename>``.

    Parameters
    ----------
    target_var : str
        Name of the target column; also the name of the directory that holds
        the input and output folders.
    pred_directory : str
        Sub-directory of ``target_var`` containing the first-order prediction
        files.

    Returns
    -------
    None
    """
    directory = os.path.join(target_var, pred_directory)
    pred_2_path = os.path.join(target_var, 'pred_2')
    params_2_path = os.path.join(target_var, 'params_2')

    for filename in os.listdir(directory):
        pred_file = os.path.join(directory, filename)
        # Only regular files are processed; sub-directories are skipped.
        if not os.path.isfile(pred_file):
            continue
        print(pred_file)

        # NOTE(review): parse_dates=[0] targets the first CSV column; confirm
        # that it does not interfere with the 'epiweek' parsing below.
        y_pred = pd.read_csv(pred_file, parse_dates = [0], dayfirst = True)
        y_pred['epiweek'] = y_pred['epiweek'].apply(create_epiweek_fromstr)
        y_pred = y_pred.set_index('epiweek')
        y_pred_2, y_params_2 = second_order_regression(y_pred, target_var, 0.7)

        # exist_ok=True replaces the separate "exists?" check, which left a
        # gap between checking for and creating the directory.
        os.makedirs(pred_2_path, exist_ok = True)
        y_pred_2.to_csv(os.path.join(pred_2_path, filename))

        os.makedirs(params_2_path, exist_ok = True)
        y_params_2.to_csv(os.path.join(params_2_path, filename))

        print('done')


In [9]:
def full_second_order_regression(target_variables_file, pred_directory):
    """Run the second-order regression for every listed target, in parallel.

    Parameters
    ----------
    target_variables_file : str
        Path to a text file with one target variable name per line.
    pred_directory : str
        Name of the sub-directory (inside each target's folder) holding the
        first-order prediction files.

    Returns
    -------
    None
    """
    target_variables = []
    with open(target_variables_file, 'r') as file:
        for line in file:
            # Strip only the line terminator.  Slicing off the last character
            # unconditionally would truncate the final name when the file
            # does not end with a newline.
            target_variable = line.rstrip('\r\n')
            # Blank lines would otherwise become an empty target name.
            if target_variable:
                target_variables.append(target_variable)
    print(target_variables)

    # One task per target variable; n_jobs=-2 leaves one CPU free.
    Parallel(n_jobs=-2, verbose=51)(delayed(run_second_order_regression)(target_var, pred_directory) for target_var in target_variables)
    
# Run the second-order regression for every target variable listed in
# 'target_variables.txt'; each target's input files are read from its 'pred'
# sub-directory.
full_second_order_regression('target_variables.txt', 'pred')

['Cardiovascular disease', 'Chronic respiratory disease', 'Factors influencing health status and contact with health services', 'Digestive disease', 'Endocrine disorders', 'Malignant neoplasms', 'Diabetes mellitus', 'Genitourinary disorders', 'Musculoskeletal disease', 'Infectious and Parasitic Diseases', 'Ill-defined diseases', 'Neurological and sense disorders', 'Oral Diseases', 'Other neoplasms', 'Respiratory Infection', 'Skin diseases']
[Parallel(n_jobs=-2)]: Using backend LokyBackend with 11 concurrent workers.
[Parallel(n_jobs=-2)]: Done   1 tasks      | elapsed:  7.3min
[Parallel(n_jobs=-2)]: Done   2 out of  16 | elapsed:  7.3min remaining: 51.2min
[Parallel(n_jobs=-2)]: Done   3 out of  16 | elapsed:  7.3min remaining: 31.8min
[Parallel(n_jobs=-2)]: Done   4 out of  16 | elapsed:  7.3min remaining: 22.0min
[Parallel(n_jobs=-2)]: Done   5 out of  16 | elapsed:  7.4min remaining: 16.2min
[Parallel(n_jobs=-2)]: Done   6 out of  16 | elapsed:  7.4min remaining: 12.3min
[Parallel(n