In [None]:
def normalize_Xy_splits(splits, normalization_method='minmax', train_test_only=False, feature_indices=None):
    """Rescale the feature tensors of a train/(validation)/test split.

    All statistics come from the training frames. Each training frame is
    scaled by its own statistics; the validation and test frames are scaled
    by the statistics of the most recent (last) training frame only.

    Parameters
    ----------
    splits : tuple
        ``(X_train, y_train, X_test, y_test)`` when ``train_test_only`` is
        true, otherwise ``(X_train, y_train, X_validate, y_validate, X_test,
        y_test)``. The X arrays are indexed as 4-D arrays; given how
        ``create_Xy`` builds them the axes are presumably
        (frame, country, time step, feature).
    normalization_method : str
        ``'minmax'`` rescales with the min/max taken over axis 2; any other
        value standardizes with the mean/std taken over axes 1 and 2.
    train_test_only : bool
        Selects which of the two ``splits`` layouts is expected and returned.
    feature_indices : object, optional
        Unused; kept so existing callers do not break.

    Returns
    -------
    tuple
        Same layout as ``splits`` with every X array replaced by its scaled
        version. The y arrays are passed through untouched.
    """
    if train_test_only:
        X_train, y_train, X_test, y_test = splits
    else:
        X_train, y_train, X_validate, y_validate, X_test, y_test = splits

    if normalization_method == 'minmax':
        # One min/max per (frame, country, feature).
        X_min = X_train.min(axis=2)
        X_max = X_train.max(axis=2)
        X_train_scaled = minmax(X_train,
                                X_min[:, :, np.newaxis, :],
                                X_max[:, :, np.newaxis, :])
        # Held-out frames only see the most recent training frame's range.
        last_min = X_min[-1][np.newaxis, :, np.newaxis, :]
        last_max = X_max[-1][np.newaxis, :, np.newaxis, :]

        def scale_held_out(X):
            return minmax(X, last_min, last_max)
    else:
        # One mean/std per (frame, feature).
        X_mean = X_train.mean(axis=(1, 2))
        X_std = X_train.std(axis=(1, 2))
        X_train_scaled = normal(X_train,
                                X_mean[:, np.newaxis, np.newaxis, :],
                                X_std[:, np.newaxis, np.newaxis, :])
        last_mean = X_mean[-1, :][np.newaxis, np.newaxis, np.newaxis, :]
        last_std = X_std[-1, :][np.newaxis, np.newaxis, np.newaxis, :]

        def scale_held_out(X):
            return normal(X, last_mean, last_std)

    X_test_scaled = scale_held_out(X_test)
    if train_test_only:
        return (X_train_scaled, y_train, X_test_scaled, y_test)

    # The validation split must be scaled from X_validate; scaling X_test
    # here would silently hand the test frames back as validation data.
    X_validate_scaled = scale_held_out(X_validate)
    return (X_train_scaled, y_train, X_validate_scaled, y_validate, X_test_scaled, y_test)



def normalize_Xy_splits(splits, normalization_method='minmax', train_test_only=False, feature_indices=None):
    """Rescale the feature tensors of a train/(validation)/test split.

    All statistics come from the training frames. Each training frame is
    scaled by its own statistics; the validation and test frames are scaled
    by the statistics of the most recent (last) training frame only.

    Parameters
    ----------
    splits : tuple
        ``(X_train, y_train, X_test, y_test)`` when ``train_test_only`` is
        true, otherwise ``(X_train, y_train, X_validate, y_validate, X_test,
        y_test)``. The X arrays are indexed as 4-D arrays; given how
        ``create_Xy`` builds them the axes are presumably
        (frame, country, time step, feature).
    normalization_method : str
        ``'minmax'`` rescales with the min/max taken over axis 2; any other
        value standardizes with the mean/std taken over axes 1 and 2.
    train_test_only : bool
        Selects which of the two ``splits`` layouts is expected and returned.
    feature_indices : object, optional
        Unused; kept so existing callers do not break.

    Returns
    -------
    tuple
        Same layout as ``splits`` with every X array replaced by its scaled
        version. The y arrays are passed through untouched.
    """
    if train_test_only:
        X_train, y_train, X_test, y_test = splits
    else:
        X_train, y_train, X_validate, y_validate, X_test, y_test = splits

    if normalization_method == 'minmax':
        # One min/max per (frame, country, feature).
        X_min = X_train.min(axis=2)
        X_max = X_train.max(axis=2)
        X_train_scaled = minmax(X_train,
                                X_min[:, :, np.newaxis, :],
                                X_max[:, :, np.newaxis, :])
        # Held-out frames only see the most recent training frame's range.
        last_min = X_min[-1][np.newaxis, :, np.newaxis, :]
        last_max = X_max[-1][np.newaxis, :, np.newaxis, :]

        def scale_held_out(X):
            return minmax(X, last_min, last_max)
    else:
        # One mean/std per (frame, feature).
        X_mean = X_train.mean(axis=(1, 2))
        X_std = X_train.std(axis=(1, 2))
        X_train_scaled = normal(X_train,
                                X_mean[:, np.newaxis, np.newaxis, :],
                                X_std[:, np.newaxis, np.newaxis, :])
        last_mean = X_mean[-1, :][np.newaxis, np.newaxis, np.newaxis, :]
        last_std = X_std[-1, :][np.newaxis, np.newaxis, np.newaxis, :]

        def scale_held_out(X):
            return normal(X, last_mean, last_std)

    X_test_scaled = scale_held_out(X_test)
    if train_test_only:
        return (X_train_scaled, y_train, X_test_scaled, y_test)

    # The validation split must be scaled from X_validate; scaling X_test
    # here would silently hand the test frames back as validation data.
    X_validate_scaled = scale_held_out(X_validate)
    return (X_train_scaled, y_train, X_validate_scaled, y_validate, X_test_scaled, y_test)


def create_Xy(model_data, target_data, time_index, start_date, frame_size, n_days_into_future, n_countries):
    """Build sliding-window feature frames and the matching target rows.

    For every window end date ``d`` in
    ``range(start_date, time_index.max() - n_days_into_future + 2)`` the rows
    of ``model_data`` with ``d - frame_size <= time_index < d`` are reshaped to
    ``(n_countries, frame_size, n_features)`` and stacked along a new leading
    axis.

    Parameters
    ----------
    model_data : pandas.DataFrame
        Feature rows. The reshape assumes rows are grouped by country and
        ordered by time within each country -- TODO confirm against caller.
    target_data : pandas.Series or pandas.DataFrame
        Target values; only ``.values`` is used.
    time_index : pandas.Series
        Integer date proxy aligned row-for-row with ``model_data``.
    start_date : int
        First window end date.
    frame_size : int
        Number of time steps per frame.
    n_days_into_future : int
        Forecast horizon; limits the last usable window.
    n_countries : int
        Number of countries present in every window.

    Returns
    -------
    X : numpy.ndarray
        Shape ``(n_frames, n_countries, frame_size, n_features)``, float64.
    (y, y_time_index) : tuple of numpy.ndarray
        The last ``n_frames`` rows of the target values and of the time index,
        each first reshaped to one row per unique ``time_index`` value.

    Raises
    ------
    ValueError
        If the date range is empty so that no frame can be built.
    """
    # The max date itself cannot end a window: at least one future day is
    # needed as a target. The "+ 2" accounts for range() excluding its endpoint
    # together with the exclusive upper bound of the window below.
    last_window_end = time_index.max() - n_days_into_future + 2

    frames = []
    for max_date_in_window in range(start_date, last_window_end):
        in_window = ((time_index < max_date_in_window)
                     & (time_index >= max_date_in_window - frame_size))
        frame_data = model_data[in_window]
        # Axis 0 becomes the country, axis 1 the time step within the window.
        reshaped_frame_data = frame_data.values.reshape(n_countries, frame_size, -1)
        # WARNING: pad_sequences converts to integers by default, hence the
        # explicit float64 dtype.
        resized_frame_data = pad_sequences(reshaped_frame_data, maxlen=frame_size, dtype=np.float64)
        frames.append(resized_frame_data[np.newaxis, :, :, :])

    if not frames:
        raise ValueError('No frames could be built: start_date is past the last usable window date.')

    # Concatenate once instead of growing X inside the loop (which copies the
    # whole accumulated array on every iteration).
    X = np.concatenate(frames, axis=0)

    n_dates = time_index.nunique()
    y = target_data.values.reshape(-1, n_dates).transpose()[-X.shape[0]:, :]
    y_time_index = time_index.values.reshape(-1, n_dates).transpose()[-X.shape[0]:, :]
    return X, (y, y_time_index)


def create_Xy(model_data, target_data, time_index, start_date, frame_size, n_days_into_future, n_countries):
    """Build sliding-window feature frames and the matching target rows.

    For every window end date ``d`` in
    ``range(start_date, time_index.max() - n_days_into_future + 1)`` the rows
    of ``model_data`` with ``d - frame_size < time_index <= d`` are reshaped to
    ``(n_countries, frame_size, n_features)`` and stacked along a new leading
    axis.

    Parameters
    ----------
    model_data : pandas.DataFrame
        Feature rows. The reshape assumes rows are grouped by country and
        ordered by time within each country -- TODO confirm against caller.
    target_data : pandas.Series or pandas.DataFrame
        Target values; only ``.values`` is used.
    time_index : pandas.Series
        Integer date proxy aligned row-for-row with ``model_data``.
    start_date : int
        First window end date (inclusive in the window).
    frame_size : int
        Number of time steps per frame.
    n_days_into_future : int
        Forecast horizon; limits the last usable window.
    n_countries : int
        Number of countries present in every window.

    Returns
    -------
    X : numpy.ndarray
        Shape ``(n_frames, n_countries, frame_size, n_features)``, float64.
    y : numpy.ndarray
        The last ``n_frames`` rows of the target values after reshaping them
        to one row per unique ``time_index`` value.

    Raises
    ------
    ValueError
        If the date range is empty so that no frame can be built.
    """
    # The max date itself cannot end a window: at least one future day is
    # needed as a target. The "+ 1" is because range() excludes its endpoint.
    last_window_end = time_index.max() - n_days_into_future + 1

    frames = []
    for max_date_in_window in range(start_date, last_window_end):
        in_window = ((time_index <= max_date_in_window)
                     & (time_index > max_date_in_window - frame_size))
        frame_data = model_data[in_window]
        # Axis 0 becomes the country, axis 1 the time step within the window.
        reshaped_frame_data = frame_data.values.reshape(n_countries, frame_size, -1)
        # WARNING: pad_sequences converts to integers by default, hence the
        # explicit float64 dtype.
        resized_frame_data = pad_sequences(reshaped_frame_data, maxlen=frame_size, dtype=np.float64)
        frames.append(resized_frame_data[np.newaxis, :, :, :])

    if not frames:
        raise ValueError('No frames could be built: start_date is past the last usable window date.')

    # Concatenate once instead of growing X inside the loop (which copies the
    # whole accumulated array on every iteration).
    X = np.concatenate(frames, axis=0)

    y = target_data.values.reshape(-1, time_index.nunique()).transpose()[-X.shape[0]:, :]
    return X, y

def split_Xy(X, y, frame_size, n_validation_frames, n_test_frames, date_normalization=True,
             train_test_only=False):
    """Slice X and y chronologically into train, (validation) and test parts.

    The last ``n_test_frames`` rows along axis 0 form the test split, the
    ``n_validation_frames`` rows before them the validation split, and
    everything earlier the training split.

    Parameters
    ----------
    X : numpy.ndarray
        4-D feature array; split along axis 0.
    y : numpy.ndarray
        2-D target array; split along axis 0.
    frame_size : int
        Unused; kept for interface compatibility.
    n_validation_frames : int
        Number of validation rows (ignored when ``train_test_only``).
    n_test_frames : int
        Number of test rows.
    date_normalization : bool
        Unused; kept for interface compatibility.
    train_test_only : bool
        When true no validation split is produced.

    Returns
    -------
    tuple
        ``(X_train, y_train, X_test, y_test)`` when ``train_test_only``,
        otherwise ``(X_train, y_train, X_validate, y_validate, X_test, y_test)``.
    """
    def split_points(n_rows):
        # Explicit non-negative offsets: a negative slice such as X[:-0] is
        # X[:0] (empty) and X[-0:] is everything, which swaps train and test
        # whenever a frame count is zero.
        test_start = max(n_rows - n_test_frames, 0)
        validate_start = max(test_start - n_validation_frames, 0)
        return validate_start, test_start

    X_validate_start, X_test_start = split_points(X.shape[0])
    y_validate_start, y_test_start = split_points(y.shape[0])

    X_test = X[X_test_start:, :, :, :]
    y_test = y[y_test_start:, :]

    if train_test_only:
        return (X[:X_test_start, :, :, :], y[:y_test_start, :], X_test, y_test)

    return (X[:X_validate_start, :, :, :], y[:y_validate_start, :],
            X[X_validate_start:X_test_start, :, :, :], y[y_validate_start:y_test_start, :],
            X_test, y_test)

def split_Xy(X, y, frame_size, n_validation_frames, n_test_frames, date_normalization=True,
             train_test_only=False):
    """Slice X and y chronologically into train, (validation) and test parts.

    The last ``n_test_frames`` rows along axis 0 form the test split, the
    ``n_validation_frames`` rows before them the validation split, and
    everything earlier the training split.

    Parameters
    ----------
    X : numpy.ndarray
        4-D feature array; split along axis 0.
    y : numpy.ndarray
        2-D target array; split along axis 0.
    frame_size : int
        Unused; kept for interface compatibility.
    n_validation_frames : int
        Number of validation rows (ignored when ``train_test_only``).
    n_test_frames : int
        Number of test rows.
    date_normalization : bool
        Unused; kept for interface compatibility.
    train_test_only : bool
        When true no validation split is produced.

    Returns
    -------
    tuple
        ``(X_train, y_train, X_test, y_test)`` when ``train_test_only``,
        otherwise ``(X_train, y_train, X_validate, y_validate, X_test, y_test)``.
    """
    def split_points(n_rows):
        # Explicit non-negative offsets: a negative slice such as X[:-0] is
        # X[:0] (empty) and X[-0:] is everything, which swaps train and test
        # whenever a frame count is zero.
        test_start = max(n_rows - n_test_frames, 0)
        validate_start = max(test_start - n_validation_frames, 0)
        return validate_start, test_start

    X_validate_start, X_test_start = split_points(X.shape[0])
    y_validate_start, y_test_start = split_points(y.shape[0])

    X_test = X[X_test_start:, :, :, :]
    y_test = y[y_test_start:, :]

    if train_test_only:
        return (X[:X_test_start, :, :, :], y[:y_test_start, :], X_test, y_test)

    return (X[:X_validate_start, :, :, :], y[:y_validate_start, :],
            X[X_validate_start:X_test_start, :, :, :], y[y_validate_start:y_test_start, :],
            X_test, y_test)


def simple_model_forecast(df, drift_features, naive_features, forecast_interval=7):
    """Forecast predictors with drift (secant-line) and naive (constant) models.

    Parameters
    ----------
    df : DataFrame
        Training data for every country. Must contain ``date_proxy`` and
        ``location`` columns plus all requested feature columns.
    drift_features : list-like or pd.Index
        Features extrapolated along the straight line through their values at
        the first and last ``date_proxy``.
    naive_features : list-like or pd.Index
        Features held constant at their value on the last ``date_proxy``.
    forecast_interval : int
        Number of future steps to produce per location.

    Returns
    -------
    DataFrame
        ``forecast_interval`` rows per location (index sorted by location),
        drift columns first, naive columns after.

    Notes
    -----
    NOTE(review): the drift rows follow the row order of the last-date slice
    while the index comes from the sorted naive frame; this looks like it
    assumes ``df`` is already ordered by location -- confirm.
    """
    dates = df.date_proxy
    first_date, last_date = dates.min(), dates.max()
    first_rows = df[dates == first_date].set_index('location')
    last_rows = df[dates == last_date].set_index('location')

    # Secant slope per location and drift feature.
    last_drift = last_rows.loc[:, drift_features]
    slopes = (1.0 / (last_date - first_date)) * (last_drift - first_rows.loc[:, drift_features])

    # The Kronecker products expand every location row into forecast_interval
    # rows: slope * step for step = 1..forecast_interval, plus the last value.
    steps = np.arange(1, forecast_interval + 1).reshape(-1, 1)
    hold = np.ones(forecast_interval).reshape(-1, 1)
    extrapolated = np.kron(slopes.values, steps) + np.kron(last_drift.values, hold)

    # Naive model: repeat the last observed row for every forecast step.
    naive_forecasts = pd.concat(
        [last_rows.loc[:, naive_features]] * forecast_interval, axis=0
    ).sort_index()

    drift_forecasts = pd.DataFrame(extrapolated)
    drift_forecasts.columns = drift_features
    drift_forecasts.index = naive_forecasts.index

    return pd.concat((drift_forecasts, naive_forecasts), axis=1)



def shift_for_forecasting(data, forecast_interval=1, target_name='new_cases_per_million'):
    """Shift the target column ``forecast_interval`` steps back, per country.

    Each country's target series is shifted independently so that the value at
    a row becomes the value ``forecast_interval`` rows later within the same
    country; the trailing rows of every country become NaN.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding the ``target_name`` column.
    forecast_interval : int
        Number of rows to look ahead.
    target_name : str
        Column to shift.

    Returns
    -------
    pandas.Series
        The shifted per-country series concatenated in group order.

    Raises
    ------
    ValueError
        If ``country_groupby_indices_list`` yields no groups.
    """
    shifted_pieces = []
    for country_indices in country_groupby_indices_list(data):
        y = data.loc[country_indices, target_name]
        y.index = country_indices
        shifted_pieces.append(y.shift(-forecast_interval))

    if not shifted_pieces:
        raise ValueError('No country groups found; nothing to shift.')

    # Concatenate once rather than re-copying the accumulated series for
    # every country.
    return pd.concat(shifted_pieces, axis=0)

# def box

def minmax(X, X_min, X_max):
    """Min-max rescale ``X`` as ``(X - X_min) / (X_max - X_min)``.

    ``X_min`` and ``X_max`` must already have the same number of dimensions
    as ``X`` (e.g. expanded with ``np.newaxis``); they are tiled up to
    ``X.shape``. Wherever the range is zero the divisor is replaced by 1.
    """
    reps = (np.array(X.shape) / np.array(X_min.shape)).astype(int)
    lower = np.tile(X_min, reps)
    value_range = np.tile(X_max, reps) - lower
    # Constant entries: avoid dividing by zero.
    value_range[value_range == 0] = 1
    return (X - lower) / value_range

def rolling_features(df, features, roll_widths):
    """Compute per-location rolling means of ``features`` for several widths.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``location`` column and the ``features`` columns.
        The result is re-indexed with ``df.index`` positionally, which
        presumably requires ``df`` to be ordered by location -- TODO confirm.
    features : pd.Index or list-like of str
        Columns to average.
    roll_widths : iterable of int
        Rolling window sizes.

    Returns
    -------
    pandas.DataFrame
        One ``<feature>_rolling_mean_<width>`` column per feature and width,
        indexed like ``df``. The first ``width - 1`` rows of every location
        (where the window is incomplete) are filled with 0. An empty
        ``roll_widths`` yields a frame with no columns.
    """
    # Accept plain lists as well as pd.Index so the name arithmetic below works.
    feature_names = pd.Index(features)
    by_location = df.set_index('location').loc[:, feature_names]

    new_feature_df_list = []
    for window in roll_widths:
        rollmean = by_location.groupby(level=0).rolling(window).mean().fillna(value=0.)
        new_features = rollmean.reset_index(drop=True)
        new_features.columns = feature_names + '_rolling_mean_' + str(window)
        new_feature_df_list.append(new_features)

    if not new_feature_df_list:
        return pd.DataFrame(index=df.index)

    # Assemble once, after the loop, instead of on every iteration.
    new_df = pd.concat(new_feature_df_list, axis=1)
    new_df.index = df.index
    return new_df


def _split_label_into_words(label):
    """Return the lowercase words of a CamelCase / underscore / space separated label."""
    # Put a space in front of every run of capitals and turn '_', '/', '(' and
    # ')' into spaces. For the separator alternatives group 1 is empty, so
    # r' \1' is just a space.
    spaced = re.sub(r'([A-Z]+)|_|\/|\)|\(', r' \1', label)
    return spaced.lower().split()


def reformat_values(values_to_transform, category='columns', dateformat=None):
    """Normalize column names, location names or date strings.

    Parameters
    ----------
    values_to_transform : iterable
        The raw values (strings for ``'columns'`` / ``'location'``; anything
        ``pandas.to_datetime`` accepts for ``'date'``).
    category : {'columns', 'location', 'date'}
        * ``'columns'``: lowercase words joined by underscores, so each name
          can be used as a Python identifier / DataFrame attribute
          (https://docs.python.org/3/reference/lexical_analysis.html#identifiers).
        * ``'location'``: title-cased words joined by spaces, then a fixed
          table of known country-name variants is mapped to one convention.
        * ``'date'``: parsed with ``dateformat`` and normalized to midnight;
          unparseable entries become ``NaT``.
    dateformat : str, optional
        ``strftime`` format forwarded to ``pandas.to_datetime``.

    Returns
    -------
    pandas.Series
        The transformed values, in input order.

    Raises
    ------
    ValueError
        If ``category`` is not one of the supported values.
    """
    if category == 'location':
        # These lists are one-to-one; the countries were matched by manual
        # inspection.
        # NOTE(review): 'Guinea' is mapped to 'Guinea Bissau', which merges two
        # different countries -- confirm this is intended.
        mismatch_labels_bad = ['Lao People\'s Democratic Republic', 'Mainland China',
                               'Occupied Palestinian Territory', 'Republic of Korea', 'Korea, South',
                               'Gambia, The ', 'UK',
                               'USA', 'Iran (Islamic Republic of)',
                               'Bahamas, The', 'Russian Federation', 'Czech Republic', 'Republic Of Ireland',
                               'Hong Kong Sar', 'Macao Sar', 'Uk', 'Us',
                               'Congo ( Kinshasa)', 'Congo ( Brazzaville)',
                               'Cote D\' Ivoire', 'Viet Nam', 'Guinea- Bissau', 'Guinea', 'Usa']

        mismatch_labels_good = ['Laos', 'China',
                                'Palestine', 'South Korea', 'South Korea',
                                'The Gambia', 'United Kingdom',
                                'United States', 'Iran',
                                'The Bahamas', 'Russia', 'Czechia', 'Ireland',
                                'Hong Kong', 'Macao', 'United Kingdom', 'United States',
                                'Democratic Republic Of The Congo', 'Republic Of The Congo',
                                'Ivory Coast', 'Vietnam', 'Guinea Bissau', 'Guinea Bissau', 'United States']

        reformatted_values = [' '.join(_split_label_into_words(val)).title()
                              for val in values_to_transform]
        return pd.Series(reformatted_values).replace(to_replace=mismatch_labels_bad,
                                                     value=mismatch_labels_good)

    if category == 'columns':
        reformatted_values = ['_'.join(_split_label_into_words(val))
                              for val in values_to_transform]
        return pd.Series(reformatted_values)

    if category == 'date':
        parsed = pd.to_datetime(pd.Series(values_to_transform), errors='coerce', format=dateformat)
        return parsed.dt.normalize()

    # Fail with a clear message instead of an unbound-local error.
    raise ValueError("category must be 'columns', 'location' or 'date', got {!r}".format(category))



def normal(X, X_mean, X_std):
    """Standardize ``X`` as ``(X - X_mean) / X_std``.

    ``X_mean`` and ``X_std`` must already have the same number of dimensions
    as ``X``; they are tiled up to ``X.shape``. Wherever the standard
    deviation is zero the divisor is replaced by 1.
    """
    reps = (np.array(X.shape) / np.array(X_mean.shape)).astype(int)
    center = np.tile(X_mean, reps)
    spread = np.tile(X_std, reps)
    # Constant entries: avoid dividing by zero.
    spread[spread == 0.] = 1
    return (X - center) / spread


def concatenate_4d_into_3d(splits, train_test_only=False):
    """Merge the two leading axes of every array in ``splits``.

    Each array is passed to ``np.concatenate(..., axis=0)``, which joins its
    sub-arrays along their first axis (a 4-D X becomes 3-D, a 2-D y becomes
    1-D).

    ``splits`` is ``(X_train, y_train, X_test, y_test)`` when
    ``train_test_only`` is true, otherwise ``(X_train, y_train, X_validate,
    y_validate, X_test, y_test)``; the result has the same layout.
    """
    if train_test_only:
        X_train, y_train, X_test, y_test = splits
        arrays = (X_train, y_train, X_test, y_test)
    else:
        X_train, y_train, X_validate, y_validate, X_test, y_test = splits
        arrays = (X_train, y_train, X_validate, y_validate, X_test, y_test)
    return tuple(np.concatenate(arr, axis=0) for arr in arrays)


def concatenate_4d_into_3d(splits, train_test_only=False):
    """Collapse the first two axes of each split array into one.

    Every array goes through ``np.concatenate(..., axis=0)``: its sub-arrays
    along axis 0 are joined end to end, so a 4-D X becomes 3-D and a 2-D y
    becomes 1-D.

    With ``train_test_only`` the input and output layout is ``(X_train,
    y_train, X_test, y_test)``; otherwise validation arrays sit between the
    train and test pairs.
    """
    def merge(arr):
        return np.concatenate(arr, axis=0)

    if train_test_only:
        X_train, y_train, X_test, y_test = splits
        return (merge(X_train), merge(y_train), merge(X_test), merge(y_test))

    X_train, y_train, X_validate, y_validate, X_test, y_test = splits
    return (merge(X_train), merge(y_train),
            merge(X_validate), merge(y_validate),
            merge(X_test), merge(y_test))

def transpose_for_separable2d(splits, train_test_only=False):
    """Swap axes 1 and 2 of every X array in ``splits``; y arrays pass through.

    ``splits`` is ``(X_train, y_train, X_test, y_test)`` when
    ``train_test_only`` is true, otherwise ``(X_train, y_train, X_validate,
    y_validate, X_test, y_test)``; the result has the same layout.
    """
    axis_order = [0, 2, 1, 3]
    if train_test_only:
        X_train, y_train, X_test, y_test = splits
        return (np.transpose(X_train, axes=axis_order), y_train,
                np.transpose(X_test, axes=axis_order), y_test)

    X_train, y_train, X_validate, y_validate, X_test, y_test = splits
    return (np.transpose(X_train, axes=axis_order), y_train,
            np.transpose(X_validate, axes=axis_order), y_validate,
            np.transpose(X_test, axes=axis_order), y_test)
def transpose_for_separable2d(splits, train_test_only=False):
    """Reorder every X array from axes (0, 1, 2, 3) to (0, 2, 1, 3).

    The y arrays are returned unchanged. With ``train_test_only`` the layout
    is ``(X_train, y_train, X_test, y_test)``; otherwise the validation pair
    sits between the train and test pairs.
    """
    def swap_middle_axes(X):
        return np.transpose(X, axes=[0, 2, 1, 3])

    if train_test_only:
        (X_train, y_train, X_test, y_test) = splits
        X_train, X_test = swap_middle_axes(X_train), swap_middle_axes(X_test)
        return (X_train, y_train, X_test, y_test)

    (X_train, y_train, X_validate, y_validate, X_test, y_test) = splits
    X_train = swap_middle_axes(X_train)
    X_validate = swap_middle_axes(X_validate)
    X_test = swap_middle_axes(X_test)
    return (X_train, y_train, X_validate, y_validate, X_test, y_test)



def normal(X, X_mean, X_std):
    """Return the z-score of ``X``: ``(X - X_mean) / X_std``.

    The statistics must have as many dimensions as ``X`` and are tiled to
    ``X.shape`` first. A standard deviation of exactly zero is treated as 1
    so that constant entries do not divide by zero.
    """
    tile_reps = (np.array(X.shape) / np.array(X_mean.shape)).astype(int)
    mean_full = np.tile(X_mean, tile_reps)
    std_full = np.tile(X_std, tile_reps)
    safe_std = np.where(std_full == 0., 1, std_full)
    return (X - mean_full) / safe_std


def regularize_country_names(df):
    """Map known country-name variants in the index to one convention.

    Parameters :
    ----------
    df : Pandas DataFrame
        The country names must be in the index: either a single named index,
        or level 0 of a MultiIndex.

    Returns :
    -------
    Pandas DataFrame
        A copy with the names replaced, duplicate rows dropped and the
        original index layout restored.

    Notes :
    -----
    Different datasets have different naming conventions (for countries that
    go by multiple names and abbreviations). This function imposes a
    convention on a selection of these country names. The row count before
    and after de-duplication is printed.

    NOTE(review): in the MultiIndex case the remaining index levels stay in
    the index while duplicates are dropped, so they are not part of the
    duplicate comparison -- confirm this is intended.
    NOTE(review): 'Guinea' is mapped to 'Guinea Bissau', which merges two
    different countries -- confirm this is intended.
    """
    # These lists are one-to-one; countries were matched by manual inspection.
    mismatch_labels_bad = ['Lao People\'s Democratic Republic', 'Mainland China',
                           'Occupied Palestinian Territory', 'Republic of Korea', 'Korea, South',
                           'Gambia, The ', 'UK',
                           'USA', 'Iran (Islamic Republic of)',
                           'Bahamas, The', 'Russian Federation', 'Czech Republic', 'Republic Of Ireland',
                           'Hong Kong Sar', 'Macao Sar', 'Uk', 'Us',
                           'Congo ( Kinshasa)', 'Congo ( Brazzaville)',
                           'Cote D\' Ivoire', 'Viet Nam', 'Guinea- Bissau', 'Guinea', 'Usa']

    mismatch_labels_good = ['Laos', 'China',
                            'Palestine', 'South Korea', 'South Korea',
                            'The Gambia', 'United Kingdom',
                            'United States', 'Iran',
                            'The Bahamas', 'Russia', 'Czechia', 'Ireland',
                            'Hong Kong', 'Macao', 'United Kingdom', 'United States',
                            'Democratic Republic Of The Congo', 'Republic Of The Congo',
                            'Ivory Coast', 'Vietnam', 'Guinea Bissau', 'Guinea Bissau', 'United States']

    single_level = len(df.index.names) == 1
    if single_level:
        country_col = df.index.name
        flat = df.reset_index()
    else:
        # Only the country level moves into the columns; the other levels
        # remain as the index.
        country_col = df.index.names[0]
        flat = df.reset_index(level=0)

    flat.loc[:, country_col] = flat.loc[:, country_col].replace(
        to_replace=mismatch_labels_bad, value=mismatch_labels_good)

    print(len(flat))
    deduped = flat.drop_duplicates()
    print(len(deduped))

    if single_level:
        return deduped.set_index(country_col)
    return deduped.set_index([country_col, deduped.index])

def clean_DataFrame(df):
    """Drop every column that has no non-NaN value.

    ``nunique`` ignores NaN, so a count of 0 means the column is entirely
    NaN. Columns holding a single distinct value (count of 1) are kept.
    """
    distinct_counts = df.nunique()
    informative_columns = df.columns[(distinct_counts > 0)]
    return df.loc[:, informative_columns]


def drop_all_but_least_missing(df, feature):
    """Keep only the least-missing column among those matching ``feature``.

    The candidate columns are located with
    ``column_search(df, feature, return_style='iloc', threshold='match')``,
    whose result is used here as a sequence of positional column indices.
    Among those candidates the column with the fewest NaN entries is kept
    (``argmin`` picks the first one on ties) and the other candidates are
    removed. Columns that did not match are left untouched.

    Previously the function computed the index of the best column and then
    discarded it, returning ``None``; it now returns the reduced frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to reduce. It is not modified.
    feature : str
        Search term forwarded to ``column_search``.

    Returns
    -------
    pandas.DataFrame
        ``df`` without the redundant matching columns, or ``df`` itself when
        nothing matches.
    """
    matching_columns = column_search(df, feature, return_style='iloc', threshold='match')
    # argmin on an empty selection would raise; there is nothing to drop anyway.
    if len(matching_columns) == 0:
        return df

    missing_counts = df.iloc[:, matching_columns].isna().sum()
    feature_index = matching_columns[missing_counts.argmin()]

    # Select by position rather than by label so that duplicated column names
    # cannot cause the kept column to be dropped as well.
    redundant = {position for position in matching_columns if position != feature_index}
    kept_positions = [position for position in range(df.shape[1]) if position not in redundant]
    return df.iloc[:, kept_positions]

In [None]:
# data[data.location=='Afghanistan'].set_index(['time_index','location'])#.transpose()#.droplevel(1, axis=1)#.unstack()#.drop_level(1)

# test = data.set_index(['time_index','location'])
# pd.DataFrame(test.values.reshape(-1, 22*data.time_index.nunique())[0,:].reshape(-1, 22))

# model_data = data.drop(columns=data.columns[data.columns.str.contains('flag')])
# model_data.columns

# model_data.iloc[:,2:].corr().replace(to_replace=1., value=np.nan).max()

# model_data.iloc[:,2:].corr().replace(to_replace=1., value=np.nan).idxmax()

# null_info = (data.groupby('location').mean()==0).sum(1).sort_values()==0
# null_info.index[np.where(null_info)[0]].sort_values()

# np.where(null_info)#[0]

# null_info = (data.groupby('location').mean()==0).sum(1).sort_values()==0
# full_data = data[data.location.isin(null_info.index[np.where(null_info)[0]])]

# n_test_days = 1
# n_days_into_future = 1
# n_prune = 4
# # model_data = data.drop(columns=column_search(data, 'test'))
# # model_data = data[data.time_index>40]
# # model_data = data.drop(columns=data.columns[data.columns.str.contains('flag')])#.drop(columns=data.columns[data.columns.str.contains('mean')])
# model_data = full_data
# # model_data = data.iloc[:,:8]
# Xs, ys, model =  n_day_forecasting(model_data, n_test_days, n_days_into_future,
#                                    n_prune=n_prune, col_transformer=MinMaxScaler())
# y_true, y_naive, y_predict = ys
# print('There were {} negative predictions, setting these values to 0.'.format(len(y_predict[y_predict<0])))
# # y_predict[y_predict<0]=0


In [None]:
# n_prune = 2
# mae_list_naive = []
# r2_list_naive = []
# mae_list_predict = []
# r2_list_predict = []
# # data = data[data.time_index >= first_day]
# model_data = data.iloc[:, n_prune:].copy()#.apply(lambda x : np.log(x+1))
# new_cases_index = column_search(model_data,'new_cases_weighted',threshold='match', return_style='iloc')[0]
# n_countries = data.location.nunique()
# target_data = data.new_cases_weighted
# time_index = data.time_index
# frame_size = 14
# start_date = frame_size + time_index.min()
# n_validation_frames = 0
# n_test_frames = 1
# n_days_into_future = 14
# train_or_test = 'train'

# for max_date_in_window in range(start_date, time_index.max() - n_days_into_future + 2):
#     # Take all model_data with date proxy less than numerical value, leading_window_date_not_included
#     frame_data = model_data[(time_index < max_date_in_window) & 
#                             (time_index >= max_date_in_window-frame_size)]
#     #     print(frame_data.shape)
#     # Reshape the array such that each element along axis=0 is a time series of all feature model_data of a specific country.
#     reshaped_frame_data = frame_data.values.reshape(n_countries, frame_size, -1)
#     #     print(reshaped_frame_data.shape)
#     # Truncate / pad the windows along the "time" axis, axis=1. (pad_sequences takes in an iterable of iterables;
#     # the first axis is always the default iteration axis. 
#     # *********************** WARNING: pad_sequences converts to integers by default *********************
#     resized_frame_data = pad_sequences(reshaped_frame_data, maxlen=frame_size, dtype=np.float64)
#     frame_data_4D = resized_frame_data[np.newaxis, :, :, :]
#     if max_date_in_window == start_date:
#         X = frame_data_4D.copy()
#     else:
#         X = np.concatenate((X, frame_data_4D),axis=0)

        

# days_in_dataset_list = []
# earliest_date_in_dataset = []
# # X = X[80:,:,:,:]
# for first_day in range(0, X.shape[0]-2):
#     X = X[1:,:,:,:]
#     days_in_dataset_list.append(X.shape[0])
#     earliest_date_in_dataset.append(int(X[0, 0, 0, 0]))
#     print(X.shape[0],end=' ')
#     y = target_data.values.reshape(-1, time_index.nunique()).transpose()[-X.shape[0]:,:]
#     y_time_index = time_index.values.reshape(-1, time_index.nunique()).transpose()[-X.shape[0]:,:]
#     # y = model_data.new_cases_weighted.values.reshape(-1, model_data.time_index.nunique()).transpose()[-X.shape[0]:,:]

#     X_train= X[:-n_test_frames,:,:,:]
#     y_train =  y[:-n_test_frames,:]
#     X_test = X[-n_test_frames:, :, :, :] 
#     y_test = y[-n_test_frames:, :]
#     splits =  (X_train, y_train, X_test, y_test)
#     y_train_time = y_time_index[:-n_test_frames,:]
#     y_test_time = y_time_index[-n_test_frames:, :]

#     X_train_model = np.concatenate(X_train.reshape(X_train.shape[0], X_train.shape[1], -1), axis=0)
#     X_test_model = np.concatenate(X_test.reshape(X_test.shape[0], X_test.shape[1], -1), axis=0)#[:,2:23]
#     y_train_model = y_train.ravel()
#     y_test_model = y_test.ravel()

#     model = Ridge(fit_intercept=False, tol=1e-12) 
#     _ = model.fit(X_train_model, y_train_model.ravel())


#     y_true = y_train_model
#     _, y_predict, mae = classifier_analysis(model, X_train_model, 
#                                                  y_train_model.ravel(), 
#                                                  plot=False, metric='mae')


# #     plt.plot(y_predict)
# #     plt.plot(y_true.ravel())

#     y_train_naive = X_train[:,:,-1,new_cases_index].ravel()
#     y_test_naive = X_test[:,:,-1,new_cases_index].ravel()
    
# #     print('There were {} negative predictions'.format(len(y_predict[y_predict<0])))
#     y_predict[y_predict<0]=0


#     mae_list_naive.append(mean_absolute_error(y_true.ravel(), y_train_naive.ravel()))
#     mae_list_predict.append(mean_absolute_error(y_true.ravel(), y_predict))
#     r2_list_naive.append(explained_variance_score(y_true.ravel(), y_train_naive.ravel()))
#     r2_list_predict.append(explained_variance_score(y_true.ravel(), y_predict))

# #     print('{}-step MAE [Naive, Ridge Regression] = [{},{}]'.format(
# #     n_days_into_future, mae_train_naive, mae_predict))
# #     print('{}-step R^2 [Naive, Ridge Regression] = [{},{}]'.format(
# #     n_days_into_future, r2_train_naive, r2_predict))

# #     true_predict_plot(y_true.ravel(), y_train_naive.ravel(), y_predict)
# #     residual_diff_plots(y_true.ravel(), y_train_naive.ravel(), y_predict , n_days_into_future, data.location.nunique())

# train_or_test = 'train'
# if train_or_test == 'train':
#     y_true = y_train_model
#     _, y_predict, mae = classifier_analysis(model, X_train_model, 
#                                                  y_train_model.ravel(), 
#                                                  plot=False, metric='mae')
# else:
#     y_true = y_test
#     _, y_predict, mae = classifier_analysis(model, X_test_model,
#                                                  y_test.ravel(),
#                                                  plot=False, metric='mae')
    
# y_train_naive = X_train[:, :, -1, new_cases_index].ravel()
# y_test_naive = X_test[:, :, -1, new_cases_index].ravel()

# print('There were {} negative predictions'.format(len(y_predict[y_predict<0])))
# y_predict[y_predict<0]=0


# mae_train_naive = mean_absolute_error(y_true.ravel(), y_train_naive.ravel())
# mae_predict = mean_absolute_error(y_true.ravel(), y_predict)
# r2_train_naive = explained_variance_score(y_true.ravel(), y_train_naive.ravel())
# r2_predict = explained_variance_score(y_true.ravel(), y_predict)

# print('{}-step MAE [Naive, Ridge Regression] = [{},{}]'.format(
# n_days_into_future, mae_train_naive, mae_predict))
# print('{}-step R^2 [Naive, Ridge Regression] = [{},{}]'.format(
# n_days_into_future, r2_train_naive, r2_predict))

# true_predict_plot(y_true.ravel(), y_train_naive.ravel(), y_predict)
# residual_diff_plots(y_true.ravel(), y_train_naive.ravel(), y_predict , n_days_into_future, data.location.nunique())

In [None]:
# Draw each metric in its own panel. Issuing four plt.plot calls back to back
# put the R2 and MAE curves on one shared axes, and every xlabel/ylabel call
# overwrote the previous one, so only the last label ('Naive MAE') survived.
metrics_fig, metrics_axes = plt.subplots(2, 2, figsize=(12, 8))
metric_panels = [
    (r2_list_naive, 'Naive baseline R2 value'),
    (r2_list_predict, 'Predictions (on training data) R2 value'),
    (mae_list_predict, 'Predictions (on training data) MAE'),
    (mae_list_naive, 'Naive MAE'),
]
for panel_ax, (metric_values, metric_label) in zip(metrics_axes.ravel(), metric_panels):
    panel_ax.plot(days_in_dataset_list, metric_values)
    panel_ax.set_xlabel('Number of days in training set')
    panel_ax.set_ylabel(metric_label)
metrics_fig.tight_layout()

The next plot shows the effect of *truncating* the *earliest* days from the data set. The predictions are made on the training data only;
the x-axis is the earliest date kept in the dataset. In other words, the left endpoint of each curve uses all available dates (115 days), while the right endpoint uses only the most recent dates (2 days).


In [None]:
# Shift the recorded earliest dates by X[-1, 0, 0, 0] so the x-axis is relative
# to that value. NOTE(review): presumably feature 0 of X is the time index of
# the newest frame -- confirm against the cell that builds X.
reference_value = X[-1, 0, 0, 0]
relative_earliest_dates = np.array(earliest_date_in_dataset) - reference_value

for mae_series, series_label in ((mae_list_predict, 'MAE of predictions'),
                                 (mae_list_naive, 'MAE of naive baseline')):
    plt.plot(relative_earliest_dates, mae_series, label=series_label)

plt.xlabel('Earliest date in dataset, relative to present date')
plt.ylabel('MAE')
plt.title('MAE vs. Number of days in dataset, 14-day predictions')
plt.legend()

# framewise mean std

In [1]:
# for i in range(0, X_train_model.shape[0]):
#     current_frame_mean = X_train_model[i,:,:,:].mean(axis=1)
#     current_frame_std = X_train_model[i,:,:,:].std(axis=1)
#     latest_mean_array = np.tile(current_frame_mean[np.newaxis, :, np.newaxis, :],(1,1,frame_size,1))
#     latest_std_array = np.tile(current_frame_std[np.newaxis, :, np.newaxis, :],(1,1,frame_size,1))
#     if i == 0:
#         frame_only_mean_array = latest_mean_array
#         frame_only_std_array = latest_std_array
#     else:

#         frame_only_mean_array = np.concatenate((frame_only_mean_array, 
#                                                latest_mean_array)
#                                               ,axis=0)
#         frame_only_std_array = np.concatenate((frame_only_std_array, 
#                                                latest_std_array)
#                                               ,axis=0)
        

# frame_only_std_array[np.where(frame_only_std_array==0)]=1
# X_train_model_model = (X_train_model - frame_only_mean_array) / frame_only_std_array
# # Use the latest min and max for test scaling.

# latest_std_array[np.where(latest_std_array==0)] = 1
# X_test_model_model = (X_test_model - latest_mean_array) / latest_std_array

# X_train_model_model = np.concatenate(X_train_model_model.reshape(X_train_model_model.shape[0], 
#                                                                      X_train_model_model.shape[1], -1), axis=0)
# X_test_model_model = np.concatenate(X_test_model_model.reshape(X_test_model_model.shape[0], 
#                                                                    X_test_model_model.shape[1], -1), axis=0)

# y_train_model = y_train.ravel()
# y_test_model = y_test.ravel()

# framewise min max

In [None]:
# for i in range(0, X_train.shape[0]):
#     # find the minima and maxima of all features for all countries, ranging up to current frame and 
#     # each time step in the frame. 
#     current_frame_min = X_train[i,:,:,:].min(axis=1)
#     current_frame_max = X_train[i,:,:,:].max(axis=1)
#     latest_min_array = np.tile(current_frame_min[np.newaxis, :, np.newaxis, :],(1,1,frame_size,1))
#     latest_max_array = np.tile(current_frame_max[np.newaxis, :, np.newaxis, :],(1,1,frame_size,1))
#     if i == 0:
#         frame_only_min_array = latest_min_array
#         frame_only_max_array = latest_max_array
#     else:

#         frame_only_min_array = np.concatenate((frame_only_min_array, 
#                                                latest_min_array)
#                                               ,axis=0)
#         frame_only_max_array = np.concatenate((frame_only_max_array, 
#                                                latest_max_array)
#                                               ,axis=0)
        

# frame_only_minmax_denominator = (frame_only_max_array-frame_only_min_array)
# num_zeros_train = (frame_only_minmax_denominator==0).sum()

# frame_only_minmax_denominator[np.where(frame_only_minmax_denominator==0)]=1
# X_train_normalized = (X_train - frame_only_min_array) / frame_only_minmax_denominator
# # Use the latest min and max for test scaling.

# frame_only_denom_for_test = latest_max_array - latest_min_array
# num_zeros_test = (frame_only_denom_for_test==0).sum()

# frame_only_denom_for_test[np.where(frame_only_denom_for_test==0)] = 1
# X_test_normalized = (X_test - latest_min_array) / frame_only_denom_for_test

# X_train_model = np.concatenate(X_train_normalized.reshape(X_train_normalized.shape[0], 
#                                                                      X_train_normalized.shape[1], -1), axis=0)
# X_test_model = np.concatenate(X_test_normalized.reshape(X_test_normalized.shape[0], 
#                                                                    X_test_normalized.shape[1], -1), axis=0)

# y_train_model = y_train.ravel()
# y_test_model = y_test.ravel()

In [None]:
# In-frame min-max scaling: every training frame is rescaled with the minimum
# and maximum that each (axis-1, axis-3) entry attains along axis 2 of that
# frame alone -- no statistics are shared between frames.
# assumes X_train / X_test are (frame, country, time step, feature) and that
# X_train.shape[2] == frame_size -- TODO confirm against the windowing cell.
#
# The reductions are done in one vectorized call instead of growing the arrays
# with np.concatenate inside a Python loop (which re-copied them every step).
# np.repeat restores the time axis so the arrays keep the same shape as before.
frame_only_min_array = np.repeat(X_train.min(axis=2, keepdims=True), frame_size, axis=2)
frame_only_max_array = np.repeat(X_train.max(axis=2, keepdims=True), frame_size, axis=2)

# Statistics of the last training frame; reused below to scale the test frames.
latest_min_array = frame_only_min_array[-1:].copy()
latest_max_array = frame_only_max_array[-1:].copy()

frame_only_minmax_denominator = frame_only_max_array - frame_only_min_array
num_zeros_train = (frame_only_minmax_denominator == 0).sum()

# A zero range means the value is constant within the frame; dividing by 1
# instead maps it to 0 and avoids a division by zero.
frame_only_minmax_denominator[frame_only_minmax_denominator == 0] = 1
X_train_normalized = (X_train - frame_only_min_array) / frame_only_minmax_denominator

# Use the latest min and max for test scaling.
frame_only_denom_for_test = latest_max_array - latest_min_array
num_zeros_test = (frame_only_denom_for_test == 0).sum()

frame_only_denom_for_test[frame_only_denom_for_test == 0] = 1
X_test_normalized = (X_test - latest_min_array) / frame_only_denom_for_test

# for i in range(1, X_train.shape[0]+1):
#     # find the minima and maxima of all features for all countries, ranging up to current frame and 
#     # each time step in the frame. 
#     up_to_current_frame_min = X_train[:i,:,:,:].min((0,2))
#     up_to_current_frame_max = X_train[:i,:,:,:].max((0,2))
#     latest_min_array = np.tile(up_to_current_frame_min[np.newaxis, :, np.newaxis, :],(1,1,frame_size,1))
#     latest_max_array = np.tile(up_to_current_frame_max[np.newaxis, :, np.newaxis, :],(1,1,frame_size,1))
#     if i == 1:
#         frame_wise_min_array = latest_min_array
#         frame_wise_max_array = latest_max_array
#     else:

#         frame_wise_min_array = np.concatenate((frame_wise_min_array, 
#                                                latest_min_array)
#                                               ,axis=0)
#         frame_wise_max_array = np.concatenate((frame_wise_max_array, 
#                                                latest_max_array)
#                                               ,axis=0)
        
# frame_wise_minmax_denominator = (frame_wise_max_array-frame_wise_min_array)
# num_zeros_train = (frame_wise_minmax_denominator==0).sum()
# print('num zeros train', num_zeros_train)
# frame_wise_minmax_denominator[np.where(frame_wise_minmax_denominator==0)]=1
# X_train_normalized = (X_train - frame_wise_min_array) / frame_wise_minmax_denominator
# # Use the latest min and max for test scaling.

# frame_wise_denom_for_test = latest_max_array - latest_min_array
# num_zeros_test = (frame_wise_denom_for_test==0).sum()

# frame_wise_denom_for_test[np.where(frame_wise_denom_for_test==0)] = 1
# X_test_normalized = (X_test - latest_min_array) / frame_wise_denom_for_test

# In-frame minmax, then up to present day minmaxing. 

In [None]:
# ---------------------------------------------------------------------------
# Build sliding-window frames from `data` and split them into train / test.
# ---------------------------------------------------------------------------
n_prune = 4      # number of leading columns of `data` left out of the model features
first_day = 40   # rows with time_index below this value are discarded
mae_list_naive = []
r2_list_naive = []
mae_list_predict = []
r2_list_predict = []
# Rebinds the notebook-level `data`; the filter is idempotent, so re-running
# the cell does not remove further rows.
data = data[data.time_index >= first_day]
model_data = data.iloc[:, n_prune:].copy()#.apply(lambda x : np.log(x+1))
# Positional index of the target column inside the pruned feature set.
new_cases_index = column_search(model_data,'new_cases_weighted',threshold='match', return_style='iloc')[0]
n_countries = data.location.nunique()
target_data = data.new_cases_weighted
time_index = data.time_index
frame_size = 14
# First window boundary for which a full frame of `frame_size` days exists.
start_date = frame_size + time_index.min()
n_validation_frames = 0
n_test_frames = 1
n_days_into_future = 1
train_or_test = 'train'
n_features = model_data.shape[-1]


# Collect the frames in a list and concatenate once at the end; concatenating
# onto X inside the loop re-copied the whole array on every iteration.
frame_list = []
for max_date_in_window in range(start_date, time_index.max() - n_days_into_future + 2):
    # Rows of the `frame_size` days strictly before max_date_in_window
    # (the boundary date itself is not part of the frame).
    frame_data = model_data[(time_index < max_date_in_window) &
                            (time_index >= max_date_in_window-frame_size)]
    # Each element along axis=0 becomes the time series of all features of one
    # country. NOTE(review): this relies on the rows being ordered by country
    # and then by date, with exactly `frame_size` rows per country -- confirm.
    reshaped_frame_data = frame_data.values.reshape(n_countries, frame_size, -1)
    # Truncate / pad the windows along the "time" axis, axis=1 (pad_sequences
    # iterates over the first axis).
    # *********************** WARNING: pad_sequences converts to integers by default *********************
    # hence the explicit dtype=np.float64.
    resized_frame_data = pad_sequences(reshaped_frame_data, maxlen=frame_size, dtype=np.float64)
    frame_data_4D = resized_frame_data[np.newaxis, :, :, :]
    frame_list.append(frame_data_4D)

# X: one entry along axis 0 per window boundary.
X = np.concatenate(frame_list, axis=0)


# Reshape the flat target / time columns to one row per date, then keep only
# the last X.shape[0] dates so there is one target row per frame.
# NOTE(review): same row-ordering assumption as for the frames above.
y = target_data.values.reshape(-1, time_index.nunique()).transpose()[-X.shape[0]:,:]
y_time_index = time_index.values.reshape(-1, time_index.nunique()).transpose()[-X.shape[0]:,:]
# y = model_data.new_cases_weighted.values.reshape(-1, model_data.time_index.nunique()).transpose()[-X.shape[0]:,:]

# The last n_test_frames frames are held out for testing.
X_train= X[:-n_test_frames,:,:,:]
y_train =  y[:-n_test_frames,:]
X_test = X[-n_test_frames:, :, :, :]
y_test = y[-n_test_frames:, :]
splits =  (X_train, y_train, X_test, y_test)
y_train_time = y_time_index[:-n_test_frames,:]
y_test_time = y_time_index[-n_test_frames:, :]
X_train_model = X_train
X_test_model = X_test

# Naive baseline: the target feature at the last time step of each frame.
y_train_naive = X_train[:, :, -1, new_cases_index]#.ravel()
y_test_naive = X_test[:, :, -1, new_cases_index]#.ravel()

# I think it's likely that I need to aggregate/normalize with respect to all countries; otherwise the relative values won't be predicted accurately. I may also need to change the order of the data so that it includes all countries.

# To capture seasonality, normalize in-frame. To capture trend, normalize up to frame, AFTER in-frame. 

In [None]:
# Explained-variance curve with shaded bands marking where each group of
# features lies along the x-axis.
fig, ax = plt.subplots(1, 1, figsize=(15, 5))
_ = ax.plot(ev_scores, marker='.', markersize=2, linewidth=0.5, color='k')

# Every band reaches from y=0 up to the best score.
band_height = max(ev_scores)

# One entry per band: (left edge, width, alpha, colour, legend label).
# Apart from the first band, which starts at 0, each band spans the min..max
# of the corresponding index array.
feature_bands = [
    (0, missing_flag_epoch.min(),
     0.5, 'g', 'original data'),
    (missing_flag_epoch.min(), missing_flag_epoch.max()-missing_flag_epoch.min(),
     0.5, 'k', 'Missing value flags'),
    (rolling_epoch.min(), rolling_epoch.max()-rolling_epoch.min(),
     0.25, 'red', 'Rolling averages'),
    (location_dummies.min(), location_dummies.max()-location_dummies.min(),
     0.5, 'gray', 'Location one-hot'),
    (date_dummies.min(), date_dummies.max()-date_dummies.min(),
     0.25, 'blue', 'Date one-hot'),
]
for left_edge, band_width, band_alpha, band_color, band_label in feature_bands:
    _ = ax.add_patch(Rectangle((left_edge, 0), band_width, band_height,
                               angle=0.0, alpha=band_alpha, color=band_color,
                               label=band_label))

_ = ax.set_ylabel('Explained Variance')
_ = ax.set_xlabel('Number of features included in model')
_ = plt.legend(loc=(1.05,0.6), title='Feature Origin')

# Columns up to and including 'time_index', minus the last two of that slice.
columns_through_time_index = model_data.loc[:, :'time_index'].columns
drift_features = columns_through_time_index[:-2].tolist()

# All columns of the first three frames plus all but the last two columns of
# flag_and_misc, in that order.
naive_features = [
    *time_independent.columns.tolist(),
    *location_one_hot.columns.tolist(),
    *tests_units_one_hot.columns.tolist(),
    *flag_and_misc.columns.tolist()[:-2],
]


In [None]:
# 2 slices date, location, 4 slices date,location,time_index, days_since_first_case

# Columns carried along with the targets and the naive baseline.
target_columns = ['time_index', 'new_cases_weighted']

# Row-wise train / test split of features and targets.
# (An optional log(x + 1) transform of the features is currently disabled.)
X_train = X.loc[train_indices, :]
X_test = X.loc[test_indices, :]
y_train = y.loc[train_indices, target_columns]
y_test = y.loc[test_indices, target_columns]

# The naive baseline is read from the feature frames, so it has to be taken
# before the first n_prune columns are removed below.
y_train_naive = X_train.loc[:, target_columns]
y_test_naive = X_test.loc[:, target_columns]

# Drop the first n_prune columns from the model inputs.
X_train = X_train.iloc[:, n_prune:]
X_test = X_test.iloc[:, n_prune:]

# Fit the scaler on the training features only, then apply it to both splits.
col_transformer = MinMaxScaler()
X_train_normalized = col_transformer.fit_transform(X_train)
X_test_normalized = col_transformer.transform(X_test)