# Benchmark Models Generation - providing a baseline for the prediction competition

In [None]:
# Basics
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.cbook as cbook
import os
from functools import partial
import random


# Views 3
# These imports require a certificate, these won't work without the certificate
import views_runs
from viewser.operations import fetch
from views_forecasts.extensions import *
from viewser import Queryset, Column

#All the functions are in the BenchmarkModels.py file, importing them to use them in this notebook
from BenchmarkModels import *

In [None]:
# Common parameters shared by the rest of the notebook.

dev_id = 'Fatalities002' # Run id passed as run=dev_id to pd.DataFrame.forecasts.read_store(...) and to RetrieveStoredPredictions further down
run_id = 'Fatalities002'
EndOfHistory = 508
get_future = False

username = os.getlogin() # Login name of the current OS user; interpolated into the Dropbox/Overleaf paths below

steps = [*range(1, 36+1, 1)] # Steps 1..36 inclusive. NOTE(review): `steps` is rebound to 3..14 in the benchmark-parameter cell, so later cells see that shorter list

fi_steps = [1,3,6,12,36] # not used anywhere in the visible part of this notebook


# Specifying partitions (all numbers are month ids; 121 = January 1990)

calib_partitioner_dict = {"train":(121,396),"predict":(397,456)} # Calibration partition
test_partitioner_dict = {"train":(121,444),"predict":(457,504)} # Test partition
future_partitioner_dict = {"train":(121,492),"predict":(505,512)} # Future prediction partition
calib_partitioner =  views_runs.DataPartitioner({"calib":calib_partitioner_dict}) # DataPartitioner is provided by views_runs
test_partitioner =  views_runs.DataPartitioner({"test":test_partitioner_dict}) # DataPartitioner is provided by views_runs
future_partitioner =  views_runs.DataPartitioner({"future":future_partitioner_dict}) # DataPartitioner is provided by views_runs

# Output locations. Both paths assume a macOS-style home directory (/Users/<username>)
# with the ViEWS Dropbox folder synced locally.
Mydropbox = f'/Users/{username}/Dropbox (ViEWS)/ViEWS/' # requires the Dropbox app
overleafpath = f'/Users/{username}/Dropbox (ViEWS)/Apps/Overleaf/Prediction competition 2023/' # requires the Overleaf/Dropbox integration


print('Dropbox path set to',Mydropbox)
print('Overleaf path set to',overleafpath)

# From here we start creating benchmark models - we create 1000 (cm) or 100 (pgm) draws, i.e. a probability distribution, around the point predictions from the viewser ensemble prediction function

In [5]:
# Benchmark model parameters 
filepath = Mydropbox + 'Prediction_competition_2023/'  # folder all benchmark parquet files are written to

year_list = [2018, 2019, 2020, 2021] # Creating predictions for 4 years
draws_cm = 1000 # 1000 draws at cm level
draws_pgm = 100 # 100 draws at pgm level

# NOTE(review): this rebinds the notebook-wide `steps` (1..36 in the common-parameter cell)
# to 3..14; every later cell that passes `steps` on sees this shorter list.
steps = [3,4,5,6,7,8,9,10,11,12,13,14] #list of steps to create a list of names below
# Column selection for the stored forecasts: the outcome column followed by one
# 'step_pred_<s>' column per step.
stepcols = ['ln_ged_sb_dep']
for step in steps:
    stepcols.append('step_pred_' + str(step))
print(stepcols)

['ln_ged_sb_dep', 'step_pred_3', 'step_pred_4', 'step_pred_5', 'step_pred_6', 'step_pred_7', 'step_pred_8', 'step_pred_9', 'step_pred_10', 'step_pred_11', 'step_pred_12', 'step_pred_13', 'step_pred_14']


# CM Level Benchmark Models

### Based on the ensemble from the viewser prediction function: the point predictions are expanded using Poisson draws with mean = variance = $\hat{y}_{it}$, i.e. both the mean and the variance equal the point prediction.

In [None]:
# Benchmark input: stored VIEWS cm ensemble predictions, one record per competition year.
cm_ensemble_name = 'cm_ensemble_genetic_test'  # name of the stored ensemble forecast

# Read the stored forecast (needs the VIEWS certificate) and keep only the outcome
# column plus the step columns listed in `stepcols`.
ensemble_df = (
    pd.DataFrame.forecasts
    .read_store(cm_ensemble_name, run=dev_id)
)[stepcols]
ensemble_df.head()  # not the last expression of the cell, so nothing is displayed

# One dict per year: the year itself and the predictions extracted for it.
sc_predictions_ensemble = []
for year in year_list:
    sc_predictions_ensemble.append(
        {
            'year': year,
            'prediction_df': extract_sc_predictions(year=year, ss_predictions=ensemble_df),
        }
    )
    

In [None]:
# Actual outcomes taken from the ensemble frame: the outcome column has its missing
# values replaced by 0 and is then transformed with expm1 (exp(x) - 1).
logged_outcome = ensemble_df['ln_ged_sb_dep'].fillna(0)
actuals = np.expm1(logged_outcome)

# One record per year; extract_year is defined in BenchmarkModels.py.
actuals_by_year = []
for year in year_list:
    actuals_by_year.append(
        {
            'year': year,
            'actuals_df': extract_year(year=year, df=actuals),
        }
    )

In [None]:
actuals.isna().sum() #finds the number of missing values in the dataframe - actuals

### CM - Poisson

In [None]:
# Expand each year's point predictions into draws from a Poisson distribution.
# The number of draws is taken from the `draws_cm` setting in the benchmark-parameter
# cell rather than a hard-coded 1000, so the setting actually controls the output.

for year_record in sc_predictions_ensemble:
    print(year_record['year'])
    df = year_record.get('prediction_df')
    # Draws are stored on the year record under 'expanded_df_poisson'.
    year_record['expanded_df_poisson'] = expanded_df_poisson(df,ndraws=draws_cm,level='cm')

# Spot check of point predictions against the draws for a single unit:
# month 457 is January 2018; country 57 is an arbitrary example.
describe_expanded(df=sc_predictions_ensemble[0]['prediction_df'], df_expanded=sc_predictions_ensemble[0]['expanded_df_poisson'], month=457, country=57)

sc_predictions_ensemble[0]['expanded_df_poisson'].head()

### CM - Bootstrap

In [None]:
# Expand each year's actuals into draws sampled bootstrap-fashion from the actuals.
# The number of draws is taken from the `draws_cm` setting in the benchmark-parameter
# cell rather than a hard-coded 1000.

for year_record in actuals_by_year:
    df = year_record.get('actuals_df')

    # The pool to sample from is the year's own outcome column.
    # NOTE(review): `actuals` was already passed through expm1, so despite the
    # 'ln_' prefix this column presumably holds counts, not logs -- confirm.
    year_record['expanded_df_bootstrap'] = expanded_df_bootstrap(df,ndraws=draws_cm,draw_from=df['ln_ged_sb_dep'],level='cm')

# Spot check of actuals against the draws for a single unit:
# month 457 is January 2018; country 57 is an arbitrary example.
describe_expanded(df=actuals_by_year[0]['actuals_df'], df_expanded=actuals_by_year[0]['expanded_df_bootstrap'], month=457, country=57)

actuals_by_year[0]['expanded_df_bootstrap'].head()

In [None]:
actuals_by_year[0]['actuals_df']['ln_ged_sb_dep']

In [None]:
bootstrapped = pd.DataFrame(actuals_by_year[0]['expanded_df_bootstrap']['outcome']).astype('float64')

In [None]:
bootstrapped.describe()

In [None]:
actuals_by_year[0]['actuals_df']['ln_ged_sb_dep'].describe()

### CM - Constituent Models

#### Based on constituent models

Short version, 20 models: 
1 "draw"
from each of 20 constituent models

Plus version with 45 draws from Poisson distribution for each model.

Possibly obsolete:
Long version, 440 models:
20 "draws" from each of 22 constituent models, using predictions for adjacent steps (from s-4 to s+6). Some duplications to weight the most proximate steps more.



In [None]:

# Fatalities002 stuff - contains the list of the current fatalities002 ensemble models

# NOTE(review): project-local import placed mid-notebook; later cells (the pgm model
# list) rely on DefineEnsembleModels having been imported here.
from ModelDefinitions import DefineEnsembleModels

level = 'cm'
ModelList_cm = DefineEnsembleModels(level)
ModelList_cm = ModelList_cm[0:20] # Keep the first 20 entries only (drops the Markov models)

# Print index, name and training data set of every retained model.
i = 0
for model in ModelList_cm:
    print(i, model['modelname'], model['data_train'])
    i = i + 1

# Retrieving the predictions for calibration and test partitions
# The ModelList contains the predictions organized by model
from Ensembling import CalibratePredictions, RetrieveStoredPredictions, mean_sd_calibrated, gam_calibrated

# NOTE(review): `steps` is whatever the name is bound to when this cell runs; after the
# benchmark-parameter cell that is 3..14, not the 1..36 of the common-parameter cell.
ModelList_cm = RetrieveStoredPredictions(ModelList_cm, steps, EndOfHistory, dev_id, level, get_future)

ModelList_cm = CalibratePredictions(ModelList_cm, EndOfHistory, steps)

In [None]:
# Assembling benchmark based on VIEWS constituent model predictions.
# Every model contributes an equal share of the `draws_cm` draws; the share is now
# actually passed on to the expansion instead of a hard-coded 50.
draws_per_model = int(np.floor_divide(draws_cm,len(ModelList_cm)))

for model in ModelList_cm:
    print(model['modelname'])

    # One record per year with this model's predictions for that year.
    model['sc_predictions_constituent'] = []

    for year in year_list:
        sc_dict = {
            'year': year,
            'prediction_df': extract_sc_predictions(year=year,ss_predictions=model['predictions_test_df'])
        }
        model['sc_predictions_constituent'].append(sc_dict)

    # Expand every yearly record into `draws_per_model` draws.
    for year_record in model['sc_predictions_constituent']:
        print(year_record['year'])
        df = year_record.get('prediction_df')
        year_record['expanded_df'] = expanded_df(df,ndraws=draws_per_model,level='cm')

In [None]:
# Stack the expanded draws of ALL constituent models into one frame per year.
# The previous slice ModelList_cm[1:19] stopped one short and silently left out the
# last model; iterating over the full list includes every model exactly once.
sc_predictions_constituent = []

# Records inside model['sc_predictions_constituent'] were appended in year_list order,
# so the position in year_list is the right index (no reliance on year - 2018).
for year_index, year in enumerate(year_list):
    print(year)
    expanded_frames = []
    for model in ModelList_cm:
        print(model['modelname'])
        expanded_frames.append(model['sc_predictions_constituent'][year_index]['expanded_df'])
    merged_expanded_df = pd.concat(expanded_frames)

    sc_predictions_constituent.append({
        'year': year,
        'expanded_df': merged_expanded_df
    })
       
#sc_predictions

### Saving the CM benchmark models & actuals in separate Parquet files in Dropbox

In [None]:
# Write one parquet file per benchmark model and year.
model_names = ['ensemble','constituent']
for model_name, bm_model in zip(model_names, [sc_predictions_ensemble,sc_predictions_constituent]):
    for year_record in bm_model:
        print(year_record['year'])
        filename = filepath + 'bm_cm_' + model_name + '_expanded_' + str(year_record['year']) + '.parquet'
        print(filename)
        # The ensemble records keep their draws under 'expanded_df_poisson', the
        # constituent records under 'expanded_df'. Reading 'expanded_df' unconditionally
        # raised KeyError for the ensemble records.
        if 'expanded_df' in year_record:
            expanded = year_record['expanded_df']
        else:
            expanded = year_record['expanded_df_poisson']
        expanded.to_parquet(filename) #save to file

# Dataframe with actuals: the outcome column of the first model's test predictions,
# transformed with expm1 and stored as 'ged_sb'.
df_actuals = pd.DataFrame(ModelList_cm[0]['predictions_test_df']['ln_ged_sb_dep'])
cm_actuals = df_actuals
cm_actuals['ged_sb'] = np.expm1(cm_actuals['ln_ged_sb_dep'])
cm_actuals.drop(columns=['ln_ged_sb_dep'], inplace=True)
print(cm_actuals.head())
print(cm_actuals.tail())
print(cm_actuals.describe())


# Annual dataframes with actuals, saved to disk
for year in year_list:
    # Month ids count from January 1980 = 1, so a year spans these twelve ids.
    first_month = (year - 1980)*12 + 1
    last_month = (year - 1980 + 1)*12
    df_annual = cm_actuals.loc[first_month:last_month]
    filename = filepath + 'cm_actuals_' + str(year) + '.parquet'
    print(year, first_month, last_month, filename)
    print(df_annual.head())
    df_annual.to_parquet(filename)
# For all four years
filename = filepath + 'cm_actuals_allyears.parquet' #save to file
cm_actuals.to_parquet(filename)



# PGM Level Benchmark Models

In [None]:
# Assembling benchmark based on VIEWS ensemble predictions (pgm level)
sc_predictions_ensemble_pgm = []
# any old pgm data
pgm_ensemble_name = 'pgm_ensemble_cm_calib_test'  # name of the stored pgm ensemble forecast
    
# Keep only the outcome column and the step columns listed in `stepcols`.
ensemble_pgm_df = pd.DataFrame.forecasts.read_store(pgm_ensemble_name, run=dev_id)[stepcols]
ensemble_pgm_df.head()  # not the last expression of the cell, so nothing is displayed

# NOTE(review): only the first three entries of year_list (2018-2020) are processed
# here, while the actuals further down are written for every year -- confirm intent.
for year in year_list[0:3]:
    sc_dict = {
        'year': year,
        'prediction_df': extract_sc_predictions(year=year,ss_predictions=ensemble_pgm_df)
    }
    sc_predictions_ensemble_pgm.append(sc_dict)
    

In [None]:
# Expanding by drawing n draws from Poisson distribution   
# `function_with_draws` is sample_poisson_row with ndraws fixed to 500; it is only
# used by the "old stuff" test cells further down.
# NOTE(review): 500 draws are used here although draws_pgm is set to 100 in the
# benchmark-parameter cell -- confirm which number is intended.
function_with_draws = partial(sample_poisson_row, ndraws=500) #here 500 draws are used?
for year_record in sc_predictions_ensemble_pgm:
    print(year_record['year'])
    df = year_record.get('prediction_df')
    # Draws are stored on the year record under 'expanded_df'.
    year_record['expanded_df'] = expanded_df(df,ndraws=500,level='pgm')

#describe_expanded(df=sc_predictions_ensemble_pgm[0]['prediction_df'], df_expanded=sc_predictions_ensemble_pgm[0]['expanded_df'], month=457, country=57)   


# Saving the pgm models

In [None]:
# Write one parquet file per year for the pgm ensemble benchmark.
for year_record in sc_predictions_ensemble_pgm:
    print(year_record['year'])
    filename = filepath + 'bm_pgm_ensemble_expanded_' + str(year_record['year']) + '.parquet'
    print(filename)
    year_record['expanded_df'].to_parquet(filename)

# Dataframe with actuals: built as a NEW one-column frame from the outcome column.
# Wrapping ensemble_pgm_df itself and editing it in place left all step_pred_* columns
# in the "actuals" files and could alter ensemble_pgm_df, making the cell fail on re-run.
pgm_actuals = pd.DataFrame({'ged_sb': np.expm1(ensemble_pgm_df['ln_ged_sb_dep'])})
df_actuals = pgm_actuals  # alias kept for cells that still refer to df_actuals
print(pgm_actuals.head())
print(pgm_actuals.tail())
print(pgm_actuals.describe())


# Annual dataframes with actuals, saved to disk
for year in year_list:
    # Month ids count from January 1980 = 1, so a year spans these twelve ids.
    first_month = (year - 1980)*12 + 1
    last_month = (year - 1980 + 1)*12
    df_annual = pgm_actuals.loc[first_month:last_month]
    # 'pgm_actuals_' prefix: the former 'cm_actuals_' prefix overwrote the cm-level
    # actuals files written earlier with pgm data.
    filename = filepath + 'pgm_actuals_' + str(year) + '.parquet'
    print(year, first_month, last_month, filename)
    print(df_annual.head())
    df_annual.to_parquet(filename)
# For all four years
filename = filepath + 'pgm_actuals_allyears.parquet'
pgm_actuals.to_parquet(filename)




# Old stuff from here

In [None]:
%%time
test_data = sc_predictions_ensemble[0].get('prediction_df')
test_data['draws'] = test_data.apply(function_with_draws, axis=1)
test_data.explode('draws')

In [None]:
# Test version, cm
test_data = sc_predictions_ensemble[0].get('prediction_df')
test_data['draws'] = test_data.apply(function_with_draws, axis=1)
td = test_data.explode('draws')
td.describe()

# Old cm stuff from here

In [None]:
df_annual.head()

In [None]:
def reshape_df_cm(df, draw):
    ''' Turns one model's wide cm prediction frame into long format.

    The outcome column and the step columns 25-36 are removed, the remaining
    'step_pred_<s>' columns are stacked into a single 'step_pred_' column, and
    the result is indexed by (month_id, country_id, step, draw), where `draw`
    is the constant label supplied by the caller. '''
    unwanted = ['ln_ged_sb_dep'] + ['step_pred_' + str(s) for s in range(25, 37)]
    wide = df.drop(unwanted, axis=1).reset_index()
    wide['draw'] = draw
    stacked = pd.wide_to_long(wide, 'step_pred_', i=['month_id', 'country_id'], j='step')
    # 'draw' travels along as an ordinary column; move it into the index.
    return stacked.reset_index().set_index(['month_id', 'country_id', 'step', 'draw'])
    
# Stack the reshaped test predictions of all constituent models; the position of a
# model in ModelList_cm doubles as its 'draw' label.
# Frames are collected in a list and concatenated once, instead of growing the
# result with pd.concat inside the loop.
reshaped_frames = []
for model_draw, model in enumerate(ModelList_cm):
    df_reshaped = reshape_df_cm(model['predictions_test_df'], model_draw)
    if model_draw == 0:
        # The name is read from the model being processed; the former code printed
        # whatever `model` was left over from an earlier loop.
        print('Starting with model/draw',model_draw, model['modelname'])
        print(df_reshaped.describe())
        print(df_reshaped.head())
    else:
        print('Appending model/draw',model_draw, model['modelname'])
    reshaped_frames.append(df_reshaped)
df_cm_results_long = pd.concat(reshaped_frames, axis=0)

# Back-transform with expm1 and round to whole numbers.
# np.round replaces np.round_, which was removed in NumPy 2.0.
df_cm_results_long['prediction'] = np.round(np.expm1(df_cm_results_long['step_pred_'])).astype('int32')
df_cm_results_long.drop(columns=['step_pred_'], inplace=True)
# Results file in long format
print(df_cm_results_long.describe())
print(df_cm_results_long.tail())

print(df_cm_results_long.loc[492].describe())


In [None]:
df_cm_final_extended = df_cm_results_long.copy()

def make_dfcopy_cm(df_in, step, shifted_step, repetition, n_models=None):
    ''' Makes a 'copy' of the df with a shifted step.

    Selects the rows of `df_in` whose 'step' index level equals `shifted_step`,
    relabels them as `step`, and offsets their 'draw' label by
    n_models * repetition so the copies do not collide with existing draws.

    df_in        : frame indexed by (month_id, country_id, step, draw)
    step         : step label the copied rows receive
    shifted_step : step whose rows are copied
    repetition   : multiplier for the draw offset
    n_models     : size of one block of draws; defaults to len(ModelList_cm)

    Returns a new frame indexed by (month_id, country_id, step, draw);
    `df_in` is left untouched. '''
    if n_models is None:
        n_models = len(ModelList_cm)
    is_shifted = df_in.index.get_level_values('step').isin([shifted_step])
    df = df_in[is_shifted].reset_index()
    # Assign the result back: an in-place replace on the selected column is not
    # guaranteed to reach `df` (it does nothing under pandas copy-on-write).
    df['step'] = df['step'].replace(shifted_step, step)
    df['draw'] = df['draw'] + n_models * repetition
    return df.set_index(['month_id', 'country_id', 'step', 'draw'])
    
# Build the extended cm frame: the original long results plus, for every target step
# 3..14, weighted copies of the predictions made for adjacent steps.
df_list_steps = []
df_list_steps.append(df_cm_results_long)
for step in range(3,14+1):     
#    print(80*'*')
    print('Step', step)
    # NOTE(review): this selection is overwritten inside the inner loop before it is read.
    df = pd.DataFrame(df_cm_results_long[df_cm_results_long.index.get_level_values('step').isin([step])])
#    print(df.head(3))
    repetition = 1
    df_list = []
    # Each tuple is (offset relative to `step`, number of copies to make).
    for shift in [(-4,2),(-3,4),(-2,5),(-1,6),(0,6),(1,5),(2,4),(3,3),(4,2),(5,2),(6,2),(7,1),(8,1),(9,1)]:
        for copy in range(1,shift[1]+1):
            shifted_step = step+shift[0]
            # Offsets that would reach below step 1 fall back to the target step itself.
            if shifted_step < 1:
                shifted_step = step
            step_list = [shifted_step]  # not read afterwards
            df = make_dfcopy_cm(df_in = df_cm_results_long,step = step, shifted_step = shifted_step, repetition = repetition)
            df_list.append(df)
            # `repetition` grows with every copy, giving each copy its own draw offset.
            repetition += 1
    df_cm_temp = pd.concat(df_list)
    df_list_steps.append(df_cm_temp)

df_cm_final_extended = pd.concat(df_list_steps)
#df.reorder_levels(['month_id','country_id','steps','draw'])

In [None]:
def from_ss48_to_sc12(df, level,firstmonth,years):
    ''' Splits a long-format frame of ss predictions into one sc frame per year.

    For each of `years` consecutive 12-month windows starting at `firstmonth`,
    the rows kept are those where step s (3..14) meets the month at position
    s - 3 of the window. `df` must expose 'month_id' and 'step' as columns or
    named index levels. `level` is accepted but not used.

    Returns a list with one concatenated frame per year. '''
    first_step, last_step = 3, 14
    yearly_frames = []
    for year_number in range(1, years + 1):
        window_start = firstmonth + 12 * (year_number - 1)
        print(year_number, window_start)
        monthly_frames = [
            df.query(f'month_id == {window_start + s - first_step} and step == {s}')
            for s in range(first_step, last_step + 1)
        ]
        yearly_frames.append(pd.concat(monthly_frames))
    return yearly_frames

    
cm_ensemble_predictions = from_ss48_to_sc12(df_cm_final_extended,'cm',445,4)

cm_ensemble_predictions[1].tail()

### cm last historical values benchmark model

In [None]:
# Queryset for the "last historical values" benchmark at country-month level.
qs = (Queryset("benchmark_cm", "country_month")

   # target variable
   .with_column(Column("ged_sb", from_table="ged2_cm", from_column="ged_sb_best_sum_nokgi")
                .transform.missing.fill()
                .transform.missing.replace_na()
                )


   .with_theme("benchmark")
   .describe("""Data for empirical benchmark model, cm level


            """)
   )

#queryset = Queryset("name", "loa") # if not already defined
column = "ged_sb_best_sum_nokgi"
table = "ged2_cm"
# Add one temporally lagged copy of the target per lag 1..64, named '<column>_<lag>'.
lags=range(1,65)
for lag in lags: 
    qs = qs.with_column(Column(column+'_' + str(lag), from_table = table, from_column=column)
                        .transform.missing.replace_na()
                        .transform.temporal.tlag(lag)
                       )
# Publish the queryset, fetch the data and keep month ids 445..492 only.
df_cm_historical_values = qs.publish().fetch()
df_cm_historical_values = df_cm_historical_values.loc[445:492]

df_cm_historical_values.describe()

In [None]:
# Creating predictions for test partition
# For every step, each of `number_of_lags` lagged outcome columns is turned into
# "draws": the value observed `lag` months earlier becomes the prediction, and nearer
# lags are repeated more often than distant ones.
number_of_lags = 45
maxstep = 14
df_list_bystep = []
for step in range(3,maxstep+1):
    lags=range(step,step + number_of_lags)
    draw = 0
    # Report the lag range of this step; the former print showed a stale `lag`
    # left over from another loop.
    print('step:',step,'lags:',lags[0],'to',lags[-1],'draw:',draw)
    df_list = []
    for lag in lags:
        number_of_repetitions = number_of_lags+step-lag
        lagged_col = 'ged_sb_best_sum_nokgi_' + str(lag)
        # The frame is identical for every repetition apart from the draw label,
        # so it is built once per lag.
        base = df_cm_historical_values[[lagged_col]].rename(columns={lagged_col: 'prediction'}).reset_index()
        base['step'] = step
        # NOTE(review): range(1, n) yields n - 1 repetitions, so the most distant lag
        # contributes no draw at all -- kept as is, confirm whether that is intended.
        for repetition in range(1,number_of_repetitions):
            df = base.assign(draw=draw).set_index(['month_id', 'country_id', 'step','draw'])
            df_list.append(df)
            draw = draw + 1
    df_cm_predictions_lag = pd.concat(df_list)
    df_list_bystep.append(df_cm_predictions_lag)
    # `draw` was incremented once per appended frame, so it already is the count.
    print('Number of draws:',draw)
df_cm_predictions_historical_values = pd.concat(df_list_bystep) 

In [None]:
print(df_cm_predictions_historical_values.describe())
print(df_cm_predictions_historical_values.head())
print(df_cm_predictions_historical_values.tail())

In [None]:
cm_historical_values_predictions = from_ss48_to_sc12(df_cm_predictions_historical_values,'cm',445,4)

In [None]:
def aggregate_and_categorize(df, level):
    ''' Aggregates the input df across all draws and returns summary statistics.

    df    : frame with a 'prediction' column whose index contains the unit
            levels for `level` (plus any further levels such as 'draw').
    level : 'cm'   -> grouped by (month_id, country_id)
            'pgm'  -> grouped by (month_id, priogrid_gid)
            'pgm2' -> grouped by (month_id, priogrid_id)

    Returns one row per unit with the mean and standard deviation of
    log1p(prediction) and, per fatality category, the share of draws in it.
    The category columns keep their established names; read as integer ranges
    they mean 0, 1-10, 11-100, 101-1000, 1001-10000 and 10001+.

    Raises ValueError for an unknown `level`. '''
    index_by_level = {
        'cm': ['month_id', 'country_id'],
        'pgm': ['month_id', 'priogrid_gid'],
        'pgm2': ['month_id', 'priogrid_id'],
    }
    if level not in index_by_level:
        raise ValueError(f"level must be one of {sorted(index_by_level)}, got {level!r}")
    index = index_by_level[level]

    df_to_aggregate = df.copy()
    df_to_aggregate['log_prediction'] = np.log1p(df_to_aggregate['prediction'])

    # Proportion of draws in fatality categories.
    # Contiguous right-closed bins (-1,0], (0,10], (10,100], ... The former interval
    # list (1,10], (11,100], ... had gaps, so predictions of exactly 1, 11, 101, 1001
    # and 10001 fell into no category at all.
    edges = [-1, 0, 10, 100, 1000, 10000, 100000000]
    labels = ['(-1, 0]', '(1, 10]', '(11, 100]', '(101, 1000]', '(1001, 10000]', '(10001, 100000000]']
    df_to_aggregate['categorical'] = pd.cut(df_to_aggregate['prediction'], bins=edges, labels=labels)
    df_to_aggregate_dummies = pd.get_dummies(df_to_aggregate['categorical'], prefix='cat')
    df_to_aggregate = pd.concat([df_to_aggregate, df_to_aggregate_dummies], axis=1)

    # Mean and standard deviation of log predictions
    grouped_log = df_to_aggregate['log_prediction'].groupby(level=index)
    df_aggregated = grouped_log.mean().to_frame('mean_log_prediction')
    df_aggregated['std_log_prediction'] = grouped_log.std()
    # Share of draws per category = mean of the 0/1 indicator within each unit.
    for label in labels:
        col = 'cat_' + label
        df_aggregated[col] = df_to_aggregate[col].groupby(level=index).mean()
    return df_aggregated
    

In [None]:
# Aggregate across draws/samples to extract means, standard deviations, and category probabilities
df_cm_predictions_historical_values_aggregated = aggregate_and_categorize(df_cm_predictions_historical_values,'cm')
df_cm_predictions_ensemble_aggregated = aggregate_and_categorize(df_cm_final_extended,'cm')

print(df_cm_predictions_historical_values_aggregated.describe())
print(df_cm_predictions_historical_values_aggregated.mean())

In [None]:


# Write the annual sc frames and their aggregated versions to the benchmark folder.
# Both lists hold one frame per year, the first one belonging to 2018.
filepath = Mydropbox + 'Prediction_competition_2023/'

# Historical-values benchmark.
year = 2018
for sc_frame in cm_historical_values_predictions:
    print('cm_historical', year)
    # Re-index without the step level: (month_id, country_id, draw) identifies a row.
    df = (
        sc_frame
        .reset_index()
        .set_index(['month_id','country_id','draw'])
        .drop(columns=['step'])
    )
    df['prediction'] = df['prediction'].astype('int32')
    print(df.head())
    print(df.dtypes)
    filename = filepath + 'bm_cm_historical_values_' + str(year) + '.parquet'
    print(filename)
    df.to_parquet(filename)
    df_aggregated = aggregate_and_categorize(df,'cm')
    filename = filepath + 'bm_cm_historical_values_agg' + str(year) + '.parquet'
    df_aggregated.to_parquet(filename)
    year += 1

# Ensemble benchmark: same treatment, without the head/dtypes printout.
year = 2018
for sc_frame in cm_ensemble_predictions:
    print('cm_ensemble', year)
    df = (
        sc_frame
        .reset_index()
        .set_index(['month_id','country_id','draw'])
        .drop(columns=['step'])
    )
    df['prediction'] = df['prediction'].astype('int32')
    filename = filepath + 'bm_cm_ensemble_' + str(year) + '.parquet'
    print(filename)
    df.to_parquet(filename)
    df_aggregated = aggregate_and_categorize(df,'cm')
    filename = filepath + 'bm_cm_ensemble_agg' + str(year) + '.parquet'
    df_aggregated.to_parquet(filename)
    year += 1

In [None]:
# Optional export of the large intermediate frames (written as parquet, not csv).
# Disabled by default via the flag below.
include_expansive = False

if include_expansive:
    filename = Mydropbox + 'Prediction_competition_2023/' + 'cm_actuals.parquet'
    cm_actuals.to_parquet(filename)
    filename = Mydropbox + 'Prediction_competition_2023/' + 'cm_benchmark_ensemble_22.parquet'
    # NOTE(review): df_cm_results_long_pruned is not defined anywhere in the visible
    # part of the notebook; enabling the flag would presumably raise NameError here.
    df_cm_results_long_pruned.to_parquet(filename)
    filename = Mydropbox + 'Prediction_competition_2023/' + 'cm_benchmark_ensemble_550.parquet'
    df_cm_final_extended.to_parquet(filename)
    filename = Mydropbox + 'Prediction_competition_2023/' + 'cm_benchmark_ensemble_550_aggregated.parquet'
    df_cm_predictions_ensemble_aggregated.to_parquet(filename)
    filename = Mydropbox + 'Prediction_competition_2023/' + 'cm_benchmark_historical_values.parquet'
    df_cm_predictions_historical_values.to_parquet(filename)
    filename = Mydropbox + 'Prediction_competition_2023/' + 'cm_predictions_historical_values_aggregated.parquet'
    df_cm_predictions_historical_values_aggregated.to_parquet(filename)

# pgm level

### Based on ensemble; expanded using a Poisson draw with mean = variance = $\hat{y}_{it}$

In [None]:
# Assembling benchmark based on VIEWS ensemble predictions
# NOTE(review): this cell repeats the pgm assembly cell of the "PGM Level Benchmark
# Models" section; running it resets sc_predictions_ensemble_pgm and thereby discards
# any 'expanded_df' entries already attached to its records.
sc_predictions_ensemble_pgm = []
pgm_ensemble_name = 'pgm_ensemble_cm_calib_test'
    
ensemble_pgm_df = pd.DataFrame.forecasts.read_store(pgm_ensemble_name, run=dev_id)[stepcols]
ensemble_pgm_df.head()  # not the last expression of the cell, so nothing is displayed

# Only the first three entries of year_list are processed.
for year in year_list[0:3]:
    sc_dict = {
        'year': year,
        'prediction_df': extract_sc_predictions(year=year,ss_predictions=ensemble_pgm_df)
    }
    sc_predictions_ensemble_pgm.append(sc_dict)
    

In [None]:
ensemble_pgm_df.head()

In [None]:
# Expanding by drawing n draws from Poisson distribution   
# Uses `draws_pgm` draws per prediction and stores them under 'expanded_df'.
for year_record in sc_predictions_ensemble_pgm:
    print(year_record['year'])
    year_record['expanded_df'] = expanded_predictions(sc_predictions = year_record['prediction_df'],draws = draws_pgm, level = 'pgm')

# NOTE(review): this inspects the cm list (sc_predictions_ensemble), not the pgm list
# filled above, and reads an 'expanded_df' key that the visible cm cells never set
# (they store 'expanded_df_poisson') -- presumably a leftover; confirm before running.
describe_expanded(df=sc_predictions_ensemble[0]['prediction_df'], df_expanded=sc_predictions_ensemble[0]['expanded_df'], month=457, country=57)   

# Old pgm stuff from here

## Ensemble model pgm benchmark

kj

In [None]:

level = 'pgm'
# DefineEnsembleModels is imported in the cm constituent-model cell; this cell
# depends on that cell having been run.
ModelList_pgm = DefineEnsembleModels(level)
    
# Print index, name and training data set of every model.
i = 0
for model in ModelList_pgm:
    print(i, model['modelname'], model['data_train'])
    i = i + 1
    
# Retrieving the predictions for calibration and test partitions
# The ModelList contains the predictions organized by model
from Ensembling import CalibratePredictions, RetrieveStoredPredictions, mean_sd_calibrated, gam_calibrated

# `steps` is whatever the name is bound to when this cell runs.
ModelList_pgm = RetrieveStoredPredictions(ModelList_pgm, steps, EndOfHistory, dev_id, level, get_future)

# Calibration is switched off for pgm (kept commented out below).
#ModelList_pgm = CalibratePredictions(ModelList_pgm, EndOfHistory, steps)

# Dataframe with actuals: the outcome column of the first model's test predictions
# (still on the scale stored in 'ln_ged_sb_dep'; no expm1 is applied here).
df_actuals_pgm = pd.DataFrame(ModelList_pgm[0]['predictions_test_df']['ln_ged_sb_dep'])
print(df_actuals_pgm.head())
print(df_actuals_pgm.tail())


In [None]:
# Reshaping
def reshape_df_pgm(df, draw):
    ''' Turns one model's wide pgm prediction frame into long format.

    The outcome column and the step columns 23-36 are removed and the remaining
    'step_pred_<s>' columns are stacked into a single 'step_pred_' column. The
    result is indexed by (month_id, priogrid_id, draw, step), where `draw` is
    the constant label supplied by the caller. '''
    unwanted = ['ln_ged_sb_dep'] + ['step_pred_' + str(s) for s in range(23, 37)]
    wide = df.drop(unwanted, axis=1).reset_index()
    wide['draw'] = draw
    return pd.wide_to_long(wide, 'step_pred_', i=['month_id', 'priogrid_id', 'draw'], j='step')

# Stack the reshaped test predictions of all pgm models; the position of a model in
# ModelList_pgm doubles as its 'draw' label.
# Frames are collected in a list and concatenated once, instead of growing the
# result with pd.concat inside the loop.
reshaped_frames_pgm = []
for model_draw, model in enumerate(ModelList_pgm):
    df_reshaped = reshape_df_pgm(model['predictions_test_df'], model_draw)
    if model_draw == 0:
        # The name is read from the model being processed; the former code printed
        # whatever `model` was left over from an earlier loop.
        print('Starting with model/draw',model_draw, model['modelname'])
        print(df_reshaped.describe())
    else:
        print('Appending model/draw',model_draw, model['modelname'])
    reshaped_frames_pgm.append(df_reshaped)
df_pgm_results_long = pd.concat(reshaped_frames_pgm, axis=0)
    

In [None]:
# Back-transform with expm1 and round to whole numbers.
# np.round replaces np.round_, which was removed in NumPy 2.0.
df_pgm_results_long['prediction'] = np.round(np.expm1(df_pgm_results_long['step_pred_'])).astype('int32')

In [None]:
df_pgm_results_long.drop(columns=['step_pred_'], inplace=True)
#df_pgm_results_extended.index.set_names('priogrid_gid', level=1,inplace=True)

In [None]:
# Results file in long format
print(df_pgm_results_long.describe())
print(df_pgm_results_long.tail())

print(df_pgm_results_long.loc[492].describe())

# Extending by copying adjacent steps

df_pgm_results_extended=df_pgm_results_long.copy()

print(df_pgm_results_extended.describe())
print(df_pgm_results_extended.head())

In [None]:
# Split into separate frames by step (3..14), collected in a list.
df_ensembles_pgm_by_step = []
for step in range(3,14+1):
    print(step)
    # Index level 3 is 'step' in the (month_id, priogrid_id, draw, step) index that
    # reshape_df_pgm produces; xs drops that level from the result.
    df = df_pgm_results_extended.xs(step, level=3).copy()
    #print(df.describe())
    df_ensembles_pgm_by_step.append(df)

In [None]:

def make_dfcopy_pgm(df_in, step, shifted_step, repetition, n_models=None):
    ''' Makes a 'copy' of the df with a shifted step.

    Selects the rows of `df_in` whose 'step' index level equals `shifted_step`,
    relabels them as `step`, and offsets their 'draw' label by
    n_models * repetition so the copies do not collide with existing draws.

    df_in        : frame whose index has the levels month_id, priogrid_id,
                   step and draw (in any order)
    step         : step label the copied rows receive
    shifted_step : step whose rows are copied
    repetition   : multiplier for the draw offset
    n_models     : size of one block of draws; defaults to len(ModelList_pgm)

    Returns a new frame indexed by (month_id, priogrid_id, step, draw);
    `df_in` is left untouched. '''
    if n_models is None:
        n_models = len(ModelList_pgm)
    is_shifted = df_in.index.get_level_values('step').isin([shifted_step])
    df = df_in[is_shifted].reset_index()
    # Assign the result back: an in-place replace on the selected column is not
    # guaranteed to reach `df` (it does nothing under pandas copy-on-write).
    df['step'] = df['step'].replace(shifted_step, step)
    df['draw'] = df['draw'] + n_models * repetition
    return df.set_index(['month_id', 'priogrid_id', 'step', 'draw'])

# For every target step 3..14: build weighted copies of the predictions made for
# adjacent steps, write them to parquet, then aggregate across draws and write that too.
for step in range(3,14+1):     
    print(80*'*')
    print('Step', step, '-- Original dataframe for step', step, 'is:')
    # Shown for reference only; `df` is overwritten inside the inner loop.
    df = pd.DataFrame(df_pgm_results_extended[df_pgm_results_extended.index.get_level_values('step').isin([step])])
    print(df.describe())
    repetition = 1
    df_list_pgm = []
    # Each tuple is (offset relative to `step`, number of copies to make).
    for shift in [(-4,1),(-3,1),(-2,3),(-1,4),(0,3),(1,3),(2,2),(3,2),(4,1),(5,1),(6,1),(7,1),(8,1)]:
        for copy in range(1,shift[1]+1):
            shifted_step = step+shift[0]
            # Offsets that would reach below step 1 fall back to the target step itself.
            if shifted_step < 1:
                shifted_step = step
            step_list = [shifted_step]  # not read afterwards
            df = make_dfcopy_pgm(df_in = df_pgm_results_extended,step = step, shifted_step = shifted_step, repetition = repetition)
            df_list_pgm.append(df)
            # `repetition` grows with every copy, giving each copy its own draw offset.
            repetition += 1
    df_pgm_temp = pd.concat(df_list_pgm)
    print('Extended:')
    print(df_pgm_temp.describe())
    # Export to parquet
    filename = Mydropbox + 'Prediction_competition_2023/' + 'pgm_benchmark_ensemble_step_' + str(step) + '.parquet'
    df_pgm_temp.to_parquet(filename)
    # Aggregate across draws, save    
    filename = Mydropbox + 'Prediction_competition_2023/' + 'pgm_benchmark_ensemble_step_' + str(step) + '_aggregated.parquet'
    # 'pgm2' selects the (month_id, priogrid_id) grouping in aggregate_and_categorize.
    df_aggregated_pgm = aggregate_and_categorize(df_pgm_temp,'pgm2')
    print('Aggregated:')
    print(df_aggregated_pgm.describe())
    df_aggregated_pgm.to_parquet(filename)


In [None]:
# Creating sc prediction files for ensemble model
pgm_ensemble_predictions = from_ss48_to_sc12(df_pgm_results_extended,'pgm2',445,4)

# Saving the annual sc files and the aggregated versions of them
filepath = Mydropbox + 'Prediction_competition_2023/'
year = 2018
for df in pgm_ensemble_predictions:
    filename = filepath + 'bm_pgm_ensemble_' + str(year) + '.parquet'
    print(filename)
    df.to_parquet(filename)
    # These frames are indexed by 'priogrid_id' (see reshape_df_pgm), which is the
    # 'pgm2' grouping; 'pgm' groups by 'priogrid_gid', a level they do not have.
    df_aggregated = aggregate_and_categorize(df,'pgm2')
    filename = filepath + 'bm_pgm_ensemble_agg' + str(year) + '.parquet'
    df_aggregated.to_parquet(filename)
    year = year + 1    

## Historical values pgm benchmark

In [None]:
# lag settings (number of temporal lags at each spatial lag level)
tlags_cell = 40
tlags_firstorder = 27
tlags_secondorder = 21

# Spatial lags, first-order lag 1:
kernel_inner=1
kernel_width=1
kernel_power=0
norm_kernel=0

rerun_querysets = False

def retrieve_qs(qs_to_retrieve=qs,rerun=True,filename=''):
    ''' Returns the data of a queryset, either freshly fetched or from a parquet cache.

    rerun=True  : publish and fetch `qs_to_retrieve`, keep month ids 445..492,
                  write the result to `filename` and return it.
    rerun=False : read and return the parquet file at `filename`.

    Note: the default for `qs_to_retrieve` is whatever `qs` is bound to when
    this definition is executed. '''
    if not rerun:
        return pd.read_parquet(filename)
    fetched = qs_to_retrieve.publish().fetch().loc[445:492]
    fetched.to_parquet(filename)
    return fetched
    

print('Retrieving data for inner cells')

column = "ged_sb_best_sum_nokgi"
table = "ged2_pgm"

qs = (Queryset("benchmark_pgm", "priogrid_month")

   # target variable at t0
   .with_column(Column("ged_sb", from_table="ged2_pgm", from_column="ged_sb_best_sum_nokgi")
                .transform.missing.fill()
                .transform.missing.replace_na()
                )
    # spatial lag at t0
   .with_column(Column("splag_ged_sb_0", from_table = table, from_column = column)
                     .transform.missing.replace_na()
                     .transform.spatial.lag(kernel_inner,kernel_width,kernel_power,norm_kernel)
                    )


   .with_theme("benchmark")
   .describe("""Data for empirical benchmark model, pgm level
            """)
   )

# Temporal lags 1..tlags_cell of the cell's own values, added to a copy of the base queryset
column = "ged_sb_best_sum_nokgi"
table = "ged2_pgm"
tlags_0 = range(1, tlags_cell + 1)
qs0 = qs.copy()
for lag in tlags_0:
    lagged_column = (
        Column(f'{column}_{lag}', from_table=table, from_column=column)
        .transform.missing.replace_na()
        .transform.temporal.tlag(lag)
    )
    qs0 = qs0.with_column(lagged_column)

# Fetch (or read from the parquet cache, depending on rerun_querysets)
filename = f'{Mydropbox}Prediction_competition_2023/df_pgm_historical_values_0.parquet'
df_pgm_historical_values_0 = retrieve_qs(qs_to_retrieve=qs0, rerun=rerun_querysets, filename=filename)

# First-order spatial lags: temporal lags 1..tlags_firstorder, each followed by a spatial lag
print('Retrieving data for first-order neighbors')

kernel_inner, kernel_width, kernel_power, norm_kernel = 1, 1, 0, 0

tlags_1 = range(1, tlags_firstorder + 1)
qs1 = qs.copy()
for lag in tlags_1:
    lagged_column = (
        Column(f'{column}_splag_1_{lag}', from_table=table, from_column=column)
        .transform.missing.replace_na()
        .transform.temporal.tlag(lag)
        .transform.spatial.lag(kernel_inner, kernel_width, kernel_power, norm_kernel)
    )
    qs1 = qs1.with_column(lagged_column)

# Fetch (or read from the parquet cache, depending on rerun_querysets)
filename = f'{Mydropbox}Prediction_competition_2023/df_pgm_historical_values_1.parquet'
df_pgm_historical_values_1 = retrieve_qs(qs_to_retrieve=qs1, rerun=rerun_querysets, filename=filename)

# Second-order spatial lags: temporal lags 1..tlags_secondorder, each followed by a
# spatial lag with kernel_inner=2
print('Retrieving data for second-order neighbors')

kernel_inner, kernel_width, kernel_power, norm_kernel = 2, 1, 0, 0

tlags_2 = range(1, tlags_secondorder + 1)
qs2 = qs.copy()
for lag in tlags_2:
    lagged_column = (
        Column(f'{column}_splag_2_{lag}', from_table=table, from_column=column)
        .transform.missing.replace_na()
        .transform.temporal.tlag(lag)
        .transform.spatial.lag(kernel_inner, kernel_width, kernel_power, norm_kernel)
    )
    qs2 = qs2.with_column(lagged_column)

# Fetch (or read from the parquet cache, depending on rerun_querysets)
filename = f'{Mydropbox}Prediction_competition_2023/df_pgm_historical_values_2.parquet'
df_pgm_historical_values_2 = retrieve_qs(qs_to_retrieve=qs2, rerun=rerun_querysets, filename=filename)

print('Done retrieving data')

In [None]:
# Preview the first rows of the first-order-neighbor frame retrieved above
df_pgm_historical_values_1.head()

In [None]:
# Merging the data frames side by side (same row index, columns appended).
# All three frames are retrieved from querysets copied from the base queryset `qs`,
# so its columns ('ged_sb', 'splag_ged_sb_0') arrive once per frame. pd.concat keeps
# repeated column labels as-is, which would make e.g. df['ged_sb'] return three
# columns; only the first occurrence of each label is kept.
df_pgm_historical_values = pd.concat(
    [df_pgm_historical_values_0, df_pgm_historical_values_1, df_pgm_historical_values_2],
    axis=1,
    ignore_index=False,
)
df_pgm_historical_values = df_pgm_historical_values.loc[:, ~df_pgm_historical_values.columns.duplicated()]

In [None]:
# Keep rows 445..492 (label-based, inclusive) -- the same window retrieve_qs applies on fetch
df_pgm_historical_values = df_pgm_historical_values.loc[445:492]

# NOTE(review): this cell used to loop over the spatial-lag columns under the comment
# "Computing averages from sums", but each iteration only assigned the column to
# itself, so no averaging was ever applied. The values are left exactly as retrieved
# (unchanged behavior). If averages over neighbors are intended, the division belongs
# here -- TODO confirm with the author.
# The loops did implicitly verify that every expected column exists; keep that check
# explicit so a stale cache file still fails early.
expected_splag_cols = (
    [column + '_splag_1_' + str(lag) for lag in tlags_1]
    + [column + '_splag_2_' + str(lag) for lag in tlags_2]
)
missing_splag_cols = [col for col in expected_splag_cols if col not in df_pgm_historical_values.columns]
if missing_splag_cols:
    raise KeyError(f'Missing spatial-lag columns: {missing_splag_cols}')

df_pgm_historical_values.describe()

In [None]:
# Summary statistics of the second-order-neighbor frame retrieved above
df_pgm_historical_values_2.describe()

In [None]:
# Creating predictions for test partition
#
# For every step, lagged columns of df_pgm_historical_values are turned into "draws":
# the column for a given lag is repeated (n_lags + step - lag) times, so the lags
# closest to `step` contribute the most draws. Each draw is a frame indexed by
# (month_id, priogrid_gid, step, draw) with a single int32 column 'prediction'.
number_of_lags_inner = 24
number_of_lags_1 = 12
number_of_lags_2 = 6
maxstep = 14

# (column-name prefix, number of lags used) for the cell itself and its
# first- and second-order spatial lags
lag_families = [
    ('ged_sb_best_sum_nokgi_', number_of_lags_inner),
    ('ged_sb_best_sum_nokgi_splag_1_', number_of_lags_1),
    ('ged_sb_best_sum_nokgi_splag_2_', number_of_lags_2),
]
index_cols = ['month_id', 'priogrid_gid', 'step', 'draw']

df_list_bystep = []
for step in range(3, maxstep+1):
    lags = range(step, number_of_lags_inner+step)
    draw = 1
    df_list = []
    for lag in lags:
        for coltype in lag_families:
            number_of_repetitions = coltype[1]+step-lag
            if number_of_repetitions < 1:
                # This family has no column this far back for the current step
                continue
            lagged_col = coltype[0] + str(lag)
            # Build the prediction frame once per column; every repetition below
            # differs only in its draw number.
            base = (
                df_pgm_historical_values[lagged_col]
                .astype('int32')
                .rename('prediction')
                .reset_index()
            )
            for repetition in range(1, number_of_repetitions+1):
                df = base.assign(step=step, draw=draw).set_index(index_cols)
                df_list.append(df)
                draw = draw + 1
    print('Concatenating', draw-1, 'repetitions for step', step)
    df_pgm_predictions_lag = pd.concat(df_list)
    df_list_bystep.append(df_pgm_predictions_lag)

In [None]:
# Inspect the predictions built for the first step in the list (step 3)
df_list_bystep[0]

In [None]:
# Creating sc prediction files for historical values model -- step by step.
# df_sc_bystep[p] collects, for period p, one frame per step.
df_sc_bystep = [[], [], [], []]
for step, step_df in enumerate(df_list_bystep, start=3):
    print('step:', step)
    pgm_hv_step = from_ss48_to_sc12(step_df, 'pgm2', 445, 4)
    for per, period in enumerate(pgm_hv_step):
        df_sc_bystep[per].append(period)

In [None]:
print('Aggregating')
# One concatenated frame per period, stacking that period's per-step frames
pgm_historical_values_predictions = [pd.concat(df_sc_bystep[period]) for period in range(4)]
print('Done')

In [None]:
# Store every annual sc frame (years counted up from 2018), followed by its
# aggregated/categorized version.
filepath = f'{Mydropbox}Prediction_competition_2023/'
for year, df in enumerate(pgm_historical_values_predictions, start=2018):
    filename = f'{filepath}bm_pgm_historical_values_{year}.parquet'
    print(filename)
    df.to_parquet(filename)
    df_aggregated = aggregate_and_categorize(df, 'pgm')
    filename = f'{filepath}bm_pgm_historical_values_agg{year}.parquet'
    df_aggregated.to_parquet(filename)

In [None]:
# Inspect the second frame in the list (saved above under year 2019)
pgm_historical_values_predictions[1]

In [None]:
# Concatenating step-level dataframes into a single frame (optional)
single_file = False

if single_file:
    # One pd.concat over the whole list: growing the frame with a concat per step
    # re-copies all previously added rows on every iteration.
    df_pgm_predictions_historical_values = pd.concat(df_list_bystep)

    # Inside an `if` block a bare expression is not displayed, so print the summary
    print(df_pgm_predictions_historical_values.describe())

In [None]:
# One-column frame holding the 'ged_sb' column of the inner-cell data (the actuals)
df_actuals_pgm = df_pgm_historical_values_0['ged_sb'].to_frame()
print(df_actuals_pgm.head())
print(df_actuals_pgm.tail())

In [None]:
# Export the actuals to parquet
filename = f'{Mydropbox}Prediction_competition_2023/pgm_actuals.parquet'
df_actuals_pgm.to_parquet(filename)

# Probably obsolete from here onward

In [None]:
# For every step: write the raw draws to parquet, then aggregate across
# draws/samples (means, standard deviations, category probabilities per the
# original note) and write the aggregated frame next to it.
for step, df in enumerate(df_list_bystep, start=3):
    print(step)
    step_stem = f'{Mydropbox}Prediction_competition_2023/pgm_benchmark_historical_values_step_{step}'

    filename = f'{step_stem}.parquet'
    df.to_parquet(filename)

    filename = f'{step_stem}_aggregated.parquet'
    df_aggregated = aggregate_and_categorize(df, 'pgm')
    df_aggregated.to_parquet(filename)

    print(df_aggregated.describe())
    print(df_aggregated.mean())

In [None]:
# Read the combined historical-values benchmark file back from Dropbox.
# NOTE(review): no cell in this part of the notebook writes this file -- confirm it still exists.
filename = f'{Mydropbox}Prediction_competition_2023/pgm_benchmark_historical_values.parquet'
df_from_file = pd.read_parquet(filename)

In [None]:
# Read the step-3 draws written by the step-by-step export back from Dropbox
filename = f'{Mydropbox}Prediction_competition_2023/pgm_benchmark_historical_values_step_3.parquet'
df_from_file = pd.read_parquet(filename)

In [None]:
# Display the frame most recently read from parquet
df_from_file