# ifo Business Climate Index forecasting

First of all we just make some simple normalisations. Our goal here is to train a super simple statistical model that is not going to embarrass us when someone runs it on a new batch of data next year. First of all we do some normalisation to get some clean data for fitting.

In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import TimeSeriesSplit
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

data = pd.read_csv("/home/ifo_vo_ifo_hack2023_user/Desktop/project-directory/input/IBS_paneldata_train.csv")
unique_ids_original = data['idnum'].unique()

data = data[data['year'] > 2019]
data['monthyear'] = 12 * (data['year']-1980) + data['month'] # Months since the beginning of the data series given to us
columns_of_interest = ['vg_statebus', 'vg_comexp', 'vg_priceexp']
data = data[['idnum', 'monthyear'] + columns_of_interest] # We're only looking at a small subset of the data here.
data = data.dropna()
data['idnum'] = data['idnum'].astype(int)
for single_column_of_interest in columns_of_interest:
    data[single_column_of_interest] = data[single_column_of_interest].astype(int)

  data = pd.read_csv("/home/ifo_vo_ifo_hack2023_user/Desktop/project-directory/input/IBS_paneldata_train.csv")


In [2]:
data

Unnamed: 0,idnum,monthyear,vg_statebus,vg_comexp,vg_priceexp
470,8034913,481,3,2,2
7982,8035588,484,3,3,2
7983,8035588,485,2,2,2
7987,8035588,489,2,2,2
7988,8035588,490,3,2,2
...,...,...,...,...,...
1808353,8164562,503,1,3,1
1808354,8164562,504,1,2,1
1808463,8164709,502,2,1,1
1808464,8164709,503,2,2,1


In [3]:
# It's hard to to machine learning on data that is in the sparse form given to us where we have some observations some of the time.
# Instead, we would ideally like a dense time-series where for every respondent we have a response at every time in question.
# So we make a table indexed by  [companies, monthly survey, question].
arrays = []
for single_column_of_interest in columns_of_interest:
    pivot_table = data.pivot(index='idnum', columns='monthyear', values=single_column_of_interest).ffill(axis=1).fillna(1)
    #print(pivot_table)
    array = pivot_table.to_numpy()
    arrays.append(array)
big_array = np.stack(arrays).transpose((1,2,0))
print(big_array.shape)

(3123, 24, 3)


In [5]:
big_array_flat = big_array.reshape((big_array.shape[0],-1))
print(big_array_flat.shape)

(3123, 72)


In [6]:
# We do a train-test split so we can easily test a bunch of different models. The logistic regression below has virtually the same train and test accuracy
# since it had so few parameters. But if you replaced the LogisticRegression(max_iter=10000) by a DecisionTreeClassifier() for example,you would see a wide divergence.
prediction_results = {}
n_shifts = 12
max_lag = 5
models = []
n_cols = len(columns_of_interest)

total_train_loss = 0
total_test_loss = 0

for i in range(n_cols):
    #print(df_wide)
    width = big_array_flat.shape[1]
    array = []
    for shift in range(n_shifts):
        # This line is a bit misterious. We train the regression task to predict values one month into the future.
        # For the training data we use train that for several amount of time shifted int he past.
        array_part = big_array_flat[:,width-1-n_cols*shift-n_cols*max_lag:width-n_cols*shift]
        array.append(array_part)
    array = np.vstack(array)
                                         
    X = array[:,:-1]
    y = array[:,-1]
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=3)
    model = LogisticRegression(max_iter=10000)
    model.fit(X_train, y_train)
    models.append(model)
    train_loss = model.score(X_train, y_train)
    total_train_loss += train_loss
    print(f'Training accuracy for metric {single_column_of_interest}: {train_loss}')
    
    test_loss = model.score(X_test, y_test)
    total_test_loss += test_loss
    print(f'Testing accuracy for metric {single_column_of_interest}: {test_loss}')

print(f'Average test accuracy: {total_test_loss / 3}')
print(f'Average train accuracy: {total_train_loss / 3}')

Training accuracy for metric vg_priceexp: 0.8328886042622834
Testing accuracy for metric vg_priceexp: 0.8291173017397802
Training accuracy for metric vg_priceexp: 0.8328886042622834
Testing accuracy for metric vg_priceexp: 0.8291173017397802
Training accuracy for metric vg_priceexp: 0.8328886042622834
Testing accuracy for metric vg_priceexp: 0.8291173017397802
Average test accuracy: 0.8291173017397803
Average train accuracy: 0.8328886042622834


In [7]:
# Here we then then apply or models repeatedly to forecast further into the future
X_predicted = big_array_flat
for j in range(12 * n_cols):
    model = models[j % n_cols]
    X_in = X_predicted[:,-n_cols*max_lag:]
    #print('x_in',X_in.shape)
    prediction = model.predict(X_in)
    X_predicted = np.append(X_predicted, prediction[:,np.newaxis], axis=1)

In [8]:
predicted_numpy = X_predicted.reshape((big_array_flat.shape[0],-1,3))
predicted_numpy.shape

(3123, 36, 3)

In [10]:
print(predicted_numpy[700,:,0])
print(big_array[700,:,0])

[1. 2. 1. 2. 2. 2. 2. 3. 2. 2. 2. 2. 2. 2. 3. 2. 2. 2. 2. 2. 2. 2. 2. 2.
 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]
[1. 2. 1. 2. 2. 2. 2. 3. 2. 2. 2. 2. 2. 2. 3. 2. 2. 2. 2. 2. 2. 2. 2. 2.]


In [11]:
dfs = []
for i in range(12):
    single_month_df = pd.DataFrame(
        {columns_of_interest[j]: predicted_numpy[:,-12+i,j] 
                                    for j
                                 in range(len(columns_of_interest))
        }
    )
    single_month_df['year'] = 2022
    single_month_df['month'] = i+1
    single_month_df['idnum'] = pivot_table.index
    dfs.append(single_month_df)

In [12]:
all_results = pd.concat(dfs)
all_results

Unnamed: 0,vg_statebus,vg_comexp,vg_priceexp,year,month,idnum
0,3.0,2.0,2.0,2022,1,8000019
1,1.0,2.0,2.0,2022,1,8000289
2,3.0,2.0,2.0,2022,1,8000473
3,3.0,2.0,2.0,2022,1,8000705
4,3.0,1.0,2.0,2022,1,8000879
...,...,...,...,...,...,...
3118,2.0,1.0,1.0,2022,12,8164814
3119,1.0,2.0,1.0,2022,12,8164815
3120,1.0,1.0,1.0,2022,12,8164816
3121,1.0,2.0,1.0,2022,12,8164817


In [13]:
# We don't actually end up producing an explicit forecast for the majority of businesses in the dataset.
# This is because most businesse shave not answered in the recent past where we train.
# They might be unlikely to give an answer now (a lot of the businesses might be long out of business) but it doesn't hurt to have some forecast for the scoring.
# To still have some prediction even without a real recent history for us to regress on, we just guess.
predicted_ids = all_results['idnum'].unique()
print(len(unique_ids_original))
print(len(predicted_ids))

13421
3123


In [14]:
type(predicted_ids)

numpy.ndarray

In [15]:
leftover_users = np.setdiff1d(unique_ids_original, predicted_ids)

In [16]:
dfs = []
for i in range(12):
    single_month_df = pd.DataFrame(
        {'idnum': leftover_users}
    )
    single_month_df['year'] = 2022
    single_month_df['month'] = i+1
    single_month_df['vg_statebus'] = 1
    single_month_df['vg_comexp'] = 1
    single_month_df['vg_priceexp'] = 1
    dfs.append(single_month_df)
leftover_results = pd.concat(dfs)
all_results = all_results.append(leftover_results)

  all_results = all_results.append(leftover_results)


In [17]:
len(all_results['idnum'].unique())

13421

In [18]:
all_results.to_csv('ForecastFanatics_eval_submission.csv', index=False)