In [1]:
import pandas as pd
import numpy as np
from catboost import CatBoostRegressor, Pool, cv
from sklearn.metrics import mean_squared_error, mean_absolute_error

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [2]:
SEED = 42

In [3]:
file_path = "../data/final/merged_double_digit.csv"

In [4]:
df = pd.read_csv(file_path, index_col=None, header=0)

In [5]:
df.head()

Unnamed: 0,Year,Region,Acc-ID,Realized,Budget y,Budget y+1,Slack
0,2021,AG,30,1710857.0,1724699.0,1824801.0,13841.72964
1,2021,AG,31,465105.6,446347.7,486337.9,-18757.87605
2,2021,AG,33,264389.8,236841.0,201998.2,-27548.79548
3,2021,AG,35,355453.6,35099.99,117736.2,-320353.6555
4,2021,AG,36,2922549.0,2745166.0,2777999.0,-177383.42905


In [6]:
df.dtypes

Year            int64
Region         object
Acc-ID          int64
Realized      float64
Budget y      float64
Budget y+1    float64
Slack         float64
dtype: object

In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9837 entries, 0 to 9836
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Year        9837 non-null   int64  
 1   Region      9837 non-null   object 
 2   Acc-ID      9837 non-null   int64  
 3   Realized    9837 non-null   float64
 4   Budget y    9837 non-null   float64
 5   Budget y+1  9837 non-null   float64
 6   Slack       9837 non-null   float64
dtypes: float64(4), int64(2), object(1)
memory usage: 538.1+ KB


In [8]:
df = df.sort_values(by=['Year'])

In [9]:
# Define the lag periods
one_year_lag = 1
two_year_lag = 2
five_year_lag = 5

# Create lag features for 'Realized'
df['Realized_1yr_lag'] = df['Realized'].shift(one_year_lag)
df['Realized_2yr_lag'] = df['Realized'].shift(two_year_lag)
df['Realized_5yr_lag'] = df['Realized'].rolling(window=five_year_lag).mean()

# Drop rows with NaN values resulting from the lag operations
df = df.dropna()

In [10]:
train_ratio = 0.8
split_index = int(len(df) * train_ratio)

In [11]:
train_data = df.iloc[:split_index]
test_data = df.iloc[split_index:]

In [12]:
X_train = train_data.drop(columns=['Realized', 'Budget y', 'Budget y+1'])
y_train = train_data['Realized']
budget_y_train = train_data['Budget y']

X_test = test_data.drop(columns=['Realized', 'Budget y', 'Budget y+1'])
y_test = test_data['Realized']
budget_y_test = test_data['Budget y']

In [13]:
X_train.head()

Unnamed: 0,Year,Region,Acc-ID,Slack,Realized_1yr_lag,Realized_2yr_lag,Realized_5yr_lag
2235,2011,BL,40,0.008941,37176.7,368790.8,648239.0
2236,2011,BL,31,0.0,0.01465821,37176.7,1078602.0
2237,2011,BL,32,0.0,2151817.0,0.01465821,511557.1
2238,2011,BL,33,0.0,0.8484289,2151817.0,703723.0
2239,2011,BL,34,0.0,1329620.0,0.8484289,921201.6


In [14]:
param_grid = {
    'iterations': 500,
    'learning_rate': 0.1,
    'depth': 8,
    'loss_function': 'RMSE',
}

In [15]:
catboost = CatBoostRegressor(loss_function='RMSE', cat_features=['Region', 'Acc-ID'])

In [16]:
grid_search_result = cv(
    pool=Pool(X_train, label=y_train, cat_features=['Region', 'Acc-ID']),
    params=param_grid,
    fold_count=5, 
    stratified=False,  # Time series data is not stratified
    shuffle=True,
    verbose=50,  
    plot=False,
    seed=SEED,
)

Training on fold [0/5]
0:	learn: 1401650.0955799	test: 1558776.1465873	best: 1558776.1465873 (0)	total: 88.7ms	remaining: 44.3s
50:	learn: 400905.7024415	test: 568186.2885243	best: 568186.2885243 (50)	total: 360ms	remaining: 3.17s
100:	learn: 303229.4912632	test: 503216.8653006	best: 503216.8653006 (100)	total: 659ms	remaining: 2.6s
150:	learn: 247389.5122086	test: 474255.0635675	best: 474255.0635675 (150)	total: 1.02s	remaining: 2.37s
200:	learn: 214405.5490812	test: 460004.3639094	best: 459979.0249586 (199)	total: 1.45s	remaining: 2.16s
250:	learn: 188170.6486021	test: 451097.8083888	best: 450952.4701734 (249)	total: 1.76s	remaining: 1.75s
300:	learn: 163108.5591238	test: 442822.1136773	best: 442822.1136773 (300)	total: 1.98s	remaining: 1.31s
350:	learn: 146603.0942034	test: 437509.8627443	best: 437509.8627443 (350)	total: 2.27s	remaining: 965ms
400:	learn: 132326.0902633	test: 434064.0283373	best: 433904.9995212 (395)	total: 2.65s	remaining: 655ms
450:	learn: 118694.0138573	test: 43

In [17]:
best_model = CatBoostRegressor( cat_features=['Region', 'Acc-ID'], **param_grid)
best_model.fit(X_train, y_train)

0:	learn: 1372501.5713863	total: 5.12ms	remaining: 2.56s
1:	learn: 1289163.9350097	total: 8.97ms	remaining: 2.23s
2:	learn: 1209751.4867484	total: 11.7ms	remaining: 1.94s
3:	learn: 1143842.6862039	total: 15.7ms	remaining: 1.95s
4:	learn: 1084136.4323617	total: 18.1ms	remaining: 1.79s
5:	learn: 1030341.0506612	total: 20.7ms	remaining: 1.71s
6:	learn: 987218.2148320	total: 23.3ms	remaining: 1.64s
7:	learn: 935476.5689983	total: 25.6ms	remaining: 1.58s
8:	learn: 897248.9366621	total: 28.1ms	remaining: 1.53s
9:	learn: 860567.8662511	total: 30.7ms	remaining: 1.5s
10:	learn: 820777.7975794	total: 33.8ms	remaining: 1.5s
11:	learn: 787798.8924882	total: 36.1ms	remaining: 1.47s
12:	learn: 763889.0281495	total: 38.5ms	remaining: 1.44s
13:	learn: 741089.9033555	total: 40.6ms	remaining: 1.41s
14:	learn: 715712.1702521	total: 43.7ms	remaining: 1.41s
15:	learn: 696077.7736284	total: 45.9ms	remaining: 1.39s
16:	learn: 675234.0405127	total: 48.3ms	remaining: 1.37s
17:	learn: 656147.1767552	total: 50.9

<catboost.core.CatBoostRegressor at 0x12b220910>

In [18]:
y_pred = best_model.predict(X_test)

In [19]:
rmse_model = np.sqrt(mean_squared_error(y_test, y_pred))
mae_model = mean_absolute_error(y_test, y_pred)

In [20]:
print("Model Performance:")
print(f"Root Mean Squared Error (RMSE): {round(rmse_model, 0)}")
print(f"Mean Absolute Error (MAE): {round(mae_model, 0)}")

Model Performance:
Root Mean Squared Error (RMSE): 344213.0
Mean Absolute Error (MAE): 136064.0


In [21]:
rmse_budget_y = np.sqrt(mean_squared_error(y_test, budget_y_test))
mae_budget_y = mean_absolute_error(y_test, budget_y_test)

In [22]:
print("\nCompeting Forecast (Budget y+1) Performance:")
print(f"Root Mean Squared Error (RMSE): {round(rmse_budget_y, 0)}")
print(f"Mean Absolute Error (MAE): {round(mae_budget_y, 0)}")


Competing Forecast (Budget y+1) Performance:
Root Mean Squared Error (RMSE): 1238856.0
Mean Absolute Error (MAE): 236522.0
