# Preparing cashflow features for an training a global XGBoost model

### Project members:
- Marlene Ibrus
- Maare Karmen Oras
- Aleksandr Volžinski

In this file, we will create necessary agregate features for cashflow and creating the train-validation-test split. ... 

## I. Imports

In [76]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error
import xgboost as xgb
from xgboost import XGBRegressor
from xgboost.callback import EarlyStopping
import joblib
import optuna
import warnings
warnings.filterwarnings("ignore")

## II. Load customer and transaction data

In [2]:
df_customer = pd.read_csv('../Data/synthetic_sme_customers_processed.csv')
df_customer.head()

Unnamed: 0,cust_id,customer_type,parent_company_flag,CUST_GRP_ID,BRTH_DT,language
0,3caf82febb1d4e64b140893d9e89d748115ec5b70455e1...,SME,0,eefcfda39744d0f4c7d1f1f6f13a53dbf49f7281de5de0...,2005-2009,ENG
1,fd0d32ecf697980ff5b750d18fe6f8e9f96f2e31fba130...,SME,1,,2015-2019,EST
2,320b10457c9c109c2b4f00b0fecac1cc8cb7c2da8ad936...,SME,1,,2020-2024,EST
3,d4d7e033ed758bb075ad96435d4afb505b7a4059aa3f37...,SME,1,,2020-2024,EST
4,e9b8374b1f3ae04cc65672bc5907726d962edb35e05736...,SME,1,,2020-2024,EST


In [3]:
df_transaction = pd.read_csv('../Data/synthetic_sme_transactions_processed.csv')
df_transaction.head()

Unnamed: 0,WeekDay,Channel,cust_id,D_C,Currency_trx,Amount_EUR,Customer_IBAN,Counterparty_IBAN,Amount_Orig,currency,MCC,country_of_merchant,BookingDatetime
0,5,POS,00feb0ff373287a3b1b210369f5aef9bfffd5d02f6bc8f...,D,EUR,7.518036,3a8c99d1e8caf60ddbc4b9865b0947f981c90980f3fab2...,7ecc6d620a9ea4170231178c236206770eb7a4785d2a6c...,7.518036,EUR,5271.0,EE,2023-01-27 10:00:28
1,2,Internet Bank,00feb0ff373287a3b1b210369f5aef9bfffd5d02f6bc8f...,D,EUR,0.200481,3a8c99d1e8caf60ddbc4b9865b0947f981c90980f3fab2...,7ecc6d620a9ea4170231178c236206770eb7a4785d2a6c...,0.200481,EUR,,,2023-01-17 02:14:14
2,3,Internet Bank,00feb0ff373287a3b1b210369f5aef9bfffd5d02f6bc8f...,D,EUR,1.002405,3a8c99d1e8caf60ddbc4b9865b0947f981c90980f3fab2...,7ecc6d620a9ea4170231178c236206770eb7a4785d2a6c...,1.002405,EUR,,,2023-02-08 02:00:18
3,7,Internet Bank,00feb0ff373287a3b1b210369f5aef9bfffd5d02f6bc8f...,C,EUR,0.010024,3a8c99d1e8caf60ddbc4b9865b0947f981c90980f3fab2...,3a8c99d1e8caf60ddbc4b9865b0947f981c90980f3fab2...,0.010024,EUR,,,2023-01-01 03:56:39
4,3,Internet Bank,00feb0ff373287a3b1b210369f5aef9bfffd5d02f6bc8f...,D,EUR,1.002405,3a8c99d1e8caf60ddbc4b9865b0947f981c90980f3fab2...,7ecc6d620a9ea4170231178c236206770eb7a4785d2a6c...,1.002405,EUR,,,2023-03-08 02:06:48


## III. Merge datasets and clean datetime

In [4]:
df_total = pd.merge(df_transaction, df_customer, on='cust_id', how='left')
df_total = df_total[np.abs(df_total['Amount_EUR']) >= 0.01]
df_total['BookingDatetime'] = pd.to_datetime(df_total['BookingDatetime'])
df_total['Amount_EUR'] = df_total['Amount_EUR'].round(2)
df_total['Week'] = df_total['BookingDatetime'].dt.isocalendar().week
df_total['Month'] = df_total['BookingDatetime'].dt.month

In [6]:
df_total.head()

Unnamed: 0,WeekDay,Channel,cust_id,D_C,Currency_trx,Amount_EUR,Customer_IBAN,Counterparty_IBAN,Amount_Orig,currency,MCC,country_of_merchant,BookingDatetime,customer_type,parent_company_flag,CUST_GRP_ID,BRTH_DT,language,Week,Month
0,5,POS,00feb0ff373287a3b1b210369f5aef9bfffd5d02f6bc8f...,D,EUR,7.52,3a8c99d1e8caf60ddbc4b9865b0947f981c90980f3fab2...,7ecc6d620a9ea4170231178c236206770eb7a4785d2a6c...,7.518036,EUR,5271.0,EE,2023-01-27 10:00:28,SME,1,,1995-1999,RUS,4,1
1,2,Internet Bank,00feb0ff373287a3b1b210369f5aef9bfffd5d02f6bc8f...,D,EUR,0.2,3a8c99d1e8caf60ddbc4b9865b0947f981c90980f3fab2...,7ecc6d620a9ea4170231178c236206770eb7a4785d2a6c...,0.200481,EUR,,,2023-01-17 02:14:14,SME,1,,1995-1999,RUS,3,1
2,3,Internet Bank,00feb0ff373287a3b1b210369f5aef9bfffd5d02f6bc8f...,D,EUR,1.0,3a8c99d1e8caf60ddbc4b9865b0947f981c90980f3fab2...,7ecc6d620a9ea4170231178c236206770eb7a4785d2a6c...,1.002405,EUR,,,2023-02-08 02:00:18,SME,1,,1995-1999,RUS,6,2
3,7,Internet Bank,00feb0ff373287a3b1b210369f5aef9bfffd5d02f6bc8f...,C,EUR,0.01,3a8c99d1e8caf60ddbc4b9865b0947f981c90980f3fab2...,3a8c99d1e8caf60ddbc4b9865b0947f981c90980f3fab2...,0.010024,EUR,,,2023-01-01 03:56:39,SME,1,,1995-1999,RUS,52,1
4,3,Internet Bank,00feb0ff373287a3b1b210369f5aef9bfffd5d02f6bc8f...,D,EUR,1.0,3a8c99d1e8caf60ddbc4b9865b0947f981c90980f3fab2...,7ecc6d620a9ea4170231178c236206770eb7a4785d2a6c...,1.002405,EUR,,,2023-03-08 02:06:48,SME,1,,1995-1999,RUS,10,3


In [7]:
df_total.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2540072 entries, 0 to 2542015
Data columns (total 20 columns):
 #   Column               Dtype         
---  ------               -----         
 0   WeekDay              int64         
 1   Channel              object        
 2   cust_id              object        
 3   D_C                  object        
 4   Currency_trx         object        
 5   Amount_EUR           float64       
 6   Customer_IBAN        object        
 7   Counterparty_IBAN    object        
 8   Amount_Orig          float64       
 9   currency             object        
 10  MCC                  float64       
 11  country_of_merchant  object        
 12  BookingDatetime      datetime64[ns]
 13  customer_type        object        
 14  parent_company_flag  int64         
 15  CUST_GRP_ID          object        
 16  BRTH_DT              object        
 17  language             object        
 18  Week                 UInt32        
 19  Month                int32

In [8]:
df_total.shape

(2540072, 20)

In [11]:
print(f"Unique customers: {df_total['cust_id'].nunique()}")

Unique customers: 958


## IV. Compute daily net cash flow per customer

In [12]:
df_total['flow'] = df_total.apply(lambda x: x['Amount_EUR'] if x['D_C'] == 'D' else -x['Amount_EUR'], axis=1)

daily_cashflow = (
    df_total.groupby(['cust_id', pd.Grouper(key='BookingDatetime', freq='D')])['flow']
      .sum()
      .reset_index()
      .rename(columns={'flow': 'net_flow'})
)

print(f"Daily cashflow shape: {daily_cashflow.shape}")
print(daily_cashflow.head())

Daily cashflow shape: (270433, 3)
                                             cust_id BookingDatetime  net_flow
0  00feb0ff373287a3b1b210369f5aef9bfffd5d02f6bc8f...      2023-01-01     -0.02
1  00feb0ff373287a3b1b210369f5aef9bfffd5d02f6bc8f...      2023-01-08      2.00
2  00feb0ff373287a3b1b210369f5aef9bfffd5d02f6bc8f...      2023-01-16    280.68
3  00feb0ff373287a3b1b210369f5aef9bfffd5d02f6bc8f...      2023-01-17      0.40
4  00feb0ff373287a3b1b210369f5aef9bfffd5d02f6bc8f...      2023-01-27     15.04


In [13]:
#  Rename 'BookingDatetime' to 'date' and normalize
daily = daily_cashflow.copy()
daily = daily.rename(columns={'BookingDatetime': 'date'})
daily['date'] = pd.to_datetime(daily['date']).dt.normalize()

## V. Reindex per customer to fill missing dates

In [15]:
def reindex_customer(group):
    idx = pd.date_range(group['date'].min(), group['date'].max(), freq='D')
    g = group.set_index('date').reindex(idx).rename_axis('date').reset_index()
    g['cust_id'] = group['cust_id'].iloc[0]
    g['net_flow'] = g['net_flow'].fillna(0.0)
    return g

In [16]:
daily_full = daily.groupby('cust_id', as_index=False, group_keys=False).apply(reindex_customer).reset_index(drop=True)

In [18]:
daily_full.describe(include='all')

Unnamed: 0,date,cust_id,net_flow
count,682527,682527,682527.0
unique,,958,
top,,7e3a7fac7a24ad6e091f0fe698a30c42a043b1953b9243...,
freq,,731,
mean,2023-12-26 14:41:11.163779072,,4.660542
min,2023-01-01 00:00:00,,-7808635.0
25%,2023-06-29 00:00:00,,0.0
50%,2023-12-25 00:00:00,,0.0
75%,2024-06-23 00:00:00,,0.0
max,2024-12-31 00:00:00,,7803744.0


## Create a 7-day future target

Since we aim to predict for a 7-day window, the we use the last 7 days to create that prediction. We use a rolling sum shifted by -7 to avoid leakage. Once `target_7d` is created, drop rows where `target_7d` is `NaN`.

In [19]:
daily_full = daily_full.sort_values(['cust_id', 'date']).reset_index(drop=True)
daily_full['target_7d'] = (
    daily_full
    .groupby('cust_id')['net_flow']
    .transform(lambda x: x.rolling(window=7, min_periods=7).sum().shift(-7))
)

In [20]:
daily_full = daily_full[~daily_full['target_7d'].isna()].reset_index(drop=True)

## Create lag features using past data only

In [23]:
windows = [7, 14, 30]
for w in windows:
    daily_full[f'roll_sum_{w}'] = daily_full.groupby('cust_id')['net_flow'].transform(lambda x: x.rolling(window=w, min_periods=1).sum())
    daily_full[f'roll_mean_{w}'] = daily_full.groupby('cust_id')['net_flow'].transform(lambda x: x.rolling(window=w, min_periods=1).mean())
    daily_full[f'roll_std_{w}'] = daily_full.groupby('cust_id')['net_flow'].transform(lambda x: x.rolling(window=w, min_periods=1).std().fillna(0))

In [24]:
daily_full['lag_1'] = daily_full.groupby('cust_id')['net_flow'].shift(1).fillna(0)
daily_full['lag_7_sum'] = daily_full.groupby('cust_id')['net_flow'].transform(lambda x: x.shift(1).rolling(7, min_periods=1).sum()).fillna(0)

In [27]:
daily_full['dayofweek'] = daily_full['date'].dt.dayofweek
daily_full['is_weekend'] = daily_full['dayofweek'].isin([5,6]).astype(int)
daily_full['day'] = daily_full['date'].dt.day
daily_full['month'] = daily_full['date'].dt.month
daily_full['year'] = daily_full['date'].dt.year

In [28]:
daily_full.head()

Unnamed: 0,date,cust_id,net_flow,target_7d,roll_sum_7,roll_mean_7,roll_std_7,roll_sum_14,roll_mean_14,roll_std_14,roll_sum_30,roll_mean_30,roll_std_30,lag_1,lag_7_sum,dayofweek,is_weekend,day,month,year
0,2023-01-01,00feb0ff373287a3b1b210369f5aef9bfffd5d02f6bc8f...,-0.02,2.0,-0.02,-0.02,0.0,-0.02,-0.02,0.0,-0.02,-0.02,0.0,0.0,0.0,6,1,1,1,2023
1,2023-01-02,00feb0ff373287a3b1b210369f5aef9bfffd5d02f6bc8f...,0.0,2.0,-0.02,-0.01,0.014142,-0.02,-0.01,0.014142,-0.02,-0.01,0.014142,-0.02,-0.02,0,0,2,1,2023
2,2023-01-03,00feb0ff373287a3b1b210369f5aef9bfffd5d02f6bc8f...,0.0,2.0,-0.02,-0.006667,0.011547,-0.02,-0.006667,0.011547,-0.02,-0.006667,0.011547,0.0,-0.02,1,0,3,1,2023
3,2023-01-04,00feb0ff373287a3b1b210369f5aef9bfffd5d02f6bc8f...,0.0,2.0,-0.02,-0.005,0.01,-0.02,-0.005,0.01,-0.02,-0.005,0.01,0.0,-0.02,2,0,4,1,2023
4,2023-01-05,00feb0ff373287a3b1b210369f5aef9bfffd5d02f6bc8f...,0.0,2.0,-0.02,-0.004,0.008944,-0.02,-0.004,0.008944,-0.02,-0.004,0.008944,0.0,-0.02,3,0,5,1,2023


In [29]:
daily_full.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 675821 entries, 0 to 675820
Data columns (total 20 columns):
 #   Column        Non-Null Count   Dtype         
---  ------        --------------   -----         
 0   date          675821 non-null  datetime64[ns]
 1   cust_id       675821 non-null  object        
 2   net_flow      675821 non-null  float64       
 3   target_7d     675821 non-null  float64       
 4   roll_sum_7    675821 non-null  float64       
 5   roll_mean_7   675821 non-null  float64       
 6   roll_std_7    675821 non-null  float64       
 7   roll_sum_14   675821 non-null  float64       
 8   roll_mean_14  675821 non-null  float64       
 9   roll_std_14   675821 non-null  float64       
 10  roll_sum_30   675821 non-null  float64       
 11  roll_mean_30  675821 non-null  float64       
 12  roll_std_30   675821 non-null  float64       
 13  lag_1         675821 non-null  float64       
 14  lag_7_sum     675821 non-null  float64       
 15  dayofweek     675

In [35]:
daily_full.describe(include='all')

Unnamed: 0,date,cust_id,net_flow,target_7d,roll_sum_7,roll_mean_7,roll_std_7,roll_sum_14,roll_mean_14,roll_std_14,roll_sum_30,roll_mean_30,roll_std_30,lag_1,lag_7_sum,dayofweek,is_weekend,day,month,year
count,675821,675821,675821.0,675821.0,675821.0,675821.0,675821.0,675821.0,675821.0,675821.0,675821.0,675821.0,675821.0,675821.0,675821.0,675821.0,675821.0,675821.0,675821.0,675821.0
unique,,958,,,,,,,,,,,,,,,,,,
top,,7e3a7fac7a24ad6e091f0fe698a30c42a043b1953b9243...,,,,,,,,,,,,,,,,,,
freq,,724,,,,,,,,,,,,,,,,,,
mean,2023-12-23 03:09:33.047004928,,4.881056,65.16134,20.82984,-4.685244,4844.398,44.67088,-8.517485,5485.513,60.44327,-14.83346,6100.757,3.222172,18.89229,2.998377,0.285835,15.617372,6.411582,2023.485799
min,2023-01-01 00:00:00,,-7808635.0,-7951560.0,-7951560.0,-3612219.0,0.0,-8012290.0,-3612219.0,0.0,-7954514.0,-3612219.0,0.0,-7808635.0,-7951560.0,0.0,0.0,1.0,1.0,2023.0
25%,2023-06-27 00:00:00,,0.0,-493.3,-487.0,-69.78571,11.37673,-1014.04,-73.17286,76.0597,-1539.68,-52.656,150.6903,0.0,-483.98,1.0,0.0,8.0,3.0,2023.0
50%,2023-12-21 00:00:00,,0.0,0.0,0.0,0.0,273.4343,0.0,0.0,449.7368,1.9,0.06333333,639.1042,0.0,0.0,3.0,0.0,16.0,6.0,2023.0
75%,2024-06-18 00:00:00,,0.0,658.04,655.64,94.06286,1604.625,1144.64,82.58571,2234.617,1454.44,49.85733,2785.145,0.0,652.96,5.0,1.0,23.0,9.0,2024.0
max,2024-12-24 00:00:00,,7803744.0,8022838.0,8022838.0,1146120.0,4508648.0,5630463.0,402175.9,3064111.0,8042096.0,347276.7,2554225.0,7803744.0,8022838.0,6.0,1.0,31.0,12.0,2024.0


## VI. Merge static customer features

In [30]:
cust_static = df_customer.copy()

In [31]:
cust_static['BRTH_DT'] = pd.to_datetime(cust_static['BRTH_DT'], errors='coerce')
latest_date = daily_full['date'].max()
cust_static['age'] = ((latest_date - cust_static['BRTH_DT']).dt.days // 365).fillna(-1).astype(int)

In [32]:
keep_cols = ['cust_id', 'customer_type', 'parent_company_flag', 'CUST_GRP_ID', 'language', 'age']
cust_static = cust_static[[c for c in keep_cols if c in cust_static.columns]]

In [33]:
df = daily_full.merge(cust_static, on='cust_id', how='left')

In [34]:
df.head()

Unnamed: 0,date,cust_id,net_flow,target_7d,roll_sum_7,roll_mean_7,roll_std_7,roll_sum_14,roll_mean_14,roll_std_14,...,dayofweek,is_weekend,day,month,year,customer_type,parent_company_flag,CUST_GRP_ID,language,age
0,2023-01-01,00feb0ff373287a3b1b210369f5aef9bfffd5d02f6bc8f...,-0.02,2.0,-0.02,-0.02,0.0,-0.02,-0.02,0.0,...,6,1,1,1,2023,SME,1,,RUS,-1
1,2023-01-02,00feb0ff373287a3b1b210369f5aef9bfffd5d02f6bc8f...,0.0,2.0,-0.02,-0.01,0.014142,-0.02,-0.01,0.014142,...,0,0,2,1,2023,SME,1,,RUS,-1
2,2023-01-03,00feb0ff373287a3b1b210369f5aef9bfffd5d02f6bc8f...,0.0,2.0,-0.02,-0.006667,0.011547,-0.02,-0.006667,0.011547,...,1,0,3,1,2023,SME,1,,RUS,-1
3,2023-01-04,00feb0ff373287a3b1b210369f5aef9bfffd5d02f6bc8f...,0.0,2.0,-0.02,-0.005,0.01,-0.02,-0.005,0.01,...,2,0,4,1,2023,SME,1,,RUS,-1
4,2023-01-05,00feb0ff373287a3b1b210369f5aef9bfffd5d02f6bc8f...,0.0,2.0,-0.02,-0.004,0.008944,-0.02,-0.004,0.008944,...,3,0,5,1,2023,SME,1,,RUS,-1


## VII. Encode categorical variables

In [36]:
cat_cols = [c for c in ['customer_type', 'CUST_GRP_ID', 'language'] if c in df.columns]
for c in cat_cols:
    df[c] = df[c].astype('category').cat.codes.fillna(-1).astype(int)

## VIII. Prepare matrix

In [37]:
label_col = 'target_7d'
ignore_cols = ['cust_id', 'date', label_col]
feature_cols = [c for c in df.columns if c not in ignore_cols]

In [38]:
X = df[feature_cols]
y = df[label_col].values

In [39]:
print("Feature columns used:", feature_cols)
print("Total rows:", len(df))

Feature columns used: ['net_flow', 'roll_sum_7', 'roll_mean_7', 'roll_std_7', 'roll_sum_14', 'roll_mean_14', 'roll_std_14', 'roll_sum_30', 'roll_mean_30', 'roll_std_30', 'lag_1', 'lag_7_sum', 'dayofweek', 'is_weekend', 'day', 'month', 'year', 'customer_type', 'parent_company_flag', 'CUST_GRP_ID', 'language', 'age']
Total rows: 675821


## IX. Create a chronological train/valid/test split by date

Split dates by quantiles of the global date (70%, 15%, 15%)

In [40]:
date_values = df['date']
train_end = date_values.quantile(0.70)
valid_end = date_values.quantile(0.85)

In [41]:
train_idx = df['date'] <= train_end
valid_idx = (df['date'] > train_end) & (df['date'] <= valid_end)
test_idx = df['date'] > valid_end

In [42]:
X_train, y_train = X[train_idx], y[train_idx]
X_valid, y_valid = X[valid_idx], y[valid_idx]
X_test,  y_test  = X[test_idx],  y[test_idx]

In [43]:
print(f"Train rows: {len(X_train)}  Valid rows: {len(X_valid)}  Test rows: {len(X_test)}")

Train rows: 473547  Valid rows: 101290  Test rows: 100984


## X. Create XGBoost DMatrix and set parameters

As we are able to use GPU acceleration for XGBoost, we will use the necessary GPU-specific parameters for training.

In [44]:
dtrain = xgb.DMatrix(X_train, label=y_train)
dvalid = xgb.DMatrix(X_valid, label=y_valid)
dtest  = xgb.DMatrix(X_test, label=y_test)

In [54]:
params_gpu = {
    'objective': 'reg:squarederror',
    'eval_metric': 'rmse',
    'tree_method': 'hist',       # Use GPU-accelerated histogram algorithm
    'device': 'gpu',                     # GPU device index (0 = first GPU)
    'predictor': 'gpu_predictor',    # Use GPU for predictions
    'learning_rate': 0.05,
    'max_depth': 6,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'lambda': 1.0,
    'alpha': 0.0,
    'seed': 42,
    'verbosity': 1
}

In [55]:
evallist = [(dtrain, 'train'), (dvalid, 'valid')]

## XI. Train with early stopping

In [56]:
bst_gpu = xgb.train(
    params_gpu,
    dtrain,
    num_boost_round=2000,
    evals=evallist,
    early_stopping_rounds=50,
    verbose_eval=50
)

[0]	train-rmse:59127.93899	valid-rmse:47739.30823
[50]	train-rmse:51960.56780	valid-rmse:46338.94651
[76]	train-rmse:50901.39948	valid-rmse:46382.67422


## XII. Evaluate on test set and determine feature importance

In [63]:
y_pred_test = bst_gpu.predict(dtest, iteration_range=(0, bst_gpu.best_iteration + 1))
rmse = mean_squared_error(y_test, y_pred_test) ** 0.5
mae = mean_absolute_error(y_test, y_pred_test)
mape = np.mean(np.abs((y_test - y_pred_test) / np.where(y_test==0, 1e-6, y_test))) * 100

In [64]:
print("\nTEST METRICS")
print(f"RMSE: {rmse:.4f}")
print(f"MAE : {mae:.4f}")
print(f"MAPE: {mape:.2f}%")


TEST METRICS
RMSE: 50364.5492
MAE : 10019.3850
MAPE: 2643003586437408.00%


In [66]:
fi = bst_gpu.get_score(importance_type='gain')
fi_df = pd.DataFrame([
    {'feature': k, 'gain': v} for k, v in fi.items()
]).sort_values('gain', ascending=False)

In [67]:
print("\nTop features by gain:")
print(fi_df.head(20))


Top features by gain:
                feature          gain
9           roll_std_30  3.959649e+12
15                month  3.851449e+12
10                lag_1  3.261741e+12
4           roll_sum_14  3.193916e+12
5          roll_mean_14  3.030790e+12
18          CUST_GRP_ID  2.743424e+12
17  parent_company_flag  2.488570e+12
0              net_flow  2.394373e+12
12            dayofweek  2.219355e+12
14                  day  1.871676e+12
2           roll_mean_7  1.865525e+12
7           roll_sum_30  1.813482e+12
1            roll_sum_7  1.794066e+12
3            roll_std_7  1.749569e+12
19             language  1.692736e+12
6           roll_std_14  1.522094e+12
8          roll_mean_30  1.359352e+12
11            lag_7_sum  1.309898e+12
16                 year  9.778478e+11
13           is_weekend  1.316644e+11


## XIII. Hyperparameter tuning

The previous results were not good (to say the least), so we will use `Optuna` to define the search space for hyperparameters and run a study to fin the best ones.

In [83]:
def objective(trial):
    params = {
        'objective': 'reg:squarederror',
        'eval_metric': 'rmse',
        'tree_method': 'hist',
        'device': 'cuda',      # <- works for ALL versions
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3, log=True),
        'max_depth': trial.suggest_int('max_depth', 3, 12),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        'lambda': trial.suggest_float('lambda', 0, 10),
        'alpha': trial.suggest_float('alpha', 0, 10),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 20),
    }

    evals = [(dtrain, 'train'), (dvalid, 'valid')]

    model = xgb.train(
        params,
        dtrain,
        num_boost_round=2000,
        evals=evals,
        early_stopping_rounds=100,   # <– works in ALL versions
        verbose_eval=False
    )

    preds = model.predict(dvalid)
    rmse = mean_squared_error(y_valid, preds) ** 0.5
    return rmse

In [86]:
study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=70)  # try 50 parameter combinations

[I 2025-12-09 21:02:31,924] A new study created in memory with name: no-name-2a26a54a-029a-401b-891b-2ca269205279
[I 2025-12-09 21:02:35,440] Trial 0 finished with value: 47602.47896747024 and parameters: {'learning_rate': 0.031145658135077258, 'max_depth': 12, 'subsample': 0.9364759098699966, 'colsample_bytree': 0.7079618893456459, 'lambda': 1.7998826903825016, 'alpha': 9.761434569529754, 'min_child_weight': 2}. Best is trial 0 with value: 47602.47896747024.
[I 2025-12-09 21:02:35,997] Trial 1 finished with value: 48569.0963810305 and parameters: {'learning_rate': 0.1414135812075259, 'max_depth': 6, 'subsample': 0.7294519363654415, 'colsample_bytree': 0.5567814929389163, 'lambda': 5.673441155651135, 'alpha': 2.703428979288165, 'min_child_weight': 3}. Best is trial 0 with value: 47602.47896747024.
[I 2025-12-09 21:02:37,677] Trial 2 finished with value: 48413.82073782594 and parameters: {'learning_rate': 0.05599268036634106, 'max_depth': 10, 'subsample': 0.9832011999582075, 'colsample_

In [87]:
print("Best RMSE:", study.best_value)
print("Best parameters:", study.best_params)

Best RMSE: 44690.58744628705
Best parameters: {'learning_rate': 0.01146265235811284, 'max_depth': 11, 'subsample': 0.5693286009880982, 'colsample_bytree': 0.5745635245894778, 'lambda': 4.182988084688336, 'alpha': 3.505903048241581, 'min_child_weight': 12}


## XIV. Train the final model

In [98]:
best_params = study.best_params
best_params.update({
    'tree_method': 'hist',
    'predictor': 'gpu_predictor',
    'device': 'gpu',
    'objective': 'reg:squarederror',
    'eval_metric': 'rmse',
})

In [102]:
final_model = XGBRegressor(
    **best_params,
    n_estimators=2000,
    verbosity=1,  # Set verbosity to a lower value (0 = silent, 1 = warning, etc.)
    random_state=42
)

In [104]:
final_model.fit(
    X_train, y_train,
    eval_set=[(X_valid, y_valid)],
    verbose=50  # Output logs after 50 boosting rounds
)

[0]	validation_0-rmse:47725.40045
[50]	validation_0-rmse:45918.99460
[100]	validation_0-rmse:45105.52415
[150]	validation_0-rmse:44786.50391
[200]	validation_0-rmse:44687.77588
[250]	validation_0-rmse:44699.08134
[300]	validation_0-rmse:44775.32666
[350]	validation_0-rmse:44837.08552
[400]	validation_0-rmse:44920.63282
[450]	validation_0-rmse:45031.36983
[500]	validation_0-rmse:45133.97977
[550]	validation_0-rmse:45217.54837
[600]	validation_0-rmse:45300.43087
[650]	validation_0-rmse:45362.54756
[700]	validation_0-rmse:45513.83311
[750]	validation_0-rmse:45601.66944
[800]	validation_0-rmse:45672.09578
[850]	validation_0-rmse:45790.77779
[900]	validation_0-rmse:45882.55865
[950]	validation_0-rmse:45996.54309
[1000]	validation_0-rmse:46107.80473
[1050]	validation_0-rmse:46166.51071
[1100]	validation_0-rmse:46244.03587
[1150]	validation_0-rmse:46337.84103
[1200]	validation_0-rmse:46435.37153
[1250]	validation_0-rmse:46498.48093
[1300]	validation_0-rmse:46581.66805
[1350]	validation_0-rmse

0,1,2
,objective,'reg:squarederror'
,base_score,
,booster,
,callbacks,
,colsample_bylevel,
,colsample_bynode,
,colsample_bytree,0.5745635245894778
,device,'gpu'
,early_stopping_rounds,
,enable_categorical,False


In [105]:
y_pred_test = final_model.predict(X_test)

## XV. Analysis of results

In [106]:
epsilon = 1e-6
rmse = mean_squared_error(y_test, y_pred_test) ** 0.5
mae = mean_absolute_error(y_test, y_pred_test)
mape = np.mean(np.abs((y_test - y_pred_test) / (np.abs(y_test) + epsilon))) * 100

In [107]:
print("\nTEST METRICS")
print(f"RMSE: {rmse:.4f}")
print(f"MAE : {mae:.4f}")
print(f"MAPE: {mape:.2f}%")


TEST METRICS
RMSE: 50364.5492
MAE : 10019.3850
MAPE: 12610119015.10%


In [108]:
# inspect target scale & extremes
import numpy as np
s = y_train
print("count", len(s))
print("min", np.min(s), "median", np.median(s), "mean", np.mean(s), "max", np.max(s))
print("quantiles", np.quantile(s, [0.5,0.75,0.9,0.95,0.99]))


count 473547
min -7951559.879999999 median 0.0 mean -38.631114989642 max 8022838.260000002
quantiles [    0.      672.16   5532.1   15777.68  90067.878]


Even after hyperparameter tuning....... LÕPETA