## Install necessary packages

In [1]:
!pip install catboost



# Imports

In [1]:
import pandas as pd
import numpy as np

from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder,StandardScaler
from sklearn.impute import SimpleImputer

from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor

from sklearn.metrics import (
    mean_absolute_error,
    mean_squared_error,
    r2_score,
    confusion_matrix,
    accuracy_score
)

from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor


# 1. NORMAL – Linear Regression

In [2]:
X_train=pd.read_csv("X_normal_train.csv")
X_test=pd.read_csv("X_normal_test.csv")
y_train=pd.read_csv("y_normal_train.csv")["spell_episode_los"]
y_test=pd.read_csv("y_normal_test.csv")["spell_episode_los"]

numeric_features=X_train.select_dtypes(include=["int64","float64"]).columns
categorical_features=X_train.select_dtypes(include=["object"]).columns

numeric_transformer=Pipeline(steps=[("imputer",SimpleImputer(strategy="median")),("scaler",StandardScaler())])
categorical_transformer=Pipeline(steps=[("imputer",SimpleImputer(strategy="most_frequent")),("onehot",OneHotEncoder(handle_unknown="ignore"))])
preprocessor=ColumnTransformer(transformers=[("num",numeric_transformer,numeric_features),("cat",categorical_transformer,categorical_features)])

model=LinearRegression()
pipe=Pipeline(steps=[("preprocessor",preprocessor),("model",model)])

pipe.fit(X_train,y_train)
y_pred_log=pipe.predict(X_test)

y_true_days=np.exp(y_test)
y_pred_days=np.exp(y_pred_log)

mae=mean_absolute_error(y_true_days,y_pred_days)
rmse=mean_squared_error(y_true_days,y_pred_days)**0.5
r2=r2_score(y_true_days,y_pred_days)

print("\n=== NORMAL – Linear Regression (continuous days) ===")
print("MAE:",mae)
print("RMSE:",rmse)
print("R²:",r2)

true_days_int=np.floor(y_true_days).astype(int)
true_hours=np.round((y_true_days-true_days_int)*24).astype(int)
pred_days_int=np.floor(y_pred_days).astype(int)
pred_hours=np.round((y_pred_days-pred_days_int)*24).astype(int)

results_df=X_test.copy()
results_df["true_days"]=true_days_int
results_df["true_hours"]=true_hours
results_df["pred_days"]=pred_days_int
results_df["pred_hours"]=pred_hours
results_df["pred_days_decimal"]=y_pred_days

print("\nSample predictions (first 10):")
print(results_df[["true_days","true_hours","pred_days","pred_hours"]].head(10))

y_true_int=true_days_int
y_pred_int=pred_days_int

int_mse=mean_squared_error(y_true_int,y_pred_int)
int_rmse=int_mse**0.5
int_mae=mean_absolute_error(y_true_int,y_pred_int)

print("\n=== INTEGER-DAY METRICS (Linear, normal) ===")
print("Int MAE:",int_mae)
print("Int MSE:",int_mse)
print("Int RMSE:",int_rmse)

cm=confusion_matrix(y_true_int,y_pred_int)
acc=accuracy_score(y_true_int,y_pred_int)
within_1_day=np.mean(np.abs(y_true_int-y_pred_int)<=1)

print("\nConfusion matrix (integer days):")
print(cm)
print("Exact-day accuracy:",acc)
print("Accuracy within ±1 day:",within_1_day)


=== NORMAL – Linear Regression (continuous days) ===
MAE: 0.31091856973032794
RMSE: 0.6492766685703916
R²: 0.5933638546196413

Sample predictions (first 10):
   true_days  true_hours  pred_days  pred_hours
0          1           0          0          24
1          4           0          3           8
2          1           0          1           1
3          1           0          1           1
4          1           0          0          23
5          5           0          3           2
6          1           0          1           0
7          1           0          1           1
8          1           0          0          24
9          1           0          1           0

=== INTEGER-DAY METRICS (Linear, normal) ===
Int MAE: 0.5764972655728958
Int MSE: 0.8536748032546352
Int RMSE: 0.9239452382336494

Confusion matrix (integer days):
[[   0    0    0    0    0    0]
 [2060 3436  148   18    1    0]
 [   2  337  376   52    2    1]
 [   1   83  248  136   11    1]
 [   0   37  172

# 2. NORMAL – Random Forest

In [4]:
X_train=pd.read_csv("X_normal_train.csv")
X_test=pd.read_csv("X_normal_test.csv")
y_train=pd.read_csv("y_normal_train.csv")["spell_episode_los"]
y_test=pd.read_csv("y_normal_test.csv")["spell_episode_los"]

numeric_features=X_train.select_dtypes(include=["int64","float64"]).columns
categorical_features=X_train.select_dtypes(include=["object"]).columns

numeric_transformer=Pipeline(steps=[("imputer",SimpleImputer(strategy="median")),("scaler",StandardScaler())])
categorical_transformer=Pipeline(steps=[("imputer",SimpleImputer(strategy="most_frequent")),("onehot",OneHotEncoder(handle_unknown="ignore"))])
preprocessor=ColumnTransformer(transformers=[("num",numeric_transformer,numeric_features),("cat",categorical_transformer,categorical_features)])

model=RandomForestRegressor(n_estimators=300,max_depth=None,random_state=42,n_jobs=-1)
pipe=Pipeline(steps=[("preprocessor",preprocessor),("model",model)])

pipe.fit(X_train,y_train)
y_pred_log=pipe.predict(X_test)

y_true_days=np.exp(y_test)
y_pred_days=np.exp(y_pred_log)

mae=mean_absolute_error(y_true_days,y_pred_days)
rmse=mean_squared_error(y_true_days,y_pred_days)**0.5
r2=r2_score(y_true_days,y_pred_days)

print("\n=== NORMAL – Random Forest (continuous days) ===")
print("MAE:",mae)
print("RMSE:",rmse)
print("R²:",r2)

true_days_int=np.floor(y_true_days).astype(int)
true_hours=np.round((y_true_days-true_days_int)*24).astype(int)
pred_days_int=np.floor(y_pred_days).astype(int)
pred_hours=np.round((y_pred_days-pred_days_int)*24).astype(int)

results_df=X_test.copy()
results_df["true_days"]=true_days_int
results_df["true_hours"]=true_hours
results_df["pred_days"]=pred_days_int
results_df["pred_hours"]=pred_hours
results_df["pred_days_decimal"]=y_pred_days

print("\nSample predictions (first 10):")
print(results_df[["true_days","true_hours","pred_days","pred_hours"]].head(10))

y_true_int=true_days_int
y_pred_int=pred_days_int

int_mse=mean_squared_error(y_true_int,y_pred_int)
int_rmse=int_mse**0.5
int_mae=mean_absolute_error(y_true_int,y_pred_int)

print("\n=== INTEGER-DAY METRICS (RF, normal) ===")
print("Int MAE:",int_mae)
print("Int MSE:",int_mse)
print("Int RMSE:",int_rmse)

cm=confusion_matrix(y_true_int,y_pred_int)
acc=accuracy_score(y_true_int,y_pred_int)
within_1_day=np.mean(np.abs(y_true_int-y_pred_int)<=1)

print("\nConfusion matrix (integer days):")
print(cm)
print("Exact-day accuracy:",acc)
print("Accuracy within ±1 day:",within_1_day)



=== NORMAL – Random Forest (continuous days) ===
MAE: 0.27166185317482894
RMSE: 0.6164885238005062
R²: 0.6333966925718073

Sample predictions (first 10):
   true_days  true_hours  pred_days  pred_hours
0          1           0          1           0
1          4           0          3          18
2          1           0          1           0
3          1           0          1           0
4          1           0          1           0
5          5           0          2           9
6          1           0          1           0
7          1           0          1           0
8          1           0          1           0
9          1           0          1           0

=== INTEGER-DAY METRICS (RF, normal) ===
Int MAE: 0.28398025877017474
Int MSE: 0.5339469120981726
Int RMSE: 0.7307167112487387

Confusion matrix (integer days):
[[5524  119   20    0    0]
 [ 301  402   67    0    0]
 [  68  268  135    9    0]
 [  32  161  133   19    0]
 [  21  130   76   12    0]]
Exact-day accu

# 3. NORMAL – XGBoost Regressor

In [5]:
# NORMAL – XGBoost Regressor

X_train=pd.read_csv("X_normal_train.csv")
X_test=pd.read_csv("X_normal_test.csv")
y_train=pd.read_csv("y_normal_train.csv")["spell_episode_los"]
y_test=pd.read_csv("y_normal_test.csv")["spell_episode_los"]

numeric_features=X_train.select_dtypes(include=["int64","float64"]).columns
categorical_features=X_train.select_dtypes(include=["object"]).columns

numeric_transformer=Pipeline(steps=[("imputer",SimpleImputer(strategy="median")),("scaler",StandardScaler())])
categorical_transformer=Pipeline(steps=[("imputer",SimpleImputer(strategy="most_frequent")),("onehot",OneHotEncoder(handle_unknown="ignore"))])
preprocessor=ColumnTransformer(transformers=[("num",numeric_transformer,numeric_features),("cat",categorical_transformer,categorical_features)])

model=XGBRegressor(objective="reg:squarederror",n_estimators=400,learning_rate=0.05,max_depth=6,subsample=0.8,colsample_bytree=0.8,random_state=42,tree_method="hist")
pipe=Pipeline(steps=[("preprocessor",preprocessor),("model",model)])

pipe.fit(X_train,y_train)
y_pred_log=pipe.predict(X_test)

y_true_days=np.exp(y_test)
y_pred_days=np.exp(y_pred_log)

mae=mean_absolute_error(y_true_days,y_pred_days)
rmse=mean_squared_error(y_true_days,y_pred_days)**0.5
r2=r2_score(y_true_days,y_pred_days)

print("\n=== NORMAL – XGBoost (continuous days) ===")
print("MAE:",mae)
print("RMSE:",rmse)
print("R²:",r2)

true_days_int=np.floor(y_true_days).astype(int)
true_hours=np.round((y_true_days-true_days_int)*24).astype(int)
pred_days_int=np.floor(y_pred_days).astype(int)
pred_hours=np.round((y_pred_days-pred_days_int)*24).astype(int)

results_df=X_test.copy()
results_df["true_days"]=true_days_int
results_df["true_hours"]=true_hours
results_df["pred_days"]=pred_days_int
results_df["pred_hours"]=pred_hours
results_df["pred_days_decimal"]=y_pred_days

print("\nSample predictions (first 10):")
print(results_df[["true_days","true_hours","pred_days","pred_hours"]].head(10))

y_true_int=true_days_int
y_pred_int=pred_days_int

int_mse=mean_squared_error(y_true_int,y_pred_int)
int_rmse=int_mse**0.5
int_mae=mean_absolute_error(y_true_int,y_pred_int)

print("\n=== INTEGER-DAY METRICS (XGB, normal) ===")
print("Int MAE:",int_mae)
print("Int MSE:",int_mse)
print("Int RMSE:",int_rmse)

cm=confusion_matrix(y_true_int,y_pred_int)
acc=accuracy_score(y_true_int,y_pred_int)
within_1_day=np.mean(np.abs(y_true_int-y_pred_int)<=1)

print("\nConfusion matrix (integer days):")
print(cm)
print("Exact-day accuracy:",acc)
print("Accuracy within ±1 day:",within_1_day)



=== NORMAL – XGBoost (continuous days) ===
MAE: 0.27418288051469875
RMSE: 0.6124342639044911
R²: 0.6382026793171705

Sample predictions (first 10):
   true_days  true_hours  pred_days  pred_hours
0          1           0          0          24
1          4           0          3           6
2          1           0          0          24
3          1           0          0          24
4          1           0          0          24
5          5           0          2          17
6          1           0          1           0
7          1           0          1           0
8          1           0          0          24
9          1           0          1           0

=== INTEGER-DAY METRICS (XGB, normal) ===
Int MAE: 0.6920101373882886
Int MSE: 0.9537148192610377
Int RMSE: 0.9765832372414743

Confusion matrix (integer days):
[[   0    0    0    0    0    0]
 [3057 2471  121   13    1    0]
 [   0  297  436   36    1    0]
 [   0   69  276  135    0    0]
 [   0   28  188  117   12   

# As per the results i think the best for normal is random forest

# 4. OUTLIERS – SVR

In [6]:
# OUTLIERS – SVR

X_train=pd.read_csv("X_outliers_train.csv")
X_test=pd.read_csv("X_outliers_test.csv")
y_train=pd.read_csv("y_outliers_train.csv")["spell_episode_los"]
y_test=pd.read_csv("y_outliers_test.csv")["spell_episode_los"]

numeric_features=X_train.select_dtypes(include=["int64","float64"]).columns
categorical_features=X_train.select_dtypes(include=["object"]).columns

numeric_transformer=Pipeline(steps=[("imputer",SimpleImputer(strategy="median")),("scaler",StandardScaler())])
categorical_transformer=Pipeline(steps=[("imputer",SimpleImputer(strategy="most_frequent")),("onehot",OneHotEncoder(handle_unknown="ignore"))])
preprocessor=ColumnTransformer(transformers=[("num",numeric_transformer,numeric_features),("cat",categorical_transformer,categorical_features)])

model=SVR(kernel="rbf",C=5.0,epsilon=0.1)
pipe=Pipeline(steps=[("preprocessor",preprocessor),("model",model)])

pipe.fit(X_train,y_train)
y_pred_log=pipe.predict(X_test)

y_true_days=np.exp(y_test)
y_pred_days=np.exp(y_pred_log)

mae=mean_absolute_error(y_true_days,y_pred_days)
rmse=mean_squared_error(y_true_days,y_pred_days)**0.5
r2=r2_score(y_true_days,y_pred_days)

print("\n=== OUTLIER – SVR (continuous days) ===")
print("MAE:",mae)
print("RMSE:",rmse)
print("R²:",r2)

true_days_int=np.floor(y_true_days).astype(int)
true_hours=np.round((y_true_days-true_days_int)*24).astype(int)
pred_days_int=np.floor(y_pred_days).astype(int)
pred_hours=np.round((y_pred_days-pred_days_int)*24).astype(int)

results_df=X_test.copy()
results_df["true_days"]=true_days_int
results_df["true_hours"]=true_hours
results_df["pred_days"]=pred_days_int
results_df["pred_hours"]=pred_hours
results_df["pred_days_decimal"]=y_pred_days

print("\nSample predictions (first 10):")
print(results_df[["true_days","true_hours","pred_days","pred_hours"]].head(10))

y_true_int=true_days_int
y_pred_int=pred_days_int

int_mse=mean_squared_error(y_true_int,y_pred_int)
int_rmse=int_mse**0.5
int_mae=mean_absolute_error(y_true_int,y_pred_int)

print("\n=== INTEGER-DAY METRICS (SVR, outliers) ===")
print("Int MAE:",int_mae)
print("Int MSE:",int_mse)
print("Int RMSE:",int_rmse)

cm=confusion_matrix(y_true_int,y_pred_int)
acc=accuracy_score(y_true_int,y_pred_int)
within_1_day=np.mean(np.abs(y_true_int-y_pred_int)<=1)

print("\nConfusion matrix (integer days):")
print(cm)
print("Exact-day accuracy:",acc)
print("Accuracy within ±1 day:",within_1_day)



=== OUTLIER – SVR (continuous days) ===
MAE: 6.549665962810095
RMSE: 11.90454279979262
R²: 0.08325819176836646

Sample predictions (first 10):
   true_days  true_hours  pred_days  pred_hours
0         11           0          8          16
1          6          24         13          11
2          8           0         12           4
3         12           0         17          14
4         10           0         16          18
5          6          24          8          17
6         19          24         20          19
7         17          24         16           5
8          9           0          8          23
9          6          24         10          16

=== INTEGER-DAY METRICS (SVR, outliers) ===
Int MAE: 6.514318442153494
Int MSE: 142.08132875143184
Int RMSE: 11.919787277943842

Confusion matrix (integer days):
[[ 0  0  0 ...  0  0  0]
 [ 0  0  0 ...  0  0  0]
 [ 1  6 31 ...  0  0  0]
 ...
 [ 0  0  0 ...  0  0  0]
 [ 0  0  0 ...  0  0  0]
 [ 0  0  0 ...  0  0  0]]
Exact-day

# 5. OUTLIERS – LightGBM Regressor

In [7]:
# OUTLIERS – LightGBM Regressor

X_train=pd.read_csv("X_outliers_train.csv")
X_test=pd.read_csv("X_outliers_test.csv")
y_train=pd.read_csv("y_outliers_train.csv")["spell_episode_los"]
y_test=pd.read_csv("y_outliers_test.csv")["spell_episode_los"]

numeric_features=X_train.select_dtypes(include=["int64","float64"]).columns
categorical_features=X_train.select_dtypes(include=["object"]).columns

numeric_transformer=Pipeline(steps=[("imputer",SimpleImputer(strategy="median")),("scaler",StandardScaler())])
categorical_transformer=Pipeline(steps=[("imputer",SimpleImputer(strategy="most_frequent")),("onehot",OneHotEncoder(handle_unknown="ignore"))])
preprocessor=ColumnTransformer(transformers=[("num",numeric_transformer,numeric_features),("cat",categorical_transformer,categorical_features)])

model=LGBMRegressor(n_estimators=400,learning_rate=0.05,num_leaves=31,random_state=42,verbose=-1)
pipe=Pipeline(steps=[("preprocessor",preprocessor),("model",model)])

pipe.fit(X_train,y_train)
y_pred_log=pipe.predict(X_test)

y_true_days=np.exp(y_test)
y_pred_days=np.exp(y_pred_log)

mae=mean_absolute_error(y_true_days,y_pred_days)
rmse=mean_squared_error(y_true_days,y_pred_days)**0.5
r2=r2_score(y_true_days,y_pred_days)

print("\n=== OUTLIER – LightGBM (continuous days) ===")
print("MAE:",mae)
print("RMSE:",rmse)
print("R²:",r2)

true_days_int=np.floor(y_true_days).astype(int)
true_hours=np.round((y_true_days-true_days_int)*24).astype(int)
pred_days_int=np.floor(y_pred_days).astype(int)
pred_hours=np.round((y_pred_days-pred_days_int)*24).astype(int)

results_df=X_test.copy()
results_df["true_days"]=true_days_int
results_df["true_hours"]=true_hours
results_df["pred_days"]=pred_days_int
results_df["pred_hours"]=pred_hours
results_df["pred_days_decimal"]=y_pred_days

print("\nSample predictions (first 10):")
print(results_df[["true_days","true_hours","pred_days","pred_hours"]].head(10))

y_true_int=true_days_int
y_pred_int=pred_days_int

int_mse=mean_squared_error(y_true_int,y_pred_int)
int_rmse=int_mse**0.5
int_mae=mean_absolute_error(y_true_int,y_pred_int)

print("\n=== INTEGER-DAY METRICS (LGBM, outliers) ===")
print("Int MAE:",int_mae)
print("Int MSE:",int_mse)
print("Int RMSE:",int_rmse)

cm=confusion_matrix(y_true_int,y_pred_int)
acc=accuracy_score(y_true_int,y_pred_int)
within_1_day=np.mean(np.abs(y_true_int-y_pred_int)<=1)

print("\nConfusion matrix (integer days):")
print(cm)
print("Exact-day accuracy:",acc)
print("Accuracy within ±1 day:",within_1_day)



=== OUTLIER – LightGBM (continuous days) ===
MAE: 6.288392035426221
RMSE: 11.564608946643167
R²: 0.13486576130939998

Sample predictions (first 10):
   true_days  true_hours  pred_days  pred_hours
0         11           0          8          20
1          6          24         21          10
2          8           0          9          17
3         12           0         16           6
4         10           0         14           5
5          6          24          7          18
6         19          24         22          10
7         17          24          9          12
8          9           0          6           7
9          6          24         11           2

=== INTEGER-DAY METRICS (LGBM, outliers) ===
Int MAE: 6.270332187857961
Int MSE: 134.37571592210767
Int RMSE: 11.592053999275006

Confusion matrix (integer days):
[[ 0  0  0 ...  0  0  0]
 [ 0  0  0 ...  0  0  0]
 [ 1 10 24 ...  0  0  0]
 ...
 [ 0  0  0 ...  0  0  0]
 [ 0  0  0 ...  0  0  0]
 [ 0  0  0 ...  0  0  0]]
Ex



# 6. OUTLIERS – KNN Regressor

In [8]:
# OUTLIERS – KNN Regressor

X_train=pd.read_csv("X_outliers_train.csv")
X_test=pd.read_csv("X_outliers_test.csv")
y_train=pd.read_csv("y_outliers_train.csv")["spell_episode_los"]
y_test=pd.read_csv("y_outliers_test.csv")["spell_episode_los"]

numeric_features=X_train.select_dtypes(include=["int64","float64"]).columns
categorical_features=X_train.select_dtypes(include=["object"]).columns

numeric_transformer=Pipeline(steps=[("imputer",SimpleImputer(strategy="median")),("scaler",StandardScaler())])
categorical_transformer=Pipeline(steps=[("imputer",SimpleImputer(strategy="most_frequent")),("onehot",OneHotEncoder(handle_unknown="ignore"))])
preprocessor=ColumnTransformer(transformers=[("num",numeric_transformer,numeric_features),("cat",categorical_transformer,categorical_features)])

model=KNeighborsRegressor(n_neighbors=10,weights="distance")
pipe=Pipeline(steps=[("preprocessor",preprocessor),("model",model)])

pipe.fit(X_train,y_train)
y_pred_log=pipe.predict(X_test)

y_true_days=np.exp(y_test)
y_pred_days=np.exp(y_pred_log)

mae=mean_absolute_error(y_true_days,y_pred_days)
rmse=mean_squared_error(y_true_days,y_pred_days)**0.5
r2=r2_score(y_true_days,y_pred_days)

print("\n=== OUTLIER – KNN (continuous days) ===")
print("MAE:",mae)
print("RMSE:",rmse)
print("R²:",r2)

true_days_int=np.floor(y_true_days).astype(int)
true_hours=np.round((y_true_days-true_days_int)*24).astype(int)
pred_days_int=np.floor(y_pred_days).astype(int)
pred_hours=np.round((y_pred_days-pred_days_int)*24).astype(int)

results_df=X_test.copy()
results_df["true_days"]=true_days_int
results_df["true_hours"]=true_hours
results_df["pred_days"]=pred_days_int
results_df["pred_hours"]=pred_hours
results_df["pred_days_decimal"]=y_pred_days

print("\nSample predictions (first 10):")
print(results_df[["true_days","true_hours","pred_days","pred_hours"]].head(10))

y_true_int=true_days_int
y_pred_int=pred_days_int

int_mse=mean_squared_error(y_true_int,y_pred_int)
int_rmse=int_mse**0.5
int_mae=mean_absolute_error(y_true_int,y_pred_int)

print("\n=== INTEGER-DAY METRICS (KNN, outliers) ===")
print("Int MAE:",int_mae)
print("Int MSE:",int_mse)
print("Int RMSE:",int_rmse)

cm=confusion_matrix(y_true_int,y_pred_int)
acc=accuracy_score(y_true_int,y_pred_int)
within_1_day=np.mean(np.abs(y_true_int-y_pred_int)<=1)

print("\nConfusion matrix (integer days):")
print(cm)
print("Exact-day accuracy:",acc)
print("Accuracy within ±1 day:",within_1_day)



=== OUTLIER – KNN (continuous days) ===
MAE: 6.456946318945638
RMSE: 12.180224268627319
R²: 0.040307355818123614

Sample predictions (first 10):
   true_days  true_hours  pred_days  pred_hours
0         11           0          8          23
1          6          24         10           4
2          8           0         13           5
3         12           0         13           9
4         10           0         14           9
5          6          24         11           9
6         19          24         15          24
7         17          24          9          17
8          9           0         10          16
9          6          24          9          17

=== INTEGER-DAY METRICS (KNN, outliers) ===
Int MAE: 6.419243986254296
Int MSE: 148.971363115693
Int RMSE: 12.205382546880413

Confusion matrix (integer days):
[[35 37 41 ...  0  0  0]
 [ 0  0  0 ...  0  0  0]
 [11 14 12 ...  0  0  0]
 ...
 [ 0  0  0 ...  0  0  0]
 [ 0  0  0 ...  0  0  0]
 [ 0  0  0 ...  0  0  0]]
Exact-day

# 7. OUTLIER – XGBoost Regressor

In [12]:
import pandas as pd
import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score, confusion_matrix, accuracy_score
from xgboost import XGBRegressor

X_train = pd.read_csv("X_outliers_train.csv")
X_test = pd.read_csv("X_outliers_test.csv")
y_train = pd.read_csv("y_outliers_train.csv")["spell_episode_los"]
y_test = pd.read_csv("y_outliers_test.csv")["spell_episode_los"]

numeric_features = X_train.select_dtypes(include=["int64", "float64"]).columns
categorical_features = X_train.select_dtypes(include=["object"]).columns

numeric_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore"))
])

preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numeric_features),
        ("cat", categorical_transformer, categorical_features)
    ]
)

xgb_model = XGBRegressor(
    objective="reg:squarederror",
    n_estimators=600,
    learning_rate=0.03,
    max_depth=10,
    subsample=0.9,
    colsample_bytree=0.8,
    reg_alpha=1,
    reg_lambda=2,
    tree_method="hist",
    random_state=42
)

pipe = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("model", xgb_model)
])

pipe.fit(X_train, y_train)
y_pred_log = pipe.predict(X_test)

y_true_days = np.exp(y_test)
y_pred_days = np.exp(y_pred_log)

mae = mean_absolute_error(y_true_days, y_pred_days)
rmse = mean_squared_error(y_true_days, y_pred_days)**0.5
r2 = r2_score(y_true_days, y_pred_days)

print("\n=== OUTLIER – XGBoost (continuous days) ===")
print("MAE:", mae)
print("RMSE:", rmse)
print("R²:", r2)

true_days_int = np.floor(y_true_days).astype(int)
true_hours = np.round((y_true_days - true_days_int) * 24).astype(int)
pred_days_int = np.floor(y_pred_days).astype(int)
pred_hours = np.round((y_pred_days - pred_days_int) * 24).astype(int)

results_df = X_test.copy()
results_df["true_days"] = true_days_int
results_df["true_hours"] = true_hours
results_df["pred_days"] = pred_days_int
results_df["pred_hours"] = pred_hours

print("\nSample predictions (first 10):")
print(results_df[["true_days", "true_hours", "pred_days", "pred_hours"]].head(10))

y_true_int = true_days_int
y_pred_int = pred_days_int

int_mae = mean_absolute_error(y_true_int, y_pred_int)
int_mse = mean_squared_error(y_true_int, y_pred_int)
int_rmse = int_mse**0.5
cm = confusion_matrix(y_true_int, y_pred_int)
acc_exact = accuracy_score(y_true_int, y_pred_int)
acc_pm1 = np.mean(np.abs(y_true_int - y_pred_int) <= 1)

print("\n=== INTEGER-DAY METRICS (XGBoost, outliers) ===")
print("Int MAE:", int_mae)
print("Int MSE:", int_mse)
print("Int RMSE:", int_rmse)
print("\nConfusion matrix:")
print(cm)
print("Exact-day accuracy:", acc_exact)
print("Accuracy within ±1 day:", acc_pm1)



=== OUTLIER – XGBoost (continuous days) ===
MAE: 6.1782480640782795
RMSE: 11.526699204889399
R²: 0.14052842747674232

Sample predictions (first 10):
   true_days  true_hours  pred_days  pred_hours
0         11           0         10           1
1          6          24         16           5
2          8           0          9          19
3         12           0         16          23
4         10           0         13          17
5          6          24          8          15
6         19          24         26          22
7         17          24          9           8
8          9           0          7          13
9          6          24         10           7

=== INTEGER-DAY METRICS (XGBoost, outliers) ===
Int MAE: 6.13631156930126
Int MSE: 133.19931271477662
Int RMSE: 11.541200661749913

Confusion matrix:
[[ 0  0  0 ...  0  0  0]
 [ 3 27 51 ...  0  0  0]
 [ 0  0  0 ...  0  0  0]
 ...
 [ 0  0  0 ...  0  0  0]
 [ 0  0  0 ...  0  0  0]
 [ 0  0  0 ...  0  0  0]]
Exact-day accur

# 8. OUTLIERS – CatBoost Regressor

In [13]:
import pandas as pd
import numpy as np
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score, confusion_matrix, accuracy_score
from catboost import CatBoostRegressor

X_train = pd.read_csv("X_outliers_train.csv")
X_test = pd.read_csv("X_outliers_test.csv")
y_train = pd.read_csv("y_outliers_train.csv")["spell_episode_los"]
y_test = pd.read_csv("y_outliers_test.csv")["spell_episode_los"]

categorical_indices = np.where(X_train.dtypes == "object")[0]

cb_model = CatBoostRegressor(
    depth=8,
    learning_rate=0.05,
    loss_function="RMSE",
    n_estimators=500,
    random_seed=42,
    verbose=100
)

cb_model.fit(
    X_train,
    y_train,
    cat_features=categorical_indices,
    eval_set=(X_test, y_test),
    use_best_model=True
)

y_pred_log = cb_model.predict(X_test)

y_true_days = np.exp(y_test)
y_pred_days = np.exp(y_pred_log)

mae = mean_absolute_error(y_true_days, y_pred_days)
rmse = mean_squared_error(y_true_days, y_pred_days)**0.5
r2 = r2_score(y_true_days, y_pred_days)

print("\n=== OUTLIER – CatBoost_Outliers (continuous days) ===")
print("MAE:", mae)
print("RMSE:", rmse)
print("R²:", r2)

true_days_int = np.floor(y_true_days).astype(int)
true_hours = np.round((y_true_days - true_days_int) * 24).astype(int)
pred_days_int = np.floor(y_pred_days).astype(int)
pred_hours = np.round((y_pred_days - pred_days_int) * 24).astype(int)

results_df = X_test.copy()
results_df["true_days"] = true_days_int
results_df["true_hours"] = true_hours
results_df["pred_days"] = pred_days_int
results_df["pred_hours"] = pred_hours

print("\nSample predictions (first 10):")
print(results_df[["true_days", "true_hours", "pred_days", "pred_hours"]].head(10))

y_true_int = true_days_int
y_pred_int = pred_days_int

int_mae = mean_absolute_error(y_true_int, y_pred_int)
int_mse = mean_squared_error(y_true_int, y_pred_int)
int_rmse = int_mse**0.5

cm = confusion_matrix(y_true_int, y_pred_int)
acc_exact = accuracy_score(y_true_int, y_pred_int)
acc_pm1 = np.mean(np.abs(y_true_int - y_pred_int) <= 1)

print("\n=== INTEGER-DAY METRICS (CatBoost_Outliers) ===")
print("Int MAE:", int_mae)
print("Int MSE:", int_mse)
print("Int RMSE:", int_rmse)
print("\nConfusion matrix:")
print(cm)
print("Exact-day accuracy:", acc_exact)
print("Accuracy within ±1 day:", acc_pm1)


0:	learn: 0.5709300	test: 0.5878208	best: 0.5878208 (0)	total: 85.5ms	remaining: 42.6s
100:	learn: 0.4355773	test: 0.5232511	best: 0.5232511 (100)	total: 9.75s	remaining: 38.5s
200:	learn: 0.3945914	test: 0.5221826	best: 0.5219892 (168)	total: 19.7s	remaining: 29.3s
300:	learn: 0.3557565	test: 0.5230698	best: 0.5212169 (245)	total: 30.8s	remaining: 20.3s
400:	learn: 0.3262781	test: 0.5234936	best: 0.5212169 (245)	total: 41.3s	remaining: 10.2s
499:	learn: 0.3024313	test: 0.5237225	best: 0.5212169 (245)	total: 51.2s	remaining: 0us

bestTest = 0.5212169001
bestIteration = 245

Shrink model to first 246 iterations.

=== OUTLIER – CatBoost_Outliers (continuous days) ===
MAE: 6.15712086644807
RMSE: 11.603231805753916
R²: 0.12907745406320736

Sample predictions (first 10):
   true_days  true_hours  pred_days  pred_hours
0         11           0         10           8
1          6          24         14           1
2          8           0          9          11
3         12           0       

# As of for outliers we can keep our first option as xgboost as it preformed well in every thing and also we may consider catboost for its accuracy but the rmse value is low compared to xgboost.

# 8. COMBINED – CatBoost Regressor (Normal + Outliers)

In [9]:
# COMBINED – CatBoost Regressor (normal + outliers merged)

Xn_train=pd.read_csv("X_normal_train.csv")
Xn_test=pd.read_csv("X_normal_test.csv")
yn_train=pd.read_csv("y_normal_train.csv")["spell_episode_los"]
yn_test=pd.read_csv("y_normal_test.csv")["spell_episode_los"]

Xo_train=pd.read_csv("X_outliers_train.csv")
Xo_test=pd.read_csv("X_outliers_test.csv")
yo_train=pd.read_csv("y_outliers_train.csv")["spell_episode_los"]
yo_test=pd.read_csv("y_outliers_test.csv")["spell_episode_los"]

X_train=pd.concat([Xn_train,Xo_train],axis=0).reset_index(drop=True)
X_test=pd.concat([Xn_test,Xo_test],axis=0).reset_index(drop=True)
y_train=pd.concat([yn_train,yo_train],axis=0).reset_index(drop=True)
y_test=pd.concat([yn_test,yo_test],axis=0).reset_index(drop=True)

categorical_features=np.where(X_train.dtypes=="object")[0]

model=CatBoostRegressor(depth=8,learning_rate=0.05,loss_function="RMSE",n_estimators=500,random_seed=42,verbose=100)
model.fit(X_train,y_train,cat_features=categorical_features,eval_set=(X_test,y_test),use_best_model=True)

y_pred_log=model.predict(X_test)

y_true_days=np.exp(y_test)
y_pred_days=np.exp(y_pred_log)

mae=mean_absolute_error(y_true_days,y_pred_days)
rmse=mean_squared_error(y_true_days,y_pred_days)**0.5
r2=r2_score(y_true_days,y_pred_days)

print("\n=== COMBINED – CatBoost (continuous days) ===")
print("MAE:",mae)
print("RMSE:",rmse)
print("R²:",r2)

true_days_int=np.floor(y_true_days).astype(int)
true_hours=np.round((y_true_days-true_days_int)*24).astype(int)
pred_days_int=np.floor(y_pred_days).astype(int)
pred_hours=np.round((y_pred_days-pred_days_int)*24).astype(int)

results_df=X_test.copy()
results_df["true_days"]=true_days_int
results_df["true_hours"]=true_hours
results_df["pred_days"]=pred_days_int
results_df["pred_hours"]=pred_hours
results_df["pred_days_decimal"]=y_pred_days

print("\nSample predictions (first 10):")
print(results_df[["true_days","true_hours","pred_days","pred_hours"]].head(10))

y_true_int=true_days_int
y_pred_int=pred_days_int

int_mse=mean_squared_error(y_true_int,y_pred_int)
int_rmse=int_mse**0.5
int_mae=mean_absolute_error(y_true_int,y_pred_int)

print("\n=== INTEGER-DAY METRICS (CatBoost, combined) ===")
print("Int MAE:",int_mae)
print("Int MSE:",int_mse)
print("Int RMSE:",int_rmse)

cm=confusion_matrix(y_true_int,y_pred_int)
acc=accuracy_score(y_true_int,y_pred_int)
within_1_day=np.mean(np.abs(y_true_int-y_pred_int)<=1)

print("\nConfusion matrix (integer days):")
print(cm)
print("Exact-day accuracy:",acc)
print("Accuracy within ±1 day:",within_1_day)


0:	learn: 0.7855462	test: 0.7953593	best: 0.7953593 (0)	total: 310ms	remaining: 2m 34s
100:	learn: 0.2856519	test: 0.3045991	best: 0.3045991 (100)	total: 13s	remaining: 51.4s
200:	learn: 0.2725077	test: 0.2986104	best: 0.2986104 (200)	total: 25.8s	remaining: 38.4s
300:	learn: 0.2644401	test: 0.2963805	best: 0.2963805 (300)	total: 37.9s	remaining: 25.1s
400:	learn: 0.2568369	test: 0.2946680	best: 0.2946680 (400)	total: 50s	remaining: 12.4s
499:	learn: 0.2506862	test: 0.2939606	best: 0.2939369 (496)	total: 1m 4s	remaining: 0us

bestTest = 0.2939368593
bestIteration = 496

Shrink model to first 497 iterations.

=== COMBINED – CatBoost (continuous days) ===
MAE: 0.8937557801908974
RMSE: 3.834997107756231
R²: 0.5397387457851166

Sample predictions (first 10):
   true_days  true_hours  pred_days  pred_hours
0          1           0          1           0
1          4           0          3           6
2          1           0          1           0
3          1           0          0        

# This is the final combined model which i have used catboost.