## Install necessary packages

In [1]:
!pip install catboost



# Imports

In [2]:
import pandas as pd
import numpy as np

from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder,StandardScaler
from sklearn.impute import SimpleImputer

from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor

from sklearn.metrics import (
    mean_absolute_error,
    mean_squared_error,
    r2_score,
    confusion_matrix,
    accuracy_score
)

from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor


# 1. NORMAL – Linear Regression

In [3]:
X_train=pd.read_csv("X_normal_train.csv")
X_test=pd.read_csv("X_normal_test.csv")
y_train=pd.read_csv("y_normal_train.csv")["spell_episode_los"]
y_test=pd.read_csv("y_normal_test.csv")["spell_episode_los"]

numeric_features=X_train.select_dtypes(include=["int64","float64"]).columns
categorical_features=X_train.select_dtypes(include=["object"]).columns

numeric_transformer=Pipeline(steps=[("imputer",SimpleImputer(strategy="median")),("scaler",StandardScaler())])
categorical_transformer=Pipeline(steps=[("imputer",SimpleImputer(strategy="most_frequent")),("onehot",OneHotEncoder(handle_unknown="ignore"))])
preprocessor=ColumnTransformer(transformers=[("num",numeric_transformer,numeric_features),("cat",categorical_transformer,categorical_features)])

model=LinearRegression()
pipe=Pipeline(steps=[("preprocessor",preprocessor),("model",model)])

pipe.fit(X_train,y_train)
y_pred_log=pipe.predict(X_test)

y_true_days=np.exp(y_test)
y_pred_days=np.exp(y_pred_log)

mae=mean_absolute_error(y_true_days,y_pred_days)
rmse=mean_squared_error(y_true_days,y_pred_days)**0.5
r2=r2_score(y_true_days,y_pred_days)

print("\n=== NORMAL – Linear Regression (continuous days) ===")
print("MAE:",mae)
print("RMSE:",rmse)
print("R²:",r2)

true_days_int=np.floor(y_true_days).astype(int)
true_hours=np.round((y_true_days-true_days_int)*24).astype(int)
pred_days_int=np.floor(y_pred_days).astype(int)
pred_hours=np.round((y_pred_days-pred_days_int)*24).astype(int)

results_df=X_test.copy()
results_df["true_days"]=true_days_int
results_df["true_hours"]=true_hours
results_df["pred_days"]=pred_days_int
results_df["pred_hours"]=pred_hours
results_df["pred_days_decimal"]=y_pred_days

print("\nSample predictions (first 10):")
print(results_df[["true_days","true_hours","pred_days","pred_hours"]].head(10))

y_true_int=true_days_int
y_pred_int=pred_days_int

int_mse=mean_squared_error(y_true_int,y_pred_int)
int_rmse=int_mse**0.5
int_mae=mean_absolute_error(y_true_int,y_pred_int)

print("\n=== INTEGER-DAY METRICS (Linear, normal) ===")
print("Int MAE:",int_mae)
print("Int MSE:",int_mse)
print("Int RMSE:",int_rmse)

cm=confusion_matrix(y_true_int,y_pred_int)
acc=accuracy_score(y_true_int,y_pred_int)
within_1_day=np.mean(np.abs(y_true_int-y_pred_int)<=1)

print("\nConfusion matrix (integer days):")
print(cm)
print("Exact-day accuracy:",acc)
print("Accuracy within ±1 day:",within_1_day)


=== NORMAL – Linear Regression (continuous days) ===
MAE: 0.3109199619868224
RMSE: 0.6492777472943925
R²: 0.5933625034282147

Sample predictions (first 10):
   true_days  true_hours  pred_days  pred_hours
0          1           0          0          24
1          4           0          3           8
2          1           0          1           1
3          1           0          1           1
4          1           0          0          23
5          5           0          3           2
6          1           0          1           0
7          1           0          1           1
8          1           0          0          24
9          1           0          1           0

=== INTEGER-DAY METRICS (Linear, normal) ===
Int MAE: 0.5764972655728958
Int MSE: 0.8536748032546352
Int RMSE: 0.9239452382336494

Confusion matrix (integer days):
[[   0    0    0    0    0    0]
 [2060 3436  148   18    1    0]
 [   2  337  376   52    2    1]
 [   1   83  248  136   11    1]
 [   0   37  172 

# 2. NORMAL – Random Forest

In [None]:
X_train=pd.read_csv("X_normal_train.csv")
X_test=pd.read_csv("X_normal_test.csv")
y_train=pd.read_csv("y_normal_train.csv")["spell_episode_los"]
y_test=pd.read_csv("y_normal_test.csv")["spell_episode_los"]

numeric_features=X_train.select_dtypes(include=["int64","float64"]).columns
categorical_features=X_train.select_dtypes(include=["object"]).columns

numeric_transformer=Pipeline(steps=[("imputer",SimpleImputer(strategy="median")),("scaler",StandardScaler())])
categorical_transformer=Pipeline(steps=[("imputer",SimpleImputer(strategy="most_frequent")),("onehot",OneHotEncoder(handle_unknown="ignore"))])
preprocessor=ColumnTransformer(transformers=[("num",numeric_transformer,numeric_features),("cat",categorical_transformer,categorical_features)])

model=RandomForestRegressor(n_estimators=300,max_depth=None,random_state=42,n_jobs=-1)
pipe=Pipeline(steps=[("preprocessor",preprocessor),("model",model)])

pipe.fit(X_train,y_train)
y_pred_log=pipe.predict(X_test)

y_true_days=np.exp(y_test)
y_pred_days=np.exp(y_pred_log)

mae=mean_absolute_error(y_true_days,y_pred_days)
rmse=mean_squared_error(y_true_days,y_pred_days)**0.5
r2=r2_score(y_true_days,y_pred_days)

print("\n=== NORMAL – Random Forest (continuous days) ===")
print("MAE:",mae)
print("RMSE:",rmse)
print("R²:",r2)

true_days_int=np.floor(y_true_days).astype(int)
true_hours=np.round((y_true_days-true_days_int)*24).astype(int)
pred_days_int=np.floor(y_pred_days).astype(int)
pred_hours=np.round((y_pred_days-pred_days_int)*24).astype(int)

results_df=X_test.copy()
results_df["true_days"]=true_days_int
results_df["true_hours"]=true_hours
results_df["pred_days"]=pred_days_int
results_df["pred_hours"]=pred_hours
results_df["pred_days_decimal"]=y_pred_days

print("\nSample predictions (first 10):")
print(results_df[["true_days","true_hours","pred_days","pred_hours"]].head(10))

y_true_int=true_days_int
y_pred_int=pred_days_int

int_mse=mean_squared_error(y_true_int,y_pred_int)
int_rmse=int_mse**0.5
int_mae=mean_absolute_error(y_true_int,y_pred_int)

print("\n=== INTEGER-DAY METRICS (RF, normal) ===")
print("Int MAE:",int_mae)
print("Int MSE:",int_mse)
print("Int RMSE:",int_rmse)

cm=confusion_matrix(y_true_int,y_pred_int)
acc=accuracy_score(y_true_int,y_pred_int)
within_1_day=np.mean(np.abs(y_true_int-y_pred_int)<=1)

print("\nConfusion matrix (integer days):")
print(cm)
print("Exact-day accuracy:",acc)
print("Accuracy within ±1 day:",within_1_day)


# 3. NORMAL – XGBoost Regressor

In [None]:
# NORMAL – XGBoost Regressor

X_train=pd.read_csv("X_normal_train.csv")
X_test=pd.read_csv("X_normal_test.csv")
y_train=pd.read_csv("y_normal_train.csv")["spell_episode_los"]
y_test=pd.read_csv("y_normal_test.csv")["spell_episode_los"]

numeric_features=X_train.select_dtypes(include=["int64","float64"]).columns
categorical_features=X_train.select_dtypes(include=["object"]).columns

numeric_transformer=Pipeline(steps=[("imputer",SimpleImputer(strategy="median")),("scaler",StandardScaler())])
categorical_transformer=Pipeline(steps=[("imputer",SimpleImputer(strategy="most_frequent")),("onehot",OneHotEncoder(handle_unknown="ignore"))])
preprocessor=ColumnTransformer(transformers=[("num",numeric_transformer,numeric_features),("cat",categorical_transformer,categorical_features)])

model=XGBRegressor(objective="reg:squarederror",n_estimators=400,learning_rate=0.05,max_depth=6,subsample=0.8,colsample_bytree=0.8,random_state=42,tree_method="hist")
pipe=Pipeline(steps=[("preprocessor",preprocessor),("model",model)])

pipe.fit(X_train,y_train)
y_pred_log=pipe.predict(X_test)

y_true_days=np.exp(y_test)
y_pred_days=np.exp(y_pred_log)

mae=mean_absolute_error(y_true_days,y_pred_days)
rmse=mean_squared_error(y_true_days,y_pred_days)**0.5
r2=r2_score(y_true_days,y_pred_days)

print("\n=== NORMAL – XGBoost (continuous days) ===")
print("MAE:",mae)
print("RMSE:",rmse)
print("R²:",r2)

true_days_int=np.floor(y_true_days).astype(int)
true_hours=np.round((y_true_days-true_days_int)*24).astype(int)
pred_days_int=np.floor(y_pred_days).astype(int)
pred_hours=np.round((y_pred_days-pred_days_int)*24).astype(int)

results_df=X_test.copy()
results_df["true_days"]=true_days_int
results_df["true_hours"]=true_hours
results_df["pred_days"]=pred_days_int
results_df["pred_hours"]=pred_hours
results_df["pred_days_decimal"]=y_pred_days

print("\nSample predictions (first 10):")
print(results_df[["true_days","true_hours","pred_days","pred_hours"]].head(10))

y_true_int=true_days_int
y_pred_int=pred_days_int

int_mse=mean_squared_error(y_true_int,y_pred_int)
int_rmse=int_mse**0.5
int_mae=mean_absolute_error(y_true_int,y_pred_int)

print("\n=== INTEGER-DAY METRICS (XGB, normal) ===")
print("Int MAE:",int_mae)
print("Int MSE:",int_mse)
print("Int RMSE:",int_rmse)

cm=confusion_matrix(y_true_int,y_pred_int)
acc=accuracy_score(y_true_int,y_pred_int)
within_1_day=np.mean(np.abs(y_true_int-y_pred_int)<=1)

print("\nConfusion matrix (integer days):")
print(cm)
print("Exact-day accuracy:",acc)
print("Accuracy within ±1 day:",within_1_day)


# 4. OUTLIERS – SVR

In [None]:
# OUTLIERS – SVR

X_train=pd.read_csv("X_outliers_train.csv")
X_test=pd.read_csv("X_outliers_test.csv")
y_train=pd.read_csv("y_outliers_train.csv")["spell_episode_los"]
y_test=pd.read_csv("y_outliers_test.csv")["spell_episode_los"]

numeric_features=X_train.select_dtypes(include=["int64","float64"]).columns
categorical_features=X_train.select_dtypes(include=["object"]).columns

numeric_transformer=Pipeline(steps=[("imputer",SimpleImputer(strategy="median")),("scaler",StandardScaler())])
categorical_transformer=Pipeline(steps=[("imputer",SimpleImputer(strategy="most_frequent")),("onehot",OneHotEncoder(handle_unknown="ignore"))])
preprocessor=ColumnTransformer(transformers=[("num",numeric_transformer,numeric_features),("cat",categorical_transformer,categorical_features)])

model=SVR(kernel="rbf",C=5.0,epsilon=0.1)
pipe=Pipeline(steps=[("preprocessor",preprocessor),("model",model)])

pipe.fit(X_train,y_train)
y_pred_log=pipe.predict(X_test)

y_true_days=np.exp(y_test)
y_pred_days=np.exp(y_pred_log)

mae=mean_absolute_error(y_true_days,y_pred_days)
rmse=mean_squared_error(y_true_days,y_pred_days)**0.5
r2=r2_score(y_true_days,y_pred_days)

print("\n=== OUTLIER – SVR (continuous days) ===")
print("MAE:",mae)
print("RMSE:",rmse)
print("R²:",r2)

true_days_int=np.floor(y_true_days).astype(int)
true_hours=np.round((y_true_days-true_days_int)*24).astype(int)
pred_days_int=np.floor(y_pred_days).astype(int)
pred_hours=np.round((y_pred_days-pred_days_int)*24).astype(int)

results_df=X_test.copy()
results_df["true_days"]=true_days_int
results_df["true_hours"]=true_hours
results_df["pred_days"]=pred_days_int
results_df["pred_hours"]=pred_hours
results_df["pred_days_decimal"]=y_pred_days

print("\nSample predictions (first 10):")
print(results_df[["true_days","true_hours","pred_days","pred_hours"]].head(10))

y_true_int=true_days_int
y_pred_int=pred_days_int

int_mse=mean_squared_error(y_true_int,y_pred_int)
int_rmse=int_mse**0.5
int_mae=mean_absolute_error(y_true_int,y_pred_int)

print("\n=== INTEGER-DAY METRICS (SVR, outliers) ===")
print("Int MAE:",int_mae)
print("Int MSE:",int_mse)
print("Int RMSE:",int_rmse)

cm=confusion_matrix(y_true_int,y_pred_int)
acc=accuracy_score(y_true_int,y_pred_int)
within_1_day=np.mean(np.abs(y_true_int-y_pred_int)<=1)

print("\nConfusion matrix (integer days):")
print(cm)
print("Exact-day accuracy:",acc)
print("Accuracy within ±1 day:",within_1_day)


# 5. OUTLIERS – LightGBM Regressor

In [None]:
# OUTLIERS – LightGBM Regressor

X_train=pd.read_csv("X_outliers_train.csv")
X_test=pd.read_csv("X_outliers_test.csv")
y_train=pd.read_csv("y_outliers_train.csv")["spell_episode_los"]
y_test=pd.read_csv("y_outliers_test.csv")["spell_episode_los"]

numeric_features=X_train.select_dtypes(include=["int64","float64"]).columns
categorical_features=X_train.select_dtypes(include=["object"]).columns

numeric_transformer=Pipeline(steps=[("imputer",SimpleImputer(strategy="median")),("scaler",StandardScaler())])
categorical_transformer=Pipeline(steps=[("imputer",SimpleImputer(strategy="most_frequent")),("onehot",OneHotEncoder(handle_unknown="ignore"))])
preprocessor=ColumnTransformer(transformers=[("num",numeric_transformer,numeric_features),("cat",categorical_transformer,categorical_features)])

model=LGBMRegressor(n_estimators=400,learning_rate=0.05,num_leaves=31,random_state=42,verbose=-1)
pipe=Pipeline(steps=[("preprocessor",preprocessor),("model",model)])

pipe.fit(X_train,y_train)
y_pred_log=pipe.predict(X_test)

y_true_days=np.exp(y_test)
y_pred_days=np.exp(y_pred_log)

mae=mean_absolute_error(y_true_days,y_pred_days)
rmse=mean_squared_error(y_true_days,y_pred_days)**0.5
r2=r2_score(y_true_days,y_pred_days)

print("\n=== OUTLIER – LightGBM (continuous days) ===")
print("MAE:",mae)
print("RMSE:",rmse)
print("R²:",r2)

true_days_int=np.floor(y_true_days).astype(int)
true_hours=np.round((y_true_days-true_days_int)*24).astype(int)
pred_days_int=np.floor(y_pred_days).astype(int)
pred_hours=np.round((y_pred_days-pred_days_int)*24).astype(int)

results_df=X_test.copy()
results_df["true_days"]=true_days_int
results_df["true_hours"]=true_hours
results_df["pred_days"]=pred_days_int
results_df["pred_hours"]=pred_hours
results_df["pred_days_decimal"]=y_pred_days

print("\nSample predictions (first 10):")
print(results_df[["true_days","true_hours","pred_days","pred_hours"]].head(10))

y_true_int=true_days_int
y_pred_int=pred_days_int

int_mse=mean_squared_error(y_true_int,y_pred_int)
int_rmse=int_mse**0.5
int_mae=mean_absolute_error(y_true_int,y_pred_int)

print("\n=== INTEGER-DAY METRICS (LGBM, outliers) ===")
print("Int MAE:",int_mae)
print("Int MSE:",int_mse)
print("Int RMSE:",int_rmse)

cm=confusion_matrix(y_true_int,y_pred_int)
acc=accuracy_score(y_true_int,y_pred_int)
within_1_day=np.mean(np.abs(y_true_int-y_pred_int)<=1)

print("\nConfusion matrix (integer days):")
print(cm)
print("Exact-day accuracy:",acc)
print("Accuracy within ±1 day:",within_1_day)


# 6. OUTLIERS – KNN Regressor

In [None]:
# OUTLIERS – KNN Regressor

X_train=pd.read_csv("X_outliers_train.csv")
X_test=pd.read_csv("X_outliers_test.csv")
y_train=pd.read_csv("y_outliers_train.csv")["spell_episode_los"]
y_test=pd.read_csv("y_outliers_test.csv")["spell_episode_los"]

numeric_features=X_train.select_dtypes(include=["int64","float64"]).columns
categorical_features=X_train.select_dtypes(include=["object"]).columns

numeric_transformer=Pipeline(steps=[("imputer",SimpleImputer(strategy="median")),("scaler",StandardScaler())])
categorical_transformer=Pipeline(steps=[("imputer",SimpleImputer(strategy="most_frequent")),("onehot",OneHotEncoder(handle_unknown="ignore"))])
preprocessor=ColumnTransformer(transformers=[("num",numeric_transformer,numeric_features),("cat",categorical_transformer,categorical_features)])

model=KNeighborsRegressor(n_neighbors=10,weights="distance")
pipe=Pipeline(steps=[("preprocessor",preprocessor),("model",model)])

pipe.fit(X_train,y_train)
y_pred_log=pipe.predict(X_test)

y_true_days=np.exp(y_test)
y_pred_days=np.exp(y_pred_log)

mae=mean_absolute_error(y_true_days,y_pred_days)
rmse=mean_squared_error(y_true_days,y_pred_days)**0.5
r2=r2_score(y_true_days,y_pred_days)

print("\n=== OUTLIER – KNN (continuous days) ===")
print("MAE:",mae)
print("RMSE:",rmse)
print("R²:",r2)

true_days_int=np.floor(y_true_days).astype(int)
true_hours=np.round((y_true_days-true_days_int)*24).astype(int)
pred_days_int=np.floor(y_pred_days).astype(int)
pred_hours=np.round((y_pred_days-pred_days_int)*24).astype(int)

results_df=X_test.copy()
results_df["true_days"]=true_days_int
results_df["true_hours"]=true_hours
results_df["pred_days"]=pred_days_int
results_df["pred_hours"]=pred_hours
results_df["pred_days_decimal"]=y_pred_days

print("\nSample predictions (first 10):")
print(results_df[["true_days","true_hours","pred_days","pred_hours"]].head(10))

y_true_int=true_days_int
y_pred_int=pred_days_int

int_mse=mean_squared_error(y_true_int,y_pred_int)
int_rmse=int_mse**0.5
int_mae=mean_absolute_error(y_true_int,y_pred_int)

print("\n=== INTEGER-DAY METRICS (KNN, outliers) ===")
print("Int MAE:",int_mae)
print("Int MSE:",int_mse)
print("Int RMSE:",int_rmse)

cm=confusion_matrix(y_true_int,y_pred_int)
acc=accuracy_score(y_true_int,y_pred_int)
within_1_day=np.mean(np.abs(y_true_int-y_pred_int)<=1)

print("\nConfusion matrix (integer days):")
print(cm)
print("Exact-day accuracy:",acc)
print("Accuracy within ±1 day:",within_1_day)


# 7. COMBINED – CatBoost Regressor (Normal + Outliers)

In [None]:
# COMBINED – CatBoost Regressor (normal + outliers merged)

Xn_train=pd.read_csv("X_normal_train.csv")
Xn_test=pd.read_csv("X_normal_test.csv")
yn_train=pd.read_csv("y_normal_train.csv")["spell_episode_los"]
yn_test=pd.read_csv("y_normal_test.csv")["spell_episode_los"]

Xo_train=pd.read_csv("X_outliers_train.csv")
Xo_test=pd.read_csv("X_outliers_test.csv")
yo_train=pd.read_csv("y_outliers_train.csv")["spell_episode_los"]
yo_test=pd.read_csv("y_outliers_test.csv")["spell_episode_los"]

X_train=pd.concat([Xn_train,Xo_train],axis=0).reset_index(drop=True)
X_test=pd.concat([Xn_test,Xo_test],axis=0).reset_index(drop=True)
y_train=pd.concat([yn_train,yo_train],axis=0).reset_index(drop=True)
y_test=pd.concat([yn_test,yo_test],axis=0).reset_index(drop=True)

categorical_features=np.where(X_train.dtypes=="object")[0]

model=CatBoostRegressor(depth=8,learning_rate=0.05,loss_function="RMSE",n_estimators=500,random_seed=42,verbose=100)
model.fit(X_train,y_train,cat_features=categorical_features,eval_set=(X_test,y_test),use_best_model=True)

y_pred_log=model.predict(X_test)

y_true_days=np.exp(y_test)
y_pred_days=np.exp(y_pred_log)

mae=mean_absolute_error(y_true_days,y_pred_days)
rmse=mean_squared_error(y_true_days,y_pred_days)**0.5
r2=r2_score(y_true_days,y_pred_days)

print("\n=== COMBINED – CatBoost (continuous days) ===")
print("MAE:",mae)
print("RMSE:",rmse)
print("R²:",r2)

true_days_int=np.floor(y_true_days).astype(int)
true_hours=np.round((y_true_days-true_days_int)*24).astype(int)
pred_days_int=np.floor(y_pred_days).astype(int)
pred_hours=np.round((y_pred_days-pred_days_int)*24).astype(int)

results_df=X_test.copy()
results_df["true_days"]=true_days_int
results_df["true_hours"]=true_hours
results_df["pred_days"]=pred_days_int
results_df["pred_hours"]=pred_hours
results_df["pred_days_decimal"]=y_pred_days

print("\nSample predictions (first 10):")
print(results_df[["true_days","true_hours","pred_days","pred_hours"]].head(10))

y_true_int=true_days_int
y_pred_int=pred_days_int

int_mse=mean_squared_error(y_true_int,y_pred_int)
int_rmse=int_mse**0.5
int_mae=mean_absolute_error(y_true_int,y_pred_int)

print("\n=== INTEGER-DAY METRICS (CatBoost, combined) ===")
print("Int MAE:",int_mae)
print("Int MSE:",int_mse)
print("Int RMSE:",int_rmse)

cm=confusion_matrix(y_true_int,y_pred_int)
acc=accuracy_score(y_true_int,y_pred_int)
within_1_day=np.mean(np.abs(y_true_int-y_pred_int)<=1)

print("\nConfusion matrix (integer days):")
print(cm)
print("Exact-day accuracy:",acc)
print("Accuracy within ±1 day:",within_1_day)
