In [1]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from xgboost import XGBRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error

# Step 1: Load the dataset
data = pd.read_csv("hospital_data.csv", encoding="ISO-8859-1")

# Step 2: Handle missing values that can be resolved without looking at the data distribution
TARGET_COLUMN = "Avg_Mdcr_Pymt_Amt"

# A row without a target has no label to learn from or to score against;
# mean-imputing the target would fabricate labels, so such rows are dropped.
data = data.dropna(subset=[TARGET_COLUMN]).reset_index(drop=True)

# Non-numeric columns get an explicit "Missing" category. Detecting them with
# is_numeric_dtype (instead of `dtype != "object"`) keeps string-like dtypes
# other than `object` away from the numeric imputation below.
non_numeric_columns = [
    col for col in data.columns if not pd.api.types.is_numeric_dtype(data[col])
]
data = data.fillna({col: "Missing" for col in non_numeric_columns})

# Step 3: Encode categorical variables using Label Encoding
categorical_columns = [
    "Rndrng_Prvdr_Org_Name",
    "Rndrng_Prvdr_City",
    "Rndrng_Prvdr_St",
    "Rndrng_Prvdr_State_Abrvtn",
    "Rndrng_Prvdr_RUCA_Desc",
    "DRG_Desc",
]
# pd.factorize returns (codes, uniques); only the integer codes are kept.
data[categorical_columns] = data[categorical_columns].apply(lambda col: pd.factorize(col)[0])

# Step 4: Define features (X) and target variable (y)
# NOTE(review): every remaining column is used as a feature — confirm none of
# them is derived from the target.
X = data.drop(TARGET_COLUMN, axis=1)  # Features
y = data[TARGET_COLUMN]  # Target

# Step 5: Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 5b: Impute numeric gaps using statistics from the training split only,
# so that no information from the test rows leaks into preprocessing.
train_means = X_train.mean(numeric_only=True)
X_train = X_train.fillna(train_means)
X_test = X_test.fillna(train_means)

# Step 6: Scale numeric features (scaler is fitted on the training split only)
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Step 7: Define a function for model evaluation
def evaluate_model(name, y_true, y_pred):
    """Print and return the regression metrics for one model.

    Parameters
    ----------
    name : str
        Label used as the prefix of the printed line.
    y_true : array-like
        Ground-truth target values.
    y_pred : array-like
        Predicted target values, aligned with ``y_true``.

    Returns
    -------
    dict
        ``{"r2": ..., "mae": ..., "mse": ...}`` so callers can reuse the
        numbers (e.g. for plotting) instead of copying them from the printed
        output by hand.
    """
    r2 = r2_score(y_true, y_pred)
    mae = mean_absolute_error(y_true, y_pred)
    mse = mean_squared_error(y_true, y_pred)
    print(f"{name} - R²: {r2:.4f}, MAE: {mae:.4f}, MSE: {mse:.4f}")
    return {"r2": r2, "mae": mae, "mse": mse}

# Step 8: Candidate hyperparameter values sampled by the randomized searches

# Random Forest: ensemble size, tree depth cap (None = no cap), split threshold
rf_param_grid = dict(
    n_estimators=[50, 100, 200],
    max_depth=[10, 20, None],
    min_samples_split=[2, 5, 10],
)

# Gradient Boosting: number of stages, learning rate, per-tree depth
gb_param_grid = dict(
    n_estimators=[50, 100, 200],
    learning_rate=[0.01, 0.1, 0.2],
    max_depth=[3, 5, 7],
)

# XGBoost: same axes as above plus row and column subsampling ratios
xgb_param_grid = dict(
    n_estimators=[50, 100, 200],
    learning_rate=[0.01, 0.1, 0.2],
    max_depth=[4, 6, 8],
    subsample=[0.8, 1.0],
    colsample_bytree=[0.8, 1.0],
)

# Steps 9-11: Tune each regressor with a randomized search, then score it on the test split
def _tune_and_report(label, estimator, param_grid):
    """Run one randomized hyperparameter search and report test-set metrics.

    Prints the "Training <label>..." banner, fits a RandomizedSearchCV
    (10 sampled candidates, 3-fold CV, all cores, verbose=2, random_state=42)
    on the notebook-level ``X_train`` / ``y_train``, predicts ``X_test`` with
    the best estimator found, and hands the predictions to ``evaluate_model``.

    Returns
    -------
    tuple
        ``(fitted_search, test_predictions)``.
    """
    print(f"Training {label}...")
    search = RandomizedSearchCV(
        estimator=estimator,
        param_distributions=param_grid,
        n_iter=10,
        cv=3,
        n_jobs=-1,
        verbose=2,
        random_state=42,
    )
    search.fit(X_train, y_train)
    test_predictions = search.best_estimator_.predict(X_test)
    evaluate_model(label, y_test, test_predictions)
    return search, test_predictions


# Step 9: Random Forest
rf_random, rf_pred = _tune_and_report(
    "Random Forest",
    RandomForestRegressor(random_state=42),
    rf_param_grid,
)

# Step 10: Gradient Boosting
gb_random, gb_pred = _tune_and_report(
    "Gradient Boosting",
    GradientBoostingRegressor(random_state=42),
    gb_param_grid,
)

# Step 11: XGBoost
xgb_random, xgb_pred = _tune_and_report(
    "XGBoost",
    XGBRegressor(tree_method="hist", random_state=42),
    xgb_param_grid,
)


Training Random Forest...
Fitting 3 folds for each of 10 candidates, totalling 30 fits
[CV] END .max_depth=20, min_samples_split=2, n_estimators=50; total time= 1.5min
[CV] END .max_depth=20, min_samples_split=2, n_estimators=50; total time= 1.5min
[CV] END max_depth=20, min_samples_split=5, n_estimators=100; total time= 2.7min
[CV] END max_depth=20, min_samples_split=5, n_estimators=100; total time= 2.7min
[CV] END max_depth=20, min_samples_split=5, n_estimators=100; total time= 2.7min
[CV] END .max_depth=20, min_samples_split=2, n_estimators=50; total time= 1.3min
[CV] END max_depth=None, min_samples_split=5, n_estimators=50; total time= 1.4min
[CV] END max_depth=10, min_samples_split=10, n_estimators=200; total time= 3.1min
[CV] END max_depth=10, min_samples_split=10, n_estimators=200; total time= 3.2min
[CV] END max_depth=10, min_samples_split=10, n_estimators=200; total time= 3.2min
[CV] END .max_depth=10, min_samples_split=2, n_estimators=50; total time=  42.8s
[CV] END .max_dept

In [2]:
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import pandas as pd
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error

# Model metrics, computed from the test-set predictions produced in the
# training cell so the chart always reflects the current run instead of
# hand-copied numbers.
predictions_by_model = {
    "Random Forest": rf_pred,
    "Gradient Boosting": gb_pred,
    "XGBoost": xgb_pred,
}
models = list(predictions_by_model)
r2_scores = [r2_score(y_test, pred) for pred in predictions_by_model.values()]
mae_scores = [mean_absolute_error(y_test, pred) for pred in predictions_by_model.values()]
mse_scores = [mean_squared_error(y_test, pred) for pred in predictions_by_model.values()]

# Create the bar plot: one panel per metric
fig, ax = plt.subplots(1, 3, figsize=(15, 5))

# R² Scores
sns.barplot(x=models, y=r2_scores, ax=ax[0])
ax[0].set_title("R² Scores")
ax[0].set_ylabel("R²")
# The axis is zoomed in to make small differences visible; the lower bound is
# widened automatically if any model scores below 0.85 so no bar is clipped.
ax[0].set_ylim(min(0.85, min(r2_scores) - 0.02), 1)

# MAE Scores
sns.barplot(x=models, y=mae_scores, ax=ax[1])
ax[1].set_title("Mean Absolute Error (MAE)")
ax[1].set_ylabel("MAE")

# MSE Scores
sns.barplot(x=models, y=mse_scores, ax=ax[2])
ax[2].set_title("Mean Squared Error (MSE)")
ax[2].set_ylabel("MSE")

plt.tight_layout()
plt.show()

# Diagnostics for the tuned Random Forest.
# The search object was already fitted in the training cell; calling
# rf_random.fit(...) again here would repeat the entire randomized search
# (the training log shows individual fits taking minutes), so the fitted
# best estimator is reused as-is.
predicted_values = rf_random.best_estimator_.predict(X_test)
actual_values = y_test

# Residual = actual - predicted; positive values mean the model under-predicted.
residuals = actual_values - predicted_values

# Predicted vs. actual scatter with the y = x reference line
plt.figure(figsize=(8, 6))
sns.scatterplot(x=actual_values, y=predicted_values, alpha=0.6)
plt.title("Predicted vs. Actual Values")
plt.xlabel("Actual Values")
plt.ylabel("Predicted Values")
plt.axline([0, 0], [1, 1], color='red', linestyle='--', linewidth=2)  # Reference line
plt.show()

# Distribution of residuals, with a marker at zero error
plt.figure(figsize=(8, 6))
sns.histplot(residuals, kde=True, bins=30)
plt.title("Residuals Distribution")
plt.xlabel("Residual")
plt.ylabel("Frequency")
plt.axvline(0, color='red', linestyle='--')
plt.show()

# Residual plot for Random Forest
plt.figure(figsize=(8, 6))

NameError: name 'c' is not defined