# Hospital Readmissions (Classification)

In [11]:
import pandas as pd
# Load the cleaned hospital-readmissions table used by the classification section.
readmissions = pd.read_csv(
    filepath_or_buffer="../data/classification/readmissions_clean.csv"
)


In [10]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score
from sklearn.model_selection import GridSearchCV, train_test_split


In [14]:
# Separate the feature matrix from the 'readmitted' target column.
X = readmissions.drop('readmitted', axis=1)
y = readmissions["readmitted"]

# Hold out 20% of the rows for the final evaluation; the fixed seed keeps the
# split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state = 3)

# Base estimator handed to the grid search (it is only instantiated here; the
# search fits its own copies for every candidate).
rf = RandomForestClassifier(random_state=3)

# Candidate hyper-parameters: 4 x 4 x 4 = 64 combinations.
param_grid = {
    'max_depth': [5, 7, 10, 15],
    'max_features': [3, 5, 8, 10],
    'n_estimators': [50, 150, 200, 300]
}

# Time the grid search. perf_counter() is a monotonic, high-resolution clock,
# so the measured interval is not distorted by system clock adjustments.
import time
start = time.perf_counter()

# n_jobs=-1 spreads the candidate/fold fits over all available CPU cores.
# The selected parameters are unaffected because every forest is seeded
# through random_state.
grid_search_classifier = GridSearchCV(estimator = rf, param_grid = param_grid, n_jobs = -1)

grid_search_classifier.fit(X_train, y_train)

end = time.perf_counter()

print("Grid search execution time:",
      (end-start), "s")


Grid search execution time: 604.5506429672241 s


In [15]:
# Display the hyper-parameter combination that the grid search selected as best
# for the readmissions classifier (shown as the cell's output).
grid_search_classifier.best_params_

{'max_depth': 5, 'max_features': 3, 'n_estimators': 200}

In [17]:
# Build the final model from the grid-search winner. Reading best_params_
# directly (instead of copying the values by hand from the previous output)
# keeps this cell correct if the grid, the data, or the seed ever changes.
rf = RandomForestClassifier(random_state=3, **grid_search_classifier.best_params_)
rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)

# Evaluate on the held-out test split.
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)

print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)

Accuracy: 0.6278
Precision: 0.6252028123309897
Recall: 0.4974182444061962


# Car Emissions (Regression)

In [18]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.model_selection import GridSearchCV, train_test_split

In [22]:
import numpy as np

In [19]:
import pandas as pd
# Load the cleaned car-emissions table used by the regression section.
emissions = pd.read_csv("../data/regression/emissions_cleaned.csv")

# Separate the feature matrix from the 'co2_emissions' target column.
X = emissions.drop('co2_emissions', axis=1)
y = emissions["co2_emissions"]

# Hold out 20% of the rows for the final evaluation; the fixed seed keeps the
# split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state = 3)

# Base estimator handed to the grid search (it is only instantiated here; the
# search fits its own copies for every candidate).
rf = RandomForestRegressor(random_state=3)

# Candidate hyper-parameters: 4 x 4 x 4 = 64 combinations.
param_grid = {
    'max_depth': [5, 7, 10, 15],
    'max_features': [3, 5, 8, 10],
    'n_estimators': [50, 150, 200, 300]
}

# Time the grid search. perf_counter() is a monotonic, high-resolution clock,
# so the measured interval is not distorted by system clock adjustments.
import time
start = time.perf_counter()

# n_jobs=-1 spreads the candidate/fold fits over all available CPU cores.
# The selected parameters are unaffected because every forest is seeded
# through random_state.
grid_search_regressor = GridSearchCV(estimator = rf, param_grid = param_grid, n_jobs = -1)

grid_search_regressor.fit(X_train, y_train)

end = time.perf_counter()

print("Grid search execution time:",
      (end-start), "s")

Grid search execution time: 185.21761798858643 s


In [20]:
# Display the hyper-parameter combination that the grid search selected as best
# for the emissions regressor (shown as the cell's output).
grid_search_regressor.best_params_


{'max_depth': 15, 'max_features': 10, 'n_estimators': 300}

In [23]:
# Build the final model from the grid-search winner. Reading best_params_
# directly (instead of copying the values by hand from the previous output)
# keeps this cell correct if the grid, the data, or the seed ever changes.
rf = RandomForestRegressor(random_state=3, **grid_search_regressor.best_params_)

rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)

# Compute the MSE once and reuse it for the RMSE.
mse = mean_squared_error(y_test, y_pred)

print('Mean Absolute Error (MAE):', mean_absolute_error(y_test, y_pred))
print('Mean Squared Error (MSE):', mse)
print('Root Mean Squared Error (RMSE):', np.sqrt(mse))

# MAPE as a fraction of the true value, averaged over the test rows.
# NOTE: this divides by y_test, so a zero target would yield inf/nan.
mape = np.mean(np.abs((y_test - y_pred) / np.abs(y_test)))
print('Mean Absolute Percentage Error (MAPE):', round(mape * 100, 2))
# "Accuracy" here is simply 100% minus MAPE; it is not a classification accuracy.
print('Accuracy:', round(100*(1 - mape), 2))

Mean Absolute Error (MAE): 1.6924411810082374
Mean Squared Error (MSE): 10.143196795641945
Root Mean Squared Error (RMSE): 3.1848385823526355
Mean Absolute Percentage Error (MAPE): 0.7
Accuracy: 99.3
