## Predict ICU Facilities given COVID-19 cases

In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error


In [3]:
# Load the data
covid_data = pd.read_csv('../datasets/filtered_datasets/cases_state.csv', parse_dates=['date'])
icu_data = pd.read_csv('../datasets/filtered_datasets/icu.csv', parse_dates=['date'])

# Merge the data on the date column
data = pd.merge(covid_data, icu_data, on=['date', 'state'])

# Sort the data by date
data = data.sort_values(by='date')


# Create lag features
def create_lag_features(data, lag=14):
    for i in range(1, lag + 1):
        data[f'cases_new_lag_{i}'] = data['cases_new'].shift(i)
    return data

# Apply the function to create lag features
data = create_lag_features(data)

# Drop rows with missing values (due to lagging)
data = data.dropna()

# Select the features and target
features = [col for col in data.columns if 'cases_new_lag' in col]
target = 'icu_covid'

In [4]:
# Split the data into training and testing sets
X = data[features]
y = data[target]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Scale the features
scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)



In [5]:

# Initialize the random forest regressor
model = RandomForestRegressor(n_estimators=150, random_state=42)

# Fit the model
model.fit(X_train, y_train)


In [6]:

# Make predictions
y_pred = model.predict(X_test)

# Evaluate the model
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)

print(f'ICU COVID - Mean Absolute Error: {mae}')
print(f'ICU COVID - Mean Squared Error: {mse}')
print(f'ICU COVID - Root Mean Squared Error: {rmse}')
print()

# Predicting ICU availability for future dates
# Assuming future_dates is a DataFrame containing future dates
future_dates = pd.date_range(start='2023-06-01', end='2023-06-14', freq='D')
future_data = pd.DataFrame({'date': future_dates})

# Initialize lag features with NaN
for lag in range(1, 15):
    future_data[f'cases_new_lag_{lag}'] = np.nan

# Fill in the lag features with the most recent data
for lag in range(1, 15):
    future_data[f'cases_new_lag_{lag}'] = data['cases_new'].iloc[-lag]

# Predict ICU availability for future dates
future_features = future_data[features]
future_features_scaled = scaler.transform(future_features)

# Predict ICU availability for the future dates
future_predictions = model.predict(future_features_scaled)

# Add the predictions to the future_data DataFrame
future_data['icu_covid_predicted'] = future_predictions

ICU COVID - Mean Absolute Error: 8.171185460378737
ICU COVID - Mean Squared Error: 415.1772724271413
ICU COVID - Root Mean Squared Error: 20.37589930351888



In [7]:
import plotly.graph_objects as go

# Create the figure
fig = go.Figure()

# Add the actual ICU COVID data
fig.add_trace(go.Scatter(x=data['date'], y=data['icu_covid'], name='Actual ICU COVID', mode='lines', line=dict(color='blue')))

# Add the predicted ICU COVID data
fig.add_trace(go.Scatter(x=future_data['date'], y=future_data['icu_covid_predicted'], name='Predicted ICU COVID', mode='lines', line=dict(color='red', dash='dash')))

# Set the layout
fig.update_layout(
    title='Actual vs Predicted ICU COVID Availability',
    xaxis=dict(title='Date'),
    yaxis=dict(title='ICU COVID'),
    showlegend=True
)

# Show the plot
fig.show()


The behavior of DatetimeProperties.to_pydatetime is deprecated, in a future version this will return a Series containing python datetime objects instead of an ndarray. To retain the old behavior, call `np.array` on the result

