In [175]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

In [176]:
# Import our 3-year modeling data
data = pd.read_csv('data/all_model_data.csv')
data

Unnamed: 0,Age,City_Eugene,City_Springfield,DayOfMonth,DayOfWeek,DayOfYear,Gender_female,Gender_male,Gender_non_binary,Gender_trans_female,...,solarradiation,temp,tempmax,tempmin,uvindex,visibility,windgust,windspeed,Date,CallsPerDay
0,34,1,0,10,5,253,0,1,0,0,...,183,70,89,51,6,8,15,12,1662768000000000000,1
1,38,1,0,1,4,244,0,1,0,0,...,99,67,78,62,4,7,19,15,1693526400000000000,3
2,48,0,1,1,4,244,0,0,1,0,...,99,67,78,62,4,7,19,15,1693526400000000000,3
3,11,0,1,1,4,244,0,1,0,0,...,99,67,78,62,4,7,19,15,1693526400000000000,3
4,26,1,0,2,5,245,1,0,0,0,...,186,67,85,55,8,9,15,11,1693612800000000000,7
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
344,65,1,0,14,5,287,0,1,0,0,...,60,56,64,48,4,7,9,4,1697241600000000000,3
345,70,0,1,15,6,288,1,0,0,0,...,110,59,69,54,6,6,5,6,1697328000000000000,4
346,32,1,0,15,6,288,1,0,0,0,...,110,59,69,54,6,6,5,6,1697328000000000000,4
347,39,1,0,15,6,288,1,0,0,0,...,110,59,69,54,6,6,5,6,1697328000000000000,4


In [177]:
# Import our upcoming prediction data
upcoming_data = pd.read_csv('data/upcoming_data_with_dummies.csv')
upcoming_data

Unnamed: 0,DayOfMonth,DayOfWeek,DayOfYear,Month,Season_Autumn,Season_Spring,Season_Summer,Season_Winter,Year,cloudcover,...,snowdepth,solarenergy,solarradiation,temp,tempmax,tempmin,uvindex,visibility,windgust,windspeed
0,14,1,135,5,False,True,False,False,2024,12.8,...,0,27.9,323.4,60.9,74.1,47.9,9,10.0,26.4,18.2
1,15,2,136,5,False,True,False,False,2024,10.2,...,0,28.7,330.0,63.7,78.0,49.0,9,10.1,17.2,11.4
2,16,3,137,5,False,True,False,False,2024,20.5,...,0,29.8,344.4,62.2,74.0,51.0,9,10.1,13.9,9.2
3,17,4,138,5,False,True,False,False,2024,26.2,...,0,29.8,344.4,57.0,65.9,49.0,9,11.3,18.3,12.8
4,18,5,139,5,False,True,False,False,2024,10.5,...,0,28.5,327.0,55.2,66.8,43.6,9,15.0,19.0,12.3
5,19,6,140,5,False,True,False,False,2024,28.8,...,0,23.7,274.4,53.5,63.6,44.0,8,14.9,18.1,12.1
6,20,0,141,5,False,True,False,False,2024,24.5,...,0,29.9,346.9,53.3,63.4,43.2,9,15.0,17.0,11.4
7,21,1,142,5,False,True,False,False,2024,34.3,...,0,28.4,327.2,54.4,64.5,44.3,9,15.0,16.8,11.0


In [178]:
age_column = ['Age']
gender_columns = ['Gender_female', 'Gender_male', 'Gender_non_binary', 'Gender_trans_female', 'Gender_trans_male']
race_columns = ['Race_alaska native', 'Race_american indian', 'Race_american indian/alaska native', 'Race_asian',
                'Race_black/african american', 'Race_hispanic/latino', 'Race_native hawaiian/other pacific islander',
                'Race_other', 'Race_two or more races', 'Race_white']
city_columns = ['City_Eugene', 'City_Springfield']
reason_for_dispatch_columns = ['Reason for Dispatch_Check Welfare', 'Reason for Dispatch_Counseling',
                                'Reason for Dispatch_EMS Assist', 'Reason for Dispatch_Fire Assist',
                                'Reason for Dispatch_Police Assist', 'Reason for Dispatch_Public Assist',
                                'Reason for Dispatch_Suicidal Subject', 'Reason for Dispatch_Transport']

classification_columns = age_column + gender_columns + race_columns + city_columns + reason_for_dispatch_columns

model_data = data.drop(columns=classification_columns)
print(model_data.columns)
model_data.head()

Index(['DayOfMonth', 'DayOfWeek', 'DayOfYear', 'Hour', 'Month',
       'Season_Autumn', 'Season_Spring', 'Season_Summer', 'Season_Winter',
       'Year', 'cloudcover', 'conditions_Clear', 'conditions_Overcast',
       'conditions_Partially cloudy', 'conditions_Rain',
       'conditions_Rain, Freezing Drizzle/Freezing Rain, Overcast',
       'conditions_Rain, Freezing Drizzle/Freezing Rain, Partially cloudy',
       'conditions_Rain, Overcast', 'conditions_Rain, Partially cloudy',
       'conditions_Snow, Rain', 'conditions_Snow, Rain, Overcast',
       'conditions_Snow, Rain, Partially cloudy', 'date', 'dew', 'feelslike',
       'feelslikemax', 'feelslikemin', 'humidity', 'pm25', 'precip',
       'precipcover', 'precipprob', 'sealevelpressure', 'snow', 'snowdepth',
       'solarenergy', 'solarradiation', 'temp', 'tempmax', 'tempmin',
       'uvindex', 'visibility', 'windgust', 'windspeed', 'Date',
       'CallsPerDay'],
      dtype='object')


Unnamed: 0,DayOfMonth,DayOfWeek,DayOfYear,Hour,Month,Season_Autumn,Season_Spring,Season_Summer,Season_Winter,Year,...,solarradiation,temp,tempmax,tempmin,uvindex,visibility,windgust,windspeed,Date,CallsPerDay
0,10,5,253,11,9,1,0,0,0,2022,...,183,70,89,51,6,8,15,12,1662768000000000000,1
1,1,4,244,12,9,1,0,0,0,2023,...,99,67,78,62,4,7,19,15,1693526400000000000,3
2,1,4,244,17,9,1,0,0,0,2023,...,99,67,78,62,4,7,19,15,1693526400000000000,3
3,1,4,244,20,9,1,0,0,0,2023,...,99,67,78,62,4,7,19,15,1693526400000000000,3
4,2,5,245,12,9,1,0,0,0,2023,...,186,67,85,55,8,9,15,11,1693612800000000000,7


In [179]:
# Convert the column names of both DataFrames to sets
model_data_columns = set(model_data.columns)
upcoming_data_columns = set(upcoming_data.columns)

# Columns in model_data but not in upcoming_data
model_only_columns = model_data_columns.difference(upcoming_data_columns)
print("Columns in model_data but not in upcoming_data:")
print(model_only_columns)

# Columns in upcoming_data but not in model_data
upcoming_only_columns = upcoming_data_columns.difference(model_data_columns)
print("\nColumns in upcoming_data but not in model_data:")
print(upcoming_only_columns)

# Drop columns in model_data but not in upcoming_data
columns_to_drop = ['date', 'Date', 'Hour']
model_data = model_data.drop(columns=columns_to_drop)

# List of columns to remove from both DataFrames
columns_to_drop = ['Year']
# Drop specified columns from model_data
model_data = model_data.drop(columns=columns_to_drop)
# Drop specified columns from upcoming_data
upcoming_data = upcoming_data.drop(columns=columns_to_drop)

Columns in model_data but not in upcoming_data:
{'date', 'CallsPerDay', 'Date', 'Hour'}

Columns in upcoming_data but not in model_data:
set()


In [180]:
# Calculate the call volume threshold for the 90th percentile
call_volume_threshold = model_data['CallsPerDay'].quantile(0.9)

# Label the days based on whether their call volume exceeds the threshold
model_data['HighRisk'] = (model_data['CallsPerDay'] >= call_volume_threshold).astype(int)

# Split the data into features (X) and target labels (y)
X = model_data.drop(columns=['CallsPerDay', 'HighRisk'])
y = model_data['HighRisk']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize and train a Random Forest classifier
clf = RandomForestClassifier(random_state=42)
clf.fit(X_train, y_train)

# Predict on the test set
y_pred = clf.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)
print("Classification Report:")
print(classification_report(y_test, y_pred))

# Check label distribution
label_distribution = model_data['HighRisk'].value_counts(normalize=True)
print("Label Distribution:")
print(label_distribution)

# Check for data leakage
X_leakage = X_test[X_test.index.isin(X_train.index)]
if not X_leakage.empty:
    print("Warning: Data Leakage Detected!\n")

Accuracy: 1.0
Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        64
           1       1.00      1.00      1.00         6

    accuracy                           1.00        70
   macro avg       1.00      1.00      1.00        70
weighted avg       1.00      1.00      1.00        70

Label Distribution:
HighRisk
0    0.888252
1    0.111748
Name: proportion, dtype: float64


In [181]:
# Use the trained classifier to predict on upcoming_data
upcoming_data_predictions = clf.predict(upcoming_data)

# Add the predictions as a new column to upcoming_data
upcoming_data['HighRisk'] = upcoming_data_predictions

# Print the mean of the 'HighRisk' column in upcoming_data
print(upcoming_data['HighRisk'].mean())

0.0


In [182]:
# Save the DataFrame with predictions to a CSV file
upcoming_data.to_csv('data/upcoming_data_with_predictions.csv', index=False)

In [183]:
# Get feature importances
feature_importances = pd.DataFrame({'Feature': X.columns, 'Importance': clf.feature_importances_})
feature_importances.sort_values(by='Importance', ascending=False, inplace=True)
print(feature_importances)

                                              Feature  Importance
2                                           DayOfYear    0.164093
25                                               pm25    0.121038
24                                           humidity    0.088385
20                                                dew    0.085802
32                                        solarenergy    0.067611
29                                   sealevelpressure    0.063505
33                                     solarradiation    0.052266
37                                            uvindex    0.046889
8                                          cloudcover    0.039439
39                                           windgust    0.037629
34                                               temp    0.028035
1                                           DayOfWeek    0.024579
3                                               Month    0.023460
36                                            tempmin    0.022960
22        