In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the cleaned, dummy-encoded dataset.
# low_memory=False makes pandas infer each column's dtype from the whole file
# instead of chunk by chunk, which avoids mixed-type columns (and the
# DtypeWarning that read_csv emits when it finds them).
data = pd.read_csv('data/clean_data_with_dummies.csv', low_memory=False)

# Order the rows chronologically: by year, then by the day of the year
data = data.sort_values(by=['Year', 'DayOfYear'], ascending=[True, True])
print(data.columns)
data

Index(['Age', 'City_Eugene', 'City_Springfield', 'DayOfMonth', 'DayOfWeek',
       'DayOfYear', 'Gender_female', 'Gender_male', 'Gender_non_binary',
       'Gender_trans_female', 'Gender_trans_male', 'Hour', 'Month',
       'Race_alaska native', 'Race_american indian',
       'Race_american indian/alaska native', 'Race_asian',
       'Race_black/african american', 'Race_hispanic/latino',
       'Race_native hawaiian/other pacific islander', 'Race_other',
       'Race_two or more races', 'Race_white',
       'Reason for Dispatch_Check Welfare', 'Reason for Dispatch_Counseling',
       'Reason for Dispatch_EMS Assist', 'Reason for Dispatch_Fire Assist',
       'Reason for Dispatch_Police Assist',
       'Reason for Dispatch_Public Assist',
       'Reason for Dispatch_Suicidal Subject', 'Reason for Dispatch_Transport',
       'Season_Autumn', 'Season_Spring', 'Season_Summer', 'Season_Winter',
       'Year', 'cloudcover', 'conditions_Clear', 'conditions_Overcast',
       'conditions_Partia

  data = pd.read_csv('data/clean_data_with_dummies.csv')


Unnamed: 0,Age,City_Eugene,City_Springfield,DayOfMonth,DayOfWeek,DayOfYear,Gender_female,Gender_male,Gender_non_binary,Gender_trans_female,...,solarradiation,sunrise_hour,sunset_hour,temp,tempmax,tempmin,uvindex,visibility,windgust,windspeed
0,48.0,True,False,4,0,4,True,False,False,False,...,20.0,7.0,16.0,48.6,52.0,43.0,1.0,9.3,44.3,14.9
1,34.0,False,True,14,3,14,False,True,False,False,...,26.5,7.0,16.0,44.5,53.8,38.0,3.0,9.5,11.4,8.6
2,,False,True,14,3,14,,,,,...,26.5,7.0,16.0,44.5,53.8,38.0,3.0,9.5,11.4,8.6
3,,False,True,14,3,14,,,,,...,26.5,7.0,16.0,44.5,53.8,38.0,3.0,9.5,11.4,8.6
4,35.0,False,True,14,3,14,False,True,False,False,...,26.5,7.0,16.0,44.5,53.8,38.0,3.0,9.5,11.4,8.6
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
66456,,False,True,31,6,365,,,,,...,20.1,7.0,16.0,41.1,42.9,39.9,1.0,0.5,5.8,5.5
66457,,,,31,6,365,,,,,...,20.1,7.0,16.0,41.1,42.9,39.9,1.0,0.5,5.8,5.5
66458,17.0,True,False,31,6,365,,,,,...,20.1,7.0,16.0,41.1,42.9,39.9,1.0,0.5,5.8,5.5
66459,17.0,True,False,31,6,365,,,,,...,20.1,7.0,16.0,41.1,42.9,39.9,1.0,0.5,5.8,5.5


In [3]:
# Names of the one-hot encoded race columns
race_values = [
    'Race_alaska native',
    'Race_american indian',
    'Race_american indian/alaska native',
    'Race_asian',
    'Race_black/african american',
    'Race_hispanic/latino',
    'Race_native hawaiian/other pacific islander',
    'Race_other',
    'Race_two or more races',
    'Race_white',
]

# Names of the one-hot encoded gender columns
gender_values = [
    'Gender_female',
    'Gender_male',
    'Gender_non_binary',
    'Gender_trans_female',
    'Gender_trans_male',
]

# Names of the one-hot encoded dispatch-reason columns
dispatch_values = [
    'Reason for Dispatch_Check Welfare',
    'Reason for Dispatch_Counseling',
    'Reason for Dispatch_EMS Assist',
    'Reason for Dispatch_Fire Assist',
    'Reason for Dispatch_Police Assist',
    'Reason for Dispatch_Public Assist',
    'Reason for Dispatch_Suicidal Subject',
    'Reason for Dispatch_Transport',
]

# Everything to remove: age, hour, the city flags, and every
# race / gender / dispatch-reason dummy column listed above
columns_to_drop = [
    'Age', 'Hour', 'City_Eugene', 'City_Springfield',
    *race_values,
    *gender_values,
    *dispatch_values,
]

# Remove those columns from 'data' (raises KeyError if any are absent,
# e.g. when this cell is re-run on the already-reduced frame)
data = data.drop(columns_to_drop, axis='columns')

In [4]:
def preprocess_data(data):
    """Clean a call-level frame and attach a per-day call count.

    Steps, in order:
      1. Drop every row that contains at least one missing value.
      2. Cast all remaining values to ``int`` (fractional parts are
         truncated; booleans become 0/1).
      3. Renumber the rows from 0.
      4. Drop columns that hold a single distinct value. The grouping keys
         ``'DayOfYear'`` and ``'Year'`` are always kept, even when constant
         (e.g. single-year data), because the next step needs them.
      5. Add ``'CallsPerDay'``: the number of remaining rows that share the
         same (``'DayOfYear'``, ``'Year'``) pair. Rows are not aggregated.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain ``'DayOfYear'`` and ``'Year'`` columns; every column must
        be castable to ``int`` once rows with missing values are removed.

    Returns
    -------
    pandas.DataFrame
        A new frame (the input is not modified) with one row per surviving
        input row and an extra ``'CallsPerDay'`` column.

    Raises
    ------
    KeyError
        If ``'DayOfYear'`` or ``'Year'`` is missing from ``data``.
    ValueError
        If a remaining value cannot be converted to ``int``.
    """
    key_columns = ['DayOfYear', 'Year']

    # Drop rows with any NA, convert everything to integers, reset the index
    model_data = data.dropna().astype(int).reset_index(drop=True)

    # Remove columns that are all the same value, but never the grouping
    # keys: dropping them would make the groupby/merge below fail.
    varying = model_data.nunique() > 1
    keep = varying | model_data.columns.isin(key_columns)
    model_data = model_data.loc[:, keep]

    # Calculate the number of calls per day without aggregating rows.
    # NOTE: the count only includes rows that survived the dropna above.
    calls_per_day = model_data.groupby(key_columns).size().reset_index(name='CallsPerDay')

    # Left merge keeps every row of model_data in its existing order
    model_data = model_data.merge(calls_per_day, on=key_columns, how='left')

    return model_data

# Run the preprocessing step on the column-reduced data
all_model_data = preprocess_data(data)

# Report the resulting column names, then the (rows, columns) shape
print(all_model_data.columns, all_model_data.shape, sep='\n')

Index(['DayOfMonth', 'DayOfWeek', 'DayOfYear', 'Month', 'Season_Autumn',
       'Season_Spring', 'Season_Summer', 'Season_Winter', 'Year', 'cloudcover',
       'conditions_Clear', 'conditions_Overcast',
       'conditions_Partially cloudy', 'conditions_Rain',
       'conditions_Rain, Freezing Drizzle/Freezing Rain, Overcast',
       'conditions_Rain, Freezing Drizzle/Freezing Rain, Partially cloudy',
       'conditions_Rain, Overcast', 'conditions_Rain, Partially cloudy',
       'conditions_Snow, Rain', 'conditions_Snow, Rain, Overcast',
       'conditions_Snow, Rain, Partially cloudy', 'dew', 'feelslike',
       'feelslikemax', 'feelslikemin', 'humidity', 'pm25', 'precip',
       'precipcover', 'precipprob', 'sealevelpressure', 'snow', 'snowdepth',
       'solarenergy', 'solarradiation', 'sunrise_hour', 'sunset_hour', 'temp',
       'tempmax', 'tempmin', 'uvindex', 'visibility', 'windgust', 'windspeed',
       'CallsPerDay'],
      dtype='object')
(66461, 45)


In [5]:
# Write the preprocessed, row-level model data to CSV (row index omitted)
all_model_data.to_csv(path_or_buf='data/all_model_data.csv', index=False)
print('Model data saved')

Model data saved
