In [419]:
import pandas as pd
from scipy.stats import pearsonr

#### Importing Datasets

In [420]:
# Import 2021 + 2022 CAHOOTS data from a XLSX
raw_data_2021_2022 = pd.read_excel('raw_data/call_data_from_CAHOOTS_2021_2022.xlsx')

# Import 2023 CAHOOTS data from a CSV
raw_data_2023 = pd.read_excel('raw_data/call_data_from_CAHOOTS.xlsx')

# Combining both data sets vertically
raw_data = pd.concat([raw_data_2021_2022, raw_data_2023], ignore_index=True)
raw_data

Unnamed: 0,Date,TimeOfCall,Age,Gender,Race,Language,City,Reason for Dispatch
0,2021-01-04,20:10:31,48,Female,White,English,Eugene,Public Assist
1,2021-01-14,14:28:00,34,Male,White,English,Springfield,Public Assist
2,2021-01-14,14:55:00,Unavailable,Chose not to Disclose,Chose not to Disclose,English,Springfield,Check Welfare
3,2021-01-14,14:53:00,Unavailable,Chose not to Disclose,Chose not to Disclose,English,Springfield,Check Welfare
4,2021-01-14,16:11:00,35,Male,White,English,Springfield,Check Welfare
...,...,...,...,...,...,...,...,...
67516,2023-12-31,2023-12-31 21:33:35,Not Given,Chose not to disclose,White,English,Springfield,Information Not Available
67517,2023-12-31,2023-12-31 21:51:35,Not Given,Chose not to disclose,,English,Informatio not recorded,Information Not Available
67518,2023-12-31,2023-12-31 21:58:10,17,Chose not to disclose,White,English,Eugene,Information Not Available
67519,2023-12-31,2023-12-31 22:15:00,17,Chose not to disclose,White,English,Eugene,Information Not Available


In [421]:
# Import Eugene climate 21-22 data from a CSV
eugene_climate_data_21_22 = pd.read_csv('raw_data/weather_database_21_22.csv')

# Import Eugene climate 23 data from a CSV
eugene_climate_data_23 = pd.read_csv('raw_data/weather_database_23.csv')

# Vertically merge the two DataFrames
eugene_climate_data = pd.concat([eugene_climate_data_21_22, eugene_climate_data_23], ignore_index=True)

# Rename the 'datetime' column to 'date'
eugene_climate_data = eugene_climate_data.rename(columns={'datetime': 'date'})

# Save the merged DataFrame to a CSV
eugene_climate_data.to_csv('data/merged_eugene_climate_data.csv', index=False)
eugene_climate_data.head()

Unnamed: 0,name,date,tempmax,tempmin,temp,feelslikemax,feelslikemin,feelslike,dew,humidity,...,solarenergy,uvindex,severerisk,sunrise,sunset,moonphase,conditions,description,icon,stations
0,"Eugene, OR, United States",2021-01-01,54.9,46.9,48.7,54.9,40.4,44.9,42.8,80.1,...,2.2,2,,2021-01-01T07:47:25,2021-01-01T16:45:10,0.6,"Rain, Overcast",Cloudy skies throughout the day with rain.,rain,"KCVO,WWCO3,72693024221,99999904236,72694524202..."
1,"Eugene, OR, United States",2021-01-02,51.1,46.1,49.1,51.1,40.8,44.7,44.4,84.0,...,1.2,1,,2021-01-02T07:47:26,2021-01-02T16:46:06,0.63,"Rain, Overcast",Cloudy skies throughout the day with a chance ...,rain,"KCVO,72693024221,99999904236,72694524202,F6274..."
2,"Eugene, OR, United States",2021-01-03,52.0,42.3,49.4,52.0,38.0,47.6,45.4,86.1,...,1.6,1,,2021-01-03T07:47:24,2021-01-03T16:47:03,0.67,"Rain, Overcast",Cloudy skies throughout the day with rain.,rain,"KCVO,72693024221,99999904236,72694524202,F6274..."
3,"Eugene, OR, United States",2021-01-04,52.0,43.0,48.6,52.0,37.8,46.3,43.8,83.6,...,1.6,1,,2021-01-04T07:47:20,2021-01-04T16:48:02,0.7,"Rain, Partially cloudy",Partly cloudy throughout the day with a chance...,rain,"KCVO,72693024221,99999904236,72694524202,F6274..."
4,"Eugene, OR, United States",2021-01-05,50.0,37.4,44.9,46.0,31.2,39.9,37.0,74.7,...,1.6,1,,2021-01-05T07:47:14,2021-01-05T16:49:03,0.74,Partially cloudy,Partly cloudy throughout the day.,partly-cloudy-day,"KCVO,72693024221,99999904236,72694524202,F6274..."


In [422]:
# Import AQI data from a CSV
aqi_data = pd.read_csv('raw_data/eugene-- highway 99, oregon-air-quality.csv')
# Rename the pm25 column
aqi_data = aqi_data.rename(columns={' pm25': 'pm25'})
print(aqi_data.shape)
aqi_data.head()

(3354, 2)


Unnamed: 0,date,pm25
0,2024/4/2,31
1,2024/4/3,25
2,2024/4/4,11
3,2024/4/5,15
4,2024/4/6,17


#### Importing Upcoming Weather Data

In [423]:
# Import AQI data from a CSV
upcoming_data = pd.read_csv('raw_data/upcoming_weather.csv')
# Rename the 'datetime' column to 'date'
upcoming_data = upcoming_data.rename(columns={'datetime': 'date'})
upcoming_data.head()

Unnamed: 0,name,date,tempmax,tempmin,temp,feelslikemax,feelslikemin,feelslike,dew,humidity,...,solarenergy,uvindex,severerisk,sunrise,sunset,moonphase,conditions,description,icon,stations
0,"Eugene, OR, United States",2024-05-14,74.1,47.9,60.9,74.1,44.2,60.4,49.0,67.3,...,27.9,9,10,2024-05-14T05:46:32,2024-05-14T20:31:40,0.22,Clear,Clear conditions throughout the day.,clear-day,"KCVO,WWCO3,KEUG"
1,"Eugene, OR, United States",2024-05-15,78.0,49.0,63.7,78.0,47.2,63.5,51.5,66.8,...,28.7,9,10,2024-05-15T05:45:28,2024-05-15T20:32:46,0.25,Clear,Clear conditions throughout the day.,clear-day,
2,"Eugene, OR, United States",2024-05-16,74.0,51.0,62.2,74.0,51.0,62.2,51.6,70.5,...,29.8,9,10,2024-05-16T05:44:25,2024-05-16T20:33:51,0.28,Partially cloudy,Partly cloudy throughout the day.,partly-cloudy-day,
3,"Eugene, OR, United States",2024-05-17,65.9,49.0,57.0,65.9,46.6,56.8,43.9,63.8,...,29.8,9,10,2024-05-17T05:43:24,2024-05-17T20:34:56,0.31,Partially cloudy,Partly cloudy throughout the day.,partly-cloudy-day,
4,"Eugene, OR, United States",2024-05-18,66.8,43.6,55.2,66.8,40.0,54.0,41.7,62.7,...,28.5,9,10,2024-05-18T05:42:25,2024-05-18T20:35:59,0.34,Clear,Clear conditions throughout the day.,clear-day,


#### Adding a Predicted pm25 value to the Upcoming Data

In [424]:
# Convert 'date' columns to datetime
aqi_data['date'] = pd.to_datetime(aqi_data['date'])
upcoming_data['date'] = pd.to_datetime(upcoming_data['date'])

# Calculate average pm25 for each month in 'aqi_data'
aqi_data['month'] = aqi_data['date'].dt.month
monthly_avg_pm25 = aqi_data.groupby(aqi_data['month'])['pm25'].mean()

# Create a new column in 'upcoming_data' to store the average pm25
upcoming_data['pm25'] = upcoming_data['date'].dt.month.map(monthly_avg_pm25)
upcoming_data

Unnamed: 0,name,date,tempmax,tempmin,temp,feelslikemax,feelslikemin,feelslike,dew,humidity,...,uvindex,severerisk,sunrise,sunset,moonphase,conditions,description,icon,stations,pm25
0,"Eugene, OR, United States",2024-05-14,74.1,47.9,60.9,74.1,44.2,60.4,49.0,67.3,...,9,10,2024-05-14T05:46:32,2024-05-14T20:31:40,0.22,Clear,Clear conditions throughout the day.,clear-day,"KCVO,WWCO3,KEUG",16.982079
1,"Eugene, OR, United States",2024-05-15,78.0,49.0,63.7,78.0,47.2,63.5,51.5,66.8,...,9,10,2024-05-15T05:45:28,2024-05-15T20:32:46,0.25,Clear,Clear conditions throughout the day.,clear-day,,16.982079
2,"Eugene, OR, United States",2024-05-16,74.0,51.0,62.2,74.0,51.0,62.2,51.6,70.5,...,9,10,2024-05-16T05:44:25,2024-05-16T20:33:51,0.28,Partially cloudy,Partly cloudy throughout the day.,partly-cloudy-day,,16.982079
3,"Eugene, OR, United States",2024-05-17,65.9,49.0,57.0,65.9,46.6,56.8,43.9,63.8,...,9,10,2024-05-17T05:43:24,2024-05-17T20:34:56,0.31,Partially cloudy,Partly cloudy throughout the day.,partly-cloudy-day,,16.982079
4,"Eugene, OR, United States",2024-05-18,66.8,43.6,55.2,66.8,40.0,54.0,41.7,62.7,...,9,10,2024-05-18T05:42:25,2024-05-18T20:35:59,0.34,Clear,Clear conditions throughout the day.,clear-day,,16.982079
5,"Eugene, OR, United States",2024-05-19,63.6,44.0,53.5,63.6,41.7,52.5,41.6,66.0,...,8,10,2024-05-19T05:41:28,2024-05-19T20:37:02,0.38,Partially cloudy,Partly cloudy throughout the day.,partly-cloudy-day,,16.982079
6,"Eugene, OR, United States",2024-05-20,63.4,43.2,53.3,63.4,40.5,52.1,42.2,67.5,...,9,10,2024-05-20T05:40:33,2024-05-20T20:38:04,0.41,Partially cloudy,Partly cloudy throughout the day.,partly-cloudy-day,,16.982079
7,"Eugene, OR, United States",2024-05-21,64.5,44.3,54.4,64.5,42.1,53.4,44.8,71.5,...,9,10,2024-05-21T05:39:39,2024-05-21T20:39:05,0.44,Partially cloudy,Partly cloudy throughout the day.,partly-cloudy-day,,16.982079


#### Building a Dataframe with all of our Data (minus Upcoming)

In [425]:
# Convert the 'date' column in eugene_climate_data to datetime64[ns]
eugene_climate_data['date'] = pd.to_datetime(eugene_climate_data['date'])

# Convert the 'date' column in aqi_data to datetime64[ns]
aqi_data['date'] = pd.to_datetime(aqi_data['date'])

# Merge eugene_climate_data and aqi_data on 'date'
eugene_climate_data_aqi = pd.concat([eugene_climate_data.set_index('date'), aqi_data.set_index('date')], axis=1, join='outer')

# Reset index to make 'date' a column again
eugene_climate_data_aqi = eugene_climate_data_aqi.reset_index()

In [426]:
# Convert 'Date' column in raw_data to datetime
raw_data['Date'] = pd.to_datetime(raw_data['Date'])

# Merge with Eugene climate data
raw_all_col_data = pd.merge(raw_data, eugene_climate_data_aqi, left_on='Date', right_on='date', how='left')

# Drop redundant columns
raw_all_col_data.drop(['date'], axis=1, inplace=True)

# Display the merged DataFrame
raw_all_col_data.head()

Unnamed: 0,Date,TimeOfCall,Age,Gender,Race,Language,City,Reason for Dispatch,name,tempmax,...,severerisk,sunrise,sunset,moonphase,conditions,description,icon,stations,pm25,month
0,2021-01-04,20:10:31,48,Female,White,English,Eugene,Public Assist,"Eugene, OR, United States",52.0,...,,2021-01-04T07:47:20,2021-01-04T16:48:02,0.7,"Rain, Partially cloudy",Partly cloudy throughout the day with a chance...,rain,"KCVO,72693024221,99999904236,72694524202,F6274...",13.0,1.0
1,2021-01-14,14:28:00,34,Male,White,English,Springfield,Public Assist,"Eugene, OR, United States",53.8,...,,2021-01-14T07:44:22,2021-01-14T16:59:14,0.05,Partially cloudy,Partly cloudy throughout the day.,partly-cloudy-day,"KCVO,WWCO3,72693024221,99999904236,72694524202...",24.0,1.0
2,2021-01-14,14:55:00,Unavailable,Chose not to Disclose,Chose not to Disclose,English,Springfield,Check Welfare,"Eugene, OR, United States",53.8,...,,2021-01-14T07:44:22,2021-01-14T16:59:14,0.05,Partially cloudy,Partly cloudy throughout the day.,partly-cloudy-day,"KCVO,WWCO3,72693024221,99999904236,72694524202...",24.0,1.0
3,2021-01-14,14:53:00,Unavailable,Chose not to Disclose,Chose not to Disclose,English,Springfield,Check Welfare,"Eugene, OR, United States",53.8,...,,2021-01-14T07:44:22,2021-01-14T16:59:14,0.05,Partially cloudy,Partly cloudy throughout the day.,partly-cloudy-day,"KCVO,WWCO3,72693024221,99999904236,72694524202...",24.0,1.0
4,2021-01-14,16:11:00,35,Male,White,English,Springfield,Check Welfare,"Eugene, OR, United States",53.8,...,,2021-01-14T07:44:22,2021-01-14T16:59:14,0.05,Partially cloudy,Partly cloudy throughout the day.,partly-cloudy-day,"KCVO,WWCO3,72693024221,99999904236,72694524202...",24.0,1.0


In [427]:
# Drop rows with missing values in 'Date' or 'TimeOfCall'
raw_all_col_data = raw_all_col_data.dropna(subset=['Date', 'TimeOfCall'])

# Convert 'TimeOfCall' to datetime format
raw_all_col_data['TimeOfCall'] = pd.to_datetime(raw_all_col_data['TimeOfCall'], errors='coerce')

# Extract hour from 'TimeOfCall' and assign it to the 'Hour' column
raw_all_col_data['Hour'] = raw_all_col_data['TimeOfCall'].dt.hour

def extract_date_features(df, date_column_name):
    df[date_column_name] = pd.to_datetime(df[date_column_name])
    df['DayOfWeek'] = df[date_column_name].dt.dayofweek
    df['DayOfMonth'] = df[date_column_name].dt.day
    df['DayOfYear'] = df[date_column_name].dt.dayofyear
    df['Year'] = df[date_column_name].dt.year
    return df

# Creating dummy columns for date features
raw_all_col_data = extract_date_features(raw_all_col_data, 'Date')

# Define a function to map dates to seasons
def get_season(date):
    month = date.month
    if month in [3, 4, 5]:
        return 'Spring'
    elif month in [6, 7, 8]:
        return 'Summer'
    elif month in [9, 10, 11]:
        return 'Autumn'
    else:
        return 'Winter'

# Add a 'Season' column based on the 'Date' column
raw_all_col_data['Season'] = raw_all_col_data['Date'].apply(get_season)

# Add a 'Month' column that assigns the month as 1-12
raw_all_col_data['Month'] = pd.to_datetime(raw_all_col_data['Date']).dt.month

# Drop the 'TimeOfCall' column
raw_all_col_data.drop(columns=['Date', 'TimeOfCall'], inplace=True)

# Display the DataFrame
print(raw_all_col_data.columns)
raw_all_col_data

Index(['Age', 'Gender', 'Race', 'Language', 'City', 'Reason for Dispatch',
       'name', 'tempmax', 'tempmin', 'temp', 'feelslikemax', 'feelslikemin',
       'feelslike', 'dew', 'humidity', 'precip', 'precipprob', 'precipcover',
       'preciptype', 'snow', 'snowdepth', 'windgust', 'windspeed', 'winddir',
       'sealevelpressure', 'cloudcover', 'visibility', 'solarradiation',
       'solarenergy', 'uvindex', 'severerisk', 'sunrise', 'sunset',
       'moonphase', 'conditions', 'description', 'icon', 'stations', 'pm25',
       'month', 'Hour', 'DayOfWeek', 'DayOfMonth', 'DayOfYear', 'Year',
       'Season', 'Month'],
      dtype='object')


Unnamed: 0,Age,Gender,Race,Language,City,Reason for Dispatch,name,tempmax,tempmin,temp,...,stations,pm25,month,Hour,DayOfWeek,DayOfMonth,DayOfYear,Year,Season,Month
0,48,Female,White,English,Eugene,Public Assist,"Eugene, OR, United States",52.0,43.0,48.6,...,"KCVO,72693024221,99999904236,72694524202,F6274...",13.0,1.0,,0,4,4,2021,Winter,1
1,34,Male,White,English,Springfield,Public Assist,"Eugene, OR, United States",53.8,38.0,44.5,...,"KCVO,WWCO3,72693024221,99999904236,72694524202...",24.0,1.0,,3,14,14,2021,Winter,1
2,Unavailable,Chose not to Disclose,Chose not to Disclose,English,Springfield,Check Welfare,"Eugene, OR, United States",53.8,38.0,44.5,...,"KCVO,WWCO3,72693024221,99999904236,72694524202...",24.0,1.0,,3,14,14,2021,Winter,1
3,Unavailable,Chose not to Disclose,Chose not to Disclose,English,Springfield,Check Welfare,"Eugene, OR, United States",53.8,38.0,44.5,...,"KCVO,WWCO3,72693024221,99999904236,72694524202...",24.0,1.0,,3,14,14,2021,Winter,1
4,35,Male,White,English,Springfield,Check Welfare,"Eugene, OR, United States",53.8,38.0,44.5,...,"KCVO,WWCO3,72693024221,99999904236,72694524202...",24.0,1.0,,3,14,14,2021,Winter,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
67516,Not Given,Chose not to disclose,White,English,Springfield,Information Not Available,"Eugene, OR, United States",42.9,39.9,41.1,...,"KCVO,WWCO3,72693024221,72694524202,KEUG",28.0,12.0,21.0,6,31,365,2023,Winter,12
67517,Not Given,Chose not to disclose,,English,Informatio not recorded,Information Not Available,"Eugene, OR, United States",42.9,39.9,41.1,...,"KCVO,WWCO3,72693024221,72694524202,KEUG",28.0,12.0,21.0,6,31,365,2023,Winter,12
67518,17,Chose not to disclose,White,English,Eugene,Information Not Available,"Eugene, OR, United States",42.9,39.9,41.1,...,"KCVO,WWCO3,72693024221,72694524202,KEUG",28.0,12.0,21.0,6,31,365,2023,Winter,12
67519,17,Chose not to disclose,White,English,Eugene,Information Not Available,"Eugene, OR, United States",42.9,39.9,41.1,...,"KCVO,WWCO3,72693024221,72694524202,KEUG",28.0,12.0,22.0,6,31,365,2023,Winter,12


#### Data Cleaning (ignoring NaNs)

In [428]:
columns_to_drop = ['Language', 'stations', 'name', 'icon', 'moonphase', 'sunrise', 'sunset', 'severerisk', 'winddir', 'description', 'preciptype', 'month']

raw_all_col_data = raw_all_col_data.drop(columns_to_drop, axis=1)

#### Cleaning 'Age'

In [429]:
# Replace 'Not Given' values with NA in the 'Age' column
raw_all_col_data['Age'] = raw_all_col_data['Age'].replace('Not Given', pd.NA)

# Replace 'Unavailable' values with NA in the 'Age' column
raw_all_col_data['Age'] = raw_all_col_data['Age'].replace('Unavailable', pd.NA)

#### Cleaning 'Gender'

In [430]:
# Convert all values in 'Gender' column to lowercase
raw_all_col_data['Gender'] = raw_all_col_data['Gender'].str.lower()

# Map similar values to a single representation
gender_mapping = {
    'female': 'female',
    'male': 'male',
    'chose not to disclose': 'undisclosed',
    'female (assigned male at birth)': 'trans_female',
    'male (assigned female at birth)': 'trans_male',
    'non-binary/ gender queer': 'non_binary'
}

raw_all_col_data['Gender'] = raw_all_col_data['Gender'].map(gender_mapping)

# Replace 'undisclosed' values with NA
raw_all_col_data['Gender'] = raw_all_col_data['Gender'].replace('undisclosed', pd.NA)

#### Cleaning 'Race'

In [431]:
# Convert all values in 'Race' column to lowercase
raw_all_col_data['Race'] = raw_all_col_data['Race'].str.lower()

# Map similar values to a single representation
race_mapping = {
    'alaska native': 'alaska native',
    'american indian': 'american indian',
    'american indian or alaska native': 'american indian/alaska native',
    'asian': 'asian',
    'black or african american': 'black/african american',
    'chose not to disclose': 'undisclosed',
    'hispanic or latino': 'hispanic/latino',
    'native hawaiian or other pac island': 'native hawaiian/other pacific islander',
    'other race': 'other',
    'other single race': 'other',
    'two or more unspecified race': 'two or more races',
    'white': 'white'
}

raw_all_col_data['Race'] = raw_all_col_data['Race'].map(race_mapping)

# Replace 'undisclosed' values with NA
raw_all_col_data['Race'] = raw_all_col_data['Race'].replace('undisclosed', pd.NA)

#### Cleaning 'City'

In [432]:
# Replace 'Informatio not recorded' values with NA in the 'City' column
raw_all_col_data['City'] = raw_all_col_data['City'].replace('Informatio not recorded', pd.NA)

#### Cleaning 'Reason for Dispatch'

In [433]:
# Replace 'Information Not Available' values with NA in the 'Reason for Dispatch' column
raw_all_col_data['Reason for Dispatch'] = raw_all_col_data['Reason for Dispatch'].replace('Information Not Available', pd.NA)

#### Checking for Errors

In [434]:
# Print unique values in the 'Age' column
print("Unique values in 'Age' column:")
print(raw_all_col_data['Age'].unique())

# Print unique values in the 'Gender' column
print("\nUnique values in 'Gender' column:")
print(raw_all_col_data['Gender'].unique())

# Print unique values in the 'Race' column
print("\nUnique values in 'Race' column:")
print(raw_all_col_data['Race'].unique())

# Print unique values in the 'City' column
print("\nUnique values in 'City' column:")
print(raw_all_col_data['City'].unique())

# Print unique values in the 'Reason for Dispatch' column
print("\nUnique values in 'Reason for Dispatch' column:")
print(raw_all_col_data['Reason for Dispatch'].unique())

Unique values in 'Age' column:
[48 34 <NA> 35 68 72 21 31 22 76 58 56 20 32 45 29 36 28 55 18 49 73 94 12
 60 17 46 59 44 69 40 24 50 30 63 19 7 25 64 26 16 54 39 66 57 62 84 85 70
 82 15 43 67 33 65 75 11 13 81 23 38 14 53 42 41 79 52 51 27 61 37 78 47
 87 77 80 83 92 6 9 71 90 5 10 89 74 91 96 8 95 93 86 88 97 4 98]

Unique values in 'Gender' column:
['female' 'male' <NA> nan 'trans_female' 'trans_male' 'non_binary']

Unique values in 'Race' column:
['white' <NA> 'black/african american' 'two or more races'
 'american indian' 'asian' 'other'
 'native hawaiian/other pacific islander' 'alaska native' nan
 'american indian/alaska native' 'hispanic/latino']

Unique values in 'City' column:
['Eugene' 'Springfield' nan <NA>]

Unique values in 'Reason for Dispatch' column:
['Public Assist' 'Check Welfare' 'EMS Assist' 'Police Assist' 'Transport'
 'Counseling' 'Suicidal Subject' 'Fire Assist' <NA>]


In [435]:
# Create a copy of raw_all_col_data and name it clean_data
clean_data = raw_all_col_data.copy()
clean_data

Unnamed: 0,Age,Gender,Race,City,Reason for Dispatch,tempmax,tempmin,temp,feelslikemax,feelslikemin,...,uvindex,conditions,pm25,Hour,DayOfWeek,DayOfMonth,DayOfYear,Year,Season,Month
0,48,female,white,Eugene,Public Assist,52.0,43.0,48.6,52.0,37.8,...,1.0,"Rain, Partially cloudy",13.0,,0,4,4,2021,Winter,1
1,34,male,white,Springfield,Public Assist,53.8,38.0,44.5,53.8,37.7,...,3.0,Partially cloudy,24.0,,3,14,14,2021,Winter,1
2,,,,Springfield,Check Welfare,53.8,38.0,44.5,53.8,37.7,...,3.0,Partially cloudy,24.0,,3,14,14,2021,Winter,1
3,,,,Springfield,Check Welfare,53.8,38.0,44.5,53.8,37.7,...,3.0,Partially cloudy,24.0,,3,14,14,2021,Winter,1
4,35,male,white,Springfield,Check Welfare,53.8,38.0,44.5,53.8,37.7,...,3.0,Partially cloudy,24.0,,3,14,14,2021,Winter,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
67516,,,white,Springfield,,42.9,39.9,41.1,42.9,37.2,...,1.0,Overcast,28.0,21.0,6,31,365,2023,Winter,12
67517,,,,,,42.9,39.9,41.1,42.9,37.2,...,1.0,Overcast,28.0,21.0,6,31,365,2023,Winter,12
67518,17,,white,Eugene,,42.9,39.9,41.1,42.9,37.2,...,1.0,Overcast,28.0,21.0,6,31,365,2023,Winter,12
67519,17,,white,Eugene,,42.9,39.9,41.1,42.9,37.2,...,1.0,Overcast,28.0,22.0,6,31,365,2023,Winter,12


In [436]:
def create_dummies(data):
    # Get list of qualitative (categorical) columns excluding 'Age' if it exists
    if 'Age' in data.columns:
        qualitative_cols = data.select_dtypes(include=['object', 'category']).columns.tolist()
        qualitative_cols.remove('Age')
    else:
        qualitative_cols = data.select_dtypes(include=['object', 'category']).columns.tolist()

    # Replace missing values in qualitative columns with 'missing'
    data[qualitative_cols] = data[qualitative_cols].fillna('missing')

    # Create dummy variables for every unique value in every qualitative column
    dummy_columns = []
    for col in qualitative_cols:
        dummies = pd.get_dummies(data[col], prefix=col)
        dummy_columns.extend(dummies.columns.tolist())
        data = pd.concat([data, dummies], axis=1)

        # Set dummy variables to NA if the _missing column is True
        if f"{col}_missing" in data.columns:
            missing_col = f"{col}_missing"
            data.loc[data[missing_col] == 1, dummies.columns] = pd.NA

        # Drop _missing column
        if f"{col}_missing" in data.columns:
            data.drop(columns=[f"{col}_missing"], inplace=True)

    # Drop original qualitative columns
    data.drop(columns=qualitative_cols, inplace=True)

    return data


# Assuming clean_data is your DataFrame
clean_data_with_dummies = create_dummies(clean_data)
print(clean_data_with_dummies.columns)
clean_data_with_dummies

  data.loc[data[missing_col] == 1, dummies.columns] = pd.NA
  data.loc[data[missing_col] == 1, dummies.columns] = pd.NA
  data.loc[data[missing_col] == 1, dummies.columns] = pd.NA
  data.loc[data[missing_col] == 1, dummies.columns] = pd.NA


Index(['Age', 'tempmax', 'tempmin', 'temp', 'feelslikemax', 'feelslikemin',
       'feelslike', 'dew', 'humidity', 'precip', 'precipprob', 'precipcover',
       'snow', 'snowdepth', 'windgust', 'windspeed', 'sealevelpressure',
       'cloudcover', 'visibility', 'solarradiation', 'solarenergy', 'uvindex',
       'pm25', 'Hour', 'DayOfWeek', 'DayOfMonth', 'DayOfYear', 'Year', 'Month',
       'Gender_female', 'Gender_male', 'Gender_non_binary',
       'Gender_trans_female', 'Gender_trans_male', 'Race_alaska native',
       'Race_american indian', 'Race_american indian/alaska native',
       'Race_asian', 'Race_black/african american', 'Race_hispanic/latino',
       'Race_native hawaiian/other pacific islander', 'Race_other',
       'Race_two or more races', 'Race_white', 'City_Eugene',
       'City_Springfield', 'Reason for Dispatch_Check Welfare',
       'Reason for Dispatch_Counseling', 'Reason for Dispatch_EMS Assist',
       'Reason for Dispatch_Fire Assist', 'Reason for Dispatch_Poli

Unnamed: 0,Age,tempmax,tempmin,temp,feelslikemax,feelslikemin,feelslike,dew,humidity,precip,...,"conditions_Rain, Freezing Drizzle/Freezing Rain, Partially cloudy","conditions_Rain, Overcast","conditions_Rain, Partially cloudy","conditions_Snow, Rain","conditions_Snow, Rain, Overcast","conditions_Snow, Rain, Partially cloudy",Season_Autumn,Season_Spring,Season_Summer,Season_Winter
0,48,52.0,43.0,48.6,52.0,37.8,46.3,43.8,83.6,0.823,...,False,False,True,False,False,False,False,False,False,True
1,34,53.8,38.0,44.5,53.8,37.7,43.5,40.1,85.0,0.000,...,False,False,False,False,False,False,False,False,False,True
2,,53.8,38.0,44.5,53.8,37.7,43.5,40.1,85.0,0.000,...,False,False,False,False,False,False,False,False,False,True
3,,53.8,38.0,44.5,53.8,37.7,43.5,40.1,85.0,0.000,...,False,False,False,False,False,False,False,False,False,True
4,35,53.8,38.0,44.5,53.8,37.7,43.5,40.1,85.0,0.000,...,False,False,False,False,False,False,False,False,False,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
67516,,42.9,39.9,41.1,42.9,37.2,39.8,39.8,95.1,0.000,...,False,False,False,False,False,False,False,False,False,True
67517,,42.9,39.9,41.1,42.9,37.2,39.8,39.8,95.1,0.000,...,False,False,False,False,False,False,False,False,False,True
67518,17,42.9,39.9,41.1,42.9,37.2,39.8,39.8,95.1,0.000,...,False,False,False,False,False,False,False,False,False,True
67519,17,42.9,39.9,41.1,42.9,37.2,39.8,39.8,95.1,0.000,...,False,False,False,False,False,False,False,False,False,True


#### Cleaning the Upcoming Data

In [437]:
columns_to_drop = ['stations', 'name', 'icon', 'moonphase', 'sunrise', 'sunset', 'severerisk', 'winddir', 'description', 'preciptype']

upcoming_data = upcoming_data.drop(columns_to_drop, axis=1)
upcoming_data

Unnamed: 0,date,tempmax,tempmin,temp,feelslikemax,feelslikemin,feelslike,dew,humidity,precip,...,windgust,windspeed,sealevelpressure,cloudcover,visibility,solarradiation,solarenergy,uvindex,conditions,pm25
0,2024-05-14,74.1,47.9,60.9,74.1,44.2,60.4,49.0,67.3,0.0,...,26.4,18.2,1019.3,12.8,10.0,323.4,27.9,9,Clear,16.982079
1,2024-05-15,78.0,49.0,63.7,78.0,47.2,63.5,51.5,66.8,0.0,...,17.2,11.4,1016.8,10.2,10.1,330.0,28.7,9,Clear,16.982079
2,2024-05-16,74.0,51.0,62.2,74.0,51.0,62.2,51.6,70.5,0.0,...,13.9,9.2,1015.1,20.5,10.1,344.4,29.8,9,Partially cloudy,16.982079
3,2024-05-17,65.9,49.0,57.0,65.9,46.6,56.8,43.9,63.8,0.0,...,18.3,12.8,1021.7,26.2,11.3,344.4,29.8,9,Partially cloudy,16.982079
4,2024-05-18,66.8,43.6,55.2,66.8,40.0,54.0,41.7,62.7,0.0,...,19.0,12.3,1019.6,10.5,15.0,327.0,28.5,9,Clear,16.982079
5,2024-05-19,63.6,44.0,53.5,63.6,41.7,52.5,41.6,66.0,0.032,...,18.1,12.1,1023.5,28.8,14.9,274.4,23.7,8,Partially cloudy,16.982079
6,2024-05-20,63.4,43.2,53.3,63.4,40.5,52.1,42.2,67.5,0.0,...,17.0,11.4,1023.5,24.5,15.0,346.9,29.9,9,Partially cloudy,16.982079
7,2024-05-21,64.5,44.3,54.4,64.5,42.1,53.4,44.8,71.5,0.004,...,16.8,11.0,1021.1,34.3,15.0,327.2,28.4,9,Partially cloudy,16.982079


In [438]:
# Creating dummy columns for date features
upcoming_data = extract_date_features(upcoming_data, 'date')

# Add a 'Season' column based on the 'Date' column
upcoming_data['Season'] = upcoming_data['date'].apply(get_season)

# Add a 'Month' column that assigns the month as 1-12
upcoming_data['Month'] = pd.to_datetime(upcoming_data['date']).dt.month

# Creating dummy variables
upcoming_data_with_dummies = create_dummies(upcoming_data)

def align_dummies_columns(df1, df2):
    # Get the set of dummy variable columns from both DataFrames
    df1_dummy_cols = set(df1.columns)
    df2_dummy_cols = set(df2.columns)

    # Find missing columns in each DataFrame
    missing_cols_df1 = df2_dummy_cols - df1_dummy_cols
    missing_cols_df2 = df1_dummy_cols - df2_dummy_cols

    # Add missing columns to each DataFrame and fill them with False
    for col in missing_cols_df1:
        df1[col] = False
    for col in missing_cols_df2:
        df2[col] = False

    # Reorder columns to match between DataFrames
    df1 = df1.reindex(sorted(df2.columns), axis=1)
    df2 = df2.reindex(sorted(df1.columns), axis=1)

    return df1, df2

# Creating dummies 
upcoming_data_with_dummies = create_dummies(upcoming_data)

# Adding an missing dummy columns
clean_data_with_dummies, upcoming_data_with_dummies = align_dummies_columns(clean_data_with_dummies, upcoming_data_with_dummies)
upcoming_data_with_dummies.columns

Index(['Age', 'City_Eugene', 'City_Springfield', 'DayOfMonth', 'DayOfWeek',
       'DayOfYear', 'Gender_female', 'Gender_male', 'Gender_non_binary',
       'Gender_trans_female', 'Gender_trans_male', 'Hour', 'Month',
       'Race_alaska native', 'Race_american indian',
       'Race_american indian/alaska native', 'Race_asian',
       'Race_black/african american', 'Race_hispanic/latino',
       'Race_native hawaiian/other pacific islander', 'Race_other',
       'Race_two or more races', 'Race_white',
       'Reason for Dispatch_Check Welfare', 'Reason for Dispatch_Counseling',
       'Reason for Dispatch_EMS Assist', 'Reason for Dispatch_Fire Assist',
       'Reason for Dispatch_Police Assist',
       'Reason for Dispatch_Public Assist',
       'Reason for Dispatch_Suicidal Subject', 'Reason for Dispatch_Transport',
       'Season_Autumn', 'Season_Spring', 'Season_Summer', 'Season_Winter',
       'Year', 'cloudcover', 'conditions_Clear', 'conditions_Overcast',
       'conditions_Partia

In [439]:
columns_to_drop = ['Age', 'City_Eugene', 'City_Springfield', 'Gender_female', 'Gender_male', 'Gender_non_binary',
       'Gender_trans_female', 'Gender_trans_male', 'Hour', 'Race_alaska native', 'Race_american indian',
       'Race_american indian/alaska native', 'Race_asian',
       'Race_black/african american', 'Race_hispanic/latino',
       'Race_native hawaiian/other pacific islander', 'Race_other',
       'Race_two or more races', 'Race_white',
       'Reason for Dispatch_Check Welfare', 'Reason for Dispatch_Counseling',
       'Reason for Dispatch_EMS Assist', 'Reason for Dispatch_Fire Assist',
       'Reason for Dispatch_Police Assist',
       'Reason for Dispatch_Public Assist',
       'Reason for Dispatch_Suicidal Subject', 'Reason for Dispatch_Transport', 'date']

upcoming_data_with_dummies = upcoming_data_with_dummies.drop(columns_to_drop, axis=1)
upcoming_data_with_dummies

Unnamed: 0,DayOfMonth,DayOfWeek,DayOfYear,Month,Season_Autumn,Season_Spring,Season_Summer,Season_Winter,Year,cloudcover,...,snowdepth,solarenergy,solarradiation,temp,tempmax,tempmin,uvindex,visibility,windgust,windspeed
0,14,1,135,5,False,True,False,False,2024,12.8,...,0,27.9,323.4,60.9,74.1,47.9,9,10.0,26.4,18.2
1,15,2,136,5,False,True,False,False,2024,10.2,...,0,28.7,330.0,63.7,78.0,49.0,9,10.1,17.2,11.4
2,16,3,137,5,False,True,False,False,2024,20.5,...,0,29.8,344.4,62.2,74.0,51.0,9,10.1,13.9,9.2
3,17,4,138,5,False,True,False,False,2024,26.2,...,0,29.8,344.4,57.0,65.9,49.0,9,11.3,18.3,12.8
4,18,5,139,5,False,True,False,False,2024,10.5,...,0,28.5,327.0,55.2,66.8,43.6,9,15.0,19.0,12.3
5,19,6,140,5,False,True,False,False,2024,28.8,...,0,23.7,274.4,53.5,63.6,44.0,8,14.9,18.1,12.1
6,20,0,141,5,False,True,False,False,2024,24.5,...,0,29.9,346.9,53.3,63.4,43.2,9,15.0,17.0,11.4
7,21,1,142,5,False,True,False,False,2024,34.3,...,0,28.4,327.2,54.4,64.5,44.3,9,15.0,16.8,11.0


In [440]:
# Define a function to check if a value is either NaN, <NA>, TRUE, FALSE, or a number
def check_value(x):
    return pd.isna(x) or pd.isna(pd.NA) or isinstance(x, bool) or pd.api.types.is_numeric_dtype(x)

# Apply the function to every value in the dataframe
def check_dataframe(df):
    result = df.applymap(check_value)

    # Check if all values are True (indicating they are one of the specified types)
    all_values_valid = result.all().all()

    return all_values_valid

# Check 'clean_data_with_dummies'
all_values_valid_clean = check_dataframe(clean_data_with_dummies)
print("All values in 'clean_data_with_dummies' are either NaN, <NA>, TRUE, FALSE, or a number:", all_values_valid_clean)

# Check 'upcoming_data_with_dummies'
all_values_valid_upcoming = check_dataframe(upcoming_data_with_dummies)
print("All values in 'upcoming_data_with_dummies' are either NaN, <NA>, TRUE, FALSE, or a number:", all_values_valid_upcoming)

  result = df.applymap(check_value)


All values in 'clean_data_with_dummies' are either NaN, <NA>, TRUE, FALSE, or a number: True
All values in 'upcoming_data_with_dummies' are either NaN, <NA>, TRUE, FALSE, or a number: True


  result = df.applymap(check_value)


In [441]:
# Save the cleaned 3-year data
clean_data_with_dummies.to_csv('data/clean_data_with_dummies.csv', index=False)

In [442]:
# Save the cleaned upcoming data
upcoming_data_with_dummies.to_csv('data/upcoming_data_with_dummies.csv', index=False)