## **PurpleAir Monitors** ##

In [1]:
import plotly.express as px
from plotly.subplots import make_subplots
import plotly.graph_objects as go
from plotly.offline import plot
import pandas as pd
import xgboost as xgb
import numpy as np

In [2]:
# Load cleaned data
purple_df = pd.read_csv('../data/clean_purpleair.csv')
purple_df.head()

Unnamed: 0,time,location_name,location_id,latitude,longitude,pm2_5_1h_mean,pm2_5_1h_mean_aqi,pm2_5_24h_mean,pm2_5_24h_mean_aqi,temp,rh,elevation,pressure
0,2018-12-27 04:00:00,Grundy Park,21427,37.622585,-122.42097,0.736345,4.0,2.999879,16.0,53.318182,59.818182,86.0,
1,2018-12-27 05:00:00,Grundy Park,21427,37.622585,-122.42097,0.739827,4.0,2.999879,16.0,51.777778,59.955556,86.0,
2,2018-12-27 06:00:00,Grundy Park,21427,37.622585,-122.42097,1.038868,6.0,2.999879,16.0,52.068182,56.681818,86.0,
3,2018-12-27 07:00:00,Grundy Park,21427,37.622585,-122.42097,1.214613,7.0,2.999879,16.0,52.755556,56.933333,86.0,
4,2018-12-27 08:00:00,Grundy Park,21427,37.622585,-122.42097,1.127572,6.0,2.999879,16.0,65.883721,54.372093,86.0,


### **Preprocessing Check**

In [3]:
# Convert longitude and latitude to correct types
purple_df['longitude'] = purple_df['longitude'].apply(lambda x: -abs(x))
purple_df['latitude'] = purple_df['latitude'].apply(lambda x: abs(x))

# Convert time to datetime (if not already done)
purple_df['time'] = pd.to_datetime(purple_df['time'], format='%Y-%m-%d %H:%M:%S')

purple_df.head()

Unnamed: 0,time,location_name,location_id,latitude,longitude,pm2_5_1h_mean,pm2_5_1h_mean_aqi,pm2_5_24h_mean,pm2_5_24h_mean_aqi,temp,rh,elevation,pressure
0,2018-12-27 04:00:00,Grundy Park,21427,37.622585,-122.42097,0.736345,4.0,2.999879,16.0,53.318182,59.818182,86.0,
1,2018-12-27 05:00:00,Grundy Park,21427,37.622585,-122.42097,0.739827,4.0,2.999879,16.0,51.777778,59.955556,86.0,
2,2018-12-27 06:00:00,Grundy Park,21427,37.622585,-122.42097,1.038868,6.0,2.999879,16.0,52.068182,56.681818,86.0,
3,2018-12-27 07:00:00,Grundy Park,21427,37.622585,-122.42097,1.214613,7.0,2.999879,16.0,52.755556,56.933333,86.0,
4,2018-12-27 08:00:00,Grundy Park,21427,37.622585,-122.42097,1.127572,6.0,2.999879,16.0,65.883721,54.372093,86.0,


In [4]:
# Create column for date only
purple_df['date'] = purple_df['time'].dt.date
purple_df['date'] = pd.to_datetime(purple_df['date'], format='%Y-%m-%d')

purple_df['date'].head()

0   2018-12-27
1   2018-12-27
2   2018-12-27
3   2018-12-27
4   2018-12-27
Name: date, dtype: datetime64[ns]

### **Visualizing and Analyzing**

In [5]:
# Only take unique dates and their average pm2.5 concentrations
unique_dates = purple_df.drop_duplicates(subset=['location_name', 'date'], keep='first')

In [6]:
# Find outliers using IQR and separate them from the dataframe
def find_outliers_iqr(df, column):
    q1 = df[column].quantile(0.25)
    q3 = df[column].quantile(0.75)
    iqr = q3 - q1
    lower_bound = q1 - 1.5 * iqr
    upper_bound = q3 + 1.5 * iqr
    outliers = df[(df[column] < lower_bound) | (df[column] > upper_bound)]
    non_outliers = df[(df[column] >= lower_bound) & (df[column] <= upper_bound)]
    return non_outliers, outliers

# Remove outliers and keep them in a separate dataframe
unique_dates, outliers_df = find_outliers_iqr(unique_dates, 'pm2_5_24h_mean')

print("Number of outliers removed:", len(outliers_df))
print("Number of unique dates remaining:", len(unique_dates))

Number of outliers removed: 2104
Number of unique dates remaining: 24034


In [7]:
# Create figure with subplots
fig = make_subplots(rows=1, cols=2, subplot_titles=("Without Outliers", "Outliers"))

# Add histogram for PM2.5 values without outliers
fig.add_trace(
    go.Histogram(
        x=unique_dates['pm2_5_24h_mean'],
        nbinsx=35,
        marker=dict(color='blue'),
        hoverinfo='x+y'
    ),
    row=1, col=1
)

# Add histogram for PM2.5 values outliers
fig.add_trace(
    go.Box(
        x=outliers_df['pm2_5_24h_mean'],
        marker=dict(color='green'),
        hoverinfo='x',
        name=''
    ),
    row=1, col=2
)

# Update layout
fig.update_layout(title_text="Distribution of PM2.5 Values for PurpleAir Monitors",
                  showlegend=False)
fig.update_xaxes(title_text="Daily Average PM2.5 Concentration (µg/m³)", row=1, col=1)
fig.update_yaxes(title_text="Frequency", row=1, col=1)
fig.update_xaxes(title_text="Daily Average PM2.5 Concentration (µg/m³)", row=1, col=2)

# Save the figure offline
plot(fig, filename='../figures/purpleair_pm25_distribution.html', auto_open=False)

'../figures/purpleair_pm25_distribution.html'

In [8]:
# Calculate the rolling 24h mean for PM2.5 for each hour
purple_df['pm2_5_24h_rolling_mean'] = purple_df.groupby('location_name')['pm2_5_1h_mean'].transform(
    lambda x: x.rolling(window=24, min_periods=1).mean()
)

# Impute missing values in the rolling mean
purple_df['pm2_5_24h_rolling_mean'] = purple_df['pm2_5_24h_rolling_mean'].ffill()
purple_df['pm2_5_24h_rolling_mean'] = purple_df['pm2_5_24h_rolling_mean'].bfill()

purple_df.head()

Unnamed: 0,time,location_name,location_id,latitude,longitude,pm2_5_1h_mean,pm2_5_1h_mean_aqi,pm2_5_24h_mean,pm2_5_24h_mean_aqi,temp,rh,elevation,pressure,date,pm2_5_24h_rolling_mean
0,2018-12-27 04:00:00,Grundy Park,21427,37.622585,-122.42097,0.736345,4.0,2.999879,16.0,53.318182,59.818182,86.0,,2018-12-27,0.736345
1,2018-12-27 05:00:00,Grundy Park,21427,37.622585,-122.42097,0.739827,4.0,2.999879,16.0,51.777778,59.955556,86.0,,2018-12-27,0.738086
2,2018-12-27 06:00:00,Grundy Park,21427,37.622585,-122.42097,1.038868,6.0,2.999879,16.0,52.068182,56.681818,86.0,,2018-12-27,0.838347
3,2018-12-27 07:00:00,Grundy Park,21427,37.622585,-122.42097,1.214613,7.0,2.999879,16.0,52.755556,56.933333,86.0,,2018-12-27,0.932413
4,2018-12-27 08:00:00,Grundy Park,21427,37.622585,-122.42097,1.127572,6.0,2.999879,16.0,65.883721,54.372093,86.0,,2018-12-27,0.971445


In [9]:
# AQI calculation function
def calculate_pm2_5_aqi(C_p):
    if pd.isna(C_p):
        return None

    C_p = float(str(C_p)[:str(C_p).find('.')+2]) if '.' in str(C_p) else float(C_p)

    breakpoints = [
        (0.0,   9.0,   0,   50),
        (9.1,   35.4,  51,  100),
        (35.5,  55.4,  101, 150),
        (55.5,  125.4, 151, 200),
        (125.5, 225.4, 201, 300),
        (225.5, 500.4, 301, 500)
    ]

    for BP_Lo, BP_Hi, I_Lo, I_Hi in breakpoints:
        if BP_Lo <= C_p <= BP_Hi:
            I_p = ((I_Hi - I_Lo) / (BP_Hi - BP_Lo)) * (C_p - BP_Lo) + I_Lo
            return round(I_p)

    return None

# Calculate AQI for PM2.5 24hr mean
purple_df['pm2_5_24h_rolling_mean_aqi'] = purple_df['pm2_5_24h_rolling_mean'].apply(calculate_pm2_5_aqi)

In [10]:
# Find outliers from dataframe
no_outliers_df, outliers_df = find_outliers_iqr(purple_df, 'pm2_5_24h_rolling_mean_aqi')

# Create figure with subplots
fig = make_subplots(rows=1, cols=2, subplot_titles=("Without Outliers", "Outliers"))

# Scatterplot of PM2.5 values for PurpleAir monitors without outliers
fig.add_trace(
        go.Scatter(
                x=no_outliers_df['pm2_5_1h_mean'],
                y=no_outliers_df['pm2_5_24h_rolling_mean_aqi'],
                mode='markers',
                marker=dict(color='blue', size=5),
                hovertext=no_outliers_df['location_name'],
                name=''
        ),
        row=1, col=1
)

# Scatterplot of PM2.5 values for PurpleAir monitors with outliers
fig.add_trace(
        go.Scatter(
                x=outliers_df['pm2_5_1h_mean'],
                y=outliers_df['pm2_5_24h_rolling_mean_aqi'],
                mode='markers',
                marker=dict(color='red', size=5),
                hovertext=outliers_df['location_name'],
                name=''
        ),
        row=1, col=2
)

# Update axis titles
fig.update_xaxes(title_text="Hourly Average PM2.5 Concentration (µg/m³)", row=1, col=1)
fig.update_yaxes(title_text="Daily Average PM2.5 AQI", row=1, col=1)
fig.update_xaxes(title_text="Hourly Average PM2.5 Concentration (µg/m³)", row=1, col=2)
fig.update_yaxes(title_text="Daily Average PM2.5 AQI", row=1, col=2)

# Update layout
fig.update_layout(title_text="Scatter Plots of PM2.5 Values for PurpleAir Monitors",
                                  showlegend=False)

# Save the figure offline
plot(fig, filename='../figures/purpleair_pm25_aqi_scatter.html', auto_open=False)

'../figures/purpleair_pm25_aqi_scatter.html'

## **Clarity Monitors**

In [11]:
# Load cleaned data for Clarity monitors
clarity_df = pd.read_csv('../data/clean_clarity.csv')

# Sort by monitor and time
clarity_df = clarity_df.sort_values(['location_name', 'time'])

# Convert longitude and latitude to correct types
clarity_df['longitude'] = clarity_df['longitude'].apply(lambda x: -abs(x))
clarity_df['latitude'] = clarity_df['latitude'].apply(lambda x: abs(x))

# Convert time to datetime (if not already done)
clarity_df['time'] = pd.to_datetime(clarity_df['time'], format='%Y-%m-%d %H:%M:%S')

# Create column for date only
clarity_df['date'] = clarity_df['time'].dt.date
clarity_df['date'] = pd.to_datetime(clarity_df['date'], format='%Y-%m-%d')

clarity_df.head()

Unnamed: 0,time,location_name,location_id,latitude,longitude,pm2_5_1h_mean,pm2_5_1h_mean_aqi,pm2_5_24h_mean,pm2_5_24h_mean_aqi,temp,rh,date
33,2024-10-31 11:00:00,Belle Air School,DRCAC7970,37.62441,-122.40469,4.07,22.0,3.942308,22,20.0,54.91,2024-10-31
34,2024-10-31 12:00:00,Belle Air School,DRCAC7970,37.62441,-122.40469,4.11,23.0,3.942308,22,20.5,51.66,2024-10-31
49,2024-10-31 13:00:00,Belle Air School,DRCAC7970,37.62441,-122.40469,4.38,24.0,3.942308,22,20.47,50.51,2024-10-31
52,2024-10-31 14:00:00,Belle Air School,DRCAC7970,37.62441,-122.40469,4.15,23.0,3.942308,22,19.94,53.11,2024-10-31
63,2024-10-31 15:00:00,Belle Air School,DRCAC7970,37.62441,-122.40469,3.67,20.0,3.942308,22,18.86,56.11,2024-10-31


In [12]:
# Only take unique dates and their average pm2.5 concentrations
unique_dates_clarity = clarity_df.drop_duplicates(subset=['location_name', 'date'], keep='first')

# Find outliers using IQR and separate them from the dataframe
unique_dates_clarity, outliers_df_clarity = find_outliers_iqr(unique_dates_clarity, 'pm2_5_24h_mean')

# Create figure with subplots
fig = make_subplots(rows=1, cols=2, subplot_titles=("Without Outliers", "Outliers"))

# Add histogram for PM2.5 values without outliers
fig.add_trace(
    go.Histogram(
        x=unique_dates_clarity['pm2_5_24h_mean'],
        nbinsx=35,
        marker=dict(color='blue'),
        hoverinfo='x+y'
    ),
    row=1, col=1
)

# Add histogram for PM2.5 values outliers
fig.add_trace(
    go.Box(
        x=outliers_df_clarity['pm2_5_24h_mean'],
        marker=dict(color='green'),
        hoverinfo='x',
        name=''
    ),
    row=1, col=2
)

# Update layout
fig.update_layout(title_text="Distribution of PM2.5 Values for Clarity Monitors",
                  showlegend=False)
fig.update_xaxes(title_text="Daily Average PM2.5 Concentration (µg/m³)", row=1, col=1)
fig.update_yaxes(title_text="Frequency", row=1, col=1)
fig.update_xaxes(title_text="Daily Average PM2.5 Concentration (µg/m³)", row=1, col=2)

# Save the figure offline
plot(fig, filename='../figures/clarity_pm25_distribution.html', auto_open=False)

'../figures/clarity_pm25_distribution.html'

In [13]:
# Calculate the rolling 24h mean for PM2.5 for each hour
clarity_df['pm2_5_24h_rolling_mean'] = clarity_df.groupby('location_name')['pm2_5_1h_mean'].transform(
    lambda x: x.rolling(window=24, min_periods=1).mean()
)

# Impute missing values in the rolling mean
clarity_df['pm2_5_24h_rolling_mean'] = clarity_df['pm2_5_24h_rolling_mean'].ffill()
clarity_df['pm2_5_24h_rolling_mean'] = clarity_df['pm2_5_24h_rolling_mean'].bfill()

# Calculate AQI for PM2.5 24hr mean
clarity_df['pm2_5_24h_rolling_mean_aqi'] = clarity_df['pm2_5_24h_rolling_mean'].apply(calculate_pm2_5_aqi)

# Find outliers from dataframe
no_outliers_df_clarity, outliers_df_clarity = find_outliers_iqr(clarity_df, 'pm2_5_24h_rolling_mean_aqi')

# Create figure with subplots
fig = make_subplots(rows=1, cols=2, subplot_titles=("Without Outliers", "Outliers"))

# Scatterplot of PM2.5 values for Clarity monitors without outliers
fig.add_trace(
        go.Scatter(
                x=no_outliers_df_clarity['pm2_5_1h_mean'],
                y=no_outliers_df_clarity['pm2_5_24h_rolling_mean_aqi'],
                mode='markers',
                marker=dict(color='blue', size=5),
                hovertext=no_outliers_df_clarity['location_name'],
                name=''
        ),
        row=1, col=1
)

# Scatterplot of PM2.5 values for Clarity monitors with outliers
fig.add_trace(
        go.Scatter(
                x=outliers_df_clarity['pm2_5_1h_mean'],
                y=outliers_df_clarity['pm2_5_24h_rolling_mean_aqi'],
                mode='markers',
                marker=dict(color='red', size=5),
                hovertext=outliers_df_clarity['location_name'],
                name=''
        ),
        row=1, col=2
)

# Update axis titles
fig.update_xaxes(title_text="Hourly Average PM2.5 Concentration (µg/m³)", row=1, col=1)
fig.update_yaxes(title_text="Daily Average PM2.5 AQI", row=1, col=1)
fig.update_xaxes(title_text="Hourly Average PM2.5 Concentration (µg/m³)", row=1, col=2)
fig.update_yaxes(title_text="Daily Average PM2.5 AQI", row=1, col=2)

# Update layout
fig.update_layout(title_text="Scatter Plots of PM2.5 Values for Clarity Monitors",
                                  showlegend=False)

# Save the figure offline
plot(fig, filename='../figures/clarity_pm25_aqi_scatter.html', auto_open=False)

'../figures/clarity_pm25_aqi_scatter.html'

In [14]:
# Keep the clarity_df_grouped definition
clarity_df_grouped = clarity_df.groupby(['date']).agg({'pm2_5_1h_mean': 'mean'}).reset_index()
clarity_df_grouped['month_year_str'] = clarity_df_grouped['date'].dt.strftime('%B')
clarity_df_grouped['day'] = clarity_df_grouped['date'].dt.day

# Create a facet grid with month-year as the facet column
fig = px.line(
    clarity_df_grouped,
    x="day",
    y="pm2_5_1h_mean",
    facet_col="month_year_str",
    facet_col_wrap=6,
    title="PM2.5 Concentrations by Month (Clarity Monitors)",
    labels={"pm2_5_1h_mean": "PM2.5 (1-hour mean)", "day": "Day of Month", "month_year_str": "Month"},
    height=400
)

# Update layout for better visualization
fig.update_layout(
    margin=dict(t=50, l=50, r=50, b=50),
    title_x=0.5
)

# Show the plot
fig.show()

## **Self-Prediction**

### **Experiment: Clarity Monitors**

In [15]:
# Filter the data
filtered_df = clarity_df[['location_id', 'location_name', 'date', 'time', 'pm2_5_24h_mean']]
filtered_df.head()

Unnamed: 0,location_id,location_name,date,time,pm2_5_24h_mean
33,DRCAC7970,Belle Air School,2024-10-31,2024-10-31 11:00:00,3.942308
34,DRCAC7970,Belle Air School,2024-10-31,2024-10-31 12:00:00,3.942308
49,DRCAC7970,Belle Air School,2024-10-31,2024-10-31 13:00:00,3.942308
52,DRCAC7970,Belle Air School,2024-10-31,2024-10-31 14:00:00,3.942308
63,DRCAC7970,Belle Air School,2024-10-31,2024-10-31 15:00:00,3.942308


In [16]:
# Round the pm2_5_24h_mean values to 2 decimal places
filtered_df['pm2_5_24h_mean'] = filtered_df['pm2_5_24h_mean'].round(2)
filtered_df.head()



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



Unnamed: 0,location_id,location_name,date,time,pm2_5_24h_mean
33,DRCAC7970,Belle Air School,2024-10-31,2024-10-31 11:00:00,3.94
34,DRCAC7970,Belle Air School,2024-10-31,2024-10-31 12:00:00,3.94
49,DRCAC7970,Belle Air School,2024-10-31,2024-10-31 13:00:00,3.94
52,DRCAC7970,Belle Air School,2024-10-31,2024-10-31 14:00:00,3.94
63,DRCAC7970,Belle Air School,2024-10-31,2024-10-31 15:00:00,3.94


In [17]:
# Remove outliers from data (either removing, or just imputing)


In [18]:
# Get the unique values by date
unique_dates_filtered = filtered_df.drop_duplicates(subset=['location_name', 'date'], keep='first')
unique_dates_filtered.head()

Unnamed: 0,location_id,location_name,date,time,pm2_5_24h_mean
33,DRCAC7970,Belle Air School,2024-10-31,2024-10-31 11:00:00,3.94
150,DRCAC7970,Belle Air School,2024-11-01,2024-11-01 00:00:00,4.89
361,DRCAC7970,Belle Air School,2024-11-02,2024-11-02 00:00:00,4.02
574,DRCAC7970,Belle Air School,2024-11-03,2024-11-03 00:00:00,4.34
803,DRCAC7970,Belle Air School,2024-11-04,2024-11-04 00:00:00,5.06


In [19]:
unique_dates_filtered.shape

(2069, 5)

In [20]:
# Distribution of PM2.5 values for PurpleAir monitors
px.histogram(unique_dates_filtered[unique_dates_filtered['pm2_5_24h_mean'] <= 200], 
             x='pm2_5_24h_mean', title='Distribution of PM2.5 24hr Mean Values', nbins=50).show()

In [21]:
import numpy as np

# Distribution of the pm2_5_24h_mean values
# Apply log transformation to the pm2_5_24h_mean values
unique_dates_filtered['pm2_5_24h_mean_log'] = unique_dates_filtered['pm2_5_24h_mean'].apply(lambda x: np.log(x + 1))
px.histogram(unique_dates_filtered, x='pm2_5_24h_mean_log', title='Distribution of Log PM2.5 24hr Mean Values', nbins=25).show()



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [22]:
# Create bins for the log pm2_5_24h_mean values and get edges of the bins
# bins = pd.qcut(unique_dates_filtered['pm2_5_24h_mean_log'], q=5)

# # Get the edges of the bins
# edges = bins.cat.categories
# edges = [(edges[i].left, edges[i].right) for i in range(len(edges))]
# edges = [(edges[i][0].round(2), edges[i][1].round(2)) for i in range(len(edges))]
# edges = [(np.exp(edges[i][0]) - 1, np.exp(edges[i][1]) - 1) for i in range(len(edges))]

# edges

In [23]:
# Create final edges for the bins
final_edges = [(0, 4.50), (4.51, 9.00), (9.01, 35.40)]

# Create bins for the log pm2_5_24h_mean values and get edges of the bins from final_edges above
log_bins = np.log1p([x[0] for x in final_edges] + [final_edges[-1][1]])
log_edges = [(log_bins[i], log_bins[i+1]) for i in range(len(log_bins)-1)]
log_edges = [(log_edges[i][0].round(2), log_edges[i][1].round(2)) for i in range(len(log_edges))]

# Create a new column for the bins
unique_dates_filtered['pm2_5_24h_mean_log_bins'] = pd.cut(unique_dates_filtered['pm2_5_24h_mean_log'], 
                                                      bins=[x[0] for x in log_edges] + [log_edges[-1][1]], labels=[f"{i}" for i in range(len(final_edges))], 
                                                      include_lowest=True)

unique_dates_filtered.head()



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



Unnamed: 0,location_id,location_name,date,time,pm2_5_24h_mean,pm2_5_24h_mean_log,pm2_5_24h_mean_log_bins
33,DRCAC7970,Belle Air School,2024-10-31,2024-10-31 11:00:00,3.94,1.597365,0
150,DRCAC7970,Belle Air School,2024-11-01,2024-11-01 00:00:00,4.89,1.773256,1
361,DRCAC7970,Belle Air School,2024-11-02,2024-11-02 00:00:00,4.02,1.61343,0
574,DRCAC7970,Belle Air School,2024-11-03,2024-11-03 00:00:00,4.34,1.675226,0
803,DRCAC7970,Belle Air School,2024-11-04,2024-11-04 00:00:00,5.06,1.80171,1


In [24]:
# Get distribution of the bins using proportions
bins_distribution = unique_dates_filtered['pm2_5_24h_mean_log_bins'].value_counts(normalize=True).sort_index()
bins_distribution = bins_distribution.reset_index()
bins_distribution.columns = ['pm2_5_24h_mean_log_bins', 'proportion']

# Create bar plot for the bins distribution
fig = px.bar(bins_distribution, x='pm2_5_24h_mean_log_bins', y='proportion', 
             title='Distribution of PM2.5 24hr Mean Values by Bins', 
             labels={'pm2_5_24h_mean_log_bins': 'PM2.5 24hr Log Mean Bins', 'proportion': 'Proportion'},
             color='proportion', color_continuous_scale=px.colors.sequential.Plasma)

fig.show()

In [25]:
# Create a new dataframe with rolling window of 7 days and the next day's bin as label
window_size = 7

# Group by location_id to ensure the rolling window is applied per location
data = []
for location_id, group in unique_dates_filtered.groupby('location_id'):
    group = group.sort_values('date').reset_index(drop=True)
    
    for i in range(len(group) - window_size):
        # Extract the 7-day window of pm2_5_24h_mean
        features = group.loc[i:i+window_size-1, 'pm2_5_24h_mean_log'].values.tolist()
        # Extract the next day's bin as the label
        label = group.loc[i+window_size, 'pm2_5_24h_mean_log_bins']
        # Include location_id and location_name
        location_id_value = group.loc[i, 'location_id']
        location_name_value = group.loc[i, 'location_name']
        data.append([location_id_value, location_name_value] + features + [label])

# Create the final dataframe
columns = ['location_id', 'location_name'] + [f'feature_day_{i+1}' for i in range(window_size)] + ['label']
rolling_window_df = pd.DataFrame(data, columns=columns)

rolling_window_df.head()

Unnamed: 0,location_id,location_name,feature_day_1,feature_day_2,feature_day_3,feature_day_4,feature_day_5,feature_day_6,feature_day_7,label
0,DCVIM2201,Brentwood Park,1.669592,1.583094,1.609438,1.884035,1.934416,1.735189,1.642873,1
1,DCVIM2201,Brentwood Park,1.583094,1.609438,1.884035,1.934416,1.735189,1.642873,1.736951,0
2,DCVIM2201,Brentwood Park,1.609438,1.884035,1.934416,1.735189,1.642873,1.736951,1.363537,0
3,DCVIM2201,Brentwood Park,1.884035,1.934416,1.735189,1.642873,1.736951,1.363537,1.050822,0
4,DCVIM2201,Brentwood Park,1.934416,1.735189,1.642873,1.736951,1.363537,1.050822,1.040277,0


In [26]:
# One hot encode by location_id
rolling_window_df = pd.get_dummies(rolling_window_df, columns=['location_id'], prefix='location_id')

# Convert boolean columns to integers
bool_cols = rolling_window_df.select_dtypes(include='bool').columns
rolling_window_df[bool_cols] = rolling_window_df[bool_cols].astype(int)

# Move label column to the end
label_col = rolling_window_df.pop('label')
rolling_window_df['label'] = label_col
rolling_window_df.head()

Unnamed: 0,location_name,feature_day_1,feature_day_2,feature_day_3,feature_day_4,feature_day_5,feature_day_6,feature_day_7,location_id_DCVIM2201,location_id_DETMG3939,...,location_id_DJGNN5114,location_id_DJTYV8538,location_id_DMEYT2138,location_id_DNSEJ7404,location_id_DRCAC7970,location_id_DRYLF3821,location_id_DTMSK2119,location_id_DUBTA4581,location_id_DVRGV9737,label
0,Brentwood Park,1.669592,1.583094,1.609438,1.884035,1.934416,1.735189,1.642873,1,0,...,0,0,0,0,0,0,0,0,0,1
1,Brentwood Park,1.583094,1.609438,1.884035,1.934416,1.735189,1.642873,1.736951,1,0,...,0,0,0,0,0,0,0,0,0,0
2,Brentwood Park,1.609438,1.884035,1.934416,1.735189,1.642873,1.736951,1.363537,1,0,...,0,0,0,0,0,0,0,0,0,0
3,Brentwood Park,1.884035,1.934416,1.735189,1.642873,1.736951,1.363537,1.050822,1,0,...,0,0,0,0,0,0,0,0,0,0
4,Brentwood Park,1.934416,1.735189,1.642873,1.736951,1.363537,1.050822,1.040277,1,0,...,0,0,0,0,0,0,0,0,0,0


In [27]:
# Tranform all feature_day columns with a log transformation
# rolling_window_df.iloc[:, 2:-1] = rolling_window_df.iloc[:, 2:-1].apply(lambda x: np.log(x + 1))
# rolling_window_df.head()

In [28]:
rolling_window_df.shape

(1971, 23)

In [29]:
# Build LSTM model and dataset

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

class TimeSeriesDataset(Dataset):
    def __init__(self, X, y):
        self.X = torch.tensor(X, dtype=torch.float32).unsqueeze(-1)  # Add a channel dimension
        self.y = torch.tensor(y, dtype=torch.float32)

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]

class AirForecastClassifier(nn.Module):
    def __init__(self, input_dim=1, hidden_dim=64, num_layers=2, num_classes=3):
        super().__init__()
        self.lstm = nn.LSTM(input_dim, hidden_dim, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_dim, num_classes)

    def forward(self, x):
        lstm_out, _ = self.lstm(x)  # lstm_out: (batch, seq_len, hidden_dim)
        final_hidden = lstm_out[:, -1, :]  # take last timestep
        logits = self.fc(final_hidden)
        return logits

In [30]:
def train_model(model, dataloader, epochs=10, lr=1e-3):
    """
    AirForecast LSTM Classifier model training function.
    Args:
        model (nn.Module): The LSTM model to train.
        dataloader (DataLoader): DataLoader for the training data.
        epochs (int): Number of training epochs.
        lr (float): Learning rate for the optimizer.
    Returns:
        model (nn.Module): The trained LSTM model.
    """
    # Use available device
    device = torch.device("mps" if torch.backends.mps.is_available()
                          else "cuda" if torch.cuda.is_available()
                          else "cpu")
    model.to(device)

    # Loss and optimizer
    criterion = torch.nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)

    # Training loop
    for epoch in range(epochs):
        model.train()
        epoch_loss = 0
        correct = 0
        total = 0

        for X_batch, y_batch in dataloader:
            X_batch, y_batch = X_batch.to(device), y_batch.to(device)

            # Forward pass
            logits = model(X_batch)               # (batch_size, num_classes)
            loss = criterion(logits, y_batch)     # y_batch: (batch_size,)

            # Backward and optimize
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            # Metrics
            epoch_loss += loss.item()
            preds = logits.argmax(dim=1)
            correct += (preds == y_batch).sum().item()
            total += y_batch.size(0)

        avg_loss = epoch_loss / len(dataloader)
        accuracy = correct / total
        print(f"Epoch {epoch+1}/{epochs}, Loss: {avg_loss:.4f}, Accuracy: {accuracy:.2%}")

    return model

def evaluate(model, dataloader):
    """
    AirForecast LSTM model evaluation function.
    Args:
        model (nn.Module): The LSTM model to evaluate.
        dataloader (DataLoader): DataLoader for the validation data.
    Returns:
        preds (np.ndarray): Predicted class labels.
        truths (np.ndarray): True labels.
    """
    model.eval()
    preds, truths = [], []

    with torch.no_grad():
        for X_batch, y_batch in dataloader:
            logits = model(X_batch)
            batch_preds = logits.argmax(dim=1)

            preds.append(batch_preds.cpu().numpy())
            truths.append(y_batch.cpu().numpy())

    preds = np.concatenate(preds)
    truths = np.concatenate(truths)
    accuracy = np.mean(preds == truths)

    print(f"Validation Accuracy: {accuracy:.2%}")
    return preds, truths

In [31]:
# Ensure all monitors are included in the validation set using location_index_map
val_indices = []

for location, group in rolling_window_df.groupby('location_name'):
    split_index = int(len(group) * 0.8)
    if split_index < len(group):  # Ensure split_index is within bounds
        val_indices.extend(group.index[split_index:])

val_indices = np.array(val_indices)
train_indices = np.setdiff1d(rolling_window_df.index, val_indices)

train_df = rolling_window_df.loc[train_indices]
val_df = rolling_window_df.loc[val_indices]

# Create TimeSeriesDataset instances for training and validation sets
train_dataset = TimeSeriesDataset(train_df.iloc[:, 1:-1].values, train_df['label'].astype(int).values)
val_dataset = TimeSeriesDataset(val_df.iloc[:, 1:-1].values, val_df['label'].astype(int).values)

# Create DataLoader for training and validation sets
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=64)

# Train the model
model = AirForecastClassifier(input_dim=1, num_classes=5)
trained_model = train_model(model, train_loader, epochs=20)

Epoch 1/20, Loss: 1.4930, Accuracy: 23.44%
Epoch 2/20, Loss: 1.1154, Accuracy: 37.48%
Epoch 3/20, Loss: 1.0976, Accuracy: 37.80%
Epoch 4/20, Loss: 1.0925, Accuracy: 40.09%
Epoch 5/20, Loss: 1.0933, Accuracy: 39.33%
Epoch 6/20, Loss: 1.0963, Accuracy: 36.98%
Epoch 7/20, Loss: 1.0933, Accuracy: 40.09%
Epoch 8/20, Loss: 1.0930, Accuracy: 40.09%
Epoch 9/20, Loss: 1.0914, Accuracy: 40.09%
Epoch 10/20, Loss: 1.0919, Accuracy: 40.09%
Epoch 11/20, Loss: 1.0895, Accuracy: 38.25%
Epoch 12/20, Loss: 1.0948, Accuracy: 40.09%
Epoch 13/20, Loss: 1.0922, Accuracy: 40.09%
Epoch 14/20, Loss: 1.0907, Accuracy: 40.09%
Epoch 15/20, Loss: 1.0939, Accuracy: 37.55%
Epoch 16/20, Loss: 1.0888, Accuracy: 40.09%
Epoch 17/20, Loss: 1.0902, Accuracy: 40.09%
Epoch 18/20, Loss: 1.0895, Accuracy: 40.09%
Epoch 19/20, Loss: 1.0955, Accuracy: 37.36%
Epoch 20/20, Loss: 1.0932, Accuracy: 38.06%


In [32]:
# Evaluate the model by monitor location
# Move the trained model back to CPU
trained_model = trained_model.to('cpu')

# Evaluate the model by monitor location
predictions, truths = evaluate(trained_model, val_loader)

Validation Accuracy: 44.58%


In [33]:
# Find the accuracy by monitor location
location_accuracy = {}
for location_name in rolling_window_df['location_name'].unique():
    location_indices = rolling_window_df[rolling_window_df['location_name'] == location_name].index
    location_preds = predictions[np.isin(val_indices, location_indices)]
    location_truths = truths[np.isin(val_indices, location_indices)]
    accuracy = np.mean(location_preds == location_truths)
    location_accuracy[location_name] = accuracy
    print(f"Location {location_name}: Accuracy = {accuracy:.2%}")


Location Brentwood Park: Accuracy = 33.33%
Location Nora Alvarado Home: Accuracy = 41.38%
Location Evelin Pacheco Home: Accuracy = 41.38%
Location Cypress and Pine Playlot: Accuracy = 88.89%
Location Rise South City Office: Accuracy = 73.33%
Location San Bruno School District Office: Accuracy = 62.07%
Location Parkside Middle: Accuracy = 34.48%
Location Buri Buri Park: Accuracy = 40.74%
Location Clay Ave Park: Accuracy = 44.44%
Location Belle Air School: Accuracy = 31.03%
Location Marita Santos Home: Accuracy = 41.38%
Location Rollingwood Elementary: Accuracy = 24.14%
Location Portola Elementary: Accuracy = 31.03%
Location Gardiner Park: Accuracy = 37.04%


In [34]:
from sklearn.model_selection import StratifiedKFold

# Create stratified k-fold cross-validation sets
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Split the rolling_window_df into k folds
folds = []
for train_index, val_index in skf.split(rolling_window_df, rolling_window_df['label']):
    train_set = rolling_window_df.iloc[train_index].reset_index(drop=True)
    val_set = rolling_window_df.iloc[val_index].reset_index(drop=True)
    
    # Ensure each training and validation set includes all monitors
    if (set(train_set['location_name'].unique()) == set(rolling_window_df['location_name'].unique()) and
        set(val_set['location_name'].unique()) == set(rolling_window_df['location_name'].unique())):
        folds.append((train_set, val_set))

len(folds)

5

In [35]:
# Create XGBClassifier
from sklearn.metrics import accuracy_score

# Create a list to store fold accuracy for each monitor
monitor_fold_accuracies = []

# Train and evaluate the XGBClassifier for each fold
for i, (train_set, val_set) in enumerate(folds):
    # Extract features and labels
    X_train = train_set.iloc[:, 1:-1].values
    y_train = train_set['label'].astype(int).values
    X_val = val_set.iloc[:, 1:-1].values
    y_val = val_set['label'].astype(int).values

    # Create and train the XGBClassifier
    model = xgb.XGBClassifier(random_state=42, eval_metric="mlogloss")
    model.fit(X_train, y_train)

    # Make predictions
    y_pred = model.predict(X_val)

    # Calculate accuracy for each monitor
    for location_name in val_set['location_name'].unique():
        location_indices = val_set[val_set['location_name'] == location_name].index
        location_y_val = y_val[np.isin(val_set.index, location_indices)]
        location_y_pred = y_pred[np.isin(val_set.index, location_indices)]
        location_accuracy = accuracy_score(location_y_val, location_y_pred)

        # Store the results
        monitor_fold_accuracies.append({
            'fold': i + 1,
            'location_name': location_name,
            'accuracy': location_accuracy
        })

# Convert the results to a DataFrame
monitor_fold_accuracies_df = pd.DataFrame(monitor_fold_accuracies)

# Display the DataFrame
monitor_fold_accuracies_df.head()


Unnamed: 0,fold,location_name,accuracy
0,1,Brentwood Park,0.833333
1,1,Nora Alvarado Home,0.933333
2,1,Evelin Pacheco Home,0.84375
3,1,Cypress and Pine Playlot,0.689655
4,1,Rise South City Office,0.703704


In [36]:
# Find average accuracy by fold by monitor location for random forest classifier
clarity_average_accuracy_by_fold = monitor_fold_accuracies_df.groupby(['location_name']).agg({'accuracy': 'mean'}).reset_index()
clarity_average_accuracy_by_fold = clarity_average_accuracy_by_fold.sort_values(by='accuracy', ascending=False)

clarity_average_accuracy_by_fold

Unnamed: 0,location_name,accuracy
8,Nora Alvarado Home,0.919152
2,Buri Buri Park,0.889348
5,Evelin Pacheco Home,0.886399
6,Gardiner Park,0.871612
0,Belle Air School,0.846114
1,Brentwood Park,0.840602
7,Marita Santos Home,0.837238
12,Rollingwood Elementary,0.833753
3,Clay Ave Park,0.784591
10,Portola Elementary,0.754103


In [37]:
clarity_average_accuracy_by_fold.shape

(14, 2)

### **Experiment: Purple Air Monitors**

In [38]:
purple_df = pd.read_csv('../data/purpleair_2024-03-01.csv')
purple_df.head()

Unnamed: 0,time,location_name,location_id,latitude,longitude,pm2_5_24h_mean,pm2_5_24h_mean_aqi,temp,rh,pressure
0,2024-02-29 08:00:00,Grundy Park,21427,37.622585,-122.42097,0.21,1.0,60.976,61.451,1012.084
1,2024-02-29 08:00:00,Shelter Crik,113144,37.62002,-122.42762,0.0,0.0,58.988,60.765,1008.262
2,2024-02-29 08:00:00,terra,109718,37.669968,-122.42153,0.89,4.0,61.047,59.685,1005.107
3,2024-02-29 08:00:00,Belle Air,111235,37.631878,-122.409966,1.16,6.0,,,
4,2024-02-29 08:00:00,Crestmoor III,111498,37.616806,-122.431,1.13,6.0,58.963,64.513,1006.47


In [39]:
# Visualize the monitor locations on a map
fig = px.scatter_mapbox(
    purple_df,
    lat='latitude',
    lon='longitude',
    hover_name='location_id',
    hover_data=['pm2_5_24h_mean'],
    size_max=15,
    zoom=10,
    mapbox_style="carto-positron",
    title="PurpleAir Monitor Locations"
)

fig.show()

In [40]:
purple_filt_df = purple_df[['location_id', 'location_name', 'time', 'pm2_5_24h_mean']]
purple_filt_df.head()

Unnamed: 0,location_id,location_name,time,pm2_5_24h_mean
0,21427,Grundy Park,2024-02-29 08:00:00,0.21
1,113144,Shelter Crik,2024-02-29 08:00:00,0.0
2,109718,terra,2024-02-29 08:00:00,0.89
3,111235,Belle Air,2024-02-29 08:00:00,1.16
4,111498,Crestmoor III,2024-02-29 08:00:00,1.13


In [41]:
# Identify outliers using IQR method
purple_filt_df["is_outlier"] = False
q1 = purple_filt_df['pm2_5_24h_mean'].quantile(0.25)
q3 = purple_filt_df['pm2_5_24h_mean'].quantile(0.75)
iqr = q3 - q1
lower_bound = q1 - 1.5 * iqr
upper_bound = q3 + 1.5 * iqr
purple_filt_df.loc[(purple_filt_df['pm2_5_24h_mean'] < lower_bound) | (purple_filt_df['pm2_5_24h_mean'] > upper_bound), 'is_outlier'] = True
purple_filt_df['is_outlier'].value_counts()



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



is_outlier
False    9259
True      715
Name: count, dtype: int64

In [42]:
# View upper_bound and lower_bound
lower_bound, upper_bound

(-5.740000000000001, 14.82)

In [43]:
# Visualize all PM2.5 values less than 300
px.histogram(purple_filt_df, 
             x='pm2_5_24h_mean', title='Distribution of PM2.5 24hr Mean Values', nbins=200).show()

In [44]:
# Find average PM2.5 values by location
purple_filt_df_grouped = purple_filt_df.groupby(['location_name']).agg({'pm2_5_24h_mean': 'mean'}).reset_index()
purple_filt_df_grouped = purple_filt_df_grouped.sort_values(by='pm2_5_24h_mean', ascending=False)
purple_filt_df_grouped.head(10)

Unnamed: 0,location_name,pm2_5_24h_mean
26,terra,474.043948
15,SSF Cypress,375.315701
24,emalita,14.084762
11,Rise-10,10.222347
12,Rise-8,7.720414
5,Danger Stairs,7.083248
20,South City Lights,6.712126
23,Valleyview &amp; Appian Way,6.554487
22,"Sunshine Gardens, SSF",6.283392
16,San Bruno,6.100157


In [45]:
# Cap the PM2.5 values at 500
purple_filt_df.loc[purple_filt_df['pm2_5_24h_mean'] > 275, 'pm2_5_24h_mean'] = 275

# Visualize the new distribtion of PM2.5 values
px.histogram(purple_filt_df,
             x='pm2_5_24h_mean', title='Distribution of PM2.5 24hr Mean Values', nbins=200).show()

In [46]:
# Apply log transformation to the pm2_5_24h_mean values
purple_filt_df['pm2_5_24h_mean_log'] = np.log1p(purple_filt_df['pm2_5_24h_mean'])

# Visualize the distribution of log PM2.5 values
px.histogram(purple_filt_df, 
             x='pm2_5_24h_mean_log', title='Distribution of Log PM2.5 24hr Mean Values', nbins=35).show()



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [47]:
# Make bins for the log PM2.5 values using quantiles
# bins = pd.qcut(purple_filt_df['pm2_5_24h_mean_log'], q=5)

# # Get the edges of the bins and transform them back to PM2.5 values
# edges = bins.cat.categories
# edges = [(edges[i].left, edges[i].right) for i in range(len(edges))]
# edges = [(edges[i][0].round(2), edges[i][1].round(2)) for i in range(len(edges))]
# edges = [(np.expm1(edges[i][0]), np.expm1(edges[i][1])) for i in range(len(edges))]

# edges

In [48]:
# Create final edges for the bins
final_edges = [(0, 4.50), (4.51, 9.00), (9.01, 35.40), (35.41, 55.40), (55.41, 125.40), (125.41, 225.40), (255.41, 500.00)]

# Create bins for the log pm2_5_24h_mean values and get edges of the bins from final_edges above
log_bins = np.log1p([x[0] for x in final_edges] + [final_edges[-1][1]])
log_edges = [(log_bins[i], log_bins[i+1]) for i in range(len(log_bins)-1)]
log_edges = [(log_edges[i][0].round(2), log_edges[i][1].round(2)) for i in range(len(log_edges))]

# Create a new column for the bins
purple_filt_df['pm2_5_24h_mean_log_bins'] = pd.cut(purple_filt_df['pm2_5_24h_mean_log'], 
                                                      bins=[x[0] for x in log_edges] + [log_edges[-1][1]], labels=[f"{i}" for i in range(len(final_edges))], 
                                                      include_lowest=True)
purple_filt_df.head()



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



Unnamed: 0,location_id,location_name,time,pm2_5_24h_mean,is_outlier,pm2_5_24h_mean_log,pm2_5_24h_mean_log_bins
0,21427,Grundy Park,2024-02-29 08:00:00,0.21,False,0.19062,0
1,113144,Shelter Crik,2024-02-29 08:00:00,0.0,False,0.0,0
2,109718,terra,2024-02-29 08:00:00,0.89,False,0.636577,0
3,111235,Belle Air,2024-02-29 08:00:00,1.16,False,0.770108,0
4,111498,Crestmoor III,2024-02-29 08:00:00,1.13,False,0.756122,0


In [49]:
# Distribution of the bins using proportions
bins_distribution = purple_filt_df['pm2_5_24h_mean_log_bins'].value_counts(normalize=True).sort_index()
bins_distribution = bins_distribution.reset_index()
bins_distribution.columns = ['pm2_5_24h_mean_log_bins', 'proportion']

# Create bar plot for the bins distribution
fig = px.bar(bins_distribution, x='pm2_5_24h_mean_log_bins', y='proportion', 
             title='Distribution of PM2.5 24hr Mean Values by Bins', 
             labels={'pm2_5_24h_mean_log_bins': 'PM2.5 24hr Log Mean Bins', 'proportion': 'Proportion'},
             color='proportion', color_continuous_scale=px.colors.sequential.Plasma)
fig.show()

In [50]:
# Create a new dataframe with rolling window of 7 days and the next day's bin as label
window_size = 7

# Group by location_id to ensure the rolling window is applied per location
data = []
for location_id, group in purple_filt_df.groupby('location_id'):
    group = group.sort_values('time').reset_index(drop=True)
    
    for i in range(len(group) - window_size):
        # Extract the 7-day window of pm2_5_24h_mean
        features = group.loc[i:i+window_size-1, 'pm2_5_24h_mean_log'].values.tolist()
        # Extract the next day's bin as the label
        label = group.loc[i+window_size, 'pm2_5_24h_mean_log_bins']
        # Include location_id and location_name
        location_id_value = group.loc[i, 'location_id']
        location_name_value = group.loc[i, 'location_name']
        data.append([location_id_value, location_name_value] + features + [label])

# Create the final dataframe
columns = ['location_id', 'location_name'] + [f'feature_day_{i+1}' for i in range(window_size)] + ['label']
rolling_window_df = pd.DataFrame(data, columns=columns)

rolling_window_df.head()

Unnamed: 0,location_id,location_name,feature_day_1,feature_day_2,feature_day_3,feature_day_4,feature_day_5,feature_day_6,feature_day_7,label
0,21427,Grundy Park,0.19062,0.300105,0.239017,0.350657,0.350657,0.631272,0.548121,0
1,21427,Grundy Park,0.300105,0.239017,0.350657,0.350657,0.631272,0.548121,1.088562,0
2,21427,Grundy Park,0.239017,0.350657,0.350657,0.631272,0.548121,1.088562,0.392042,0
3,21427,Grundy Park,0.350657,0.350657,0.631272,0.548121,1.088562,0.392042,0.405465,0
4,21427,Grundy Park,0.350657,0.631272,0.548121,1.088562,0.392042,0.405465,0.688135,0


In [51]:
# One hot encode by location_id
rolling_window_df = pd.get_dummies(rolling_window_df, columns=['location_id'], prefix='location_id')

# Convert boolean columns to integers
bool_cols = rolling_window_df.select_dtypes(include='bool').columns
rolling_window_df[bool_cols] = rolling_window_df[bool_cols].astype(int)

# Move label column to the end
label_col = rolling_window_df.pop('label')
rolling_window_df['label'] = label_col
rolling_window_df.head()

Unnamed: 0,location_name,feature_day_1,feature_day_2,feature_day_3,feature_day_4,feature_day_5,feature_day_6,feature_day_7,location_id_21427,location_id_38589,...,location_id_113144,location_id_119179,location_id_120937,location_id_144654,location_id_158239,location_id_158259,location_id_160983,location_id_169967,location_id_177521,label
0,Grundy Park,0.19062,0.300105,0.239017,0.350657,0.350657,0.631272,0.548121,1,0,...,0,0,0,0,0,0,0,0,0,0
1,Grundy Park,0.300105,0.239017,0.350657,0.350657,0.631272,0.548121,1.088562,1,0,...,0,0,0,0,0,0,0,0,0,0
2,Grundy Park,0.239017,0.350657,0.350657,0.631272,0.548121,1.088562,0.392042,1,0,...,0,0,0,0,0,0,0,0,0,0
3,Grundy Park,0.350657,0.350657,0.631272,0.548121,1.088562,0.392042,0.405465,1,0,...,0,0,0,0,0,0,0,0,0,0
4,Grundy Park,0.350657,0.631272,0.548121,1.088562,0.392042,0.405465,0.688135,1,0,...,0,0,0,0,0,0,0,0,0,0


In [52]:
# Ensure all monitors are included in the validation set using location_index_map
val_indices = []

for location, group in rolling_window_df.groupby('location_name'):
    split_index = int(len(group) * 0.8)
    if split_index < len(group):  # Ensure split_index is within bounds
        val_indices.extend(group.index[split_index:])

val_indices = np.array(val_indices)
train_indices = np.setdiff1d(rolling_window_df.index, val_indices)

train_df = rolling_window_df.loc[train_indices]
val_df = rolling_window_df.loc[val_indices]

# Create TimeSeriesDataset instances for training and validation sets
train_dataset = TimeSeriesDataset(train_df.iloc[:, 1:-1].values, train_df['label'].astype(int).values)
val_dataset = TimeSeriesDataset(val_df.iloc[:, 1:-1].values, val_df['label'].astype(int).values)

# Create DataLoader for training and validation sets
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=64)

# Train the model
model = AirForecastClassifier(input_dim=1, num_classes=7)
trained_model = train_model(model, train_loader, epochs=20)

Epoch 1/20, Loss: 1.2699, Accuracy: 47.12%
Epoch 2/20, Loss: 1.1595, Accuracy: 50.31%
Epoch 3/20, Loss: 1.1621, Accuracy: 50.31%
Epoch 4/20, Loss: 1.1634, Accuracy: 50.31%
Epoch 5/20, Loss: 1.1619, Accuracy: 50.31%
Epoch 6/20, Loss: 1.1604, Accuracy: 50.31%
Epoch 7/20, Loss: 1.1669, Accuracy: 50.31%
Epoch 8/20, Loss: 1.1662, Accuracy: 50.31%
Epoch 9/20, Loss: 1.1616, Accuracy: 50.31%
Epoch 10/20, Loss: 1.1594, Accuracy: 50.31%
Epoch 11/20, Loss: 1.1584, Accuracy: 50.31%
Epoch 12/20, Loss: 1.1011, Accuracy: 51.15%
Epoch 13/20, Loss: 0.9239, Accuracy: 57.92%
Epoch 14/20, Loss: 0.8627, Accuracy: 61.57%
Epoch 15/20, Loss: 0.8221, Accuracy: 63.45%
Epoch 16/20, Loss: 0.7933, Accuracy: 65.24%
Epoch 17/20, Loss: 0.8070, Accuracy: 64.72%
Epoch 18/20, Loss: 0.7871, Accuracy: 65.78%
Epoch 19/20, Loss: 0.7857, Accuracy: 65.98%
Epoch 20/20, Loss: 0.7835, Accuracy: 66.27%


In [53]:
# Evaluate the model by monitor location
# Move the trained model back to CPU
trained_model = trained_model.to('cpu')

# Evaluate the model by monitor location
predictions, truths = evaluate(trained_model, val_loader)

Validation Accuracy: 70.17%


In [54]:
# Find the accuracy by monitor location
location_accuracy = {}
for location_name in rolling_window_df['location_name'].unique():
    location_indices = rolling_window_df[rolling_window_df['location_name'] == location_name].index
    location_preds = predictions[np.isin(val_indices, location_indices)]
    location_truths = truths[np.isin(val_indices, location_indices)]
    accuracy = np.mean(location_preds == location_truths)
    location_accuracy[location_name] = accuracy
    print(f"Location {location_name}: Accuracy = {accuracy:.2%}")


Location Grundy Park: Accuracy = 82.35%
Location Sign Hill: Accuracy = 72.94%
Location South San Francisco Westborough Park: Accuracy = 68.24%
Location Commodore Dr: Accuracy = 69.41%
Location Danger Stairs: Accuracy = 43.53%
Location Acacia And Crystal Springs: Accuracy = 77.11%
Location Sunshine Gardens, SSF: Accuracy = 69.23%
Location Rollingwood / Palmhaus: Accuracy = 69.41%
Location Home: Accuracy = 61.25%
Location South City Lights: Accuracy = 69.41%
Location B9 North AHU-5012: Accuracy = 74.55%
Location Rollingwood Elementary: Accuracy = 69.41%
Location Parkside Intermediate School: Accuracy = 72.94%
Location San Bruno: Accuracy = 54.17%
Location Sign Hill, Stonegate: Accuracy = 55.26%
Location terra: Accuracy = 75.47%
Location Belle Air: Accuracy = 59.52%
Location Crestmoor III: Accuracy = 70.24%
Location Shelter Crik: Accuracy = 96.20%
Location r": Accuracy = 96.47%
Location Elm Court: Accuracy = 91.53%
Location Valleyview &amp; Appian Way: Accuracy = 61.19%
Location Rise-8: A

In [55]:
# Create k-fold cross-validation sets
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Split the rolling_window_df into k folds
folds = []
for train_index, val_index in skf.split(rolling_window_df, rolling_window_df['label']):
    train_set = rolling_window_df.iloc[train_index].reset_index(drop=True)
    val_set = rolling_window_df.iloc[val_index].reset_index(drop=True)
    
    # Ensure each training and validation set includes all monitors
    if (set(train_set['location_name'].unique()) == set(rolling_window_df['location_name'].unique()) and
        set(val_set['location_name'].unique()) == set(rolling_window_df['location_name'].unique())):
        folds.append((train_set, val_set))

len(folds)

5

In [56]:
# Create a list to store fold accuracy for each monitor
monitor_fold_accuracies = []

# Train and evaluate the Decision Tree Classifier for each fold
for i, (train_set, val_set) in enumerate(folds):
    # Extract features and labels
    X_train = train_set.iloc[:, 2:-1].values
    y_train = train_set['label'].astype(int).values
    X_val = val_set.iloc[:, 2:-1].values
    y_val = val_set['label'].astype(int).values

    # Create and train the Decision Tree Classifier
    clf = xgb.XGBClassifier(random_state=42, eval_metric="mlogloss")
    clf.fit(X_train, y_train)

    # Make predictions
    y_pred = clf.predict(X_val)

    # Calculate accuracy for each monitor
    for location_name in val_set['location_name'].unique():
        location_indices = val_set[val_set['location_name'] == location_name].index
        location_y_val = y_val[np.isin(val_set.index, location_indices)]
        location_y_pred = y_pred[np.isin(val_set.index, location_indices)]
        location_accuracy = accuracy_score(location_y_val, location_y_pred)

        # Store the results
        monitor_fold_accuracies.append({
            'fold': i + 1,
            'location_name': location_name,
            'accuracy': location_accuracy
        })

# Convert the results to a DataFrame
monitor_fold_accuracies_df = pd.DataFrame(monitor_fold_accuracies)

# Display the DataFrame
monitor_fold_accuracies_df.head()

Unnamed: 0,fold,location_name,accuracy
0,1,Grundy Park,0.772727
1,1,Sign Hill,0.757895
2,1,South San Francisco Westborough Park,0.783133
3,1,Commodore Dr,0.811765
4,1,Danger Stairs,0.6


In [57]:
purple_average_accuracy_by_fold = monitor_fold_accuracies_df.groupby(['location_name']).agg({'accuracy': 'mean'}).reset_index()
purple_average_accuracy_by_fold = purple_average_accuracy_by_fold.sort_values(by='accuracy', ascending=False)

purple_average_accuracy_by_fold

Unnamed: 0,location_name,accuracy
25,"r""",0.900592
17,Shelter Crik,0.878641
14,Rollingwood Elementary,0.83249
26,terra,0.821105
6,Elm Court,0.81926
13,Rollingwood / Palmhaus,0.814113
3,Commodore Dr,0.813617
1,B9 North AHU-5012,0.812799
22,"Sunshine Gardens, SSF",0.806491
19,"Sign Hill, Stonegate",0.805642


In [58]:
# Combine monitor accuracy for both models
combined_accuracy = pd.concat([clarity_average_accuracy_by_fold, purple_average_accuracy_by_fold])
combined_accuracy.head()

Unnamed: 0,location_name,accuracy
8,Nora Alvarado Home,0.919152
2,Buri Buri Park,0.889348
5,Evelin Pacheco Home,0.886399
6,Gardiner Park,0.871612
0,Belle Air School,0.846114


In [59]:
combined_accuracy.shape

(41, 2)

In [60]:
# Add location_id to the combined accuracy
combined_accuracy = combined_accuracy.merge(
    pd.concat([clarity_df[['location_name', 'location_id']], purple_df[['location_name', 'location_id']]]).drop_duplicates(),
    on='location_name',
    how='left'
)

# Drop duplicate location_id
combined_accuracy = combined_accuracy.drop_duplicates(subset=['location_id'])
combined_accuracy = combined_accuracy.sort_values(by='accuracy', ascending=False)
combined_accuracy.head(10)


Unnamed: 0,location_name,accuracy,location_id
0,Nora Alvarado Home,0.919152,DETMG3939
15,"r""",0.900592,119179
1,Buri Buri Park,0.889348,DMEYT2138
2,Evelin Pacheco Home,0.886399,DEVPF7186
16,Shelter Crik,0.878641,113144
3,Gardiner Park,0.871612,DVRGV9737
4,Belle Air School,0.846114,DRCAC7970
5,Brentwood Park,0.840602,DCVIM2201
6,Marita Santos Home,0.837238,DRYLF3821
7,Rollingwood Elementary,0.833753,DTMSK2119


In [61]:
combined_accuracy.shape

(41, 3)

In [62]:
from geopy.distance import geodesic

# Compute the haversine distance between two points on the Earth in miles
def haversine_distance(point1, point2):
    """
    Calculate the haversine distance between two points on the Earth in miles.
    Args:
        point1 (tuple): Latitude and longitude of the first point (lat1, lon1).
        point2 (tuple): Latitude and longitude of the second point (lat2, lon2).
    Returns:
        float: Distance between the two points in miles.
    """
    return geodesic(point1, point2).miles

# Add latitude and longitude of the monitors to the combined accuracy dataframe
combined_accuracy = combined_accuracy.merge(
    pd.concat([clarity_df[['location_name', 'latitude', 'longitude']], purple_df[['location_name', 'latitude', 'longitude']]]).drop_duplicates(),
    on='location_name',
    how='left'
)

# Drop duplicate location_id
combined_accuracy = combined_accuracy.drop_duplicates(subset=['location_id'])

# Get tuples of latitude and longitude for all monitor locations
location_coords = {}
for _, row in combined_accuracy.iterrows():
    location_coords[row['location_id']] = (row['latitude'], row['longitude'])

# Calculate the distance between each pair of monitors
distances = {}
for i, (loc1, coords1) in enumerate(location_coords.items()):
    for j, (loc2, coords2) in enumerate(location_coords.items()):
        if i < j:  # Avoid duplicate pairs
            distance = haversine_distance(coords1, coords2)
            distances[(loc1, loc2)] = distance
            distances[(loc2, loc1)] = distance

# Create a DataFrame from the distances dictionary
distances_df = pd.DataFrame.from_dict(distances, orient='index', columns=['distance'])
distances_df.reset_index(inplace=True)
distances_df[['location_id_1', 'location_id_2']] = pd.DataFrame(distances_df['index'].tolist(), index=distances_df.index)
distances_df.drop(columns=['index'], inplace=True)

# Add the longitude and latitude of the monitors to the distances dataframe
distances_df = distances_df.merge(
    combined_accuracy[['location_id', 'latitude', 'longitude']],
    left_on='location_id_1',
    right_on='location_id',
    how='left'
).rename(columns={'latitude': 'latitude_1', 'longitude': 'longitude_1'})

distances_df = distances_df.merge(
    combined_accuracy[['location_id', 'latitude', 'longitude']],
    left_on='location_id_2',
    right_on='location_id',
    how='left'
).rename(columns={'latitude': 'latitude_2', 'longitude': 'longitude_2'})
distances_df.drop(columns=['location_id_x', 'location_id_y'], inplace=True)
distances_df.head()

Unnamed: 0,distance,location_id_1,location_id_2,latitude_1,longitude_1,latitude_2,longitude_2
0,3.083857,DETMG3939,119179,37.65875,-122.41055,37.632942,-122.456474
1,3.083857,119179,DETMG3939,37.632942,-122.456474,37.65875,-122.41055
2,1.728073,DETMG3939,DMEYT2138,37.65875,-122.41055,37.65214,-122.44095
3,1.728073,DMEYT2138,DETMG3939,37.65214,-122.44095,37.65875,-122.41055
4,0.474254,DETMG3939,DEVPF7186,37.65875,-122.41055,37.65721,-122.41898


In [63]:
distances_df.shape

(1640, 7)

In [64]:
# Find distribution of distances
px.histogram(distances_df, x='distance', title='Distribution of Distances Between Monitors', nbins=25).show()

In [65]:
# Function to find the 5 closest monitors for a given monitor
def get_closest_monitors(location_id, distances_df, n=5):
    """
    Get the n closest monitors for a given monitor.
    Args:
        location_id (str): The location ID of the monitor.
        distances_df (DataFrame): DataFrame containing distances between monitors.
        n (int): Number of closest monitors to find.
    Returns:
        list: List of closest monitor IDs.
    """
    closest_monitors = distances_df[distances_df['location_id_1'] == location_id].nsmallest(n, 'distance')
    return closest_monitors['location_id_2'].tolist()

# Create a dictionary to store the 5 closest monitors for each monitor
closest_monitors_dict = {}
for location_id in combined_accuracy['location_id'].unique():
    closest_monitors = get_closest_monitors(location_id, distances_df)
    closest_monitors_dict[location_id] = closest_monitors

closest_monitors_dict

{'DETMG3939': ['DHPSP8686', 'DRYLF3821', 'DHSHV3008', 160983, 120937],
 119179: ['DUBTA4581', 70569, 'DTMSK2119', 90215, 60171],
 'DMEYT2138': [158239, 158259, 144654, 78387, 'DCVIM2201'],
 'DEVPF7186': [120937, 'DHSHV3008', 'DETMG3939', 38589, 109506],
 113144: [111498, 91617, 21427, 'DJTYV8538', 'DJGNN5114'],
 'DVRGV9737': [160983, 'DHPSP8686', 'DRYLF3821', 'DETMG3939', 38589],
 'DRCAC7970': [111235, 'DJGNN5114', 67419, 169967, 21427],
 'DCVIM2201': [158259, 65711, 144654, 'DTMSK2119', 90215],
 'DRYLF3821': ['DHPSP8686', 160983, 'DETMG3939', 'DHSHV3008', 'DVRGV9737'],
 'DTMSK2119': [90215, 70569, 144654, 'DUBTA4581', 119179],
 90215: ['DTMSK2119', 70569, 144654, 'DUBTA4581', 119179],
 109718: [109506, 38589, 120937, 'DEVPF7186', 69403],
 120937: ['DEVPF7186', 'DHSHV3008', 'DETMG3939', 38589, 109506],
 70569: ['DTMSK2119', 90215, 'DUBTA4581', 177521, 119179],
 65711: [94137, 'DCVIM2201', 111235, 21427, 'DJGNN5114'],
 86761: ['DVRGV9737', 160983, 'DHPSP8686', 'DRYLF3821', 'DETMG3939'],

In [66]:
purple_filt_df['date'] = pd.to_datetime(purple_filt_df['time']).dt.date
purple_filt_df.drop(columns=['time', 'is_outlier', 'pm2_5_24h_mean_log_bins'], inplace=True)
purple_filt_df.head()



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



Unnamed: 0,location_id,location_name,pm2_5_24h_mean,pm2_5_24h_mean_log,date
0,21427,Grundy Park,0.21,0.19062,2024-02-29
1,113144,Shelter Crik,0.0,0.0,2024-02-29
2,109718,terra,0.89,0.636577,2024-02-29
3,111235,Belle Air,1.16,0.770108,2024-02-29
4,111498,Crestmoor III,1.13,0.756122,2024-02-29


In [67]:
unique_dates_filtered.drop(columns=['time', 'pm2_5_24h_mean_log_bins'], inplace=True)
unique_dates_filtered.head()



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



Unnamed: 0,location_id,location_name,date,pm2_5_24h_mean,pm2_5_24h_mean_log
33,DRCAC7970,Belle Air School,2024-10-31,3.94,1.597365
150,DRCAC7970,Belle Air School,2024-11-01,4.89,1.773256
361,DRCAC7970,Belle Air School,2024-11-02,4.02,1.61343
574,DRCAC7970,Belle Air School,2024-11-03,4.34,1.675226
803,DRCAC7970,Belle Air School,2024-11-04,5.06,1.80171


In [68]:
# Combine the filtered PurpleAir dataframe with the unique dates filtered dataframe
combined_filter_df = pd.concat([purple_filt_df, unique_dates_filtered], ignore_index=True)
combined_filter_df.head()

Unnamed: 0,location_id,location_name,pm2_5_24h_mean,pm2_5_24h_mean_log,date
0,21427,Grundy Park,0.21,0.19062,2024-02-29
1,113144,Shelter Crik,0.0,0.0,2024-02-29
2,109718,terra,0.89,0.636577,2024-02-29
3,111235,Belle Air,1.16,0.770108,2024-02-29
4,111498,Crestmoor III,1.13,0.756122,2024-02-29


In [69]:
from sklearn.linear_model import LinearRegression

monitor_nearby_accuracy = []

# If monitor does not have enough data points, take the average of the closest monitors
no_data_monitors = []

for location_id, nearby_monitors in list(closest_monitors_dict.items()):  # Iterate through one loop for testing
    monitor_data = combined_filter_df[combined_filter_df['location_id'] == location_id]

    # Add individual measures for nearby monitors
    for i, nearby_monitor in enumerate(nearby_monitors):  # Limit to 5 nearby monitors
        nearby_monitor_data = combined_filter_df[combined_filter_df['location_id'] == nearby_monitor]
        monitor_data = monitor_data.merge(
            nearby_monitor_data[['date', 'pm2_5_24h_mean_log', 'location_id']],
            on='date',
            how='left',
            suffixes=('', f'_nearby_{i+1}')
        )

        # Add distance weights for each nearby monitor
        distance = distances_df[(distances_df['location_id_1'] == location_id) & (distances_df['location_id_2'] == nearby_monitor)]['distance'].values[0]
        if distance == 0:
            distance = 1e-6  # Avoid division by zero by using a small value
        monitor_data[f'pm2_5_24h_mean_log_nearby_{i+1}'] *= 1 / distance

    # Drop rows with NaN values
    monitor_data.dropna(inplace=True)

    # Make sure there are enough data points to fit a model
    if len(monitor_data) < 10:
        no_data_monitors.append(location_id)
        continue
    
    # Fit a linear regression model for monitors with enough data points
    # Keep only the columns with pm2_5_24h_mean_log for nearby monitors
    feature_columns = [col for col in monitor_data.columns if 'pm2_5_24h_mean_log_nearby' in col]

    # Prepare the features (X) and target (y)
    X = monitor_data[feature_columns]
    y = monitor_data['pm2_5_24h_mean_log']

    # Fit the linear regression model
    model = LinearRegression()
    model.fit(X, y)

    # Store the accuracy of the model
    accuracy = model.score(X, y)
    monitor_nearby_accuracy.append({'location_id': location_id, 'r_squared': accuracy})

In [70]:
monitor_nearby_accuracy_df = pd.DataFrame(monitor_nearby_accuracy)
monitor_nearby_accuracy_df = monitor_nearby_accuracy_df.sort_values(by='r_squared', ascending=False)
monitor_nearby_accuracy_df

Unnamed: 0,location_id,r_squared
8,90215,0.992492
12,60171,0.990581
11,144654,0.990095
9,70569,0.989851
2,DMEYT2138,0.987588
13,21427,0.985685
6,DRYLF3821,0.98541
24,DJGNN5114,0.985092
0,DETMG3939,0.980483
21,158259,0.979786


In [71]:
# For monitors with no data, take the average of the closest monitors with r-squared value
for location_id in no_data_monitors:
    nearby_monitors = closest_monitors_dict[location_id]
    nearby_accuracies = monitor_nearby_accuracy_df[monitor_nearby_accuracy_df['location_id'].isin(nearby_monitors)]
    
    if not nearby_accuracies.empty:
        avg_r_squared = nearby_accuracies['r_squared'].mean()
        monitor_nearby_accuracy_df.loc[len(monitor_nearby_accuracy_df)] = [f'{location_id}', avg_r_squared]

# Sort the DataFrame by r_squared
monitor_nearby_accuracy_df = monitor_nearby_accuracy_df.sort_values(by='r_squared', ascending=False)
monitor_nearby_accuracy_df

Unnamed: 0,location_id,r_squared
8,90215,0.992492
12,60171,0.990581
11,144654,0.990095
9,70569,0.989851
2,DMEYT2138,0.987588
13,21427,0.985685
6,DRYLF3821,0.98541
24,DJGNN5114,0.985092
35,65711,0.982831
40,94137,0.982831


In [72]:
monitor_nearby_accuracy_df.shape

(41, 2)

In [73]:
# Standardize the r2 values and the self monitor accuracy values and put it to 0 to 1 scale
monitor_nearby_accuracy_df['r_squared'] = (monitor_nearby_accuracy_df['r_squared'] - monitor_nearby_accuracy_df['r_squared'].mean()) / monitor_nearby_accuracy_df['r_squared'].std()
monitor_nearby_accuracy_df['r_squared'] = monitor_nearby_accuracy_df['r_squared'].round(2)
monitor_nearby_accuracy_df

Unnamed: 0,location_id,r_squared
8,90215,0.49
12,60171,0.47
11,144654,0.47
9,70569,0.46
2,DMEYT2138,0.45
13,21427,0.43
6,DRYLF3821,0.43
24,DJGNN5114,0.43
35,65711,0.41
40,94137,0.41


In [74]:
# Standardize the combined accuracy values
combined_accuracy['accuracy'] = (combined_accuracy['accuracy'] - combined_accuracy['accuracy'].mean()) / combined_accuracy['accuracy'].std()
combined_accuracy['accuracy'] = combined_accuracy['accuracy'].round(2)
combined_accuracy.sort_values(by='accuracy', ascending=False)
combined_accuracy.head(10)

Unnamed: 0,location_name,accuracy,location_id,latitude,longitude
0,Nora Alvarado Home,1.64,DETMG3939,37.65875,-122.41055
1,"r""",1.43,119179,37.632942,-122.456474
2,Buri Buri Park,1.31,DMEYT2138,37.65214,-122.44095
3,Evelin Pacheco Home,1.27,DEVPF7186,37.65721,-122.41898
4,Shelter Crik,1.18,113144,37.62002,-122.42762
5,Gardiner Park,1.1,DVRGV9737,37.66409,-122.40165
6,Belle Air School,0.82,DRCAC7970,37.62441,-122.40469
7,Brentwood Park,0.75,DCVIM2201,37.63757,-122.4327
8,Marita Santos Home,0.71,DRYLF3821,37.65783,-122.40707
9,Rollingwood Elementary,0.68,DTMSK2119,37.63092,-122.44371


In [86]:
# Make sure location_id is in the same format
monitor_nearby_accuracy_df['location_id'] = monitor_nearby_accuracy_df['location_id'].astype(str).str.strip()
combined_accuracy['location_id'] = combined_accuracy['location_id'].astype(str).str.strip()

# Join the two dataframes on location_id
combined_scores = combined_accuracy.merge(
    monitor_nearby_accuracy_df[['location_id', 'r_squared']],
    on='location_id',
    how='left'
)

combined_scores = combined_scores.rename(columns={'accuracy': 'self_accuracy', 'r_squared': 'nearby_accuracy'})
combined_scores

Unnamed: 0,location_name,self_accuracy,location_id,latitude,longitude,nearby_accuracy
0,Nora Alvarado Home,1.64,DETMG3939,37.65875,-122.41055,0.39
1,"r""",1.43,119179,37.632942,-122.456474,-0.21
2,Buri Buri Park,1.31,DMEYT2138,37.65214,-122.44095,0.45
3,Evelin Pacheco Home,1.27,DEVPF7186,37.65721,-122.41898,0.12
4,Shelter Crik,1.18,113144,37.62002,-122.42762,-0.05
5,Gardiner Park,1.1,DVRGV9737,37.66409,-122.40165,0.37
6,Belle Air School,0.82,DRCAC7970,37.62441,-122.40469,0.36
7,Brentwood Park,0.75,DCVIM2201,37.63757,-122.4327,0.37
8,Marita Santos Home,0.71,DRYLF3821,37.65783,-122.40707,0.43
9,Rollingwood Elementary,0.68,DTMSK2119,37.63092,-122.44371,0.3


In [87]:
combined_scores.shape

(41, 6)

In [88]:
# Graph self_accuracy vs nearby_accuracy
px.scatter(combined_scores, x='self_accuracy', y='nearby_accuracy', 
           title='Self Accuracy vs Nearby Accuracy',
           labels={'self_accuracy': 'Self Monitor Accuracy', 'nearby_accuracy': 'Nearby Monitor Accuracy'},
           color='location_name').show()

In [None]:
# Normalize the accuracy values to be between 0 and 1
combined_scores['self_accuracy'] = (combined_scores['self_accuracy'] - combined_scores['self_accuracy'].min()) / (combined_scores['self_accuracy'].max() - combined_scores['self_accuracy'].min())
combined_scores['nearby_accuracy'] = (combined_scores['nearby_accuracy'] - combined_scores['nearby_accuracy'].min()) / (combined_scores['nearby_accuracy'].max() - combined_scores['nearby_accuracy'].min())
combined_scores['self_accuracy'] = combined_scores['self_accuracy'].round(2)
combined_scores['nearby_accuracy'] = combined_scores['nearby_accuracy'].round(2)

# Perform Harmonic Mean on the two accuracy values
combined_scores['predictability_index'] = 2 * (combined_scores['self_accuracy'] * combined_scores['nearby_accuracy']) / (combined_scores['self_accuracy'] + combined_scores['nearby_accuracy']) * 100
combined_scores['predictability_index'] = combined_scores['predictability_index'].round(2)
combined_scores = combined_scores.sort_values(by='predictability_index', ascending=False)
combined_scores.head()

Unnamed: 0,location_name,self_accuracy,location_id,latitude,longitude,nearby_accuracy,predictability_index
0,Nora Alvarado Home,1.0,DETMG3939,37.65875,-122.41055,0.98,98.99
2,Buri Buri Park,0.93,DMEYT2138,37.65214,-122.44095,0.99,95.91
5,Gardiner Park,0.88,DVRGV9737,37.66409,-122.40165,0.98,92.73
3,Evelin Pacheco Home,0.92,DEVPF7186,37.65721,-122.41898,0.92,92.0
1,"r""",0.95,119179,37.632942,-122.456474,0.86,90.28
4,Shelter Crik,0.9,113144,37.62002,-122.42762,0.89,89.5
6,Belle Air School,0.82,DRCAC7970,37.62441,-122.40469,0.97,88.87
10,Rollingwood Elementary,0.79,90215,37.63092,-122.44371,1.0,88.27
7,Brentwood Park,0.8,DCVIM2201,37.63757,-122.4327,0.98,88.09
8,Marita Santos Home,0.79,DRYLF3821,37.65783,-122.40707,0.99,87.88


In [111]:
# Save the combined scores to a CSV file
combined_scores[['location_id', 'location_name', 'longitude', 'latitude', 'predictability_index']].to_csv('../data/combined_scores.csv', index=False)