## **PurpleAir Monitors** ##

In [2]:
import plotly.express as px
from plotly.subplots import make_subplots
import plotly.graph_objects as go
from plotly.offline import plot
import pandas as pd

In [3]:
# Load cleaned data
purple_df = pd.read_csv('../data/clean_purpleair.csv')
purple_df.head()

Unnamed: 0,time,location_name,location_id,latitude,longitude,pm2_5_1h_mean,pm2_5_1h_mean_aqi,pm2_5_24h_mean,pm2_5_24h_mean_aqi,temp,rh,elevation,pressure
0,2018-12-27 04:00:00,Grundy Park,21427,37.622585,-122.42097,0.736345,4.0,2.999879,16.0,53.318182,59.818182,86.0,
1,2018-12-27 05:00:00,Grundy Park,21427,37.622585,-122.42097,0.739827,4.0,2.999879,16.0,51.777778,59.955556,86.0,
2,2018-12-27 06:00:00,Grundy Park,21427,37.622585,-122.42097,1.038868,6.0,2.999879,16.0,52.068182,56.681818,86.0,
3,2018-12-27 07:00:00,Grundy Park,21427,37.622585,-122.42097,1.214613,7.0,2.999879,16.0,52.755556,56.933333,86.0,
4,2018-12-27 08:00:00,Grundy Park,21427,37.622585,-122.42097,1.127572,6.0,2.999879,16.0,65.883721,54.372093,86.0,


### **Preprocessing Check**

In [4]:
# Convert longitude and latitude to correct types
purple_df['longitude'] = purple_df['longitude'].apply(lambda x: -abs(x))
purple_df['latitude'] = purple_df['latitude'].apply(lambda x: abs(x))

# Convert time to datetime (if not already done)
purple_df['time'] = pd.to_datetime(purple_df['time'], format='%Y-%m-%d %H:%M:%S')

purple_df.head()

Unnamed: 0,time,location_name,location_id,latitude,longitude,pm2_5_1h_mean,pm2_5_1h_mean_aqi,pm2_5_24h_mean,pm2_5_24h_mean_aqi,temp,rh,elevation,pressure
0,2018-12-27 04:00:00,Grundy Park,21427,37.622585,-122.42097,0.736345,4.0,2.999879,16.0,53.318182,59.818182,86.0,
1,2018-12-27 05:00:00,Grundy Park,21427,37.622585,-122.42097,0.739827,4.0,2.999879,16.0,51.777778,59.955556,86.0,
2,2018-12-27 06:00:00,Grundy Park,21427,37.622585,-122.42097,1.038868,6.0,2.999879,16.0,52.068182,56.681818,86.0,
3,2018-12-27 07:00:00,Grundy Park,21427,37.622585,-122.42097,1.214613,7.0,2.999879,16.0,52.755556,56.933333,86.0,
4,2018-12-27 08:00:00,Grundy Park,21427,37.622585,-122.42097,1.127572,6.0,2.999879,16.0,65.883721,54.372093,86.0,


In [5]:
# Create column for date only
purple_df['date'] = purple_df['time'].dt.date
purple_df['date'] = pd.to_datetime(purple_df['date'], format='%Y-%m-%d')

purple_df['date'].head()

0   2018-12-27
1   2018-12-27
2   2018-12-27
3   2018-12-27
4   2018-12-27
Name: date, dtype: datetime64[ns]

### **Visualizing and Analyzing**

In [6]:
# Only take unique dates and their average pm2.5 concentrations
unique_dates = purple_df.drop_duplicates(subset=['location_name', 'date'], keep='first')

In [7]:
# Find outliers using IQR and separate them from the dataframe
def find_outliers_iqr(df, column):
    q1 = df[column].quantile(0.25)
    q3 = df[column].quantile(0.75)
    iqr = q3 - q1
    lower_bound = q1 - 1.5 * iqr
    upper_bound = q3 + 1.5 * iqr
    outliers = df[(df[column] < lower_bound) | (df[column] > upper_bound)]
    non_outliers = df[(df[column] >= lower_bound) & (df[column] <= upper_bound)]
    return non_outliers, outliers

# Remove outliers and keep them in a separate dataframe
unique_dates, outliers_df = find_outliers_iqr(unique_dates, 'pm2_5_24h_mean')

print("Number of outliers removed:", len(outliers_df))
print("Number of unique dates remaining:", len(unique_dates))

Number of outliers removed: 2104
Number of unique dates remaining: 24034


In [None]:
# Create figure with subplots
fig = make_subplots(rows=1, cols=2, subplot_titles=("Without Outliers", "Outliers"))

# Add histogram for PM2.5 values without outliers
fig.add_trace(
    go.Histogram(
        x=unique_dates['pm2_5_24h_mean'],
        nbinsx=35,
        marker=dict(color='blue'),
        hoverinfo='x+y'
    ),
    row=1, col=1
)

# Add histogram for PM2.5 values outliers
fig.add_trace(
    go.Box(
        x=outliers_df['pm2_5_24h_mean'],
        marker=dict(color='green'),
        hoverinfo='x',
        name=''
    ),
    row=1, col=2
)

# Update layout
fig.update_layout(title_text="Distribution of PM2.5 Values for PurpleAir Monitors",
                  showlegend=False)
fig.update_xaxes(title_text="Daily Average PM2.5 Concentration (µg/m³)", row=1, col=1)
fig.update_yaxes(title_text="Frequency", row=1, col=1)
fig.update_xaxes(title_text="Daily Average PM2.5 Concentration (µg/m³)", row=1, col=2)

# Save the figure offline
plot(fig, filename='../figures/purpleair_pm25_distribution.html', auto_open=True)

'../figures/purpleair_pm25_distribution.html'

In [8]:
# Calculate the rolling 24h mean for PM2.5 for each hour
purple_df['pm2_5_24h_rolling_mean'] = purple_df.groupby('location_name')['pm2_5_1h_mean'].transform(
    lambda x: x.rolling(window=24, min_periods=1).mean()
)

# Impute missing values in the rolling mean
purple_df['pm2_5_24h_rolling_mean'] = purple_df['pm2_5_24h_rolling_mean'].ffill()
purple_df['pm2_5_24h_rolling_mean'] = purple_df['pm2_5_24h_rolling_mean'].bfill()

purple_df.head()

Unnamed: 0,time,location_name,location_id,latitude,longitude,pm2_5_1h_mean,pm2_5_1h_mean_aqi,pm2_5_24h_mean,pm2_5_24h_mean_aqi,temp,rh,elevation,pressure,date,pm2_5_24h_rolling_mean
0,2018-12-27 04:00:00,Grundy Park,21427,37.622585,-122.42097,0.736345,4.0,2.999879,16.0,53.318182,59.818182,86.0,,2018-12-27,0.736345
1,2018-12-27 05:00:00,Grundy Park,21427,37.622585,-122.42097,0.739827,4.0,2.999879,16.0,51.777778,59.955556,86.0,,2018-12-27,0.738086
2,2018-12-27 06:00:00,Grundy Park,21427,37.622585,-122.42097,1.038868,6.0,2.999879,16.0,52.068182,56.681818,86.0,,2018-12-27,0.838347
3,2018-12-27 07:00:00,Grundy Park,21427,37.622585,-122.42097,1.214613,7.0,2.999879,16.0,52.755556,56.933333,86.0,,2018-12-27,0.932413
4,2018-12-27 08:00:00,Grundy Park,21427,37.622585,-122.42097,1.127572,6.0,2.999879,16.0,65.883721,54.372093,86.0,,2018-12-27,0.971445


In [9]:
# AQI calculation function
def calculate_pm2_5_aqi(C_p):
    if pd.isna(C_p):
        return None

    C_p = float(str(C_p)[:str(C_p).find('.')+2]) if '.' in str(C_p) else float(C_p)

    breakpoints = [
        (0.0,   9.0,   0,   50),
        (9.1,   35.4,  51,  100),
        (35.5,  55.4,  101, 150),
        (55.5,  125.4, 151, 200),
        (125.5, 225.4, 201, 300),
        (225.5, 500.4, 301, 500)
    ]

    for BP_Lo, BP_Hi, I_Lo, I_Hi in breakpoints:
        if BP_Lo <= C_p <= BP_Hi:
            I_p = ((I_Hi - I_Lo) / (BP_Hi - BP_Lo)) * (C_p - BP_Lo) + I_Lo
            return round(I_p)

    return None

# Calculate AQI for PM2.5 24hr mean
purple_df['pm2_5_24h_rolling_mean_aqi'] = purple_df['pm2_5_24h_rolling_mean'].apply(calculate_pm2_5_aqi)

In [10]:
# Find outliers from dataframe
no_outliers_df, outliers_df = find_outliers_iqr(purple_df, 'pm2_5_24h_rolling_mean_aqi')

# Create figure with subplots
fig = make_subplots(rows=1, cols=2, subplot_titles=("Without Outliers", "Outliers"))

# Scatterplot of PM2.5 values for PurpleAir monitors without outliers
fig.add_trace(
        go.Scatter(
                x=no_outliers_df['pm2_5_1h_mean'],
                y=no_outliers_df['pm2_5_24h_rolling_mean_aqi'],
                mode='markers',
                marker=dict(color='blue', size=5),
                hovertext=no_outliers_df['location_name'],
                name=''
        ),
        row=1, col=1
)

# Scatterplot of PM2.5 values for PurpleAir monitors with outliers
fig.add_trace(
        go.Scatter(
                x=outliers_df['pm2_5_1h_mean'],
                y=outliers_df['pm2_5_24h_rolling_mean_aqi'],
                mode='markers',
                marker=dict(color='red', size=5),
                hovertext=outliers_df['location_name'],
                name=''
        ),
        row=1, col=2
)

# Update axis titles
fig.update_xaxes(title_text="Hourly Average PM2.5 Concentration (µg/m³)", row=1, col=1)
fig.update_yaxes(title_text="Daily Average PM2.5 AQI", row=1, col=1)
fig.update_xaxes(title_text="Hourly Average PM2.5 Concentration (µg/m³)", row=1, col=2)
fig.update_yaxes(title_text="Daily Average PM2.5 AQI", row=1, col=2)

# Update layout
fig.update_layout(title_text="Scatter Plots of PM2.5 Values for PurpleAir Monitors",
                                  showlegend=False)

# Save the figure offline
plot(fig, filename='../figures/purpleair_pm25_aqi_scatter.html', auto_open=True)

'../figures/purpleair_pm25_aqi_scatter.html'

## **Clarity Monitors**

In [3]:
# Load cleaned data for Clarity monitors
clarity_df = pd.read_csv('../data/clean_clarity.csv')

# Sort by monitor and time
clarity_df = clarity_df.sort_values(['location_name', 'time'])

# Convert longitude and latitude to correct types
clarity_df['longitude'] = clarity_df['longitude'].apply(lambda x: -abs(x))
clarity_df['latitude'] = clarity_df['latitude'].apply(lambda x: abs(x))

# Convert time to datetime (if not already done)
clarity_df['time'] = pd.to_datetime(clarity_df['time'], format='%Y-%m-%d %H:%M:%S')

# Create column for date only
clarity_df['date'] = clarity_df['time'].dt.date
clarity_df['date'] = pd.to_datetime(clarity_df['date'], format='%Y-%m-%d')

clarity_df.head()

Unnamed: 0,time,location_name,location_id,latitude,longitude,pm2_5_1h_mean,pm2_5_1h_mean_aqi,pm2_5_24h_mean,pm2_5_24h_mean_aqi,temp,rh,date
33,2024-10-31 11:00:00,Belle Air School,DRCAC7970,37.62441,-122.40469,4.07,22.0,3.942308,22,20.0,54.91,2024-10-31
34,2024-10-31 12:00:00,Belle Air School,DRCAC7970,37.62441,-122.40469,4.11,23.0,3.942308,22,20.5,51.66,2024-10-31
49,2024-10-31 13:00:00,Belle Air School,DRCAC7970,37.62441,-122.40469,4.38,24.0,3.942308,22,20.47,50.51,2024-10-31
52,2024-10-31 14:00:00,Belle Air School,DRCAC7970,37.62441,-122.40469,4.15,23.0,3.942308,22,19.94,53.11,2024-10-31
63,2024-10-31 15:00:00,Belle Air School,DRCAC7970,37.62441,-122.40469,3.67,20.0,3.942308,22,18.86,56.11,2024-10-31


In [None]:
# Only take unique dates and their average pm2.5 concentrations
unique_dates_clarity = clarity_df.drop_duplicates(subset=['location_name', 'date'], keep='first')

# Find outliers using IQR and separate them from the dataframe
unique_dates_clarity, outliers_df_clarity = find_outliers_iqr(unique_dates_clarity, 'pm2_5_24h_mean')

# Create figure with subplots
fig = make_subplots(rows=1, cols=2, subplot_titles=("Without Outliers", "Outliers"))

# Add histogram for PM2.5 values without outliers
fig.add_trace(
    go.Histogram(
        x=unique_dates_clarity['pm2_5_24h_mean'],
        nbinsx=35,
        marker=dict(color='blue'),
        hoverinfo='x+y'
    ),
    row=1, col=1
)

# Add histogram for PM2.5 values outliers
fig.add_trace(
    go.Box(
        x=outliers_df_clarity['pm2_5_24h_mean'],
        marker=dict(color='green'),
        hoverinfo='x',
        name=''
    ),
    row=1, col=2
)

# Update layout
fig.update_layout(title_text="Distribution of PM2.5 Values for Clarity Monitors",
                  showlegend=False)
fig.update_xaxes(title_text="Daily Average PM2.5 Concentration (µg/m³)", row=1, col=1)
fig.update_yaxes(title_text="Frequency", row=1, col=1)
fig.update_xaxes(title_text="Daily Average PM2.5 Concentration (µg/m³)", row=1, col=2)

# Save the figure offline
plot(fig, filename='../figures/clarity_pm25_distribution.html', auto_open=True)

In [12]:
# Calculate the rolling 24h mean for PM2.5 for each hour
clarity_df['pm2_5_24h_rolling_mean'] = clarity_df.groupby('location_name')['pm2_5_1h_mean'].transform(
    lambda x: x.rolling(window=24, min_periods=1).mean()
)

# Impute missing values in the rolling mean
clarity_df['pm2_5_24h_rolling_mean'] = clarity_df['pm2_5_24h_rolling_mean'].ffill()
clarity_df['pm2_5_24h_rolling_mean'] = clarity_df['pm2_5_24h_rolling_mean'].bfill()

# Calculate AQI for PM2.5 24hr mean
clarity_df['pm2_5_24h_rolling_mean_aqi'] = clarity_df['pm2_5_24h_rolling_mean'].apply(calculate_pm2_5_aqi)

# Find outliers from dataframe
no_outliers_df_clarity, outliers_df_clarity = find_outliers_iqr(clarity_df, 'pm2_5_24h_rolling_mean_aqi')

# Create figure with subplots
fig = make_subplots(rows=1, cols=2, subplot_titles=("Without Outliers", "Outliers"))

# Scatterplot of PM2.5 values for Clarity monitors without outliers
fig.add_trace(
        go.Scatter(
                x=no_outliers_df_clarity['pm2_5_1h_mean'],
                y=no_outliers_df_clarity['pm2_5_24h_rolling_mean_aqi'],
                mode='markers',
                marker=dict(color='blue', size=5),
                hovertext=no_outliers_df_clarity['location_name'],
                name=''
        ),
        row=1, col=1
)

# Scatterplot of PM2.5 values for Clarity monitors with outliers
fig.add_trace(
        go.Scatter(
                x=outliers_df_clarity['pm2_5_1h_mean'],
                y=outliers_df_clarity['pm2_5_24h_rolling_mean_aqi'],
                mode='markers',
                marker=dict(color='red', size=5),
                hovertext=outliers_df_clarity['location_name'],
                name=''
        ),
        row=1, col=2
)

# Update axis titles
fig.update_xaxes(title_text="Hourly Average PM2.5 Concentration (µg/m³)", row=1, col=1)
fig.update_yaxes(title_text="Daily Average PM2.5 AQI", row=1, col=1)
fig.update_xaxes(title_text="Hourly Average PM2.5 Concentration (µg/m³)", row=1, col=2)
fig.update_yaxes(title_text="Daily Average PM2.5 AQI", row=1, col=2)

# Update layout
fig.update_layout(title_text="Scatter Plots of PM2.5 Values for Clarity Monitors",
                                  showlegend=False)

# Save the figure offline
plot(fig, filename='../figures/clarity_pm25_aqi_scatter.html', auto_open=True)

'../figures/clarity_pm25_aqi_scatter.html'

In [13]:
# Keep the clarity_df_grouped definition
clarity_df_grouped = clarity_df.groupby(['date']).agg({'pm2_5_1h_mean': 'mean'}).reset_index()
clarity_df_grouped['month_year_str'] = clarity_df_grouped['date'].dt.strftime('%B')
clarity_df_grouped['day'] = clarity_df_grouped['date'].dt.day

# Create a facet grid with month-year as the facet column
fig = px.line(
    clarity_df_grouped,
    x="day",
    y="pm2_5_1h_mean",
    facet_col="month_year_str",
    facet_col_wrap=6,
    title="PM2.5 Concentrations by Month (Clarity Monitors)",
    labels={"pm2_5_1h_mean": "PM2.5 (1-hour mean)", "day": "Day of Month", "month_year_str": "Month"},
    height=400
)

# Update layout for better visualization
fig.update_layout(
    margin=dict(t=50, l=50, r=50, b=50),
    title_x=0.5
)

# Show the plot
fig.show()


## **Self-Prediction**

### **Experiment: Clarity Monitors**

In [90]:
# Filter the data
filtered_df = clarity_df[['location_id', 'location_name', 'date', 'time', 'pm2_5_24h_mean']]
filtered_df.head()

Unnamed: 0,location_id,location_name,date,time,pm2_5_24h_mean
33,DRCAC7970,Belle Air School,2024-10-31,2024-10-31 11:00:00,3.942308
34,DRCAC7970,Belle Air School,2024-10-31,2024-10-31 12:00:00,3.942308
49,DRCAC7970,Belle Air School,2024-10-31,2024-10-31 13:00:00,3.942308
52,DRCAC7970,Belle Air School,2024-10-31,2024-10-31 14:00:00,3.942308
63,DRCAC7970,Belle Air School,2024-10-31,2024-10-31 15:00:00,3.942308


In [91]:
# Round the pm2_5_24h_mean values to 2 decimal places
filtered_df['pm2_5_24h_mean'] = filtered_df['pm2_5_24h_mean'].round(2)
filtered_df.head()



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



Unnamed: 0,location_id,location_name,date,time,pm2_5_24h_mean
33,DRCAC7970,Belle Air School,2024-10-31,2024-10-31 11:00:00,3.94
34,DRCAC7970,Belle Air School,2024-10-31,2024-10-31 12:00:00,3.94
49,DRCAC7970,Belle Air School,2024-10-31,2024-10-31 13:00:00,3.94
52,DRCAC7970,Belle Air School,2024-10-31,2024-10-31 14:00:00,3.94
63,DRCAC7970,Belle Air School,2024-10-31,2024-10-31 15:00:00,3.94


In [92]:
# Remove outliers from data (either removing, or just imputing)


In [93]:
# Get the unique values by date
unique_dates_filtered = filtered_df.drop_duplicates(subset=['location_name', 'date'], keep='first')
unique_dates_filtered.head()

Unnamed: 0,location_id,location_name,date,time,pm2_5_24h_mean
33,DRCAC7970,Belle Air School,2024-10-31,2024-10-31 11:00:00,3.94
150,DRCAC7970,Belle Air School,2024-11-01,2024-11-01 00:00:00,4.89
361,DRCAC7970,Belle Air School,2024-11-02,2024-11-02 00:00:00,4.02
574,DRCAC7970,Belle Air School,2024-11-03,2024-11-03 00:00:00,4.34
803,DRCAC7970,Belle Air School,2024-11-04,2024-11-04 00:00:00,5.06


In [94]:
unique_dates_filtered.shape

(2069, 5)

In [95]:
# Distribution of PM2.5 values for PurpleAir monitors
px.histogram(unique_dates_filtered[unique_dates_filtered['pm2_5_24h_mean'] <= 200], 
             x='pm2_5_24h_mean', title='Distribution of PM2.5 24hr Mean Values', nbins=50).show()

In [96]:
import numpy as np

# Distribution of the pm2_5_24h_mean values
# Apply log transformation to the pm2_5_24h_mean values
unique_dates_filtered['pm2_5_24h_mean_log'] = unique_dates_filtered['pm2_5_24h_mean'].apply(lambda x: np.log(x + 1))
px.histogram(unique_dates_filtered, x='pm2_5_24h_mean_log', title='Distribution of Log PM2.5 24hr Mean Values', nbins=25).show()



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [130]:
# Create bins for the log pm2_5_24h_mean values and get edges of the bins
bins = pd.qcut(unique_dates_filtered['pm2_5_24h_mean_log'], q=5)

# Get the edges of the bins
edges = bins.cat.categories
edges = [(edges[i].left, edges[i].right) for i in range(len(edges))]
edges = [(edges[i][0].round(2), edges[i][1].round(2)) for i in range(len(edges))]
edges = [(np.exp(edges[i][0]) - 1, np.exp(edges[i][1]) - 1) for i in range(len(edges))]

edges

[(1.691234472349262, 3.854955811237434),
 (3.854955811237434, 4.754602676005731),
 (4.754602676005731, 6.170676488346613),
 (6.170676488346613, 9.59095145243378),
 (9.59095145243378, 34.163197145106615)]

In [None]:
# Create final edges for the bins
final_edges = [(0, 3.50), (3.51, 5.00), (5.01, 6.50), (6.51, 9.50), (9.51, 35.00)]

# Create bins for the log pm2_5_24h_mean values and get edges of the bins from final_edges above
log_bins = np.log1p([x[0] for x in final_edges] + [final_edges[-1][1]])
log_edges = [(log_bins[i], log_bins[i+1]) for i in range(len(log_bins)-1)]
log_edges = [(log_edges[i][0].round(2), log_edges[i][1].round(2)) for i in range(len(log_edges))]

# Create a new column for the bins
unique_dates_filtered['pm2_5_24h_mean_log_bins'] = pd.cut(unique_dates_filtered['pm2_5_24h_mean_log'], 
                                                      bins=[x[0] for x in log_edges] + [log_edges[-1][1]], labels=[f"{i}" for i in range(len(final_edges))], 
                                                      include_lowest=True)

unique_dates_filtered.head()



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



Unnamed: 0,location_id,location_name,date,time,pm2_5_24h_mean,pm2_5_24h_mean_log,pm2_5_24h_mean_log_bins
33,DRCAC7970,Belle Air School,2024-10-31,2024-10-31 11:00:00,3.94,1.597365,1
150,DRCAC7970,Belle Air School,2024-11-01,2024-11-01 00:00:00,4.89,1.773256,1
361,DRCAC7970,Belle Air School,2024-11-02,2024-11-02 00:00:00,4.02,1.61343,1
574,DRCAC7970,Belle Air School,2024-11-03,2024-11-03 00:00:00,4.34,1.675226,1
803,DRCAC7970,Belle Air School,2024-11-04,2024-11-04 00:00:00,5.06,1.80171,2


In [132]:
# Get distribution of the bins using proportions
bins_distribution = unique_dates_filtered['pm2_5_24h_mean_log_bins'].value_counts(normalize=True).sort_index()
bins_distribution = bins_distribution.reset_index()
bins_distribution.columns = ['pm2_5_24h_mean_log_bins', 'proportion']

# Create bar plot for the bins distribution
fig = px.bar(bins_distribution, x='pm2_5_24h_mean_log_bins', y='proportion', 
             title='Distribution of PM2.5 24hr Mean Values by Bins', 
             labels={'pm2_5_24h_mean_log_bins': 'PM2.5 24hr Log Mean Bins', 'proportion': 'Proportion'},
             color='proportion', color_continuous_scale=px.colors.sequential.Plasma)

fig.show()

In [133]:
# Create a new dataframe with rolling window of 7 days and the next day's bin as label
window_size = 7

# Group by location_id to ensure the rolling window is applied per location
data = []
for location_id, group in unique_dates_filtered.groupby('location_id'):
    group = group.sort_values('date').reset_index(drop=True)
    
    for i in range(len(group) - window_size):
        # Extract the 7-day window of pm2_5_24h_mean
        features = group.loc[i:i+window_size-1, 'pm2_5_24h_mean_log'].values.tolist()
        # Extract the next day's bin as the label
        label = group.loc[i+window_size, 'pm2_5_24h_mean_log_bins']
        # Include location_id and location_name
        location_id_value = group.loc[i, 'location_id']
        location_name_value = group.loc[i, 'location_name']
        data.append([location_id_value, location_name_value] + features + [label])

# Create the final dataframe
columns = ['location_id', 'location_name'] + [f'feature_day_{i+1}' for i in range(window_size)] + ['label']
rolling_window_df = pd.DataFrame(data, columns=columns)

rolling_window_df.head()

Unnamed: 0,location_id,location_name,feature_day_1,feature_day_2,feature_day_3,feature_day_4,feature_day_5,feature_day_6,feature_day_7,label
0,DCVIM2201,Brentwood Park,1.669592,1.583094,1.609438,1.884035,1.934416,1.735189,1.642873,1
1,DCVIM2201,Brentwood Park,1.583094,1.609438,1.884035,1.934416,1.735189,1.642873,1.736951,0
2,DCVIM2201,Brentwood Park,1.609438,1.884035,1.934416,1.735189,1.642873,1.736951,1.363537,0
3,DCVIM2201,Brentwood Park,1.884035,1.934416,1.735189,1.642873,1.736951,1.363537,1.050822,0
4,DCVIM2201,Brentwood Park,1.934416,1.735189,1.642873,1.736951,1.363537,1.050822,1.040277,0


In [None]:
# One hot encode by location_id
rolling_window_df = pd.get_dummies(rolling_window_df, columns=['location_id'], prefix='location_id')

# Convert boolean columns to integers
bool_cols = rolling_window_df.select_dtypes(include='bool').columns
rolling_window_df[bool_cols] = rolling_window_df[bool_cols].astype(int)

# Move label column to the end
label_col = rolling_window_df.pop('label')
rolling_window_df['label'] = label_col
rolling_window_df.head()

Unnamed: 0,location_name,feature_day_1,feature_day_2,feature_day_3,feature_day_4,feature_day_5,feature_day_6,feature_day_7,location_id_DCVIM2201,location_id_DETMG3939,...,location_id_DJGNN5114,location_id_DJTYV8538,location_id_DMEYT2138,location_id_DNSEJ7404,location_id_DRCAC7970,location_id_DRYLF3821,location_id_DTMSK2119,location_id_DUBTA4581,location_id_DVRGV9737,label
0,Brentwood Park,1.669592,1.583094,1.609438,1.884035,1.934416,1.735189,1.642873,1,0,...,0,0,0,0,0,0,0,0,0,1
1,Brentwood Park,1.583094,1.609438,1.884035,1.934416,1.735189,1.642873,1.736951,1,0,...,0,0,0,0,0,0,0,0,0,0
2,Brentwood Park,1.609438,1.884035,1.934416,1.735189,1.642873,1.736951,1.363537,1,0,...,0,0,0,0,0,0,0,0,0,0
3,Brentwood Park,1.884035,1.934416,1.735189,1.642873,1.736951,1.363537,1.050822,1,0,...,0,0,0,0,0,0,0,0,0,0
4,Brentwood Park,1.934416,1.735189,1.642873,1.736951,1.363537,1.050822,1.040277,1,0,...,0,0,0,0,0,0,0,0,0,0


In [15]:
# Tranform all feature_day columns with a log transformation
# rolling_window_df.iloc[:, 2:-1] = rolling_window_df.iloc[:, 2:-1].apply(lambda x: np.log(x + 1))
# rolling_window_df.head()

In [135]:
rolling_window_df.shape

(1971, 23)

In [136]:
# Build LSTM model and dataset

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

class TimeSeriesDataset(Dataset):
    def __init__(self, X, y):
        self.X = torch.tensor(X, dtype=torch.float32).unsqueeze(-1)  # Add a channel dimension
        self.y = torch.tensor(y, dtype=torch.float32)

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]

class AirForecastClassifier(nn.Module):
    def __init__(self, input_dim=1, hidden_dim=64, num_layers=2, num_classes=5):
        super().__init__()
        self.lstm = nn.LSTM(input_dim, hidden_dim, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_dim, num_classes)

    def forward(self, x):
        lstm_out, _ = self.lstm(x)  # lstm_out: (batch, seq_len, hidden_dim)
        final_hidden = lstm_out[:, -1, :]  # take last timestep
        logits = self.fc(final_hidden)
        return logits

In [137]:
def train_model(model, dataloader, epochs=10, lr=1e-3):
    """
    AirForecast LSTM Classifier model training function.
    Args:
        model (nn.Module): The LSTM model to train.
        dataloader (DataLoader): DataLoader for the training data.
        epochs (int): Number of training epochs.
        lr (float): Learning rate for the optimizer.
    Returns:
        model (nn.Module): The trained LSTM model.
    """
    # Use available device
    device = torch.device("mps" if torch.backends.mps.is_available()
                          else "cuda" if torch.cuda.is_available()
                          else "cpu")
    model.to(device)

    # Loss and optimizer
    criterion = torch.nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)

    # Training loop
    for epoch in range(epochs):
        model.train()
        epoch_loss = 0
        correct = 0
        total = 0

        for X_batch, y_batch in dataloader:
            X_batch, y_batch = X_batch.to(device), y_batch.to(device)

            # Forward pass
            logits = model(X_batch)               # (batch_size, num_classes)
            loss = criterion(logits, y_batch)     # y_batch: (batch_size,)

            # Backward and optimize
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            # Metrics
            epoch_loss += loss.item()
            preds = logits.argmax(dim=1)
            correct += (preds == y_batch).sum().item()
            total += y_batch.size(0)

        avg_loss = epoch_loss / len(dataloader)
        accuracy = correct / total
        print(f"Epoch {epoch+1}/{epochs}, Loss: {avg_loss:.4f}, Accuracy: {accuracy:.2%}")

    return model

def evaluate(model, dataloader):
    """
    AirForecast LSTM model evaluation function.
    Args:
        model (nn.Module): The LSTM model to evaluate.
        dataloader (DataLoader): DataLoader for the validation data.
    Returns:
        preds (np.ndarray): Predicted class labels.
        truths (np.ndarray): True labels.
    """
    model.eval()
    preds, truths = [], []

    with torch.no_grad():
        for X_batch, y_batch in dataloader:
            logits = model(X_batch)
            batch_preds = logits.argmax(dim=1)

            preds.append(batch_preds.cpu().numpy())
            truths.append(y_batch.cpu().numpy())

    preds = np.concatenate(preds)
    truths = np.concatenate(truths)
    accuracy = np.mean(preds == truths)

    print(f"Validation Accuracy: {accuracy:.2%}")
    return preds, truths

In [138]:
# Ensure all monitors are included in the validation set using location_index_map
val_indices = []

for location, group in rolling_window_df.groupby('location_name'):
    split_index = int(len(group) * 0.8)
    if split_index < len(group):  # Ensure split_index is within bounds
        val_indices.extend(group.index[split_index:])

val_indices = np.array(val_indices)
train_indices = np.setdiff1d(rolling_window_df.index, val_indices)

train_df = rolling_window_df.loc[train_indices]
val_df = rolling_window_df.loc[val_indices]

# Create TimeSeriesDataset instances for training and validation sets
train_dataset = TimeSeriesDataset(train_df.iloc[:, 1:-1].values, train_df['label'].astype(int).values)
val_dataset = TimeSeriesDataset(val_df.iloc[:, 1:-1].values, val_df['label'].astype(int).values)

# Create DataLoader for training and validation sets
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=64)

# Train the model
model = AirForecastClassifier(input_dim=1, num_classes=5)
trained_model = train_model(model, train_loader, epochs=20)

Epoch 1/20, Loss: 1.5952, Accuracy: 25.86%
Epoch 2/20, Loss: 1.5917, Accuracy: 25.86%
Epoch 3/20, Loss: 1.5916, Accuracy: 25.86%
Epoch 4/20, Loss: 1.5880, Accuracy: 25.86%
Epoch 5/20, Loss: 1.5632, Accuracy: 28.02%
Epoch 6/20, Loss: 1.4995, Accuracy: 35.01%
Epoch 7/20, Loss: 1.4707, Accuracy: 34.69%
Epoch 8/20, Loss: 1.4502, Accuracy: 37.17%
Epoch 9/20, Loss: 1.4311, Accuracy: 37.48%
Epoch 10/20, Loss: 1.4147, Accuracy: 39.58%
Epoch 11/20, Loss: 1.4256, Accuracy: 39.96%
Epoch 12/20, Loss: 1.4304, Accuracy: 37.55%
Epoch 13/20, Loss: 1.3813, Accuracy: 41.61%
Epoch 14/20, Loss: 1.3862, Accuracy: 41.17%
Epoch 15/20, Loss: 1.4048, Accuracy: 40.41%
Epoch 16/20, Loss: 1.3631, Accuracy: 41.93%
Epoch 17/20, Loss: 1.3476, Accuracy: 42.57%
Epoch 18/20, Loss: 1.3518, Accuracy: 43.20%
Epoch 19/20, Loss: 1.3739, Accuracy: 40.85%
Epoch 20/20, Loss: 1.3406, Accuracy: 43.46%


In [139]:
# Evaluate the model by monitor location
# Move the trained model back to CPU
trained_model = trained_model.to('cpu')

# Evaluate the model by monitor location
predictions, truths = evaluate(trained_model, val_loader)

Validation Accuracy: 39.80%


In [140]:
# Find the accuracy by monitor location
location_accuracy = {}
for location_name in rolling_window_df['location_name'].unique():
    location_indices = rolling_window_df[rolling_window_df['location_name'] == location_name].index
    location_preds = predictions[np.isin(val_indices, location_indices)]
    location_truths = truths[np.isin(val_indices, location_indices)]
    accuracy = np.mean(location_preds == location_truths)
    location_accuracy[location_name] = accuracy
    print(f"Location {location_name}: Accuracy = {accuracy:.2%}")


Location Brentwood Park: Accuracy = 48.15%
Location Nora Alvarado Home: Accuracy = 34.48%
Location Evelin Pacheco Home: Accuracy = 48.28%
Location Cypress and Pine Playlot: Accuracy = 0.00%
Location Rise South City Office: Accuracy = 13.33%
Location San Bruno School District Office: Accuracy = 41.38%
Location Parkside Middle: Accuracy = 55.17%
Location Buri Buri Park: Accuracy = 48.15%
Location Clay Ave Park: Accuracy = 44.44%
Location Belle Air School: Accuracy = 44.83%
Location Marita Santos Home: Accuracy = 44.83%
Location Rollingwood Elementary: Accuracy = 44.83%
Location Portola Elementary: Accuracy = 44.83%
Location Gardiner Park: Accuracy = 44.44%


In [141]:
from sklearn.model_selection import KFold

# Create k-fold cross-validation sets
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Split the rolling_window_df into k folds
folds = []
for train_index, val_index in kf.split(rolling_window_df):
    train_set = rolling_window_df.iloc[train_index].reset_index(drop=True)
    val_set = rolling_window_df.iloc[val_index].reset_index(drop=True)
    
    # Ensure each training and validation set includes all monitors
    if (set(train_set['location_name'].unique()) == set(rolling_window_df['location_name'].unique()) and
        set(val_set['location_name'].unique()) == set(rolling_window_df['location_name'].unique())):
        folds.append((train_set, val_set))

len(folds)

5

In [142]:
# Create RandomForestClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Create a list to store fold accuracy for each monitor
monitor_fold_accuracies = []

# Train and evaluate the Decision Tree Classifier for each fold
for i, (train_set, val_set) in enumerate(folds):
    # Extract features and labels
    X_train = train_set.iloc[:, 2:-1].values
    y_train = train_set['label'].astype(int).values
    X_val = val_set.iloc[:, 2:-1].values
    y_val = val_set['label'].astype(int).values

    # Create and train the Decision Tree Classifier
    clf = RandomForestClassifier(random_state=42, criterion='entropy')
    clf.fit(X_train, y_train)

    # Make predictions
    y_pred = clf.predict(X_val)

    # Calculate accuracy for each monitor
    for location_name in val_set['location_name'].unique():
        location_indices = val_set[val_set['location_name'] == location_name].index
        location_y_val = y_val[np.isin(val_set.index, location_indices)]
        location_y_pred = y_pred[np.isin(val_set.index, location_indices)]
        location_accuracy = accuracy_score(location_y_val, location_y_pred)

        # Store the results
        monitor_fold_accuracies.append({
            'fold': i + 1,
            'location_name': location_name,
            'accuracy': location_accuracy
        })

# Convert the results to a DataFrame
monitor_fold_accuracies_df = pd.DataFrame(monitor_fold_accuracies)

# Display the DataFrame
monitor_fold_accuracies_df.head()


Unnamed: 0,fold,location_name,accuracy
0,1,Brentwood Park,0.785714
1,1,Nora Alvarado Home,0.724138
2,1,Evelin Pacheco Home,0.6875
3,1,Cypress and Pine Playlot,0.740741
4,1,Rise South City Office,0.615385


In [144]:
# Find average accuracy by fold by monitor location for random forest classifier
average_accuracy_by_fold = monitor_fold_accuracies_df.groupby(['location_name']).agg({'accuracy': 'mean'}).reset_index()
average_accuracy_by_fold = average_accuracy_by_fold.sort_values(by='accuracy', ascending=False)

average_accuracy_by_fold

Unnamed: 0,location_name,accuracy
1,Brentwood Park,0.809487
8,Nora Alvarado Home,0.77496
0,Belle Air School,0.75181
2,Buri Buri Park,0.749156
5,Evelin Pacheco Home,0.730682
6,Gardiner Park,0.716764
3,Clay Ave Park,0.694575
12,Rollingwood Elementary,0.687176
7,Marita Santos Home,0.67381
10,Portola Elementary,0.664217


In [145]:
average_accuracy_by_fold.shape

(14, 2)

### **Experiment: Purple Air Monitors**

In [146]:
purple_df = pd.read_csv('../data/purpleair_2024-03-01.csv')
purple_df.head()

Unnamed: 0,time,location_name,location_id,latitude,longitude,pm2_5_24h_mean,pm2_5_24h_mean_aqi,temp,rh,pressure
0,2024-02-29 08:00:00,Grundy Park,21427,37.622585,-122.42097,0.21,1.0,60.976,61.451,1012.084
1,2024-02-29 08:00:00,Shelter Crik,113144,37.62002,-122.42762,0.0,0.0,58.988,60.765,1008.262
2,2024-02-29 08:00:00,terra,109718,37.669968,-122.42153,0.89,4.0,61.047,59.685,1005.107
3,2024-02-29 08:00:00,Belle Air,111235,37.631878,-122.409966,1.16,6.0,,,
4,2024-02-29 08:00:00,Crestmoor III,111498,37.616806,-122.431,1.13,6.0,58.963,64.513,1006.47


In [114]:
# Visualize the locations of the monitors on map with plotly
# Have their location names as hover text
fig = px.scatter_mapbox(purple_df, lat='latitude', lon='longitude', hover_name='location_name',
                        mapbox_style="carto-positron", zoom=10, height=600)

fig.show()

In [147]:
purple_filt_df = purple_df[['location_id', 'location_name', 'time', 'pm2_5_24h_mean']]
purple_filt_df.head()

Unnamed: 0,location_id,location_name,time,pm2_5_24h_mean
0,21427,Grundy Park,2024-02-29 08:00:00,0.21
1,113144,Shelter Crik,2024-02-29 08:00:00,0.0
2,109718,terra,2024-02-29 08:00:00,0.89
3,111235,Belle Air,2024-02-29 08:00:00,1.16
4,111498,Crestmoor III,2024-02-29 08:00:00,1.13


In [148]:
# Identify outliers using IQR method
purple_filt_df["is_outlier"] = False
q1 = purple_filt_df['pm2_5_24h_mean'].quantile(0.25)
q3 = purple_filt_df['pm2_5_24h_mean'].quantile(0.75)
iqr = q3 - q1
lower_bound = q1 - 1.5 * iqr
upper_bound = q3 + 1.5 * iqr
purple_filt_df.loc[(purple_filt_df['pm2_5_24h_mean'] < lower_bound) | (purple_filt_df['pm2_5_24h_mean'] > upper_bound), 'is_outlier'] = True
purple_filt_df['is_outlier'].value_counts()



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



is_outlier
False    9259
True      715
Name: count, dtype: int64

In [149]:
# View upper_bound and lower_bound
lower_bound, upper_bound

(-5.740000000000001, 14.82)

In [150]:
# Visualize all PM2.5 values less than 300
px.histogram(purple_filt_df, 
             x='pm2_5_24h_mean', title='Distribution of PM2.5 24hr Mean Values', nbins=200).show()

In [151]:
# Find average PM2.5 values by location
purple_filt_df_grouped = purple_filt_df.groupby(['location_name']).agg({'pm2_5_24h_mean': 'mean'}).reset_index()
purple_filt_df_grouped = purple_filt_df_grouped.sort_values(by='pm2_5_24h_mean', ascending=False)
purple_filt_df_grouped.head(10)

Unnamed: 0,location_name,pm2_5_24h_mean
26,terra,474.043948
15,SSF Cypress,375.315701
24,emalita,14.084762
11,Rise-10,10.222347
12,Rise-8,7.720414
5,Danger Stairs,7.083248
20,South City Lights,6.712126
23,Valleyview &amp; Appian Way,6.554487
22,"Sunshine Gardens, SSF",6.283392
16,San Bruno,6.100157


In [152]:
# Cap the PM2.5 values at 500
purple_filt_df.loc[purple_filt_df['pm2_5_24h_mean'] > 500, 'pm2_5_24h_mean'] = 500

# Visualize the new distribtion of PM2.5 values
px.histogram(purple_filt_df,
             x='pm2_5_24h_mean', title='Distribution of PM2.5 24hr Mean Values', nbins=200).show()

In [153]:
# Apply log transformation to the pm2_5_24h_mean values
purple_filt_df['pm2_5_24h_mean_log'] = np.log1p(purple_filt_df['pm2_5_24h_mean'])

# Visualize the distribution of log PM2.5 values
px.histogram(purple_filt_df, 
             x='pm2_5_24h_mean_log', title='Distribution of Log PM2.5 24hr Mean Values', nbins=35).show()



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [154]:
# Make bins for the log PM2.5 values using quantiles
bins = pd.qcut(purple_filt_df['pm2_5_24h_mean_log'], q=5)

# Get the edges of the bins and transform them back to PM2.5 values
edges = bins.cat.categories
edges = [(edges[i].left, edges[i].right) for i in range(len(edges))]
edges = [(edges[i][0].round(2), edges[i][1].round(2)) for i in range(len(edges))]
edges = [(np.expm1(edges[i][0]), np.expm1(edges[i][1])) for i in range(len(edges))]

edges

[(-0.0, 1.6116964734231176),
 (1.6116964734231176, 3.2206958169965527),
 (3.2206958169965527, 5.2338866585247175),
 (5.2338866585247175, 8.115716393040305),
 (8.115716393040305, 501.7032320202389)]

In [157]:
# Create final edges for the bins
final_edges = [(0, 1.50), (1.51, 3.50), (3.51, 5.50), (5.51, 8.50), (8.51, 502.00)]

# Create bins for the log pm2_5_24h_mean values and get edges of the bins from final_edges above
log_bins = np.log1p([x[0] for x in final_edges] + [final_edges[-1][1]])
log_edges = [(log_bins[i], log_bins[i+1]) for i in range(len(log_bins)-1)]
log_edges = [(log_edges[i][0].round(2), log_edges[i][1].round(2)) for i in range(len(log_edges))]

# Create a new column for the bins
purple_filt_df['pm2_5_24h_mean_log_bins'] = pd.cut(purple_filt_df['pm2_5_24h_mean_log'], 
                                                      bins=[x[0] for x in log_edges] + [log_edges[-1][1]], labels=[f"{i}" for i in range(len(final_edges))], 
                                                      include_lowest=True)
purple_filt_df.head()



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



Unnamed: 0,location_id,location_name,time,pm2_5_24h_mean,is_outlier,pm2_5_24h_mean_log,pm2_5_24h_mean_log_bins
0,21427,Grundy Park,2024-02-29 08:00:00,0.21,False,0.19062,0
1,113144,Shelter Crik,2024-02-29 08:00:00,0.0,False,0.0,0
2,109718,terra,2024-02-29 08:00:00,0.89,False,0.636577,0
3,111235,Belle Air,2024-02-29 08:00:00,1.16,False,0.770108,0
4,111498,Crestmoor III,2024-02-29 08:00:00,1.13,False,0.756122,0


In [158]:
# Distribution of the bins using proportions
bins_distribution = purple_filt_df['pm2_5_24h_mean_log_bins'].value_counts(normalize=True).sort_index()
bins_distribution = bins_distribution.reset_index()
bins_distribution.columns = ['pm2_5_24h_mean_log_bins', 'proportion']

# Create bar plot for the bins distribution
fig = px.bar(bins_distribution, x='pm2_5_24h_mean_log_bins', y='proportion', 
             title='Distribution of PM2.5 24hr Mean Values by Bins', 
             labels={'pm2_5_24h_mean_log_bins': 'PM2.5 24hr Log Mean Bins', 'proportion': 'Proportion'},
             color='proportion', color_continuous_scale=px.colors.sequential.Plasma)
fig.show()

In [164]:
# Create a new dataframe with rolling window of 7 days and the next day's bin as label
window_size = 7

# Group by location_id to ensure the rolling window is applied per location
data = []
for location_id, group in purple_filt_df.groupby('location_id'):
    group = group.sort_values('time').reset_index(drop=True)
    
    for i in range(len(group) - window_size):
        # Extract the 7-day window of pm2_5_24h_mean
        features = group.loc[i:i+window_size-1, 'pm2_5_24h_mean_log'].values.tolist()
        # Extract the next day's bin as the label
        label = group.loc[i+window_size, 'pm2_5_24h_mean_log_bins']
        # Include location_id and location_name
        location_id_value = group.loc[i, 'location_id']
        location_name_value = group.loc[i, 'location_name']
        data.append([location_id_value, location_name_value] + features + [label])

# Create the final dataframe
columns = ['location_id', 'location_name'] + [f'feature_day_{i+1}' for i in range(window_size)] + ['label']
rolling_window_df = pd.DataFrame(data, columns=columns)

rolling_window_df.head()

Unnamed: 0,location_id,location_name,feature_day_1,feature_day_2,feature_day_3,feature_day_4,feature_day_5,feature_day_6,feature_day_7,label
0,21427,Grundy Park,0.19062,0.300105,0.239017,0.350657,0.350657,0.631272,0.548121,1
1,21427,Grundy Park,0.300105,0.239017,0.350657,0.350657,0.631272,0.548121,1.088562,0
2,21427,Grundy Park,0.239017,0.350657,0.350657,0.631272,0.548121,1.088562,0.392042,0
3,21427,Grundy Park,0.350657,0.350657,0.631272,0.548121,1.088562,0.392042,0.405465,0
4,21427,Grundy Park,0.350657,0.631272,0.548121,1.088562,0.392042,0.405465,0.688135,0


In [165]:
# One hot encode by location_id
rolling_window_df = pd.get_dummies(rolling_window_df, columns=['location_id'], prefix='location_id')

# Convert boolean columns to integers
bool_cols = rolling_window_df.select_dtypes(include='bool').columns
rolling_window_df[bool_cols] = rolling_window_df[bool_cols].astype(int)

# Move label column to the end
label_col = rolling_window_df.pop('label')
rolling_window_df['label'] = label_col
rolling_window_df.head()

Unnamed: 0,location_name,feature_day_1,feature_day_2,feature_day_3,feature_day_4,feature_day_5,feature_day_6,feature_day_7,location_id_21427,location_id_38589,...,location_id_113144,location_id_119179,location_id_120937,location_id_144654,location_id_158239,location_id_158259,location_id_160983,location_id_169967,location_id_177521,label
0,Grundy Park,0.19062,0.300105,0.239017,0.350657,0.350657,0.631272,0.548121,1,0,...,0,0,0,0,0,0,0,0,0,1
1,Grundy Park,0.300105,0.239017,0.350657,0.350657,0.631272,0.548121,1.088562,1,0,...,0,0,0,0,0,0,0,0,0,0
2,Grundy Park,0.239017,0.350657,0.350657,0.631272,0.548121,1.088562,0.392042,1,0,...,0,0,0,0,0,0,0,0,0,0
3,Grundy Park,0.350657,0.350657,0.631272,0.548121,1.088562,0.392042,0.405465,1,0,...,0,0,0,0,0,0,0,0,0,0
4,Grundy Park,0.350657,0.631272,0.548121,1.088562,0.392042,0.405465,0.688135,1,0,...,0,0,0,0,0,0,0,0,0,0


In [166]:
# Ensure all monitors are included in the validation set using location_index_map
val_indices = []

for location, group in rolling_window_df.groupby('location_name'):
    split_index = int(len(group) * 0.8)
    if split_index < len(group):  # Ensure split_index is within bounds
        val_indices.extend(group.index[split_index:])

val_indices = np.array(val_indices)
train_indices = np.setdiff1d(rolling_window_df.index, val_indices)

train_df = rolling_window_df.loc[train_indices]
val_df = rolling_window_df.loc[val_indices]

# Create TimeSeriesDataset instances for training and validation sets
train_dataset = TimeSeriesDataset(train_df.iloc[:, 1:-1].values, train_df['label'].astype(int).values)
val_dataset = TimeSeriesDataset(val_df.iloc[:, 1:-1].values, val_df['label'].astype(int).values)

# Create DataLoader for training and validation sets
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=64)

# Train the model
model = AirForecastClassifier(input_dim=1, num_classes=5)
trained_model = train_model(model, train_loader, epochs=20)

Epoch 1/20, Loss: 1.6072, Accuracy: 22.37%
Epoch 2/20, Loss: 1.5443, Accuracy: 28.38%
Epoch 3/20, Loss: 1.4049, Accuracy: 36.54%
Epoch 4/20, Loss: 1.3804, Accuracy: 38.33%
Epoch 5/20, Loss: 1.3693, Accuracy: 38.79%
Epoch 6/20, Loss: 1.3539, Accuracy: 39.67%
Epoch 7/20, Loss: 1.3425, Accuracy: 40.12%
Epoch 8/20, Loss: 1.3295, Accuracy: 40.98%
Epoch 9/20, Loss: 1.3294, Accuracy: 40.41%
Epoch 10/20, Loss: 1.3177, Accuracy: 41.26%
Epoch 11/20, Loss: 1.2986, Accuracy: 42.73%
Epoch 12/20, Loss: 1.3007, Accuracy: 43.19%
Epoch 13/20, Loss: 1.2819, Accuracy: 43.77%
Epoch 14/20, Loss: 1.2825, Accuracy: 44.10%
Epoch 15/20, Loss: 1.2765, Accuracy: 44.66%
Epoch 16/20, Loss: 1.2756, Accuracy: 44.56%
Epoch 17/20, Loss: 1.2660, Accuracy: 45.53%
Epoch 18/20, Loss: 1.2542, Accuracy: 46.08%
Epoch 19/20, Loss: 1.2575, Accuracy: 45.35%
Epoch 20/20, Loss: 1.2587, Accuracy: 46.17%


In [167]:
# Evaluate the model by monitor location
# Move the trained model back to CPU
trained_model = trained_model.to('cpu')

# Evaluate the model by monitor location
predictions, truths = evaluate(trained_model, val_loader)

Validation Accuracy: 41.91%


In [168]:
# Find the accuracy by monitor location
location_accuracy = {}
for location_name in rolling_window_df['location_name'].unique():
    location_indices = rolling_window_df[rolling_window_df['location_name'] == location_name].index
    location_preds = predictions[np.isin(val_indices, location_indices)]
    location_truths = truths[np.isin(val_indices, location_indices)]
    accuracy = np.mean(location_preds == location_truths)
    location_accuracy[location_name] = accuracy
    print(f"Location {location_name}: Accuracy = {accuracy:.2%}")


Location Grundy Park: Accuracy = 24.71%
Location Sign Hill: Accuracy = 36.47%
Location South San Francisco Westborough Park: Accuracy = 29.41%
Location Commodore Dr: Accuracy = 40.00%
Location Danger Stairs: Accuracy = 40.00%
Location Acacia And Crystal Springs: Accuracy = 49.40%
Location Sunshine Gardens, SSF: Accuracy = 33.33%
Location Rollingwood / Palmhaus: Accuracy = 43.53%
Location Home: Accuracy = 36.25%
Location South City Lights: Accuracy = 35.29%
Location B9 North AHU-5012: Accuracy = 47.27%
Location Rollingwood Elementary: Accuracy = 38.82%
Location Parkside Intermediate School: Accuracy = 35.29%
Location San Bruno: Accuracy = 29.17%
Location Sign Hill, Stonegate: Accuracy = 39.47%
Location terra: Accuracy = 47.17%
Location Belle Air: Accuracy = 42.86%
Location Crestmoor III: Accuracy = 41.67%
Location Shelter Crik: Accuracy = 70.89%
Location r": Accuracy = 69.41%
Location Elm Court: Accuracy = 67.80%
Location Valleyview &amp; Appian Way: Accuracy = 55.22%
Location Rise-8: A

In [169]:
# Create k-fold cross-validation sets
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Split the rolling_window_df into k folds
folds = []
for train_index, val_index in kf.split(rolling_window_df):
    train_set = rolling_window_df.iloc[train_index].reset_index(drop=True)
    val_set = rolling_window_df.iloc[val_index].reset_index(drop=True)
    
    # Ensure each training and validation set includes all monitors
    if (set(train_set['location_name'].unique()) == set(rolling_window_df['location_name'].unique()) and
        set(val_set['location_name'].unique()) == set(rolling_window_df['location_name'].unique())):
        folds.append((train_set, val_set))

len(folds)

5

In [170]:
# Create a list to store fold accuracy for each monitor
monitor_fold_accuracies = []

# Train and evaluate the Decision Tree Classifier for each fold
for i, (train_set, val_set) in enumerate(folds):
    # Extract features and labels
    X_train = train_set.iloc[:, 2:-1].values
    y_train = train_set['label'].astype(int).values
    X_val = val_set.iloc[:, 2:-1].values
    y_val = val_set['label'].astype(int).values

    # Create and train the Decision Tree Classifier
    clf = RandomForestClassifier(random_state=42, criterion='entropy')
    clf.fit(X_train, y_train)

    # Make predictions
    y_pred = clf.predict(X_val)

    # Calculate accuracy for each monitor
    for location_name in val_set['location_name'].unique():
        location_indices = val_set[val_set['location_name'] == location_name].index
        location_y_val = y_val[np.isin(val_set.index, location_indices)]
        location_y_pred = y_pred[np.isin(val_set.index, location_indices)]
        location_accuracy = accuracy_score(location_y_val, location_y_pred)

        # Store the results
        monitor_fold_accuracies.append({
            'fold': i + 1,
            'location_name': location_name,
            'accuracy': location_accuracy
        })

# Convert the results to a DataFrame
monitor_fold_accuracies_df = pd.DataFrame(monitor_fold_accuracies)

# Display the DataFrame
monitor_fold_accuracies_df.head()

Unnamed: 0,fold,location_name,accuracy
0,1,Grundy Park,0.43956
1,1,Sign Hill,0.553191
2,1,South San Francisco Westborough Park,0.707317
3,1,Commodore Dr,0.659341
4,1,Danger Stairs,0.356164


In [171]:
average_accuracy_by_fold = monitor_fold_accuracies_df.groupby(['location_name']).agg({'accuracy': 'mean'}).reset_index()
average_accuracy_by_fold = average_accuracy_by_fold.sort_values(by='accuracy', ascending=False)

average_accuracy_by_fold

Unnamed: 0,location_name,accuracy
20,South City Lights,0.767538
22,"Sunshine Gardens, SSF",0.755021
23,Valleyview &amp; Appian Way,0.747261
19,"Sign Hill, Stonegate",0.7455
14,Rollingwood Elementary,0.734906
15,SSF Cypress,0.73029
26,terra,0.718833
13,Rollingwood / Palmhaus,0.70903
21,South San Francisco Westborough Park,0.705793
3,Commodore Dr,0.70371


In [None]:
# Combine monitor accuracy for both models\
