## **PurpleAir Monitors** ##

In [1]:
import pandas as pd

# Load cleaned data
purple_df = pd.read_csv('../data/clean_purpleair.csv')
purple_df.head()

Unnamed: 0,time,location_name,location_id,latitude,longitude,pm2_5_1h_mean,pm2_5_1h_mean_aqi,pm2_5_24h_mean,pm2_5_24h_mean_aqi,temp,rh,elevation,pressure
0,2018-12-27 04:00:00,Grundy Park,21427,37.622585,-122.42097,0.736345,4.0,2.999879,16.0,53.318182,59.818182,86.0,
1,2018-12-27 05:00:00,Grundy Park,21427,37.622585,-122.42097,0.739827,4.0,2.999879,16.0,51.777778,59.955556,86.0,
2,2018-12-27 06:00:00,Grundy Park,21427,37.622585,-122.42097,1.038868,6.0,2.999879,16.0,52.068182,56.681818,86.0,
3,2018-12-27 07:00:00,Grundy Park,21427,37.622585,-122.42097,1.214613,7.0,2.999879,16.0,52.755556,56.933333,86.0,
4,2018-12-27 08:00:00,Grundy Park,21427,37.622585,-122.42097,1.127572,6.0,2.999879,16.0,65.883721,54.372093,86.0,


In [59]:
# Sort by monitor and time
purple_df = purple_df.sort_values(['location_name', 'time'])

# filter to times after 2024-10-12
filtered_times_df = purple_df[purple_df['time'] > '2024-10-12']

# Show earliest time for each location_name
earliest_times = filtered_times_df.groupby('location_id')['time'].min().reset_index()
earliest_times

Unnamed: 0,location_id,time
0,21427,2024-10-12 01:00:00
1,35103,2024-10-12 01:00:00
2,65711,2024-10-12 01:00:00
3,67419,2024-10-12 01:00:00
4,67553,2024-10-12 01:00:00
5,80869,2024-10-12 01:00:00
6,86761,2024-10-12 01:00:00
7,88655,2024-10-12 01:00:00
8,91617,2024-10-12 01:00:00
9,100355,2024-10-12 01:00:00


### **Preprocessing Check**

In [3]:
# Convert longitude and latitude to correct types
purple_df['longitude'] = purple_df['longitude'].apply(lambda x: -abs(x))
purple_df['latitude'] = purple_df['latitude'].apply(lambda x: abs(x))

# Convert time to datetime (if not already done)
purple_df['time'] = pd.to_datetime(purple_df['time'], format='%Y-%m-%d %H:%M:%S')

purple_df.head()

Unnamed: 0,time,location_name,location_id,latitude,longitude,pm2_5_1h_mean,pm2_5_1h_mean_aqi,pm2_5_24h_mean,pm2_5_24h_mean_aqi,temp,rh,elevation,pressure
576572,2024-10-12 00:00:00,805 Lomita Avenue,100355,37.608963,-122.41856,0.004,0.0,0.170667,1.0,78.233,35.0,,1011.435
576590,2024-10-12 01:00:00,805 Lomita Avenue,100355,37.608963,-122.41856,0.0,0.0,0.170667,1.0,78.0,35.0,,1011.807
576611,2024-10-12 02:00:00,805 Lomita Avenue,100355,37.608963,-122.41856,0.007,0.0,0.170667,1.0,77.434,35.0,,1011.808
576634,2024-10-12 03:00:00,805 Lomita Avenue,100355,37.608963,-122.41856,0.0,0.0,0.170667,1.0,76.966,35.966,,1011.542
576640,2024-10-12 04:00:00,805 Lomita Avenue,100355,37.608963,-122.41856,0.056,0.0,0.170667,1.0,77.0,36.034,,1011.55


In [4]:
# Create column for date only
purple_df['date'] = purple_df['time'].dt.date
purple_df['date'] = pd.to_datetime(purple_df['date'], format='%Y-%m-%d')

purple_df['date'].head()

576572   2024-10-12
576590   2024-10-12
576611   2024-10-12
576634   2024-10-12
576640   2024-10-12
Name: date, dtype: datetime64[ns]

### **Visualizing and Analyzing**

In [5]:
# Only take unique dates and their average pm2.5 concentrations
unique_dates = purple_df.drop_duplicates(subset=['location_name', 'date'], keep='first')

In [6]:
# Find outliers using IQR and separate them from the dataframe
def find_outliers_iqr(df, column):
    q1 = df[column].quantile(0.25)
    q3 = df[column].quantile(0.75)
    iqr = q3 - q1
    lower_bound = q1 - 1.5 * iqr
    upper_bound = q3 + 1.5 * iqr
    outliers = df[(df[column] < lower_bound) | (df[column] > upper_bound)]
    non_outliers = df[(df[column] >= lower_bound) & (df[column] <= upper_bound)]
    return non_outliers, outliers

# Remove outliers and keep them in a separate dataframe
unique_dates, outliers_df = find_outliers_iqr(unique_dates, 'pm2_5_24h_mean')

print("Number of outliers removed:", len(outliers_df))
print("Number of unique dates remaining:", len(unique_dates))

Number of outliers removed: 2104
Number of unique dates remaining: 24034


In [7]:
import plotly.express as px
from plotly.subplots import make_subplots
import plotly.graph_objects as go
from plotly.offline import plot

# Create figure with subplots
fig = make_subplots(rows=1, cols=2, subplot_titles=("Without Outliers", "Outliers"))

# Add histogram for PM2.5 values without outliers
fig.add_trace(
    go.Histogram(
        x=unique_dates['pm2_5_24h_mean'],
        nbinsx=35,
        marker=dict(color='blue'),
        hoverinfo='x+y'
    ),
    row=1, col=1
)

# Add histogram for PM2.5 values outliers
fig.add_trace(
    go.Box(
        x=outliers_df['pm2_5_24h_mean'],
        marker=dict(color='green'),
        hoverinfo='x',
        name=''
    ),
    row=1, col=2
)

# Update layout
fig.update_layout(title_text="Distribution of PM2.5 Values for PurpleAir Monitors",
                  showlegend=False)
fig.update_xaxes(title_text="Daily Average PM2.5 Concentration (µg/m³)", row=1, col=1)
fig.update_yaxes(title_text="Frequency", row=1, col=1)
fig.update_xaxes(title_text="Daily Average PM2.5 Concentration (µg/m³)", row=1, col=2)

# Save the figure offline
plot(fig, filename='../figures/purpleair_pm25_distribution.html', auto_open=True)

'../figures/purpleair_pm25_distribution.html'

In [8]:
# Calculate the rolling 24h mean for PM2.5 for each hour
purple_df['pm2_5_24h_rolling_mean'] = purple_df.groupby('location_name')['pm2_5_1h_mean'].transform(
    lambda x: x.rolling(window=24, min_periods=1).mean()
)

# Impute missing values in the rolling mean
purple_df['pm2_5_24h_rolling_mean'] = purple_df['pm2_5_24h_rolling_mean'].ffill()
purple_df['pm2_5_24h_rolling_mean'] = purple_df['pm2_5_24h_rolling_mean'].bfill()

purple_df.head()

Unnamed: 0,time,location_name,location_id,latitude,longitude,pm2_5_1h_mean,pm2_5_1h_mean_aqi,pm2_5_24h_mean,pm2_5_24h_mean_aqi,temp,rh,elevation,pressure,date,pm2_5_24h_rolling_mean
576572,2024-10-12 00:00:00,805 Lomita Avenue,100355,37.608963,-122.41856,0.004,0.0,0.170667,1.0,78.233,35.0,,1011.435,2024-10-12,0.004
576590,2024-10-12 01:00:00,805 Lomita Avenue,100355,37.608963,-122.41856,0.0,0.0,0.170667,1.0,78.0,35.0,,1011.807,2024-10-12,0.002
576611,2024-10-12 02:00:00,805 Lomita Avenue,100355,37.608963,-122.41856,0.007,0.0,0.170667,1.0,77.434,35.0,,1011.808,2024-10-12,0.003667
576634,2024-10-12 03:00:00,805 Lomita Avenue,100355,37.608963,-122.41856,0.0,0.0,0.170667,1.0,76.966,35.966,,1011.542,2024-10-12,0.00275
576640,2024-10-12 04:00:00,805 Lomita Avenue,100355,37.608963,-122.41856,0.056,0.0,0.170667,1.0,77.0,36.034,,1011.55,2024-10-12,0.0134


In [9]:
# AQI calculation function
def calculate_pm2_5_aqi(C_p):
    if pd.isna(C_p):
        return None

    C_p = float(str(C_p)[:str(C_p).find('.')+2]) if '.' in str(C_p) else float(C_p)

    breakpoints = [
        (0.0,   9.0,   0,   50),
        (9.1,   35.4,  51,  100),
        (35.5,  55.4,  101, 150),
        (55.5,  125.4, 151, 200),
        (125.5, 225.4, 201, 300),
        (225.5, 500.4, 301, 500)
    ]

    for BP_Lo, BP_Hi, I_Lo, I_Hi in breakpoints:
        if BP_Lo <= C_p <= BP_Hi:
            I_p = ((I_Hi - I_Lo) / (BP_Hi - BP_Lo)) * (C_p - BP_Lo) + I_Lo
            return round(I_p)

    return None

# Calculate AQI for PM2.5 24hr mean
purple_df['pm2_5_24h_rolling_mean_aqi'] = purple_df['pm2_5_24h_rolling_mean'].apply(calculate_pm2_5_aqi)

In [10]:
# Find outliers from dataframe
no_outliers_df, outliers_df = find_outliers_iqr(purple_df, 'pm2_5_24h_rolling_mean_aqi')

# Create figure with subplots
fig = make_subplots(rows=1, cols=2, subplot_titles=("Without Outliers", "Outliers"))

# Scatterplot of PM2.5 values for PurpleAir monitors without outliers
fig.add_trace(
        go.Scatter(
                x=no_outliers_df['pm2_5_1h_mean'],
                y=no_outliers_df['pm2_5_24h_rolling_mean_aqi'],
                mode='markers',
                marker=dict(color='blue', size=5),
                hovertext=no_outliers_df['location_name'],
                name=''
        ),
        row=1, col=1
)

# Scatterplot of PM2.5 values for PurpleAir monitors with outliers
fig.add_trace(
        go.Scatter(
                x=outliers_df['pm2_5_1h_mean'],
                y=outliers_df['pm2_5_24h_rolling_mean_aqi'],
                mode='markers',
                marker=dict(color='red', size=5),
                hovertext=outliers_df['location_name'],
                name=''
        ),
        row=1, col=2
)

# Update axis titles
fig.update_xaxes(title_text="Hourly Average PM2.5 Concentration (µg/m³)", row=1, col=1)
fig.update_yaxes(title_text="Daily Average PM2.5 AQI", row=1, col=1)
fig.update_xaxes(title_text="Hourly Average PM2.5 Concentration (µg/m³)", row=1, col=2)
fig.update_yaxes(title_text="Daily Average PM2.5 AQI", row=1, col=2)

# Update layout
fig.update_layout(title_text="Scatter Plots of PM2.5 Values for PurpleAir Monitors",
                                  showlegend=False)

# Save the figure offline
plot(fig, filename='../figures/purpleair_pm25_aqi_scatter.html', auto_open=True)

'../figures/purpleair_pm25_aqi_scatter.html'

## **Clarity Monitors**

In [11]:
# Load cleaned data for Clarity monitors
clarity_df = pd.read_csv('../data/clean_clarity.csv')

# Sort by monitor and time
clarity_df = clarity_df.sort_values(['location_name', 'time'])

# Convert longitude and latitude to correct types
clarity_df['longitude'] = clarity_df['longitude'].apply(lambda x: -abs(x))
clarity_df['latitude'] = clarity_df['latitude'].apply(lambda x: abs(x))

# Convert time to datetime (if not already done)
clarity_df['time'] = pd.to_datetime(clarity_df['time'], format='%Y-%m-%d %H:%M:%S')

# Create column for date only
clarity_df['date'] = clarity_df['time'].dt.date
clarity_df['date'] = pd.to_datetime(clarity_df['date'], format='%Y-%m-%d')

# Only take unique dates and their average pm2.5 concentrations
unique_dates_clarity = clarity_df.drop_duplicates(subset=['location_name', 'date'], keep='first')

# Find outliers using IQR and separate them from the dataframe
unique_dates_clarity, outliers_df_clarity = find_outliers_iqr(unique_dates_clarity, 'pm2_5_24h_mean')

# Create figure with subplots
fig = make_subplots(rows=1, cols=2, subplot_titles=("Without Outliers", "Outliers"))

# Add histogram for PM2.5 values without outliers
fig.add_trace(
    go.Histogram(
        x=unique_dates_clarity['pm2_5_24h_mean'],
        nbinsx=35,
        marker=dict(color='blue'),
        hoverinfo='x+y'
    ),
    row=1, col=1
)

# Add histogram for PM2.5 values outliers
fig.add_trace(
    go.Box(
        x=outliers_df_clarity['pm2_5_24h_mean'],
        marker=dict(color='green'),
        hoverinfo='x',
        name=''
    ),
    row=1, col=2
)

# Update layout
fig.update_layout(title_text="Distribution of PM2.5 Values for Clarity Monitors",
                  showlegend=False)
fig.update_xaxes(title_text="Daily Average PM2.5 Concentration (µg/m³)", row=1, col=1)
fig.update_yaxes(title_text="Frequency", row=1, col=1)
fig.update_xaxes(title_text="Daily Average PM2.5 Concentration (µg/m³)", row=1, col=2)

# Save the figure offline
plot(fig, filename='../figures/clarity_pm25_distribution.html', auto_open=True)

'../figures/clarity_pm25_distribution.html'

In [12]:
# Calculate the rolling 24h mean for PM2.5 for each hour
clarity_df['pm2_5_24h_rolling_mean'] = clarity_df.groupby('location_name')['pm2_5_1h_mean'].transform(
    lambda x: x.rolling(window=24, min_periods=1).mean()
)

# Impute missing values in the rolling mean
clarity_df['pm2_5_24h_rolling_mean'] = clarity_df['pm2_5_24h_rolling_mean'].ffill()
clarity_df['pm2_5_24h_rolling_mean'] = clarity_df['pm2_5_24h_rolling_mean'].bfill()

# Calculate AQI for PM2.5 24hr mean
clarity_df['pm2_5_24h_rolling_mean_aqi'] = clarity_df['pm2_5_24h_rolling_mean'].apply(calculate_pm2_5_aqi)

# Find outliers from dataframe
no_outliers_df_clarity, outliers_df_clarity = find_outliers_iqr(clarity_df, 'pm2_5_24h_rolling_mean_aqi')

# Create figure with subplots
fig = make_subplots(rows=1, cols=2, subplot_titles=("Without Outliers", "Outliers"))

# Scatterplot of PM2.5 values for Clarity monitors without outliers
fig.add_trace(
        go.Scatter(
                x=no_outliers_df_clarity['pm2_5_1h_mean'],
                y=no_outliers_df_clarity['pm2_5_24h_rolling_mean_aqi'],
                mode='markers',
                marker=dict(color='blue', size=5),
                hovertext=no_outliers_df_clarity['location_name'],
                name=''
        ),
        row=1, col=1
)

# Scatterplot of PM2.5 values for Clarity monitors with outliers
fig.add_trace(
        go.Scatter(
                x=outliers_df_clarity['pm2_5_1h_mean'],
                y=outliers_df_clarity['pm2_5_24h_rolling_mean_aqi'],
                mode='markers',
                marker=dict(color='red', size=5),
                hovertext=outliers_df_clarity['location_name'],
                name=''
        ),
        row=1, col=2
)

# Update axis titles
fig.update_xaxes(title_text="Hourly Average PM2.5 Concentration (µg/m³)", row=1, col=1)
fig.update_yaxes(title_text="Daily Average PM2.5 AQI", row=1, col=1)
fig.update_xaxes(title_text="Hourly Average PM2.5 Concentration (µg/m³)", row=1, col=2)
fig.update_yaxes(title_text="Daily Average PM2.5 AQI", row=1, col=2)

# Update layout
fig.update_layout(title_text="Scatter Plots of PM2.5 Values for Clarity Monitors",
                                  showlegend=False)

# Save the figure offline
plot(fig, filename='../figures/clarity_pm25_aqi_scatter.html', auto_open=True)

'../figures/clarity_pm25_aqi_scatter.html'

In [13]:
# Keep the clarity_df_grouped definition
clarity_df_grouped = clarity_df.groupby(['date']).agg({'pm2_5_1h_mean': 'mean'}).reset_index()
clarity_df_grouped['month_year_str'] = clarity_df_grouped['date'].dt.strftime('%B')
clarity_df_grouped['day'] = clarity_df_grouped['date'].dt.day

# Create a facet grid with month-year as the facet column
fig = px.line(
    clarity_df_grouped,
    x="day",
    y="pm2_5_1h_mean",
    facet_col="month_year_str",
    facet_col_wrap=6,
    title="PM2.5 Concentrations by Month (Clarity Monitors)",
    labels={"pm2_5_1h_mean": "PM2.5 (1-hour mean)", "day": "Day of Month", "month_year_str": "Month"},
    height=400
)

# Update layout for better visualization
fig.update_layout(
    margin=dict(t=50, l=50, r=50, b=50),
    title_x=0.5
)

# Show the plot
fig.show()


## **Self-Prediction**

In [None]:
# Combine the two dataframes: purple and clarity

In [19]:
# Filter the data
filtered_df = purple_df[['location_id', 'location_name', 'date', 'time', 'pm2_5_24h_mean']]
filtered_df.head()

Unnamed: 0,location_id,location_name,date,time,pm2_5_24h_mean
576572,100355,805 Lomita Avenue,2024-10-12,2024-10-12 00:00:00,0.170667
576590,100355,805 Lomita Avenue,2024-10-12,2024-10-12 01:00:00,0.170667
576611,100355,805 Lomita Avenue,2024-10-12,2024-10-12 02:00:00,0.170667
576634,100355,805 Lomita Avenue,2024-10-12,2024-10-12 03:00:00,0.170667
576640,100355,805 Lomita Avenue,2024-10-12,2024-10-12 04:00:00,0.170667


In [20]:
# Round the pm2_5_24h_mean values to 2 decimal places
filtered_df['pm2_5_24h_mean'] = filtered_df['pm2_5_24h_mean'].round(2)
filtered_df.head()



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



Unnamed: 0,location_id,location_name,date,time,pm2_5_24h_mean
576572,100355,805 Lomita Avenue,2024-10-12,2024-10-12 00:00:00,0.17
576590,100355,805 Lomita Avenue,2024-10-12,2024-10-12 01:00:00,0.17
576611,100355,805 Lomita Avenue,2024-10-12,2024-10-12 02:00:00,0.17
576634,100355,805 Lomita Avenue,2024-10-12,2024-10-12 03:00:00,0.17
576640,100355,805 Lomita Avenue,2024-10-12,2024-10-12 04:00:00,0.17


In [None]:
# Remove outliers from data (either removing, or just imputing)


In [30]:
# Get the unique values by date
unique_dates_filtered = filtered_df.drop_duplicates(subset=['location_name', 'date'], keep='first')
unique_dates_filtered.head()

Unnamed: 0,location_id,location_name,date,time,pm2_5_24h_mean
576572,100355,805 Lomita Avenue,2024-10-12,2024-10-12,0.17
576993,100355,805 Lomita Avenue,2024-10-13,2024-10-13,6.62
577389,100355,805 Lomita Avenue,2024-10-14,2024-10-14,3.84
577807,100355,805 Lomita Avenue,2024-10-15,2024-10-15,2.55
578208,100355,805 Lomita Avenue,2024-10-16,2024-10-16,0.44


In [37]:
# Distribution of PM2.5 values for PurpleAir monitors
px.histogram(unique_dates_filtered[unique_dates_filtered['pm2_5_24h_mean'] <= 200], 
             x='pm2_5_24h_mean', title='Distribution of PM2.5 24hr Mean Values', nbins=100).show()

In [34]:
import numpy as np

# Distribution of the pm2_5_24h_mean values
# Apply log transformation to the pm2_5_24h_mean values
unique_dates_filtered['pm2_5_24h_mean_log'] = unique_dates_filtered['pm2_5_24h_mean'].apply(lambda x: np.log(x + 1))
px.histogram(unique_dates_filtered, x='pm2_5_24h_mean_log', title='Distribution of Log PM2.5 24hr Mean Values', nbins=35).show()



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [51]:
# Create bins for the log pm2_5_24h_mean values and get edges of the bins
unique_dates_filtered['bins'] = pd.qcut(unique_dates_filtered['pm2_5_24h_mean_log'], q=20)

# Get the edges of the bins
edges = unique_dates_filtered['bins'].cat.categories
edges = [(edges[i].left, edges[i].right) for i in range(len(edges))]
edges = [(edges[i][0].round(2), edges[i][1].round(2)) for i in range(len(edges))]
edges = [(np.exp(edges[i][0]) - 1, np.exp(edges[i][1]) - 1) for i in range(len(edges))]

edges



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



[(0.0, 0.17351087099181028),
 (0.17351087099181028, 0.8221188003905089),
 (0.8221188003905089, 1.2933187402641826),
 (1.2933187402641826, 1.6644562419294169),
 (1.6644562419294169, 2.0041660239464334),
 (2.0041660239464334, 2.3201169227365472),
 (2.3201169227365472, 2.6327865557528094),
 (2.6327865557528094, 2.9353506954704733),
 (2.9353506954704733, 3.263114515168817),
 (3.263114515168817, 3.618176822299781),
 (3.618176822299781, 4.002811227833588),
 (4.002811227833588, 4.4739473917272),
 (4.4739473917272, 4.9894524663831135),
 (4.9894524663831135, 5.619368681043077),
 (5.619368681043077, 6.315533762309567),
 (6.315533762309567, 7.331137487687693),
 (7.331137487687693, 8.67940081407284),
 (8.67940081407284, 11.182493960703473),
 (11.182493960703473, 17.17414536944306),
 (17.17414536944306, 1771.2407759321766)]

In [34]:
import numpy as np

X = []  # past 168 hours
y = []  # next 24 hours

# Parameters
window_size = 24 * 7  # 168 hours = 1 week
output_size = 24  # 24 hours

# Location ID to index mapping in arrays
location_index_map = {}

for location, group in filtered_df.groupby('location_id'):
    group = group.sort_values('time').reset_index(drop=True)
    values = group[['pm2_5_1h_mean']].values  # use only one feature
    start_index = len(X)  # Track the starting index for this location in X
    
    for i in range(len(values) - window_size - output_size):
        X.append(values[i:i+window_size])
        y.append(values[i+window_size:i+window_size+output_size])
    
    end_index = len(X)  # Track the ending index for this location in X
    location_index_map[location] = (start_index, end_index)
        
X = np.array(X)  # shape: (samples, 168, 1)
y = np.array(y)  # shape: (samples, 24, 1)

X.shape, y.shape

((143792, 168, 1), (143792, 24, 1))

In [35]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

class TimeSeriesDataset(Dataset):
    def __init__(self, X, y):
        self.X = torch.tensor(X, dtype=torch.float32)
        self.y = torch.tensor(y, dtype=torch.float32)

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]

class AirForecastLSTM(nn.Module):
    def __init__(self, input_size=2, hidden_size=64, num_layers=2, output_size=2):
        super(AirForecastLSTM, self).__init__()
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size * 24)
        self.output_size = output_size

    def forward(self, x):
        lstm_out, _ = self.lstm(x)              # (batch, 168, hidden)
        last_output = lstm_out[:, -1, :]        # (batch, hidden)
        out = self.fc(last_output)              # (batch, 24 * 3)
        return out.view(-1, 24, self.output_size)  # (batch, 24, 3)

In [36]:
def train_model(model, dataloader, epochs=10, lr=1e-3):
    """
    AirForecast LSTM model training function.
    Args:
        model (nn.Module): The LSTM model to train.
        dataloader (DataLoader): DataLoader for the training data.
        epochs (int): Number of training epochs.
        lr (float): Learning rate for the optimizer.
    Returns:
        model (nn.Module): The trained LSTM model.
    """
    # Find device if available for training
    device = torch.device("mps" if torch.backends.mps.is_available() else "cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)

    # Loss and optimizer
    criterion = nn.MSELoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)

    # Training loop
    for epoch in range(epochs):
        model.train()
        epoch_loss = 0

        # Iterate over batches
        for X_batch, y_batch in dataloader:
            X_batch, y_batch = X_batch.to(device), y_batch.to(device)

            # Forward pass
            preds = model(X_batch)
            loss = criterion(preds, y_batch)

            # Backward pass and optimization
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            epoch_loss += loss.item()

        print(f"Epoch {epoch+1}/{epochs}, Loss: {epoch_loss / len(dataloader):.4f}")

    return model

def evaluate(model, dataloader):
    """
    AirForecast LSTM model evaluation function.
    Args:
        model (nn.Module): The LSTM model to evaluate.
        dataloader (DataLoader): DataLoader for the validation data.
    Returns:
        preds (np.ndarray): Predicted values.
        truths (np.ndarray): True values.
    """
    model.eval()
    preds, truths = [], []

    with torch.no_grad():
        for X_batch, y_batch in dataloader:
            out = model(X_batch)
            preds.append(out.squeeze(-1).numpy())
            truths.append(y_batch.squeeze(-1).numpy())

    preds = np.concatenate(preds)
    truths = np.concatenate(truths)
    mse = (np.mean((preds - truths) ** 2))
    print(f"Validation MSE: {mse:.4f}")
    return preds, truths

In [37]:
# Check for MPS device
device = torch.device("mps" if torch.backends.mps.is_available() else "cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Ensure all monitors are included in the validation set using location_index_map
val_indices = []

for location, (start_index, end_index) in location_index_map.items():
    split_index = int((end_index - start_index) * 0.8) + start_index
    if split_index < end_index:  # Ensure split_index is within bounds
        val_indices.extend(range(split_index, end_index))

val_indices = np.array(val_indices)
train_indices = np.setdiff1d(np.arange(len(X)), val_indices)

X_train, y_train = X[train_indices], y[train_indices]
X_val, y_val = X[val_indices], y[val_indices]

# Create TimeSeriesDataset instances for training and validation sets
train_dataset = TimeSeriesDataset(X_train, y_train)
val_dataset = TimeSeriesDataset(X_val, y_val)

# Create DataLoader for training and validation sets
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=64)

# Train the model
model = AirForecastLSTM(input_size=1, output_size=1)
trained_model = train_model(model, train_loader, epochs=20)

Using device: mps
Epoch 1/20, Loss: 2448.0669
Epoch 2/20, Loss: 2395.0831
Epoch 3/20, Loss: 2354.2942
Epoch 4/20, Loss: 2319.5651
Epoch 5/20, Loss: 2277.9930
Epoch 6/20, Loss: 2257.9340
Epoch 7/20, Loss: 2218.4293
Epoch 8/20, Loss: 2165.6279
Epoch 9/20, Loss: 2122.3734
Epoch 10/20, Loss: 2098.1370
Epoch 11/20, Loss: 2036.7691
Epoch 12/20, Loss: 2040.4579
Epoch 13/20, Loss: 2011.4831
Epoch 14/20, Loss: 1964.7786
Epoch 15/20, Loss: 1932.7549
Epoch 16/20, Loss: 1898.8006
Epoch 17/20, Loss: 1872.3779
Epoch 18/20, Loss: 1839.5265
Epoch 19/20, Loss: 1820.5571
Epoch 20/20, Loss: 1760.8221


In [38]:
# Evaluate the model by monitor location
# Move the trained model back to CPU
trained_model = trained_model.to('cpu')

# Evaluate the model by monitor location
predictions, truths = evaluate(trained_model, val_loader)

Validation MSE: 10.3852


In [39]:
from sklearn.metrics import mean_squared_error

# Calculate validation error per monitor location using location_index_map
validation_errors = []

for location_id, (start_index, end_index) in location_index_map.items():
    # Filter validation indices for the current location
    location_val_indices = val_indices[(val_indices >= start_index) & (val_indices < end_index)]
    
    if len(location_val_indices) > 0:
        # Extract predictions and truths for the current location
        location_preds = predictions[location_val_indices - start_index]
        location_truths = truths[location_val_indices - start_index]
        
        # Flatten the arrays for MSE calculation
        location_preds_flat = location_preds.flatten()
        location_truths_flat = location_truths.flatten()
        
        # Calculate MSE for the current location
        mse = mean_squared_error(location_truths_flat, location_preds_flat)
        validation_errors.append({'location_id': location_id, 'mse': mse})

# Create a DataFrame with the validation errors per location
validation_errors_df = pd.DataFrame(validation_errors)

# Sort the DataFrame by MSE
validation_errors_df = validation_errors_df.sort_values(by='mse', ascending=True)
validation_errors_df


Unnamed: 0,location_id,mse
33,DVRGV9737,1.581805
28,DNSEJ7404,1.581805
27,DMEYT2138,1.581805
23,DHPSP8686,1.581805
20,DCVIM2201,1.581805
2,109192,1.742884
19,91617,1.742884
7,158259,1.742884
18,88655,1.742884
15,67553,1.742884
