## **PurpleAir Monitors** ##

In [3]:
import pandas as pd

# Load cleaned data
purple_df = pd.read_csv('../data/clean_purpleair.csv')
purple_df.head()

Unnamed: 0,time,location_name,location_id,latitude,longitude,pm2_5_1h_mean,pm2_5_1h_mean_aqi,pm2_5_24h_mean,pm2_5_24h_mean_aqi,temp,rh,elevation,pressure
0,2018-12-27 04:00:00,Grundy Park,21427,37.622585,-122.42097,0.736345,4.0,2.999879,16.0,53.318182,59.818182,86.0,
1,2018-12-27 05:00:00,Grundy Park,21427,37.622585,-122.42097,0.739827,4.0,2.999879,16.0,51.777778,59.955556,86.0,
2,2018-12-27 06:00:00,Grundy Park,21427,37.622585,-122.42097,1.038868,6.0,2.999879,16.0,52.068182,56.681818,86.0,
3,2018-12-27 07:00:00,Grundy Park,21427,37.622585,-122.42097,1.214613,7.0,2.999879,16.0,52.755556,56.933333,86.0,
4,2018-12-27 08:00:00,Grundy Park,21427,37.622585,-122.42097,1.127572,6.0,2.999879,16.0,65.883721,54.372093,86.0,


In [8]:
# Sort by monitor and time
purple_df = purple_df.sort_values(['location_name', 'time'])
purple_df.head()

Unnamed: 0,time,location_name,location_id,latitude,longitude,pm2_5_1h_mean,pm2_5_1h_mean_aqi,pm2_5_24h_mean,pm2_5_24h_mean_aqi,temp,rh,elevation,pressure
576572,2024-10-12 00:00:00,805 Lomita Avenue,100355,37.608963,-122.41856,0.004,0.0,0.170667,1.0,78.233,35.0,,1011.435
576590,2024-10-12 01:00:00,805 Lomita Avenue,100355,37.608963,-122.41856,0.0,0.0,0.170667,1.0,78.0,35.0,,1011.807
576611,2024-10-12 02:00:00,805 Lomita Avenue,100355,37.608963,-122.41856,0.007,0.0,0.170667,1.0,77.434,35.0,,1011.808
576634,2024-10-12 03:00:00,805 Lomita Avenue,100355,37.608963,-122.41856,0.0,0.0,0.170667,1.0,76.966,35.966,,1011.542
576640,2024-10-12 04:00:00,805 Lomita Avenue,100355,37.608963,-122.41856,0.056,0.0,0.170667,1.0,77.0,36.034,,1011.55


### **Preprocessing Check**

In [9]:
# Convert longitude and latitude to correct types
purple_df['longitude'] = purple_df['longitude'].apply(lambda x: -abs(x))
purple_df['latitude'] = purple_df['latitude'].apply(lambda x: abs(x))

# Convert time to datetime (if not already done)
purple_df['time'] = pd.to_datetime(purple_df['time'], format='%Y-%m-%d %H:%M:%S')

purple_df.head()

Unnamed: 0,time,location_name,location_id,latitude,longitude,pm2_5_1h_mean,pm2_5_1h_mean_aqi,pm2_5_24h_mean,pm2_5_24h_mean_aqi,temp,rh,elevation,pressure
576572,2024-10-12 00:00:00,805 Lomita Avenue,100355,37.608963,-122.41856,0.004,0.0,0.170667,1.0,78.233,35.0,,1011.435
576590,2024-10-12 01:00:00,805 Lomita Avenue,100355,37.608963,-122.41856,0.0,0.0,0.170667,1.0,78.0,35.0,,1011.807
576611,2024-10-12 02:00:00,805 Lomita Avenue,100355,37.608963,-122.41856,0.007,0.0,0.170667,1.0,77.434,35.0,,1011.808
576634,2024-10-12 03:00:00,805 Lomita Avenue,100355,37.608963,-122.41856,0.0,0.0,0.170667,1.0,76.966,35.966,,1011.542
576640,2024-10-12 04:00:00,805 Lomita Avenue,100355,37.608963,-122.41856,0.056,0.0,0.170667,1.0,77.0,36.034,,1011.55


In [19]:
# Create column for date only
purple_df['date'] = purple_df['time'].dt.date
purple_df['date'] = pd.to_datetime(purple_df['date'], format='%Y-%m-%d')

purple_df['date'].head()

576572   2024-10-12
576590   2024-10-12
576611   2024-10-12
576634   2024-10-12
576640   2024-10-12
Name: date, dtype: datetime64[ns]

### **Visualizing and Analyzing**

In [57]:
# Only take unique dates and their average pm2.5 concentrations
unique_dates = purple_df.drop_duplicates(subset=['location_name', 'date'], keep='first')

In [58]:
# Find outliers using IQR and separate them from the dataframe
def find_outliers_iqr(df, column):
    q1 = df[column].quantile(0.25)
    q3 = df[column].quantile(0.75)
    iqr = q3 - q1
    lower_bound = q1 - 1.5 * iqr
    upper_bound = q3 + 1.5 * iqr
    outliers = df[(df[column] < lower_bound) | (df[column] > upper_bound)]
    non_outliers = df[(df[column] >= lower_bound) & (df[column] <= upper_bound)]
    return non_outliers, outliers

# Remove outliers and keep them in a separate dataframe
unique_dates, outliers_df = find_outliers_iqr(unique_dates, 'pm2_5_24h_mean')

print("Number of outliers removed:", len(outliers_df))
print("Number of unique dates remaining:", len(unique_dates))

Number of outliers removed: 2104
Number of unique dates remaining: 24034


In [59]:
import plotly.express as px
# Visualize distribution of PM2.5 values (w/o outliers) for PurpleAir monitors
# Group by location and only consider unique dates
fig = px.histogram(unique_dates, x='pm2_5_24h_mean',
                   nbins=35,
                   title='Distribution of PM2.5 Values for PurpleAir Monitors',
                   labels={'pm2_5_24h_mean': 'Daily Average PM2.5 Concentration (µg/m³)', 'location_name': 'Location'},
                   marginal='box',  # Add box plot to the histogram
                   hover_data=['date'])  # Show date on hover

fig.show()

In [60]:
# Visualize distribution of PM2.5 outliers for PurpleAir monitors
# Group by location and only consider unique dates
fig_outliers = px.box(outliers_df, x='pm2_5_24h_mean',
                        title='Distribution of PM2.5 Outliers for PurpleAir Monitors',
                        labels={'pm2_5_24h_mean': 'Daily Average PM2.5 Concentration (µg/m³)', 'location_name': 'Location'},
                        hover_data=['date'])  # Show date on hover

fig_outliers.show()

In [7]:
filtered_df = purple_df[['location_id', 'time', 'pm2_5_1h_mean']]
filtered_df.head()

Unnamed: 0,location_id,time,pm2_5_1h_mean
576572,100355,2024-10-12 00:00:00,0.004
576590,100355,2024-10-12 01:00:00,0.0
576611,100355,2024-10-12 02:00:00,0.007
576634,100355,2024-10-12 03:00:00,0.0
576640,100355,2024-10-12 04:00:00,0.056


In [34]:
import numpy as np

X = []  # past 168 hours
y = []  # next 24 hours

# Parameters
window_size = 24 * 7  # 168 hours = 1 week
output_size = 24  # 24 hours

# Location ID to index mapping in arrays
location_index_map = {}

for location, group in filtered_df.groupby('location_id'):
    group = group.sort_values('time').reset_index(drop=True)
    values = group[['pm2_5_1h_mean']].values  # use only one feature
    start_index = len(X)  # Track the starting index for this location in X
    
    for i in range(len(values) - window_size - output_size):
        X.append(values[i:i+window_size])
        y.append(values[i+window_size:i+window_size+output_size])
    
    end_index = len(X)  # Track the ending index for this location in X
    location_index_map[location] = (start_index, end_index)
        
X = np.array(X)  # shape: (samples, 168, 1)
y = np.array(y)  # shape: (samples, 24, 1)

X.shape, y.shape

((143792, 168, 1), (143792, 24, 1))

In [35]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

class TimeSeriesDataset(Dataset):
    def __init__(self, X, y):
        self.X = torch.tensor(X, dtype=torch.float32)
        self.y = torch.tensor(y, dtype=torch.float32)

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]

class AirForecastLSTM(nn.Module):
    def __init__(self, input_size=2, hidden_size=64, num_layers=2, output_size=2):
        super(AirForecastLSTM, self).__init__()
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size * 24)
        self.output_size = output_size

    def forward(self, x):
        lstm_out, _ = self.lstm(x)              # (batch, 168, hidden)
        last_output = lstm_out[:, -1, :]        # (batch, hidden)
        out = self.fc(last_output)              # (batch, 24 * 3)
        return out.view(-1, 24, self.output_size)  # (batch, 24, 3)

In [36]:
def train_model(model, dataloader, epochs=10, lr=1e-3):
    """
    AirForecast LSTM model training function.
    Args:
        model (nn.Module): The LSTM model to train.
        dataloader (DataLoader): DataLoader for the training data.
        epochs (int): Number of training epochs.
        lr (float): Learning rate for the optimizer.
    Returns:
        model (nn.Module): The trained LSTM model.
    """
    # Find device if available for training
    device = torch.device("mps" if torch.backends.mps.is_available() else "cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)

    # Loss and optimizer
    criterion = nn.MSELoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)

    # Training loop
    for epoch in range(epochs):
        model.train()
        epoch_loss = 0

        # Iterate over batches
        for X_batch, y_batch in dataloader:
            X_batch, y_batch = X_batch.to(device), y_batch.to(device)

            # Forward pass
            preds = model(X_batch)
            loss = criterion(preds, y_batch)

            # Backward pass and optimization
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            epoch_loss += loss.item()

        print(f"Epoch {epoch+1}/{epochs}, Loss: {epoch_loss / len(dataloader):.4f}")

    return model

def evaluate(model, dataloader):
    """
    AirForecast LSTM model evaluation function.
    Args:
        model (nn.Module): The LSTM model to evaluate.
        dataloader (DataLoader): DataLoader for the validation data.
    Returns:
        preds (np.ndarray): Predicted values.
        truths (np.ndarray): True values.
    """
    model.eval()
    preds, truths = [], []

    with torch.no_grad():
        for X_batch, y_batch in dataloader:
            out = model(X_batch)
            preds.append(out.squeeze(-1).numpy())
            truths.append(y_batch.squeeze(-1).numpy())

    preds = np.concatenate(preds)
    truths = np.concatenate(truths)
    mse = (np.mean((preds - truths) ** 2))
    print(f"Validation MSE: {mse:.4f}")
    return preds, truths

In [37]:
# Check for MPS device
device = torch.device("mps" if torch.backends.mps.is_available() else "cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Ensure all monitors are included in the validation set using location_index_map
val_indices = []

for location, (start_index, end_index) in location_index_map.items():
    split_index = int((end_index - start_index) * 0.8) + start_index
    if split_index < end_index:  # Ensure split_index is within bounds
        val_indices.extend(range(split_index, end_index))

val_indices = np.array(val_indices)
train_indices = np.setdiff1d(np.arange(len(X)), val_indices)

X_train, y_train = X[train_indices], y[train_indices]
X_val, y_val = X[val_indices], y[val_indices]

# Create TimeSeriesDataset instances for training and validation sets
train_dataset = TimeSeriesDataset(X_train, y_train)
val_dataset = TimeSeriesDataset(X_val, y_val)

# Create DataLoader for training and validation sets
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=64)

# Train the model
model = AirForecastLSTM(input_size=1, output_size=1)
trained_model = train_model(model, train_loader, epochs=20)

Using device: mps
Epoch 1/20, Loss: 2448.0669
Epoch 2/20, Loss: 2395.0831
Epoch 3/20, Loss: 2354.2942
Epoch 4/20, Loss: 2319.5651
Epoch 5/20, Loss: 2277.9930
Epoch 6/20, Loss: 2257.9340
Epoch 7/20, Loss: 2218.4293
Epoch 8/20, Loss: 2165.6279
Epoch 9/20, Loss: 2122.3734
Epoch 10/20, Loss: 2098.1370
Epoch 11/20, Loss: 2036.7691
Epoch 12/20, Loss: 2040.4579
Epoch 13/20, Loss: 2011.4831
Epoch 14/20, Loss: 1964.7786
Epoch 15/20, Loss: 1932.7549
Epoch 16/20, Loss: 1898.8006
Epoch 17/20, Loss: 1872.3779
Epoch 18/20, Loss: 1839.5265
Epoch 19/20, Loss: 1820.5571
Epoch 20/20, Loss: 1760.8221


In [38]:
# Evaluate the model by monitor location
# Move the trained model back to CPU
trained_model = trained_model.to('cpu')

# Evaluate the model by monitor location
predictions, truths = evaluate(trained_model, val_loader)

Validation MSE: 10.3852


In [39]:
from sklearn.metrics import mean_squared_error

# Calculate validation error per monitor location using location_index_map
validation_errors = []

for location_id, (start_index, end_index) in location_index_map.items():
    # Filter validation indices for the current location
    location_val_indices = val_indices[(val_indices >= start_index) & (val_indices < end_index)]
    
    if len(location_val_indices) > 0:
        # Extract predictions and truths for the current location
        location_preds = predictions[location_val_indices - start_index]
        location_truths = truths[location_val_indices - start_index]
        
        # Flatten the arrays for MSE calculation
        location_preds_flat = location_preds.flatten()
        location_truths_flat = location_truths.flatten()
        
        # Calculate MSE for the current location
        mse = mean_squared_error(location_truths_flat, location_preds_flat)
        validation_errors.append({'location_id': location_id, 'mse': mse})

# Create a DataFrame with the validation errors per location
validation_errors_df = pd.DataFrame(validation_errors)

# Sort the DataFrame by MSE
validation_errors_df = validation_errors_df.sort_values(by='mse', ascending=True)
validation_errors_df


Unnamed: 0,location_id,mse
33,DVRGV9737,1.581805
28,DNSEJ7404,1.581805
27,DMEYT2138,1.581805
23,DHPSP8686,1.581805
20,DCVIM2201,1.581805
2,109192,1.742884
19,91617,1.742884
7,158259,1.742884
18,88655,1.742884
15,67553,1.742884


In [15]:
import plotly.express as px

# Make histogram of pm2_5_1h_mean readings
fig = px.histogram(df, x='pm2_5_1h_mean', nbins=100, title='Histogram of PM2.5 1-Hour Mean Readings')
fig.update_layout(xaxis_title='PM2.5 1-Hour Mean', yaxis_title='Count')
fig.show()

In [12]:
# Clip the DataFrame to remove outliers
clipped_df = df[df['pm2_5_1h_mean'] < 50]

# Make histogram of pm2_5_1h_mean readings
fig = px.histogram(clipped_df, x='pm2_5_1h_mean', nbins=25, title='Histogram of PM2.5 1-Hour Mean Readings')
fig.update_layout(xaxis_title='PM2.5 1-Hour Mean', yaxis_title='Count')
fig.show()