Weather Forecasting with 4 features using historical data via TDNN

In [None]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from torch.utils.data import DataLoader, TensorDataset

# Required Import Statements
import matplotlib.pyplot as plt
from sklearn.metrics import mean_absolute_error, mean_squared_error
import math


In [None]:
# Links to the 6 weather datasets (daily observations, one CSV per station)
url_ottawa = "https://raw.githubusercontent.com/noobstang/NNtraining/master/Weather49Sets/weatherstats_ottawa_daily.csv"
url_ottawa_south = "https://raw.githubusercontent.com/noobstang/NNtraining/master/Weather49Sets/weatherstats_ottawasouth_daily.csv"
url_gatineau = "https://raw.githubusercontent.com/noobstang/NNtraining/master/Weather49Sets/weatherstats_gatineau_daily.csv"
url_chelsea = "https://raw.githubusercontent.com/noobstang/NNtraining/master/Weather49Sets/weatherstats_chelsea_daily.csv"
url_kemptville = "https://raw.githubusercontent.com/noobstang/NNtraining/master/Weather49Sets/weatherstats_kemptville_daily.csv"
url_renfrew = "https://raw.githubusercontent.com/noobstang/NNtraining/master/Weather49Sets/weatherstats_renfrew_daily.csv"

# Load and preprocess data (switch `url` to any of the links above)
url = url_ottawa
data = pd.read_csv(url)

# Date filtering: keep May-November of the years 2013-2023
data['date'] = pd.to_datetime(data['date'])
filtered_data = data[(data['date'].dt.month >= 5) & (data['date'].dt.month <= 11)]  # restrict data range from May to November
filtered_data = filtered_data[(filtered_data['date'].dt.year >= 2013) & (filtered_data['date'].dt.year <= 2023)]  # overall data range from years 2013-2023

# Every later step (sliding windows, `.tail(14)`) treats row order as time
# order, so make it explicit instead of relying on the CSV's row order.
# NOTE(review): the file's native ordering is not visible from here; sorting is
# a no-op if it is already oldest-first.
filtered_data = filtered_data.sort_values('date')

selected_columns = ['avg_hourly_temperature', 'precipitation', 'solar_radiation', 'avg_hourly_pressure_station']  # set 4 features for input and output
final_data = filtered_data[selected_columns]

# Test subset: 2023 only. Note that `test_data` holds just the feature columns
# (no 'date'); use `filtered_data_test` when dates are needed.
filtered_data_test = filtered_data[(filtered_data['date'].dt.year == 2023)]  # data from 2023 only
test_data = filtered_data_test[selected_columns]

# Window size (days of history per sample) and prediction horizon (days)
window_size = 14
pred_size = 1

# Handle missing values
#final_data = final_data.ffill()  # option 1: forward fill
final_data = final_data.dropna()  # option 2: drop data with null values

# Normalize the data: MinMaxScaler maps each feature to [0, 1] by default
scaler = MinMaxScaler()  # MinMax scaler
scaled_data = scaler.fit_transform(final_data)


  data = pd.read_csv(url)


In [None]:

# URL to the dataset
url_ottawa = "https://raw.githubusercontent.com/noobstang/NNtraining/master/Weather49Sets/weatherstats_ottawa_daily.csv"

# Step 1: Load the dataset
data = pd.read_csv(url_ottawa)

# Step 2: Filter data based on the specified date range and months
# (May-November of 2013-2023), then order the rows chronologically.
data['date'] = pd.to_datetime(data['date'])
filtered_data = data[(data['date'].dt.month >= 5) & (data['date'].dt.month <= 11)]
filtered_data = filtered_data[(filtered_data['date'].dt.year >= 2013) & (filtered_data['date'].dt.year <= 2023)]
# Downstream windowing treats row order as time order, so sort explicitly.
# NOTE(review): the CSV's native ordering is not visible from here; this is a
# no-op if it is already oldest-first.
filtered_data = filtered_data.sort_values('date')

# Step 3: Select the relevant columns (the 4 model features; 'date' is dropped)
selected_columns = ['avg_hourly_temperature', 'precipitation', 'solar_radiation', 'avg_hourly_pressure_station']
final_data = filtered_data[selected_columns]

# Step 4: Handle missing values
final_data = final_data.dropna()  # Option 2: drop data with null values

# Step 5: Normalize the dataset (MinMaxScaler maps each feature to [0, 1] by default)
scaler = MinMaxScaler()
scaled_data = scaler.fit_transform(final_data)

# Convert scaled_data to a DataFrame (optional, if you need to inspect or use it further in DataFrame format)
#scaled_data_df = pd.DataFrame(scaled_data, columns=selected_columns)

  data = pd.read_csv(url_ottawa)


Training

In [None]:
# Data Loader for Training
def create_sequences(data, sequence_length):
    """
    Build sliding-window (input, target) pairs from a time-ordered series.

    Window ``i`` covers ``data[i : i + sequence_length]`` and its target is the
    element that immediately follows it, ``data[i + sequence_length]``.

    :param data: Array-like or tensor indexed by time step along its first
        axis (e.g. shape ``(steps, features)``).
    :param sequence_length: Number of consecutive time steps per input window.
    :return: Tuple ``(xs, ys)`` of float32 tensors with shapes
        ``(n, sequence_length, *feature_dims)`` and ``(n, *feature_dims)``,
        where ``n = len(data) - sequence_length``. Both are empty when the
        series is too short to form a single window.
    :raises ValueError: If ``sequence_length`` is negative.
    """
    if sequence_length < 0:
        raise ValueError("sequence_length must be non-negative")

    # Normalise the input to one float32 tensor up front. The legacy
    # torch.FloatTensor(list_of_tensors) constructor raises
    # "only one element tensors can be converted to Python scalars".
    if isinstance(data, torch.Tensor):
        series = data.detach().to(dtype=torch.float32)
    else:
        series = torch.as_tensor(np.asarray(data), dtype=torch.float32)

    n_windows = len(series) - sequence_length
    if n_windows <= 0:
        feature_dims = tuple(series.shape[1:])
        return (
            series.new_empty((0, sequence_length) + feature_dims),
            series.new_empty((0,) + feature_dims),
        )

    xs = torch.stack([series[i:i + sequence_length] for i in range(n_windows)])
    # Targets are simply the series shifted by one window; clone so the result
    # never aliases the caller's tensor.
    ys = series[sequence_length:].clone()
    return xs, ys

In [None]:
# Convert scaled_data to a float32 tensor of shape (num_days, num_features)
scaled_data_tensor = torch.FloatTensor(scaled_data).reshape(-1, len(selected_columns))

# Non-overlapping `window_size`-day chunks, shape (num_chunks, window_size, num_features).
# Kept for inspection only: they must NOT be passed to create_sequences, which
# needs the (num_days, num_features) series to slide a window over.
# Trailing days that do not fill a whole chunk are trimmed.
batch_size = scaled_data_tensor.shape[0] // window_size  # number of whole chunks (not the DataLoader batch size)
reshaped_data = scaled_data_tensor[:batch_size * window_size].reshape(batch_size, window_size, len(selected_columns))

# Flattened model input width: window size 14 * 4 features = 56
sequence_length = window_size * len(selected_columns)

# Slide a `window_size`-day window over the daily series; each target is the
# day right after its window. X: (n, window_size, num_features), y: (n, num_features).
X, y = create_sequences(scaled_data, window_size)

# The TDNN's first layer is Linear(sequence_length, ...), so each window is
# flattened to a single vector of length window_size * num_features.
X = X.reshape(X.shape[0], -1)

# Split the data (adjust based on the actual train/test split by date)
# NOTE(review): this is a random split, so windows from every year end up in
# both sets; use a chronological split if the test period must be unseen.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

train_data = TensorDataset(X_train, y_train)
train_loader = DataLoader(dataset=train_data, batch_size=64, shuffle=True)

# Define the model
class TDNN(nn.Module):
    """
    Two-layer feed-forward network: Linear -> ReLU -> Linear.

    :param input_size: Width of the last dimension of the input.
    :param hidden_size: Number of hidden units.
    :param output_size: Width of the last dimension of the output.
    """

    def __init__(self, input_size, hidden_size, output_size):
        super().__init__()
        # Attribute names and creation order are part of the contract: they
        # determine the state_dict keys and the order of random initialisation.
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        """Map ``x`` of shape ``(..., input_size)`` to ``(..., output_size)``."""
        hidden = self.fc1(x)
        activated = self.relu(hidden)
        return self.fc2(activated)

# Model parameters
hidden_size = 128
# Input width is the flattened window (`sequence_length`); one output per feature.
model = TDNN(sequence_length, hidden_size, len(selected_columns))
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Training the model
epochs = 20
for epoch in range(epochs):
    # Re-enter training mode every epoch: forecasting code may have left the
    # model in eval mode if this cell is re-run.
    model.train()
    running_loss = 0.0
    n_samples = 0
    for seq, labels in train_loader:
        optimizer.zero_grad()
        y_pred = model(seq)
        loss = criterion(y_pred, labels)
        loss.backward()
        optimizer.step()
        # Weight each batch by its size so a short final batch does not skew the mean.
        running_loss += loss.item() * seq.size(0)
        n_samples += seq.size(0)
    # Report the mean loss over the whole epoch rather than the last
    # mini-batch's loss, which is noisy and undefined for an empty loader.
    epoch_loss = running_loss / max(n_samples, 1)
    print(f'Epoch {epoch+1}, Loss: {epoch_loss}')


ValueError: only one element tensors can be converted to Python scalars

Testing

In [None]:
  # Build the seed window for forecasting from the last 14 observed days of 2022.
# `final_data` holds only the feature columns (no 'date'), so the year filter is
# applied to `filtered_data`, which still carries the 'date' column.
data_2022 = filtered_data[filtered_data['date'].dt.year == 2022]

# Drop rows with missing features (as done for the training data) and order
# chronologically so that `.tail()` really returns the most recent days.
data_2022 = data_2022.dropna(subset=selected_columns).sort_values('date')

# Select the last 14 days of 2022
# (given the May-November filter on `filtered_data`, these are the last
# in-season days of that year)
last_14_days = data_2022.tail(window_size)

# Scale the data with the already-fitted scaler (transform only, no re-fit)
last_14_days_scaled = scaler.transform(last_14_days[selected_columns])

# Convert to a float tensor and flatten to one row of shape
# (1, window_size * number_of_features), the layout the model was trained on
last_window_2022 = torch.tensor(last_14_days_scaled, dtype=torch.float).view(1, -1)

In [None]:
def rolling_window_predictions(model, initial_window, n_predictions, scaler):
    """
    Generate multi-step forecasts by feeding each prediction back into the window.

    At every step the model predicts one time step from the current window; the
    oldest time step is then dropped from the window and the new prediction is
    appended, so later predictions are conditioned on earlier ones.

    :param model: The trained PyTorch model. It must map a flattened window of
        shape (1, window_size * n_features) to one step of shape (1, n_features).
    :param initial_window: Tensor with the last known data window, in the same
        (scaled) units the model was trained on. Any shape; it is flattened to
        one row internally and is not modified.
    :param n_predictions: Number of future time steps (days) to predict.
    :param scaler: Fitted scaler exposing ``inverse_transform``; used only to
        convert the returned predictions back to original units.
    :return: Array of shape (n_predictions, n_features) in original units
        (an empty array when ``n_predictions`` is 0).
    """
    model.eval()
    # Work on a private, flattened copy so the caller's tensor is never mutated.
    window = initial_window.detach().clone().reshape(1, -1)
    predictions = []

    with torch.no_grad():
        for _ in range(n_predictions):
            step = model(window).reshape(1, -1)

            # Only the reported value is inverse-transformed ...
            predictions.append(scaler.inverse_transform(step.cpu().numpy()).flatten())

            # ... the window itself must stay in scaled units, because that is
            # what the model expects as input. Drop the oldest time step
            # (the first n_features values) and append the new prediction.
            n_features = step.shape[1]
            window = torch.cat((window[:, n_features:], step.to(window.dtype)), dim=1)

    return np.array(predictions)

# The seed window `last_window_2022` (the final 14 days of the 2022 data, as a
# tensor) is expected to exist already.
# For a quick demonstration one could use e.g. X_test[-1] instead, but a real
# forecast needs the actual last 14 days of 2022.


# Forecasting
n_predictions = 365  # Number of days to predict for 2023
predictions_2023 = rolling_window_predictions(
    model=model,
    initial_window=last_window_2022,
    n_predictions=n_predictions,
    scaler=scaler,
)

print(predictions_2023)

Model prediction visualization

In [None]:
import matplotlib.pyplot as plt

# Compare the rolling forecast (`predictions_2023`, original units) with the
# observed 2023 values, one figure per feature.

# Display names, listed in the SAME order as `selected_columns` so that column i
# of the arrays below is plotted under the right title.
feature_names = ['Avg Hourly Temperature', 'Precipitation', 'Solar Radiation', 'Avg Hourly Pressure Station']

# Observed 2023 rows are taken from `filtered_data`, which still has the 'date'
# column (`test_data` / `final_data` only hold the feature columns).
actual_2023 = filtered_data[filtered_data['date'].dt.year == 2023]
actual_2023 = actual_2023.dropna(subset=selected_columns).sort_values('date')

# The forecast horizon and the number of observed days can differ; plot only
# the overlapping part.
# NOTE(review): predictions are paired with observations by position, i.e. the
# k-th forecast step is compared with the k-th observed in-season day of 2023.
n_points = min(len(actual_2023), len(predictions_2023))
test_dates = actual_2023['date'].reset_index(drop=True)[:n_points]
y_test_actual = actual_2023[selected_columns].to_numpy()[:n_points]
predictions = predictions_2023[:n_points]

for i, feature_name in enumerate(feature_names):
    plt.figure(figsize=(10, 6))
    plt.plot(test_dates, y_test_actual[:, i], label='Actual', marker='.', zorder=-1)
    plt.plot(test_dates, predictions[:, i], label='Predicted', marker='.', zorder=1)
    plt.title(f'{feature_name} Prediction vs Actual')
    plt.xlabel('Date')
    plt.ylabel(feature_name)
    plt.legend()
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.show()


Divide

In [None]:
# Link to weather dataset
url_ottawa = "https://raw.githubusercontent.com/noobstang/NNtraining/master/Weather49Sets/weatherstats_ottawa_daily.csv"

# Load and preprocess data
url = url_ottawa
data = pd.read_csv(url)

# Date filtering: keep May-November of the years 2013-2023
data['date'] = pd.to_datetime(data['date'])
filtered_data = data[(data['date'].dt.month >= 5) & (data['date'].dt.month <= 11)]  # restrict data range from May to November
filtered_data = filtered_data[(filtered_data['date'].dt.year >= 2013) & (filtered_data['date'].dt.year <= 2023)]  # overall data range from years 2013-2023

# The sliding windows built later treat row order as time order, so sort
# explicitly instead of relying on the CSV's row order.
# NOTE(review): the file's native ordering is not visible from here; sorting is
# a no-op if it is already oldest-first.
filtered_data = filtered_data.sort_values('date')

selected_columns = ['avg_hourly_temperature', 'precipitation', 'solar_radiation', 'avg_hourly_pressure_station']  # set 4 features for input and output
final_data = filtered_data[selected_columns]

# Test data: 2023 rows only. The mask is computed on `filtered_data` (which has
# 'date') and applied to `final_data`; both share the same index at this point.
test_data = final_data[(filtered_data['date'].dt.year == 2023)]  # data from 2023 only

# Window size (days of history per sample) and prediction horizon (days)
window_size = 14
pred_size = 1

# Handle missing values
#final_data = final_data.ffill()  # option 1: forward fill
final_data = final_data.dropna()  # option 2: drop data with null values

# Normalize the data: MinMaxScaler maps each feature to [0, 1] by default
scaler = MinMaxScaler()  # MinMax scaler
scaled_data = scaler.fit_transform(final_data)



In [None]:
def create_inout_sequences(input_data, tw):
    """
    Pair every window of ``tw`` consecutive items with the item that follows it.

    :param input_data: Sliceable sequence (list, NumPy array, ...) ordered in time.
    :param tw: Window length, i.e. number of items per input sequence.
    :return: List of ``(window, label)`` tuples. ``window`` is
        ``input_data[i:i+tw]`` and ``label`` is the length-1 slice
        ``input_data[i+tw:i+tw+1]`` (a slice, so it keeps a leading axis of 1).
        The list is empty when ``input_data`` has no more than ``tw`` items.
    """
    n_pairs = len(input_data) - tw
    return [
        (input_data[start:start + tw], input_data[start + tw:start + tw + 1])
        for start in range(n_pairs)
    ]

# `scaled_data` is the normalised dataset produced by the preprocessing cell.
# The model emits one value per feature and reads one flattened window
# (window size * number of features) as input.
output_size = len(selected_columns)  # Predicting the same number of features
input_size = len(selected_columns) * window_size  # Number of features * window size
sequences = create_inout_sequences(input_data=scaled_data, tw=window_size)


In [None]:
# Splitting data: random 80/20 split of the (window, label) pairs.
# NOTE(review): the stated plan is to train on the first 10 years, but a random
# split mixes windows from every year into both sets; switch to a chronological
# split (e.g. shuffle=False) if the validation period must be unseen.
train_sequences, val_sequences = train_test_split(sequences, test_size=0.2, random_state=42)

# Converting sequences to DataLoader for batch processing
batch_size = 64
train_loader = DataLoader(train_sequences, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_sequences, batch_size=batch_size, shuffle=False)

# Model instantiation
hidden_size = 128  # Example hidden size
model = TDNN(input_size, hidden_size, output_size)

# Loss and optimizer
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Training loop
epochs = 10  # Example epoch count
for epoch in range(epochs):
    model.train()
    running_loss = 0.0
    n_samples = 0
    for seqs, labels in train_loader:
        # Each sample is a (window_size, features) window with a (1, features)
        # label, so batches arrive as (B, window_size, features) / (B, 1, features)
        # and inherit the NumPy dtype of `scaled_data`. Flatten both to 2-D and
        # cast to float32 to match Linear(input_size, ...) and the model output.
        seqs = seqs.float().reshape(seqs.size(0), -1)
        labels = labels.float().reshape(labels.size(0), -1)

        optimizer.zero_grad()
        y_pred = model(seqs)
        loss = criterion(y_pred, labels)
        loss.backward()
        optimizer.step()

        # Weight each batch by its size so a short final batch does not skew the mean.
        running_loss += loss.item() * seqs.size(0)
        n_samples += seqs.size(0)
    # Report the mean loss over the epoch rather than the last mini-batch only.
    epoch_loss = running_loss / max(n_samples, 1)
    print(f'Epoch {epoch+1}, Loss: {epoch_loss}')

# Validation step can be added here for monitoring overfitting
