# Team 0: Weather Forecasting Model with RNN

## Members
*   Matthew Grech
*   Andrew Radke
*   Liza Abraham
*   Mitchell Palermo

#### Created: Jun 15, 2023
This model takes in weather data and returns a prediction of the amount of rainfall in a day, in mm.



# Step 1. Import Libraries and Mount Google Drive to Load Data

In [None]:
import numpy as np
import pandas as pd
import torch
import statsmodels.api as sm
import torch.nn as nn
import torch.optim as optim
from statsmodels.tsa.statespace.varmax import VARMAX
from torch.utils.data import Dataset, DataLoader
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader
import matplotlib.pyplot as plt


In [None]:
# Mount Google Drive inside the Colab runtime at /content/drive so the dataset
# stored under /content/drive/MyDrive can be read by later cells.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# Step 2. Define the Neural Network as an LSTM RNN

In [None]:
# Define the LSTM model
class LSTMModel(nn.Module):
    """Stacked LSTM followed by a linear layer applied to the final hidden state.

    Args:
        input_size: Number of features per time step.
        hidden_size: Width of each LSTM layer's hidden state.
        output_size: Number of values produced per sequence.
        num_layers: Number of stacked LSTM layers. Defaults to 10, the depth
            that was previously hard-coded.
    """

    def __init__(self, input_size, hidden_size, output_size, num_layers=10):
        super().__init__()
        self.hidden_size = hidden_size
        # batch_first=True: batched inputs are laid out as (batch, seq_len, input_size)
        self.lstm = nn.LSTM(input_size, hidden_size, batch_first=True, num_layers=num_layers)
        self.dense = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        """Return one prediction per sequence in ``x``.

        Args:
            x: Tensor of shape (batch, seq_len, input_size).

        Returns:
            Tensor of shape (batch, output_size).
        """
        _, (h_n, _) = self.lstm(x)

        # h_n stacks the final hidden state of every layer; index -1 is the top layer
        out = self.dense(h_n[-1])
        return out

# TEMP: Daily Data for Testing

This section should replace Step 3.

In [None]:
# Install a headless Java 8 JDK (quiet mode, output discarded)
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
# Download the Spark 3.1.1 (Hadoop 3.2 build) archive and unpack it in the working directory
!wget -q http://archive.apache.org/dist/spark/spark-3.1.1/spark-3.1.1-bin-hadoop3.2.tgz
!tar xf spark-3.1.1-bin-hadoop3.2.tgz
# Install the findspark helper package
!pip install -q findspark

In [None]:
import os

# Tell later Spark tooling where the JDK and the unpacked Spark distribution live
os.environ.update(
    {
        "JAVA_HOME": "/usr/lib/jvm/java-8-openjdk-amd64",
        "SPARK_HOME": "/content/spark-3.1.1-bin-hadoop3.2",
    }
)

In [None]:
import findspark
# Run findspark's initialisation before importing pyspark
# (presumably it locates the install via SPARK_HOME — TODO confirm)
findspark.init()
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, DoubleType, DateType
# `col` is used by the casting cell; `when` only appears in commented-out code in this notebook
from pyspark.sql.functions import col, when

In [None]:
# Reuse the active Spark session, or start a new one if none exists
spark = SparkSession.builder.getOrCreate()

# Read the CSV data using its header row for column names.
# No schema is supplied here, so column types are whatever the reader assigns.
df = (
    spark.read
    .format("csv")
    .option("header", True)
    .load("/content/drive/MyDrive/Dataset2")  # Matt's path /content/drive/MyDrive/Dataset2
)

# Print the schema of the loaded data frame
df.printSchema()

root
 |-- Longitude (x): string (nullable = true)
 |-- Latitude (y): string (nullable = true)
 |-- Station Name: string (nullable = true)
 |-- Climate ID: string (nullable = true)
 |-- Date/Time (LST): string (nullable = true)
 |-- Year: string (nullable = true)
 |-- Month: string (nullable = true)
 |-- Day: string (nullable = true)
 |-- Time (LST): string (nullable = true)
 |-- Temp (°C): string (nullable = true)
 |-- Temp Flag: string (nullable = true)
 |-- Dew Point Temp (°C): string (nullable = true)
 |-- Dew Point Temp Flag: string (nullable = true)
 |-- Rel Hum (%): string (nullable = true)
 |-- Rel Hum Flag: string (nullable = true)
 |-- Precip. Amount (mm): string (nullable = true)
 |-- Precip. Amount Flag: string (nullable = true)
 |-- Wind Dir (10s deg): string (nullable = true)
 |-- Wind Dir Flag: string (nullable = true)
 |-- Wind Spd (km/h): string (nullable = true)
 |-- Wind Spd Flag: string (nullable = true)
 |-- Visibility (km): string (nullable = true)
 |-- Visibility 

In [None]:
# Columns kept for the rest of the pipeline: timestamp parts first, then measurements
columns = [
    "Year",
    "Month",
    "Day",
    "Time (LST)",
    "Dew Point Temp (°C)",
    "Rel Hum (%)",
    "Wind Spd (km/h)",
    "Visibility (km)",
    "Stn Press (kPa)",
    "Temp (°C)",
]
df = df.select(*columns)

In [None]:
# Original CSV header -> short snake_case column name
column_mapping = {
    "Year": "year",
    "Month": "month",
    "Day": "day",
    "Time (LST)": "time",
    "Dew Point Temp (°C)": "dew_point_temp",
    "Rel Hum (%)": "rem_hum",
    "Wind Spd (km/h)": "wind_speed",
    "Visibility (km)": "visibility",
    "Stn Press (kPa)": "press",
    "Temp (°C)": "temp",
}

# Apply the renames one column at a time
for old_col, new_col in column_mapping.items():
    df = df.withColumnRenamed(old_col, new_col)

In [None]:
# (column name, Spark type name) pairs describing the type each renamed column is cast to.
# NOTE(review): 'rem_hum' is the renamed "Rel Hum (%)" column.
columns_to_cast = [
    ('year', 'int'),
    ('month', 'int'),
    ('day', 'int'),
    ('time', 'string'),
    ('dew_point_temp', 'double'),
    ('rem_hum', 'int'),
    ('wind_speed', 'int'),
    ('visibility', 'double'),
    ('press', 'double'),
    ('temp', 'double')
]

# Cast the columns to the specified data types, replacing each column in place
for col_name, data_type in columns_to_cast:
    df = df.withColumn(col_name, col(col_name).cast(data_type))

In [None]:
# Order rows by their timestamp parts, then discard those parts so only measurements remain
df = df.orderBy(["year", "month", "day", "time"]).drop("year", "month", "day", "time")

In [None]:
################################## IGNORE ########################################
################################# for daily ####################################
# columns = ["Longitude (x)", "Latitude (y)", "Station Name", "Climate ID", "Date/Time", "Year", "Month", "Day", "Max Temp (°C)",
#          "Min Temp (°C)", "Mean Temp (°C)", "Total Rain (mm)", "Total Snow (cm)", "Total Precip (mm)", "Spd of Max Gust (km/h)"]
# df = df.select(*columns)
# column_mapping={"Longitude (x)": "long", "Latitude (y)": "lat", "Station Name": "location_name", "Climate ID": "location_id",
#              "Date/Time": "date", "Year": "year", "Month": "month", "Day": "day", "Max Temp (°C)": "max_temp",
#          "Min Temp (°C)": "min_temp", "Mean Temp (°C)": "avg_temp", "Total Rain (mm)": "total_rain", "Total Snow (cm)": "total_snow",
#              "Total Precip (mm)": "total_percip", "Spd of Max Gust (km/h)": "wind_speed"}
# for old_col, new_col in column_mapping.items():
#     df = df.withColumnRenamed(old_col, new_col)

# columns_to_cast = [
#     ('long', 'double'),
#     ('lat', 'double'),
#     ('location_name', 'string'),
#     ('location_id', 'int'),
#     ('date', 'string'),
#     ('year', 'int'),
#     ('month', 'int'),
#     ('day', 'int'),
#     ('max_temp', 'double'),
#     ('min_temp', 'double'),
#     ('avg_temp', 'double'),
#     ('total_rain', 'double'),
#     ('total_snow', 'double'),
#     ('total_percip', 'double')
# ]

# # Cast the columns to the specified data types
# for col_name, data_type in columns_to_cast:
#     df = df.withColumn(col_name, col(col_name).cast(data_type))

# df = df.withColumn("is_less_than_31", when(col("wind_speed") == "<31", True).otherwise(False))
# df = df.withColumn("wind_speed", when(col("wind_speed") == "<31", None).otherwise(col("wind_speed")))
# df = df.withColumn("wind_speed", col("wind_speed").cast("int"))

In [None]:
# Collect the Spark DataFrame into a pandas DataFrame; later cells use pandas APIs on `df`
df = df.toPandas()

In [None]:
# Turn the frame's values into a float32 tensor and add a leading axis of size 1
dfTensor = torch.tensor(df.values, dtype=torch.float32).unsqueeze(0)

# # Reshape the tensor to include the sequence length dimension
# seq_len = 3
# batch_size = 8768
# dfTensor = dfTensor.view(batch_size, seq_len, dfTensor.shape[1])

# Wrap the tensor in a dataset and serve it through a loader with batch size 64
trainingDataset = TensorDataset(dfTensor)
train_loader = DataLoader(trainingDataset, batch_size=64)

(tensor(-4.1000),)


In [None]:
# Capture the per-column dtypes (only referenced by the commented-out print below)
result = df.dtypes
# Show the pandas frame and the tensor built from it for a visual sanity check
print(df)
print(dfTensor)
# print("Output:")
# print(result)
# print("TrainLoader")

# print("Input batch:")
# print(inputs_batch)

# print("Target batch:")
# print(targets_batch)
# print("Something else batch:")
# print(something_else)

       dew_point_temp  rem_hum  wind_speed  visibility  press  temp
0                -4.1     77.0        14.0        11.3  99.32  -0.6
1                -4.8     72.0        17.0        19.3  99.29  -0.4
2                -4.6     74.0        14.0        19.3  99.23  -0.5
3                -4.8     73.0        20.0        19.3  99.22  -0.5
4                -4.2     77.0        21.0        19.3  99.19  -0.7
...               ...      ...         ...         ...    ...   ...
26299             1.2     96.0        16.0         6.4  98.28   1.8
26300             1.7     97.0        18.0         3.6  97.95   2.2
26301             2.6     97.0        18.0         3.2  97.79   3.0
26302             2.9     98.0        24.0         2.8  97.34   3.2
26303             3.2     98.0        21.0         4.8  97.13   3.5

[26304 rows x 6 columns]
tensor([[-4.1000, 77.0000, 14.0000, 11.3000, 99.3200, -0.6000],
        [-4.8000, 72.0000, 17.0000, 19.3000, 99.2900, -0.4000],
        [-4.6000, 74.0000, 14.

# Step 3. Load and Process Data (DON'T RUN for now)


In [None]:
# Split the Data
# train_data = df.sample(frac=0.6, random_state=42)
# valid_data = df.drop(train_data.index).sample(frac=0.5, random_state=42)
# test_data = df.drop(train_data.index).drop(valid_data.index)

# Define a function for preprocessing the data and creating input sequences
def create_input_sequences(data, sequence_length):
    """Slice ``data`` into sliding input windows and their next-row targets.

    Args:
        data: 2-D array-like supporting ``data[a:b, :-1]`` indexing; every
            column except the last is treated as an input feature and the
            last column as the target.
        sequence_length: Number of consecutive rows in each input window.

    Returns:
        ``(sequences, targets)`` as two lists of equal length. ``sequences[k]``
        holds rows ``k .. k + sequence_length - 1`` without the last column;
        ``targets[k]`` is the last-column value of row ``k + sequence_length``.
        Both lists are empty when ``data`` has no more than
        ``sequence_length`` rows.
    """
    window_starts = range(len(data) - sequence_length)
    sequences = [data[start:start + sequence_length, :-1] for start in window_starts]
    targets = [data[start + sequence_length, -1] for start in window_starts]
    return sequences, targets

# Set random seed for reproducibility
torch.manual_seed(42)

sequence_length = 5  # Number of previous time steps to consider
batch_size = 32


def _to_float_tensor(items):
    """Convert a list of NumPy arrays/scalars into a single float32 tensor.

    The list is first collapsed into one float32 ndarray: calling
    torch.tensor directly on a list of ndarrays is slow, and it fails outright
    when the arrays are object-dtype. The recorded run of this cell stopped
    with a TypeError at this conversion step; object-dtype values are the
    likely cause — TODO confirm.
    """
    return torch.tensor(np.asarray(items, dtype=np.float32))


# Create input sequences and targets for training, validation and testing data.
# NOTE(review): train_data, valid_data and test_data are only created by the
# split that is commented out at the top of this cell; it must be restored (or
# replaced) before this cell can run.
train_sequences, train_targets = create_input_sequences(train_data.values, sequence_length)
valid_sequences, valid_targets = create_input_sequences(valid_data.values, sequence_length)
test_sequences, test_targets = create_input_sequences(test_data.values, sequence_length)

# Convert the lists to PyTorch tensors
train_sequences = _to_float_tensor(train_sequences)
train_targets = _to_float_tensor(train_targets)

valid_sequences = _to_float_tensor(valid_sequences)
valid_targets = _to_float_tensor(valid_targets)

test_sequences = _to_float_tensor(test_sequences)
test_targets = _to_float_tensor(test_targets)

# Create DataLoaders for training, validation and testing data.
# Only the training loader shuffles; evaluation order stays fixed.
train_dataset = TensorDataset(train_sequences, train_targets)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

valid_dataset = TensorDataset(valid_sequences, valid_targets)
valid_loader = DataLoader(valid_dataset, batch_size=batch_size, shuffle=False)

test_dataset = TensorDataset(test_sequences, test_targets)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)


  train_sequences = torch.tensor(train_sequences, dtype=torch.float32)


TypeError: ignored

# Baseline Model:

In [None]:
import numpy as np
import pandas as pd
from statsmodels.tsa.statespace.varmax import VARMAX
from sklearn.metrics import mean_squared_error

# NOTE(review): this slice (rows 1-99, all columns but the last) is reassigned
# further down in this cell before VARMAX_model is called, so it has no effect.
train = df.iloc[1:100, :-1]

def VARMAX_model(train, test, order=(1, 1)):
    """Walk-forward, one-step-ahead VARMAX baseline.

    The model is fitted on ``train``; for each row of ``test`` it forecasts
    one step ahead, records the forecast for the last column, and is then
    refitted on ``train`` plus the test rows seen so far.

    Args:
        train: pandas DataFrame of observations used for the initial fit.
        test: pandas DataFrame with the same columns as ``train``; its last
            column supplies the actual values the forecasts are compared to.
        order: ``order`` argument forwarded to ``VARMAX``. Defaults to
            ``(1, 1)``, the value that was previously hard-coded.

    Returns:
        DataFrame with columns ``'Pred'`` (forecast) and ``'Act'`` (actual),
        one row per row of ``test``. Also prints the MSE and shows a plot,
        unless ``test`` is empty.
    """
    def _fit(history):
        # Fit a fresh VARMAX on the given history with optimizer output silenced
        return VARMAX(history, order=order).fit(disp=False)

    # Fit on all of ``train`` so the initial fit is consistent with the refits
    # below (an extra hard-coded iloc[1:1000] slice used to be applied here).
    model_fit = _fit(train)

    predictions = []
    n_test = len(test)

    for step in range(n_test):
        # The model is built from endogenous data only (no exog is passed to
        # VARMAX), so no exog is supplied to the forecast either.
        yhat = model_fit.forecast(steps=1)

        # Keep the forecast for the last column, the one compared against 'Act'
        predictions.append(yhat.iloc[0, -1])

        # Refit with the observed test row added; skipped after the final
        # forecast because that fit would never be used.
        if step + 1 < n_test:
            model_fit = _fit(pd.concat([train, test.iloc[:step + 1]]))

    # Create a DataFrame with the predicted values and the corresponding actual values
    res = pd.DataFrame({'Pred': predictions, 'Act': test.iloc[:, -1].values})

    # Nothing to score or plot when there are no test rows
    if res.empty:
        return res

    # Calculate MSE
    mse = mean_squared_error(res['Act'], res['Pred'])
    print("Mean Squared Error (MSE):", mse)

    # Plot the actual and predicted data points
    plt.plot(res['Act'], label='Actual')
    plt.plot(res['Pred'], label='Predicted')
    plt.xlabel('Test Number')
    plt.ylabel('Temperature')
    plt.title('VARMAX Model - Actual vs Predicted')
    plt.legend()
    plt.show()

    return res

# Example usage: fit on the first 1000 rows, then forecast the 8 rows that
# immediately follow them. The test window starts directly after the training
# window; the earlier df[1:1000] / df[1001:1009] slices left row 1000 out, so
# the first "one-step" forecast was really two steps ahead of the training data.
print(df.shape)
train = df.iloc[:1000]
test = df.iloc[1000:1008]

df_ret = VARMAX_model(train, test)
print(df_ret)



# Step 4. Define Hyperparameters and Model Settings

In [None]:
# Model dimensions: features per row, LSTM hidden width, values predicted per sample
input_size, hidden_size, output_size = 6, 120, 1

# Training schedule
num_epochs = 60
learning_rate = 0.001

# Loss function: mean squared error
criterion = nn.MSELoss()

# Build the network and an Adam optimizer over its parameters
model = LSTMModel(input_size, hidden_size, output_size)
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

#############################################################
# import tensorflow as tf
# from tensorflow import keras
# from tensorflow.keras import layers

# model = keras.sequential()
# model.add(layers.LSTM(120, dropout=.1,recurrent_dropout=.2, activation='relu', return_sequences=True ,input_shape=(input_size,hidden_size)))
# model.add(layers.LSTM(120,activation='relu'))
# model.add(layers.Dense(output_size))
# model.compile(optimizer='adam', loss='mse')
# print(model.summary)
#############################################################


# Step 5. Train Model


In [None]:
# Remove the leading size-1 axis, then pair every row with the following row's last column
dfTensor = dfTensor.squeeze(0)
inputs = dfTensor[:-1]
targets = dfTensor[1:, -1:]  # "temp" of the next row; the trailing slice keeps a column axis

In [None]:
# Show the full tensor followed by the derived inputs and targets, one per line
print(dfTensor, inputs, targets, sep="\n")

tensor([[-4.1000, 77.0000, 14.0000, 11.3000, 99.3200, -0.6000],
        [-4.8000, 72.0000, 17.0000, 19.3000, 99.2900, -0.4000],
        [-4.6000, 74.0000, 14.0000, 19.3000, 99.2300, -0.5000],
        ...,
        [ 2.6000, 97.0000, 18.0000,  3.2000, 97.7900,  3.0000],
        [ 2.9000, 98.0000, 24.0000,  2.8000, 97.3400,  3.2000],
        [ 3.2000, 98.0000, 21.0000,  4.8000, 97.1300,  3.5000]])
tensor([[-4.1000, 77.0000, 14.0000, 11.3000, 99.3200, -0.6000],
        [-4.8000, 72.0000, 17.0000, 19.3000, 99.2900, -0.4000],
        [-4.6000, 74.0000, 14.0000, 19.3000, 99.2300, -0.5000],
        ...,
        [ 1.7000, 97.0000, 18.0000,  3.6000, 97.9500,  2.2000],
        [ 2.6000, 97.0000, 18.0000,  3.2000, 97.7900,  3.0000],
        [ 2.9000, 98.0000, 24.0000,  2.8000, 97.3400,  3.2000]])
tensor([[-0.4000],
        [-0.5000],
        [-0.5000],
        ...,
        [ 3.0000],
        [ 3.2000],
        [ 3.5000]])


In [None]:


# Define the loss function and optimizer
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

# Learning rate adjustment: multiply the learning rate by 0.1 every 10 epochs
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=10, gamma=0.1)

# Set device (CPU or GPU)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Drop input/target pairs that contain NaN or inf. A single non-finite value
# makes that column's mean/std non-finite and, through normalization, poisons
# every row. The recorded run of this cell skipped every batch as invalid;
# missing values in the source data are the likely cause — TODO confirm.
valid_rows = torch.isfinite(inputs).all(dim=1) & torch.isfinite(targets).all(dim=1)
inputs = inputs[valid_rows]
targets = targets[valid_rows]

# Normalize the input data (per-feature z-score); the clamp avoids a division
# by zero for a constant column
mean = inputs.mean(dim=0)
std = inputs.std(dim=0).clamp_min(1e-8)
inputs = (inputs - mean) / std

# Create the training dataset and data loader
trainingDataset = TensorDataset(inputs, targets)
train_loader = torch.utils.data.DataLoader(trainingDataset, batch_size=64, shuffle=True)

# Training loop
losses = []
model.train()
for epoch in range(num_epochs):
    running_loss = 0.0
    batches_used = 0

    for i, (inputs_batch, targets_batch) in enumerate(train_loader):
        # The LSTM is batch_first, so give it (batch, seq_len=1, features).
        # A 2-D batch would be read as one unbatched sequence and yield a
        # single prediction for the whole batch.
        inputs_batch = inputs_batch.unsqueeze(1).to(device)
        targets_batch = targets_batch.to(device)

        # Zero the gradients
        optimizer.zero_grad()

        # Forward pass
        outputs = model(inputs_batch)

        # Compute loss; outputs and targets are both (batch, 1), so MSELoss
        # compares element-wise instead of broadcasting mismatched shapes
        loss = criterion(outputs, targets_batch)

        # Check for NaN or infinite loss values
        if not torch.isfinite(loss):
            print(f"Invalid loss value at Epoch {epoch+1}, Batch {i+1}. Skipping batch...")
            continue

        # Backward pass and optimization
        loss.backward()
        optimizer.step()

        # Accumulate loss
        running_loss += loss.item()
        batches_used += 1

    # Average only over batches that contributed; report NaN when none did
    average_loss = running_loss / batches_used if batches_used else float("nan")
    print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {average_loss:.6f}")

    # Adjust learning rate
    scheduler.step()
    losses.append(average_loss)

print("Training finished!")

# Plot the training loss graph
plt.plot(losses)
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.title('Training Loss')
plt.show()

  return F.mse_loss(input, target, reduction=self.reduction)


Invalid loss value at Epoch 1, Batch 1. Skipping batch...
Invalid loss value at Epoch 1, Batch 2. Skipping batch...
Invalid loss value at Epoch 1, Batch 3. Skipping batch...
Invalid loss value at Epoch 1, Batch 4. Skipping batch...
Invalid loss value at Epoch 1, Batch 5. Skipping batch...
Invalid loss value at Epoch 1, Batch 6. Skipping batch...
Invalid loss value at Epoch 1, Batch 7. Skipping batch...
Invalid loss value at Epoch 1, Batch 8. Skipping batch...
Invalid loss value at Epoch 1, Batch 9. Skipping batch...
Invalid loss value at Epoch 1, Batch 10. Skipping batch...
Invalid loss value at Epoch 1, Batch 11. Skipping batch...
Invalid loss value at Epoch 1, Batch 12. Skipping batch...
Invalid loss value at Epoch 1, Batch 13. Skipping batch...
Invalid loss value at Epoch 1, Batch 14. Skipping batch...
Invalid loss value at Epoch 1, Batch 15. Skipping batch...
Invalid loss value at Epoch 1, Batch 16. Skipping batch...
Invalid loss value at Epoch 1, Batch 17. Skipping batch...
Invali

  return F.mse_loss(input, target, reduction=self.reduction)


Invalid loss value at Epoch 2, Batch 4. Skipping batch...
Invalid loss value at Epoch 2, Batch 5. Skipping batch...
Invalid loss value at Epoch 2, Batch 6. Skipping batch...
Invalid loss value at Epoch 2, Batch 7. Skipping batch...
Invalid loss value at Epoch 2, Batch 8. Skipping batch...
Invalid loss value at Epoch 2, Batch 9. Skipping batch...
Invalid loss value at Epoch 2, Batch 10. Skipping batch...
Invalid loss value at Epoch 2, Batch 11. Skipping batch...
Invalid loss value at Epoch 2, Batch 12. Skipping batch...
Invalid loss value at Epoch 2, Batch 13. Skipping batch...
Invalid loss value at Epoch 2, Batch 14. Skipping batch...
Invalid loss value at Epoch 2, Batch 15. Skipping batch...
Invalid loss value at Epoch 2, Batch 16. Skipping batch...
Invalid loss value at Epoch 2, Batch 17. Skipping batch...
Invalid loss value at Epoch 2, Batch 18. Skipping batch...
Invalid loss value at Epoch 2, Batch 19. Skipping batch...
Invalid loss value at Epoch 2, Batch 20. Skipping batch...
Inv

KeyboardInterrupt: ignored

# Step 6. Evaluate Model

In [None]:
# Evaluate on whichever device the model's parameters already live on
eval_device = next(model.parameters()).device

# Load the best model's parameters
# NOTE(review): no visible cell saves 'best_model.pt'; confirm where the
# checkpoint is written before relying on this load.
model.load_state_dict(torch.load('best_model.pt', map_location=eval_device))

# Evaluation
model.eval()
with torch.no_grad():
    # The model's LSTM is batch_first, so the sequences stay in
    # (batch, sequence_length, features) order — no permute.
    # NOTE(review): test_sequences built by create_input_sequences omit the
    # last data column; confirm their feature count matches the model's
    # input_size and that they are normalized like the training inputs.
    outputs = model(test_sequences.to(eval_device))
    # squeeze(-1) drops only the output axis, so a batch of one stays 1-D
    test_loss = criterion(outputs.squeeze(-1), test_targets.to(eval_device))

print(f'Test Loss: {test_loss.item():.4f}')

# Step 7. Make Predictions

In [None]:
# Make predictions
pred_device = next(model.parameters()).device
predictions = []
expected = []
model.eval()
with torch.no_grad():
    # Loop names are distinct from the global `inputs` / `targets` tensors so
    # running this cell does not overwrite them.
    for batch_inputs, batch_targets in test_loader:
        # Add a length-1 sequence axis only for flat (batch, features) batches;
        # batches that already carry a sequence axis are passed through as-is.
        if batch_inputs.dim() == 2:
            batch_inputs = batch_inputs.unsqueeze(1)
        outputs = model(batch_inputs.to(pred_device))
        # squeeze(-1) keeps the batch axis, so a final batch of one sample
        # still yields a list for extend()
        predictions.extend(outputs.squeeze(-1).tolist())
        # Collect targets in loader order so each one lines up with its prediction
        expected.extend(batch_targets.reshape(-1).tolist())

# Print some example predictions (up to 10). Expected values come from the
# loader's own targets: the frame built earlier in this notebook has no
# 'rainfall_mm' column, and raw test_data rows would not line up with
# sequence-based samples.
for actual, predicted in zip(expected[:10], predictions[:10]):
    print('Expected:', actual, 'mm', 'Predicted:', predicted, 'mm')