In [1]:
pip install numpy pandas scikit-learn tensorflow matplotlib

Defaulting to user installation because normal site-packages is not writeable
Collecting matplotlib
  Downloading matplotlib-3.9.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (8.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.3/8.3 MB[0m [31m27.1 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Collecting scipy>=1.6.0
  Downloading scipy-1.14.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (41.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m41.2/41.2 MB[0m [31m28.1 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Collecting opt-einsum>=2.3.2
  Using cached opt_einsum-3.3.0-py3-none-any.whl (65 kB)
Collecting kiwisolver>=1.3.1
  Downloading kiwisolver-1.4.7-cp310-cp310-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (1.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.6/1.6 MB[0m [31m60.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting fonttools>=4.22.0
  Downloading fonttools-4.54.1-cp310-

In [2]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
import matplotlib.pyplot as plt

In [3]:
# Load the weather data
data = pd.read_csv('combined_weather_data.csv')

# Convert the Year, Month, Day columns into a single datetime column
data['date'] = pd.to_datetime(data[['Year', 'Month', 'Day']])
data.set_index('date', inplace=True)
data.drop(columns=['Year', 'Month', 'Day'], inplace=True)

# Select features for prediction
features = ['Min Temp', 'Max Temp', 'Humidity', 'Pressure', 'Precipitation']
data = data[features]

# Scale the data between 0 and 1
scaler = MinMaxScaler()
scaled_data = scaler.fit_transform(data)
scaled_df = pd.DataFrame(scaled_data, columns=features, index=data.index)

# Split the data into training and testing sets (80% training, 20% testing)
train_size = int(len(scaled_df) * 0.8)
train_data = scaled_df[:train_size]
test_data = scaled_df[train_size:]


In [4]:
train_data , test_data

(            Min Temp  Max Temp  Humidity  Pressure  Precipitation
 date                                                             
 2000-01-01  0.382166  0.300699  0.483333  0.850932       0.000000
 2000-01-02  0.414013  0.335664  0.450000  0.850932       0.000000
 2000-01-03  0.382166  0.307692  0.500000  0.838509       0.000000
 2000-01-04  0.350318  0.300699  0.500000  0.844720       0.000000
 2000-01-05  0.350318  0.300699  0.500000  0.850932       0.000000
 ...              ...       ...       ...       ...            ...
 2019-10-05  0.866242  0.601399  0.783333  0.813665       0.011359
 2019-10-06  0.866242  0.580420  0.783333  0.819876       0.027586
 2019-10-07  0.815287  0.524476  0.800000  0.826087       0.066126
 2019-10-08  0.796178  0.538462  0.700000  0.826087       0.064097
 2019-10-09  0.783439  0.433566  0.916667  0.826087       0.117647
 
 [6985 rows x 5 columns],
             Min Temp  Max Temp  Humidity  Pressure  Precipitation
 date                             

In [5]:
# Create sequences
def create_sequences(data, sequence_length, prediction_length):
    sequences = []
    labels = []
    for i in range(len(data) - sequence_length - prediction_length + 1):
        seq = data.iloc[i:i + sequence_length].values
        label = data.iloc[i + sequence_length: i + sequence_length + prediction_length].values
        sequences.append(seq)
        labels.append(label)
    return np.array(sequences), np.array(labels)

# Define sequence and prediction lengths
sequence_length = 90  # Input sequence (e.g., past 90 days)
prediction_length = 90  # Predict the next 90 days

# Create sequences for training and testing
X_train, y_train = create_sequences(train_data, sequence_length, prediction_length)
X_test, y_test = create_sequences(test_data, sequence_length, prediction_length)

# Ensure the data type is float32
X_train = X_train.astype(np.float32)
y_train = y_train.astype(np.float32)
X_test = X_test.astype(np.float32)
y_test = y_test.astype(np.float32)

# Check the shape of the generated sequences and labels
print("X_train shape:", X_train.shape)  # Should be (samples, sequence_length, num_features)
print("y_train shape:", y_train.shape)  # Should be (samples, prediction_length, num_features)


X_train shape: (6806, 90, 5)
y_train shape: (6806, 90, 5)


In [6]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout, Input, Reshape
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint

2024-09-25 16:20:18.973662: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-09-25 16:20:19.056517: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2024-09-25 16:20:19.646512: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2024-09-25 16:20:19.699863: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-09-25 16:20:19.856357: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been 

In [7]:
# Define the LSTM model
model = Sequential()
model.add(Input(shape=(X_train.shape[1], X_train.shape[2])))  # Input layer
model.add(LSTM(100, return_sequences=True))  # LSTM with return_sequences=True
model.add(Dropout(0.2))
model.add(LSTM(50, return_sequences=False))  # Another LSTM layer
model.add(Dropout(0.2))
model.add(Dense(prediction_length * X_train.shape[2]))  # Adjust based on total outputs
model.add(tf.keras.layers.Reshape((prediction_length, X_train.shape[2])))  # Reshape to (90, 5)

# Compile the model
model.compile(optimizer='adam', loss='mean_squared_error')

# Summary of the model
model.summary()

# Set up callbacks for early stopping and model checkpointing
early_stopping = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)
model_checkpoint = ModelCheckpoint('best_model.keras', save_best_only=True)

In [None]:
# Train the model and save the history
history = model.fit(X_train, y_train, epochs=50, batch_size=32, validation_data=(X_test, y_test), callbacks=[early_stopping, model_checkpoint])


Epoch 1/50
[1m213/213[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 62ms/step - loss: 0.1066 - val_loss: 0.0156
Epoch 2/50
[1m213/213[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 62ms/step - loss: 0.0189 - val_loss: 0.0114
Epoch 3/50
[1m213/213[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 62ms/step - loss: 0.0124 - val_loss: 0.0088
Epoch 4/50
[1m213/213[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 63ms/step - loss: 0.0103 - val_loss: 0.0080
Epoch 5/50
[1m213/213[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 63ms/step - loss: 0.0091 - val_loss: 0.0074
Epoch 6/50
[1m213/213[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 62ms/step - loss: 0.0084 - val_loss: 0.0071
Epoch 7/50
[1m213/213[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 63ms/step - loss: 0.0083 - val_loss: 0.0070
Epoch 8/50
[1m213/213[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 62ms/step - loss: 0.0077 - val_loss: 0.0072
Epoch 9/50
[1m213/213[

In [None]:
# Plot the training and validation loss
plt.figure(figsize=(12, 6))  # Optional: Set figure size
plt.plot(history.history['loss'], label='Training Loss')
plt.plot(history.history['val_loss'], label='Validation Loss')
plt.title('Training and Validation Loss')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend()
plt.show()

In [None]:
# Make predictions
predictions = model.predict(X_test)

# Rescale the predicted values back to their original scale
predicted_data = scaler.inverse_transform(predictions)

# Rescale the test data back to its original scale for comparison
true_data = scaler.inverse_transform(y_test)

# Plot the actual vs predicted values for one feature (e.g., Min Temp)
plt.plot(true_data[:, 0], label='True Min Temp')
plt.plot(predicted_data[:, 0], label='Predicted Min Temp')
plt.legend()
plt.show()

In [None]:
from sklearn.metrics import mean_absolute_error, mean_squared_error

# Calculate the errors
mae = mean_absolute_error(true_data[:, 0], predicted_data[:, 0])
rmse = np.sqrt(mean_squared_error(true_data[:, 0], predicted_data[:, 0]))

print(f'Mean Absolute Error (MAE): {mae}')
print(f'Root Mean Squared Error (RMSE): {rmse}')


In [None]:
# Use the last sequence from the test data to predict future weather
last_sequence = X_test[-1].reshape(1, sequence_length, X_train.shape[2])
future_prediction = model.predict(last_sequence)

# Inverse transform the prediction back to the original scale
future_weather = scaler.inverse_transform(future_prediction)
print(f"Predicted future weather: {future_weather}")
