In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.ensemble import StackingRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from statsmodels.tsa.statespace.sarimax import SARIMAX
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout
import matplotlib.pyplot as plt

In [2]:
# Load dataset
df = pd.read_csv("/content/weather_2016_2020_daily.csv", parse_dates=['Date'], index_col='Date')

# Sort and fill missing values if any
df.sort_index(inplace=True)
df.fillna(method='ffill', inplace=True)

  df.fillna(method='ffill', inplace=True)


In [3]:
# Feature Engineering: Lag Features
for lag in range(1, 8):
    df[f'Precipit_lag_{lag}'] = df['Precipit'].shift(lag)

# Rolling window features
df['Precipit_3d_avg'] = df['Precipit'].rolling(window=3).mean()
df['Precipit_7d_avg'] = df['Precipit'].rolling(window=7).mean()
df['Precipit_7d_std'] = df['Precipit'].rolling(window=7).std()

# Time-based features
df['day_of_year'] = df.index.dayofyear
df['month'] = df.index.month
df['weekday'] = df.index.weekday


In [4]:
# Drop rows with NaN
df.dropna(inplace=True)

In [12]:
# Define features and target
X, y = df.drop(columns=['Precipit']), df['Precipit']

In [13]:
# Split the data (80% Train, 20% Test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=False, random_state=42)

# Save training columns.
X_train_columns = X_train.columns.tolist()

# Feature Scaling using StandardScaler
scaler_std = StandardScaler()
X_train_scaled = scaler_std.fit_transform(X_train)
X_test_scaled = scaler_std.transform(X_test)

**Model Implementations**

In [14]:
y = df['Precipit']
exog = df[['Precipit_lag_1', 'Precipit_3d_avg', 'day_of_week']]

train_size = int(len(df) * 0.8)
y_train, y_test = y[:train_size], y[train_size:]
exog_train, exog_test = exog[:train_size], exog[train_size:]


sarimax_model_default = SARIMAX(y_train, exog=exog_train, order=(1, 0, 1), seasonal_order=(0, 0, 1, 7))
results_default = sarimax_model_default.fit(disp=False)
y_pred_default = results_default.forecast(steps=len(y_test), exog=exog_test)

mse_default = mean_squared_error(y_test, y_pred_default)
mae_default = mean_absolute_error(y_test, y_pred_default)
r2_default = r2_score(y_test, y_pred_default)

mse_default, mae_default, r2_default

  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  return get_prediction_index(
  return get_prediction_index(


(0.046879168276924246, 0.10383254410456537, 0.49286387375333207)

In [15]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# Initialize baseline RF model
rf_baseline = RandomForestRegressor(random_state=42)
rf_baseline.fit(X_train_scaled, y_train)

# Make predictions
y_pred_rf_base = rf_baseline.predict(X_test_scaled)

# Evaluate
mse_rf_base = mean_squared_error(y_test, y_pred_rf_base)
mae_rf_base = mean_absolute_error(y_test, y_pred_rf_base)
r2_rf_base = r2_score(y_test, y_pred_rf_base)

print("\nRandom Forest (Before Tuning):")
print(f"MSE: {mse_rf_base:.4f}, MAE: {mae_rf_base:.4f}, R²: {r2_rf_base:.4f}")



Random Forest (Before Tuning):
MSE: 0.0091, MAE: 0.0372, R²: 0.9012


In [16]:
from sklearn.ensemble import StackingRegressor, RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# Define base models (default settings)
base_models = [
    ('rf', RandomForestRegressor(random_state=42)),
    ('gb', GradientBoostingRegressor(random_state=42))
]

# Stacking Regressor
stacking_model = StackingRegressor(
    estimators=base_models,
    final_estimator=RandomForestRegressor(random_state=42)
)

# Train
stacking_model.fit(X_train_scaled, y_train)

# Predict
y_pred_stack = stacking_model.predict(X_test_scaled)

# Evaluate
mse_stack = mean_squared_error(y_test, y_pred_stack)
mae_stack = mean_absolute_error(y_test, y_pred_stack)
r2_stack = r2_score(y_test, y_pred_stack)

print(f"Stacking Regressor (Before Tuning) - MSE: {mse_stack:.4f}, MAE: {mae_stack:.4f}, R²: {r2_stack:.4f}")


Stacking Regressor (Before Tuning) - MSE: 0.0056, MAE: 0.0301, R²: 0.9396


In [17]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dropout, Dense
from tensorflow.keras.callbacks import EarlyStopping
import numpy as np

# Normalize data
scaler = MinMaxScaler()
scaled_data = scaler.fit_transform(df)

# Create sequences
def create_sequences(data, time_steps=14):
    X, y = [], []
    for i in range(len(data) - time_steps):
        X.append(data[i:i + time_steps])
        y.append(data[i + time_steps, 0])
    return np.array(X), np.array(y)

time_steps = 14
train_size = int(len(scaled_data) * 0.8)
train, test = scaled_data[:train_size], scaled_data[train_size:]
X_train, y_train = create_sequences(train, time_steps)
X_test, y_test = create_sequences(test, time_steps)

# Build a basic LSTM model (no tuning)
model = Sequential([
    LSTM(64, activation='relu', return_sequences=True, input_shape=(time_steps, X_train.shape[2])),
    Dropout(0.2),
    LSTM(32, activation='relu'),
    Dropout(0.2),
    Dense(1)
])
model.compile(optimizer='adam', loss='mse')

# Train model
early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)
model.fit(X_train, y_train, epochs=50, batch_size=32, validation_data=(X_test, y_test), callbacks=[early_stopping], verbose=0)

# Predict
y_pred = model.predict(X_test)
y_test_inv = scaler.inverse_transform(np.c_[y_test, np.zeros((y_test.shape[0], scaled_data.shape[1]-1))])[:, 0]
y_pred_inv = scaler.inverse_transform(np.c_[y_pred, np.zeros((y_pred.shape[0], scaled_data.shape[1]-1))])[:, 0]

# Evaluate
mse = mean_squared_error(y_test_inv, y_pred_inv)
mae = mean_absolute_error(y_test_inv, y_pred_inv)
r2 = r2_score(y_test_inv, y_pred_inv)
print(f"LSTM (Without Tuning) - MSE: {mse:.4f}, MAE: {mae:.4f}, R²: {r2:.4f}")


  super().__init__(**kwargs)


[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 53ms/step
LSTM (Without Tuning) - MSE: 20.4777, MAE: 2.8999, R²: 0.7160
