In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from statsmodels.tsa.statespace.sarimax import SARIMAX
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.preprocessing import MinMaxScaler
from pmdarima import auto_arima

In [5]:
# Load Data
data = pd.read_csv('data.csv')  # Ensure filename matches

data.columns = ['Year', 'Annual_Mean', 'Smooth_5yr']  # Ensure proper column names
data = data[['Year', 'Annual_Mean']]
data.set_index('Year', inplace=True)
data.head()

Unnamed: 0_level_0,Annual_Mean
Year,Unnamed: 1_level_1
1901,27.34
1902,27.16
1903,27.12
1904,27.09
1905,27.06


In [6]:
# Train-Test Split
split_index = int(len(data) * 0.8)
train, test = data.iloc[:split_index], data.iloc[split_index:]

In [11]:
test.head(5)

Unnamed: 0_level_0,Annual_Mean
Year,Unnamed: 1_level_1
1998,27.83
1999,27.31
2000,27.84
2001,27.9
2002,28.27


In [12]:
# SARIMA Model using AutoARIMA
auto_arima_model = auto_arima(train, seasonal=True, m=10, trace=True, suppress_warnings=True)
best_order = auto_arima_model.order
best_seasonal_order = auto_arima_model.seasonal_order

sarima_model = SARIMAX(train, order=best_order, seasonal_order=best_seasonal_order)
sarima_fit = sarima_model.fit()
sarima_pred = sarima_fit.forecast(steps=len(test))

Performing stepwise search to minimize aic
 ARIMA(2,1,2)(1,0,1)[10] intercept   : AIC=inf, Time=1.25 sec
 ARIMA(0,1,0)(0,0,0)[10] intercept   : AIC=67.564, Time=0.07 sec
 ARIMA(1,1,0)(1,0,0)[10] intercept   : AIC=59.681, Time=0.10 sec
 ARIMA(0,1,1)(0,0,1)[10] intercept   : AIC=44.863, Time=0.14 sec
 ARIMA(0,1,0)(0,0,0)[10]             : AIC=65.585, Time=0.03 sec
 ARIMA(0,1,1)(0,0,0)[10] intercept   : AIC=43.141, Time=0.07 sec
 ARIMA(0,1,1)(1,0,0)[10] intercept   : AIC=44.926, Time=0.13 sec
 ARIMA(0,1,1)(1,0,1)[10] intercept   : AIC=inf, Time=0.57 sec
 ARIMA(1,1,1)(0,0,0)[10] intercept   : AIC=45.013, Time=0.08 sec
 ARIMA(0,1,2)(0,0,0)[10] intercept   : AIC=44.793, Time=0.15 sec
 ARIMA(1,1,0)(0,0,0)[10] intercept   : AIC=57.684, Time=0.06 sec
 ARIMA(1,1,2)(0,0,0)[10] intercept   : AIC=40.762, Time=0.16 sec
 ARIMA(1,1,2)(1,0,0)[10] intercept   : AIC=42.668, Time=0.22 sec
 ARIMA(1,1,2)(0,0,1)[10] intercept   : AIC=42.638, Time=0.26 sec
 ARIMA(1,1,2)(1,0,1)[10] intercept   : AIC=44.386, Ti

  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  return get_prediction_index(


In [19]:
# LSTM Model
scaler = MinMaxScaler()
train_scaled = scaler.fit_transform(train)
test_scaled = scaler.transform(test)

X_train, y_train = train_scaled[:-1], train_scaled[1:]
X_test, y_test = test_scaled[:-1], test_scaled[1:]

X_train = np.reshape(X_train, (X_train.shape[0], X_train.shape[1], 1))
X_test = np.reshape(X_test, (X_test.shape[0], X_test.shape[1], 1))

lstm_model = Sequential([
    LSTM(50, activation='relu', return_sequences=True, input_shape=(X_train.shape[1], 1)),
    LSTM(50, activation='relu'),
    Dense(1)
])

lstm_model.compile(optimizer='adam', loss='mse')
lstm_model.fit(X_train, y_train, epochs=50, verbose=0)

lstm_pred = lstm_model.predict(X_test)
lstm_pred = scaler.inverse_transform(lstm_pred)

  super().__init__(**kwargs)


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 343ms/step


In [20]:
# Random Forest Model
rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
rf_model.fit(np.array(train.index).reshape(-1,1), train['Annual_Mean'])
rf_pred = rf_model.predict(np.array(test.index).reshape(-1,1))


In [21]:
# Evaluation
def evaluate(y_true, y_pred, model_name):
    mae = mean_absolute_error(y_true, y_pred)
    rmse = mean_squared_error(y_true, y_pred, squared=False)
    r2 = r2_score(y_true, y_pred)
    print(f'{model_name} -> MAE: {mae:.4f}, RMSE: {rmse:.4f}, R²: {r2:.4f}')

evaluate(test['Annual_Mean'], sarima_pred, 'SARIMA')
evaluate(test['Annual_Mean'], lstm_pred, 'LSTM')
evaluate(test['Annual_Mean'], rf_pred, 'Random Forest')

SARIMA -> MAE: 0.4177, RMSE: 0.5448, R²: -0.7254


ValueError: Found input variables with inconsistent numbers of samples: [25, 24]