In [3]:
import pandas as pd
import numpy as np
import math
import warnings
from statsmodels.tsa.arima.model import ARIMA
from statsmodels.stats.diagnostic import acorr_ljungbox

# Suppress warnings
warnings.filterwarnings("ignore")

# Load your data
file_path = 'C:\\Users\\loydt\\Downloads\\Projects\\Superstore Sales Dataset.xlsx'
data = pd.read_excel(file_path)

data['Sales'] = pd.to_numeric(data['Sales'], errors='coerce')
series = data['Sales'].dropna()  # Ensure no missing values for analysis

# Apply logarithm transformation if necessary
series = series.apply(lambda x: math.log(x) if x > 0 else None)

# List to store results
results = []

# Define the range of ARIMA orders to try
p = 1  # AR order
q_max = 5  # Maximum MA order to try

# Fit ARIMA models with varying MA orders
for q in range(q_max + 1):
    model = ARIMA(series, order=(p, 0, q))
    fitted_model = model.fit()
    
    # Calculate AIC
    aic = fitted_model.aic
    
    # Check residuals for autocorrelation
    lb_test = acorr_ljungbox(fitted_model.resid, lags=[10], return_df=True)
    lb_p_value = lb_test['lb_pvalue'].iloc[0]
    
    # Store results
    results.append({
        'p': p,
        'q': q,
        'AIC': aic,
        'LB_P_Value': lb_p_value
    })

# Convert results to DataFrame for easy analysis
results_df = pd.DataFrame(results)

# Find the best model based on AIC
best_model = results_df.loc[results_df['AIC'].idxmin()]

print("Best Model:")
print(f"AR Order (p): {best_model['p']}, MA Order (q): {best_model['q']}")
print(f"AIC: {best_model['AIC']}, Ljung-Box p-value: {best_model['LB_P_Value']}")

# Fit the best model
final_model = ARIMA(series, order=(best_model['p'], 0, best_model['q'])).fit()
print(final_model.summary())


Best Model:
AR Order (p): 1.0, MA Order (q): 0.0
AIC: 37582.01396924527, Ljung-Box p-value: 0.5434659788425967
                               SARIMAX Results                                
Dep. Variable:                  Sales   No. Observations:                 9800
Model:                 ARIMA(1, 0, 0)   Log Likelihood              -18788.007
Date:                Wed, 30 Oct 2024   AIC                          37582.014
Time:                        20:54:18   BIC                          37603.584
Sample:                             0   HQIC                         37589.323
                               - 9800                                         
Covariance Type:                  opg                                         
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
const          4.1111      0.017    240.792      0.000       4.078       4.145
ar.L1          0.014