In [None]:
import pandas as pd
import numpy as np
import math
import statsmodels.api as sm
from statsmodels.tsa.stattools import adfuller, kpss
import warnings

# Suppress the KPSS warning that is emitted when the test statistic falls
# outside the p-value look-up table.
# The warning text breaks onto a new line right after "available in the", and
# the `warnings` module matches this regex against the START of the message.
# A pattern that spells the whole sentence on a single line therefore never
# matches, so only the first-line prefix is used here.
KPSS_INTERPOLATION_WARNING_PATTERN = (
    r"The test statistic is outside of the range of p-values available in the"
)
warnings.filterwarnings(
    "ignore",
    category=UserWarning,
    message=KPSS_INTERPOLATION_WARNING_PATTERN,
)

# 2. Load the dataset from the local Excel workbook
file_path = 'C:\\Users\\loydt\\Downloads\\Projects\\Superstore Sales Dataset.xlsx'
data = pd.read_excel(file_path)

# Parse 'Order Date' as datetimes (values that cannot be parsed become NaT
# because of errors='coerce'), then use that column as the index.
data['Order Date'] = pd.to_datetime(data['Order Date'], errors='coerce')
data = data.set_index('Order Date')

# Sub-categories to analyse, in the order they are reported
sub_categories = [
    'Bookcases',
    'Chairs',
    'Labels',
    'Tables',
    'Storage',
    'Furnishings',
    'Art',
    'Phones',
    'Binders',
    'Appliances',
    'Paper',
    'Accessories',
    'Envelopes',
    'Fasteners',
    'Supplies',
    'Machines',
    'Copiers',
]

# 3. Define function to test stationarity and autocorrelation
def test_stationarity_and_autocorrelation(series, sub_category):
    """Print ADF and KPSS test results for one sub-category.

    Args:
        series: Observations passed unchanged to ``adfuller`` and ``kpss``.
        sub_category: Label printed in the report header.

    Returns:
        None. The test statistics and p-values are written to stdout.
    """
    # ADF test: only the statistic and p-value (first two fields) are reported.
    adf_statistic, adf_p_value = adfuller(series)[:2]
    print(
        f"\nSub-Category: {sub_category}",
        "ADF Test:",
        f"  ADF Statistic: {adf_statistic}",
        f"  p-value: {adf_p_value}",
        sep="\n",
    )

    # KPSS test with a constant-only regression ('c'); again only the first
    # two fields of the result are reported.
    kpss_statistic, kpss_p_value = kpss(series, regression='c')[:2]
    print(
        "\nKPSS Test:",
        f"  KPSS Statistic: {kpss_statistic}",
        f"  p-value: {kpss_p_value}",
        sep="\n",
    )

# 4. Loop through each sub-category to apply stationarity tests

# Smallest series length that is handed to the tests. A non-empty series is not
# enough: unit-root tests need several observations.
# NOTE(review): statsmodels' adfuller appears to reject samples shorter than
# this when a constant term is included -- confirm against the installed
# version, and consider a larger floor for statistically meaningful results.
min_observations = 4

for sub_category in sub_categories:
    # Rows belonging to the current sub-category
    sub_category_data = data[data['Sub-Category'] == sub_category]

    # Aggregate sales by month (month-end bins)
    monthly_sales = sub_category_data['Sales'].resample('ME').sum()

    # Log-transform for stationarity testing. The logarithm is only defined for
    # positive values, so months whose total is not positive are dropped first.
    # NOTE(review): dropping months leaves gaps in the monthly series.
    transformed_sales = np.log(monthly_sales[monthly_sales > 0])

    # Skip (rather than crash the whole loop) when too few observations remain.
    if len(transformed_sales) < min_observations:
        print(f"\nSub-Category: {sub_category}")
        print(
            f"  Skipped: only {len(transformed_sales)} usable monthly "
            f"observation(s); at least {min_observations} are required."
        )
        continue

    test_stationarity_and_autocorrelation(transformed_sales, sub_category)



Sub-Category: Bookcases
ADF Test:
  ADF Statistic: -6.027134171995194
  p-value: 1.446920160127645e-07

KPSS Test:
  KPSS Statistic: 0.26257551393839756
  p-value: 0.1

Sub-Category: Chairs
ADF Test:
  ADF Statistic: -6.197659232292192
  p-value: 5.911326600428518e-08

KPSS Test:
  KPSS Statistic: 0.4162803438561973
  p-value: 0.07013778282060462

Sub-Category: Labels
ADF Test:
  ADF Statistic: -5.182598245492526
  p-value: 9.540530724034473e-06

KPSS Test:
  KPSS Statistic: 0.21972387687240708
  p-value: 0.1

Sub-Category: Tables
ADF Test:
  ADF Statistic: -2.5114700477055205
  p-value: 0.11270842284915639

KPSS Test:
  KPSS Statistic: 0.40354771286030183
  p-value: 0.0756259858360768

Sub-Category: Storage
ADF Test:
  ADF Statistic: -0.1061438024383304
  p-value: 0.9488231382203466

KPSS Test:
  KPSS Statistic: 0.7388013844100835
  p-value: 0.010018055962719683

Sub-Category: Furnishings
ADF Test:
  ADF Statistic: -5.266380938270393
  p-value: 6.425515681850899e-06

KPSS Test:
  KPS

look-up table. The actual p-value is greater than the p-value returned.

  kpss_result = kpss(series, regression='c')
look-up table. The actual p-value is greater than the p-value returned.

  kpss_result = kpss(series, regression='c')
look-up table. The actual p-value is smaller than the p-value returned.

  kpss_result = kpss(series, regression='c')
look-up table. The actual p-value is smaller than the p-value returned.

  kpss_result = kpss(series, regression='c')
look-up table. The actual p-value is smaller than the p-value returned.

  kpss_result = kpss(series, regression='c')
look-up table. The actual p-value is smaller than the p-value returned.

  kpss_result = kpss(series, regression='c')
look-up table. The actual p-value is greater than the p-value returned.

  kpss_result = kpss(series, regression='c')
look-up table. The actual p-value is greater than the p-value returned.

  kpss_result = kpss(series, regression='c')
look-up table. The actual p-value is greater than the p-