In [1]:
import rpy2.robjects as ro

In [2]:
pip install rpy2



In [3]:
%load_ext rpy2.ipython

In [4]:
%%R
# Install only the packages that cannot be loaded yet, then verify the result.
# install.packages() reports build failures as warnings only, so a failed install
# would otherwise surface later as "there is no package called ..." in library().
required_pkgs <- c("patchwork", "fpp3", "tidyverse", "lubridate", "stringr", "urca")

# TRUE when the package namespace can be loaded (without attaching it).
is_available <- function(pkg) requireNamespace(pkg, quietly = TRUE)

missing_pkgs <- required_pkgs[!vapply(required_pkgs, is_available, logical(1))]
if (length(missing_pkgs) > 0) {
  install.packages(missing_pkgs)
}

# Fail fast if anything is still unavailable after the install attempt.
failed_pkgs <- required_pkgs[!vapply(required_pkgs, is_available, logical(1))]
if (length(failed_pkgs) > 0) {
  stop(
    "Could not install/load: ", paste(failed_pkgs, collapse = ", "),
    ". Check the install log above before running the rest of the notebook."
  )
}

Installing packages into ‘/usr/local/lib/R/site-library’
(as ‘lib’ is unspecified)
also installing the dependencies ‘quadprog’, ‘progressr’, ‘ggdist’, ‘numDeriv’, ‘warp’, ‘BH’, ‘fabletools’, ‘distributional’, ‘slider’, ‘anytime’, ‘fable’, ‘feasts’, ‘tsibble’, ‘tsibbledata’

trying URL 'https://cran.rstudio.com/src/contrib/quadprog_1.5-8.tar.gz'
trying URL 'https://cran.rstudio.com/src/contrib/progressr_0.18.0.tar.gz'
trying URL 'https://cran.rstudio.com/src/contrib/ggdist_3.3.3.tar.gz'
trying URL 'https://cran.rstudio.com/src/contrib/numDeriv_2016.8-1.1.tar.gz'
trying URL 'https://cran.rstudio.com/src/contrib/warp_0.2.2.tar.gz'
trying URL 'https://cran.rstudio.com/src/contrib/BH_1.90.0-1.tar.gz'
trying URL 'https://cran.rstudio.com/src/contrib/fabletools_0.5.1.tar.gz'
trying URL 'https://cran.rstudio.com/src/contrib/distributional_0.5.0.tar.gz'
trying URL 'https://cran.rstudio.com/src/contrib/slider_0.3.3.tar.gz'
trying URL 'https://cran.rstudio.com/src/contrib/anytime_0.3.12.tar.gz'
t

Now you can use the `%%R` magic command to run R code:

In [5]:
%%R
library(fpp3)
library(tidyverse)
library(lubridate)
library(stringr)
library(patchwork)
library(urca)

# Load the three raw CSV inputs from the notebook's working directory.
# NOTE(review): the file contents are not visible here. Later cells expect `d_*`
# day columns plus `id` in the sales file, `date` / `event_name_1` /
# `event_name_2` in the calendar, and `store_id` / `item_id` / `wm_yr_wk` keys
# in the price file -- confirm against the actual files.
sales_raw <- read_csv("sales_train_validation_afcs2025.csv")
calendar_raw <- read_csv("calendar_afcs2025.csv")
prices_raw <- read_csv("sell_prices_afcs2025.csv")

Error in library(fpp3) : there is no package called ‘fpp3’


RInterpreterError: Failed to parse and evaluate line 'library(fpp3)\nlibrary(tidyverse)\nlibrary(lubridate)\nlibrary(stringr)\nlibrary(patchwork)\nlibrary(urca)\n\nsales_raw <- read_csv("sales_train_validation_afcs2025.csv")\ncalendar_raw <- read_csv("calendar_afcs2025.csv")\nprices_raw <- read_csv("sell_prices_afcs2025.csv")\n'.
R error message: 'Error in library(fpp3) : there is no package called ‘fpp3’'

In [None]:
%%R
# Working calendar: parse `date` as month/day/year and add a day key `d`
# ("d_1", "d_2", ...) taken from row order, matching the sales column names.
calendar <- mutate(
  calendar_raw,
  date = mdy(date),
  d = paste0("d_", row_number())
)

In [None]:
%%R
# Every distinct, non-missing event name found in either event column,
# in order of first appearance (event_name_1 first, then event_name_2).
event_types <- with(calendar_raw, c(event_name_1, event_name_2))
event_types <- unique(event_types[!is.na(event_types)])

In [None]:
%%R
# Normalise each name (strip non-alphanumerics, then lower-case) and warn when
# two different raw names collapse onto the same normalised key.
normalized_events <- event_types %>%
  str_replace_all("[^[:alnum:]]", "") %>%
  tolower()

if (anyDuplicated(normalized_events) > 0) {
  warning("Same events found with different casing or formatting. Please inspect.")
}

In [None]:
%%R
# lead() reads the following rows, so the calendar must be in date order.
calendar <- arrange(calendar, date)

# Add one 0/1 column per event. The flag is 1 on the event's own row and on
# the three rows immediately preceding it.
for (evt in event_types) {
  # Column name = event name with every non-alphanumeric character removed
  col_name <- str_replace_all(evt, "[^[:alnum:]]", "")

  calendar <- mutate(
    calendar,
    !!col_name := {
      # TRUE where either event slot holds this event (missing slots count as FALSE)
      on_event_day <- coalesce(event_name_1 == evt, FALSE) |
        coalesce(event_name_2 == evt, FALSE)

      as.integer(
        on_event_day |
          lead(on_event_day, 1, default = FALSE) |
          lead(on_event_day, 2, default = FALSE) |
          lead(on_event_day, 3, default = FALSE)
      )
    }
  )
}

In [None]:
%%R
# Reshape sales from one column per day (d_1, d_2, ...) to one row per id/day.
sales_long <- pivot_longer(
  sales_raw,
  cols = starts_with("d_"),
  names_to = "d",
  values_to = "sales_volume"
)

In [None]:
%%R
# Split `id` into its parts by fixed character positions.
# NOTE(review): assumes ids look like {11-char item_id}_{store_id}_validation -- confirm.
sales_long <- mutate(
  sales_long,
  # Characters 1-11 form the item id
  item_id = str_sub(id, 1, 11),
  # Everything from character 13 onwards, minus the "_validation" suffix
  store_id = str_remove(str_sub(id, 13, nchar(id)), "_validation")
)

In [None]:
%%R
# Attach calendar attributes to every sales row via the day key `d`.
sales_with_calendar <- left_join(sales_long, calendar, by = "d")

In [None]:
%%R
# Attach prices; rows without a matching store/item/week keep the sales row
# with missing price columns (left join).
final_data <- left_join(
  sales_with_calendar,
  prices_raw,
  by = c("store_id", "item_id", "wm_yr_wk")
)

In [None]:
%%R
# Build the tsibble: one series per `id`, indexed by `date` (rows sorted first).
product_tsibble <- as_tsibble(
  arrange(final_data, id, date),
  key = id,
  index = date
)

# Release the large intermediate objects and trigger garbage collection.
rm(list = c(
  "sales_raw", "calendar_raw", "prices_raw",
  "sales_long", "sales_with_calendar", "final_data"
))
gc()

# Display the resulting tsibble
product_tsibble

In [None]:
%%R
# `product_tsibble` exists only in the embedded R session, so this cell has to
# run under the %%R magic; as a plain Python cell it raises NameError.
print(product_tsibble)

In [None]:
%%R
# 1. Daily Aggregation & Analysis
# -------------------------------
# Total sales volume per calendar day, summed over all products.
daily_sales <- product_tsibble %>%
  index_by(date) %>%
  summarise(total_sales = sum(sales_volume, na.rm = TRUE))

# Plot: Daily Sales Over Time
# A %%R cell only auto-prints the value of its last expression, so this
# intermediate plot must be printed explicitly or it is never drawn.
print(
  autoplot(daily_sales, total_sales) +
    labs(
      title = "Total Daily Sales Volume Over Time",
      y = "Sum of Sales",
      x = "Date"
    ) +
    theme_minimal()
)

# Plot: Daily ACF and PACF (14 lags / 2 weeks)
p1_daily <- daily_sales %>%
  ACF(total_sales, lag_max = 14) %>%
  autoplot() +
  labs(title = "Daily ACF (14 Lags)") +
  theme_minimal()

p2_daily <- daily_sales %>%
  PACF(total_sales, lag_max = 14) %>%
  autoplot() +
  labs(title = "Daily PACF (14 Lags)") +
  theme_minimal()

# Display side-by-side (last expression of the cell, printed automatically)
p1_daily + p2_daily

In [None]:
%%R
# 2. Weekly Aggregation & Analysis
# --------------------------------
# Re-aggregate the daily totals by year-week.
weekly_sales <- daily_sales %>%
  index_by(week = yearweek(date)) %>%
  summarise(total_sales = sum(total_sales, na.rm = TRUE))

# Plot: Weekly Sums
# A %%R cell only auto-prints the value of its last expression, so this
# intermediate plot must be printed explicitly or it is never drawn.
print(
  autoplot(weekly_sales, total_sales) +
    labs(
      title = "Total Weekly Sales Volume",
      y = "Weekly Sum of Sales",
      x = "Week"
    ) +
    theme_minimal()
)

# Plot: Weekly ACF and PACF (8 lags / 2 months)
p1_weekly <- weekly_sales %>%
  ACF(total_sales, lag_max = 8) %>%
  autoplot() +
  labs(title = "Weekly ACF (8 Lags)") +
  theme_minimal()

p2_weekly <- weekly_sales %>%
  PACF(total_sales, lag_max = 8) %>%
  autoplot() +
  labs(title = "Weekly PACF (8 Lags)") +
  theme_minimal()

# Display side-by-side (last expression of the cell, printed automatically)
p1_weekly + p2_weekly

In [None]:

%%R
# 3. Monthly Aggregation & Analysis
# ---------------------------------
# Re-aggregate the daily totals by year-month.
monthly_sales <- daily_sales %>%
  index_by(month = yearmonth(date)) %>%
  summarise(total_sales = sum(total_sales, na.rm = TRUE))

# Plot: Monthly Sums
# A %%R cell only auto-prints the value of its last expression, so this
# intermediate plot must be printed explicitly or it is never drawn.
print(
  autoplot(monthly_sales, total_sales) +
    labs(
      title = "Total Monthly Sales Volume",
      y = "Monthly Sum of Sales",
      x = "Month"
    ) +
    theme_minimal()
)

# Plot: Monthly ACF and PACF (24 lags / 2 years)
p1_monthly <- monthly_sales %>%
  ACF(total_sales, lag_max = 24) %>%
  autoplot() +
  labs(title = "Monthly ACF (24 Lags)") +
  theme_minimal()

p2_monthly <- monthly_sales %>%
  PACF(total_sales, lag_max = 24) %>%
  autoplot() +
  labs(title = "Monthly PACF (24 Lags)") +
  theme_minimal()

# Display side-by-side (last expression of the cell, printed automatically)
p1_monthly + p2_monthly

In [None]:
%%R
# 1. Draw a reproducible random sample of 10 product ids
set.seed(123)
all_ids <- unique(product_tsibble$id)
sample_ids <- sample(all_ids, 10)

# 2. Scatter plot of sales volume against sell price for a single product id,
#    with a linear fit (no confidence band) drawn on top.
#    Reads `product_tsibble` from the enclosing environment.
plot_price_vs_sales <- function(target_id) {
  item_rows <- filter(product_tsibble, id == target_id)

  ggplot(item_rows, aes(x = sell_price, y = sales_volume)) +
    geom_point(alpha = 0.5) +
    geom_smooth(method = "lm", se = FALSE, color = "blue") +
    labs(
      title = paste("Item:", target_id),
      x = "Sell Price",
      y = "Sales Volume"
    ) +
    theme_minimal() +
    theme(plot.title = element_text(size = 10, face = "bold"))
}

# 3. One plot per sampled id
plot_list <- lapply(sample_ids, plot_price_vs_sales)

# 4. Lay the plots out two per row
wrap_plots(plot_list, ncol = 2)

In [None]:
%%R
# Daily total sales across all products ...
daily_sales_agg <- product_tsibble %>%
  index_by(date) %>%
  summarise(total_sales = sum(sales_volume, na.rm = TRUE))
# ... with the calendar columns attached so `snap_TX` is available
daily_sales_agg <- left_join(daily_sales_agg, calendar, by = "date")

# Boxplot of daily totals, split by the SNAP flag (0/1 relabelled for display)
ggplot(
  data = mutate(
    daily_sales_agg,
    snap_TX = factor(snap_TX, levels = c(0, 1), labels = c("No SNAP", "SNAP Active"))
  ),
  mapping = aes(x = snap_TX, y = total_sales, fill = snap_TX)
) +
  geom_boxplot() +
  labs(
    title = "Total Sales Distribution: SNAP vs Non-SNAP Days",
    x = "Is SNAP Active?",
    y = "Total Daily Sales Volume"
  ) +
  theme_minimal() +
  theme(legend.position = "none")


In [None]:
%%R
# 1. Total sales per day across all products
# ------------------------------------------
daily_sales_agg <- summarise(
  index_by(product_tsibble, date),
  total_sales = sum(sales_volume, na.rm = TRUE)
)

# 2. Attach the calendar so the per-event 0/1 columns are available
# -----------------------------------------------------------------
analysis_data <- left_join(daily_sales_agg, calendar, by = "date")

# 3. Events to analyse
# --------------------
# Written in lowercase; they are matched against the real column names
# case-insensitively in step 4.
target_events_list <- c(
  "superbowl", "valentinesday", "presidentsday", "lentstart", "lentweek2",
  "stpatricksday", "purimend", "orthodoxeaster", "pesachend", "cincodemayo",
  "mothersday", "memorialday", "nbafinalsstart", "nbafinalsend", "fathersday",
  "independenceday", "ramadanstarts", "eidalfitr", "laborday", "columbusday",
  "halloween", "eidaladha", "veteransday", "thanksgiving", "christmas",
  "chanukahend", "newyear", "orthodoxchristmas", "martinlutherkingday", "easter",
  "snap_TX"
)

# 4. Long-format plotting data: one row per (date, event)
# -------------------------------------------------------
plot_data <- analysis_data %>%
  # Keep date, sales and every column whose whole name matches a target event
  select(
    date,
    total_sales,
    matches(paste0("^(", paste(target_events_list, collapse="|"), ")$"), ignore.case = TRUE)
  ) %>%
  # Stack the event columns into `event_name` / `is_active`
  pivot_longer(
    cols = -c(date, total_sales),
    names_to = "event_name",
    values_to = "is_active"
  ) %>%
  mutate(
    # 0/1 -> "No"/"Yes"
    is_active = factor(is_active, levels = c(0, 1), labels = c("No", "Yes")),
    # Title-cased facet label, with a hand-written label for the SNAP flag
    event_label = ifelse(event_name == "snap_TX", "Snap TX", str_to_title(event_name))
  )

# 5. One boxplot panel per event
# ------------------------------
ggplot(plot_data, aes(x = is_active, y = total_sales, fill = is_active)) +
  geom_boxplot(outlier.alpha = 0.3, outlier.size = 1) +
  # Each event gets its own panel with an independent y scale
  facet_wrap(~ event_label, scales = "free_y", ncol = 5) +
  labs(
    title = "Impact of Events on Total Daily Sales Volume",
    subtitle = "Comparison of Daily Sales: Days with Event (Yes) vs. Days without (No)",
    x = "Is Event Active?",
    y = "Total Daily Sales Volume"
  ) +
  theme_minimal() +
  theme(
    legend.position = "none",
    strip.text = element_text(face = "bold", size = 9)
  )

In [None]:
%%R
# 1. Daily total sales with the calendar attached
# -----------------------------------------------
daily_sales_agg <- product_tsibble %>%
  index_by(date) %>%
  summarise(total_sales = sum(sales_volume, na.rm = TRUE))
daily_sales_agg <- left_join(daily_sales_agg, calendar, by = "date")

# 2. Events to analyse (matched case-insensitively against column names)
# ----------------------------------------------------------------------
target_events_list <- c(
  "superbowl", "valentinesday", "presidentsday", "lentstart", "lentweek2",
  "stpatricksday", "purimend", "orthodoxeaster", "pesachend", "cincodemayo",
  "mothersday", "memorialday", "nbafinalsstart", "nbafinalsend", "fathersday",
  "independenceday", "ramadanstarts", "eidalfitr", "laborday", "columbusday",
  "halloween", "eidaladha", "veteransday", "thanksgiving", "christmas",
  "chanukahend", "newyear", "orthodoxchristmas", "martinlutherkingday", "easter",
  "snap_TX"
)

# 3. Mean daily sales with and without each event, and their difference
# ---------------------------------------------------------------------
impact_table <- daily_sales_agg %>%
  # Plain tibble, so the summarise below is not grouped by the date index
  as_tibble() %>%
  select(
    total_sales,
    matches(paste0("^(", paste(target_events_list, collapse="|"), ")$"), ignore.case = TRUE)
  ) %>%
  # One row per (day, event)
  pivot_longer(
    cols = -total_sales,
    names_to = "event_name",
    values_to = "is_active"
  ) %>%
  group_by(event_name, is_active) %>%
  summarise(avg_sales = mean(total_sales, na.rm = TRUE), .groups = "drop") %>%
  # Spread flag values into columns: active_0 (no event) and active_1 (event)
  pivot_wider(
    names_from = is_active,
    values_from = avg_sales,
    names_prefix = "active_"
  ) %>%
  rename(avg_sales_without = active_0, avg_sales_with = active_1) %>%
  mutate(difference = avg_sales_with - avg_sales_without) %>%
  select(event_name, avg_sales_with, avg_sales_without, difference) %>%
  arrange(desc(difference))

# 4. Render the table
# -------------------
knitr::kable(impact_table, caption = "Average Sales Impact by Event (Sorted by Positive Effect)")

In [None]:
%%R
# 1. Aggregate to Daily Global Sales & Join Calendar
# --------------------------------------------------
# One row per date: total sales over all products plus every calendar column
# (including the per-event 0/1 flag columns).
daily_sales_agg <- product_tsibble %>%
  index_by(date) %>%
  summarise(total_sales = sum(sales_volume, na.rm = TRUE)) %>%
  left_join(calendar, by = "date")

# 2. Define the list of events
# ----------------------------
# Lowercase names; each is matched case-insensitively to a column in step 3.
target_events_list <- c(
  "superbowl", "valentinesday", "presidentsday", "lentstart", "lentweek2",
  "stpatricksday", "purimend", "orthodoxeaster", "pesachend", "cincodemayo",
  "mothersday", "memorialday", "nbafinalsstart", "nbafinalsend", "fathersday",
  "independenceday", "ramadanstarts", "eidalfitr", "laborday", "columbusday",
  "halloween", "eidaladha", "veteransday", "thanksgiving", "christmas",
  "chanukahend", "newyear", "orthodoxchristmas", "martinlutherkingday", "easter",
  "snap_TX"
)

# 3. Perform T-Tests for each event
# ---------------------------------
# For each event name: find the matching flag column, run a two-sample t-test of
# total_sales between flag groups, and return one result row. map_dfr() row-binds
# the per-event tibbles; events with no matching column contribute no row.
# NOTE(review): one test per event is run with no multiple-comparison adjustment.
hypothesis_results <- map_dfr(target_events_list, function(evt_name) {

  # Identify the actual column name in the dataframe (handling case insensitivity)
  # The pattern is anchored (^...$), so only a whole-name match counts
  col_match <- names(daily_sales_agg)[str_detect(names(daily_sales_agg), regex(paste0("^", evt_name, "$"), ignore_case = TRUE))]

  # No matching column: skip this event. Several matches: use the first.
  if(length(col_match) == 0) return(NULL)
  col_name <- col_match[1]

  # Extract relevant data: sales plus the event flag renamed to `is_active`
  test_data <- daily_sales_agg %>%
    as_tibble() %>%
    select(total_sales, is_active = !!sym(col_name))

  # Require at least two distinct flag values with at least two rows each;
  # otherwise return a placeholder row (NA statistics plus a `note` column).
  counts <- table(test_data$is_active)
  if(length(counts) < 2 || min(counts) < 2) {
    return(tibble(
      event_name = evt_name,
      t_statistic = NA,
      p_value = NA,
      mean_with = NA,
      mean_without = NA,
      note = "Insufficient data"
    ))
  }

  # Run Two-Sided T-Test (t.test() default is unequal variances / Welch's t-test)
  # H0: mean(sales|active) == mean(sales|inactive)
  # Ha: mean(sales|active) != mean(sales|inactive)
  t_res <- t.test(total_sales ~ is_active, data = test_data, alternative = "two.sided")

  # Return row
  # NOTE(review): estimate[1]/[2] are taken positionally as flag 0 / flag 1 -- confirm.
  tibble(
    event_name = evt_name,
    t_statistic = t_res$statistic,
    p_value = t_res$p.value,
    mean_without = t_res$estimate[1], # usually group 0
    mean_with = t_res$estimate[2]     # usually group 1
  )
})

# 4. Process and Display Results
# ------------------------------
final_hypothesis_table <- hypothesis_results %>%
  # Sort by statistical significance (lowest p-value first)
  arrange(p_value) %>%
  mutate(
    # Format p-values for readability (< 0.001, etc.)
    p_value_fmt = ifelse(p_value < 0.001, "< 0.001", round(p_value, 4)),
    t_statistic = round(t_statistic, 2),
    mean_with = round(mean_with, 1),
    mean_without = round(mean_without, 1),
    # Computed from the already-rounded means, so it matches the displayed columns
    diff = mean_with - mean_without
  ) %>%
  # The formatted p-value replaces the raw one; the `note` column is dropped
  select(event_name, t_statistic, p_value = p_value_fmt, mean_with, mean_without, diff)

# Display table
knitr::kable(final_hypothesis_table,
             caption = "Hypothesis Test Results: Impact of Events on Daily Sales (Sorted by Significance)")

In [None]:
%%R

library(dplyr)
library(fable)
library(tsibble)
library(stringr)
library(tidyr)
library(readr)

# 0. Data Overview
# ----------------------------
# Relies on `product_tsibble` already being present in the R session.
n_products <- length(unique(product_tsibble$id))
print(paste("Processing ALL", n_products, "products."))

# 1. Define Predictors & Helper
# -----------------------------
# Event regressors to include, written in lowercase. The calendar flag columns
# keep the casing of the original event names, so these are resolved to the
# real column names case-insensitively inside build_harmonic_formula().
sig_events <- c("snap_TX", "halloween", "columbusday", "laborday",
                "independenceday", "fathersday", "easter", "eidalfitr", "christmas")

# Build the model formula `sales_volume ~ fourier(...) [+ sell_price] [+ events]`.
#
# K              number of yearly Fourier harmonic pairs.
# include_price  add `sell_price` as a regressor.
# include_events add the event flag columns as regressors.
# events         event names to look up (default: `sig_events`).
# data_cols      column names available in the modelling data
#                (default: names(product_tsibble); only evaluated when
#                include_events is TRUE).
#
# Events with no matching column are skipped with a warning, so the returned
# formula never references a variable that is absent from the data.
build_harmonic_formula <- function(K, include_price = FALSE, include_events = FALSE,
                                   events = sig_events,
                                   data_cols = names(product_tsibble)) {
  rhs <- paste0("fourier(period = 'year', K = ", K, ")")
  if (include_price) rhs <- paste(rhs, "+ sell_price")
  if (include_events) {
    # R names are case-sensitive: map each requested event to the actual column
    col_idx <- match(tolower(events), tolower(data_cols))
    if (anyNA(col_idx)) {
      warning(
        "Event regressors not found in the data and skipped: ",
        paste(events[is.na(col_idx)], collapse = ", ")
      )
    }
    event_cols <- data_cols[col_idx[!is.na(col_idx)]]
    if (length(event_cols) > 0) {
      rhs <- paste(rhs, "+", paste(event_cols, collapse = " + "))
    }
  }
  as.formula(paste("sales_volume ~", rhs))
}

# 2. Fit The "Horse Race" Models (ON ALL DATA)
# ------------------------------
# Three candidates are fitted to every series in `product_tsibble`:
#   arima_complex - ARIMA with yearly Fourier terms, price and event regressors
#   croston       - Croston's method on sales volume
#   snaive        - seasonal naive with a yearly lag
# Warning: fitting every product can take considerable time and memory.
arima_formula <- build_harmonic_formula(K=3, include_price=TRUE, include_events=TRUE)

robust_models <- model(
  product_tsibble,
  arima_complex = ARIMA(!!arima_formula),
  croston = CROSTON(sales_volume),
  snaive = SNAIVE(sales_volume ~ lag("year"))
)

print("Models fitted on all products. Moving to forecast phase...")

# -----------------------------------------------------------------------------
# LOAD EXTERNAL DATA
# -----------------------------------------------------------------------------

# Load Prices (the earlier `prices_raw` was removed once product_tsibble was built)
prices_raw <- read_csv("sell_prices_afcs2025.csv")

# Load Validation Data
validation_raw <- read_csv("sales_test_validation_afcs2025.csv")

# The prepared `calendar` (parsed `date`, day key `d`, event flag columns) must
# already exist. Re-reading the raw CSV here would not recreate those derived
# columns, and the joins and forecasts below depend on them, so fail fast.
required_calendar_cols <- c("d", "date", "wm_yr_wk")
if (!exists("calendar") || !all(required_calendar_cols %in% names(calendar))) {
  stop(
    "`calendar` is missing or lacks required columns (",
    paste(required_calendar_cols, collapse = ", "),
    "). Run the calendar preparation cells before this one."
  )
}

# -----------------------------------------------------------------------------

# 3. Prepare Validation Set (ALL PRODUCTS)
# -------------------------
# Reshape the validation file to long format, map each day key `d` to its date
# via the calendar, and build a tsibble of actuals keyed by `id`.
validation_ts <- validation_raw %>%
  pivot_longer(cols = starts_with("d_"), names_to = "d", values_to = "sales_volume") %>%
  # item_id / store_id are derived here but dropped again by the select() below
  mutate(
    item_id = str_sub(id, 1, 11),
    store_id = str_remove(str_sub(id, 13, nchar(id)), "_validation")
  ) %>%
  left_join(calendar, by = "d") %>%
  select(id, date, sales_volume) %>%
  arrange(id, date) %>%
  as_tsibble(key = id, index = date)
  # No id filter is applied: every product in the file is kept

# 4. Prepare Future Regressors
# ----------------------------
# 28 future rows per series in product_tsibble, enriched with the calendar
# columns (event flags, wm_yr_wk) and the matching sell price.
future_scenarios <- new_data(product_tsibble, n = 28) %>%
  left_join(calendar, by = "date") %>%
  # Same fixed-position id parsing as used for the training data
  mutate(
    item_id = str_sub(id, 1, 11),
    store_id = str_remove(str_sub(id, 13, nchar(id)), "_validation")
  ) %>%
  left_join(prices_raw, by = c("store_id", "item_id", "wm_yr_wk")) %>%
  # Fill price gaps within each series: carry values down first, then up
  # NOTE(review): a series with no price in any of its 28 rows stays NA -- confirm this cannot occur
  group_by(id) %>%
  fill(sell_price, .direction = "downup") %>%
  ungroup()

# 5. Generate Forecasts
# ---------------------
# Forecast every fitted model over the prepared future regressor rows.
forecasts <- robust_models %>%
  forecast(new_data = future_scenarios)

# 6. Race Results: Who Won? (Compute RMSE)
# ----------------------------------------
# RMSE of each model's forecasts against the validation actuals, per product.
accuracy_results <- forecasts %>%
  accuracy(validation_ts, measures = list(RMSE = RMSE))

# 7. Select the Winner Per Product
# --------------------------------
# Keep exactly one row per product: the model with the lowest RMSE.
# slice_min() orders missing RMSE values last, so a product whose models all
# have NA RMSE still keeps one row instead of being dropped from the
# submission (and no "no non-missing arguments to min" warnings are raised).
best_model_per_product <- accuracy_results %>%
  group_by(id) %>%
  slice_min(RMSE, n = 1, with_ties = FALSE) %>%
  ungroup() %>%
  select(id, .model, RMSE)

# Make the degenerate case visible rather than silent.
n_unscored <- sum(is.na(best_model_per_product$RMSE))
if (n_unscored > 0) {
  warning(n_unscored, " product(s) have no model with a non-missing RMSE.")
}

# 8. Create Final Submission
# --------------------------
# Keep only each product's winning-model forecasts and expose the point
# forecast (`.mean`) under the name `forecast`.
final_submission <- forecasts %>%
  inner_join(best_model_per_product, by = c("id", ".model")) %>%
  as_tibble() %>%
  select(id, date, forecast = .mean)

# How often each model won
print("Winning Model Counts:")
print(table(best_model_per_product$.model))

# First rows of the submission table
print("Preview of Final Forecasts:")
print(head(final_submission))

In [None]:
%%R
library(ggplot2)

# 1. Identify Products with > 75% Non-Zero Sales
# ----------------------------------------------
# Share of rows with positive sales per product (rows with missing sales count
# towards the total but not towards the non-zero count).
high_velocity_ids <- product_tsibble %>%
  as_tibble() %>%
  group_by(id) %>%
  summarise(
    n_total = n(),
    n_nonzero = sum(sales_volume > 0, na.rm = TRUE),
    nonzero_pct = n_nonzero / n_total
  ) %>%
  filter(nonzero_pct > 0.75) %>%
  pull(id)

print(paste("Found", length(high_velocity_ids), "products with > 75% non-zero sales."))

# Without a qualifying product the indexing below would yield NA and the plot
# would fail with an unrelated error, so stop with a clear message.
if (length(high_velocity_ids) == 0) {
  stop("No product has more than 75% non-zero sales; nothing to visualise.")
}

# 2. Select One Specific Product
# ------------------------------
# The first qualifying id is used.
target_id <- high_velocity_ids[1]
print(paste("Visualizing Product:", target_id))

# 3. Retrieve Winner & Plot
# -------------------------
# Name of the winning model for this specific ID
target_winner_name <- best_model_per_product %>%
  filter(id == target_id) %>%
  pull(.model)

# Exactly one winner is required for the filter and labels below.
if (length(target_winner_name) != 1) {
  stop("Expected exactly one winning model for product ", target_id,
       " but found ", length(target_winner_name), ".")
}

print(paste("Winning Model for this product:", target_winner_name))

# Forecasts for this specific ID and the winning model only
target_forecast <- forecasts %>%
  filter(id == target_id, .model == target_winner_name)

# Actual validation data for this specific ID
target_actuals <- validation_ts %>%
  filter(id == target_id)

# 4. Generate the Plot
# --------------------
# Forecast distribution drawn together with the validation actuals.
target_forecast %>%
  autoplot(target_actuals) +
  labs(
    title = paste("Forecast vs Actuals:", target_id),
    subtitle = paste("Winning Model:", target_winner_name, "| Data: Validation Set"),
    y = "Sales Volume",
    x = "Date"
  ) +
  theme_minimal()

In [None]:
%%R
library(feasts)

# 1. Reduce the fitted-model table to the target product and its winning model
# ---------------------------------------------------------------------------
target_mable <- filter(robust_models, id == target_id)
target_mable <- select(target_mable, id, !!sym(target_winner_name))

# 2. Residual diagnostics for that single model
# ---------------------------------------------
# NOTE(review): gg_tsresiduals() draws a multi-panel display; confirm that
# `+ labs()` applies the title as intended with the installed feasts version.
gg_tsresiduals(target_mable) +
  labs(title = paste("Residual Diagnostics:", target_id, "(", target_winner_name, ")"))

In [None]:
%%R
# 1. Fitted values and residuals of the target model, as a plain tibble
# ---------------------------------------------------------------------
target_residuals <- as_tibble(augment(target_mable))

# 2. The 10 rows with the largest absolute innovation residual
# ------------------------------------------------------------
top_10_misses <- target_residuals %>%
  arrange(desc(abs(.innov))) %>%
  slice_head(n = 10) %>%
  select(id, date, residual = .innov, actual_sales = sales_volume)

print("Top 10 Dates with Highest Residuals:")
print(top_10_misses)

In [None]:
%%R
# Add calendar context (first event name and SNAP flag) to the largest misses.
top_10_with_context <- select(
  left_join(top_10_misses, calendar, by = "date"),
  date, residual, actual_sales, event_name_1, snap_TX
)

print(top_10_with_context)

In [None]:
%%R
# Show the last rows of the final forecast table.
tail(final_submission)

In [None]:
%%R
# Show the first rows of the final forecast table.
head(final_submission)