In [None]:
# @title Package loading

# Attach the packages used by the cells below. library() only loads packages
# that are already installed; nothing is installed by this cell.
library(modules)    # modules::use() for the project-local helpers
library(tidyverse)  # %>% pipe and the dplyr verbs (filter, select, mutate, arrange)
library(lubridate)  # year(), month(), wday(), day()
library(caret)      # dummyVars() and train()
library(ggplot2)    # ggplot(), aes(), geom_*(), labs(), theme_minimal()

# Load the project-local helpers from the relative path "../shared".
# The final cell of this notebook calls shared$measures$accuracy from it.
shared <- modules::use("../shared")


In [None]:
# @title Dataset reading

# Load the raw sales table from disk, then show its first rows and the
# per-column summary.
data <- read.csv(file = "../sources/sales.csv")
data %>% head()
data %>% summary()


In [None]:
# @title Convert date field

# Parse the year-month-day text in `Date` into Date values, replacing the
# column in place, then re-inspect the table.
data <- data %>%
  mutate(Date = as.Date(Date, format = "%Y-%m-%d"))
data %>% head()
data %>% summary()


In [None]:
# @title Filter Data By Store and Product

# Keep only the rows for store 0 / product 1. Both key columns are constant
# after that, so they are removed from the table.
data <- data %>%
  filter(store == 0 & product == 1) %>%
  select(-store, -product)
data %>% head()
data %>% summary()
print(nrow(data))


In [None]:
# @title Plot dataset

# Line chart of number_sold against Date for the filtered rows.
data %>%
  ggplot(mapping = aes(x = Date, y = number_sold)) +
  geom_line(colour = "blue")


In [None]:
# @title Break down Date field

# Derive calendar features from `Date`. Year stays numeric; month, weekday
# and day-of-month become factors so they can be one-hot encoded below.
data <- data %>%
  mutate(
    Year = lubridate::year(Date),
    Month = as.factor(lubridate::month(Date)),
    DayOfWeek = as.factor(lubridate::wday(Date)),
    DayOfMonth = as.factor(lubridate::day(Date))
  )

# Expand the three factors into indicator columns and append them to the
# existing columns.
dummies <- caret::dummyVars(~ Month + DayOfWeek + DayOfMonth, data = data)
data_transformed <- predict(dummies, newdata = data)
data <- cbind(data, data_transformed)

# The factor columns are now represented by their indicator columns.
data <- data %>%
  select(-Month, -DayOfWeek, -DayOfMonth)

head(data)


In [None]:
# @title Add lag columns

# Lagged sales are only meaningful when rows are in chronological order, so
# sort by Date first instead of relying on the row order of the input file.
# dplyr::lag is named explicitly: an unqualified lag() resolves to
# stats::lag whenever dplyr is not attached (or is masked), and that version
# leaves the values unshifted, silently making Lag1/Lag2 copies of the
# target column.
data <- data %>%
  arrange(Date) %>%
  mutate(
    Lag1 = dplyr::lag(number_sold, 1),
    Lag2 = dplyr::lag(number_sold, 2)
  )

# Rows without two previous observations (the first two after sorting) have
# NA lags and cannot be used as model input; drop them.
data <- data %>% filter(!is.na(Lag1) & !is.na(Lag2))

head(data)


In [None]:
# @title Scale and normalize columns

# Observed bounds of each quantity, kept as top-level variables.
# NOTE(review): these are taken over the whole table, i.e. before the
# train/test split performed in the next cell — confirm that is intended.
number_sold_min <- min(data[["number_sold"]])
number_sold_max <- max(data[["number_sold"]])

year_min <- min(data[["Year"]])
year_max <- max(data[["Year"]])

date_min <- min(data[["Date"]])
date_max <- max(data[["Date"]])

# Map x linearly so that `lo` becomes 0 and `hi` becomes 1.
min_max <- function(x, lo, hi) {
  (x - lo) / (hi - lo)
}

# Offset in days of date(s) `d` from the earliest date in the table.
days_from_start <- function(d) {
  as.numeric(difftime(d, date_min), units = "days")
}

# The lag columns reuse the number_sold bounds so all three share one scale.
# Year and Date are kept as they are; their scaled versions are written to
# the new SYear and SDate columns.
data <- data %>%
  mutate(
    number_sold = min_max(number_sold, number_sold_min, number_sold_max),
    Lag1 = min_max(Lag1, number_sold_min, number_sold_max),
    Lag2 = min_max(Lag2, number_sold_min, number_sold_max),
    SYear = min_max(Year, year_min, year_max),
    SDate = days_from_start(Date) / days_from_start(date_max)
  )

head(data)


In [None]:
# @title Split data in train and test sets

# Fixed seed so the random split is reproducible.
set.seed(0)

# 80% of the rows (rounded down explicitly, rather than relying on sample()
# truncating a fractional size) are drawn at random for training; the
# remaining rows form the test set.
# NOTE(review): the split is random, not chronological, although the
# features include lagged sales — confirm this is intended.
n_rows <- nrow(data)
train_indices <- sample(seq_len(n_rows), floor(0.8 * n_rows))
train_data <- data[train_indices, ]
# Select the complement by position instead of `data[-train_indices, ]`:
# a negative index built from an empty vector selects zero rows, which would
# leave the test set empty whenever the training sample is empty.
test_data <- data[setdiff(seq_len(n_rows), train_indices), ]


In [None]:
# Simple linear regression: number_sold explained by Date alone, fitted
# through caret with RMSE as the selection metric. na.pass hands the rows to
# the model unchanged instead of dropping or rejecting missing values.
lin_reg_01 <- train(
  number_sold ~ Date,
  data = train_data,
  method = "lm",
  metric = "RMSE",
  na.action = na.pass
)
lin_reg_01 %>%
  summary() %>%
  print()

# Predictions on the held-out rows.
number_sold_pred_01 <- predict(lin_reg_01, newdata = test_data)


In [None]:
# Multiple linear regression: number_sold explained by every remaining
# column once the raw Date and Year columns are removed.
lin_reg_02 <- train(
  number_sold ~ .,
  data = train_data %>% select(-Date, -Year),
  method = "lm",
  metric = "RMSE",
  na.action = na.pass
)
lin_reg_02 %>%
  summary() %>%
  print()

# Predictions on the held-out rows.
number_sold_pred_02 <- predict(lin_reg_02, newdata = test_data)


In [None]:
# K-nearest-neighbours regression on the same predictors as the multiple
# linear model (every column except the raw Date and Year).
knn_reg <- train(
  number_sold ~ .,
  data = select(train_data, -c(Date, Year)),
  method = "knn",
  metric = "RMSE",
  na.action = na.pass
)
# Print the train object itself rather than summary(knn_reg): summary() on a
# caret train object summarises its finalModel, and the fitted KNN object has
# no summary method, so that output is only a Length/Class/Mode listing.
# Printing the train object reports the resampled RMSE for each candidate k
# and the value of k that was selected.
print(knn_reg)

# Predictions on the held-out rows.
number_sold_pred_03 <- predict(knn_reg, test_data)


In [None]:
# Regression tree (rpart) on every column except the raw Date and Year,
# tuned by caret with RMSE as the selection metric.
tree_reg <- train(
  number_sold ~ .,
  data = train_data %>% select(-Date, -Year),
  method = "rpart",
  metric = "RMSE",
  na.action = na.pass
)

# Predictions on the held-out rows.
number_sold_pred_04 <- predict(tree_reg, newdata = test_data)


In [None]:
# Held-out observations (blue points) against the predictions of the simple
# (red) and multiple (green) linear models. All layers share the test rows
# and the Date axis, so both are declared once on the base plot.
ggplot(data = test_data, mapping = aes(x = Date)) +
  geom_point(aes(y = number_sold), colour = "blue") +
  geom_line(aes(y = predict(lin_reg_01, test_data)), colour = "red") +
  geom_line(aes(y = predict(lin_reg_02, test_data)), colour = "green") +
  labs(title = "Regresión Lineal", x = "Date", y = "number_sold") +
  theme_minimal()


In [None]:
# Build one "observed vs. fitted" chart for a model: all observations (train
# and test) as blue points, and the model's predictions as lines drawn
# separately for the train rows and the test rows.
#   model       - fitted object accepted by predict()
#   line_colour - colour of the two prediction lines
#   plot_title  - chart title
plot_train_test_fit <- function(model, line_colour, plot_title) {
  ggplot(mapping = aes(x = Date)) +
    geom_point(data = train_data, aes(y = number_sold), color = "blue") +
    geom_point(data = test_data, aes(y = number_sold), color = "blue") +
    geom_line(
      data = train_data,
      aes(y = predict(model, train_data)),
      color = line_colour
    ) +
    geom_line(
      data = test_data,
      aes(y = predict(model, test_data)),
      color = line_colour
    ) +
    labs(title = plot_title, x = "Date", y = "number_sold") +
    theme_minimal()
}

# One chart per model, in the order the models were fitted.
plot_train_test_fit(
  lin_reg_01, "red", "Simple Linear Regression (Train + Test)"
)

plot_train_test_fit(
  lin_reg_02, "green", "Multiple Linear Regression (Train + Test)"
)

plot_train_test_fit(knn_reg, "orange", "KNN (Train + Test)")

plot_train_test_fit(tree_reg, "black", "Tree (Train + Test)")


In [None]:
# Collect one accuracy row per model from the shared helper, using the
# test-set predictions and the test-set observations, then list the models
# in ascending order of RMSE. `rmse` is cast with as.double() for sorting.
model_scores <- rbind(
  shared$measures$accuracy(
    "Simple Linear Regression", number_sold_pred_01, test_data$number_sold
  ),
  shared$measures$accuracy(
    "Multiple Linear Regression", number_sold_pred_02, test_data$number_sold
  ),
  shared$measures$accuracy(
    "KNN", number_sold_pred_03, test_data$number_sold
  ),
  shared$measures$accuracy(
    "Tree", number_sold_pred_04, test_data$number_sold
  )
)
model_scores %>% arrange(as.double(rmse))
