In [None]:
# @title Load packages

library(modules)
library(tidyverse)
library(lubridate)
library(caret)
library(ggplot2)

# Load the project-local helpers located at "../shared" through the `modules`
# package; the final cell calls `shared$measures$accuracy()` from it.
# NOTE(review): the relative path presumably resolves against the working
# directory, so the notebook has to be run from its own folder - confirm.
shared <- modules::use("../shared")


In [None]:
# @title Dataset reading

# Load the housing dataset from CSV, then take a first look at it: the
# leading rows followed by per-column summary statistics.
data <- read.csv(file = "../sources/housing.csv")
data %>% head()
data %>% summary()


In [None]:
# @title Plot dataset

# Scatter plot of price against area. Every row is an independent
# observation, so the data is drawn as points: a line would join unrelated
# rows in area order and suggest a continuity that does not exist. The later
# cells draw these same observations as blue points as well.
ggplot(data, aes(x = area, y = price)) +
  geom_point(color = "blue")


In [None]:
# @title Convert fields

# Observed range of every numeric column. They stay available as named
# globals in case normalized values (e.g. predicted prices) have to be mapped
# back to the original units.
price_min <- min(data$price)
price_max <- max(data$price)

area_min <- min(data$area)
area_max <- max(data$area)

bedrooms_min <- min(data$bedrooms)
bedrooms_max <- max(data$bedrooms)

bathrooms_min <- min(data$bathrooms)
bathrooms_max <- max(data$bathrooms)

stories_min <- min(data$stories)
stories_max <- max(data$stories)

parking_min <- min(data$parking)
parking_max <- max(data$parking)

# Columns that are converted to factors and then one-hot encoded.
categorical_cols <- c(
  "mainroad",
  "guestroom",
  "basement",
  "hotwaterheating",
  "airconditioning",
  "prefarea",
  "furnishingstatus"
)

# Min-max scale the numeric columns to [0, 1] and convert the categorical
# columns to factors (column order is unchanged).
# NOTE(review): the ranges are taken from the full dataset, i.e. before the
# train/test split, so test rows influence the scaling. Consider deriving
# them from the training rows only.
data <- data %>% mutate(
  price = (price - price_min) / (price_max - price_min),
  area = (area - area_min) / (area_max - area_min),
  bedrooms = (bedrooms - bedrooms_min) / (bedrooms_max - bedrooms_min),
  bathrooms = (bathrooms - bathrooms_min) / (bathrooms_max - bathrooms_min),
  stories = (stories - stories_min) / (stories_max - stories_min),
  parking = (parking - parking_min) / (parking_max - parking_min),
  across(all_of(categorical_cols), as.factor)
)

# One-hot encode the factors. `fullRank = TRUE` drops one level per factor:
# with an indicator for every level, the indicators of each factor always sum
# to 1 and are perfectly collinear with the intercept, which leaves the
# linear models below rank-deficient.
dummies <- dummyVars(
  reformulate(categorical_cols),
  data = data,
  fullRank = TRUE
)
data_transformed <- predict(dummies, newdata = data)

# Append the indicator columns and drop the factor columns they replace.
data <- cbind(data, data_transformed) %>%
  select(-all_of(categorical_cols))

head(data)
summary(data)


In [None]:
# @title Split data in train and test sets

# Fixed seed so that the random split below is reproducible.
set.seed(0)

# Randomly pick 80% of the row positions for training; every row that was
# not picked goes to the test set.
row_count <- nrow(data)
train_indices <- sample.int(row_count, 0.8 * row_count)
train_data <- data[train_indices, ]
test_data <- data[-train_indices, ]


In [None]:
# Simple linear regression: price explained by area only, fitted through
# caret on the training split with RMSE as the selection metric.
lin_reg_01 <- train(
  price ~ area,
  method = "lm",
  metric = "RMSE",
  data = train_data,
  na.action = na.pass
)
summary(lin_reg_01) %>% print()

# Predictions on the held-out rows, used by the comparison at the end.
price_pred_01 <- predict(lin_reg_01, newdata = test_data)


In [None]:
# Multiple linear regression: price explained by every other column of the
# training split, fitted through caret with RMSE as the selection metric.
lin_reg_02 <- train(
  price ~ .,
  method = "lm",
  metric = "RMSE",
  data = train_data,
  na.action = na.pass
)
summary(lin_reg_02) %>% print()

# Predictions on the held-out rows, used by the comparison at the end.
price_pred_02 <- predict(lin_reg_02, newdata = test_data)


In [None]:
# K-nearest-neighbours regression of price on every other column of the
# training split, fitted through caret with RMSE as the selection metric.
knn_reg <- train(
  price ~ .,
  data = train_data,
  method = "knn",
  metric = "RMSE",
  na.action = na.pass
)

# Print the train object itself: it reports the resampling results for the
# candidate values of k and the value that was selected. summary() on a
# caret train object defers to the final model, and for KNN that only yields
# a Length/Class/Mode listing of the object's components.
print(knn_reg)

# Predictions on the held-out rows, used by the comparison at the end.
price_pred_03 <- predict(knn_reg, test_data)


In [None]:
# Regression tree (rpart) of price on every other column of the training
# split, fitted through caret with RMSE as the selection metric.
tree_reg <- train(
  price ~ .,
  method = "rpart",
  metric = "RMSE",
  data = train_data,
  na.action = na.pass
)

# Predictions on the held-out rows, used by the comparison at the end.
price_pred_04 <- predict(tree_reg, newdata = test_data)


In [None]:
# Test-set observations (blue points) with the predictions of the simple
# (red) and the multiple (green) linear regression drawn over area.
# The data and the x mapping are declared once and inherited by every layer.
ggplot(test_data, aes(x = area)) +
  geom_point(aes(y = price), color = "blue") +
  geom_line(aes(y = predict(lin_reg_01, test_data)), color = "red") +
  geom_line(aes(y = predict(lin_reg_02, test_data)), color = "green") +
  labs(title = "Regresión Lineal", x = "area", y = "price") +
  theme_minimal()


In [None]:
# Plot the observed (area, price) points of both splits in blue and overlay
# the predictions of `model` for each split as a line.
#
# model:      fitted model accepted by predict(), e.g. a caret train object.
# title:      plot title.
# line_color: colour of the two prediction lines.
#
# Reads `train_data` and `test_data` from the enclosing environment and
# returns the ggplot object.
plot_train_test_fit <- function(model, title, line_color) {
  # Predict up front so the layers below only reference plain vectors.
  train_fit <- predict(model, train_data)
  test_fit <- predict(model, test_data)

  ggplot() +
    geom_point(data = train_data, aes(x = area, y = price), color = "blue") +
    geom_point(data = test_data, aes(x = area, y = price), color = "blue") +
    geom_line(
      data = train_data,
      aes(x = area, y = train_fit),
      color = line_color
    ) +
    geom_line(
      data = test_data,
      aes(x = area, y = test_fit),
      color = line_color
    ) +
    labs(title = title, x = "area", y = "price") +
    theme_minimal()
}

plot_train_test_fit(
  lin_reg_01, "Simple Linear Regression (Train + Test)", "red"
)

# This plot shows the multiple regression model, so it is titled as such
# (it previously reused the "Simple Linear Regression" title).
plot_train_test_fit(
  lin_reg_02, "Multiple Linear Regression (Train + Test)", "green"
)

plot_train_test_fit(knn_reg, "KNN (Train + Test)", "orange")

plot_train_test_fit(tree_reg, "Tree (Train + Test)", "black")


In [None]:
# Compare the four models on the test split: one call to the project helper
# shared$measures$accuracy() per model, stacked with rbind() and ordered by
# ascending RMSE.
# NOTE(review): presumably each call returns a one-row data frame that has an
# `rmse` column (arrange() below relies on it) - verify against the helper.
rbind(
  shared$measures$accuracy(
    "Simple Linear Regression", price_pred_01, test_data$price
  ),
  shared$measures$accuracy(
    "Multiple Linear Regression", price_pred_02, test_data$price
  ),
  shared$measures$accuracy("KNN", price_pred_03, test_data$price),
  shared$measures$accuracy("Tree", price_pred_04, test_data$price)
) %>% arrange(as.double(rmse))
