In [None]:
library(modules)
library(tidyverse)
library(lubridate)
library(caret)
library(ggplot2)
library(dplyr)

#' Load a two-feature classification CSV and derive extra numeric features.
#'
#' @param url Path or URL handed to `read.csv()`. The file must provide the
#'   columns `x`, `y` and `Class`.
#' @return A data.frame without the first two rows of the file, in which
#'   `Class` has been replaced by the factor column `clase`, `x`/`y` are
#'   doubles, and the derived columns `x_square`, `y_square`, `radius`,
#'   `distance_to_mean`, `product`, `x_ln` and `y_ln` have been added.
process_dataframe <- function(url) {
    dataframe <- read.csv(url)

    # Discard the first two rows.
    # NOTE(review): presumably these are extra header/metadata rows in the
    # source files -- confirm. Negative indexing is used instead of
    # 3:nrow(dataframe) because that sequence counts backwards (3, 2, ...)
    # when the file has fewer than three rows.
    dataframe <- dataframe[-seq_len(2), , drop = FALSE]

    # Rename the label column to `clase` and store it as a factor.
    dataframe$clase <- as.factor(dataframe$Class)
    dataframe$Class <- NULL

    # Coordinates may not have been read as doubles; coerce them explicitly.
    dataframe$x <- as.double(dataframe$x)
    dataframe$y <- as.double(dataframe$y)

    dataframe$x_square <- dataframe$x^2
    dataframe$y_square <- dataframe$y^2

    # Centroid of the points that remain after dropping the first two rows.
    x_mean <- mean(dataframe$x)
    y_mean <- mean(dataframe$y)

    # Distance to the origin and to the centroid, respectively.
    dataframe$radius <- sqrt(dataframe$x_square + dataframe$y_square)
    dataframe$distance_to_mean <- sqrt(
        (dataframe$x - x_mean)^2 + (dataframe$y - y_mean)^2
    )
    dataframe$product <- dataframe$x * dataframe$y

    # NOTE(review): log() yields NaN (with a warning) for negative values and
    # -Inf for zero -- confirm the inputs are strictly positive, otherwise
    # these two columns will contain non-finite values.
    dataframe$x_ln <- log(dataframe$x)
    dataframe$y_ln <- log(dataframe$y)

    dataframe
}

#' Draw a scatter plot of the points coloured by class.
#'
#' @param dataframe A data.frame with the columns `x`, `y` and `clase`.
#' @return The value of `print()` on the ggplot object; the function is
#'   called for its side effect of rendering the plot.
plot_dataframe <- function(dataframe) {
    # Columns are referenced by bare name: aes() resolves them inside
    # `dataframe`, whereas `dataframe$x` triggers ggplot2's "use of `$` is
    # discouraged" warning and leaks the expression into the legend title.
    scatter <- ggplot(dataframe, aes(x = x, y = y, color = clase)) +
        geom_point() +
        labs(title = "Distribución de las Clases", x = "x", y = "y") +
        theme_minimal()

    # Explicit print() is required for the plot to render from a function.
    print(scatter)
}

#' Fit two caret models with the same method: one on `x` and `y` only, and
#' one on every column of the data.
#'
#' @param dataframe Training data; must contain `clase`, `x` and `y`.
#' @param method Model name passed to `train()` as `method`.
#' @param family Optional model family. When supplied and not `NULL` it is
#'   forwarded to `train()`, which passes extra arguments on to the
#'   underlying fitting routine. Previously this argument was silently
#'   ignored.
#' @param trControl Resampling control object, or `NULL` to use
#'   `trainControl()`, the default documented for `train()`.
#' @param tuneGrid Tuning grid passed to `train()`; may be `NULL`.
#' @return A list with the fitted models `original` (`clase ~ x + y`) and
#'   `completo` (`clase ~ .`).
train_models <- function(dataframe, method, family, trControl, tuneGrid) {
    # Same behaviour as omitting the argument, but lets both fits share a
    # single call site instead of duplicating every train() call.
    if (is.null(trControl)) {
        trControl <- trainControl()
    }

    # `family` has no default in the signature, so guard against it being
    # missing as well as NULL.
    forward_family <- !missing(family) && !is.null(family)

    # Fit one model for the given formula with the shared settings.
    fit_one <- function(model_formula) {
        if (forward_family) {
            train(
                model_formula,
                data = dataframe,
                method = method,
                trControl = trControl,
                tuneGrid = tuneGrid,
                family = family
            )
        } else {
            train(
                model_formula,
                data = dataframe,
                method = method,
                trControl = trControl,
                tuneGrid = tuneGrid
            )
        }
    }

    list(
        original = fit_one(clase ~ x + y),
        completo = fit_one(clase ~ .)
    )
}

#' Split a data set into training and test subsets.
#'
#' The row selection comes from `createDataPartition()` on the `clase`
#' column with `p = 0.7`. The random seed is fixed at 123 before sampling.
#'
#' @param dataframe A data.frame with a `clase` column.
#' @return A list with the elements `train` (selected rows) and `test`
#'   (all remaining rows).
part_dataframe <- function(dataframe) {
    # Fixed seed so the same rows are selected on every run.
    set.seed(123)

    filas_train <- createDataPartition(
        dataframe$clase,
        p = 0.7,
        list = FALSE
    )

    datos_train <- dataframe[filas_train, ]
    datos_test <- dataframe[-filas_train, ]

    list(train = datos_train, test = datos_test)
}

#' Report on both fitted models and predict the test set with each.
#'
#' Prints the summary of each model and the confusion matrix of its
#' predictions against `datos_test$clase`.
#'
#' @param models List with the fitted models `original` and `completo`.
#' @param datos_test Test data; must contain `clase` plus the predictors
#'   each model needs.
#' @return A list with the prediction vectors `original` and `completo`.
predict_models <- function(models, datos_test) {
    # Values are not auto-printed inside a function, so every report needs an
    # explicit print(); without it these results were computed and discarded.
    print(summary(models$original))
    print(summary(models$completo))

    prediccion_original <- predict(models$original, newdata = datos_test)
    print(confusionMatrix(prediccion_original, datos_test$clase))

    prediccion_completo <- predict(models$completo, newdata = datos_test)
    print(confusionMatrix(prediccion_completo, datos_test$clase))

    list(original = prediccion_original, completo = prediccion_completo)
}

#' Plot the data twice, once per model, with the true class as colour and
#' the predicted class as point shape.
#'
#' @param datos A data.frame with the columns `x`, `y` and `clase`.
#' @param predicciones List with the prediction vectors `original` and
#'   `completo`, mapped to the shape aesthetic of the first and second plot.
#' @return The value of `print()` on the second plot; the function is called
#'   for its side effect of rendering both plots.
plot_models <- function(datos, predicciones) {
    # Layers shared by both figures.
    etiquetas <- labs(title = "Distribución de las Clases", x = "x", y = "y")

    grafico_original <- ggplot(
        datos,
        aes(x = x, y = y, color = clase, shape = predicciones$original)
    )
    grafico_original <- grafico_original +
        geom_point() +
        etiquetas +
        theme_minimal()
    print(grafico_original)

    grafico_completo <- ggplot(
        datos,
        aes(x = x, y = y, color = clase, shape = predicciones$completo)
    )
    grafico_completo <- grafico_completo +
        geom_point() +
        etiquetas +
        theme_minimal()
    print(grafico_completo)
}

#' Compute precision, recall and F1 for one set of predictions.
#'
#' @param prediccion Predicted classes.
#' @param datos_test Test data; its `clase` column is the reference.
#' @param positive Label treated as the positive class. Defaults to `"C1"`,
#'   the value that used to be hard-coded.
#' @return A list with the elements `precision`, `recall` and `F1`.
evaluate_model <- function(prediccion, datos_test, positive = "C1") {
    referencia <- datos_test$clase

    precision <- posPredValue(prediccion, referencia, positive = positive)
    recall <- sensitivity(prediccion, referencia, positive = positive)

    denominador <- precision + recall
    if (!is.na(denominador) && denominador == 0) {
        # No true positives at all: the harmonic mean would be 0 / 0 (NaN);
        # report the conventional F1 of 0 instead.
        F1 <- 0
    } else {
        F1 <- (2 * precision * recall) / denominador
    }

    list(
        precision = precision,
        recall = recall,
        F1 = F1
    )
}

#' Print the evaluation metrics of both models, each under its own label.
#'
#' @param predicciones List with the prediction vectors `original` and
#'   `completo`.
#' @param datos_test Test data handed to `evaluate_model()` as the reference.
#' @return Invisibly, the metrics printed last (those of `completo`).
evaluate_models <- function(predicciones, datos_test) {
    ultimo <- NULL
    for (nombre in c("original", "completo")) {
        print(nombre)
        # exact = FALSE keeps the partial-matching lookup that `$` performs.
        ultimo <- print(
            evaluate_model(predicciones[[nombre, exact = FALSE]], datos_test)
        )
    }
    invisible(ultimo)
}

#' Train, predict, plot and evaluate one modelling method end to end.
#'
#' @param parted_dataframe List with the elements `train` and `test`.
#' @param method Model name; printed first, then handed to `train_models()`.
#' @param family,trControl,tuneGrid Optional settings handed unchanged to
#'   `train_models()`.
#' @return The predictions returned by `predict_models()` for the test set.
process_model <- function(parted_dataframe, method, family = NULL, trControl = NULL, tuneGrid = NULL) {
    print(method)

    datos_train <- parted_dataframe$train
    datos_test <- parted_dataframe$test

    # Fit on the training split, then predict and report on the test split.
    modelos <- train_models(datos_train, method, family, trControl, tuneGrid)
    predicciones <- predict_models(modelos, datos_test)

    plot_models(datos_test, predicciones)
    evaluate_models(predicciones, datos_test)

    predicciones
}

#' Run the whole pipeline for one data set and save the test-set predictions.
#'
#' Loads and plots the data, splits it into training and test sets, runs the
#' "glm", "knn" and "rpart" methods, appends each model's predictions to the
#' test set and writes the result to `../outputs/<name>.csv`.
#'
#' @param name Label printed at the start and used as the output file name.
#' @param url Location of the input CSV, handed to `process_dataframe()`.
#' @return The value of `write.csv()`; the function is called for its side
#'   effects (console output, plots and the CSV file).
process_file <- function(name, url) {
    print(name)
    dataframe <- process_dataframe(url)

    # Values are not auto-printed inside a function; print them explicitly so
    # the preview and the summary actually show up.
    print(head(dataframe))
    print(summary(dataframe))
    plot_dataframe(dataframe)

    parted_dataframe <- part_dataframe(dataframe)

    # method = "none": fit once, without resampling.
    control <- trainControl(method = "none")
    predicciones_glm <- process_model(
        parted_dataframe, "glm",
        family = "binomial", trControl = control
    )
    predicciones_knn <- process_model(
        parted_dataframe, "knn",
        trControl = control, tuneGrid = expand.grid(k = 5)
    )
    # No trControl/tuneGrid here, so train() falls back to its own defaults.
    predicciones_tree <- process_model(parted_dataframe, "rpart")

    # The "_gml" column names are kept as they are: they are part of the
    # schema of the CSV this function writes.
    parted_dataframe$test <- parted_dataframe$test %>%
        mutate(
            prediccion_original_gml = predicciones_glm$original,
            prediccion_completo_gml = predicciones_glm$completo,
            prediccion_original_knn = predicciones_knn$original,
            prediccion_completo_knn = predicciones_knn$completo,
            prediccion_original_tree = predicciones_tree$original,
            prediccion_completo_tree = predicciones_tree$completo
        )

    print(head(parted_dataframe$test))

    # Build the path from components (no doubled separator) and make sure the
    # target directory exists so the run does not fail at the final write.
    output_dir <- file.path("..", "outputs")
    dir.create(output_dir, showWarnings = FALSE, recursive = TRUE)
    write.csv(parted_dataframe$test, file.path(output_dir, paste0(name, ".csv")))
}


In [None]:
# Run the pipeline once per data set (display name -> CSV location).
fuentes <- c(
    Circles = "../sources/circles.csv",
    Spirals = "../sources/spirals.csv"
)
for (nombre in names(fuentes)) {
    process_file(nombre, fuentes[[nombre]])
}
