Join GitHub today
GitHub is home to over 50 million developers working together to host and review code, manage projects, and build software together.
Sign up| globalVariables(c( | |
| "term", ".resid", "AIC", "BIC", "deviance", "df.residual", "logLik", "ID", | |
| "mse", "rmse", "residual", "r_squared", "adj_r_squared", "conf_low", | |
| "conf_high" | |
| )) | |
| #' Get regression table | |
| #' | |
| #' Output regression table for an \code{lm()} regression in "tidy" format. This function | |
| #' is a wrapper function for \code{broom::tidy()} and includes confidence | |
| #' intervals in the output table by default. | |
| #' | |
| #' @param model an \code{lm()} model object | |
| #' @param digits number of digits precision in output table | |
| #' @param print If TRUE, return in print format suitable for R Markdown | |
| #' | |
| #' @return A tibble-formatted regression table along with lower and upper end | |
| #' points of all confidence intervals for all parameters \code{lower_ci} and | |
| #' \code{upper_ci}. | |
| #' @importFrom stats lm | |
| #' @importFrom stats predict | |
| #' @importFrom formula.tools lhs | |
| #' @importFrom formula.tools rhs | |
| #' @importFrom broom tidy | |
| #' @importFrom tibble as_tibble | |
| #' @importFrom janitor clean_names | |
| #' @importFrom knitr kable | |
| #' @export | |
| #' @seealso \code{\link[broom:reexports]{tidy}}, \code{\link{get_regression_points}}, \code{\link{get_regression_summaries}} | |
| #' | |
| #' @examples | |
| #' library(moderndive) | |
| #' | |
| #' # Fit lm() regression: | |
| #' mpg_model <- lm(mpg ~ cyl, data = mtcars) | |
| #' | |
| #' # Get regression table: | |
| #' get_regression_table(mpg_model) | |
| get_regression_table <- function(model, digits = 3, print = FALSE) { | |
| # Check inputs | |
| input_checks(model, digits, print) | |
| # Define outcome and explanatory/predictor variables | |
| outcome_variable <- formula(model) %>% | |
| lhs() %>% | |
| all.vars() | |
| explanatory_variable <- formula(model) %>% | |
| rhs() %>% | |
| all.vars() | |
| # Create output tibble | |
| regression_table <- model %>% | |
| tidy(conf.int = TRUE) %>% | |
| mutate_if(is.numeric, round, digits = digits) %>% | |
| mutate(term = ifelse(term == "(Intercept)", "intercept", term)) %>% | |
| as_tibble() %>% | |
| clean_names() %>% | |
| rename( | |
| lower_ci = conf_low, | |
| upper_ci = conf_high | |
| ) | |
| # Transform to markdown | |
| if (print) { | |
| regression_table <- regression_table %>% | |
| kable() | |
| } | |
| return(regression_table) | |
| } | |
| #' Get regression points | |
| #' | |
| #' Output information on each point/observation used in an \code{lm()} regression in | |
| #' "tidy" format. This function is a wrapper function for \code{broom::augment()} | |
| #' and renames the variables to have more intuitive names. | |
| #' | |
| #' @inheritParams get_regression_table | |
| #' @param newdata A new data frame of points/observations to apply \code{model} to | |
| #' obtain new fitted values and/or predicted values y-hat. Note the format of | |
| #' \code{newdata} must match the format of the original \code{data} used to fit | |
| #' \code{model}. | |
| #' @param ID A string indicating which variable in either the original data used to fit | |
| #' \code{model} or \code{newdata} should be used as | |
| #' an identification variable to distinguish the observational units | |
| #' in each row. This variable will be the left-most variable in the output data | |
| #' frame. If \code{ID} is unspecified, a column \code{ID} with values 1 through the number of | |
| #' rows is returned as the identification variable. | |
| #' | |
| #' @return A tibble-formatted regression table of outcome/response variable, | |
| #' all explanatory/predictor variables, the fitted/predicted value, and residual. | |
| #' @importFrom dplyr select | |
| #' @importFrom dplyr rename_at | |
| #' @importFrom dplyr vars | |
| #' @importFrom dplyr rename | |
| #' @importFrom dplyr mutate | |
| #' @importFrom dplyr pull | |
| #' @importFrom dplyr everything | |
| #' @importFrom dplyr mutate_if | |
| #' @importFrom dplyr summarise | |
| #' @importFrom stats formula | |
| #' @importFrom formula.tools lhs | |
| #' @importFrom formula.tools rhs | |
| #' @importFrom broom augment | |
| #' @importFrom tibble as_tibble | |
| #' @importFrom janitor clean_names | |
| #' @importFrom stringr str_c | |
| #' @importFrom knitr kable | |
| #' @importFrom rlang sym | |
| #' @importFrom rlang ":=" | |
| #' @export | |
| #' @seealso \code{\link[broom:reexports]{augment}}, \code{\link{get_regression_table}}, \code{\link{get_regression_summaries}} | |
| #' | |
| #' @examples | |
| #' library(dplyr) | |
| #' library(tibble) | |
| #' | |
| #' # Convert rownames to column | |
| #' mtcars <- mtcars %>% | |
| #' rownames_to_column(var = "automobile") | |
| #' | |
| #' # Fit lm() regression: | |
| #' mpg_model <- lm(mpg ~ cyl, data = mtcars) | |
| #' | |
| #' # Get information on all points in regression: | |
| #' get_regression_points(mpg_model, ID = "automobile") | |
| #' | |
| #' # Create training and test set based on mtcars: | |
| #' training_set <- mtcars %>% | |
| #' sample_frac(0.5) | |
| #' test_set <- mtcars %>% | |
| #' anti_join(training_set, by = "automobile") | |
| #' | |
| #' # Fit model to training set: | |
| #' mpg_model_train <- lm(mpg ~ cyl, data = training_set) | |
| #' | |
| #' # Make predictions on test set: | |
| #' get_regression_points(mpg_model_train, newdata = test_set, ID = "automobile") | |
| get_regression_points <- | |
| function(model, digits = 3, print = FALSE, newdata = NULL, ID = NULL) { | |
| # Check inputs | |
| input_checks(model, digits, print) | |
| if (!is.null(ID)) { | |
| check_character(ID) | |
| } | |
| if (!is.null(newdata)) { | |
| check_data_frame(newdata) | |
| } | |
| # Define outcome and explanatory/predictor variables | |
| outcome_variable <- formula(model) %>% | |
| lhs() %>% | |
| all.vars() | |
| outcome_variable_hat <- str_c(outcome_variable, "_hat") | |
| explanatory_variable <- formula(model) %>% | |
| rhs() %>% | |
| all.vars() | |
| # Compute all fitted/predicted values and residuals for three possible | |
| # cases/scenarios | |
| if (is.null(newdata)) { | |
| # Case 1: For the same data set used to fit model, compute fitted values | |
| # and residuals | |
| regression_points <- model %>% | |
| augment() %>% | |
| select(!!c(outcome_variable, explanatory_variable, ".fitted", ".resid")) %>% | |
| rename_at(vars(".fitted"), ~outcome_variable_hat) %>% | |
| rename(residual = .resid) | |
| } else { | |
| # Two cases when we wanted to return point information on a new data set, | |
| # newdata, different than the one used to fit the model with: | |
| if (outcome_variable %in% names(newdata)) { | |
| # Case 2.a) If outcome variable is included, we can compute both fitted | |
| # values and residuals. | |
| regression_points <- newdata %>% | |
| select(!!c(outcome_variable, explanatory_variable)) %>% | |
| # Compute fitted values | |
| mutate(y_hat = predict(model, newdata = newdata)) %>% | |
| rename_at(vars("y_hat"), ~outcome_variable_hat) %>% | |
| # Compute residuals | |
| mutate(residual := !!sym(outcome_variable) - !!sym(outcome_variable_hat)) | |
| } else { | |
| # Case 2.b) If outcome variable is not included, we can only return | |
| # predicted values and not the residuals. This corresponds to typical | |
| # prediction scenario. | |
| regression_points <- model %>% | |
| # Compute fitted values: | |
| augment(newdata = newdata) %>% | |
| select(!!c(explanatory_variable, ".fitted")) %>% | |
| rename_at(vars(".fitted"), ~ str_c(outcome_variable, "_hat")) | |
| } | |
| } | |
| # Set identification variable for three possible cases/scenarios | |
| if (is.null(ID)) { | |
| # Case 1: If ID argument is not specified, set as ID variable as 1 through | |
| # number of rows | |
| regression_points <- regression_points %>% | |
| mutate(ID = 1:n()) %>% | |
| select(ID, everything()) | |
| } else { | |
| # Two cases when ID argument is specified: | |
| if (is.null(newdata)) { | |
| # Case 2.a) When computing fitted values and residuals for the same data | |
| # used to fit the model, extract ID variable from original model fit. | |
| identification_variable <- eval(model$call$data, environment(formula(model))) %>% | |
| pull(!!ID) | |
| } else { | |
| # Case 2.b) When computing predicted values for a new dataset newdata than | |
| # the one used to fit the model, extract ID variable from newdata. | |
| identification_variable <- newdata %>% | |
| pull(!!ID) | |
| } | |
| # Set ID variable | |
| regression_points <- regression_points %>% | |
| mutate(ID = identification_variable) %>% | |
| select(ID, everything()) %>% | |
| rename_at(vars("ID"), ~ID) | |
| } | |
| # Final clean-up | |
| regression_points <- regression_points %>% | |
| mutate_if(is.double, round, digits = digits) %>% | |
| as_tibble() | |
| # Transform to markdown | |
| if (print) { | |
| regression_points <- regression_points %>% | |
| kable() | |
| } | |
| return(regression_points) | |
| } | |
| #' Get regression summary values | |
| #' | |
| #' Output scalar summary statistics for an \code{lm()} regression in "tidy" | |
| #' format. This function is a wrapper function for \code{broom::glance()}. | |
| #' | |
| #' @inheritParams get_regression_table | |
| #' | |
| #' @return A single-row tibble with regression summaries. Ex: \code{r_squared} and \code{mse}. | |
| #' @importFrom dplyr select | |
| #' @importFrom dplyr rename_at | |
| #' @importFrom dplyr vars | |
| #' @importFrom dplyr rename | |
| #' @importFrom dplyr mutate | |
| #' @importFrom dplyr everything | |
| #' @importFrom dplyr mutate_if | |
| #' @importFrom dplyr summarise | |
| #' @importFrom dplyr bind_cols | |
| #' @importFrom dplyr n | |
| #' @importFrom stats formula | |
| #' @importFrom formula.tools lhs | |
| #' @importFrom formula.tools rhs | |
| #' @importFrom broom glance | |
| #' @importFrom broom augment | |
| #' @importFrom tibble as_tibble | |
| #' @importFrom janitor clean_names | |
| #' @importFrom knitr kable | |
| #' @export | |
| #' @seealso \code{\link[broom:reexports]{glance}}, \code{\link{get_regression_table}}, \code{\link{get_regression_points}} | |
| #' | |
| #' @examples | |
| #' library(moderndive) | |
| #' | |
| #' # Fit lm() regression: | |
| #' mpg_model <- lm(mpg ~ cyl, data = mtcars) | |
| #' | |
| #' # Get regression summaries: | |
| #' get_regression_summaries(mpg_model) | |
| get_regression_summaries <- | |
| function(model, digits = 3, print = FALSE) { | |
| # Check inputs | |
| input_checks(model, digits, print) | |
| # Define outcome and explanatory/predictor variables | |
| outcome_variable <- formula(model) %>% | |
| lhs() %>% | |
| all.vars() | |
| explanatory_variable <- formula(model) %>% | |
| rhs() %>% | |
| all.vars() | |
| # Compute mean-squared error and root mean-squared error | |
| mse_and_rmse <- model %>% | |
| augment() %>% | |
| select(!!c(outcome_variable, explanatory_variable, ".fitted", ".resid")) %>% | |
| rename_at(vars(".fitted"), ~ str_c(outcome_variable, "_hat")) %>% | |
| rename(residual = .resid) %>% | |
| summarise(mse = mean(residual^2), rmse = sqrt(mse)) | |
| # Create output tibble | |
| regression_summaries <- model %>% | |
| glance() %>% | |
| mutate_if(is.numeric, round, digits = digits) %>% | |
| select(-c(AIC, BIC, deviance, df.residual, logLik)) %>% | |
| as_tibble() %>% | |
| clean_names() %>% | |
| bind_cols(mse_and_rmse) %>% | |
| select(r_squared, adj_r_squared, mse, rmse, everything()) | |
| # Transform to markdown | |
| if (print) { | |
| regression_summaries <- regression_summaries %>% | |
| kable() | |
| } | |
| return(regression_summaries) | |
| } | |
| # Check input functions ---- | |
| input_checks <- function(model, digits = 3, print = FALSE) { | |
| # Since the `"glm"` class also contains the `"lm"` class | |
| if (length(class(model)) != 1 | !("lm" %in% class(model))) { | |
| stop(paste( | |
| "Only simple linear regression", | |
| "models are supported. Try again using only `lm()`", | |
| "models as appropriate." | |
| )) | |
| } | |
| check_numeric(digits) | |
| check_logical(print) | |
| } | |
| check_numeric <- function(input) { | |
| if (!is.numeric(input)) { | |
| stop("The input entry must be numeric.") | |
| } | |
| } | |
| check_logical <- function(input) { | |
| if (!is.logical(input)) { | |
| stop("The input must be logical.") | |
| } | |
| } | |
| check_character <- function(input) { | |
| if (!is.character(input)) { | |
| stop("The input must be a character.") | |
| } | |
| } | |
| check_data_frame <- function(input) { | |
| if (!is.data.frame(input)) { | |
| stop("The input must be a data frame.") | |
| } | |
| } |