Skip to content
Permalink
master
Go to file
 
 
Cannot retrieve contributors at this time
354 lines (325 sloc) 11.3 KB
globalVariables(c(
"term", ".resid", "AIC", "BIC", "deviance", "df.residual", "logLik", "ID",
"mse", "rmse", "residual", "r_squared", "adj_r_squared", "conf_low",
"conf_high"
))
#' Get regression table
#'
#' Output regression table for an \code{lm()} regression in "tidy" format. This function
#' is a wrapper function for \code{broom::tidy()} and includes confidence
#' intervals in the output table by default.
#'
#' @param model an \code{lm()} model object
#' @param digits number of digits precision in output table
#' @param print If TRUE, return in print format suitable for R Markdown
#'
#' @return A tibble-formatted regression table along with lower and upper end
#' points of all confidence intervals for all parameters \code{lower_ci} and
#' \code{upper_ci}.
#' @importFrom stats lm
#' @importFrom stats predict
#' @importFrom formula.tools lhs
#' @importFrom formula.tools rhs
#' @importFrom broom tidy
#' @importFrom tibble as_tibble
#' @importFrom janitor clean_names
#' @importFrom knitr kable
#' @export
#' @seealso \code{\link[broom:reexports]{tidy}}, \code{\link{get_regression_points}}, \code{\link{get_regression_summaries}}
#'
#' @examples
#' library(moderndive)
#'
#' # Fit lm() regression:
#' mpg_model <- lm(mpg ~ cyl, data = mtcars)
#'
#' # Get regression table:
#' get_regression_table(mpg_model)
get_regression_table <- function(model, digits = 3, print = FALSE) {
# Check inputs
input_checks(model, digits, print)
# Define outcome and explanatory/predictor variables
outcome_variable <- formula(model) %>%
lhs() %>%
all.vars()
explanatory_variable <- formula(model) %>%
rhs() %>%
all.vars()
# Create output tibble
regression_table <- model %>%
tidy(conf.int = TRUE) %>%
mutate_if(is.numeric, round, digits = digits) %>%
mutate(term = ifelse(term == "(Intercept)", "intercept", term)) %>%
as_tibble() %>%
clean_names() %>%
rename(
lower_ci = conf_low,
upper_ci = conf_high
)
# Transform to markdown
if (print) {
regression_table <- regression_table %>%
kable()
}
return(regression_table)
}
#' Get regression points
#'
#' Output information on each point/observation used in an \code{lm()} regression in
#' "tidy" format. This function is a wrapper function for \code{broom::augment()}
#' and renames the variables to have more intuitive names.
#'
#' @inheritParams get_regression_table
#' @param newdata A new data frame of points/observations to apply \code{model} to
#' obtain new fitted values and/or predicted values y-hat. Note the format of
#' \code{newdata} must match the format of the original \code{data} used to fit
#' \code{model}.
#' @param ID A string indicating which variable in either the original data used to fit
#' \code{model} or \code{newdata} should be used as
#' an identification variable to distinguish the observational units
#' in each row. This variable will be the left-most variable in the output data
#' frame. If \code{ID} is unspecified, a column \code{ID} with values 1 through the number of
#' rows is returned as the identification variable.
#'
#' @return A tibble-formatted regression table of outcome/response variable,
#' all explanatory/predictor variables, the fitted/predicted value, and residual.
#' @importFrom dplyr select
#' @importFrom dplyr rename_at
#' @importFrom dplyr vars
#' @importFrom dplyr rename
#' @importFrom dplyr mutate
#' @importFrom dplyr pull
#' @importFrom dplyr everything
#' @importFrom dplyr mutate_if
#' @importFrom dplyr summarise
#' @importFrom stats formula
#' @importFrom formula.tools lhs
#' @importFrom formula.tools rhs
#' @importFrom broom augment
#' @importFrom tibble as_tibble
#' @importFrom janitor clean_names
#' @importFrom stringr str_c
#' @importFrom knitr kable
#' @importFrom rlang sym
#' @importFrom rlang ":="
#' @export
#' @seealso \code{\link[broom:reexports]{augment}}, \code{\link{get_regression_table}}, \code{\link{get_regression_summaries}}
#'
#' @examples
#' library(dplyr)
#' library(tibble)
#'
#' # Convert rownames to column
#' mtcars <- mtcars %>%
#' rownames_to_column(var = "automobile")
#'
#' # Fit lm() regression:
#' mpg_model <- lm(mpg ~ cyl, data = mtcars)
#'
#' # Get information on all points in regression:
#' get_regression_points(mpg_model, ID = "automobile")
#'
#' # Create training and test set based on mtcars:
#' training_set <- mtcars %>%
#' sample_frac(0.5)
#' test_set <- mtcars %>%
#' anti_join(training_set, by = "automobile")
#'
#' # Fit model to training set:
#' mpg_model_train <- lm(mpg ~ cyl, data = training_set)
#'
#' # Make predictions on test set:
#' get_regression_points(mpg_model_train, newdata = test_set, ID = "automobile")
get_regression_points <-
function(model, digits = 3, print = FALSE, newdata = NULL, ID = NULL) {
# Check inputs
input_checks(model, digits, print)
if (!is.null(ID)) {
check_character(ID)
}
if (!is.null(newdata)) {
check_data_frame(newdata)
}
# Define outcome and explanatory/predictor variables
outcome_variable <- formula(model) %>%
lhs() %>%
all.vars()
outcome_variable_hat <- str_c(outcome_variable, "_hat")
explanatory_variable <- formula(model) %>%
rhs() %>%
all.vars()
# Compute all fitted/predicted values and residuals for three possible
# cases/scenarios
if (is.null(newdata)) {
# Case 1: For the same data set used to fit model, compute fitted values
# and residuals
regression_points <- model %>%
augment() %>%
select(!!c(outcome_variable, explanatory_variable, ".fitted", ".resid")) %>%
rename_at(vars(".fitted"), ~outcome_variable_hat) %>%
rename(residual = .resid)
} else {
# Two cases when we wanted to return point information on a new data set,
# newdata, different than the one used to fit the model with:
if (outcome_variable %in% names(newdata)) {
# Case 2.a) If outcome variable is included, we can compute both fitted
# values and residuals.
regression_points <- newdata %>%
select(!!c(outcome_variable, explanatory_variable)) %>%
# Compute fitted values
mutate(y_hat = predict(model, newdata = newdata)) %>%
rename_at(vars("y_hat"), ~outcome_variable_hat) %>%
# Compute residuals
mutate(residual := !!sym(outcome_variable) - !!sym(outcome_variable_hat))
} else {
# Case 2.b) If outcome variable is not included, we can only return
# predicted values and not the residuals. This corresponds to typical
# prediction scenario.
regression_points <- model %>%
# Compute fitted values:
augment(newdata = newdata) %>%
select(!!c(explanatory_variable, ".fitted")) %>%
rename_at(vars(".fitted"), ~ str_c(outcome_variable, "_hat"))
}
}
# Set identification variable for three possible cases/scenarios
if (is.null(ID)) {
# Case 1: If ID argument is not specified, set as ID variable as 1 through
# number of rows
regression_points <- regression_points %>%
mutate(ID = 1:n()) %>%
select(ID, everything())
} else {
# Two cases when ID argument is specified:
if (is.null(newdata)) {
# Case 2.a) When computing fitted values and residuals for the same data
# used to fit the model, extract ID variable from original model fit.
identification_variable <- eval(model$call$data, environment(formula(model))) %>%
pull(!!ID)
} else {
# Case 2.b) When computing predicted values for a new dataset newdata than
# the one used to fit the model, extract ID variable from newdata.
identification_variable <- newdata %>%
pull(!!ID)
}
# Set ID variable
regression_points <- regression_points %>%
mutate(ID = identification_variable) %>%
select(ID, everything()) %>%
rename_at(vars("ID"), ~ID)
}
# Final clean-up
regression_points <- regression_points %>%
mutate_if(is.double, round, digits = digits) %>%
as_tibble()
# Transform to markdown
if (print) {
regression_points <- regression_points %>%
kable()
}
return(regression_points)
}
#' Get regression summary values
#'
#' Output scalar summary statistics for an \code{lm()} regression in "tidy"
#' format. This function is a wrapper function for \code{broom::glance()}.
#'
#' @inheritParams get_regression_table
#'
#' @return A single-row tibble with regression summaries. Ex: \code{r_squared} and \code{mse}.
#' @importFrom dplyr select
#' @importFrom dplyr rename_at
#' @importFrom dplyr vars
#' @importFrom dplyr rename
#' @importFrom dplyr mutate
#' @importFrom dplyr everything
#' @importFrom dplyr mutate_if
#' @importFrom dplyr summarise
#' @importFrom dplyr bind_cols
#' @importFrom dplyr n
#' @importFrom stats formula
#' @importFrom formula.tools lhs
#' @importFrom formula.tools rhs
#' @importFrom broom glance
#' @importFrom broom augment
#' @importFrom tibble as_tibble
#' @importFrom janitor clean_names
#' @importFrom knitr kable
#' @export
#' @seealso \code{\link[broom:reexports]{glance}}, \code{\link{get_regression_table}}, \code{\link{get_regression_points}}
#'
#' @examples
#' library(moderndive)
#'
#' # Fit lm() regression:
#' mpg_model <- lm(mpg ~ cyl, data = mtcars)
#'
#' # Get regression summaries:
#' get_regression_summaries(mpg_model)
get_regression_summaries <-
function(model, digits = 3, print = FALSE) {
# Check inputs
input_checks(model, digits, print)
# Define outcome and explanatory/predictor variables
outcome_variable <- formula(model) %>%
lhs() %>%
all.vars()
explanatory_variable <- formula(model) %>%
rhs() %>%
all.vars()
# Compute mean-squared error and root mean-squared error
mse_and_rmse <- model %>%
augment() %>%
select(!!c(outcome_variable, explanatory_variable, ".fitted", ".resid")) %>%
rename_at(vars(".fitted"), ~ str_c(outcome_variable, "_hat")) %>%
rename(residual = .resid) %>%
summarise(mse = mean(residual^2), rmse = sqrt(mse))
# Create output tibble
regression_summaries <- model %>%
glance() %>%
mutate_if(is.numeric, round, digits = digits) %>%
select(-c(AIC, BIC, deviance, df.residual, logLik)) %>%
as_tibble() %>%
clean_names() %>%
bind_cols(mse_and_rmse) %>%
select(r_squared, adj_r_squared, mse, rmse, everything())
# Transform to markdown
if (print) {
regression_summaries <- regression_summaries %>%
kable()
}
return(regression_summaries)
}
# Check input functions ----
input_checks <- function(model, digits = 3, print = FALSE) {
# Since the `"glm"` class also contains the `"lm"` class
if (length(class(model)) != 1 | !("lm" %in% class(model))) {
stop(paste(
"Only simple linear regression",
"models are supported. Try again using only `lm()`",
"models as appropriate."
))
}
check_numeric(digits)
check_logical(print)
}
check_numeric <- function(input) {
if (!is.numeric(input)) {
stop("The input entry must be numeric.")
}
}
check_logical <- function(input) {
if (!is.logical(input)) {
stop("The input must be logical.")
}
}
check_character <- function(input) {
if (!is.character(input)) {
stop("The input must be a character.")
}
}
check_data_frame <- function(input) {
if (!is.data.frame(input)) {
stop("The input must be a data frame.")
}
}
You can’t perform that action at this time.