Skip to content
Permalink
Branch: master
Find file Copy path
Find file Copy path
Fetching contributors…
Cannot retrieve contributors at this time
262 lines (218 sloc) 9.67 KB
library(tidyverse)
library(stringr)
library(lubridate)
library(devtools)
library(usethis)
library(janitor)
library(openintro)
library(ggplot2movies)
#-------------------------------------------------------------------------------
# Datasets: Documented in R/datasets.R
#-------------------------------------------------------------------------------
# Random sample of 68 action and romance movies from ggplot2movies::movies.
# Set random number generator seed value for reproducible/replicable random
# sampling:
set.seed(2017)
movies_sample <- ggplot2movies::movies %>%
select(title, year, rating, Action, Romance) %>%
# Note that Action & Romance variables are binary. To remove any movies
# that are both Action & Romance, we will remove them:
filter(!(Action == 1 & Romance == 1)) %>%
# Create a new variable genre that specifies whether a movie is
# "Action", "Romance", or "Neither":
mutate(genre = case_when(
Action == 1 ~ "Action",
Romance == 1 ~ "Romance",
TRUE ~ "Neither")
) %>%
# We aren't really interested "Neither", so remove these rows:
filter(genre != "Neither") %>%
# Action & Romance columns variables are not needed anymore since info is in
# genre column, so remove these columns
select(-Action, -Romance) %>%
# Sample 68 rows
sample_n(68)
usethis::use_data(movies_sample, overwrite = TRUE)
# evals data from:
# https://cran.r-project.org/web/packages/openintro/openintro.pdf#page=66
set.seed(76)
promotions <- gender.discrimination %>%
as_tibble() %>%
mutate(decision = factor(decision, levels = c("not", "promoted")),
gender = factor(gender, levels = c("male", "female"))) %>%
arrange(desc(decision), gender) %>%
mutate(id = 1:n()) %>%
select(id, decision, gender)
usethis::use_data(promotions, overwrite = TRUE)
# one shuffle of promotions
set.seed(2019)
promotions_shuffled <- promotions %>%
mutate(gender = sample(gender))
usethis::use_data(promotions_shuffled, overwrite = TRUE)
# Massachusetts Public Schools Data: Student body, funding levels, and outcomes
# (SAT, MCAS, APs, college attendance) from Kaggle:
# https://www.kaggle.com/ndalziel/massachusetts-public-schools-data
MA_schools <-
read_csv("data-raw/MA_Public_Schools_2017.csv") %>%
clean_names() %>%
# This converts the numerical variable total_enrollment into a categorical one
# school_size by cutting it into three chunks:
mutate(school_size = cut_number(total_enrollment, n = 3)) %>%
# For aesthetic purposes we changed the levels of the school_size variable to be
# small, medium, and large
mutate(size = recode_factor(school_size,
"[0,341]" = "small",
"(341,541]" = "medium",
"(541,4.26e+03]" = "large")) %>%
# Next we filtered to only include schools that had 11th and 12th grade
# students. We do this because students in the 11th and 12th grade take the math
# SAT.
filter(x11_enrollment > 0 & x12_enrollment > 0) %>%
# 58 schools has NA's for average_sat_math, we remove them:
filter(!is.na(average_sat_math)) %>%
select(school_name, average_sat_math, perc_disadvan = percent_economically_disadvantaged, size)
usethis::use_data(MA_schools, overwrite = TRUE)
# Dunkin Donuts and Starbucks counts in 2016 for 1024 Eastern Massachusetts census tracts
DD_vs_SB <-
# Read in eastern MA census tract population counts. Source:
# https://github.com/DelaneyMoran/FinalProject/blob/master/data/MAincomedata.csv
read_csv("data-raw/MAincomedata.csv") %>%
select(Geo_FIPS, county = Geo_NAME, population = SE_T001_001) %>%
mutate(Geo_FIPS = as.double(Geo_FIPS)) %>%
separate(county, into = c("fluff", "county", "state"), sep = ",") %>%
mutate(county = str_sub(county, 2,)) %>%
separate(county, into = c("county", "fluff"), sep = " County") %>%
select(-c(fluff, state)) %>%
# Join with Dunkin Donuts and Starbucks counts
right_join(read_csv("data-raw/DD_vs_SB.csv"), by = "Geo_FIPS") %>%
mutate(
FIPS_county = as.character(Geo_FIPS),
FIPS_county = str_sub(FIPS_county, 1, 5)
) %>%
select(county, FIPS = Geo_FIPS, median_income = med_inc, population, dunkin_donuts = numDD, starbucks = numSB) %>%
gather(shop_type, shops, c(dunkin_donuts, starbucks)) %>%
arrange(county, FIPS)
usethis::use_data(DD_vs_SB, overwrite = TRUE)
# House price data from https://www.kaggle.com/harlfoxem/housesalesprediction
house_prices <- read_csv("data-raw/kc_house_data.csv") %>%
mutate(
date = ymd(date),
condition = factor(condition),
grade = factor(grade),
zipcode = factor(zipcode),
waterfront = ifelse(waterfront == 0, FALSE, TRUE)
)
usethis::use_data(house_prices, overwrite = TRUE)
# evals data from: https://www.openintro.org/stat/data/?data=evals
load("data-raw/evals.RData")
evals <- evals %>%
as_tibble() %>%
select(-starts_with("bty_m")) %>%
select(-starts_with("bty_f")) %>%
select(-c(cls_perc_eval, cls_credits, cls_profs)) %>%
mutate(ID = 1:n())
# ID 94 unique profs in this data. 94 value confirmed here:
# https://chance.amstat.org/2013/04/looking-good/
unique_profs <- evals %>%
select(rank, ethnicity, gender, language, age, bty_avg) %>%
distinct() %>%
mutate(prof_ID = 1:n())
# join
evals <- evals %>%
left_join(unique_profs, by = c("rank", "ethnicity", "gender", "language", "age", "bty_avg")) %>%
select(ID, prof_ID, score, age, bty_avg, gender, ethnicity, language, rank, starts_with("pic_"), everything())
usethis::use_data(evals, overwrite = TRUE)
# Data derived from the results of a study conducted
# on the Mythbusters television show on Discovery Network
# investigating whether yawning is contagious
# https://www.discovery.com/tv-shows/mythbusters/videos/is-yawning-contagious
group <- c(rep("control", 12), rep("seed", 24),
rep("control", 4), rep("seed", 10))
yawn <- c(rep("no", 36), rep("yes", 14))
mythbusters_yawn <- tibble::tibble(group, yawn) %>%
sample_n(50) %>%
mutate(subj = seq(1, 50)) %>%
select(subj, group, yawn)
usethis::use_data(mythbusters_yawn, overwrite = TRUE)
#-------------------------------------------------------------------------------
# Sampling bowl: Documented in R/bowl.R
#-------------------------------------------------------------------------------
# Sampling bowl used at Amherst College http://www.qualitytng.com/sampling-bowls/
set.seed(76)
N <- 2400
bowl <-
data_frame(
color = c(rep("red", 900), rep("white", N-900))
) %>%
sample_frac(1) %>%
mutate(
ball_ID = 1:N) %>%
select(ball_ID, everything())
usethis::use_data(bowl, overwrite = TRUE)
# 10 samples of size n=50 from
# https://github.com/moderndive/moderndive_book/blob/master/images/sampling_bowl.jpeg
bowl_samples <- read_csv("data-raw/sampling_responses.csv") %>%
mutate(n = red + white + green)
usethis::use_data(bowl_samples, overwrite = TRUE)
# 33 tactile samples of size n=50 from
# https://github.com/moderndive/moderndive_book/blob/master/images/sampling_bowl.jpeg
tactile_prop_red <- read_csv("data-raw/sampling_red_balls.csv")
usethis::use_data(tactile_prop_red, overwrite = TRUE)
# Ilyas and Yohan's shovel sample in Chapter 9 case study:
set.seed(76)
bowl_sample_1 <- c(rep("red", 21), rep("white", 50 - 21)) %>%
sample() %>%
tibble::tibble(color = .)
usethis::use_data(bowl_sample_1, overwrite = TRUE)
# Tactile samples from bowl
# Original Google Sheet here:
# https://docs.google.com/spreadsheets/d/1KgJYLiKZ7yhXUAvL4Xacchz3H_aNfCPmrFdFsmeb1VE/
bowl_samples <-
"https://docs.google.com/spreadsheets/d/e/2PACX-1vSEzMpfzZ-VC2sEUpc97d3IdzqEyMANrgG0jbzzRwpiNPJu1sNgO_oWopl5yctUzmn0N-8yHPcmjfIh/pub?gid=36230158&single=true&output=csv" %>%
read_csv() %>%
mutate(replicate = 1:n()) %>%
select(replicate, name = Name,
num_red = `How many of the balls in your shovel were red?`, everything()) %>%
# Transform
gather(ID, color, -c(replicate, name, num_red)) %>%
select(-c(num_red, ID)) %>%
arrange(name) %>%
mutate(color = tolower(color))
usethis::use_data(bowl_samples, overwrite = TRUE)
#-------------------------------------------------------------------------------
# Pennies: Documented in R/pennie.R
#-------------------------------------------------------------------------------
# Population of 800 pennies from
# https://www.statcrunch.com/app/index.php?dataid=301596
pennies <- read_csv("data-raw/population_of_pennies.csv")
usethis::use_data(pennies, overwrite = TRUE)
# A pseudorandom sample of the `pennies` tibble used
# in explaining bootstrapping
set.seed(2018)
orig_pennies_sample <- pennies %>%
sample_n(40)
usethis::use_data(orig_pennies_sample, overwrite = TRUE)
# Sample of 50 pennies from Florence Bank at the corner of Main Street and
# Pleasant/King Street in Northampton MA on Friday 2019/2/1
# Original Google Sheet here:
# https://docs.google.com/spreadsheets/d/1kG_s7LhGVusL-oFqWPHygX6cebOTzSERxbJXT3I2xoo/
pennies_sample <-
"https://docs.google.com/spreadsheets/d/e/2PACX-1vRtLeHU6j9PRTAJ0bRcUF2uVc1TzYeXd9cC0lwCRfBREy8POx6MgfVeK2CJU6emRKFn_51H-Z8H5YlS/pub?gid=0&single=true&output=csv" %>%
read_csv() %>%
mutate(ID = 1:n()) %>%
select(ID, year)
usethis::use_data(pennies_sample, overwrite = TRUE)
# Resamples of pennies_sample
# Original Google Sheet here:
# https://docs.google.com/spreadsheets/d/1y3kOsU_wDrDd5eiJbEtLeHT9L5SvpZb_TrzwFBsouk0/
pennies_resamples <-
"https://docs.google.com/spreadsheets/d/e/2PACX-1vS-8hCHL4Gt6KvtjlSlA42CC4eNPhN4tg7yM4NVQ1MRa1mIA0EUf3t0NThNrw5ctlBWjKUbQPYuevS6/pub?gid=0&single=true&output=csv" %>%
read_csv() %>%
select(-`Resampled penny #`) %>%
gather(name, year) %>%
group_by(name) %>%
nest() %>%
mutate(replicate = 1:n()) %>%
select(replicate, everything()) %>%
unnest()
usethis::use_data(pennies_resamples, overwrite = TRUE)
You can’t perform that action at this time.