In [1]:
# Title:  Handwritten Digits Dataset
# File:   HandwrittenDigits.R
# Course: Data Mining with R

# INSTALL AND LOAD PACKAGES ################################

# Install pacman if you don't have it (uncomment next line)
# install.packages("pacman")

# Install (if missing) and attach every package this script
# needs. The names are passed as a character vector through
# p_load's `char` argument, in the same order as before.
pacman::p_load(
  char = c(
    "janitor",    # Remove constants
    "magrittr",   # Pipes
    "pacman",     # Load/unload packages
    "psych",      # Descriptive statistics
    "rio",        # Import/export data
    "tidyverse"   # So many reasons
  )
)

also installing the dependency ‘snakecase’





The downloaded binary packages are in
	/var/folders/rh/pfd6hrw52s35skb7d44l36nr0000gn/T//RtmpGMdiXh/downloaded_packages



janitor installed



In [15]:
# LOAD AND PREPARE DATA ####################################

# Many of the datasets for this course come from the Machine
# Learning Repository at the University of California, Irvine
# (UCI), at https://archive.ics.uci.edu/

# For all three demonstrations of dimensionality reduction,
# we'll use the "Optical Recognition of Handwritten Digits
# Data Set," which can be accessed via https://j.mp/34NFNGn

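# We'll use the dataset saved in "optdigits.tra," which is
# the training dataset. This data can be downloaded as a CSV
# file without the variable names, but you'll need to
# manually change the extension. However, to save the
# variable names, I imported it directly into R and then
# saved it as a CSV file from there.
# NOTE: The raw file has no header row, and read.csv()
# defaults to header = TRUE, so the column names that appear
# below (X0, X1, X6, ...) look like the values of the file's
# first record rather than true variable names; that record
# is then not part of the data rows.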

# Import data from UCI ML (but you can skip this step)
# df <- read.csv(
#   url(
#     paste(
#       "https://archive.ics.uci.edu/ml/",
#       "machine-learning-databases/",
#       "optdigits/optdigits.tra",
#       sep = ""  # No space between joined text
#     )
#   )
# ) %>%
# as_tibble()  # Save as tibble, which prints better

# Our data is saved in the R project's data folder, with 
# the name "optdigits.csv"

# Read the CSV with rio::import() and convert the result to
# a tibble, which prints better than a plain data frame
df <- as_tibble(import("data/optdigits.csv"))

In [16]:
# Look at the first few rows of the tibble; the last column
# (X0.26) holds the digit labels and is renamed to `y` next
df

X0,X1,X6,X15,X12,X1.1,X0.1,X0.2,X0.3,X7,⋯,X0.21,X0.22,X0.23,X6.3,X14.1,X7.4,X1.3,X0.24,X0.25,X0.26
<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,⋯,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>
0,0,10,16,6,0,0,0,0,7,⋯,0,0,0,10,16,15,3,0,0,0
0,0,8,15,16,13,0,0,0,1,⋯,0,0,0,9,14,0,0,0,0,7
0,0,0,3,11,16,0,0,0,0,⋯,0,0,0,0,1,15,2,0,0,4
0,0,5,14,4,0,0,0,0,0,⋯,0,0,0,4,12,14,7,0,0,6
0,0,11,16,10,1,0,0,0,4,⋯,3,0,0,10,16,16,16,16,6,2
0,0,1,11,13,11,7,0,0,0,⋯,0,0,0,1,13,5,0,0,0,5
0,0,8,10,8,7,2,0,0,1,⋯,0,0,0,4,13,8,0,0,0,5
0,0,15,2,14,13,2,0,0,0,⋯,0,0,0,10,12,5,0,0,0,0
0,0,3,13,13,2,0,0,0,6,⋯,0,0,0,3,15,11,6,0,0,8
0,0,6,14,14,16,16,8,0,0,⋯,0,0,0,10,12,0,0,0,0,7


In [17]:
# Give the last column, which holds the digit labels, the
# short name `y` so that later code is easier to write and
# reuse, and store the labels as a factor.
df <- df %>%
  rename(y = X0.26) %>%       # rename(new = old)
  mutate(y = as_factor(y))    # Integer labels -> factor

df  # Display the updated tibble

X0,X1,X6,X15,X12,X1.1,X0.1,X0.2,X0.3,X7,⋯,X0.21,X0.22,X0.23,X6.3,X14.1,X7.4,X1.3,X0.24,X0.25,y
<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,⋯,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<fct>
0,0,10,16,6,0,0,0,0,7,⋯,0,0,0,10,16,15,3,0,0,0
0,0,8,15,16,13,0,0,0,1,⋯,0,0,0,9,14,0,0,0,0,7
0,0,0,3,11,16,0,0,0,0,⋯,0,0,0,0,1,15,2,0,0,4
0,0,5,14,4,0,0,0,0,0,⋯,0,0,0,4,12,14,7,0,0,6
0,0,11,16,10,1,0,0,0,4,⋯,3,0,0,10,16,16,16,16,6,2
0,0,1,11,13,11,7,0,0,0,⋯,0,0,0,1,13,5,0,0,0,5
0,0,8,10,8,7,2,0,0,1,⋯,0,0,0,4,13,8,0,0,0,5
0,0,15,2,14,13,2,0,0,0,⋯,0,0,0,10,12,5,0,0,0,0
0,0,3,13,13,2,0,0,0,6,⋯,0,0,0,3,15,11,6,0,0,8
0,0,6,14,14,16,16,8,0,0,⋯,0,0,0,10,12,0,0,0,0,7


In [18]:
# Tabulate the label `y`: extract the column as a vector and
# let `forcats::fct_count` report the frequency of each level
# in factor order
fct_count(df$y)

f,n
<fct>,<int>
0,375
1,389
2,380
3,389
4,387
5,376
6,377
7,387
8,380
9,382


In [19]:
# Simplify the data for these demonstrations by keeping only
# the rows whose label is one of the digits {1, 3, 6}; `%in%`
# tests membership in that set.
df %<>%
  filter(y %in% c("1", "3", "6")) %>%  # Keep digits 1, 3, 6
  mutate(y = fct_drop(y))              # Drop unused levels

In [20]:
# Tabulate `y` again to confirm only three levels remain
fct_count(df$y)

f,n
<fct>,<int>
1,389
3,389
6,377


In [24]:
# Drop every column that holds a single constant value, since
# such columns are not informative, then display the result
df <- remove_constant(df)
df

X1,X6,X15,X12,X1.1,X0.1,X0.2,X7,X16,X6.1,⋯,X0.20,X0.21,X0.23,X6.3,X14.1,X7.4,X1.3,X0.24,X0.25,y
<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,⋯,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<fct>
0,5,14,4,0,0,0,0,13,8,⋯,12,0,0,4,12,14,7,0,0,6
0,0,3,16,11,1,0,0,0,8,⋯,0,0,0,0,2,14,14,1,0,1
0,7,11,11,6,0,0,9,16,12,⋯,12,0,0,14,16,12,10,1,0,3
0,9,13,1,0,0,0,0,8,16,⋯,8,5,0,4,15,16,16,16,16,1
0,0,10,12,0,0,0,0,9,14,⋯,14,0,0,1,11,14,12,1,0,6
0,0,0,10,13,0,0,0,0,0,⋯,0,0,0,0,0,8,15,2,0,1
0,2,14,10,0,0,0,1,14,12,⋯,15,0,0,1,12,14,12,4,0,6
0,0,0,13,16,3,0,0,0,1,⋯,0,0,0,0,0,11,16,8,0,1
0,6,12,11,3,0,0,0,16,9,⋯,3,0,0,7,12,13,6,0,0,3
0,8,14,2,0,0,0,0,5,16,⋯,12,8,0,4,15,16,13,12,12,1


In [25]:
# SPLIT DATA ##############################################

# Some demonstrations will use separate testing and training
# datasets for validation.

# Set random seed for reproducibility in processes like
# splitting the data. The split below draws rows at random
# with sample_frac(), so run this cell again before
# re-running the split if you want the same split.
set.seed(1)  # You can use any number here

In [28]:
# Split data into training (trn) and testing (tst) sets.
# The seed is set here, immediately before the random draw,
# so that re-running this cell on its own always reproduces
# the same split. It uses the same value as the seed set
# earlier, so a top-to-bottom run gives the same result.
set.seed(1)

# Work on a helper copy that carries a row ID, so that `df`
# itself never ends up with a stray ID column if this cell
# stops partway through.
df_id <- df %>% mutate(ID = row_number())  # Add row ID
trn   <- df_id %>% sample_frac(.70)        # 70% in trn
tst   <- df_id %>%                         # Start with copy
  anti_join(trn, by = "ID") %>%            # Rest in tst
  select(-ID)                              # Remove id from tst
trn %<>% select(-ID)                       # Remove id from trn
rm(df_id)                                  # Drop helper copy

# Every row of df must be in exactly one of the two sets
stopifnot(nrow(trn) + nrow(tst) == nrow(df))

In [33]:
# Preview the first rows of the training set
head(trn)

X1,X6,X15,X12,X1.1,X0.1,X0.2,X7,X16,X6.1,⋯,X0.20,X0.21,X0.23,X6.3,X14.1,X7.4,X1.3,X0.24,X0.25,y
<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,⋯,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<fct>
0,0,6,14,2,0,0,0,2,14,⋯,15,1,0,0,7,14,16,14,1,6
0,2,11,14,5,0,0,0,0,10,⋯,0,0,0,6,13,14,0,0,0,1
1,9,16,15,1,0,0,8,14,8,⋯,7,0,0,9,16,14,9,1,0,3
0,0,8,13,0,0,0,0,1,15,⋯,13,0,0,0,7,14,14,6,0,6
0,9,15,16,4,0,0,0,8,7,⋯,15,0,0,5,11,12,12,6,0,3
0,0,4,15,12,0,0,0,0,10,⋯,0,0,0,0,4,16,7,0,0,1


In [34]:
# Preview the first rows of the testing set
head(tst)

X1,X6,X15,X12,X1.1,X0.1,X0.2,X7,X16,X6.1,⋯,X0.20,X0.21,X0.23,X6.3,X14.1,X7.4,X1.3,X0.24,X0.25,y
<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,⋯,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<fct>
0,0,3,16,11,1,0,0,0,8,⋯,0,0,0,0,2,14,14,1,0,1
0,7,11,11,6,0,0,9,16,12,⋯,12,0,0,14,16,12,10,1,0,3
0,9,13,1,0,0,0,0,8,16,⋯,8,5,0,4,15,16,16,16,16,1
0,0,0,10,13,0,0,0,0,0,⋯,0,0,0,0,0,8,15,2,0,1
0,8,14,2,0,0,0,0,5,16,⋯,12,8,0,4,15,16,13,12,12,1
0,0,0,13,5,0,0,0,0,3,⋯,0,0,0,0,0,12,13,5,0,1


In [27]:
# SAVE DATA ################################################

# Use saveRDS(), which saves data in a native R format
saveRDS(df,  file = "data/optdigits.rds")      # Filtered data
saveRDS(trn, file = "data/optdigits_trn.rds")  # Training set
saveRDS(tst, file = "data/optdigits_tst.rds")  # Testing set