CS249 -- Spring 2016 -- D.S. Parker &copy; 2016

# Diamonds  --  Visualization of the dataset

In [None]:
# we need the ggplot2 package to get the "diamonds" dataset

# TRUE for each name in `pkg` that does not appear among the installed
# package names, FALSE otherwise.
not.installed <- function(pkg) {
  installed.names <- installed.packages()[, "Package"]
  !(pkg %in% installed.names)
}

if (not.installed("ggplot2")) install.packages("ggplot2")
    
library(ggplot2)

In [None]:
# Load the "diamonds" dataset that ships with ggplot2 into the workspace,
# then print per-column summary statistics.
data("diamonds", package = "ggplot2")

summary(diamonds)

The dataset has the following columns:
<table>
<tr><td>   <b>carat</b></td><td>weight of the diamond in carats, recorded to two decimal places  (1 carat = 0.2 grams)</td></tr>
<tr><td>   <b>cut</b></td><td>quality of the cut  {Fair, Good, Very Good, Premium, Ideal}</td></tr>
<tr><td>   <b>color</b></td><td>color code: &lbrace; D &gt; E &gt; F &gt; G &gt; H &gt; I &gt; J &rbrace;  (J is worst, D is best)</td></tr>
<tr><td>   <b>clarity</b></td><td>clarity code: &lbrace; I1 &lt; SI2 &lt; SI1 &lt; VS2 &lt; VS1 &lt; VVS2 &lt; VVS1 &lt; IF &rbrace; (I1 is worst, IF is best)</td></tr>

<tr><td>   <b>depth</b></td><td>total depth percentage  =  2*z/(x+y)</td></tr>
<tr><td>   <b>table</b></td><td>width of top of diamond relative to widest point</td></tr>

<tr><td>   <b>price</b></td><td>in US dollars</td></tr>

<tr><td>   <b>x</b></td><td>Length in mm (numeric value between 0 and 10.74)</td></tr>
<tr><td>   <b>y</b></td><td>Width  in mm (numeric value between 0 and 58.9)</td></tr>
<tr><td>   <b>z</b></td><td>Depth  in mm (numeric value between 0 and 31.8)</td></tr>

</table>

Caution:  the dataset has skewed distributions.  Please check below.


In [None]:
dim(diamonds)  # rows and columns: not a tiny dataset

In [None]:
# Histogram of the prices below $5000 only.
# which() skips NA comparisons, just as subset() does.
low.price.rows <- which(diamonds$price < 5000)
low_prices <- diamonds$price[low.price.rows]

hist(low_prices,
     breaks = 200,
     col = "green",
     main = "diamond prices below $5000; notice the odd gap around 1500")

In [None]:
# Patterns in the prices are easier to see on a log10 scale.

hist(log10(diamonds$price),
     breaks = 50,
     col = "skyblue",
     main = "distribution of log10(price) looks like a mixture")

# The sorted log10 prices, plotted against their rank with "." symbols.
plot(sort(log10(diamonds$price)), col = "skyblue", pch = ".")

# Do some basic cleaning of the data

In [None]:
# There are actually some zero values in x, y and z; keep only the rows
# where all three dimensions are strictly positive.
positive.dims <- with(diamonds, (x > 0) & (y > 0) & (z > 0))
diamonds <- diamonds[which(positive.dims), ]

## Prepare for numeric encoding of "ordered categorical" values for Cut, Color, and Clarity

In [None]:
names(diamonds)   # column names, in positional order

In [None]:
# Positions of the numeric columns: column 1 and columns 5 through 10.
numeric.column.numbers <- c(1, 5:10)

In [None]:
colors <- levels(diamonds$color)
colors

In [None]:
## The levels of Colors should have the reverse ordering
## { D > E > F > G > H > I > J }
## (J is worst, D is best)
##
## Rebuild the factor with its level order reversed.  Assigning to levels()
## instead would only rename the labels by position -- every "D" diamond
## would be relabelled "J", every "E" diamond "I", and so on -- while the
## underlying integer codes stayed exactly as they were.

diamonds$color <- factor(diamonds$color, levels = rev(colors))
levels(diamonds$color)

In [None]:
cuts <- levels(diamonds$cut)   # level names of 'cut', in factor order
cuts

In [None]:
## No change is needed for Cut: its levels already have the correct ordering

In [None]:
clarities <- levels(diamonds$clarity)
clarities

In [None]:
## The levels of Clarity, as shipped with ggplot2, already run from worst to best:
## { I1 < SI2 < SI1 < VS2 < VS1 < VVS2 < VVS1 < IF }
## (I1 is worst, IF is best; within a grade, "1" is clearer than "2").
## They are therefore left untouched.
##
## Note that assigning a permuted vector to levels() does not reorder a
## factor: it renames the labels by position, which would silently turn
## every "SI2" diamond into an "SI1" diamond (and likewise for VS and VVS).
levels(diamonds$clarity)

## Convert the categorical values to integers using the unclass() function.

In [None]:
# Integer-coded copy of the data: the three factors are replaced by their
# underlying level codes, and columns 5 to 10 are carried over unchanged.
encoded.diamonds <- data.frame(
  carat   = diamonds$carat,
  cut     = unclass(diamonds$cut),
  color   = unclass(diamonds$color),
  clarity = unclass(diamonds$clarity),
  diamonds[, 5:10]  # the other numeric columns
)

# Drop any row that contains a missing value.
numeric.diamonds <- na.omit(encoded.diamonds)

In [None]:
head(numeric.diamonds, n = 6)

In [None]:
# Sanity check of the encoding: list the levels of cut, then cross-tabulate
# each diamond's cut label against its integer code in numeric.diamonds.
levels(diamonds$cut)
table( diamonds$cut, numeric.diamonds$cut )

In [None]:
# The same check for color.
levels(diamonds$color)
table( diamonds$color, numeric.diamonds$color )

In [None]:
# The same check for clarity.
levels(diamonds$clarity)
table( diamonds$clarity, numeric.diamonds$clarity )

## A sample of the diamonds data

In [None]:
n <- nrow(diamonds)

sample.size <- 1000

# Row positions drawn at random, without replacement.
# NOTE(review): no set.seed() call is visible, so the sample differs from run to run.
sample.rownumbers <- sample.int(n, sample.size)

diamonds.sample <- diamonds[sample.rownumbers, ]

# The same positions index the integer-coded copy.
# NOTE(review): this assumes na.omit() dropped no rows from numeric.diamonds,
# so that positions in both tables refer to the same diamonds -- confirm.
numeric.diamonds.sample <- numeric.diamonds[sample.rownumbers, ]

# This generates a random sample of size sample.size (1000), to give a sense of the data.
# Plotting the entire dataset is slow, and all the detail can obscure larger patterns.

## Correlation and PCA

In [None]:
# Pairwise correlations between all columns of the integer-coded data.
diamonds.correlation.matrix <- cor(numeric.diamonds)

In [None]:
# Display the matrix rounded to two decimal places.
round(diamonds.correlation.matrix, digits = 2)

In [None]:
# Quick PCA of the numeric columns of the sample data.
# The columns are selected through numeric.column.numbers (rather than a
# second hard-coded copy of the same indices), and scale. = TRUE rescales
# each column before the decomposition.

diamond.PCA <- prcomp(diamonds.sample[, numeric.column.numbers], scale. = TRUE)

# One "." label per sampled row, so that observation labels do not clutter
# the biplot; the count is taken from the sample itself.
biplot(diamond.PCA, xlabs = rep(".", nrow(diamonds.sample)))

In [None]:
if (not.installed("ggbiplot")) {
  install.packages("ggbiplot")
}

library(ggbiplot)

# Biplot of the PCA, with points grouped by cut and an ellipse per group.
ggbiplot(diamond.PCA,
         groups = diamonds.sample$cut,
         ellipse = TRUE)

## "tableplots"

For more about tableplot() and the tabplot package, see:
<a href="http://cran.r-project.org/web/packages/tabplot">http://cran.r-project.org/web/packages/tabplot</a>
<br/><br/>
Many other examples of tableplots of the diamonds dataset are displayed at:
<a href="http://cran.r-project.org/web/packages/tabplot/vignettes/tabplot-vignette.html">http://cran.r-project.org/web/packages/tabplot/vignettes/tabplot-vignette.html</a>

In [None]:
if (not.installed("tabplot")) {
  install.packages("tabplot")
}

library(tabplot)

In [None]:
# Tableplot of every column of the sample.
tableplot(diamonds.sample)

In [None]:
# tableplot( diamonds, subset = (price < 5000) & cut == "Premium" )

In [None]:
# Tableplot of five selected columns of the full dataset,
# with the results sorted by price.
tableplot(
  diamonds,
  select  = c(carat, price, cut, color, clarity),
  sortCol = price
)

In [None]:
# Scatterplot matrix of every sampled column; the point colour is the
# integer code of each diamond's cut.
pairs(diamonds.sample,
      pch = ".",
      col = unclass(diamonds.sample$cut))

In [None]:
# Diagonal panel for pairs(): draws a histogram of `x` with bar heights
# rescaled so that the tallest bar has height 1.
#
# x    numeric vector holding the values of one variable
# ...  accepted so that pairs() can pass along its extra graphical
#      arguments (col, pch, ...); they are not used here
panel.hist <- function(x, ...) {
  # Remember the panel's user coordinates and restore them on exit.  The
  # parameter has to be named: par(usr) with a bare numeric vector restores
  # nothing and warns "argument 1 does not name a graphical parameter".
  usr <- par("usr")
  on.exit(par(usr = usr), add = TRUE)

  # Keep the horizontal range of the panel; use 0..1.5 vertically so the
  # bars (maximum height 1) leave some headroom at the top.
  par(usr = c(usr[1:2], 0, 1.5))

  h <- hist(x, plot = FALSE)
  breaks <- h$breaks
  nB <- length(breaks)
  y <- h$counts / max(h$counts)

  # One rectangle per bin, from its left break to its right break.
  rect(breaks[-nB], 0, breaks[-1], y, col = "green")
}

# Scatterplot matrix of the numeric columns only, with panel.hist drawing
# each variable's histogram on the diagonal; the point colour is the
# integer code of each diamond's cut.
pairs(diamonds.sample[, numeric.column.numbers],
      pch = ".",
      col = unclass(diamonds.sample$cut),
      diag.panel = panel.hist)


## heatmap() display of the correlation matrix

In [None]:
# Heatmap of the correlation matrix; symm = TRUE tells heatmap() that the
# matrix is to be treated symmetrically.
heatmap(diamonds.correlation.matrix,
        main = "diamonds correlation matrix -- using heatmap()",
        symm = TRUE)

In [None]:
# Install corrplot only when it is missing.  The check goes through the
# not.installed() helper, which compares against package names only;
# matching the name against the whole installed.packages() matrix could
# also hit other fields (for example a package whose Imports entry is
# exactly "corrplot") and wrongly skip the installation.
if (not.installed("corrplot")) install.packages("corrplot")
library(corrplot)

## corrplot() display of correlation matrix

In [None]:
# Four renderings of the same correlation matrix on a 2 x 2 grid;
# the previous graphical parameters are put back afterwards.
opar <- par(mfrow = c(2, 2))

corrplot(diamonds.correlation.matrix, method = "number")
corrplot(diamonds.correlation.matrix, method = "square", addCoef.col = "grey")
corrplot(diamonds.correlation.matrix, order = "AOE", method = "ellipse")
corrplot(diamonds.correlation.matrix, order = "hclust", method = "ellipse",
         addrect = 5)

par(opar)

## ggcorr() display of correlation matrices

See <a href="http://briatte.github.io/ggcorr/">the ggcorr documentation</a>
at the recently-developed <a href="http://ggobi.github.io/ggally/">GGally</a> site.

In [None]:
# Install GGally only when it is missing.  The check goes through the
# not.installed() helper, which compares against package names only;
# matching the name against the whole installed.packages() matrix could
# also hit other fields (for example a package whose Imports entry is
# exactly "GGally") and wrongly skip the installation.
if (not.installed("GGally")) install.packages("GGally")
library(GGally)

In [None]:
# Two ggcorr() renderings of the same data -- the default palette and an
# orange/white/skyblue one -- placed in a grid of 1 row and 2 columns.
plotlist <- list(
  ggcorr(numeric.diamonds),
  ggcorr(numeric.diamonds, low = "orange", mid = "white", high = "skyblue")
)

ggmatrix(plotlist, nrow = 1, ncol = 2)

## Distribution of prices -- by cut and by color

In [None]:
# Overlaid, mostly transparent density curves of log10(price), one per cut.
ggplot(diamonds, aes(x = log10(price), fill = cut)) +
  geom_density(alpha = 0.1) +
  ggtitle("distribution of log10(price), by cut")

In [None]:
# The same plot, split by color instead of cut.
ggplot(diamonds, aes(x = log10(price), fill = color)) +
  geom_density(alpha = 0.1) +
  ggtitle("distribution of log10(price), by color")

## ggpairs()

ggpairs produces a generalized pairwise plot.
See <a href="https://cran.r-project.org/web/packages/GGally/vignettes/ggpairs.html">the ggpairs documentation</a>
at the <a href="http://ggobi.github.io/ggally/">GGally</a> site.


In [None]:
# Disabled on purpose: lots of information, but omitted since the plot is huge.
if (FALSE) {
  ggpairs(
    diamonds.sample,
    mapping = ggplot2::aes(color = cut),
    upper = list(
      continuous = wrap("density", alpha = 0.3),
      combo      = wrap("box", size = 0.2, alpha = 0.4)
    ),
    lower = list(
      continuous = wrap("points", alpha = 0.3),
      combo      = wrap("dot", size = 0.2, alpha = 0.4)
    ),
    title = "Diamonds:  large grid of pairwise plots of multiple types"
  )
}

In [None]:
# One smoothed curve of log10(price) against log10(carat) per cut.
ggplot(diamonds, aes(x = log10(carat), y = log10(price), color = cut)) +
  geom_smooth() +
  ggtitle("log10(carat) vs log10(price), by cut")

## Hadley Wickham's qplot examples for the Diamonds dataset

qplot() is ggplot2's generalization of the plot() function in R.
The plots produced by many of the commands below are from
<a href="http://ggplot2.org/book/qplot.pdf">the chapter on qplot in Hadley Wickham's book about ggplot2</a> at <a href="http://ggplot2.org">ggplot2.org</a>.
Browsing the output in this chapter is a good way to get into ggplot,
and also get some perspective on the diamonds dataset.

These queries are included here to show how easy it is to generate plots with a more modern style than the standard R graphics functions.  Graphics in R are now evolving rapidly because of ggplot2, and it is important to be aware of this.

qplot(depth, data = diamonds, binwidth = 0.2, color=clarity) + xlim(55, 70) + facet_wrap(~ cut)

In [None]:
# options( repr.plot.width=6, repr.plot.height=6 )

# Basic scatterplots: qplot(a, b, data = ...) plots column b against column a.
qplot(x, y, data = diamonds)

# xlim() restricts the horizontal axis to the range 50..75.
qplot(table, price, data = diamonds) + xlim(50, 75)

# Price against carat: raw, with both axes log-transformed, and as price per carat.
qplot(carat, price, data = diamonds)
qplot(log(carat), log(price), data = diamonds)
qplot(carat, price / carat, data = diamonds)

In [None]:
# Adding aesthetics --------------------------------------------------

# Map the color column to the colour of each point.
qplot(carat, price, data = diamonds, colour = color)

In [None]:
# Facetting --------------------------------------------------

# One panel per color, laid out as columns.
qplot(carat, price, data = diamonds) + facet_grid(. ~ color)

# One panel per clarity, laid out as rows.
qplot(carat, price, data = diamonds) + facet_grid(clarity ~ .)

# Full grid: clarity in rows, color in columns.
qplot(carat, price, data = diamonds) + facet_grid(clarity ~ color)

# One panel per color, wrapped into a rectangular layout.
qplot(carat, price, data = diamonds) + facet_wrap(~ color)

In [None]:
# Bar charts and histograms --------------------------------------------------

# With only one variable, qplot guesses that
# you want a bar chart or histogram

# cut is a factor.
qplot(cut, data = diamonds)

# carat is numeric; the calls below try successively narrower bin widths.
qplot(carat, data = diamonds)
qplot(carat, data = diamonds, binwidth = 1)
qplot(carat, data = diamonds, binwidth = 0.1)
qplot(carat, data = diamonds, binwidth = 0.01)
# resolution() reports the smallest non-zero gap between carat values.
resolution(diamonds$carat)

# Take the most recently drawn plot and limit its x-axis to 0..3.
last_plot() + xlim(0, 3)

In [None]:
# If you want to add an additional variable to the plot it's better to
# use faceting, rather than aesthetics.

# Baseline: histogram of depth with bins 0.2 wide.
qplot(depth, data = diamonds, binwidth = 0.2)

In [None]:
# In this plot we can see general trends, but it's hard to get precise

# Same histogram, with the bars filled by cut and the x-axis limited to 55..70.
qplot(depth, data = diamonds, binwidth = 0.2, fill = cut) + xlim(55, 70)

In [None]:
# Here the differences really jump out at you

# Same histogram again, this time with one panel per cut.
qplot(depth, data = diamonds, binwidth = 0.2) + xlim(55, 70) +
  facet_wrap(~ cut)

# Histogram of carat (bins 0.1 wide), one panel per color.
qplot(carat, data = diamonds, binwidth = 0.1) + facet_wrap(~ color)

In [None]:
# Instead of displaying counts on the y-axis, the bars can display sums of
# another variable, supplied through `weight`:

qplot(cut, data = diamonds, weight = carat)

qplot(cut, data = diamonds, weight = price)

In [None]:
# This is useful for pretabulated data: tabulate the cuts first, then let
# the precomputed frequencies set the bar heights.

cut.table <- table(cut = diamonds$cut)
cuts <- as.data.frame(cut.table)

qplot(cut, data = cuts, weight = Freq)

In [None]:
# Zooming ------------------------------------------------------------------

# Histogram of table with bins 1 wide, over the full range.
qplot(table, data = diamonds, binwidth = 1)

# Same histogram, x-axis limited to 50..70.
qplot(table, data = diamonds, binwidth = 1) + xlim(50, 70)

# Narrower bins (0.1) over the same x-range.
qplot(table, data = diamonds, binwidth = 0.1) + xlim(50, 70)

# Additionally limit the y-axis to 0..50.
qplot(table, data = diamonds, binwidth = 0.1) + xlim(50, 70) + ylim(0, 50)

In [None]:
# Note that this type of zooming discards data outside of the plot regions
# See coord_cartesian() for an alternative

In [None]:
# Weighting ------------------------------------------------------------------
# `weight` makes each bar show the sum of a column instead of a row count.

qplot(cut, data = diamonds, weight = carat)

qplot(cut, data = diamonds, weight = price)

In [None]:
# Also useful for pretabulated data: here the frequency column of a
# table of cuts supplies the bar heights.

cut.frequencies <- table(cut = diamonds$cut)
cuts <- as.data.frame(cut.frequencies)

qplot(cut, data = cuts, weight = Freq)

In [None]:
# Box plots ------------------------------------------------------------------

# One box of price per color level.
qplot(color, price, data = diamonds,
  geom = "boxplot")

In [None]:
# Boxplots also useful for continuous data
# But need to specify how to group data into boxes

# Plain scatterplot of price against table, for reference.
qplot(table, price, data = diamonds)

# Boxplot requested with a continuous x and no grouping given.
qplot(table, price, data = diamonds, geom = "boxplot")

# Grouping by round(table) gives one box per rounded table value.
qplot(table, price, data = diamonds, geom = "boxplot", group = round(table))

In [None]:
# Overplotting ---------------------------------------------------------------

# Black points with alpha 1/255; I() passes the colour through as a
# constant rather than mapping it as a variable.
qplot(carat, price, data = diamonds,
  colour = I(alpha("black", 1/255)))

# Bin the points into rectangles instead of drawing them individually.
qplot(carat, price, data = diamonds, geom = "bin2d")

# Same, with bins = 100.
qplot(carat, price, data = diamonds, geom = "bin2d", bins = 100)

# Hexagonal bins.
qplot(carat, price, data = diamonds, geom = "hex")

# Scatterplot with a smoothed trend added on top.
qplot(carat, price, data = diamonds) + geom_smooth()

In [None]:
# Very basic cleaning --------------------------------------------------------
# Mark implausible measurements as missing: zero lengths, zero widths,
# and widths above 12.

zero.x <- which(diamonds$x == 0)
diamonds$x[zero.x] <- NA

zero.y <- which(diamonds$y == 0)
diamonds$y[zero.y] <- NA

huge.y <- which(diamonds$y > 12)
diamonds$y[huge.y] <- NA


# Width against length after cleaning: as points, then binned in several ways.
qplot(x, y, data = diamonds)

qplot(x, y, data = diamonds, geom = "bin2d")

qplot(x, y, data = diamonds, geom = "hex")

qplot(x, y, data = diamonds, bins = 100, geom = "bin2d")

qplot(x, y, data = diamonds, bins = 100, geom = "hex")

In [None]:
# Zoom in ----------------------------------------------------------------------
# Successively tighter windows on the binned x/y plot.

qplot(x, y, data = diamonds, bins = 100, geom = "bin2d") +
  xlim(4, 7) +
  ylim(4, 7)

qplot(x, y, data = diamonds, bins = 100, geom = "bin2d") +
  xlim(4, 5) +
  ylim(4, 5)

# Ratio of length to width, raw and on a log scale.
qplot(x, x / y, data = diamonds, geom = "bin2d")

qplot(x, log(x / y), data = diamonds, geom = "bin2d")

# Keep only the diamonds whose log(x / y) lies strictly within 0.1 of zero.
near.square <- with(diamonds, abs(log(x / y)) < 0.1)
clean <- diamonds[which(near.square), ]

qplot(x, log(x / y), data = clean, geom = "bin2d")

qplot(x, log(x / y), data = clean, bins = 80, geom = "bin2d")