# PART 05 - Basic statistics and plots

In this part, you will learn:
* How to calculate basic statistics
* How to create basic plots


In [None]:
# Attach dplyr first. library() stops with an error when the package is
# missing, whereas require() only returns FALSE and lets later cells fail.
library(dplyr)

# Read the semicolon-separated example file; the first row holds column names.
code_data <- read.table("example.csv", header = TRUE, sep = ";")
code_data

## Basic statistics

- mean, median, sd (apply on column)
    - What is the mean LOC?
    - What is the median number of bugs?
    - What is the standard deviation (sd)?
- Summary function
- Frequency
    - table – n-way contingency table
    - table(code_data)
    - table(code_data$CodeQuality)

### Contingency tables

In [None]:
# Frequency of each value in the Package column.
table(code_data[["Package"]])

In [None]:
# Two-way contingency table: Package in rows, CodeQuality in columns.
table(
  code_data[["Package"]],
  code_data[["CodeQuality"]]
)

### Correlations


In [None]:
# Pearson correlation matrix of the LOC and NoBugs columns.
cor(code_data[c("LOC", "NoBugs")], method = "pearson")

In [None]:
# Test of the Pearson correlation between LOC and NoBugs.
cor.test(
  x = code_data[, "LOC"],
  y = code_data[, "NoBugs"],
  method = "pearson"
)

In [None]:
# Spearman rank correlation matrix of the LOC and NoBugs columns.
cor(code_data[c("LOC", "NoBugs")], method = "spearman")

In [None]:
# Test of the Spearman rank correlation between LOC and NoBugs.
cor.test(
  x = code_data[, "LOC"],
  y = code_data[, "NoBugs"],
  method = "spearman"
)

In [None]:
# Kendall correlation matrix of the LOC and NoBugs columns.
cor(code_data[c("LOC", "NoBugs")], method = "kendall")

In [None]:
# Test of the Kendall correlation between LOC and NoBugs.
cor.test(
  x = code_data[, "LOC"],
  y = code_data[, "NoBugs"],
  method = "kendall"
)

## Statistical inference testing

t-test to compare mean sizes of classes between two packages

In [None]:
# LOC column (kept as a one-column data frame) for each of the two packages.
x <- select(filter(code_data, Package == "pl.put.qmese"), LOC)
y <- select(filter(code_data, Package == "pl.put.qmese.internal"), LOC)
x
y


In [None]:
# Two-sided, unpaired t-test on x and y with a 95% confidence level.
t.test(
  x, y,
  alternative = "two.sided",
  paired = FALSE,
  conf.level = 0.95
)

## Basic plots

There is a function par() that controls most of the plotting parameters.

In [None]:
# Called with no arguments, par() returns the current graphical parameters.
par()

If you want to customize these default values, you have to override them. However, please remember to keep the old values and restore them after you plot something.

In [None]:
# Background colour currently in effect.
par("bg")

# Setting a parameter returns its previous value; keep it for restoring later.
old_par <- par(bg = "black")

# ...  plot
par("bg")

# Put the saved value back and confirm that it was restored.
par(old_par)
par("bg")

A few useful settings:

In [None]:
# margins - either mar (in lines of text) or mai (in inches).
# mfrow - if you want to plot with subplots, e.g., mfrow=c(1, 2) gives one row with two plots
# cex - scaling factor for text and symbols (font size)

### Bar plots

In [None]:
# Frequency of each value in the CodeQuality column.
code_quality_table <- table(code_data[["CodeQuality"]])

# Figure size options for the notebook's plot output.
options(repr.plot.width = 3, repr.plot.height = 2)

# Apply the plot settings and remember the previous values.
old_par <- par(
  mfrow = c(1, 1),
  mar = c(3, 3, 2, 1),
  cex = 0.7,
  omi = c(0, 0, 0, 0),
  mgp = c(2, 1, 0)
)
barplot(
  code_quality_table,
  main = "Quality of classes",
  xlab = "Quality level",
  ylab = "Frequency",
  col = "#e37222"
)
# Restore the settings that were in place before this plot.
par(old_par)


In [None]:
# Cross-tabulate CodeQuality (rows) against Package (columns).
table(
  code_data[["CodeQuality"]],
  code_data[["Package"]]
)

In [None]:
# Counts per CodeQuality value (rows) and Package (columns).
code_quality_package <- table(code_data$CodeQuality, code_data$Package)

# Fill colours shared by the stacked bars and the legend, so the two cannot
# drift apart (three colours, one per row of the table).
quality_colors <- c("#e37222", "#162274", "#618e02")

options(repr.plot.width = 5, repr.plot.height = 5)
old_par <- par(mfrow = c(1, 1), mar = c(3, 3, 2, 0), cex = 0.7,
               omi = c(0, 0, 0, 0), mgp = c(2, 1, 0))
# legend.text is spelled out in full: the abbreviated `legend = NULL` only
# worked through partial argument matching. NULL means barplot() draws no
# legend of its own; it is added manually below.
barplot(code_quality_package,
        main = "Quality of classes in packages",
        xlab = "Package", ylab = "Frequency",
        col = quality_colors,
        legend.text = NULL)
# Legend placed at fixed user coordinates; only fill boxes are drawn, so no
# line/point colour is needed.
legend(x = c(2.5, 3.3), y = c(4.5, 5),
       legend = rownames(code_quality_package),
       fill = quality_colors,
       bg = NA)
# Restore the settings that were in place before this plot.
par(old_par)


### Pie plots

In [None]:
# Frequency of each value in the CodeQuality column.
code_quality_table <- table(code_data[["CodeQuality"]])

options(repr.plot.width = 4, repr.plot.height = 4)

# Apply the plot settings and remember the previous values.
old_par <- par(
  mfrow = c(1, 1),
  mar = c(3, 3, 2, 0),
  cex = 0.7,
  omi = c(0, 0, 0, 0),
  mgp = c(2, 1, 0)
)
pie(
  code_quality_table,
  main = "Quality of classes",
  col = c("#e37222", "#162274", "#618e02")
)
par(old_par)


### Histograms

In [None]:
options(repr.plot.width = 4, repr.plot.height = 3)

# Apply the plot settings and remember the previous values.
old_par <- par(
  mfrow = c(1, 1),
  mar = c(3, 3, 2, 0),
  cex = 0.7,
  omi = c(0, 0, 0, 0),
  mgp = c(2, 1, 0)
)
# Histogram of the LOC column; breaks = 3 is passed through to hist().
hist(
  code_data[["LOC"]],
  main = "Histogram of LOC of class",
  xlab = "LOC",
  breaks = 3,
  col = "#e37222"
)
par(old_par)

### Boxplots

In [None]:
options(repr.plot.width = 4, repr.plot.height = 3)

# Apply the plot settings and remember the previous values.
old_par <- par(
  mfrow = c(1, 1),
  mar = c(3, 3, 2, 0),
  cex = 0.7,
  omi = c(0, 0, 0, 0),
  mgp = c(2, 1, 0)
)
# Horizontal boxplot of the LOC column.
boxplot(
  code_data[["LOC"]],
  main = "Boxplot for LOC of class",
  xlab = "LOC",
  horizontal = TRUE
)
par(old_par)

In [None]:
# Draw the boxplot first, then add the individual observations on top of it.
options(repr.plot.width = 4, repr.plot.height = 3)
old_par <- par(
  mfrow = c(1, 1),
  mar = c(3, 3, 2, 0),
  cex = 0.7,
  omi = c(0, 0, 0, 0),
  mgp = c(2, 1, 0)
)
boxplot(
  code_data[["LOC"]],
  main = "Boxplot for LOC of class",
  xlab = "LOC",
  horizontal = TRUE
)
# add = TRUE draws onto the existing plot instead of starting a new one.
stripchart(
  code_data[["LOC"]],
  vertical = FALSE,
  method = "jitter",
  add = TRUE,
  pch = 20,
  col = "black"
)
par(old_par)

In [None]:
# LOC values (as plain vectors) for each of the three packages.
loc_a <- filter(code_data, Package == "pl.put.qmese")[["LOC"]]
loc_b <- filter(code_data, Package == "pl.put.qmese.data")[["LOC"]]
loc_c <- filter(code_data, Package == "pl.put.qmese.internal")[["LOC"]]

options(repr.plot.width = 5, repr.plot.height = 3)

# The left margin is set to 10 lines; the package names are drawn there.
old_par <- par(
  mfrow = c(1, 1),
  mar = c(3, 10, 2, 0),
  cex = 0.7,
  omi = c(0, 0, 0, 0),
  mgp = c(2, 1, 0)
)
boxplot(
  loc_a, loc_b, loc_c,
  main = "Boxplot for LOC of class",
  xlab = "LOC",
  horizontal = TRUE,
  names = c("pl.put.qmese", "pl.put.qmese.data", "pl.put.qmese.internal"),
  las = 2
)
par(old_par)

### Scatter plots

In [None]:
options(repr.plot.width = 4, repr.plot.height = 4)

# Apply the plot settings and remember the previous values.
old_par <- par(mfrow = c(1, 1), mar = c(3, 3, 2, 0), cex = 0.7,
               omi = c(0, 0, 0, 0), mgp = c(2, 1, 0))
# Scatter plot of NoBugs (y) against LOC (x). The title previously read
# "Histogram of LOC of class", copied over from the histogram example.
plot(code_data$LOC, code_data$NoBugs,
     main = "Bugs vs. LOC of class",
     xlab = "LOC", ylab = "Bugs")
par(old_par)

### Saving to a file

If you want to save a plot to a file, the best option is to use a PDF device to generate the plot. Here is an example.

In [None]:
# Open a PDF graphics device; plots drawn until dev.off() go to this file.
pdf("./boxplot.pdf", width = 4, height = 3) # KEY LINE 1

old_par <- par(
  mfrow = c(1, 1),
  mar = c(3, 1, 2, 1),
  cex = 0.7,
  omi = c(0, 0, 0, 0),
  mgp = c(2, 1, 0)
)
boxplot(
  code_data[["LOC"]],
  main = "Boxplot for LOC of class",
  xlab = "LOC",
  horizontal = TRUE
)
par(old_par)

# Close the PDF device.
dev.off() # KEY LINE 2

### What's next?

If you want to learn how to draw beautiful plots in R, I recommend reading about ggplot2. At first, its approach to plotting may look weird, but it is really powerful.