# Data Mining in Physics - Presentation 2. - DAGUR 5.1: Fitting a line to data

In [None]:
library(microbenchmark)
library(graphics)
library(ggplot2)

In [None]:
# Scale factor applied to plots rendered in the notebook
# (`jupyter.plot_scale` is read by the Jupyter R kernel).
options(jupyter.plot_scale = 1.4)

## 1. Basics of linear fitting

In [None]:
library(lattice)
library(DAAG)

In [None]:
# Load the `roller` dataset shipped with the DAAG package
df <- DAAG::roller
# Pull out the two columns used throughout the notebook:
# x - roller weight, y - depression left in the lawn
x <- df[["weight"]]
y <- df[["depression"]]

In [None]:
# Preview the first six rows of the dataset
head(df, n = 6L)

In [None]:
# Scatter plot of the raw data on a black canvas.
# `mar` is the outer margin (inches) applied to all four sides.
mar <- 0.2
par(
  bg = "black",
  fig = c(0, 1, 0, 1),
  omi = rep(mar, 4)
)

# Set up the coordinate system only; axes, labels and points are added below
plot(
  x = x, y = y,
  type = "n", ann = FALSE,
  xaxt = "n", yaxt = "n"
)

# Light-gray frame so it is visible on the black background
box(col = gray(0.8))

# Data points
points(x = x, y = y, col = "red", lwd = 12)

# Dotted grid aligned with the default tick positions
grid(nx = NULL, ny = NULL, lty = "dotted", col = gray(0.6))

# Bottom (1) and left (2) axes in the same light gray as the frame
axis(
  side = 1,
  col = gray(0.8), col.ticks = gray(0.8), col.axis = gray(0.8),
  cex.axis = 1.5
)
axis(
  side = 2,
  col = gray(0.8), col.ticks = gray(0.8), col.axis = gray(0.8),
  cex.axis = 1.5
)

# Bold title and axis labels
title(
  main = "The DAAG::roller dataset",
  col.main = "lightgoldenrodyellow", cex.main = 1.8, font.main = 2
)
title(
  xlab = "Weight of Roller [t]", ylab = "Depression in Lawn [mm]",
  col.lab = "white", cex.lab = 1.5, font.lab = 2
)

## 2. Fit a linear model to the data

In [None]:
# Fit a linear model of depression as a function of weight
roller.lm <- lm(formula = depression ~ weight, data = df)
# summary() reports the coefficient table, residual standard error,
# R-squared values and the overall F-test
summary(roller.lm)

### 2.1. Plot fitted line

In [None]:
# Pull the fitted parameters out of the model:
# b - intercept (first coefficient), m - slope (second coefficient)
b <- coef(roller.lm)[1]
m <- coef(roller.lm)[2]

In [None]:
# Scatter plot of the data with the fitted regression line overlaid.
# `mar` is the outer margin (inches) applied to all four sides.
mar <- 0.2
par(
  bg = "black",
  fig = c(0, 1, 0, 1),
  omi = rep(mar, 4)
)

# Set up the coordinate system only; everything else is added below
plot(
  x, y,
  xaxt = "n", yaxt = "n",
  ann = FALSE, type = "n"
)

# Light-gray frame so it is visible on the black background
box(col = gray(0.8))

# Data points
points(x, y, col = "red", lwd = 12)
# Fitted line (abline() reads intercept and slope from the model object)
abline(roller.lm, col = "green", lty = 2, lwd = 3)

grid(nx = NULL, ny = NULL, lty = 3, col = gray(0.6))

axis(1, col = gray(0.8), col.ticks = gray(0.8), col.axis = gray(0.8),
     cex.axis = 1.5)
axis(2, col = gray(0.8), col.ticks = gray(0.8), col.axis = gray(0.8),
     cex.axis = 1.5)

# Legend keys mirror what is actually drawn: a point marker for the data
# (pch, no line) and a dashed line for the fit (lty, no marker).
# "topleft" keeps the legend inside the plot region regardless of data range.
legend(
  "topleft",
  legend = c("Datapoints", "Fitted line"),
  pch = c(1, NA), pt.lwd = 4,
  lty = c(NA, 2), lwd = 3,
  col = c("red", "green"), text.col = "white"
)

title(
  main = "Linear fit on the DAAG::roller dataset",
  col.main = "lightgoldenrodyellow", cex.main = 1.8, font.main = 2
)
title(
  xlab = "Weight of Roller [t]", ylab = "Depression in Lawn [mm]",
  col.lab = "white", cex.lab = 1.5, font.lab = 2
)

### 2.2. Diagnostic plots

In [None]:
# Standard lm diagnostics:
#   1 - residuals vs fitted values
#   2 - normal probability (Q-Q) plot of the residuals
plot(roller.lm, which = c(1L, 2L))

### 2.3. Plot residuals

In [None]:
# Residuals of the fit (observed minus fitted depression)
r <- residuals(roller.lm)

In [None]:
# Residuals of the linear fit plotted against roller weight.
# `mar` is the outer margin (inches) applied to all four sides.
mar <- 0.2
par(
  bg = "black",
  fig = c(0, 1, 0, 1),
  omi = rep(mar, 4)
)

# Set up the coordinate system only; everything else is added below
plot(
  x, r,
  xaxt = "n", yaxt = "n",
  ann = FALSE, type = "n"
)

# Vertical dashed stems from the zero line to each residual.
# segments() is vectorised, so no index loop is needed (and an empty
# residual vector simply draws nothing).
segments(
  x0 = x, y0 = 0, x1 = x, y1 = r,
  col = gray(0.8), lty = 2, lwd = 2
)

# Light-gray frame so it is visible on the black background
box(col = gray(0.8))

# Residuals as crosses
points(x, r, col = "red", pch = 4, lwd = 5)

# Zero-residual reference line
abline(h = 0, col = "green", lty = 2, lwd = 3)

grid(nx = NULL, ny = NULL, lty = 3, col = gray(0.6))

axis(1, col = gray(0.8), col.ticks = gray(0.8), col.axis = gray(0.8),
     cex.axis = 1.5)
axis(2, col = gray(0.8), col.ticks = gray(0.8), col.axis = gray(0.8),
     cex.axis = 1.5)

title(
  main = "Residuals of the linear fit on\nthe DAAG::roller dataset",
  col.main = "lightgoldenrodyellow", cex.main = 1.8, font.main = 2
)
# Residuals carry the units of the response (depression, mm), not of weight
title(
  xlab = "Weight of Roller [t]", ylab = "Residuals [mm]",
  col.lab = "white", cex.lab = 1.5, font.lab = 2
)

## 3. ANOVA table

In [None]:
# Analysis-of-variance table for the fitted model; kept in `anov`
# so the sums of squares can be reused below
anov <- anova(roller.lm)
anov

#### Calculate $R^{2}$ and adjusted $R^{2}$ (coefficient of determination)

In [None]:
# R^2 = SS_regression / SS_total.
# The ANOVA table has two rows: (1) weight (regression), (2) Residuals,
# so the total sum of squares is the sum of both rows.
anov$`Sum Sq`[1] / sum(anov$`Sum Sq`)

In [None]:
# Adjusted R^2 = 1 - (SS_residual / df_residual) / (SS_total / df_total).
# Row 2 of the ANOVA table holds the residuals; totals are the column sums.
1 - (anov$`Sum Sq`[2] / anov$Df[2]) / (sum(anov$`Sum Sq`) / sum(anov$Df))