# SimpleLinearRegression.R
library(UsingR)    # Provides the father.son data set.
library(tidyverse)
library(caret)     # Provides RMSE(), MAE(), and createDataPartition().
library(tidyquant) # Provides theme_tq(), used in Example 2.
# Example 1: Father-son heights
###############################
# Load the data.
# Data source: the data set Karl Pearson used to investigate regression, accessed via the UsingR package.
data(father.son)
# Calculate statistics for father and son heights.
tibble(
  FatherMean = mean(father.son$fheight),
  FatherStandardDeviation = sd(father.son$fheight),
  SonMean = mean(father.son$sheight),
  SonStandardDeviation = sd(father.son$sheight)
)
# View the underlying distribution of father and son heights.
# To do this, we need to convert the data from wide format to long format.
father.son %>%
  pivot_longer(cols = c(fheight, sheight), names_to = "group", values_to = "height") %>%
  ggplot(aes(x = height, fill = group)) +
  geom_histogram(bins = 30, alpha = 0.5) +
  labs(
    title = "Father-Son Height Distribution\n",
    x = "\nHeight (in.)",
    y = "Frequency\n"
  ) +
  theme_bw() +
  scale_fill_discrete(name = "Group:", labels = c("Father", "Son"))
# Look for correlation, both visually and with Pearson's correlation coefficient.
r <- cor(father.son$fheight, father.son$sheight)
r
father.son %>%
  ggplot(aes(x = fheight, y = sheight)) +
  geom_point() +
  labs(
    title = "Father-Son Height Correlation\n",
    x = "\nFather's Height (in.)",
    y = "Son's Height (in.)\n"
  ) +
  theme_bw() +
  geom_smooth(method = "lm", formula = y ~ x, se = FALSE, color = "tomato2")
# Find the equation for the linear regression line.
m <- lm(sheight ~ fheight, father.son)
summary(m)
cat(paste0("Son's height is predicted by father's height with the following ",
"formula:\nson_height = (", round(m$coefficients[[1]], 1), " + ",
round(m$coefficients[[2]], 1), " * father_height)."))
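# Sanity check (a sketch, not part of the original analysis): the OLS slope and
# intercept can be recovered from the correlation and the two standard deviations,
# since slope = r * sd(y) / sd(x) and intercept = mean(y) - slope * mean(x).
b1 <- r * sd(father.son$sheight) / sd(father.son$fheight)
b0 <- mean(father.son$sheight) - b1 * mean(father.son$fheight)
c(Intercept = b0, Slope = b1) # Should match coef(m) up to rounding.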
# Using machine learning methodology, a train-test partition allows for testing
# the model's accuracy: the model is fit on the training set and evaluated on
# the test set. Here the first 809 rows (roughly 75% of the data) form the
# training set; note this split is sequential rather than random.
train.heights <- father.son[1:809,]
test.heights <- father.son[810:nrow(father.son),]
model <- lm(sheight ~ fheight, train.heights)
pred <- predict(model, test.heights, type = "response")
# Notice this produces slightly different coefficients.
cat(paste0("Son's height is predicted by father's height with the following ",
"formula:\nson_height = (", round(model$coefficients[[1]], 1), " + ",
round(model$coefficients[[2]], 1), " * father_height)."))
# Compare the trained model's predictions against ground truth.
tibble(
  Predicted = pred,
  Actual = test.heights$sheight,
  Error = test.heights$sheight - pred
) %>% View()
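# Summarize test-set accuracy in a single pair of numbers (a sketch using
# caret's RMSE() and MAE() helpers, which the library() call above provides).
tibble(
  RMSE = RMSE(pred, test.heights$sheight),
  MAE = MAE(pred, test.heights$sheight)
)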
# Model Diagnostics
###################
# Create a residual plot to assess regression model assumptions.
# Residuals are computed as actual minus predicted, matching the Error column above.
ggplot() +
  geom_hline(yintercept = 0, linewidth = 1, color = "steelblue4") +
  geom_point(aes(x = pred, y = (test.heights$sheight - pred))) +
  labs(
    title = "Residual Plot - Linear Regression Model\n",
    x = "\nPredicted Height (in.)",
    y = "Residual\n"
  ) +
  theme_bw()
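# A normal Q-Q plot of the test residuals offers a further normality check
# (a base-graphics sketch; plot(model) would produce the standard diagnostics).
test.residuals <- test.heights$sheight - pred
qqnorm(test.residuals, main = "Normal Q-Q Plot - Test Residuals")
qqline(test.residuals, col = "steelblue4")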
# View the model's residuals, coefficients, coefficient test statistics, R^2 value,
# and F-statistic.
summary(model)
cat(paste0(round((summary(model)$adj.r.squared * 100), 1), "% of the variability in ",
           "sons' heights is explained by the variability in fathers' heights."))
# Example 2: M1 money supply
############################
# Load the data.
# Data source: FRED (Federal Reserve Economic Data), series WM1NS; see the URL below.
url <- "https://fred.stlouisfed.org/graph/fredgraph.csv?bgcolor=%23e1e9f0&chart_type=line&drp=0&fo=open%20sans&graph_bgcolor=%23ffffff&height=450&mode=fred&recession_bars=on&txtcolor=%23444444&ts=12&tts=12&width=1168&nt=0&thu=0&trc=0&show_legend=yes&show_axis_titles=yes&show_tooltip=yes&id=WM1NS&scale=left&cosd=1975-01-06&coed=2021-04-05&line_color=%234572a7&link_values=false&line_style=solid&mark_type=none&mw=3&lw=2&ost=-99999&oet=99999&mma=0&fml=a&fq=Weekly%2C%20Ending%20Monday&fam=avg&fgst=lin&fgsnd=2020-02-01&line_index=1&transformation=lin&vintage_date=2021-05-22&revision_date=2021-05-22&nd=1975-01-06"
m1 <- read.csv(url) %>% as_tibble()
# Wrangle the data.
m1 <- m1 %>%
  rename(Date = DATE, Supply = WM1NS) %>%
  mutate(Date = as.Date(Date))
# Visualize the underlying data.
m1 %>%
  ggplot(aes(x = Date, y = Supply)) +
  geom_line() +
  geom_smooth(method = "lm", formula = y ~ x) +
  scale_y_log10() +
  theme_tq() +
  labs(
    title = "M1 Money Supply\n",
    x = "\nDate",
    y = "Money Supply (Billions of USD)\n",
    caption = "Source: FRED Economic Data"
  )
# Notice there is a large spike in the recent data. Check for outliers.
stats <- tibble(
  Mean = mean(m1$Supply),
  StandardDeviation = sd(m1$Supply),
  Q1 = summary(m1$Supply)[2],
  Q3 = summary(m1$Supply)[5]
)
iqr <- stats$Q3 - stats$Q1
outlier.low <- stats$Q1 - (1.5 * iqr)
outlier.high <- stats$Q3 + (1.5 * iqr)
m1$Supply[m1$Supply > outlier.high]
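# Count how many points each fence flags (a quick sketch; with a strictly
# positive, fast-growing series the low fence likely flags nothing).
c(Low = sum(m1$Supply < outlier.low), High = sum(m1$Supply > outlier.high))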
# Too many outliers to remove with this method.
# Build a train-test split, randomly selecting 75% of the data.
set.seed(23482394) # Included for reproducibility.
train.index <- sample(seq_len(nrow(m1)), size = floor(nrow(m1) * 0.75), replace = FALSE)
test.index <- setdiff(seq_len(nrow(m1)), train.index)
train.data <- m1[train.index,]
test.data <- m1[test.index,]
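# An equivalent split via caret (shown as a sketch and left commented out so the
# seeded sample() split above remains the one used; createDataPartition()
# stratifies on the outcome):
# partition <- createDataPartition(m1$Supply, p = 0.75, list = FALSE)
# train.data <- m1[partition,]
# test.data <- m1[-partition,]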
# Train a linear regression model.
model <- lm(Supply ~ Date, data = train.data)
# Predict the test data using the trained model.
pred <- predict(model, test.data, type = "response")
# Evaluate the model performance.
summary(model)
tibble(
  GroundTruth = test.data$Supply,
  Prediction = pred,
  Error = GroundTruth - Prediction # Actual minus predicted, as in Example 1.
)
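# Summarize test-set error as in Example 1 (a sketch using caret's helpers).
tibble(
  RMSE = RMSE(pred, test.data$Supply),
  MAE = MAE(pred, test.data$Supply)
)
# The log10 axis above suggests roughly exponential growth, so a log-linear fit
# is a plausible alternative (a minimal sketch, not part of the original analysis):
model.log <- lm(log10(Supply) ~ Date, data = train.data)
pred.log <- 10 ^ predict(model.log, test.data)
RMSE(pred.log, test.data$Supply)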