-
Notifications
You must be signed in to change notification settings - Fork 9
/
2_distributions.R
191 lines (115 loc) · 3.61 KB
/
2_distributions.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
# lab code for normal and binomial distribution
# randomness ----
# simulate
sample(1:6, size = 2, replace = T)
sample(1:3, size = 3, replace = F)
sample(1:3, size = 3, replace = T) # replacement is allowed
# if set seed, the result is the same
# you need to run the line set.seed(yourseed) right before
set.seed(1)
sample(1:6, size = 2, replace = T)
# binomial distribution -----
# for this simulation, we do not need to set seed
# because the reproducibility is not too important for the demo
x <- rbinom(n = 500, size = 1, prob = 0.5)
hist(x)
# test different prob
x <- rbinom(n = 500, size = 1, prob = 0.15)
hist(x)
# test out different size
x <- rbinom(n = 500, size = 8, prob = 0.5)
hist(x)
# increase n and p (size and prob)
x <- rbinom(n = 500, size = 30, prob = 0.5)
hist(x, breaks = 20)
# more data points
x <- rbinom(n = 2000, size = 30, prob = 0.5)
hist(x, breaks = 20)
# normal distribution -----
# two parameters: mean and sd
x <- rnorm(1000, mean = 0, sd = 1)
x <- rnorm(1000, mean = 10, sd = 2)
# visualize
hist(x)
# check the summary
summary(x)
# mean
mean(x)
# variance and sd
var(x)
sd(x)
# (sd(x))^2 # these two are equivalent
# can plot the mean on the histogram to indicate the center
hist(x)
# v means verticle line, lwd means line width
abline(v = mean(x), col = 'red', lwd = 2)
# quantiles and probability
p1 <- pnorm(1.96, mean = 0, sd = 1)
p2 <- pnorm(-1.96, mean = 0, sd = 1)
1-p1 # equal to p2
# bar plot ----
# make bar plot given counts
dd <- data.frame(prob = c(0.03, 0.14, 0.13, 0.38, 0.33),
n = c(3,16,15,44,38))
rownames(dd) <- c('0-17', '18-24', '25-34', '35-64', '65+')
# bar plot for counts
barplot(dd$n, names.arg = rownames(dd),
main = 'Number of killed for road accidents')
# bar plot for probability
barplot(dd$prob, names.arg = rownames(dd),
main = 'Proportion for killed road accidents')
# normal distribution: birth -----
# load birth data first
# if you forgot, check notes from day 2 (descriptive stat)
birth
# we use the variable bwt
birth_weight <- birth$bwt
hist(birth_weight)
# find the probability above 4000
# solution 1: from empirical data
# first find how many are above 400 (counting)
birth_weight>4000
# we see 9 TRUE
# which finds the indices
# length counts the number of elements
which(birth_weight >4000)
length(which(birth_weight >4000)==T)
# alternatively, since R codes T as 1 and F as 0
# we can use sum() command
sum(birth_weight>4000)
# probability
9/189
# similarly, we can find weight between 2800 and 3000 (including)
which(birth_weight <= 3000 & birth_weight >= 2800)
length(which(birth_weight <= 3000 & birth_weight >= 2800))
# probability
20/189
# solution 2
# use normal approximation
# first get the parameters mean and sd (or sqrt variance)
m <- mean(birth_weight)
s <- sd(birth_weight)
# s <- sqrt(var(birth_weight))
# probability of birthweight above 4000
# if you want
pnorm(4000, mean = m, sd =s, lower.tail = F)
# you can also use the standard normal dist
# whose mean and sd (var) are 0 and 1
# can be translated as a second variable Y above 1.45
# see lecture notes for why this is the case!
pnorm(1.45, lower.tail = F)
# binomial vs normal ----
# the approximation is N(np, npq) if n is large, p close to 0.5
# note: the small n (first argument) is how many random samples you want
# it is different from the 'size' argument in binomial distribution
N <- 50
P <- 0.5
# binom(50, 0.5)
x_binom <- rbinom(n = 1000, size = N, prob = P)
hist(x_binom)
# normal
x_norm <- rnorm(n = 1000, mean = N*P, sd = sqrt(50*0.5*(1-0.5)))
hist(x_norm)
# take summary statistics
summary(x_binom)
summary(x_norm)