-
Notifications
You must be signed in to change notification settings - Fork 0
/
logbook.Rmd
331 lines (250 loc) · 11.5 KB
/
logbook.Rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
---
title: "Logbook Typo Challenge data analysis"
output: html_document
---
```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = TRUE)
```
```{r}
library(ggplot2)
library(dplyr)
```
```{r}
######################################################
### read contributions ###
######################################################
date_of_contributions <- "Final"
dat <- readRDS(paste0("./contributions/", date_of_contributions, "/contributions.rds"))
n_entries_per_contribution <- vapply(dat, function(x) nrow(x$data), integer(1))
tmp <- do.call("rbind", lapply(dat, "[[", "data"))
ret <- cbind(id = rep(names(dat), n_entries_per_contribution), tmp, stringsAsFactors = FALSE)
rownames(ret) <- NULL
######################################################
### source functions ###
######################################################
source("R/functions.R")
```
```{r}
######################################################
### Characteristics of contributions ###
######################################################
### number of contributions ###
id_contributions <- names(dat)
n_contributions <- length(id_contributions)
### Number of entries by contribution ###
hist(table(ret$id), breaks = seq(0, 460, 10), col = "grey",
main = "", xlab = "Number of entry by contribution")
```
The dataset collected during the TypoChallenge contains `r nrow(ret)` entries from `r n_contributions` individual contributions. Each contribution includes an average of `r mean(table(ret$id))` entries (median=`r median(table(ret$id))`, sd=`r sd(table(ret$id))`, range=`r range(table(ret$id))`).
```{r}
######################################################
### Characteristics of errors ###
######################################################
### frequency of error (across all contributions) ###
freq_error <- table(ret$correct)['FALSE'] / sum(table(ret$correct))
### frequency of error (per contribution) ###
freq_error_per_contribution <-
sapply(id_contributions, compute_freq_error_per_contribution)
par(mfrow=c(1, 2))
hist(freq_error_per_contribution, breaks = seq(0, 1, 0.05), col = "grey",
main = "", xlab = "Frequency of error by contribution")
plot(n_entries_per_contribution, freq_error_per_contribution,
col = scales::alpha("black", 0.5),
xlab = "Number of entries", ylab = "Frequency of error")
xx <- seq(from=0, to=450, length.out = 1000)
for(i in 1:10)
lines(xx,i/xx,col=gray(.8))
### looking at the errors made ###
erroneous_ret <- ret[!ret$correct,]
head(erroneous_ret, 20)
```
```{r}
data_modified <- dplyr::filter(ret, elapsed < 50) %>% mutate(perc_error=100*(1-as.numeric(correct)))
p <- ggplot(data_modified, aes(x=elapsed, y=perc_error)) + geom_point(size=2, alpha=0.4) +
stat_smooth(method="loess", colour="blue", size=1.5) + facet_wrap(vars(date_format)) +
theme_bw() + xlab("Time taken to type (sec)") + ylab("Frequency of error (%)") + ylim(0,105)
p
ggsave("Freq_error.png")
```
# ```{r}
# ######################################################
# ### Characteristics of participants ###
# ######################################################
#
# countries_from <- unlist(sapply(1:length(dat), function(e) dat[[e]]$survey$country_from))
# table(countries_from)
#
# countries_residence <- unlist(sapply(1:length(dat), function(e) dat[[e]]$survey$country_residence))
# table(countries_residence)
#
# ######################################################
# ### Plot a map of the origin of contributors ###
# ######################################################
#
# library("rworldmap")
#
# par(mfrow=c(1, 1))
#
# count_countries <- table(countries_from)
# count_countries <- count_countries[-1]
#
# origin_map_df <- data.frame(country = names(count_countries), freq = as.integer(count_countries), stringsAsFactors = F)
#
# origin_map_df$country[which(origin_map_df$country=="Czech Rep.")] <- "Czech republic"
# origin_map_df$country[which(origin_map_df$country=="Dominican Rep.")] <- "Dominican republic"
#
# origin_Map <- joinCountryData2Map(origin_map_df, joinCode = "NAME", verbose = TRUE,
# nameJoinColumn = "country")
#
# mapCountryData(origin_Map, nameColumnToPlot="freq", xlim= c(-160,180), ylim = c(-0,70) , catMethod = "pretty",
# missingCountryCol = gray(.6), oceanCol="#f0f0ffff", addLegend="T",mapTitle = "Typo Challenge contributors")
# ```
```{r}
date1_char <- erroneous_ret$date # true date
date2_char <- erroneous_ret$user # date entered by user
erroneous_ret <- ret[!ret$correct,]
date1_char <- erroneous_ret$date # true date
date2_char <- erroneous_ret$user # date entered by user
erroneous_ret <- data.frame(erroneous_ret, detection = NA)
levels(erroneous_ret$detection) <- c("D|M|Y","empty","FullSpell","OneSep","NoSep")
# regular expression to track day sep1 month sep2 year pattern
# ?grep ?regex
re <- "^'?([0-9]+)([[:punct:]]+)([0-9]+)([[:punct:]]+)([0-9]+)[[:punct:]]*$"
recognised_by_regex <- grepl(re, date2_char)
# empty entries
empty_entries <- date2_char == ""
erroneous_ret$detection[recognised_by_regex] <- "D|M|Y"
# empty entries
empty_entries <- date2_char == ""
erroneous_ret$detection[empty_entries] <- "empty"
# fully spelt out date (with space or punctuation to separate dates)
re2 <- "^'?([0-9]+)([[:space:][:punct:]]+)([[:alpha:]]+)([[:space:][:punct:]]+)([0-9]+)[[:punct:]]*$"
full_spell_out_entries <- grepl(re2, date2_char)
date2_char[full_spell_out_entries]
erroneous_ret$detection[full_spell_out_entries] <- "FullSpell"
# missing one field (only one separator)
re3 <- "^'?([0-9]+)([[:punct:]]+)([0-9]+)[[:punct:]]*$"
missing_one_numeric_field <- grepl(re3, date2_char)
erroneous_ret$detection[missing_one_numeric_field] <- "OneSep"
# missing two fields (no separator)
re4 <- "^'?([0-9]+)[[:punct:]]*$"
missing_two_numeric_fields <- grepl(re4, date2_char)
erroneous_ret$detection[missing_two_numeric_fields] <- "NoSep"
### what are we currently not capturing?
captured <- recognised_by_regex | empty_entries | full_spell_out_entries | missing_one_numeric_field | missing_two_numeric_fields
date2_char[!captured]
date1_char[!captured]
prop.table(table(captured)) # <1% of errors not captured by current algorithm --> can look at these manually really!
# table(recognised_by_regex , empty_entries , full_spell_out_entries , missing_one_numeric_field, missing_two_numeric_fields)
### for those recognised by regular expressions, get day, month, year out of there ###
x <- date2_char[recognised_by_regex]
recovered_values <- list(day = as.numeric(sub(re, "\\1", x)),
sep1 = sub(re, "\\2", x),
month = as.numeric(sub(re, "\\3", x)),
sep2 = sub(re, "\\4", x),
year = as.numeric(sub(re, "\\5", x)))
four_digit_year <- nchar(sub(re, "\\5", x)) == 4
#res <- logical(length(recognised_by_regex))
#res[!recognised_by_regex] <- NA
#res[recognised_by_regex][four_digit_year] <- TRUE
valid_day <- (recovered_values$day > 0) & (recovered_values$day <= 31)
valid_month <- (recovered_values$month > 0) & (recovered_values$month <= 12)
valid_year <- (recovered_values$year > 1899) & (recovered_values$year <= 2100)
valid_date <- valid_day & valid_month & valid_year
recov_dates <- rep(0,length(erroneous_ret$user))
recov_date_correct <- rep(FALSE,length(erroneous_ret$user))
recov_is_date <- rep(FALSE,length(erroneous_ret$user))
for(i in 1:length(erroneous_ret$user))
{
e_s <- erroneous_ret$user[i]
if(grepl(re, e_s)){
recov_dates[i] <- paste0(as.numeric(sub(re, "\\1", e_s)),"/",as.numeric(sub(re, "\\3", e_s)),"/",year = as.numeric(sub(re, "\\5", e_s)))
tentative_date <- as.Date(recov_dates[i], format="%d/%m/%Y")
if(!is.na(tentative_date)){
recov_date_correct[i] <- (as.Date(erroneous_ret$date[i], format="%d/%m/%Y") == tentative_date)
recov_is_date[i] <- TRUE
}
} else {
recov_dates[i] <- NA
}
#recov_date_correct[is.na(recov_date_correct)] <- FALSE
}
```
```{r}
library(ggwaffle)
erroneous_ret$detection <- as.character(erroneous_ret$detection)
waffle_data <- waffle_iron(erroneous_ret, aes_d(group = detection), rows=30)
ggplot(waffle_data, aes(x, y, fill = group)) +
geom_waffle() +
coord_equal() +
scale_fill_waffle() +
theme_waffle()
```
There are `r nrow(ret)` entries, of which `r sum(data_modified$correct == FALSE)` are not matching the original. Among these non matching dates, `r sum(recov_is_date)` can be recovered (using regular expressions) as valid dates. Among these valid recovered dates, `r sum(recov_date_correct)` can be matched back to the correct original date. We thus have `r sum(recov_is_date)-sum(recov_date_correct)` dates which can be used to study the error model (`r (sum(recov_is_date)-sum(recov_date_correct))/nrow(ret)*100 `% of the entries).
```{r}
n_erroneous <- length(erroneous_ret$date)
error_class <- rep("non_classified",n_erroneous)
for(i in 1:n_erroneous)
{
#NB one date can be no date recovered and swap if the swap led to date not existing
#at the moment this is classified as a swap; later best to classify using a date_valid column?
if(recognised_by_regex[i]){
if(recov_date_correct[i]) {
error_class[i] <- "recovered_correct"} else {
error_class[i] <- error_type(erroneous_ret$date[i],recover_date(erroneous_ret$user[i]))
}
} else { error_class[i] <- "not_recovered" }
}
erroneous_ret$error_class <- error_class
table(erroneous_ret$error_class)
```
#### Maybe have a "likely problem" column to detect what is likely to have been the problem based on regular expression
#### Look at different keyborad in term of neighbouring
```{r}
waffle_data <- waffle_iron(erroneous_ret, aes_d(group = error_class), rows=40)
ggplot(waffle_data, aes(x, y, fill = group)) +
geom_waffle() +
coord_equal() +
theme_waffle()
```
# Overall distribution of errors
```{r}
table(erroneous_ret$error_class)/(length(erroneous_ret$error_class)-sum(erroneous_ret$error_class=="recovered_correct"))*100
hw <- (erroneous_ret$date_format == "Handwritten")
```
# Distribution of errors for handwritten data
```{r}
hw <- (erroneous_ret$date_format == "Handwritten")
table(erroneous_ret$error_class[hw])/(length(erroneous_ret$error_class[hw])-sum(erroneous_ret$error_class[hw]=="recovered_correct"))*100
```
# Distribution of errors for D/M/Y data
```{r}
dmy <- (erroneous_ret$date_format == "TextDayMonthYear")
table(erroneous_ret$error_class[dmy])/(length(erroneous_ret$error_class[dmy])-sum(erroneous_ret$error_class[dmy]=="recovered_correct"))*100
```
# Distribution of errors for calendar data
```{r}
cal <- (erroneous_ret$date_format == "Calendar")
table(erroneous_ret$error_class[cal])/(length(erroneous_ret$error_class[cal])-sum(erroneous_ret$error_class[cal]=="recovered_correct"))*100
```
# Error rates
```{r}
#overall
100*(length(erroneous_ret$error_class)-sum(erroneous_ret$error_class=="recovered_correct"))/length(data_modified$date_format)
#handwritten
100*(sum(hw)-sum((erroneous_ret$error_class=="recovered_correct")&hw))/sum(data_modified$date_format=="Handwritten")
#D/M/Y
100*(sum(dmy)-sum((erroneous_ret$error_class=="recovered_correct")&dmy))/sum(data_modified$date_format=="TextDayMonthYear")
#Calendar
100*(sum(cal)-sum((erroneous_ret$error_class=="recovered_correct")&cal))/sum(data_modified$date_format=="Calendar")
```
```{r}
idx <- (erroneous_ret$error_class=="recovered_correct")|(erroneous_ret$error_class=="not_recovered")
t_multinom <- table(erroneous_ret$error_class[!idx])
t_multinom/sum(t_multinom)
```
```{r}
list_errors <- list(external_swap=0.0688,internal_swap=0.0112,neighbour_substitution=0.2784,distant_substitution=0.3656,random=0.2760)
mat_e_d <- calculate_date_matrix(start_date=as.Date("28/12/2019", "%d/%m/%Y"),end_date=as.Date("27/1/2020", "%d/%m/%Y"), p_error=list_errors)
heatmap(mat_e_d, Rowv=NA, Colv = "Rowv")
```