logbook.Rmd

---
title: "Logbook Typo Challenge data analysis"
output: html_document
---

```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = TRUE)
```

```{r}
library(ggplot2)
library(dplyr)
```

```{r}
######################################################
### read contributions ###
######################################################

date_of_contributions <- "Final"

dat <- readRDS(paste0("./contributions/", date_of_contributions, "/contributions.rds"))

n_entries_per_contribution <- vapply(dat, function(x) nrow(x$data), integer(1))
tmp <- do.call("rbind", lapply(dat, "[[", "data"))
ret <- cbind(id = rep(names(dat), n_entries_per_contribution), tmp, stringsAsFactors = FALSE)
rownames(ret) <- NULL

######################################################
### source functions ###
######################################################
source("R/functions.R")
```

```{r}
######################################################
### Characteristics of contributions ###
######################################################

### number of contributions ###
id_contributions <- names(dat)
n_contributions <- length(id_contributions)

### Number of entries by contribution ### 
hist(table(ret$id), breaks = seq(0, 460, 10), col = "grey", 
     main = "", xlab = "Number of entry by contribution")

```

The dataset collected during the TypoChallenge contains `r nrow(ret)` entries from `r n_contributions` individual contributions. Each contribution includes an average of `r mean(table(ret$id))` entries (median=`r median(table(ret$id))`, sd=`r sd(table(ret$id))`, range=`r range(table(ret$id))`).

```{r}
######################################################
### Characteristics of errors ###
######################################################

### frequency of error (across all contributions) ###
freq_error <- table(ret$correct)['FALSE'] / sum(table(ret$correct))

### frequency of error (per contribution) ###
freq_error_per_contribution <- 
  sapply(id_contributions, compute_freq_error_per_contribution)

par(mfrow=c(1, 2))
hist(freq_error_per_contribution, breaks = seq(0, 1, 0.05), col = "grey", 
     main = "", xlab = "Frequency of error by contribution")
plot(n_entries_per_contribution, freq_error_per_contribution,
     col = scales::alpha("black", 0.5),
     xlab = "Number of entries", ylab = "Frequency of error")

xx <- seq(from=0, to=450, length.out = 1000)
for(i in 1:10)
  lines(xx,i/xx,col=gray(.8))

### looking at the errors made ###
erroneous_ret <- ret[!ret$correct,]

head(erroneous_ret, 20)
```

```{r}
data_modified <- dplyr::filter(ret, elapsed < 50) %>% mutate(perc_error=100*(1-as.numeric(correct)))

p <- ggplot(data_modified, aes(x=elapsed, y=perc_error)) + geom_point(size=2, alpha=0.4) +
stat_smooth(method="loess", colour="blue", size=1.5) + facet_wrap(vars(date_format)) +
  theme_bw() + xlab("Time taken to type (sec)") + ylab("Frequency of error (%)") + ylim(0,105)
p
ggsave("Freq_error.png")

```

# ```{r}
# ######################################################
# ### Characteristics of participants ###
# ######################################################
# 
# countries_from <- unlist(sapply(1:length(dat), function(e) dat[[e]]$survey$country_from))
# table(countries_from)
# 
# countries_residence <- unlist(sapply(1:length(dat), function(e) dat[[e]]$survey$country_residence))
# table(countries_residence)
# 
# ######################################################
# ### Plot a map of the origin of contributors ###
# ######################################################
# 
# library("rworldmap")
# 
# par(mfrow=c(1, 1))
# 
# count_countries <- table(countries_from)
# count_countries <- count_countries[-1]
# 
# origin_map_df <- data.frame(country = names(count_countries), freq = as.integer(count_countries), stringsAsFactors = F)
# 
# origin_map_df$country[which(origin_map_df$country=="Czech Rep.")] <- "Czech republic"
# origin_map_df$country[which(origin_map_df$country=="Dominican Rep.")] <- "Dominican republic"
# 
# origin_Map <- joinCountryData2Map(origin_map_df, joinCode = "NAME", verbose = TRUE,
#                               nameJoinColumn = "country")
# 
# mapCountryData(origin_Map, nameColumnToPlot="freq", xlim= c(-160,180), ylim = c(-0,70) , catMethod = "pretty",
#                missingCountryCol = gray(.6), oceanCol="#f0f0ffff", addLegend="T",mapTitle = "Typo Challenge contributors")
# ```

```{r}
date1_char <- erroneous_ret$date # true date
date2_char <- erroneous_ret$user # date entered by user

erroneous_ret <- ret[!ret$correct,]

date1_char <- erroneous_ret$date # true date
date2_char <- erroneous_ret$user # date entered by user

erroneous_ret <- data.frame(erroneous_ret, detection = NA)

levels(erroneous_ret$detection) <- c("D|M|Y","empty","FullSpell","OneSep","NoSep")

# regular expression to track day sep1 month sep2 year pattern
# ?grep ?regex
re <- "^'?([0-9]+)([[:punct:]]+)([0-9]+)([[:punct:]]+)([0-9]+)[[:punct:]]*$"
recognised_by_regex <- grepl(re, date2_char)

# empty entries
empty_entries <- date2_char == ""
erroneous_ret$detection[recognised_by_regex] <- "D|M|Y"

# empty entries
empty_entries <- date2_char == ""
erroneous_ret$detection[empty_entries] <- "empty"

# fully spelt out date (with space or punctuation to separate dates)
re2 <- "^'?([0-9]+)([[:space:][:punct:]]+)([[:alpha:]]+)([[:space:][:punct:]]+)([0-9]+)[[:punct:]]*$"
full_spell_out_entries <- grepl(re2, date2_char)
date2_char[full_spell_out_entries]
erroneous_ret$detection[full_spell_out_entries] <- "FullSpell"

# missing one field (only one separator)
re3 <- "^'?([0-9]+)([[:punct:]]+)([0-9]+)[[:punct:]]*$"
missing_one_numeric_field <- grepl(re3, date2_char)
erroneous_ret$detection[missing_one_numeric_field] <- "OneSep"


# missing two fields (no separator)
re4 <- "^'?([0-9]+)[[:punct:]]*$"
missing_two_numeric_fields <- grepl(re4, date2_char)
erroneous_ret$detection[missing_two_numeric_fields] <- "NoSep"

### what are we currently not capturing?  
captured <- recognised_by_regex | empty_entries | full_spell_out_entries | missing_one_numeric_field | missing_two_numeric_fields
date2_char[!captured] 
date1_char[!captured] 
prop.table(table(captured)) # <1% of errors not captured by current algorithm --> can look at these manually really!
# table(recognised_by_regex , empty_entries , full_spell_out_entries , missing_one_numeric_field, missing_two_numeric_fields)

### for those recognised by regular expressions, get day, month, year out of there ###
x <- date2_char[recognised_by_regex]
recovered_values <- list(day = as.numeric(sub(re, "\\1", x)),
     sep1 = sub(re, "\\2", x),
     month = as.numeric(sub(re, "\\3", x)),
     sep2 = sub(re, "\\4", x),
     year = as.numeric(sub(re, "\\5", x)))

four_digit_year <- nchar(sub(re, "\\5", x)) == 4

#res <- logical(length(recognised_by_regex))
#res[!recognised_by_regex] <- NA
#res[recognised_by_regex][four_digit_year] <- TRUE

valid_day <- (recovered_values$day > 0) & (recovered_values$day <= 31)
valid_month <- (recovered_values$month > 0) & (recovered_values$month <= 12)
valid_year <- (recovered_values$year > 1899) & (recovered_values$year <= 2100)

valid_date <- valid_day & valid_month & valid_year

recov_dates <- rep(0,length(erroneous_ret$user))
recov_date_correct <- rep(FALSE,length(erroneous_ret$user))
recov_is_date <- rep(FALSE,length(erroneous_ret$user))
for(i in 1:length(erroneous_ret$user))
{
  e_s <- erroneous_ret$user[i]
  if(grepl(re, e_s)){
    recov_dates[i] <- paste0(as.numeric(sub(re, "\\1", e_s)),"/",as.numeric(sub(re, "\\3", e_s)),"/",year = as.numeric(sub(re, "\\5", e_s)))
    tentative_date <- as.Date(recov_dates[i], format="%d/%m/%Y")
    if(!is.na(tentative_date)){
      recov_date_correct[i] <- (as.Date(erroneous_ret$date[i], format="%d/%m/%Y") == tentative_date)
      recov_is_date[i] <- TRUE
    }
    
    } else {
     recov_dates[i] <- NA 
    }
  #recov_date_correct[is.na(recov_date_correct)] <- FALSE 
}
```
```{r}
library(ggwaffle)

erroneous_ret$detection <- as.character(erroneous_ret$detection)
waffle_data <- waffle_iron(erroneous_ret, aes_d(group = detection), rows=30)

ggplot(waffle_data, aes(x, y, fill = group)) + 
  geom_waffle() + 
  coord_equal() +
  scale_fill_waffle() + 
  theme_waffle()

```

There are `r nrow(ret)` entries, of which `r sum(data_modified$correct == FALSE)` are not matching the original. Among these non matching dates, `r sum(recov_is_date)` can be recovered (using regular expressions) as valid dates. Among these valid recovered dates, `r sum(recov_date_correct)` can be matched back to the correct original date. We thus have `r sum(recov_is_date)-sum(recov_date_correct)` dates which can be used to study the error model (`r (sum(recov_is_date)-sum(recov_date_correct))/nrow(ret)*100 `% of the entries).

```{r}
n_erroneous <- length(erroneous_ret$date)

error_class <- rep("non_classified",n_erroneous)

for(i in 1:n_erroneous)
{
  #NB one date can be no date recovered and swap if the swap led to date not existing
  #at the moment this is classified as a swap; later best to classify using a date_valid column?
  if(recognised_by_regex[i]){
    
  if(recov_date_correct[i]) {
    error_class[i] <- "recovered_correct"} else {
    error_class[i] <- error_type(erroneous_ret$date[i],recover_date(erroneous_ret$user[i]))  
    }  
  } else { error_class[i] <- "not_recovered" }
    
}

erroneous_ret$error_class <- error_class

table(erroneous_ret$error_class)

```

#### Maybe have a "likely problem" column to detect what is likely to have been the problem based on regular expression
#### Look at different keyborad in term of neighbouring

```{r}

waffle_data <- waffle_iron(erroneous_ret, aes_d(group = error_class), rows=40)

ggplot(waffle_data, aes(x, y, fill = group)) + 
  geom_waffle() + 
  coord_equal() +
  theme_waffle()

```

# Overall distribution of errors

```{r}
table(erroneous_ret$error_class)/(length(erroneous_ret$error_class)-sum(erroneous_ret$error_class=="recovered_correct"))*100

hw <- (erroneous_ret$date_format == "Handwritten")

```

# Distribution of errors for handwritten data

```{r}
hw <- (erroneous_ret$date_format == "Handwritten")
table(erroneous_ret$error_class[hw])/(length(erroneous_ret$error_class[hw])-sum(erroneous_ret$error_class[hw]=="recovered_correct"))*100
```

# Distribution of errors for D/M/Y data

```{r}
dmy <- (erroneous_ret$date_format == "TextDayMonthYear")
table(erroneous_ret$error_class[dmy])/(length(erroneous_ret$error_class[dmy])-sum(erroneous_ret$error_class[dmy]=="recovered_correct"))*100
```

# Distribution of errors for calendar data

```{r}
cal <- (erroneous_ret$date_format == "Calendar")
table(erroneous_ret$error_class[cal])/(length(erroneous_ret$error_class[cal])-sum(erroneous_ret$error_class[cal]=="recovered_correct"))*100
```

# Error rates

```{r}

#overall
100*(length(erroneous_ret$error_class)-sum(erroneous_ret$error_class=="recovered_correct"))/length(data_modified$date_format)

#handwritten
100*(sum(hw)-sum((erroneous_ret$error_class=="recovered_correct")&hw))/sum(data_modified$date_format=="Handwritten")

#D/M/Y
100*(sum(dmy)-sum((erroneous_ret$error_class=="recovered_correct")&dmy))/sum(data_modified$date_format=="TextDayMonthYear")

#Calendar
100*(sum(cal)-sum((erroneous_ret$error_class=="recovered_correct")&cal))/sum(data_modified$date_format=="Calendar")

```

```{r}
idx <- (erroneous_ret$error_class=="recovered_correct")|(erroneous_ret$error_class=="not_recovered")
t_multinom <- table(erroneous_ret$error_class[!idx])
t_multinom/sum(t_multinom)
```
```{r}
list_errors <- list(external_swap=0.0688,internal_swap=0.0112,neighbour_substitution=0.2784,distant_substitution=0.3656,random=0.2760)

mat_e_d <- calculate_date_matrix(start_date=as.Date("28/12/2019", "%d/%m/%Y"),end_date=as.Date("27/1/2020", "%d/%m/%Y"), p_error=list_errors)
heatmap(mat_e_d, Rowv=NA, Colv = "Rowv")
```