---
title: "WFO FuzzyJoin Example"
author: "P. Zacher"
date: "2023-10-23"
output: html_document
---
```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = TRUE)
```
## Load packages
```{r}
library(dplyr)
library(tidytext)
library(tm)
library(fuzzyjoin)
library(WorldFlora)
library(data.table)
library(stringr)
library(parallel)
library(doParallel)
library(foreach)
library(iterors)
```
## Load data
```{r}
scientific_name <- read.csv("wfo_species_example.csv") # Longest name is 6 words
example_data <- read.csv("example_data.csv")
WFO.download() # Run once to download the WFO backbone
WFO.remember() # Load the downloaded backbone into the session as WFO.data
```
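As a quick sanity check, confirm the backbone is actually in the session before
matching against it; a minimal sketch, assuming `WFO.remember()` has loaded the
backbone as `WFO.data` (as the chunks below rely on):
```{r}
# The WFO backbone should now be available as WFO.data; check size and names
dim(WFO.data)
head(WFO.data$scientificName)
```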
## Subset data frames for troubleshooting
```{r}
# Subset scientific_name, example_data, and WFO.data for troubleshooting
scientific_name_subset <- slice(scientific_name, 1:100)
example_data_subset <- slice(example_data, 1:5)
WFO.data.subset <- slice(WFO.data, 1:1000000)
# Convert all scientific names to lowercase
scientific_name_subset <- scientific_name_subset %>%
  mutate(across(everything(), tolower))
```
## Clean abstracts
```{r}
# Create a Corpus from the example abstracts (from tm package)
corpus <- Corpus(VectorSource(example_data_subset$abstract_l))
# Preprocess the corpus (remove punctuation, convert to lowercase, remove numbers)
corpus <- tm_map(corpus, content_transformer(tolower))
corpus <- tm_map(corpus, removePunctuation)
corpus <- tm_map(corpus, removeNumbers)
# Remove English stop words (e.g., "a", "the", "is", "are") from the abstracts
# using tm's built-in stop word list.
corpus <- tm_map(corpus, removeWords, stopwords("en"))
# Convert the processed corpus back to a character vector
processed_abstracts <- sapply(corpus, as.character)
# Add the processed_abstracts column to example_data_subset
example_data_subset$processed_abstracts <- processed_abstracts
# Tokenize the abstracts into n-grams of 4 words (tidytext package). Note that
# the longest name in scientific_name is 6 words, so increase `n` if
# full-length names must be captured.
tokenized_ngrams <- example_data_subset %>%
  unnest_tokens(ngram, processed_abstracts, token = "ngrams", n = 4)
# Create a subset for troubleshooting
tokenized_ngrams_subset <- slice(tokenized_ngrams, 1:300)
```
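Before matching, it is worth checking how many n-grams each abstract produced;
a minimal sketch, assuming each record carries the `uid` identifier used again
in Option 3:
```{r}
# Count the n-grams generated per abstract
tokenized_ngrams %>%
  count(uid, name = "n_ngrams")
```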
## Option 1:
### Fuzzy join using WFO.match.fuzzyjoin without parallel processing
```{r}
cuts <- cut(seq_len(nrow(tokenized_ngrams_subset)), breaks = 2, labels = FALSE)
cut.i <- sort(unique(cuts))
for (i in seq_along(cut.i)) {
  cat(paste("Cut: ", i, "\n"))
  tokenized_ngrams_subset_i <-
    WFO.one(
      WFO.match.fuzzyjoin(
        spec.data = tokenized_ngrams_subset[cuts == cut.i[i], ],
        WFO.data = WFO.data.subset,
        spec.name = "ngram",
        fuzzydist.max = 3
      ),
      verbose = FALSE
    )
  if (i == 1) {
    tokenized_ngrams_subset_WFO <- tokenized_ngrams_subset_i
  } else {
    tokenized_ngrams_subset_WFO <- rbind(tokenized_ngrams_subset_WFO,
                                         tokenized_ngrams_subset_i)
  }
}
```
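To inspect which n-grams actually matched a WFO name, filter the combined
output; a minimal sketch, assuming the result carries the `Matched` and
`Fuzzy.dist` columns that `WFO.match.fuzzyjoin()` documents (verify with
`colnames()` on your run):
```{r}
# Keep only the n-grams that matched a WFO name, ordered by fuzzy distance
tokenized_ngrams_subset_WFO %>%
  filter(Matched) %>%
  arrange(Fuzzy.dist) %>%
  select(ngram, scientificName, Fuzzy.dist) %>%
  head()
```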
## Option 2:
### Fuzzy join using WFO.match.fuzzyjoin with parallel processing
```{r}
# How many cores do you have available?
detectCores()
# Set the number of cores to use. Probably best to start with 1/2 of the cores
# available and scale up.
num_cores <- 5
# Create a cluster for parallel processing
cl <- makeCluster(num_cores)
registerDoParallel(cl)
# Break data into parts
cuts <- cut(seq_len(nrow(tokenized_ngrams_subset)), breaks = 2, labels = FALSE)
cut_i <- sort(unique(cuts))
system.time({
  # Perform parallel fuzzy join using WFO.match.fuzzyjoin
  tokenized_ngrams_WFO_parallel <-
    foreach(
      i = seq_along(cut_i),
      .combine = rbind,
      .packages = c("dplyr", "WorldFlora")
    ) %dopar% {
      WFO.one(
        WFO.match.fuzzyjoin(
          spec.data = tokenized_ngrams_subset[cuts == cut_i[i], ],
          WFO.data = WFO.data.subset,
          spec.name = "ngram",
          fuzzydist.max = 3
        ),
        verbose = FALSE
      )
    }
})
# Stop the parallel cluster
stopCluster(cl)
```
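Options 1 and 2 partition the same subset in the same way, so their combined
outputs should agree; a minimal sketch for verifying that (ignoring row-name
attributes):
```{r}
# The parallel result should match the sequential result from Option 1
all.equal(tokenized_ngrams_subset_WFO, tokenized_ngrams_WFO_parallel,
          check.attributes = FALSE)
```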
## Option 2.1:
### Fuzzy join using WFO.match.fuzzyjoin with parallel processing and chunked data frames
First define a function `tokenized_ngrams_WFO_parallel_fn` that wraps the
parallel processing code from Option 2 into a single call. Its inputs are
`i_chunk_df`, a chunked n-gram data frame (explained below), and `WFO_data`,
the WFO backbone to match against.
```{r}
# Create a function to process each chunked ngram data frame. The `breaks`
# argument can be modified to suit your setup. I set it to the same value as the
# number of cores I allocated for the task (e.g., 5) and saw faster performance
# compared to a lower number of breaks (e.g., 2).
tokenized_ngrams_WFO_parallel_fn <- function(i_chunk_df, WFO_data) {
  # Cut each chunked data frame into smaller pieces for each worker to process
  cuts <- cut(seq_len(nrow(i_chunk_df)), breaks = 5, labels = FALSE)
  cut_i <- sort(unique(cuts))
  # The foreach result is the last expression, so it is the return value
  foreach(
    i = seq_along(cut_i),
    .combine = rbind,
    .packages = c("dplyr", "WorldFlora")
  ) %dopar% {
    WFO.one(
      WFO.match.fuzzyjoin(
        spec.data = i_chunk_df[cuts == cut_i[i], ],
        WFO.data = WFO_data,
        spec.name = "ngram",
        fuzzydist.max = 3
      ),
      verbose = FALSE
    )
  }
}
```
We now break the tokenized n-gram data frame into smaller chunks. Each chunk is
fed into `tokenized_ngrams_WFO_parallel_fn`, which performs the
`WFO.match.fuzzyjoin` step using parallel processing. The output is
`tokenized_ngrams_WFO_parallel`.
```{r}
# Turn the ngrams data frame into an `iteror` object. This makes it easier to
# iterate through a data frame, rather than chunking manually. Chunk size can be
# changed to suit your needs by changing the argument `chunkSize`. It will
# likely require some testing to find the right size.
i_chunk_df <- iteror(tokenized_ngrams_subset, by = "row", chunkSize = 100)
# How many cores do you have available?
detectCores()
# Set the number of cores to use. Probably best to start with 1/2 of the cores
# available and scale up.
num_cores <- 5
# Create a cluster for parallel processing
cl <- makeCluster(num_cores)
registerDoParallel(cl)
# Process the n-gram chunks sequentially with %do%; the parallel work happens
# inside tokenized_ngrams_WFO_parallel_fn via %dopar%.
tokenized_ngrams_WFO_parallel <-
  foreach(
    i = i_chunk_df,
    .combine = rbind,
    .packages = c("dplyr", "WorldFlora")
  ) %do% {
    tokenized_ngrams_WFO_parallel_fn(i_chunk_df = i, WFO_data = WFO.data)
  }
# Stop the cluster when finished.
stopCluster(cl)
```
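Because the full run can take a while, it is worth writing the matched table to
disk once; a minimal sketch using `data.table::fwrite()` (loaded above); the
file name is illustrative:
```{r}
# Save the matched n-grams so the fuzzy join does not need to be rerun
fwrite(tokenized_ngrams_WFO_parallel, "tokenized_ngrams_WFO_parallel.csv")
```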
## Option 3:
### Fuzzy join using `fuzzyjoin`
```{r}
# Perform fuzzy join based on approximate string matching (fuzzyjoin package)
fuzzy_join_df <- stringdist_left_join(tokenized_ngrams, scientific_name_subset,
                                      by = c("ngram" = "scientificName"),
                                      max_dist = 5, method = "lv")
# Remove non-matches and keep only unique scientific names per abstract (dplyr)
filtered_df <- fuzzy_join_df %>%
  group_by(uid) %>%
  distinct(scientificName, .keep_all = TRUE) %>%
  filter(!is.na(scientificName))
```
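If only the single closest match per n-gram is wanted rather than every match
within `max_dist`, `fuzzyjoin` can report each match's string distance via the
`distance_col` argument; a minimal sketch:
```{r}
# Keep each n-gram's single closest scientific-name match
closest_match_df <- stringdist_left_join(tokenized_ngrams, scientific_name_subset,
                                         by = c("ngram" = "scientificName"),
                                         max_dist = 5, method = "lv",
                                         distance_col = "dist") %>%
  filter(!is.na(scientificName)) %>%
  group_by(uid, ngram) %>%
  slice_min(dist, n = 1, with_ties = FALSE) %>%
  ungroup()
```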