-
Notifications
You must be signed in to change notification settings - Fork 0
/
scraping.R
79 lines (68 loc) · 1.73 KB
/
scraping.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
# title: "BIOPHYTMOL scrapeR"
# loading paths
source("paths.R")
library(data.table)
library(dplyr)
library(rvest) # provides read_html
library(tidyr) # provides pivot_wider
library(xml2)
# Look up the "biophytmol" entry in `databases`; its `sourceFiles$tsv` path
# and `writeFile()` method are used for the export at the end of this script.
# NOTE(review): `databases` is not defined in this script; presumably it is
# provided by paths.R - confirm.
database <- databases$get("biophytmol")
# Search URL prefix; a compound id and "&type=compound_id" are appended to it
# for every request.
url <-
  "http://crdd.osdd.net/servers/biophytmol/search-biophytmol.php?compound_id="

# Compound ids to request, as a consecutive integer range.
X <- seq(from = 1001L, to = 4154L)
#' Scrape the BIOPHYTMOL record table for one compound id
#'
#' Builds the search URL from the global `url` prefix and the given id,
#' downloads the page and parses the third `table` child of `<body>`.
#'
#' @param X A single compound id; it is appended verbatim to the URL prefix.
#' @return The table parsed by `rvest::html_table()`, or the string
#'   `"Timed out!"` when the page could not be fetched or parsed.
getbiophyt <- function(X) {
  tryCatch(
    {
      # paste0() joins without separators, so there is no whitespace to strip
      # from the assembled URL afterwards.
      url_id <- paste0(url, X, "&type=compound_id")
      rvest::read_html(x = url_id) |>
        rvest::html_element("body") |>
        xml2::xml_child("table[3]") |>
        rvest::html_table(fill = TRUE)
    },
    error = function(e) {
      # One failing request must not abort the whole scrape: report it and
      # return the sentinel value that the script filters out afterwards.
      message("BIOPHYTMOL id ", X, " failed: ", conditionMessage(e))
      "Timed out!"
    }
  )
}
# Request every compound id in turn; the result is a list with one element
# per id, in the same order as X.
BIOPHYTMOL <- lapply(X = X, FUN = getbiophyt)

# Keep only the elements that are parsed tables. Anything else (such as a
# "Timed out!" placeholder) is dropped. Testing the type directly avoids
# comparing whole data frames against a string, which would deparse every
# table to text first.
BIOPHYTMOL_2 <- Filter(is.data.frame, BIOPHYTMOL)
# Stack the per-compound tables. `column_label` holds the identifier that
# bind_rows() assigns to each table of BIOPHYTMOL_2, so it distinguishes the
# rows belonging to one compound page.
BIOPHYTMOL_3 <- dplyr::bind_rows(BIOPHYTMOL_2, .id = "column_label")

# Turn the key/value rows (X1 = field name, X2 = field value) into one row
# per compound page and keep the curated, renamed columns.
#
# No group_by() is used: pivot_wider() already treats `column_label` as the
# id column, and a grouping left in place would make the final select()
# silently re-add `column_label` to the exported table.
BIOPHYTMOL_4 <- BIOPHYTMOL_3 |>
  dplyr::select(column_label, X1, X2) |>
  dplyr::filter(!is.na(X2)) |>
  tidyr::pivot_wider(
    names_from = X1,
    values_from = X2
  ) |>
  # records without a structure are dropped
  dplyr::filter(!is.na(SMILES)) |>
  dplyr::select(
    uniqueid = `Compound ID`,
    name = `Active Compound Identified`,
    biologicalsource = `Plant Source`,
    biologicalpart = `Plant Part Used`,
    extract = Extract,
    pubchem = `PubChem ID`,
    smiles = SMILES,
    pubmed = `PubMed ID [Source Literature]`,
    reference = `Reference(s)`
  ) |>
  # fold the PubMed id into the reference field, separated by "§"
  dplyr::mutate(reference = paste(pubmed, reference, sep = "§")) |>
  dplyr::select(-pubmed)
# Export: make sure the target location exists, then write the cleaned table
# to the database's TSV source file.
tsv_path <- database$sourceFiles$tsv
create_dir(export = tsv_path)
database$writeFile(tsv_path, BIOPHYTMOL_4)