-
Notifications
You must be signed in to change notification settings - Fork 1
/
wordCount.R
executable file
·54 lines (48 loc) · 1.76 KB
/
wordCount.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
library(pdftools)
library(stringr)
######################
## FUNCTIONS
######################
openPDF <- function(folderName) {
  # Extracts the text of every PDF file located directly inside a folder.
  #
  # Args:
  #   folderName: path (string) of the folder where the PDF files are located.
  #     A relative path is resolved against the current working directory,
  #     so set the working directory first (or pass a full path built with
  #     file.path()).
  #
  # Returns:
  #   A list with one element per matching file, each element being the
  #   value returned by pdf_text() for that file. The list is empty when the
  #   folder holds no PDF files. Input for the wordCount function.
  #
  # Error handling:
  #   Warnings raised while extracting text are suppressed: 'foreign' docx
  #   bullets can yield a 'Badly formatted number' warning and specific font
  #   weights can yield 'invalid font weight'. Neither warning affects the
  #   word count.
  #   A missing folder is reported with a warning, because list.files()
  #   would otherwise silently return nothing and the caller would report a
  #   total of zero words.
  if (!all(dir.exists(folderName))) {
    warning("Folder not found: ", paste(folderName, collapse = ", "),
            call. = FALSE)
  }
  # `pattern` is a regular expression, not a shell glob. "*.pdf" is not
  # anchored and its "." matches any character, so it can also select names
  # such as "notapdf.txt". Match the literal ".pdf" extension at the end of
  # the name instead.
  filenames <- list.files(folderName, pattern = "\\.pdf$", full.names = TRUE)
  suppressWarnings(lapply(filenames, pdf_text))
}
wordCount <- function(files, totWords = 0) {
  # Computes the total number of words in a collection of extracted texts.
  #
  # Args:
  #   files: list of character vectors, as output by the openPDF function
  #     (one list element per file).
  #   totWords: running total to add the count to. Defaults to 0, which is
  #     the value to use when starting a fresh count.
  #
  # Returns:
  #   totWords plus the number of words found in all elements of files.
  #   An empty list leaves totWords unchanged.
  #
  # A word is a run of characters delimited by whitespace once punctuation
  # has been deleted, so "don't" counts as one word and a stand-alone dash
  # counts as none.

  # Flatten to a single character vector; every string is counted
  # independently, so the grouping by file does not matter for the total.
  pages <- as.character(unlist(files, use.names = FALSE))

  # Delete punctuation with base R. The previous call to removePunctuation()
  # relied on a package this file never loads.
  pages <- gsub("[[:punct:]]+", "", pages)

  # Collapse every whitespace run (spaces, tabs, newlines, form feeds) into
  # one space, then drop the leading/trailing space so that splitting on a
  # single space yields no empty tokens.
  pages <- trimws(gsub("\\s+", " ", pages))

  # strsplit() returns character(0) for an empty string, so blank pages
  # contribute zero words.
  totWords + sum(lengths(strsplit(pages, " ", fixed = TRUE)))
}
######################
## Main Execution
######################
# Condition 1: PDFs stored in the "noInteractionFiles" folder.
# Extract their text, then print the combined word total.
noInterFiles <- openPDF(folderName = "noInteractionFiles")
print(
  paste0(
    "No interaction total words (pdf): ",
    wordCount(files = noInterFiles, totWords = 0)
  )
)

# Condition 2: PDFs stored in the "interactionFiles" folder.
# Same procedure, with the count restarted from zero.
interFiles <- openPDF(folderName = "interactionFiles")
print(
  paste0(
    "Interaction total words (pdf): ",
    wordCount(files = interFiles, totWords = 0)
  )
)