In [1]:
###################################################
## Post Process the DFCM corpus results - focus on the abstract text, and converting from string corpus to DTM
##
## Author: Chris Meaney
## Date: January 2023
###################################################

In [2]:
##########################
## Dependency packages 
##########################

## For sparse matrices
library(Matrix)
## For matching
library(fastmatch)

In [3]:
###########################
## Filepaths to import data objects from disk, and export processed data to disk
###########################
## Every path is built with file.path(), which joins its parts with a single "/"
## (valid on Windows as well), instead of pasting a hand-written "//" separator.

## Set working directory path (placeholder text: replace with a real directory before running)
wd_path <- "Enter_a_Path_to_a_Working_Directory_For_This_Project"

## Import stop words file from disk
fpath_stop <- file.path(wd_path, "stopwords-en.txt")

## Import core scopus dataframe (contains the abstract text)
scopus_fpath <- file.path(wd_path, "Core_Scopus_Dataset_Sm.csv")

## Filepath to write keywords to disk
fpath_keywords <- file.path(wd_path, "Table1_KeywordList.csv")

## Filepath to write title bigrams/trigrams to disk
fpath_title <- file.path(wd_path, "Table2_Title_BigramTrigramList.csv")

## Filepath to write abstract bigrams/trigrams to disk
fpath_abstract <- file.path(wd_path, "Table3_Abstract_BigramTrigramList.csv")

## Filepath to write combined keyword/bigram/trigram table to disk
fpath_combined <- file.path(wd_path, "Table123_CombinedTable.csv")

## Filepath to write emerging thematic queries to disk
fpath_novel_themes <- file.path(wd_path, "Table4_NovelThemes.csv")


In [4]:
####################
## Load the stop-word list
####################
## The file is read one entry per line; its first three lines are skipped.
stopw <- read.table(
	fpath_stop,
	header = FALSE, skip = 3, sep = "\n", quote = "", fill = FALSE,
	stringsAsFactors = FALSE, col.names = "stop_word"
)
## Remove punctuation from every entry (tokens are stripped of punctuation later on as well)
stopw[["stop_word"]] <- gsub("[[:punct:]]", "", stopw[["stop_word"]])
## Flatten the one-column data.frame into a (named) character vector
stopw <- unlist(stopw)
str(stopw)

 Named chr [1:1298] "ll" "tis" "twas" "ve" "10" "39" "a" "as" "able" ...
 - attr(*, "names")= chr [1:1298] "stop_word1" "stop_word2" "stop_word3" "stop_word4" ...


In [5]:
#####################
## Load the core Scopus dataset
#####################
## A header row and "," as separator are the read.csv() defaults; strings stay character
scopus_df <- read.csv(file = scopus_fpath, stringsAsFactors = FALSE)
str(scopus_df)

'data.frame':	18874 obs. of  25 variables:
 $ au_id         : num  1e+10 1e+10 1e+10 1e+10 1e+10 ...
 $ prism_url     : chr  "https://api.elsevier.com/content/abstract/scopus_id/85140976962" "https://api.elsevier.com/content/abstract/scopus_id/85121747496" "https://api.elsevier.com/content/abstract/scopus_id/85112107056" "https://api.elsevier.com/content/abstract/scopus_id/85118672412" ...
 $ eid           : chr  "2-s2.0-85140976962" "2-s2.0-85121747496" "2-s2.0-85112107056" "2-s2.0-85118672412" ...
 $ doi           : chr  "10.1016/j.socscimed.2022.115463" "10.1111/add.15760" "10.1016/j.socscimed.2021.114262" "10.3399/BJGP.2021.0195" ...
 $ issn          : chr  "02779536" "09652140" "02779536" "09601643" ...
 $ eissn         : chr  "18735347" "13600443" "18735347" "14785242" ...
 $ pub_title     : chr  "Brief opportunistic interventions by general practitioners to promote smoking cessation: A conversation analytic study" "The old and familiar meets the new and unknown: patient and clin

In [6]:
## Restrict the data to publications dated 2017 through 2022 (inclusive)
scopus_df <- scopus_df[is.element(scopus_df$pub_year, 2017:2022), ]
dim(scopus_df)

In [7]:
## Assign each institution to a region via a lookup table
## Institutions that are not listed in the lookup (including missing values) are labelled "Other"
scopus_df$au_institution_country <- local({
	region_of <- c(Toronto="Canada", UBC="Canada", McGill="Canada", Laval="Canada", Ottawa="Canada",
	               Dartmouth="USA", Michigan="USA", Duke="USA", OHSU="USA", UCSF="USA", Harvard="USA",
	               Oxford="UK", Cambridge="UK", UCL="UK")
	hit <- match(scopus_df$au_institution, names(region_of))
	region <- unname(region_of[hit])
	region[is.na(hit)] <- "Other"
	region
})

## Cross-tabulate institution by region to check the mapping
table(scopus_df$au_institution, scopus_df$au_institution_country)

           
            Canada Other   UK  USA
  Cambridge      0     0  560    0
  Dartmouth      0     0    0 1173
  Duke           0     0    0  380
  Harvard        0     0    0  485
  HKU            0   648    0    0
  Laval        785     0    0    0
  McGill      1025     0    0    0
  Michigan       0     0    0  793
  Monash         0   455    0    0
  OHSU           0     0    0  752
  Ottawa       774     0    0    0
  Oxford         0     0 5443    0
  Toronto     2484     0    0    0
  UBC          920     0    0    0
  UCL            0     0 1526    0
  UCSF           0     0    0  421
  UNSW           0   250    0    0

In [8]:
## Publication-level data.frame: drop rows whose title is missing and keep only
## the first row encountered for each distinct title
scopus_df_uniq <- scopus_df[!is.na(scopus_df$pub_title) & !duplicated(scopus_df$pub_title), ]

## Dimensions after vs. before de-duplication
list(dim(scopus_df_uniq), dim(scopus_df))

In [9]:
## Number of distinct institutions and of distinct author names in the unique-title dataset
list(
	sum(!duplicated(scopus_df_uniq$au_institution)),
	sum(!duplicated(scopus_df_uniq$au_name))
)

In [10]:
############################################################################
##
## Basic Corpus Summary Statistics, Linguistic Statistics and Topic Modelling
##
############################################################################

In [11]:
## Publications per year (missing years are tabulated too, via useNA="always")
## N.b. this is an "open cohort"...reflects growth in research institution faculty size over time
year_df <- as.data.frame(table(scopus_df_uniq$pub_year, useNA="always"))
year_df <- setNames(year_df, c("year","freq"))
## The tabulated years come back as a factor; go through character to recover the numeric year
year_df$year <- as.numeric(as.character(year_df$year))
year_df 

year,freq
<dbl>,<int>
2017.0,1872
2018.0,2002
2019.0,2012
2020.0,2141
2021.0,2404
2022.0,2616
,0


In [12]:
## Publication counts per journal, most frequent first (top 20 shown)
journal_df <- as.data.frame(table(scopus_df_uniq$pub_journal, useNA="always"))
colnames(journal_df) <- c("journal","freq")
journal_df <- journal_df[order(-journal_df$freq), ]
head(journal_df, n=20)

Unnamed: 0_level_0,journal,freq
Unnamed: 0_level_1,<fct>,<int>
339,BMJ Open,655
1875,PLoS ONE,321
382,British Journal of General Practice,165
321,BMC Public Health,142
293,BMC Health Services Research,126
1395,Journal of Medical Internet Research,113
557,Cochrane Database of Systematic Reviews,111
555,CMAJ open,107
1055,International Journal of Environmental Research and Public Health,106
1168,JMIR Research Protocols,101


In [13]:
nrow(journal_df)

In [14]:
## Number of publications whose keywords field is neither NA nor an empty string
sum(!is.na(scopus_df_uniq$pub_keywords) & scopus_df_uniq$pub_keywords != "")

In [15]:
## Number of publications whose title is neither NA nor an empty string
sum(!is.na(scopus_df_uniq$pub_title) & scopus_df_uniq$pub_title != "")

In [16]:
## Number of publications whose abstract is neither NA nor an empty string
sum(!is.na(scopus_df_uniq$pub_abstract) & scopus_df_uniq$pub_abstract != "")

In [17]:
######################
## Most frequent words in the abstract corpus
######################
## Abstracts are split on whitespace and lowercased; no other cleaning is applied here
unigrams <- as.data.frame(table(tolower(unlist(strsplit(scopus_df_uniq$pub_abstract, split="\\s+")))))
colnames(unigrams) <- c("token", "freq")
unigrams <- unigrams[order(-unigrams$freq), ]
head(unigrams, n=100)

Unnamed: 0_level_0,token,freq
Unnamed: 0_level_1,<fct>,<int>
73598,and,133930
152277,the,122064
125690,of,115422
153358,to,84795
108171,in,71082
70054,a,49162
159771,with,42280
100436,for,40234
159415,were,30626
158839,was,23578


In [18]:
nrow(unigrams)

In [19]:
#####################
## What about bigrams??
#####################
## Extract the filtered bigrams (adjacent token pairs) from one text string.
##
## The text is split on whitespace, lowercased, and every non-alphanumeric
## character is removed from each token. A pair is kept only when NEITHER token is
##   - a stop word (member of the global vector `stopw`),
##   - a single character, or
##   - free of any non-digit character (digits-only or empty after stripping).
##
## Args:
##   string: character string to tokenise (callers pass one text per call; NA is accepted).
## Returns:
##   Unnamed character vector of "tok1_tok2" strings, in text order;
##   character(0) when the text has fewer than two tokens or no pair survives.
bigram_func <- function(string) {
	toks <- gsub(pattern="[^[:alnum:]]", replacement="", x=tolower(unlist(strsplit(string, split="\\s+"))))
	n_toks <- length(toks)
	## Guard: with a single token, 2:length(toks) would count DOWN (2:1) and
	## pair the token with itself, emitting a spurious "tok_tok" bigram.
	if (n_toks < 2) {
		return(character(0))
	}
	tok1 <- toks[-n_toks]
	tok2 <- toks[-1]
	## TRUE for tokens that disqualify the pair they belong to
	is_excluded <- function(tok) {
		is.na(tok) | (tok %in% stopw) | (nchar(tok)==1) | (!grepl("\\D", tok))
	}
	keep <- !(is_excluded(tok1) | is_excluded(tok2))
	paste(tok1[keep], tok2[keep], sep="_")
}

In [20]:
##
## Abstract bigrams
##
## Tabulate the bigrams of every abstract and sort by descending frequency
bigrams <- as.data.frame(table(unlist(lapply(scopus_df_uniq$pub_abstract, bigram_func))), stringsAsFactors=FALSE)
colnames(bigrams) <- c("bigram", "freq")
bigrams <- bigrams[order(-bigrams$freq), ]
head(bigrams, n=100)

Unnamed: 0_level_0,bigram,freq
Unnamed: 0_level_1,<chr>,<int>
264150,primary_care,5168
145309,health_care,2344
204757,mental_health,1569
69621,confidence_interval,1360
248900,physical_activity,1181
58074,cohort_study,1168
264354,primary_outcome,1045
274195,public_health,985
336918,systematic_review,915
37046,blood_pressure,903


In [21]:
nrow(bigrams)

In [22]:
##
## Title bigrams
##
## Tabulate the bigrams of every title and sort by descending frequency
bigrams_title <- as.data.frame(table(unlist(lapply(scopus_df_uniq$pub_title, bigram_func))), stringsAsFactors=FALSE)
colnames(bigrams_title) <- c("bigram", "freq")
bigrams_title <- bigrams_title[order(-bigrams_title$freq), ]
head(bigrams_title, n=100)

Unnamed: 0_level_0,bigram,freq
Unnamed: 0_level_1,<chr>,<int>
26037,primary_care,1212
32461,systematic_review,838
5922,cohort_study,691
7121,controlled_trial,486
27127,qualitative_study,369
27368,randomised_controlled,308
13609,health_care,296
27401,randomized_controlled,255
7463,covid19_pandemic,215
20083,mixed_methods,204


In [23]:
nrow(bigrams_title)

In [24]:
##
## What about trigrams??
##
## Extract the filtered trigrams (three consecutive tokens) from one text string.
##
## Tokenisation and filtering mirror bigram_func(): split on whitespace,
## lowercase, strip non-alphanumeric characters, then keep a triple only when
## NONE of its tokens is a stop word (in the global `stopw`), a single
## character, or free of any non-digit character.
##
## Args:
##   string: character string to tokenise (callers pass one text per call; NA is accepted).
## Returns:
##   Unnamed character vector of "tok1_tok2_tok3" strings, in text order;
##   character(0) when the text has fewer than three tokens or no triple survives.
trigram_func <- function(string) {
	toks <- gsub(pattern="[^[:alnum:]]", replacement="", x=tolower(unlist(strsplit(string, split="\\s+"))))
	n_toks <- length(toks)
	## Guard: a trigram needs three tokens. With only two, 3:length(toks) would
	## count DOWN (3:2) and emit a spurious "t1_t2_t2" trigram.
	if (n_toks < 3) {
		return(character(0))
	}
	start <- seq_len(n_toks - 2)
	tok1 <- toks[start]
	tok2 <- toks[start + 1]
	tok3 <- toks[start + 2]
	## TRUE for tokens that disqualify the triple they belong to
	is_excluded <- function(tok) {
		is.na(tok) | (tok %in% stopw) | (nchar(tok)==1) | (!grepl("\\D", tok))
	}
	keep <- !(is_excluded(tok1) | is_excluded(tok2) | is_excluded(tok3))
	paste(tok1[keep], tok2[keep], tok3[keep], sep="_")
}

In [25]:
##
## Abstract trigrams
##
## Tabulate the trigrams of every abstract and sort by descending frequency
trigrams <- as.data.frame(table(unlist(lapply(scopus_df_uniq$pub_abstract, trigram_func))), stringsAsFactors=FALSE)
colnames(trigrams) <- c("trigram", "freq")
trigrams <- trigrams[order(-trigrams$freq), ]
head(trigrams, n=100)

Unnamed: 0_level_0,trigram,freq
Unnamed: 0_level_1,<chr>,<int>
232115,retrospective_cohort_study,356
219026,randomised_controlled_trial,352
219176,randomized_controlled_trial,303
207299,primary_care_practices,267
7535,adjusted_odds_ratio,266
153196,main_outcome_measures,253
207326,primary_care_providers,245
219178,randomized_controlled_trials,232
84291,electronic_health_record,211
207623,primary_health_care,199


In [26]:
nrow(trigrams)

In [27]:
##
## Title trigrams
##
## Tabulate the trigrams of every title and sort by descending frequency
trigrams_title <- as.data.frame(table(unlist(lapply(scopus_df_uniq$pub_title, trigram_func))), stringsAsFactors=FALSE)
colnames(trigrams_title) <- c("trigram", "freq")
trigrams_title <- trigrams_title[order(-trigrams_title$freq), ]
head(trigrams_title, n=100)

Unnamed: 0_level_0,trigram,freq
Unnamed: 0_level_1,<chr>,<int>
17307,randomised_controlled_trial,247
17353,randomized_controlled_trial,216
18141,retrospective_cohort_study,159
15522,populationbased_cohort_study,119
16772,prospective_cohort_study,80
12688,mixed_methods_study,75
16463,primary_health_care,64
17337,randomized_clinical_trial,59
6617,electronic_health_records,51
3579,chronic_obstructive_pulmonary,49


In [28]:
nrow(trigrams_title)

In [29]:
##
## Keywords of publications
##
table(is.na(scopus_df_uniq$pub_keywords))


FALSE  TRUE 
 9379  3668 

In [30]:
## Keywords are stored as one " | "-separated string per publication: split them,
## lowercase them, and tabulate how often each keyword occurs
scopus_list_keywords <- lapply(scopus_df_uniq$pub_keywords, function(kw) strsplit(kw, split=" \\| "))
scopus_df_keywords <- as.data.frame(table(tolower(unlist(scopus_list_keywords))), stringsAsFactors=FALSE)
colnames(scopus_df_keywords) <- c("keyword","freq")
## Share of all keyword occurrences, in percent
scopus_df_keywords$percent <- (scopus_df_keywords$freq / sum(scopus_df_keywords$freq)) * 100
scopus_df_keywords <- scopus_df_keywords[order(-scopus_df_keywords$freq), ]
head(scopus_df_keywords, n=100)

Unnamed: 0_level_0,keyword,freq,percent
Unnamed: 0_level_1,<chr>,<int>,<dbl>
12500,primary care,865,1.7223870
3459,covid-19,417,0.8303299
12548,primary health care,360,0.7168316
13018,qualitative research,335,0.6670516
15363,systematic review,278,0.5535533
12893,public health,266,0.5296589
6050,general practice,262,0.5216941
5101,epidemiology,224,0.4460286
3886,dementia,206,0.4101870
7118,hiv,206,0.4101870


In [31]:
nrow(scopus_df_keywords)

In [32]:
sum(scopus_df_keywords$freq)

In [33]:
## Write the 25 most frequent keywords to disk
scopus_df_keywords_sm <- head(scopus_df_keywords, 25)
## Renumber rows 1..n so the row names written to the CSV act as ranks
## (seq_len() stays correct for an empty table, where 1:nrow() would give c(1, 0))
rownames(scopus_df_keywords_sm) <- seq_len(nrow(scopus_df_keywords_sm))
write.csv(x=scopus_df_keywords_sm, file=fpath_keywords, row.names=TRUE)

In [34]:
##
## Table of title bigrams/trigrams
##
## Top 25 title bigrams placed side by side with the top 25 title trigrams
title_table <- cbind(head(bigrams_title, 25), head(trigrams_title, 25))
## Renumber rows 1..n (seq_len() stays correct for an empty table, unlike 1:nrow())
rownames(title_table) <- seq_len(nrow(title_table))
names(title_table) <- c("bigram","bigram_freq","trigram","trigram_freq")

write.csv(x=title_table, file=fpath_title, row.names=TRUE)

In [35]:
##
## Table of abstracts bigrams/trigrams
##
## Top 25 abstract bigrams placed side by side with the top 25 abstract trigrams
abstract_table <- cbind(head(bigrams, 25), head(trigrams, 25))
## Renumber rows 1..n (seq_len() stays correct for an empty table, unlike 1:nrow())
rownames(abstract_table) <- seq_len(nrow(abstract_table))
names(abstract_table) <- c("bigram","bigram_freq","trigram","trigram_freq")

write.csv(x=abstract_table, file=fpath_abstract, row.names=TRUE)

In [36]:
##
## Combine into single table
##
## Every cell reads "<term> (N=<frequency>)"; the underscores that join n-gram
## tokens are turned into spaces for display
rank <- 1:25
keyword <- paste0(scopus_df_keywords_sm$keyword, " (N=", scopus_df_keywords_sm$freq, ")")
title_bigram <- paste0(chartr("_", " ", title_table$bigram), " (N=", title_table$bigram_freq, ")")
title_trigram <- paste0(chartr("_", " ", title_table$trigram), " (N=", title_table$trigram_freq, ")")
abstract_bigram <- paste0(chartr("_", " ", abstract_table$bigram), " (N=", abstract_table$bigram_freq, ")")
abstract_trigram <- paste0(chartr("_", " ", abstract_table$trigram), " (N=", abstract_table$trigram_freq, ")")

## Columns take their names from the vectors above
combined_table <- data.frame(rank, keyword, title_bigram, title_trigram, abstract_bigram, abstract_trigram)

write.csv(x=combined_table, file=fpath_combined, row.names=FALSE)

In [37]:
##
## Table --- stakeholder identified research terms
##
## For each theme, report how often its n-gram occurs in the abstracts (unigram,
## bigram or trigram table) and how often the matching keyword occurs in the
## keyword table. A term that is absent from its table is reported with N=0;
## concatenating empty data.frame rows instead would leave zero-length cells
## that print as "character(0) (N=integer(0))".

## Frequency of `term` in a frequency table (column `freq`); 0 when the term is absent
theme_freq <- function(freq_table, term_col, term) {
	hits <- freq_table$freq[which(as.character(freq_table[[term_col]]) == term)]
	if (length(hits) == 0) 0L else as.integer(sum(hits))
}

## One row per theme: which n-gram table to search, the n-gram, and the keyword
theme_spec <- as.data.frame(matrix(c(
	"bigram",  "quality_improvement",        "quality improvement",
	"bigram",  "medical_education",          "medical education",
	"bigram",  "emergency_medicine",         "emergency medicine",
	"bigram",  "palliative_care",            "palliative care",
	"bigram",  "sports_medicine",            "sports medicine",
	"bigram",  "global_health",              "global health",
	"bigram",  "equity_diversity",           "equity",
	"bigram",  "indigenous_health",          "indigenous health",
	"bigram",  "social_determinants",        "social determinants of health",
	"bigram",  "clinical_informatics",       "clinical informatics",
	"bigram",  "medical_informatics",        "medical informatics",
	"trigram", "electronic_health_record",   "electronic health record",
	"trigram", "electronic_medical_record",  "electronic medical record",
	"bigram",  "artificial_intelligence",    "artificial intelligence",
	"bigram",  "machine_learning",           "machine learning",
	"bigram",  "deep_learning",              "deep learning",
	"bigram",  "causal_inference",           "causal inference",
	"bigram",  "care_continuity",            "continuity of patient care",
	"unigram", "multimorbidity",             "multimorbidity",
	"bigram",  "doctorpatient_relationship", "doctor-patient relationship",
	"bigram",  "teambased_care",             "team-based care",
	"bigram",  "population_health",          "population health"
), ncol=3, byrow=TRUE, dimnames=list(NULL, c("source","ngram","keyword"))), stringsAsFactors=FALSE)

## The n-gram frequency tables and the name of the term column in each
ngram_sources <- list(
	unigram = list(table=unigrams, col="token"),
	bigram  = list(table=bigrams,  col="bigram"),
	trigram = list(table=trigrams, col="trigram")
)

novel_themes <- data.frame(
	ngram = theme_spec$ngram,
	ngram_freq = vapply(seq_len(nrow(theme_spec)), function(k) {
		src <- ngram_sources[[theme_spec$source[k]]]
		theme_freq(src$table, src$col, theme_spec$ngram[k])
	}, integer(1)),
	keyword = theme_spec$keyword,
	keyword_freq = vapply(theme_spec$keyword, function(kw) {
		theme_freq(scopus_df_keywords, "keyword", kw)
	}, integer(1), USE.NAMES=FALSE),
	stringsAsFactors = FALSE
)

## Display strings of the form "<term> (N=<frequency>)"
ngram_novel_themes <- paste0(gsub(pattern="_", replacement=" ", x=novel_themes$ngram), " (N=", novel_themes$ngram_freq, ")")
keyword_novel_themes <- paste0(gsub(pattern="_", replacement=" ", x=novel_themes$keyword), " (N=", novel_themes$keyword_freq, ")")

novel_themes_df <- data.frame(item=seq_len(nrow(novel_themes)),
                                ngrams=ngram_novel_themes,
                                keyword=keyword_novel_themes)

write.csv(x=novel_themes_df, file=fpath_novel_themes, row.names=FALSE)

In [38]:
###############
## Primary Care Research Networks in Abstracts/Titles
###############
## For each network acronym, count the rows whose lowercased text contains it.
## NOTE(review): the counts are taken on scopus_df rather than the de-duplicated
## scopus_df_uniq, so a title that appears on several rows is counted once per
## row --- confirm this is intended.

## Titles
lapply(c("emrald", "emrpc", "utopian", "gprd", "cprd"),
       function(net) sum(grepl(pattern=net, x=tolower(scopus_df$pub_title))))

## Abstracts
lapply(c("emrald", "emrpc", "utopian", "gprd", "cprd"),
       function(net) sum(grepl(pattern=net, x=tolower(scopus_df$pub_abstract))))

In [39]:
##############################################################
##
## Construct sparse matrix and fit topic model to primary care research abstract corpus
##
##############################################################

In [40]:
## Keep only the publications that have an abstract (abstract is not NA)
scopus_df_uniq_sm <- scopus_df_uniq[is.na(scopus_df_uniq$pub_abstract)==FALSE, ]

## Dimensions after vs. before dropping missing abstracts
list(dim(scopus_df_uniq_sm), dim(scopus_df_uniq))

In [41]:
## Lowercase each abstract, then split it into tokens on runs of whitespace
## (each list element wraps the token vector of one abstract in a length-one list)
t0 <- Sys.time()
parsed_abstract <- lapply(tolower(scopus_df_uniq_sm$pub_abstract), function(txt) strsplit(txt, split="\\s+"))
t1 <- Sys.time()
difftime(t1, t0)

Time difference of 3.332326 secs

In [42]:
## Number of abstracts (documents)
length(parsed_abstract)

In [43]:
## Distribution of the number of characters per abstract
summary(nchar(scopus_df_uniq_sm$pub_abstract))

   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
     43    1547    1807    1864    2109   12352 

In [44]:
## Distribution of the number of whitespace-separated words per abstract
summary(vapply(parsed_abstract, function(doc) length(doc[[1]]), integer(1)))

   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
    7.0   221.0   256.0   267.2   301.0  1865.0 

In [45]:
## 1) unwrap each abstract's token vector and convert all tokens to lowercase
t0 <- Sys.time()
parsed_abstract1 <- lapply(lapply(parsed_abstract, "[[", 1), tolower)
t1 <- Sys.time()
difftime(t1, t0)

Time difference of 0.9982469 secs

In [46]:
## 2) delete every non-alphabetic character inside each token (a token made only of such characters becomes "")
t0 <- Sys.time()
parsed_abstract2 <- lapply(parsed_abstract1, gsub, pattern="[^[:alpha:]]", replacement="")
t1 <- Sys.time()
difftime(t1, t0)

Time difference of 2.927436 secs

In [47]:
## 3) discard the empty strings left behind by step (2)
t0 <- Sys.time()
parsed_abstract3 <- lapply(parsed_abstract2, function(doc) doc[nzchar(doc)])
t1 <- Sys.time()
difftime(t1, t0)

Time difference of 0.08911991 secs

In [48]:
## 4) discard tokens that are a single letter a-z (i.e. members of `letters`)
t0 <- Sys.time()
parsed_abstract4 <- lapply(parsed_abstract3, function(doc) doc[is.na(match(doc, letters))])
t1 <- Sys.time()
difftime(t1, t0)

Time difference of 0.8489602 secs

In [49]:
## Vocabulary of the corpus: all tokens, and the distinct tokens among them
## NOTE(review): this flattens parsed_abstract (the raw whitespace split), not the
## cleaned parsed_abstract4 built in steps 1-4 --- confirm which list was intended.
tokens_abstract <- unlist(parsed_abstract)
unique_tokens_abstract <- tokens_abstract[!duplicated(tokens_abstract)]
length(unique_tokens_abstract)

In [50]:
## Frequency of every (lowercased) token in the corpus, most frequent first
tokens_df_abstract <- as.data.frame(table(tolower(tokens_abstract)), stringsAsFactors=FALSE)
colnames(tokens_df_abstract) <- c("token","freq")
tokens_df_abstract <- tokens_df_abstract[order(-tokens_df_abstract$freq), ]
head(tokens_df_abstract, n=100)

Unnamed: 0_level_0,token,freq
Unnamed: 0_level_1,<chr>,<int>
73598,and,133930
152277,the,122064
125690,of,115422
153358,to,84795
108171,in,71082
70054,a,49162
159771,with,42280
100436,for,40234
159415,were,30626
158839,was,23578


In [51]:
##
## Drop stop words, short words, words only constituting numbers...
##
## A token is kept only when it is not in `stopw`, is not a single character,
## and contains at least one non-digit character. The three tests are combined
## into one vectorised mask per document (nchar() and grepl() are already
## vectorised, so no per-token sapply() is needed).
t0 <- Sys.time()
parsed_abstract5 <- lapply(parsed_abstract4, function(x) {
	x[!(x %in% stopw) & (nchar(x) != 1) & grepl("\\D", x)]
})
t1 <- Sys.time()
t1-t0

Time difference of 5.921428 secs

In [52]:
## Number of tokens remaining in each abstract after filtering
abstract_length <- lengths(parsed_abstract5)
summary(abstract_length)

   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
    4.0   112.0   134.0   137.6   158.0  1257.0 

In [53]:
data.frame(quantile(abstract_length, probs=seq(0,1,0.01)))

Unnamed: 0_level_0,quantile.abstract_length..probs...seq.0..1..0.01..
Unnamed: 0_level_1,<dbl>
0%,4.00
1%,38.00
2%,50.00
3%,58.00
4%,64.00
5%,70.00
6%,74.00
7%,78.00
8%,81.00
9%,84.00


In [54]:
## Prune the corpus: keep only abstracts with AT LEAST 50 and NO MORE than 1000 tokens
flag_abstract_len <- (abstract_length>=50) & (abstract_length<=1000)

parsed_abstract6 <- parsed_abstract5[flag_abstract_len]

## Number of abstracts before vs. after pruning
list(length(parsed_abstract5), length(parsed_abstract6))

In [55]:
length(parsed_abstract6)

In [56]:
table(flag_abstract_len)

flag_abstract_len
FALSE  TRUE 
  247 12276 

In [57]:
## Corpus summary statistics after pruning
## - all tokens, the distinct tokens, and the frequency of each token (most frequent first)
total_tokens <- unlist(parsed_abstract6) 
unique_tokens <- unique(total_tokens)
tokens_df <- as.data.frame(table(total_tokens))
colnames(tokens_df) <- c("token","freq")
tokens_df <- tokens_df[order(-tokens_df$freq), ]
head(tokens_df, n=100)

Unnamed: 0_level_0,token,freq
Unnamed: 0_level_1,<fct>,<int>
5701,care,19886
18260,health,17621
31930,patients,16417
42394,study,13527
10202,data,10470
25926,methods,8962
38398,risk,8947
34905,primary,8362
31665,participants,8082
42373,studies,7236


In [58]:
##########################################
## Construct sparse document term matrix
##########################################
## One data.frame per document: each distinct token and its count in that document.
## Columns are named explicitly with setNames(); as.data.frame() for tables has no
## `colnames` argument, so passing one has no effect on the column names.
t0 <- Sys.time()
parsed_abstract7 <- lapply(parsed_abstract6, function(x) {
	setNames(as.data.frame(table(x), stringsAsFactors=FALSE), c("token","freq"))
})
t1 <- Sys.time()
t1 - t0

Time difference of 8.389565 secs

In [59]:
## Approx. 1.8mins
## Stack the per-document token-count data.frames (parsed_abstract7) into one
## long data.frame, keeping document order, then name its two columns.
parsed_abstract8 <- data.frame(do.call("rbind", parsed_abstract7))
names(parsed_abstract8) <- c("token","freq")
dim(parsed_abstract8)

In [60]:
## Define final dictionary tokens and integer indices
## (the dictionary is the sorted set of distinct tokens; index k refers to the k-th token)
tokens <- sort(unique(parsed_abstract8$token))
## seq_along() stays correct for an empty vocabulary, where 1:length() would give c(1, 0)
tokens_idx <- seq_along(tokens)
## Keep tokens as character explicitly, independent of the session's stringsAsFactors default
tokens_dict <- data.frame(tokens=tokens, tokens_idx=tokens_idx, stringsAsFactors=FALSE)

## Convert/map tokens (character sequences) to tokens (integer indices)
parsed_abstract8$token_idx <- fmatch(x=parsed_abstract8$token, table=tokens_dict$tokens)

In [61]:
## Create {i,j,x} indices to populate sparse document-term matrix (DTM)
##    i - row/doc index
##    j - col/token index
##    x - count: number of times token j occurs in document i

## i - row/doc index
## Each document contributes one entry per distinct token, so document k is
## repeated nrow(parsed_abstract7[[k]]) times. vapply() guarantees an integer
## vector and seq_along() stays correct when there are no documents.
len_note <- vapply(parsed_abstract7, nrow, integer(1))
i <- rep(seq_along(len_note), times=len_note)

## j - col/token index
j <- parsed_abstract8$token_idx

## x - count
x <- parsed_abstract8$freq

## Concatenate into data.frame
ijx_df <- data.frame(i=i, j=j, x=x)
str(ijx_df)

'data.frame':	1101775 obs. of  3 variables:
 $ i: int  1 1 1 1 1 1 1 1 1 1 ...
 $ j: int  277 523 533 912 917 1463 1745 1845 2421 2972 ...
 $ x: int  1 1 1 2 1 1 1 1 1 1 ...


In [62]:
###########################
## Build the sparse document-term matrix
###########################
## Rows are documents, columns are dictionary tokens, cells are token counts
row_dim <- length(len_note)
col_dim <- nrow(tokens_dict)

## i, j are 1-based indices (index1=TRUE is the sparseMatrix() default)
sp_dtm <- sparseMatrix(i=i, j=j, x=x, dims=c(row_dim, col_dim))
class(sp_dtm)

In [63]:
dim(sp_dtm)

In [64]:
## Apply row/col names
## Columns are labelled with the dictionary tokens. Rows are labelled with the
## document's filtered tokens joined by spaces, built from parsed_abstract6 (the
## token vectors). Pasting the per-document data.frames in parsed_abstract7
## instead would deparse their columns into "c(...)" strings.
colnames(sp_dtm) <- tokens_dict$tokens
rownames(sp_dtm) <- vapply(parsed_abstract6, paste, character(1), collapse=" ")

In [65]:
## Sparsity: share of (document, token) cells that are zero
## nnzero() counts the non-zero cells; sum(sp_dtm) would instead add up the
## token counts and therefore understate sparsity whenever a count exceeds 1.
1 - nnzero(sp_dtm)/(prod(dim(sp_dtm)))

In [66]:
## Summary rowSums
summary(rowSums(sp_dtm))

   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
   50.0   114.0   134.0   139.4   158.0   852.0 

In [67]:
## Summary colSums
summary(colSums(sp_dtm))

    Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
    1.00     1.00     2.00    35.05     8.00 19886.00 

In [68]:
data.frame(head(sort(colSums(sp_dtm), decreasing=TRUE), n=100))

Unnamed: 0_level_0,head.sort.colSums.sp_dtm...decreasing...TRUE...n...100.
Unnamed: 0_level_1,<dbl>
care,19886
health,17621
patients,16417
study,13527
data,10470
methods,8962
risk,8947
primary,8362
participants,8082
studies,7236


In [69]:
## Create dense version of original sparse matrix
# dtm <- as.matrix(sp_dtm)
# dim(dtm)
## Number of observations per row/note
# table(rowSums(dtm))
## Number of observations per col/tokens
# table(colSums(dtm))

In [70]:
##################################################
## Covariate data (X) for the documents retained in the DTM
##################################################
## Keep the rows of scopus_df_uniq_sm whose abstract passed the length filter

X <- scopus_df_uniq_sm[flag_abstract_len==TRUE, ]
## Consistency check: the row count of X should agree with the document count of the DTM
## NOTE(review): dim(X) is listed twice --- possibly one entry was meant to be a different object; confirm.
list(dim(X), dim(X), length(unique(ijx_df$i)), dim(sp_dtm))

In [71]:
############################################
## Save data to disk
############################################
## All outputs are written under wd_path. Paths are built with file.path(),
## which joins its parts with a single "/" separator.

## Save ijx coordinate format sparse matrix to disk
fpath_ijx_csv <- file.path(wd_path, "ijx_df.csv")
write.csv(x=ijx_df, file=fpath_ijx_csv, row.names=FALSE)

## Save sparse matrix to disk
fpath_sp_dtm <- file.path(wd_path, "sp_dtm.RDS")
saveRDS(object=sp_dtm, file=fpath_sp_dtm)

## Save X feature/covariate matrix to disk
fpath_xsmall_csv <- file.path(wd_path, "X.csv")
write.csv(x=X, file=fpath_xsmall_csv, row.names=FALSE)

## Final vocab to disk
vocab <- data.frame(colnames(sp_dtm), stringsAsFactors=FALSE)
names(vocab) <- "vocab"

fpath_vocab_csv <- file.path(wd_path, "vocab.csv")
write.csv(x=vocab, file=fpath_vocab_csv, row.names=FALSE)

## The subtly parsed text in the documents --- write to disk to use with Gensim topic coherence measures
## One space-separated string per document; vapply() guarantees a character vector
fpath_texts <- file.path(wd_path, "parsed_texts.csv")
write.csv(x=vapply(parsed_abstract6, paste0, character(1), collapse=" "), file=fpath_texts, row.names=FALSE)

In [72]:
#########################
## Session Information
#########################

In [73]:
Sys.Date()

In [74]:
sessionInfo()

R version 4.1.3 (2022-03-10)
Platform: x86_64-w64-mingw32/x64 (64-bit)
Running under: Windows 10 x64 (build 17763)

Matrix products: default

locale:
[1] LC_COLLATE=English_United States.1252 
[2] LC_CTYPE=English_United States.1252   
[3] LC_MONETARY=English_United States.1252
[4] LC_NUMERIC=C                          
[5] LC_TIME=English_United States.1252    

attached base packages:
[1] stats     graphics  grDevices utils     datasets  methods   base     

other attached packages:
[1] fastmatch_1.1-3 Matrix_1.4-1   

loaded via a namespace (and not attached):
 [1] lattice_0.20-45 fansi_1.0.3     utf8_1.2.2      digest_0.6.29  
 [5] crayon_1.5.1    IRdisplay_1.1   grid_4.1.3      repr_1.1.4     
 [9] lifecycle_1.0.1 jsonlite_1.8.0  evaluate_0.15   pillar_1.7.0   
[13] rlang_1.0.2     cli_3.3.0       uuid_1.1-0      vctrs_0.4.1    
[17] ellipsis_0.3.2  IRkernel_1.3    tools_4.1.3     glue_1.6.2     
[21] fastmap_1.1.0   compiler_4.1.3  base64enc_0.1-3 pbdZMQ_0.3-7   
[25] htmltools_0

In [75]:
version

               _                           
platform       x86_64-w64-mingw32          
arch           x86_64                      
os             mingw32                     
system         x86_64, mingw32             
status                                     
major          4                           
minor          1.3                         
year           2022                        
month          03                          
day            10                          
svn rev        81868                       
language       R                           
version.string R version 4.1.3 (2022-03-10)
nickname       One Push-Up                 