In [1]:
###############################################
## Additional Descriptive Statistics Generated on Bibliometrics/Altmetrics Indicators for Primary Care Research Institutions
##
## Author: Christopher Meaney
## Date: January 2023
##############################################

In [2]:
####################
## Filepaths to import data
####################

## Set working directory path (placeholder; replace with the project folder)
wd_path <- "Enter_a_Path_to_a_Working_Directory_For_This_Project"

## Path to the core scopus dataframe
scopus_fpath <- file.path(wd_path, "Core_Scopus_Dataset_Sm.csv")

## Path to the Altmetrics dataframe (social impact indicators).
## Assigned on its own: a chained `a <- b <- value` here would also overwrite
## scopus_fpath, and the Scopus import would then read the Altmetrics file.
altmetrics_fpath <- file.path(wd_path, "Altmetrics_AuthImpact_Data.csv")

In [3]:
#####################
## Load the core Scopus dataset
#####################
## read.csv() already defaults to header = TRUE and sep = ","; text columns are
## kept as plain character vectors rather than factors.
scopus_df <- read.csv(file = scopus_fpath, stringsAsFactors = FALSE)
str(scopus_df)

'data.frame':	18874 obs. of  25 variables:
 $ au_id         : num  1e+10 1e+10 1e+10 1e+10 1e+10 ...
 $ prism_url     : chr  "https://api.elsevier.com/content/abstract/scopus_id/85140976962" "https://api.elsevier.com/content/abstract/scopus_id/85121747496" "https://api.elsevier.com/content/abstract/scopus_id/85112107056" "https://api.elsevier.com/content/abstract/scopus_id/85118672412" ...
 $ eid           : chr  "2-s2.0-85140976962" "2-s2.0-85121747496" "2-s2.0-85112107056" "2-s2.0-85118672412" ...
 $ doi           : chr  "10.1016/j.socscimed.2022.115463" "10.1111/add.15760" "10.1016/j.socscimed.2021.114262" "10.3399/BJGP.2021.0195" ...
 $ issn          : chr  "02779536" "09652140" "02779536" "09601643" ...
 $ eissn         : chr  "18735347" "13600443" "18735347" "14785242" ...
 $ pub_title     : chr  "Brief opportunistic interventions by general practitioners to promote smoking cessation: A conversation analytic study" "The old and familiar meets the new and unknown: patient and clin

In [4]:
## Restrict the data to publications from the 2017-2022 window
keep_years <- 2017:2022
in_window <- scopus_df$pub_year %in% keep_years
scopus_df <- scopus_df[in_window, ]
dim(scopus_df)

In [5]:
## Row counts of scopus_df per publication year
table(scopus_df[["pub_year"]])


2017 2018 2019 2020 2021 2022 
2591 2811 2823 3075 3617 3957 

In [6]:
## Number of distinct author IDs (first occurrences only)
sum(!duplicated(scopus_df$au_id))

In [8]:
####################
## Build a reduced data.frame with one row per unique publication title
####################
## Keep a row only when its title is present and has not appeared on an
## earlier row; original row order is preserved.
title_present <- !is.na(scopus_df$pub_title)
title_first_seen <- !duplicated(scopus_df$pub_title)
scopus_df_uniq <- scopus_df[title_present & title_first_seen, ]

## Dimensions of the de-duplicated table vs. the full table
list(
    dim(scopus_df_uniq),
    dim(scopus_df)
)

In [9]:
## Distinct institutions, author names and author IDs in the full dataset.
## This should be the count of AUTHORS/RESEARCHERS publishing an article...
lapply(
    c("au_institution", "au_name", "au_id"),
    function(v) length(unique(scopus_df[[v]]))
)

In [10]:
## The same counts on the de-duplicated table. These should NOT be read as the
## number of unique authors/researchers, because they decrease once duplicate
## titles are removed: people who co-author articles with others in the cohort
## are excluded unnecessarily.
lapply(
    c("au_institution", "au_name", "au_id"),
    function(v) length(unique(scopus_df_uniq[[v]]))
)

In [11]:
#####################
##
## Load the Altmetrics data
##
#####################
## read.csv() already defaults to header = TRUE and sep = ","; text columns are
## kept as plain character vectors rather than factors.
altmetrics_df <- read.csv(file = altmetrics_fpath, stringsAsFactors = FALSE)
str(altmetrics_df)

'data.frame':	13093 obs. of  17 variables:
 $ doi     : chr  "10.1016/j.socscimed.2022.115463" "10.1111/add.15760" "10.1016/j.socscimed.2021.114262" "10.3399/BJGP.2021.0195" ...
 $ title   : chr  "Brief opportunistic interventions by general practitioners to promote smoking cessation: A conversation analytic study" "The old and familiar meets the new and unknown: patient and clinician perceptions on e-cigarettes for smoking r"| __truncated__ "Patients’ use of the internet to negotiate about treatment" "Factors affecting the documentation of spoken safety-netting advice in routine GP consultations: a cross-sectional study" ...
 $ authors : chr  "H. Wheat; R.K. Barnes; P. Aveyard; F. Stevenson; R. Begh" "Charlotte Albury; Rebecca Barnes; Anne Ferrey; Tim Coleman; Hazel Gilbert; Felix Naughton; Paul Aveyard; Rachna Begh" "Fiona A. Stevenson; Geraldine Leydon-Hudson; Elizabeth Murray; Maureen Seguin; Rebecca Barnes" "Peter J Edwards; Ian Bennett-Britton; Matthew J Ridd; Matthew Booker; Reb

In [12]:
## Reduce the altmetrics data to the DOI key plus the indicator columns
keep_vars <- c(
    "doi", "posts", "tweets", "msm", "fbwalls", "feeds", "wiki", "linkedin",
    "peer", "policy", "rh", "gplus", "accounts", "mendeley", "score"
)
altmetrics_df_sm <- altmetrics_df[keep_vars]

In [13]:
## Rows and columns of the unique-publication table before merging
c(nrow(scopus_df_uniq), ncol(scopus_df_uniq))

In [14]:
####################################
## Merge scopus data with altmetrics data
#################################
## Build a lookup holding at most one altmetrics row per usable DOI.
## merge() returns every matching pair of rows, so a DOI repeated in the
## altmetrics data would duplicate publications in the "unique" table, and
## NA / empty DOIs would pair unrelated records with one another.
## When a DOI is repeated, the first altmetrics record is the one kept.
alt_doi <- altmetrics_df_sm$doi
usable_doi <- !is.na(alt_doi) & nzchar(alt_doi) & !duplicated(alt_doi)
altmetrics_lookup <- altmetrics_df_sm[usable_doi, ]

## Left join: every publication is kept; those without an altmetrics record
## get NA in the indicator columns.
n_pubs_before <- nrow(scopus_df_uniq)
scopus_df_uniq <- merge(x=scopus_df_uniq, y=altmetrics_lookup, by="doi", all.x=TRUE)

## A left join against a unique key must not change the number of publications
stopifnot(nrow(scopus_df_uniq) == n_pubs_before)

list(
    dim(altmetrics_df),
    dim(scopus_df_uniq)
)

In [15]:
## DOI overlap between the merged Scopus table and the altmetrics data:
## table dimensions, distinct DOIs in the merged table, and DOIs found in both
merged_dois <- scopus_df_uniq$doi
list(
    dim(scopus_df_uniq),
    length(unique(merged_dois)),
    length(intersect(merged_dois, altmetrics_df$doi))
)

In [30]:
##############################
## Clean the Scopus DataFrame; imputing zeroes for missing altmetric data
##############################

##
## Summarize missing data
##
## Count NAs column by column. vapply() walks the data.frame's columns directly
## (apply() would first coerce the whole table to a character matrix) and
## guarantees exactly one integer per column.
data.frame(missing_counts=vapply(scopus_df_uniq, function(x) sum(is.na(x)), integer(1)))

##
## Impute zero for missing data
##
## Altmetric indicators plus the citation count: an NA in any of these columns
## is replaced by 0.
zero_fill_vars <- c(
    "posts", "tweets", "msm", "fbwalls", "feeds", "wiki", "linkedin", "peer",
    "policy", "rh", "gplus", "accounts", "mendeley", "score", "cited_count"
)

## Fail early with a clear message if an expected column is absent
stopifnot(all(zero_fill_vars %in% names(scopus_df_uniq)))

for (v in zero_fill_vars) {
    v_values <- scopus_df_uniq[[v]]
    v_values[is.na(v_values)] <- 0
    scopus_df_uniq[[v]] <- v_values
}

Unnamed: 0_level_0,missing_counts
Unnamed: 0_level_1,<int>
doi,0
au_id,0
prism_url,0
eid,0
issn,4434
eissn,806
pub_title,0
pub_author,0
pub_journal,0
pub_date,0


In [32]:
##
##    posts total number of posts
##    delicious number of Delicious users
##    fbwalls number of Facebook accounts
##    feeds number of blogs
##    forum number of internet forums users e.g. Seqanswers
##    gplus number of Google+ users
##    linkedin number of LinkedIn users
##    msm number of news outlets
##    peer_review_sites number of peer review sites
##    pinners number of Pinterest users
##    policies number of policy sources
##    qs number of questions, answers or comments on Stack Exchange sites (inc. Biostar)
##    rdts number of Reddit users
##    rh number of research highlight platforms
##    tweeters number of Twitter users
##    videos number of YouTube channels
##    weibo number of Sina Weibo users
##    wikipedia number of pages on Wikipedia
##

## apply(scopus_df_uniq[,c("posts","tweets","msm","fbwalls","feeds","wiki","linkedin","peer","policy","rh","gplus","accounts","mendeley","score")], 2, function(x) quantile(x, c(0, 0.25, 0.50, 0.75, 0.80, 0.85, 0.90, 0.91, 0.92, 0.93, 0.94, 0.95, 0.96, 0.97, 0.98, 0.99, 1.00)))

In [18]:
#############################
##
##
## Overall, basic descriptive stats
##
##
##############################

In [33]:
## Number of authors with at least one article. Computed on the overall
## dataset (scopus_df), not on the unique-publication dataset.
sum(!duplicated(scopus_df$au_id))

In [34]:
## Number of distinct institutions in the overall dataset
sum(!duplicated(scopus_df$au_institution))

In [19]:
## Number of publications (rows) and columns in the unique-publication table
c(nrow(scopus_df_uniq), ncol(scopus_df_uniq))

In [22]:
## Unique publications per publication year
table(scopus_df_uniq[["pub_year"]])


2017 2018 2019 2020 2021 2022 
1872 2002 2012 2141 2404 2616 

In [31]:
## Total citation count across the unique publications
sum(scopus_df_uniq[["cited_count"]])

In [35]:
## Totals of selected altmetric indicators, in this order:
## media (msm), tweets, wiki, policy
lapply(
    c("msm", "tweets", "wiki", "policy"),
    function(v) sum(scopus_df_uniq[[v]])
)

In [24]:
## Ten journals with the most unique publications
journal_counts <- table(scopus_df_uniq$pub_journal)
journal_counts_ranked <- sort(journal_counts, decreasing = TRUE)
head(journal_counts_ranked, n = 10)


                                                         BMJ Open 
                                                              655 
                                                         PLoS ONE 
                                                              321 
                              British Journal of General Practice 
                                                              165 
                                                BMC Public Health 
                                                              142 
                                     BMC Health Services Research 
                                                              126 
                             Journal of Medical Internet Research 
                                                              113 
                          Cochrane Database of Systematic Reviews 
                                                              111 
                                                        CMAJ 

In [36]:
## Number of distinct journals among the unique publications
sum(!duplicated(scopus_df_uniq$pub_journal))

In [25]:
#####################################################
##
## Investigate Metrics and other descriptive-stats by Institution
##
#####################################################

In [26]:
################
## Pubs by year/institution
################
## Cross-tabulate publication year (rows) against institution (columns)
pubs_by_year_inst <- unclass(table(scopus_df_uniq$pub_year, scopus_df_uniq$au_institution))
pubs_by_year_inst_df <- data.frame(pubs_by_year_inst)
## data.frame() may rewrite the labels into syntactic names; restore the
## institution labels exactly as tabulated
names(pubs_by_year_inst_df) <- colnames(pubs_by_year_inst)

## Order institutions from most to fewest publications over all years.
## TRUE is spelled out (T is an ordinary variable that can be reassigned) and
## drop = FALSE keeps a data.frame even when only one institution is present.
inst_order <- order(colSums(pubs_by_year_inst_df), decreasing = TRUE)
pubs_by_year_inst_df <- pubs_by_year_inst_df[, inst_order, drop = FALSE]
pubs_by_year_inst_df

#fpath_num_pubs <- "D://CopyZ_30Nov2020//DFCM_Docs//DFCM_Researchers//PeterSelby//MineScopus_DFCM_National_International_PrimaryCareResearchPrograms//Output_DescStats//Table1_NumPubs.csv"
#write.csv(x=pubs_by_year_inst_df, file=fpath_num_pubs)

Unnamed: 0_level_0,Oxford,Toronto,Dartmouth,UCL,UBC,McGill,Michigan,Ottawa,Laval,Cambridge,Harvard,OHSU,Duke,UCSF,HKU,Monash,UNSW
Unnamed: 0_level_1,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>
2017,391,282,174,141,110,112,94,74,61,60,85,68,57,40,56,39,28
2018,484,258,166,142,131,120,88,89,82,80,83,75,60,43,35,36,30
2019,458,282,133,154,121,122,110,86,80,66,72,73,62,67,43,49,34
2020,527,296,145,147,116,131,97,85,91,80,63,78,64,60,66,54,41
2021,574,386,174,171,146,125,110,84,117,96,63,61,64,64,54,73,42
2022,611,370,203,166,158,151,124,127,102,91,82,89,64,77,92,67,42


In [None]:
##############################
## Session Information
##############################

In [None]:
## Current date at the time this cell is run
Sys.Date()

In [None]:
## R version, platform and attached/loaded packages for this session
sessionInfo()

In [None]:
## Detailed information about the running R build
version