In [1]:
###############################################
## Import the core Scopus Dataset (previously mined) --- use ScopusID as unique key to grab additional author meta-data (e.g. h-index, num-cites, etc.)
##
## Author: Christopher Meaney
## Date: January 2023
##############################################

In [2]:
########################
## Dependency packages
########################
library(rscopus)

In [3]:
#########################
## Set Scopus API key
#########################
## Replace the placeholder string with a personal Scopus API key before running.
## The value returned by set_api_key() is stored in `api_key`; get_au_metrics()
## later passes that object to author_retrieval() as its `api_key` argument.
## NOTE(review): confirm what set_api_key() returns in the installed rscopus version.
api_key <- set_api_key("Enter_Your_API_Key_Here")

In [4]:
##########################
## Filepaths to where data should be imported to, or exported from
##########################

## Set working directory path
wd_path <- "Enter_a_Path_to_a_Working_Directory_For_This_Project"

## Every project file sits directly under wd_path. file.path() joins the
## directory and file name with a single separator, replacing the hand-written
## "//" concatenation.

## Filepath of the core Scopus dataset (previously queried from the Scopus API);
## it is imported below and its meta-data is used to curate the larger
## scientometric database
scopus_fpath <- file.path(wd_path, "Core_Scopus_Dataset_Sm.csv")

## Filepath to import the IDs file
ids_meta_fpath <- file.path(wd_path, "Final_Combined_ScopusIdsFile.csv")

## Filepath to output the queried metrics dataframe
metrics_fpath <- file.path(wd_path, "Scopus_Auth_Metrics.csv")

In [5]:
#####################
## Import core Scopus Dataset
#####################
## Read the core Scopus dataset from disk (comma-separated, with a header row,
## character columns kept as character) and show its structure
scopus_data <- read.csv(
    file = scopus_fpath,
    header = TRUE,
    sep = ",",
    stringsAsFactors = FALSE
)
str(object = scopus_data)

'data.frame':	18874 obs. of  25 variables:
 $ au_id         : num  1e+10 1e+10 1e+10 1e+10 1e+10 ...
 $ prism_url     : chr  "https://api.elsevier.com/content/abstract/scopus_id/85140976962" "https://api.elsevier.com/content/abstract/scopus_id/85121747496" "https://api.elsevier.com/content/abstract/scopus_id/85112107056" "https://api.elsevier.com/content/abstract/scopus_id/85118672412" ...
 $ eid           : chr  "2-s2.0-85140976962" "2-s2.0-85121747496" "2-s2.0-85112107056" "2-s2.0-85118672412" ...
 $ doi           : chr  "10.1016/j.socscimed.2022.115463" "10.1111/add.15760" "10.1016/j.socscimed.2021.114262" "10.3399/BJGP.2021.0195" ...
 $ issn          : chr  "02779536" "09652140" "02779536" "09601643" ...
 $ eissn         : chr  "18735347" "13600443" "18735347" "14785242" ...
 $ pub_title     : chr  "Brief opportunistic interventions by general practitioners to promote smoking cessation: A conversation analytic study" "The old and familiar meets the new and unknown: patient and clin

In [6]:
## Reduce the core dataset to one row per distinct Scopus author ID (the first
## occurrence of each ID is kept), retaining only the author name and the ID
X <- setNames(
    scopus_data[!duplicated(scopus_data$id), c("nam", "id")],
    c("AuthorName", "ScopusID")
)
str(object = X)

'data.frame':	591 obs. of  2 variables:
 $ AuthorName: chr  "RebeccaBarnes" "ChristineTCigolle" "ClaireKendall" "MinaRuthSilberberg" ...
 $ ScopusID  : num  1.00e+10 1.02e+10 1.28e+10 1.28e+10 1.44e+10 ...


In [7]:
## Sanity check: the number of distinct IDs in the full dataset should equal
## the number of distinct IDs in the reduced dataset
lapply(
    list(scopus_data$id, X$ScopusID),
    function(ids) length(unique(ids))
)

In [8]:
########################
## Import original ScopusIDs dataset; containing some meta-information on authors (e.g. author webpage; where we scraped their affiliation)
########################
## Read the author IDs file (comma-separated, with a header row, character
## columns kept as character) and show its structure
meta <- read.csv(
    file = ids_meta_fpath,
    header = TRUE,
    sep = ",",
    stringsAsFactors = FALSE
)
str(object = meta)

'data.frame':	658 obs. of  6 variables:
 $ FirstName  : chr  "Jonathan" "Stephen" "Robbie" "Christi" ...
 $ LastName   : chr  "Mant" "Sutton" "Duschinsky" "Deaton" ...
 $ ScopusID   : num  2.66e+10 3.52e+10 3.70e+10 5.72e+10 7.01e+09 ...
 $ Country    : chr  "UK" "UK" "UK" "UK" ...
 $ Institution: chr  "Cambridge" "Cambridge" "Cambridge" "Cambridge" ...
 $ AuthorName : chr  "JonathanMant" "StephenSutton" "RobbieDuschinsky" "ChristiDeaton" ...


In [9]:
## Keep just the ID, institution and country columns, renamed so that the ID
## column matches the `id` key used when merging with the metrics data
meta_sm <- setNames(
    meta[, c("ScopusID", "Institution", "Country")],
    c("id", "scrape_inst", "country_inst")
)
str(object = meta_sm)

'data.frame':	658 obs. of  3 variables:
 $ id          : num  2.66e+10 3.52e+10 3.70e+10 5.72e+10 7.01e+09 ...
 $ scrape_inst : chr  "Cambridge" "Cambridge" "Cambridge" "Cambridge" ...
 $ country_inst: chr  "UK" "UK" "UK" "UK" ...


In [10]:
###################################
##
## User defined function to extract author publication meta-data from Scopus API
##
###################################

In [11]:
###############################################
##
##
## Query Scopus Author Meta-Data (h-index, num-cites, num-coauth, etc.)
##
##
###############################################

# vu <- c("LIGHT", "STANDARD", "ENHANCED", "METRICS", "ENTITLED")
# vu <- c("LIGHT")
# vu <- c("STANDARD")
# vu <- c("ENHANCED")
# vu <- c("METRICS")
# vu <- c("ENTITLED")

##
## Get Count Number Documents
##
get_doc_count <- function(dat) {
    ## Extract the "document-count" entry from the "coredata" element of the
    ## first author-retrieval-response record in `dat`.
    ## Returns the stored value; NULL when `dat` is a list and a name along the
    ## path is absent; NA when the lookup raises an error or a warning.
    tryCatch(
        dat[["content"]][["author-retrieval-response"]][[1]][["coredata"]][["document-count"]],
        error = function(cond) NA,
        warning = function(cond) NA
    )
}


##
## Get Count Number Citations
##
get_cited_count <- function(dat) {
    ## Extract the "cited-by-count" entry from the "coredata" element of the
    ## first author-retrieval-response record in `dat`.
    ## Returns the stored value; NULL when `dat` is a list and a name along the
    ## path is absent; NA when the lookup raises an error or a warning.
    na_fallback <- function(cond) NA
    tryCatch(
        {
            core <- dat[["content"]][["author-retrieval-response"]][[1]][["coredata"]]
            core[["cited-by-count"]]
        },
        error = na_fallback,
        warning = na_fallback
    )
}


##
## Get Citation Counts
##
get_citation_count <- function(dat) {
    ## Extract the "citation-count" entry from the "coredata" element of the
    ## first author-retrieval-response record in `dat`.
    ## Returns the stored value; NULL when `dat` is a list and a name along the
    ## path is absent; NA when the lookup raises an error or a warning.
    tryCatch(
        dat[["content"]][["author-retrieval-response"]][[1]][["coredata"]][["citation-count"]],
        error = function(cond) NA,
        warning = function(cond) NA
    )
}


##
## Get H-index
##
get_h_index <- function(dat) {
    ## Extract the "h-index" entry from the first author-retrieval-response
    ## record in `dat`.
    ## Returns the stored value; NULL when `dat` is a list and a name along the
    ## path is absent; NA when the lookup raises an error or a warning.
    na_fallback <- function(cond) NA
    tryCatch(
        {
            record <- dat[["content"]][["author-retrieval-response"]][[1]]
            record[["h-index"]]
        },
        error = na_fallback,
        warning = na_fallback
    )
}


##
## Get Co-Author Count
##
get_coauthor_count <- function(dat) {
    ## Extract the "coauthor-count" entry from the first
    ## author-retrieval-response record in `dat`.
    ## Returns the stored value; NULL when `dat` is a list and a name along the
    ## path is absent; NA when the lookup raises an error or a warning.
    tryCatch(
        dat[["content"]][["author-retrieval-response"]][[1]][["coauthor-count"]],
        error = function(cond) NA,
        warning = function(cond) NA
    )
}


##
## Get Most Recent PubYear
##
get_pub_last <- function(dat) {
    ## Extract the "@end" entry of "publication-range" from the
    ## "author-profile" of the first author-retrieval-response record in `dat`.
    ## Returns the stored value; NULL when `dat` is a list and a name along the
    ## path is absent; NA when the lookup raises an error or a warning.
    na_fallback <- function(cond) NA
    tryCatch(
        {
            profile <- dat[["content"]][["author-retrieval-response"]][[1]][["author-profile"]]
            profile[["publication-range"]][["@end"]]
        },
        error = na_fallback,
        warning = na_fallback
    )
}


##
## Get First Publication Date
##
get_pub_first <- function(dat) {
    ## Extract the "@start" entry of "publication-range" from the
    ## "author-profile" of the first author-retrieval-response record in `dat`.
    ## Returns the stored value; NULL when `dat` is a list and a name along the
    ## path is absent; NA when the lookup raises an error or a warning.
    tryCatch(
        dat[["content"]][["author-retrieval-response"]][[1]][["author-profile"]][["publication-range"]][["@start"]],
        error = function(cond) NA,
        warning = function(cond) NA
    )
}


##
## Get Affiliation Dept
##
get_affil_dept <- function(dat) {
    ## Extract the second element of "preferred-name" from the current
    ## affiliation's "ip-doc" entry in the first author-retrieval-response
    ## record in `dat`.
    ## Returns that element; NULL when `dat` is a list and a name along the
    ## path is absent; NA when the lookup raises an error or a warning (for
    ## example when "preferred-name" has fewer than two elements).
    na_fallback <- function(cond) NA
    tryCatch(
        {
            ip_doc <- dat[["content"]][["author-retrieval-response"]][[1]][["author-profile"]][["affiliation-current"]][["affiliation"]][["ip-doc"]]
            ip_doc[["preferred-name"]][[2]]
        },
        error = na_fallback,
        warning = na_fallback
    )
}


##
## Get Affiliation Institution
##
get_affil_inst <- function(dat) {
    ## Extract the second element of "parent-preferred-name" from the current
    ## affiliation's "ip-doc" entry in the first author-retrieval-response
    ## record in `dat`.
    ## Returns that element; NULL when `dat` is a list and a name along the
    ## path is absent; NA when the lookup raises an error or a warning (for
    ## example when "parent-preferred-name" has fewer than two elements).
    tryCatch(
        dat[["content"]][["author-retrieval-response"]][[1]][["author-profile"]][["affiliation-current"]][["affiliation"]][["ip-doc"]][["parent-preferred-name"]][[2]],
        error = function(cond) NA,
        warning = function(cond) NA
    )
}


##
## Get Affiliation City
##
get_affil_city <- function(dat) {
    ## Extract the "city" entry of the current affiliation's address from the
    ## first author-retrieval-response record in `dat`.
    ## Returns the stored value; NULL when `dat` is a list and a name along the
    ## path is absent; NA when the lookup raises an error or a warning.
    na_fallback <- function(cond) NA
    tryCatch(
        {
            ip_doc <- dat[["content"]][["author-retrieval-response"]][[1]][["author-profile"]][["affiliation-current"]][["affiliation"]][["ip-doc"]]
            ip_doc[["address"]][["city"]]
        },
        error = na_fallback,
        warning = na_fallback
    )
}


##
## Get Affiliation Country
##
get_affil_country <- function(dat) {
    ## Extract the "country" entry of the current affiliation's address from
    ## the first author-retrieval-response record in `dat`.
    ## Returns the stored value; NULL when `dat` is a list and a name along the
    ## path is absent; NA when the lookup raises an error or a warning.
    tryCatch(
        dat[["content"]][["author-retrieval-response"]][[1]][["author-profile"]][["affiliation-current"]][["affiliation"]][["ip-doc"]][["address"]][["country"]],
        error = function(cond) NA,
        warning = function(cond) NA
    )
}


###########
##
## Create single function to grab author meta-data
##
###########
get_au_metrics <- function(id) {
    ## Retrieve author metrics and current-affiliation details for one Scopus
    ## author ID.
    ##
    ## Args:
    ##   id: Scopus author ID; passed to author_retrieval() as `au_id`.
    ##
    ## Returns:
    ##   A one-row data.frame with columns au_id, doc_count, cited_count,
    ##   citation_count, h_index, coauthor_count, pub_last, pub_first, affil1,
    ##   affil2, affil_city and affil_country. A field that is missing from the
    ##   API response is NA. If the retrieval or the assembly of the row signals
    ##   an error or a warning, every column except au_id is NA and a message
    ##   naming the ID and the condition is emitted, so failed lookups are
    ##   visible in the log instead of being silently turned into NA rows.
    ##
    ## Uses `api_key` from the enclosing environment and the get_*() extractor
    ## functions defined in this file.

    field_names <- c("doc_count", "cited_count", "citation_count", "h_index",
                     "coauthor_count", "pub_last", "pub_first", "affil1",
                     "affil2", "affil_city", "affil_country")

    ## Reduce an extracted field to a single atomic value. NULL, zero-length
    ## and non-atomic values (e.g. an unexpected nested list) become NA so that
    ## data.frame() always receives exactly one cell per column.
    scalar_or_na <- function(x) {
        if (is.atomic(x) && length(x) > 0L) x[[1L]] else NA
    }

    ## Shared handler for errors and warnings: report the condition, then
    ## return the all-NA row for this ID.
    na_row <- function(cond) {
        message("Scopus author lookup failed for ID ",
                format(id, scientific = FALSE), ": ", conditionMessage(cond))
        blanks <- setNames(rep(list(NA), length(field_names)), field_names)
        do.call(data.frame, c(list(au_id = id), blanks))
    }

    tryCatch(
        {
            ## Two API calls per author: one per response view
            dat_metrics <- author_retrieval(au_id = id, api_key = api_key, verbose = FALSE, view = "METRICS")
            dat_std <- author_retrieval(au_id = id, api_key = api_key, verbose = FALSE, view = "STANDARD")

            ## Count-type fields are read from the METRICS response, publication
            ## range and affiliation fields from the STANDARD response
            fields <- list(
                doc_count = get_doc_count(dat = dat_metrics),
                cited_count = get_cited_count(dat = dat_metrics),
                citation_count = get_citation_count(dat = dat_metrics),
                h_index = get_h_index(dat = dat_metrics),
                coauthor_count = get_coauthor_count(dat = dat_metrics),
                pub_last = get_pub_last(dat = dat_std),
                pub_first = get_pub_first(dat = dat_std),
                affil1 = get_affil_dept(dat = dat_std),
                affil2 = get_affil_inst(dat = dat_std),
                affil_city = get_affil_city(dat = dat_std),
                affil_country = get_affil_country(dat = dat_std)
            )

            do.call(data.frame, c(list(au_id = id), lapply(fields, scalar_or_na)))
        },
        error = na_row,
        warning = na_row
    )
}


In [12]:
################################################
##
## Loop over list of Scopus IDs to grab associated author meta-data
##
################################################

## Pre-allocate one slot per author instead of growing the list inside the loop
au_metrics_list <- vector("list", nrow(X))

##
## Loop over list grabbing data
##
t0 <- Sys.time()

## seq_len() yields an empty sequence when X has no rows, whereas 1:nrow(X)
## would iterate over c(1, 0)
for (i in seq_len(nrow(X))) {
    ## Grab Name and ID
    nam <- X$AuthorName[i]
    id <- X$ScopusID[i]
    cat(i, "\tQuerying Scopus ID: ", id, "\t Author Name:", nam, "\n")
    ## Query the API and tag the one-row result with the author's name and ID
    dat_sm <- get_au_metrics(id=id)
    dat_sm$nam <- nam
    dat_sm$id <- id
    au_metrics_list[[i]] <- dat_sm
    ## Pause for a random 0-3 seconds between authors so as not to flood the API
    Sys.sleep(runif(n=1, min=0, max=3))
}

## Total elapsed time for the full query run
t1 <- Sys.time()
t1-t0

1 	Querying Scopus ID:  10041373600 	 Author Name: RebeccaBarnes 
2 	Querying Scopus ID:  10240446500 	 Author Name: ChristineTCigolle 
3 	Querying Scopus ID:  12791296200 	 Author Name: ClaireKendall 
4 	Querying Scopus ID:  12804958400 	 Author Name: MinaRuthSilberberg 
5 	Querying Scopus ID:  14421205500 	 Author Name: BertrandLebouche 
6 	Querying Scopus ID:  14826143300 	 Author Name: ClaudeTopping 
7 	Querying Scopus ID:  15020768600 	 Author Name: CarlHeneghan 
8 	Querying Scopus ID:  15037395300 	 Author Name: MargaretSmith 
9 	Querying Scopus ID:  15051137000 	 Author Name: JeffMyers 
10 	Querying Scopus ID:  15519829000 	 Author Name: DeannaTelner 
11 	Querying Scopus ID:  15730009300 	 Author Name: MichelleNaimer 
12 	Querying Scopus ID:  15759696700 	 Author Name: DanielleHessler 
13 	Querying Scopus ID:  15922637500 	 Author Name: JohnDavidHeintzman 
14 	Querying Scopus ID:  15923365500 	 Author Name: JohnPMuench 
15 	Querying Scopus ID:  16024709400 	 Author Name: NickFah

Time difference of 18.83876 mins

In [13]:
## 
## Post Process the Data
##
## Stack the per-author one-row data frames into a single data frame
au_metrics_df <- do.call(what = "rbind", args = au_metrics_list)
str(object = au_metrics_df)

'data.frame':	591 obs. of  14 variables:
 $ au_id         : num  1.00e+10 1.02e+10 1.28e+10 1.28e+10 1.44e+10 ...
 $ doc_count     : chr  "63" "42" "117" "37" ...
 $ cited_count   : chr  "1213" "1603" "1004" "313" ...
 $ citation_count: chr  "1403" "1722" "1159" "318" ...
 $ h_index       : chr  "17" "18" "20" "10" ...
 $ coauthor_count: chr  "179" "134" "458" "127" ...
 $ pub_last      : chr  "2022" "2022" "2022" "2022" ...
 $ pub_first     : chr  "2004" "2005" "2006" "1994" ...
 $ affil1        : chr  "University of Oxford Medical Sciences Division" NA NA NA ...
 $ affil2        : chr  NA NA NA NA ...
 $ affil_city    : chr  "Oxford" NA NA NA ...
 $ affil_country : chr  "United Kingdom" NA NA NA ...
 $ nam           : chr  "RebeccaBarnes" "ChristineTCigolle" "ClaireKendall" "MinaRuthSilberberg" ...
 $ id            : num  1.00e+10 1.02e+10 1.28e+10 1.28e+10 1.44e+10 ...


In [14]:
##
## Merge with meta information on Institution
##
## Left-join the scraped institution/country onto the metrics by `id`, keeping
## every metrics row, then show the dimensions of the result and of the lookup
au_metrics_df <- merge(
    x = au_metrics_df,
    y = meta_sm,
    by = "id",
    all.x = TRUE
)

lapply(list(au_metrics_df, meta_sm), dim)

In [15]:
str(au_metrics_df)

'data.frame':	591 obs. of  16 variables:
 $ id            : num  6.50e+09 6.51e+09 6.51e+09 6.51e+09 6.51e+09 ...
 $ au_id         : num  6.50e+09 6.51e+09 6.51e+09 6.51e+09 6.51e+09 ...
 $ doc_count     : chr  "35" "123" "142" "66" ...
 $ cited_count   : chr  "506" "2556" "1375" "1653" ...
 $ citation_count: chr  "693" "3016" "1818" "1809" ...
 $ h_index       : chr  "16" "29" "24" "18" ...
 $ coauthor_count: chr  "352" "160" "488" "215" ...
 $ pub_last      : chr  "2023" "2023" "2022" "2022" ...
 $ pub_first     : chr  "1996" "2003" "2002" "1998" ...
 $ affil1        : chr  NA "Oregon Health & Science University" NA "Oregon Health & Science University" ...
 $ affil2        : chr  NA NA NA NA ...
 $ affil_city    : chr  NA "Portland" NA "Portland" ...
 $ affil_country : chr  NA "United States" NA "United States" ...
 $ nam           : chr  "DeborahKopanskyGiles" "NathalieHuguet" "AishaLofters" "LyleJFagnan" ...
 $ scrape_inst   : chr  "Toronto" "OHSU" "Toronto" "OHSU" ...
 $ country_i

In [16]:
##################################
##
## Write to disk
##
##################################
write.csv(x=au_metrics_df, file=metrics_fpath, row.names=FALSE)

In [17]:
##########################################
##
## Postprocess the metrics dataframe --- Note: these are LIFETIME metrics; not those over 6yr study window
##
##########################################
## Convert the five count-type columns (stored as character) to numeric,
## one column at a time and in place
au_metrics_df <- local({
    converted <- au_metrics_df
    for (metric in c("doc_count", "cited_count", "citation_count", "h_index", "coauthor_count")) {
        converted[[metric]] <- as.numeric(converted[[metric]])
    }
    converted
})

In [18]:
## 1) Ten authors with the largest document counts, in descending order
head(
    au_metrics_df[order(-au_metrics_df$doc_count), c("nam", "doc_count")],
    n = 10
)

Unnamed: 0_level_0,nam,doc_count
Unnamed: 0_level_1,<chr>,<dbl>
390,JeffreyKAronson,658
268,TrishGreenhalgh,651
55,SimondeLusignan,609
100,RichardHobbs,594
286,JohnABaron,573
251,SteveIliffe,554
118,TheresaMarteau,528
72,MargaretRitaKaragas,497
143,MaryTStory,472
315,MarkHarris,469


In [19]:
## 2) Ten authors with the largest cited-by counts, in descending order
head(
    au_metrics_df[order(-au_metrics_df$cited_count), c("nam", "cited_count")],
    n = 10
)

Unnamed: 0_level_0,nam,cited_count
Unnamed: 0_level_1,<chr>,<dbl>
100,RichardHobbs,52745
114,GoyaWannamethee,43549
286,JohnABaron,42533
520,ChristiDeaton,36462
268,TrishGreenhalgh,31701
143,MaryTStory,30175
125,SusanJebb,24756
122,AnnaNATosteson,22703
513,RafaelPerera,20799
379,IrwinNazareth,19868


In [20]:
## 3) Ten authors with the largest h-index, in descending order
head(
    au_metrics_df[order(-au_metrics_df$h_index), c("nam", "h_index")],
    n = 10
)

Unnamed: 0_level_0,nam,h_index
Unnamed: 0_level_1,<chr>,<dbl>
286,JohnABaron,121
143,MaryTStory,115
114,GoyaWannamethee,93
268,TrishGreenhalgh,92
72,MargaretRitaKaragas,86
100,RichardHobbs,84
159,JamesNWeinstein,82
125,SusanJebb,81
187,KarimKhan,81
122,AnnaNATosteson,80


In [21]:
## 4) Ten authors with the largest co-author counts, in descending order
head(
    au_metrics_df[order(-au_metrics_df$coauthor_count), c("nam", "coauthor_count")],
    n = 10
)

Unnamed: 0_level_0,nam,coauthor_count
Unnamed: 0_level_1,<chr>,<dbl>
549,JohnWilliams,24763
493,JulianSherlock,9728
408,RobertPetrella,7303
559,DanielPlotkin,3589
100,RichardHobbs,3187
292,AndrewFarmer,3104
208,BrockCChristensen,2881
168,CarolineClarke,2701
114,GoyaWannamethee,2653
305,PatrickArchambault,2538


In [22]:
##
## Aggregate by curated/scraped institutions --- again, note these are LIFETIME statistics (not those over study timeframe)
##

In [23]:
## Document counts by institution: N, sum, mean, SD and a grid of quantiles
## per scraped institution, sorted by descending N
doc_count_df <- aggregate(
    doc_count ~ scrape_inst,
    data = au_metrics_df,
    FUN = function(v) {
        c(
            length(v),
            sum(v, na.rm = TRUE),
            mean(v, na.rm = TRUE),
            sd(v, na.rm = TRUE),
            quantile(x = v, na.rm = TRUE,
                     probs = c(0, 0.01, 0.025, 0.05, 0.10, 0.25, 0.50, 0.75, 0.90, 0.95, 0.975, 0.99, 1.00))
        )
    }
)
## aggregate() stores the statistics as one matrix-valued column; spread it
## into ordinary columns next to the institution name
doc_count_df <- data.frame(cbind(doc_count_df[, 1]), cbind(doc_count_df[, 2]))
names(doc_count_df) <- c("Institution", "N", "Sum", "Mean", "SD",
                         "Q0", "Q1", "Q2_5", "Q5", "Q10", "Q25", "Q50", "Q75",
                         "Q90", "Q95", "Q97_5", "Q99", "Q100")
doc_count_df <- doc_count_df[order(-doc_count_df$N), ]
doc_count_df

Unnamed: 0_level_0,Institution,N,Sum,Mean,SD,Q0,Q1,Q2_5,Q5,Q10,Q25,Q50,Q75,Q90,Q95,Q97_5,Q99,Q100
Unnamed: 0_level_1,<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
12,Oxford,163,13303,81.6135,123.87974,3,3.0,4.05,5.1,7.0,16.5,35.0,84.0,209.6,371.6,430.2,624.96,658
13,Toronto,72,7171,99.59722,98.16739,4,4.71,5.0,5.55,12.1,30.75,75.0,130.75,245.4,308.8,383.4,403.45,407
15,UCL,51,4298,84.27451,106.05151,2,3.5,5.25,6.0,7.0,15.0,43.0,127.0,207.0,269.5,343.0,452.5,554
2,Dartmouth,47,6497,138.23404,133.39631,2,4.3,7.6,13.1,20.2,33.0,107.0,173.5,310.0,434.0,491.15,538.04,573
14,UBC,35,3515,100.42857,87.68704,7,8.7,11.25,12.7,14.4,32.5,69.0,141.0,228.8,269.0,301.1,338.84,364
7,McGill,31,2980,96.12903,82.54281,4,5.2,7.0,10.0,13.0,25.5,85.0,128.5,224.0,230.0,262.25,309.5,341
6,Laval,30,1694,56.46667,81.32724,2,2.0,2.0,2.45,3.9,8.0,33.5,72.0,122.5,136.35,222.375,346.35,429
8,Michigan,28,2303,82.25,58.6418,7,7.54,8.35,11.1,19.9,45.75,71.0,91.5,173.0,199.2,215.325,231.93,243
4,Harvard,22,1808,82.18182,84.47013,3,4.26,6.15,9.45,18.1,21.25,38.0,129.0,171.0,187.2,262.1,311.24,344
10,OHSU,19,1874,98.63158,76.83837,8,8.36,8.9,9.8,13.2,45.5,83.0,133.0,195.8,244.3,268.15,282.46,292


In [24]:
## Cited-by counts by institution: N, sum, mean, SD and a grid of quantiles
## per scraped institution, sorted by descending N
num_cited_df <- aggregate(
    cited_count ~ scrape_inst,
    data = au_metrics_df,
    FUN = function(v) {
        c(
            length(v),
            sum(v, na.rm = TRUE),
            mean(v, na.rm = TRUE),
            sd(v, na.rm = TRUE),
            quantile(x = v, na.rm = TRUE,
                     probs = c(0, 0.01, 0.025, 0.05, 0.10, 0.25, 0.50, 0.75, 0.90, 0.95, 0.975, 0.99, 1.00))
        )
    }
)
## aggregate() stores the statistics as one matrix-valued column; spread it
## into ordinary columns next to the institution name
num_cited_df <- data.frame(cbind(num_cited_df[, 1]), cbind(num_cited_df[, 2]))
names(num_cited_df) <- c("Institution", "N", "Sum", "Mean", "SD",
                         "Q0", "Q1", "Q2_5", "Q5", "Q10", "Q25", "Q50", "Q75",
                         "Q90", "Q95", "Q97_5", "Q99", "Q100")
num_cited_df <- num_cited_df[order(-num_cited_df$N), ]
num_cited_df

Unnamed: 0_level_0,Institution,N,Sum,Mean,SD,Q0,Q1,Q2_5,Q5,Q10,Q25,Q50,Q75,Q90,Q95,Q97_5,Q99,Q100
Unnamed: 0_level_1,<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
12,Oxford,163,530536,3254.8221,6332.5311,0,4.86,14.05,21.4,49.4,226.5,899.0,3320.0,10377.2,14199.2,16970.0,27395.1,52745
13,Toronto,72,162712,2259.8889,2770.8224,24,26.13,44.825,76.1,118.2,380.75,1141.5,3000.25,6656.5,7842.35,10281.925,10574.11,11113
15,UCL,51,177464,3479.6863,7176.4876,3,4.5,7.75,25.5,66.0,219.5,692.0,4659.5,9844.0,13909.0,19297.75,31708.5,43549
2,Dartmouth,47,320441,6817.8936,8131.5027,7,44.72,98.9,193.1,297.0,581.0,4933.0,9528.0,16749.6,19113.6,22262.15,33411.2,42533
14,UBC,35,93499,2671.4,3721.776,80,81.36,83.4,109.9,194.6,675.0,1022.0,2831.5,6785.8,10731.6,14080.8,14943.72,15519
7,McGill,31,88583,2857.5161,3659.9458,25,41.5,66.25,130.5,225.0,603.5,1584.0,3515.5,5672.0,10094.0,14320.75,15196.9,15781
6,Laval,30,36712,1223.7333,2527.1738,2,2.29,2.725,3.45,5.8,168.0,386.0,1619.75,2315.0,3121.6,6169.325,10668.53,13668
8,Michigan,28,72568,2591.7143,2895.0929,31,35.59,42.475,61.3,127.3,718.0,1675.5,3178.5,6011.4,8677.45,10018.825,10821.13,11356
4,Harvard,22,46291,2104.1364,2574.2405,36,52.38,76.95,119.85,236.3,423.0,911.0,3268.75,4260.8,4368.0,7763.4,10013.76,11514
10,OHSU,19,37414,1969.1579,2201.6161,12,14.7,18.75,25.5,109.4,626.0,1604.0,2687.5,3778.4,6041.1,7468.05,8324.22,8895


In [25]:
## h-index by institution: N, sum, mean, SD and a grid of quantiles per
## scraped institution, sorted by descending N
h_index_df <- aggregate(
    h_index ~ scrape_inst,
    data = au_metrics_df,
    FUN = function(v) {
        c(
            length(v),
            sum(v, na.rm = TRUE),
            mean(v, na.rm = TRUE),
            sd(v, na.rm = TRUE),
            quantile(x = v, na.rm = TRUE,
                     probs = c(0, 0.01, 0.025, 0.05, 0.10, 0.25, 0.50, 0.75, 0.90, 0.95, 0.975, 0.99, 1.00))
        )
    }
)
## aggregate() stores the statistics as one matrix-valued column; spread it
## into ordinary columns next to the institution name
h_index_df <- data.frame(cbind(h_index_df[, 1]), cbind(h_index_df[, 2]))
names(h_index_df) <- c("Institution", "N", "Sum", "Mean", "SD",
                       "Q0", "Q1", "Q2_5", "Q5", "Q10", "Q25", "Q50", "Q75",
                       "Q90", "Q95", "Q97_5", "Q99", "Q100")
h_index_df <- h_index_df[order(-h_index_df$N), ]
h_index_df

Unnamed: 0_level_0,Institution,N,Sum,Mean,SD,Q0,Q1,Q2_5,Q5,Q10,Q25,Q50,Q75,Q90,Q95,Q97_5,Q99,Q100
Unnamed: 0_level_1,<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
12,Oxford,163,3183,19.52761,18.592881,0,1.0,1.05,2.1,4.0,7.0,13.0,22.5,54.4,60.8,66.0,82.14,92
13,Toronto,72,1506,20.91667,14.0088,2,2.71,3.0,3.55,5.0,10.0,17.5,29.0,39.9,48.25,53.45,56.74,61
15,UCL,51,1108,21.72549,20.436319,1,1.0,1.25,2.5,4.0,6.5,13.0,38.0,52.0,60.5,64.0,78.5,93
2,Dartmouth,47,1769,37.6383,27.457022,1,1.92,3.6,7.0,8.6,12.0,39.0,54.0,75.8,81.4,85.4,104.9,121
14,UBC,35,850,24.28571,17.492135,3,3.68,4.7,5.0,6.4,13.5,19.0,32.0,44.4,56.6,67.4,75.56,81
7,McGill,31,704,22.70968,14.744928,2,3.2,5.0,6.0,6.0,12.5,20.0,30.5,36.0,52.5,61.5,62.4,63
6,Laval,30,424,14.13333,13.242391,1,1.0,1.0,1.45,2.0,4.0,10.5,21.75,26.1,28.65,39.9,55.56,66
8,Michigan,28,639,22.82143,12.223395,2,2.54,3.35,4.35,6.4,14.5,22.5,33.5,38.0,41.25,44.3,45.92,47
4,Harvard,22,441,20.04545,15.50876,3,3.21,3.525,4.1,6.2,10.25,12.0,30.0,33.8,37.8,52.25,61.7,68
10,OHSU,19,422,22.21053,14.01023,2,2.18,2.45,2.9,5.4,12.5,21.0,28.5,37.4,44.4,50.7,54.48,57


In [26]:
#################################
## How long research?
#################################
## Span in years between each author's first and last publication year
au_metrics_df[["how_long"]] <- as.numeric(au_metrics_df[["pub_last"]]) - as.numeric(au_metrics_df[["pub_first"]])

## Five-number summaries (plus mean and NA count) of last year, first year and span
list(
    summary(as.numeric(au_metrics_df$pub_last)),
    summary(as.numeric(au_metrics_df$pub_first)),
    summary(au_metrics_df$how_long)
)

[[1]]
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
   2017    2022    2022    2022    2022    2023      27 

[[2]]
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
   1951    1995    2005    2003    2012    2021      27 

[[3]]
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
   0.00   10.00   17.00   19.12   27.00   72.00      27 


In [27]:
## Per-institution minimum, median and maximum of last publication year, first
## publication year and their difference. apply() runs as.numeric over each of
## the three selected columns; the summary function returns a 3-element list
## per group, which is why the printed result has list-valued cells.
aggregate(apply(au_metrics_df[,c("pub_last","pub_first","how_long")],2,as.numeric), list(au_metrics_df$scrape_inst), function(x) list(min(x, na.rm=TRUE), median(x, na.rm=TRUE), max(x, na.rm=TRUE)))

Group.1,pub_last,pub_first,how_long
<chr>,"<list[,3]>","<list[,3]>","<list[,3]>"
Cambridge,"2022, 2022, 2023","1977, 1996, 2012","10, 26, 45"
Dartmouth,"2017, 2022, 2023","1964, 1988, 2019","1, 34, 57"
Duke,"2021, 2022, 2023","1985, 2008, 2015","7, 15, 36"
Harvard,"2017, 2022, 2023","1968, 2002, 2018","4, 20, 54"
HKU,"2022, 2023, 2023","1987, 2009, 2020","2, 14, 36"
Laval,"2018, 2022, 2023","1994, 2007.5, 2021","0, 15, 28"
McGill,"2017, 2022, 2023","1978, 1998, 2015","6, 24, 44"
Michigan,"2020, 2022, 2023","1978, 2001, 2018","4, 21, 44"
Monash,"2022, 2022, 2023","1995, 2009.5, 2021","1, 12.5, 27"
OHSU,"2021, 2022, 2023","1990, 2001.5, 2018","4, 21, 33"


In [28]:
## Number of years exposure between start date and end date

In [29]:
## How many authors published their first recorded article in 2017 or later
## (TRUE) versus before 2017 (FALSE); authors with a missing year are not counted
table(as.numeric(au_metrics_df$pub_first) >= 2017)


FALSE  TRUE 
  502    62 

In [30]:
## Time under observation: clamp the first publication year to no earlier than
## 2017 and the last to no later than 2023, then overwrite `how_long` with the
## difference between the clamped years
au_metrics_df[["pub_first_"]] <- pmax(as.numeric(au_metrics_df[["pub_first"]]), 2017)
au_metrics_df[["pub_last_"]] <- pmin(as.numeric(au_metrics_df[["pub_last"]]), 2023)
au_metrics_df[["how_long"]] <- au_metrics_df[["pub_last_"]] - au_metrics_df[["pub_first_"]]

table(au_metrics_df[["how_long"]])


  0   1   2   3   4   5   6 
  7  17  18  23  50 335 114 

In [31]:
##
## Note: to get evaluation over study timeframe (not LIFETIME), need work with raw articles (and their meta-data)
##

In [32]:
#################################
## Session Information
#################################

In [33]:
Sys.Date()

In [34]:
sessionInfo()

R version 4.1.3 (2022-03-10)
Platform: x86_64-w64-mingw32/x64 (64-bit)
Running under: Windows 10 x64 (build 17763)

Matrix products: default

locale:
[1] LC_COLLATE=English_United States.1252 
[2] LC_CTYPE=English_United States.1252   
[3] LC_MONETARY=English_United States.1252
[4] LC_NUMERIC=C                          
[5] LC_TIME=English_United States.1252    

attached base packages:
[1] stats     graphics  grDevices utils     datasets  methods   base     

other attached packages:
[1] rscopus_0.7.1

loaded via a namespace (and not attached):
 [1] Rcpp_1.0.8.3     magrittr_2.0.3   tidyselect_1.1.2 uuid_1.1-0      
 [5] R6_2.5.1         rlang_1.0.2      fastmap_1.1.0    fansi_1.0.3     
 [9] plyr_1.8.7       httr_1.4.4       dplyr_1.0.9      tools_4.1.3     
[13] utf8_1.2.2       cli_3.3.0        htmltools_0.5.2  ellipsis_0.3.2  
[17] digest_0.6.29    tibble_3.1.7     lifecycle_1.0.1  crayon_1.5.1    
[21] IRdisplay_1.1    tidyr_1.2.0      purrr_0.3.4      repr_1.1.4      
[25] base6

In [35]:
version

               _                           
platform       x86_64-w64-mingw32          
arch           x86_64                      
os             mingw32                     
system         x86_64, mingw32             
status                                     
major          4                           
minor          1.3                         
year           2022                        
month          03                          
day            10                          
svn rev        81868                       
language       R                           
version.string R version 4.1.3 (2022-03-10)
nickname       One Push-Up                 