In [1]:
####################################################
## Look at overall SJR rankings of where family medicine documents are published (based on XXX year SJR data??)
##
##         - SciImago (Journal-level Data)
##
## Author: Chris Meaney
## Date: January 2023
####################################################

In [2]:
## Options for how many rows/cols displayed when printing matrices/dataframes/etc. in Jupyter
options(repr.matrix.max.rows=100) 
options(repr.matrix.max.cols=50) 

In [3]:
################
## Package dependencies (for APIs) and URLs to other data sources
################

## Bibliometric/scientometric metrics from Scopus/Elsevier
library(rscopus)

## URL to obtain SciImago SJR Journal Impact Factor information
sjr_url <- "https://www.scimagojr.com/journalrank.php?out=xls"

In [4]:
################
## Set rScopus API key
##
## Note: you can obtain a Scopus API key at the following URL: https://dev.elsevier.com/
################
api_key <- set_api_key("Enter_Your_API_Key_Here")
api_key

NULL

In [5]:
######################################################
## 
## Import Scopus Data
##
######################################################

In [6]:
####################
## Filepaths to import data
####################

## Set working directory path
wd_path <- "Enter_a_Path_to_a_Working_Directory_For_This_Project"

## Import the core scopus dataframe
scopus_fpath <- paste0(wd_path, "//Core_Scopus_Dataset_Sm.csv")


In [7]:
#####################
## Import core Scopus Dataset
#####################
scopus_df <- read.csv(scopus_fpath, header=TRUE, sep=",", stringsAsFactors=FALSE)
str(scopus_df)

'data.frame':	18874 obs. of  25 variables:
 $ au_id         : num  1e+10 1e+10 1e+10 1e+10 1e+10 ...
 $ prism_url     : chr  "https://api.elsevier.com/content/abstract/scopus_id/85140976962" "https://api.elsevier.com/content/abstract/scopus_id/85121747496" "https://api.elsevier.com/content/abstract/scopus_id/85112107056" "https://api.elsevier.com/content/abstract/scopus_id/85118672412" ...
 $ eid           : chr  "2-s2.0-85140976962" "2-s2.0-85121747496" "2-s2.0-85112107056" "2-s2.0-85118672412" ...
 $ doi           : chr  "10.1016/j.socscimed.2022.115463" "10.1111/add.15760" "10.1016/j.socscimed.2021.114262" "10.3399/BJGP.2021.0195" ...
 $ issn          : chr  "02779536" "09652140" "02779536" "09601643" ...
 $ eissn         : chr  "18735347" "13600443" "18735347" "14785242" ...
 $ pub_title     : chr  "Brief opportunistic interventions by general practitioners to promote smoking cessation: A conversation analytic study" "The old and familiar meets the new and unknown: patient and clin

In [8]:
table(scopus_df$pub_year)


2017 2018 2019 2020 2021 2022 
2591 2811 2823 3075 3617 3957 

In [9]:
####################
## Create small data.frame that only considers unique publications
####################
scopus_df_uniq <- scopus_df[is.na(scopus_df$pub_title)==FALSE, ]
scopus_df_uniq <- scopus_df_uniq[!duplicated(scopus_df_uniq$pub_title), ]

scopus_df_uniq <- scopus_df[is.na(scopus_df$eid)==FALSE, ]
scopus_df_uniq <- scopus_df_uniq[!duplicated(scopus_df_uniq$eid), ]

scopus_df_uniq <- scopus_df[is.na(scopus_df$doi)==FALSE, ]
scopus_df_uniq <- scopus_df_uniq[!duplicated(scopus_df_uniq$doi), ]

list(
    dim(scopus_df_uniq),
    dim(scopus_df)
    )

In [10]:
## How many included institutions and authors (in dataset of unique titles)
list(
    length(unique(scopus_df_uniq$au_institution)),
    length(unique(scopus_df_uniq$au_name)),
    length(unique(scopus_df_uniq$au_id))
)

In [11]:
##
## We will merge with SciMago SJR data below, using ISSN as key
##
## Note: this too is a bit of a HACK --- SciMago has BOTH 1) issn, and 2) eissn
## Note: this too is a bit of a HACK --- A given record/row of SciMago data, may contain multiple concatenated issn/eissn
##
## To simplify we focus on ONLY issn (a limitation); and further, only consider first issn if multiple are reported per record
##
dim(scopus_df_uniq)

In [12]:
length(unique(scopus_df_uniq$issn))

In [13]:
#######################################################
##
## Journal data from SciImago: 
## https://www.scimagojr.com/journalrank.php
##
#######################################################

In [14]:
sjr <- read.csv(url(sjr_url), sep=";", header=TRUE)
dim(sjr)

In [15]:
data.frame(names(sjr))

names.sjr.
<chr>
Rank
Sourceid
Title
Type
Issn
SJR
SJR.Best.Quartile
H.index
Total.Docs...2021.
Total.Docs...3years.


In [16]:
##
## It's a bit weird because ISSN is stored as "," comma-separated string. 
## Need to parse these string data, represent SciImago dataframe in long format (expanding on ISSN). 
## Remove duplicate ISSN. 
## Then join with rScopus data on ISSN.
##
## note: I **think** the different ISSN in the "," comma-separated list is ISSN vs. e-ISSN, etc.
##

In [17]:
##########################
## Hack --- parse and only keep the first ISSN
##########################
sjr$Issn_ <- sapply(strsplit(sjr$Issn, split=","), function(x) x[[1]])
head(sjr, 5)

Unnamed: 0_level_0,Rank,Sourceid,Title,Type,Issn,SJR,SJR.Best.Quartile,H.index,Total.Docs...2021.,Total.Docs...3years.,Total.Refs.,Total.Cites..3years.,Citable.Docs...3years.,Cites...Doc...2years.,Ref....Doc.,Country,Region,Publisher,Coverage,Categories,Issn_
Unnamed: 0_level_1,<int>,<dbl>,<chr>,<chr>,<chr>,<chr>,<chr>,<int>,<int>,<int>,<int>,<int>,<int>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>
1,1,28773,Ca-A Cancer Journal for Clinicians,journal,"15424863, 00079235",56204,Q1,182,41,121,4006,17959,78,18675,9771,United States,Northern America,Wiley-Blackwell,1950-2021,Hematology (Q1); Oncology (Q1),15424863
2,2,20315,Nature Reviews Molecular Cell Biology,journal,"14710072, 14710080",33213,Q1,452,111,338,9025,13797,161,3855,8131,United Kingdom,Western Europe,Nature Publishing Group,2000-2021,Cell Biology (Q1); Molecular Biology (Q1),14710072
3,3,29431,Quarterly Journal of Economics,journal,"00335533, 15314650",31348,Q1,272,48,111,3406,2241,110,1630,7096,United Kingdom,Western Europe,Oxford University Press,1886-2021,Economics and Econometrics (Q1),335533
4,4,18434,Cell,journal,"00928674, 10974172",25716,Q1,814,517,1727,33658,73240,1639,4500,6510,United States,Northern America,Cell Press,1974-2021,"Biochemistry, Genetics and Molecular Biology (miscellaneous) (Q1)",928674
5,5,19434,MMWR Recommendations and Reports,journal,"10575987, 15458601",25045,Q1,148,124,17,2900,663,17,3379,2339,United States,Northern America,Centers for Disease Control and Prevention (CDC),1990-2021,"Epidemiology (Q1); Health Information Management (Q1); Health (social science) (Q1); Health, Toxicology and Mutagenesis (Q1); Medicine (miscellaneous) (Q1)",10575987


In [18]:
###########################
## Join the SJR data onto 
###########################
scopus_df_uniq <- merge(x=scopus_df_uniq, y=sjr, by.x="issn", by.y="Issn_", all.x=TRUE)
dim(scopus_df_uniq)

In [19]:
#########################
## Distribution of journal impact factor where papers published
## Note: many of the titles/ISSN are missing SJR (SciImago Journal Reports Imapct Factor)
#########################
summary(as.numeric(gsub(scopus_df_uniq$SJR, pattern=",", replace=".")))

   Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
  0.102   0.730   1.073   1.811   1.792  25.716    8339 

In [20]:
data.frame(quantile(as.numeric(gsub(scopus_df_uniq$SJR, pattern=",", replace=".")), seq(0,1,0.025), na.rm=TRUE))

Unnamed: 0_level_0,quantile.as.numeric.gsub.scopus_df_uniq.SJR..pattern........replace..........
Unnamed: 0_level_1,<dbl>
0%,0.102
2.5%,0.2776
5%,0.412
7.5%,0.438
10%,0.47
12.5%,0.483
15%,0.511
17.5%,0.568325
20%,0.628
22.5%,0.681


In [21]:
#########################
## SJR Best Quartile -- great deal missing data
#########################
table(scopus_df_uniq[["SJR.Best.Quartile"]], useNA="always")


   -   Q1   Q2   Q3   Q4 <NA> 
   1 3585  902  183   84 8338 

In [22]:
##########################
## Journal H-Index -- great deal missing data
##########################
summary(as.numeric(scopus_df_uniq[["H.index"]]))

   Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
    2.0    66.0   109.0   152.2   191.0  1079.0    8338 

In [23]:
##########################
## Country of journal title
##########################
sort(table(scopus_df_uniq[["Country"]], useNA="always"), decreasing=TRUE)


                <NA>        United States       United Kingdom 
                8338                 1927                 1779 
              Canada          Netherlands          Switzerland 
                 301                  234                  165 
           Australia              Germany              Ireland 
                  90                   83                   71 
              France              Austria                China 
                  17                   16                   12 
               Italy                Egypt                India 
                   9                    7                    5 
              Mexico               Poland              Denmark 
                   4                    4                    3 
           Singapore                Spain United Arab Emirates 
                   3                    3                    3 
            Colombia                Japan          New Zealand 
                   2                   

In [24]:
############################
## Stratify by institution
############################

In [25]:
## SJR Ranking vs. instituion

sjr_df_by_inst <- 
    data.frame(data.frame(aggregate(as.numeric(gsub(scopus_df_uniq$SJR, pattern=",", replace=".")), 
   list(scopus_df_uniq$au_institution),
   quantile, probs=seq(0,1,0.25), na.rm=TRUE))[,2])

names(sjr_df_by_inst) <- c("min", "q25", "q50", "q75", "max")

rownames(sjr_df_by_inst) <- 
    unlist(data.frame(aggregate(as.numeric(gsub(scopus_df_uniq$SJR, pattern=",", replace=".")), 
    list(scopus_df_uniq$au_institution),
    quantile, probs=seq(0,1,0.25), na.rm=TRUE))[,1])

t(sjr_df_by_inst)

Unnamed: 0,Cambridge,Dartmouth,Duke,Harvard,HKU,Laval,McGill,Michigan,Monash,OHSU,Ottawa,Oxford,Toronto,UBC,UCL,UCSF,UNSW
min,0.105,0.181,0.185,0.135,0.13,0.131,0.109,0.129,0.238,0.287,0.105,0.102,0.131,0.118,0.174,0.147,0.304
q25,0.681,0.881,0.813,0.701,0.793,0.47,0.6625,0.7235,0.5,0.8,0.635,0.875,0.624,0.537,0.748,0.8,0.5
q50,0.986,1.348,1.0795,1.371,0.981,0.789,0.895,1.05,0.793,0.914,0.919,1.451,1.111,0.847,1.274,1.071,0.814
q75,1.472,2.023,1.85975,3.65325,1.677,1.219,1.4755,1.664,1.225,1.712,1.667,2.291,1.664,1.438,2.064,2.208,1.155
max,15.652,24.907,16.513,24.907,15.652,15.652,15.652,24.907,4.336,4.029,3.395,25.716,24.907,24.907,24.907,24.907,3.67


In [26]:
## Journal quartile by institution
inst_by_journal_quartile <- data.frame(unclass(table(scopus_df_uniq$au_institution, scopus_df_uniq[["SJR.Best.Quartile"]], useNA="always")))
inst_by_journal_quartile <- inst_by_journal_quartile[order(-rowSums(inst_by_journal_quartile)), -c(1,6)]
inst_by_journal_quartile <- round(inst_by_journal_quartile/rowSums(inst_by_journal_quartile)*100, 2)
inst_by_journal_quartile <- inst_by_journal_quartile[-nrow(inst_by_journal_quartile),]
t(inst_by_journal_quartile)

Unnamed: 0,Oxford,Toronto,Dartmouth,UCL,UBC,McGill,Michigan,Ottawa,Laval,Cambridge,Harvard,OHSU,Duke,UCSF,HKU,Monash,UNSW
Q1,83.49,71.81,81.84,74.75,63.69,73.76,74.9,74.22,58.15,81.08,77.44,79.59,77.7,86.63,79.2,53.28,52.17
Q2,13.5,22.32,11.44,14.81,31.08,18.63,19.22,20.44,37.5,10.14,15.24,16.33,16.89,8.14,12.8,41.61,46.38
Q3,2.18,2.64,3.48,9.43,3.38,4.56,3.14,2.22,1.63,8.78,6.71,4.08,4.73,4.65,8.0,3.65,1.45
Q4,0.83,3.23,3.23,1.01,1.85,3.04,2.75,3.11,2.72,0.0,0.61,0.0,0.68,0.58,0.0,1.46,0.0


In [27]:
############################
## System information
############################

In [28]:
Sys.time()

[1] "2023-01-19 00:03:46 EST"

In [29]:
data.frame(Sys.info())

Unnamed: 0_level_0,Sys.info..
Unnamed: 0_level_1,<chr>
sysname,Windows
release,10 x64
version,build 17763
nodename,DFCM-59NJ8Y1
machine,x86-64
login,meaneych
user,meaneych
effective_user,meaneych


In [30]:
version

               _                           
platform       x86_64-w64-mingw32          
arch           x86_64                      
os             mingw32                     
system         x86_64, mingw32             
status                                     
major          4                           
minor          1.3                         
year           2022                        
month          03                          
day            10                          
svn rev        81868                       
language       R                           
version.string R version 4.1.3 (2022-03-10)
nickname       One Push-Up                 