In [1]:
###################################################
## Post Process the primary care research corpus results --- structure author lists into edge list, network adjacency matrix
##
## Author: Chris Meaney
## Date: January 2023
###################################################

In [2]:
############
## Dependencies
############

## For network analysis
library(igraph)


Attaching package: 'igraph'


The following objects are masked from 'package:stats':

    decompose, spectrum


The following object is masked from 'package:base':

    union




In [3]:
##############
## Filepaths to import data from disk, and export data to disk
##############

## Set working directory path
wd_path <- "Enter_a_Path_to_a_Working_Directory_For_This_Project"

## Import core scopus dataframe
fpath_scopus <- paste0(wd_path, "\\Core_Scopus_Dataset_Sm.csv")

## Import eid for edge list construction
fpath_eid1 <- paste0(wd_path, "\\Scopus_CoAuthNetwork_EID1.csv")
fpath_eid2 <- paste0(wd_path, "\\Scopus_CoAuthNetwork_EID2.csv")

## Import author affiliation data
affils_eid_fpath1 <- paste0(wd_path, "//Scopus_CoAuthAffiliationNetwork_EID1.csv")
affils_eid_fpath2 <- paste0(wd_path, "//Scopus_CoAuthAffiliationNetwork_EID2.csv")

## Import IDs 
ids_fpath <- paste0(wd_path, "//Final_Combined_ScopusIdsFile_JayaSelena.csv")

In [4]:
##################################
## Import Scopus dataset
##################################
scopus_df <- read.csv(file=fpath_scopus, header=TRUE, sep=",", stringsAsFactors=FALSE)
str(scopus_df)

'data.frame':	18874 obs. of  25 variables:
 $ au_id         : num  1e+10 1e+10 1e+10 1e+10 1e+10 ...
 $ prism_url     : chr  "https://api.elsevier.com/content/abstract/scopus_id/85140976962" "https://api.elsevier.com/content/abstract/scopus_id/85121747496" "https://api.elsevier.com/content/abstract/scopus_id/85112107056" "https://api.elsevier.com/content/abstract/scopus_id/85118672412" ...
 $ eid           : chr  "2-s2.0-85140976962" "2-s2.0-85121747496" "2-s2.0-85112107056" "2-s2.0-85118672412" ...
 $ doi           : chr  "10.1016/j.socscimed.2022.115463" "10.1111/add.15760" "10.1016/j.socscimed.2021.114262" "10.3399/BJGP.2021.0195" ...
 $ issn          : chr  "02779536" "09652140" "02779536" "09601643" ...
 $ eissn         : chr  "18735347" "13600443" "18735347" "14785242" ...
 $ pub_title     : chr  "Brief opportunistic interventions by general practitioners to promote smoking cessation: A conversation analytic study" "The old and familiar meets the new and unknown: patient and clin

In [5]:
## Only keep publications from certain years --- 2000:2021
table(scopus_df$pub_year)


2017 2018 2019 2020 2021 2022 
2591 2811 2823 3075 3617 3957 

In [6]:
## Map institution to region
scopus_df$au_institution_country <- with(scopus_df, 
                                                       ifelse(au_institution %in% c("Toronto","UBC","McGill","Laval","Ottawa"), "Canada",
                                                       ifelse(au_institution %in% c("Dartmouth","Michigan","Duke","OHSU","UCSF","Harvard"), "USA",
                                                       ifelse(au_institution %in% c("Oxford", "Cambridge","UCL"), "UK", "Other"))))

table(scopus_df$au_institution, scopus_df$au_institution_country)

           
            Canada Other   UK  USA
  Cambridge      0     0  560    0
  Dartmouth      0     0    0 1173
  Duke           0     0    0  380
  Harvard        0     0    0  485
  HKU            0   648    0    0
  Laval        785     0    0    0
  McGill      1025     0    0    0
  Michigan       0     0    0  793
  Monash         0   455    0    0
  OHSU           0     0    0  752
  Ottawa       774     0    0    0
  Oxford         0     0 5443    0
  Toronto     2484     0    0    0
  UBC          920     0    0    0
  UCL            0     0 1526    0
  UCSF           0     0    0  421
  UNSW           0   250    0    0

In [7]:
## How many unique institutions
length(unique(scopus_df$au_institution))

In [8]:
## How many unique authors
length(unique(scopus_df$au_name))

In [9]:
## Get original set of authors (N=591) no duplicates
scopus_df_uniq <- scopus_df[!(duplicated(scopus_df$au_name)), c("au_id","au_name","au_institution")]
dim(scopus_df_uniq)

In [10]:
head(scopus_df_uniq)

Unnamed: 0_level_0,au_id,au_name,au_institution
Unnamed: 0_level_1,<dbl>,<chr>,<chr>
1,10041373600,RebeccaBarnes,Oxford
34,10240446500,ChristineTCigolle,Michigan
49,12791296200,ClaireKendall,Ottawa
122,12804958400,MinaRuthSilberberg,Duke
137,14421205500,BertrandLebouche,McGill
217,14826143300,ClaudeTopping,Laval


In [11]:
#################################
##
## Import EID authors/ids files
##
#################################
eid_results1 <- read.csv(file=fpath_eid1, header=TRUE, sep=",")
eid_results2 <- read.csv(file=fpath_eid2, header=TRUE, sep=",")

eid_results <- rbind(eid_results1, eid_results2)

str(eid_results)

'data.frame':	150906 obs. of  4 variables:
 $ eid   : chr  "2-s2.0-85140976962" "2-s2.0-85140976962" "2-s2.0-85140976962" "2-s2.0-85140976962" ...
 $ auths : chr  "Wheat H." "Barnes R.K." "Aveyard P." "Stevenson F." ...
 $ ids   : num  5.64e+10 1.00e+10 5.51e+10 7.10e+09 3.51e+10 ...
 $ doc_id: int  1 1 1 1 1 2 2 2 2 2 ...


In [12]:
##
## Descriptive stats
##
list(
    dim(eid_results),
    length(unique(eid_results$eid)),
    length(unique(eid_results$doc_id)),
    length(unique(eid_results$ids))
)

In [13]:
#head(eid_results)

In [14]:
## Number unique authors (including the N=591 ppl in the original Scopus query)
list(
    length(unique(eid_results$ids)),
    length(unique(eid_results$auths))
    )

In [15]:
## Number unique authors --- after subtracting off the original N=591 included in the Scopus query
length(unique(eid_results$ids)) - length(unique(scopus_df$au_name))

In [16]:
##
## Normalize author names lists
## 
## WARNING --- this is a bit HACKY; and we expect that it will resolve some problems while creating other new problems
## For example, for common/generic Asian family names restricting to LastName+FirstInitial may not be enough granularity to uniquely specify authors
##

## See weird case for example
# eid_results[grepl(x=eid_results$auths, pattern="Upshur"), ]
eid_results$auths_norm <- sapply(strsplit(x=eid_results$auths, split="\\."), function(x) x[[1]])
# eid_results[grepl(x=eid_results$auths_norm, pattern="Upshur"), ]

list(
    length(unique(eid_results$auths)),
    length(unique(eid_results$auths_norm)),
    length(unique(eid_results$ids))
)

In [17]:
##
##
## Note...you will get slightly/subtly different answers depending on your definition of an author/person
## For example, is an author/person defined by 1) Scopus ID, 2) author name, 3) normalized author name, 4) some other identifier, etc.
## Here we use Scopus ID because we think it is less biased than author name --- but we acknowledge it is NOT perfect
## For example, some unique authors/people may have multiple Scopus IDs (a multiplicity problem)...
##
##

In [18]:
##
## Split into separate eid files
##
eid_split <- split(eid_results, f=as.factor(eid_results$eid))
length(eid_split)

In [19]:
## Get number of authors per publication (based on eid)
num_authors <- sapply(eid_split, nrow)

num_authors_df <- data.frame(table(num_authors))
prop_authors_df <- data.frame(prop.table(table(num_authors)))

authors_df <- cbind(num_authors_df, round(prop_authors_df[,2]*100, 2))
authors_df <- data.frame(apply(authors_df, 2, as.numeric))
names(authors_df) <- c("num_authors", "freq", "prop")
authors_df

num_authors,freq,prop
<dbl>,<dbl>,<dbl>
1,1184,9.04
2,416,3.18
3,731,5.58
4,1178,8.99
5,1347,10.28
6,1367,10.44
7,1217,9.29
8,995,7.60
9,817,6.24
10,689,5.26


In [20]:
## Check on author freq counts
list(
    dim(eid_results),
    sum(authors_df$num_authors * authors_df$freq)
)

In [21]:
## Quantiles on number of authors
data.frame(num_authors=quantile(num_authors, probs=c(0, 0.25, 0.50, 0.75, 1.0)))

Unnamed: 0_level_0,num_authors
Unnamed: 0_level_1,<dbl>
0%,1
25%,4
50%,7
75%,10
100%,3391


In [22]:
## Number of collaborators - minus the number of original authors in the search/query
list(
    length(unique(eid_results$ids)),
    length(unique(eid_results$ids)) - length(unique(scopus_df$au_name))
)

In [23]:
## Investigate weird cases with >1000 authors on publication 
## We have seen some large international collaborations of this nature
## However, for assessing collaboration, we feel it is best to drop these edge cases, as to not distort traditional small/medium team collaboration

# big_collabs <- eid_split[sapply(eid_split, nrow) > 1000]
# length(big_collabs)
# big_collabs[1]

## Note: these seem like legit, albeit, really big team science projects

In [24]:
##
## For now I will leave these in, since it makes counting collaborations more consistent
## However, I **think** these few very large collaborations may distort estimation of centrality measures
##
## That said, the removal of large collaborative projects is very subjective
## And further, the exact threshold for determination of what is big (>10, >25, >50, >100, etc.) is quite subjective
##

In [25]:
## Create flag and drop articles with very many authors (say num_authors>25)
# num_authors_flag <- num_authors>25
# table(num_authors_flag)

In [26]:
# eid_split <- eid_split[!num_authors_flag]
# length(eid_split)

In [27]:
##
## Get Author Pairs DF --- Note: this is based on unique Scopus ID (an assumption; noting we could use given name)
##
t0 <- Sys.time()

author_pairs_list <- lapply(eid_split, function(x) subset(expand.grid(x$ids, x$ids),Var1!=Var2))
el_auth <- do.call("rbind", author_pairs_list)
names(el_auth) <- c("Auth1","Auth2")
el_auth$Auth1 <- as.character(el_auth$Auth1)
el_auth$Auth2 <- as.character(el_auth$Auth2)

t1 <- Sys.time()
t1-t0

Time difference of 1.335881 mins

In [28]:
list(
    #str(el_auth),
    dim(el_auth),
    length(unique(c(el_auth$Auth1, el_auth$Auth2)))
)

In [29]:
## 
## Note: the dimension above are essentially the number of vertex-to-vertex ties/edges
## Note: we DOUBLE COUNT above; since we include both a->b and b->a type ties
## Note: we have EXCLUDED the a->a type ties/edges
## 
## Hence...the total number of ties is nrow(el_auth)/2
##

In [30]:
## Total number of unique ties/edges
nrow(el_auth)/2

In [31]:
##
## Note: this number/estimate is IMO greatly inflated because of some outlying LARGE studies
##
## For example, our largest study contains 3300ish authors --- this study alone creates choose(3300,2) ties ~ 5M
##

In [32]:
head(el_auth)

Unnamed: 0_level_0,Auth1,Auth2
Unnamed: 0_level_1,<chr>,<chr>
2-s2.0-79960678065.2,6603627308,23498419200
2-s2.0-79960678065.3,7201918014,23498419200
2-s2.0-79960678065.4,7003757445,23498419200
2-s2.0-79960678065.5,8240813400,23498419200
2-s2.0-79960678065.6,23498419200,6603627308
2-s2.0-79960678065.8,7201918014,6603627308


In [33]:
###############################
##
## Grab all co-author pairs - for coauthorship network analysis
##
###############################

##
## Create network graph from directed citation edge-list
##

el_auth <- el_auth[!is.na(el_auth$Auth1), ]
el_auth <- el_auth[!is.na(el_auth$Auth2), ]
el_auth <- el_auth[el_auth$Auth1 != "", ]
el_auth <- el_auth[el_auth$Auth2 != "", ]
dim(el_auth)

In [34]:
## Co-Authorship Network Graph
t0 <- Sys.time()
g_auth <- graph_from_edgelist(el=as.matrix(el_auth), directed=FALSE)
t1 <- Sys.time()
t1-t0

saveRDS(g_auth, file=paste0(wd_path, "\\Author_Network.RDS"))
# g_auth <- readRDS(file=paste0(wd_path, "\\Author_Network.RDS"))

Time difference of 2.426253 mins

In [35]:
###################################
##
## Summary statistics about graph, and vertices (i.e. centrality measures, hub-scores, etc.)
##
###################################

In [36]:
##
## Graph diameter
##
t0 <- Sys.time()
diam <- diameter(g_auth, directed=FALSE, unconnected=TRUE)
t1 <- Sys.time()
time_diam <- t1-t0
#time_diam

diam 

In [37]:
##
## Centrality measures
## See: https://en.wikipedia.org/wiki/Centrality
##

In [38]:
##
## Degreee centrality
##
t0 <- Sys.time()
cent_deg <- centr_degree(g_auth)
t1 <- Sys.time()
time_deg <- t1-t0
#time_deg

str(cent_deg)

List of 3
 $ res            : num [1:66684] 1332 8 8 8 8 ...
 $ centralization : num 0.269
 $ theoretical_max: num 4.45e+09


In [39]:
deg_vec <- cent_deg$res
deg_vec_names <- unlist(vertex_attr(g_auth))
deg_df <- data.frame(auth=deg_vec_names, deg=deg_vec)
# str(deg_df)
deg_df <- with(deg_df, deg_df[order(-deg),])
head(deg_df, 10)

Unnamed: 0_level_0,auth,deg
Unnamed: 0_level_1,<chr>,<dbl>
name4921,7102313705,19172
name5128,7006198464,19172
name5731,57198631836,19172
name12780,57200676779,16846
name5217,57204249661,16662
name10809,36341865600,16620
name11396,6602465281,16620
name11519,57215474538,16620
name4918,12798925900,16616
name4927,57203071203,16558


In [40]:
##
## Closeness centrality
##
t0 <- Sys.time()
cent_clo <- centr_clo(g_auth, mode="all")
t1 <- Sys.time()
time_clo <- t1-t0
#time_clo

str(cent_clo)

List of 3
 $ res            : num [1:66684] 0.312 0.238 0.238 0.238 0.238 ...
 $ centralization : num 1.46
 $ theoretical_max: num 33341


In [41]:
clo_vec <- cent_clo$res
clo_vec_names <- unlist(vertex_attr(g_auth))
clo_df <- data.frame(author=clo_vec_names, clo=clo_vec)
# str(clo_df)
clo_df <- with(clo_df, clo_df[order(-clo),])
head(clo_df, 10)

Unnamed: 0_level_0,author,clo
Unnamed: 0_level_1,<chr>,<dbl>
name127,55477039500,1
name23079,57201188264,1
name23148,57194028657,1
name23149,56037826900,1
name23150,55180583700,1
name23151,35729034100,1
name34304,18042857600,1
name35708,36440099800,1
name35709,7102327915,1
name40229,7102539808,1


In [42]:
##
## Between-ness centrality
##
t0 <- Sys.time()
cent_betw <- centr_betw(g_auth, directed=FALSE)
t1 <- Sys.time()
time_betw <- t1-t0
#time_betw
str(cent_betw)

List of 3
 $ res            : num [1:66684] 8701662 0 0 0 0 ...
 $ centralization : num 0.0254
 $ theoretical_max: num 1.48e+14


In [43]:
betw_vec <- cent_betw$res
betw_vec_names <- unlist(vertex_attr(g_auth))
betw_df <- data.frame(author=betw_vec_names, betw=betw_vec)
# str(betw_df)
betw_df <- with(betw_df, betw_df[order(-betw),])
head(betw_df, 10)

Unnamed: 0_level_0,author,betw
Unnamed: 0_level_1,<chr>,<dbl>
name690,57213046672,56559251
name85,7003334937,53772948
name893,57202103259,48894370
name4584,55484828600,42899718
name1011,8577632600,42707615
name2636,7005778918,42128333
name207,6603577135,38644940
name1849,7003779589,37889453
name2305,26643433800,37384179
name187,55666477900,35992378


In [44]:
##
## Page Rank
##
t0 <- Sys.time()
pr <- page_rank(g_auth)
t1 <- Sys.time()
time_pr <- t1-t0
# time_pr

In [45]:
pr_vec <- pr$vector
pr_vec_names <- names(pr_vec)
pr_df <- data.frame(author=pr_vec_names, pr=pr_vec)
# str(pr_df)
pr_df <- with(pr_df, pr_df[order(-pr),])
head(pr_df, 10)

Unnamed: 0_level_0,author,pr
Unnamed: 0_level_1,<chr>,<dbl>
7003334937,7003334937,0.0005850749
7003779589,7003779589,0.000531499
26643433800,26643433800,0.0005063092
57202103259,57202103259,0.0004633103
6603577135,6603577135,0.0004411518
18437196800,18437196800,0.0004325239
6506713049,6506713049,0.000393014
7004019124,7004019124,0.0003904088
7006242730,7006242730,0.0003900237
7103067740,7103067740,0.00037544


In [46]:
##
## Put centrality measures into single data frame
##
deg_vec <- head(paste0(deg_df$auth, " (", round(deg_df$deg, 0), ")"), 25)
betw_vec <- head(paste0(betw_df$author, " (", round(betw_df$betw, 0), ")"), 25)
clo_vec <- head(paste0(clo_df$author, " (", round(clo_df$clo, 3), ")"), 25)
pr_vec <- head(paste0(pr_df$author, " (", round(pr_df$pr, 4), ")"), 25)

cent_df <- data.frame(rank=1:25,
                      deg=deg_vec,
                      betw=betw_vec,
                      clo=clo_vec,
                      pr=pr_vec)

# cent_df

In [47]:
##########################################
##
##
## Write to DISK
##
##
##########################################

## Author Network Stats
write.csv(deg_df, file=paste0(wd_path, "//centrality_degree.csv"), row.names=FALSE)
write.csv(clo_df, file=paste0(wd_path, "//centrality_closeness.csv"), row.names=FALSE)
write.csv(betw_df, file=paste0(wd_path, "//centrality_betweenness.csv"), row.names=FALSE)
write.csv(pr_df, file=paste0(wd_path, "//centrality_pagerank.csv"), row.names=FALSE)


In [48]:
#############################################
##
## Try to plot the co-author graph
## https://kateto.net/wp-content/uploads/2016/01/NetSciX_2016_Workshop.pdf
##
#############################################

## Number of authors

list(
dim(el_auth),
length(unique(el_auth$Auth1)),
length(unique(el_auth$Auth2)),
length(unique(c(el_auth$Auth1, el_auth$Auth2)))
)

In [49]:
## Which names will be plotted on vertex

# threshold <- 750
# names(V(g_auth))[degree(g_auth)>threshold]

#l <- layout.lgl(g_auth,  
#       maxiter=150,
#       maxdelta=vcount(g_auth),
#       area=vcount(g_auth)^2,
#       coolexp=1.5
#       )

In [50]:
########
## Kamada Kawai Force Direct Graph Layout
########

# options(repr.plot.width=16, repr.plot.height=16)

# set.seed(54321) 

# t0 <- Sys.time()

# l_kk <- layout_with_kk(g_auth)

# plot(g_auth, 
#     layout=l_kk,  
#     ## Vertex attributes
#     #vertex.label = ifelse(degree(g_auth) > threshold, names(V(g_auth)), NA),
#     vertex.label = NA,
#     #vertex.label.cex=0.25,
#     #vertex.label.color="red", 
#     #vertex.label.font=2,
#     vertex.shape="circle", 
#     vertex.size=0.1, 
#     vertex.color=V(g_auth)$color,
#     vertex.frame.color='black',
#     ## Edge attributes
#     edge.mode=0,
#     edge.color='grey90',
#     edge.arrow.size=0.1, 
#     edge.width=0.1,
#     ## Other attributes
#     frame=TRUE
#     )

# t1 <- Sys.time()
# t1-t0

In [51]:
########
## Kamada Kawai Force Direct Graph Layout
########
# set.seed(54321) 

# t0 <- Sys.time()

# l_kk <- layout_with_kk(g_auth)

# plot(g_auth, 
#     layout=l_kk,  
#     ## Vertex attributes
#     #vertex.label = ifelse(degree(g_auth) > threshold, names(V(g_auth)), NA),
#     vertex.label = NA,
#     #vertex.label.cex=0.25,
#     #vertex.label.color="red", 
#     #vertex.label.font=2,
#     vertex.shape="circle", 
#     vertex.size=0.1, 
#     vertex.color="black",
#     vertex.frame.color='black',
#     ## Edge attributes
#     edge.mode=0,
#     edge.color='grey90',
#     edge.arrow.size=0.1, 
#     edge.width=0.1,
#     ## Other attributes
#     frame=TRUE
#     )

# t1 <- Sys.time()
# t1-t0

In [52]:
########################################################
##
##
## Collaborative Networks (Local vs National vs International)
##
##
########################################################

In [53]:
##
## Import author affiliation location/country data
##
affils_eid_df1 <- read.csv(file=affils_eid_fpath1, header=TRUE, sep=",", stringsAsFactors=FALSE)
affils_eid_df2 <- read.csv(file=affils_eid_fpath2, header=TRUE, sep=",", stringsAsFactors=FALSE)

affils_eid_df <- rbind(affils_eid_df1, affils_eid_df2)

head(affils_eid_df)

Unnamed: 0_level_0,eid,affil_city,affil_id,affil_name,affil_country,doc_id
Unnamed: 0_level_1,<chr>,<chr>,<int>,<chr>,<chr>,<int>
1,2-s2.0-85140976962,Plymouth,60024779,University of Plymouth,United Kingdom,1
2,2-s2.0-85140976962,London,60022148,University College London,United Kingdom,1
3,2-s2.0-85140976962,Oxford,60002634,University of Oxford Medical Sciences Division,United Kingdom,1
4,2-s2.0-85121747496,London,60022148,University College London,United Kingdom,2
5,2-s2.0-85121747496,Nottingham,60015138,University of Nottingham,United Kingdom,2
6,2-s2.0-85121747496,Norwich,60011775,"University of East Anglia, Faculty of Medicine and Health Sciences",United Kingdom,2


In [54]:
##
## Only keep subset of collaborations in the "dfcm_eid" vector
##
list(
    dim(affils_eid_df),
    length(unique(affils_eid_df$eid))
    )

In [55]:
##
## Handle missing affiliation data --- this often presents as a nrow=1 dataFrame with all NAs for affiliation data
##
affils_eid_df_sm <- affils_eid_df[!(is.na(affils_eid_df$affil_city) | is.na(affils_eid_df$affil_country)), ]

list(
    dim(affils_eid_df),
    dim(affils_eid_df_sm)
)

In [56]:
## Split affiliations by EID/
affil_split <- split(affils_eid_df_sm, f=as.factor(affils_eid_df_sm$eid))
length(affil_split)

In [57]:
## Affil size
affil_size <- sapply(affil_split, nrow)
table(affil_size)

affil_size
   1    2    3    4    5    6    7    8    9   10   11   12   13   14   15   16 
1220 1835 1997 1752 1274 1038  739  533  381  263  225  144  118   93   67   65 
  17   18   19   20   21   22   23   24   25   26   27   28   29   30   31   32 
  45   42   33   34   21   11   14   16   11    9    7    7    7    8    7    2 
  33   34   35   36   37   38   39   40   41   42   43   44   45   47   50   52 
   2    1    2    4    3    4    1    2    2    2    1    1    1    1    1    1 
  54   56   61   62   63   66   69   70   71   72   73   74   77   78   79   83 
   1    2    3    2    1    1    1    1    1    1    1    1    3    2    1    3 
  84   87   89   96  104  107  127  281 
   2    1    1    1    1    1    1    1 

In [58]:
## International vs National
national_collab_flag <- sapply(affil_split, function(x) length(unique(x[["affil_country"]]))==1)
table(national_collab_flag)

national_collab_flag
FALSE  TRUE 
 4088  7993 

In [59]:
## Get the international collaboartions
int_affil_split <- affil_split[!national_collab_flag]
length(int_affil_split)

In [60]:
## Get the national collaborations
nat_affil_split <- affil_split[national_collab_flag]
length(nat_affil_split)

In [61]:
## Are national collaborations only local --- based on affiliation city
local_collab_flag <- sapply(nat_affil_split, function(x) length(unique(x[["affil_city"]]))==1)
table(local_collab_flag)

local_collab_flag
FALSE  TRUE 
 5211  2782 

In [62]:
## Get local collaborations
local_affil_split <- nat_affil_split[local_collab_flag]
length(local_affil_split)

In [63]:
## Get national collaborations
nat_affil_split <- nat_affil_split[!local_collab_flag]
length(nat_affil_split)

In [64]:
## Vector of collab types
collab_types <- c(local=length(local_affil_split), 
                 national=length(nat_affil_split),
                 international=length(int_affil_split)
                 )

collab_types

In [65]:
sum(collab_types)

In [66]:
##
## Proportion of research publications which are local vs. national vs. international in scope
##
round(collab_types/sum(collab_types)*100, 2)

In [67]:
##
## What countries involves in production of most primary care research
##

In [68]:
affil_country <- data.frame(table(affils_eid_df_sm$affil_country))
names(affil_country) <- c("country", "freq")
affil_country <- with(affil_country, affil_country[order(-freq), ])
head(affil_country, 25)

Unnamed: 0_level_0,country,freq
Unnamed: 0_level_1,<fct>,<int>
26,Canada,20238
143,United States,16074
142,United Kingdom,15612
7,Australia,3682
94,Netherlands,796
127,Spain,647
51,Germany,616
66,Italy,545
13,Belgium,441
48,France,432


In [69]:
##
## What institutions are producing most primary care research
##

In [70]:
affil_inst <- data.frame(table(affils_eid_df_sm$affil_name))
names(affil_inst) <- c("institution", "freq")
affil_inst <- with(affil_inst, affil_inst[order(-freq), ])
head(affil_inst, 25)

Unnamed: 0_level_0,institution,freq
Unnamed: 0_level_1,<fct>,<int>
7553,University of Oxford Medical Sciences Division,2182
7654,University of Toronto,1799
7251,University College London,1096
6579,The University of British Columbia,936
7551,University of Ottawa,740
2447,Geisel School of Medicine at Dartmouth,666
7655,University of Toronto Faculty of Medicine,599
7206,Université McGill,568
4976,Ottawa Hospital Research Institute,567
3209,Institute for Clinical Evaluative Sciences,543


In [71]:
###########################
## Session Information
###########################

In [72]:
Sys.Date()

In [73]:
sessionInfo()

R version 4.1.3 (2022-03-10)
Platform: x86_64-w64-mingw32/x64 (64-bit)
Running under: Windows 10 x64 (build 17763)

Matrix products: default

locale:
[1] LC_COLLATE=English_United States.1252 
[2] LC_CTYPE=English_United States.1252   
[3] LC_MONETARY=English_United States.1252
[4] LC_NUMERIC=C                          
[5] LC_TIME=English_United States.1252    

attached base packages:
[1] stats     graphics  grDevices utils     datasets  methods   base     

other attached packages:
[1] igraph_1.3.1

loaded via a namespace (and not attached):
 [1] fansi_1.0.3     utf8_1.2.2      digest_0.6.29   crayon_1.5.1   
 [5] IRdisplay_1.1   repr_1.1.4      lifecycle_1.0.1 jsonlite_1.8.0 
 [9] magrittr_2.0.3  evaluate_0.15   pillar_1.7.0    rlang_1.0.2    
[13] cli_3.3.0       uuid_1.1-0      vctrs_0.4.1     ellipsis_0.3.2 
[17] IRkernel_1.3    tools_4.1.3     glue_1.6.2      fastmap_1.1.0  
[21] compiler_4.1.3  pkgconfig_2.0.3 base64enc_0.1-3 pbdZMQ_0.3-7   
[25] htmltools_0.5.2

In [74]:
version

               _                           
platform       x86_64-w64-mingw32          
arch           x86_64                      
os             mingw32                     
system         x86_64, mingw32             
status                                     
major          4                           
minor          1.3                         
year           2022                        
month          03                          
day            10                          
svn rev        81868                       
language       R                           
version.string R version 4.1.3 (2022-03-10)
nickname       One Push-Up                 