-
Notifications
You must be signed in to change notification settings - Fork 0
/
figure_1_demographics.R
149 lines (134 loc) · 8.7 KB
/
figure_1_demographics.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
# Create demographics figure for Survey 2
# modeled on Figure 1 of Williams et al. (2019)
# see figure_1_demographics.R
# prerequisites
# NOTE(review): rm(list = ls()) wipes the caller's whole workspace when this
# script is sourced. It is kept so behavior is unchanged, but consider dropping
# it and running the script in a fresh R session instead.
rm(list = ls())
# library() stops immediately if a package is missing, whereas require() only
# warns and returns FALSE, deferring the failure to the first tidyverse call.
library(tidyverse)
library(ggthemes)
# 1. create demographics table with each demographic variable, category and percentage
# Create dataframe of each desired demographic
# start with the `Merged_Data` set produced in Chapter 0 and rename columns
# Merged_Data <- read_csv("Merged_Data.csv")
# Read the anonymised merged survey data and give the demographic columns the
# display names used as facet titles further down.
Merged_Data <-
  read_csv("Merged_Data_Anonymous.csv") %>%
  rename(
    `Gender Identity`         = Gender,
    `Race/Ethnicity`          = Ethnicity,
    `Terminal Degree`         = `Q33 Which of the following represents your highest academic degree?`,
    `Decade of Degree`        = `Q14 In which year did you earn your highest academic degree?`,
    `Bioinformatics Training` = `Q12 Which of the following best describes your level of bioinformatics training? Select ALL that apply.`,
    `Carnegie Classification` = BASIC2018_bins_text.Current,
    `Institution Size`        = SIZESET2018_bins_text.Current
  )
# Recode the raw survey and institution columns into the short category labels
# shown in the figure. Within each case_when() the first matching branch wins.
Merged_Data <- Merged_Data %>%
  mutate(
    # Collapse the three institution flags into a single code. Per the original
    # author's notes the observed combinations are:
    #   2 = none of the three           (453 responses)
    #   3 = HBCU                        (18 responses)
    #   4 = MSI other than HBCU or HSI  (27 responses)
    #   5 = HSI                         (55 responses)
    # NOTE(review): those notes imply HBCU.Current is coded 1 = yes / 2 = no
    # while MSI.Current and HSI.Current are coded 0/1 -- confirm against the data.
    MSI_status = 2 * MSI.Current + HBCU.Current + HSI.Current,

    # Group ethnicity answers by substring; anything unmatched (including NA)
    # keeps its original value.
    EthnoGroups = case_when(
      str_detect(`Race/Ethnicity`, "European") ~ "European",
      str_detect(`Race/Ethnicity`, "Scandinavian") ~ "European",
      str_detect(`Race/Ethnicity`, "Asian") ~ "Asian",
      TRUE ~ `Race/Ethnicity`
    ),

    # Select-all-that-apply answer: report the first level that matches, in
    # the order graduate coursework > workshops > self-taught > none.
    # NA and unmatched answers both fall through to "Unknown Training".
    TrainingGroups = case_when(
      str_detect(`Bioinformatics Training`, "graduate") ~ "At Least Some Coursework",
      str_detect(`Bioinformatics Training`, "workshops") ~ "At Least Workshops/Bootcamps",
      str_detect(`Bioinformatics Training`, "self") ~ "Self-taught Only",
      str_detect(`Bioinformatics Training`, "no training/experience") ~ "No Training",
      TRUE ~ "Unknown Training"
    ),

    # No default branch: any value other than F/M/U (including NA) becomes NA.
    `Gender Identity` = case_when(
      `Gender Identity` == "F" ~ "Woman",
      `Gender Identity` == "M" ~ "Man",
      `Gender Identity` == "U" ~ "Other Gender"
    ),

    # NOTE(review): "e.g;," in the doctoral string looks like a typo but is
    # presumably the literal survey text -- confirm before changing it, since
    # a mismatch would send every doctorate to "Other Degree".
    `Terminal Degree` = case_when(
      `Terminal Degree` == "Doctoral degree or equivalent (e.g;, PhD, EdD, etc)" ~ "Doctoral",
      `Terminal Degree` == "Masters degree or equivalent (e.g., MS, MBA, MA, etc)" ~ "Masters",
      `Terminal Degree` == "Professional doctoral degrees or equivalent (MD, DVM, etc)" ~ "Professional",
      `Terminal Degree` == "Undergraduate bachelor degree or equivalent (e.g., BS, BSc, BA, etc)" ~ "Bachelor's",
      is.na(`Terminal Degree`) ~ "Unknown Degree",
      TRUE ~ "Other Degree"
    ),

    # Shorten the decade ranges; other non-missing answers are kept verbatim.
    # (A duplicated, unreachable "1990-1999" branch was removed.)
    `Decade of Degree` = case_when(
      `Decade of Degree` == "2010-2019" ~ "2010s",
      `Decade of Degree` == "2000-2009" ~ "2000s",
      `Decade of Degree` == "1990-1999" ~ "1990s",
      `Decade of Degree` == "1980-1989" ~ "1980s",
      is.na(`Decade of Degree`) ~ "Unknown Decade",
      TRUE ~ `Decade of Degree`
    ),

    # No default branch: any MSI_status outside 2-5 (including NA) becomes NA.
    `MSI Designation` = case_when(
      MSI_status == 2 ~ "Non-Minority-Serving",
      MSI_status == 3 ~ "HBCU",
      MSI_status == 4 ~ "Other MSI",
      MSI_status == 5 ~ "HSI"
    ),

    # Shorten the Carnegie labels; both Baccalaureate/Associate's variants share
    # one label. (A second "Associate's Dominant" branch with the misspelled
    # label "Baccalaureate/Associates" was unreachable and has been removed.)
    `Carnegie Classification` = case_when(
      `Carnegie Classification` == "Doctoral/Professional Universities" ~ "Doctoral/Professional",
      `Carnegie Classification` == "Master's Colleges & Universities" ~ "Master's",
      `Carnegie Classification` == "Baccalaureate Colleges" ~ "Baccalaureate",
      `Carnegie Classification` == "Associate's Colleges" ~ "Associate's",
      `Carnegie Classification` == "Baccalaureate/Associate's Colleges: Associate's Dominant" ~ "Baccalaureate/Associate's",
      `Carnegie Classification` == "Baccalaureate/Associate's Colleges: Mixed Baccalaureate/Associate's" ~ "Baccalaureate/Associate's",
      TRUE ~ "Other Classification"
    )
  )
# Keep only the eight demographic columns (individual and institutional) that
# appear in the figure. The grouped ethnicity and training columns take over
# the display names of the raw columns they were derived from.
demographics_subset <- select(
  Merged_Data,
  `Gender Identity`,
  `Race/Ethnicity` = EthnoGroups,
  `Terminal Degree`,
  `Decade of Degree`,
  `Bioinformatics Training` = TrainingGroups,
  `MSI Designation`,
  `Carnegie Classification`,
  `Institution Size`
)
# Reshape to one row per (respondent, demographic variable) so every variable
# can be counted with the same code.
demographics_tidy <- demographics_subset %>%
  pivot_longer(
    cols = everything(),
    names_to = "Demographic_variable",
    values_to = "Category"
  )

# Count/percent table for each variable/category. Percentages are whole
# numbers computed within each demographic variable, BEFORE the filter below,
# so the dropped rows still count toward each variable's denominator.
demographics_percents <- demographics_tidy %>%
  group_by(Demographic_variable) %>%
  count(Category, name = "count") %>%
  mutate(percent = round(count / sum(count) * 100)) %>%
  # drop the grouping once the per-variable percentages exist, so later steps
  # do not silently run group-wise
  ungroup() %>%
  arrange(Demographic_variable, desc(percent)) %>%
  # Missing categories were previously dropped only as a side effect of
  # `NA != "..."` evaluating to NA; the same rows are now dropped explicitly.
  filter(
    !is.na(Category),
    Category != "Exclusively graduate/professional"
  )
# Category labels, listed top-to-bottom as they should read within each facet
# and then reversed, because coord_flip() draws the first factor level at the
# bottom of the axis.
reverse_order <- rev(c(
  # Gender Identity
  "Woman", "Man", "Other Gender",
  # Race/Ethnicity
  "European", "Hispanic", "African American", "Asian",
  "Mediterranean", "Jewish", "Middle Eastern", "Native American",
  # Terminal Degree
  "Doctoral", "Professional", "Masters", "Bachelor's",
  "Other Degree", "Unknown Degree",
  # Decade of Degree
  "2020 or later", "2010s", "2000s", "1990s", "1980s",
  "Before 1980", "Unknown Decade",
  # Bioinformatics Training
  "At Least Some Coursework", "At Least Workshops/Bootcamps",
  "Self-taught Only", "No Training", "Unknown Training",
  # MSI Designation
  "Non-Minority-Serving", "HSI", "HBCU", "Other MSI",
  # Carnegie Classification
  "Doctoral/Professional", "Master's", "Baccalaureate",
  "Baccalaureate/Associate's", "Associate's", "Other Classification",
  # Institution Size
  "Large or Very Large", "Medium", "Small or Very Small"
))
# Put categories in the desired order by converting them to a factor.
# factor() turns any value that is not in `levels` into NA without complaint,
# so report such categories instead of letting them vanish into NA.
unlisted_categories <- setdiff(
  unique(as.character(demographics_percents$Category)),
  reverse_order
)
unlisted_categories <- unlisted_categories[!is.na(unlisted_categories)]
if (length(unlisted_categories) > 0) {
  warning(
    "Categories missing from `reverse_order` will become NA: ",
    paste(unlisted_categories, collapse = ", "),
    call. = FALSE
  )
}

demographics_percents <- demographics_percents %>%
  mutate(Category = factor(Category, levels = reverse_order))
# Horizontal bar chart of the percentages, one facet per demographic variable,
# with the value printed next to each bar. The plot is stored in a variable so
# ggsave() can be given it explicitly instead of relying on last_plot().
demographics_plot <- demographics_percents %>%
  ggplot(aes(Category, percent)) +
  # geom_col() is the documented equivalent of geom_bar(stat = "identity")
  geom_col() +
  # label each bar; `y` is inherited from the plot-level aesthetics
  geom_text(
    aes(label = paste0(percent, "%")),
    vjust = 0, nudge_y = 5, nudge_x = -0.1, size = 6, color = "black"
  ) +
  coord_flip() +
  facet_wrap(
    # the factor levels fix the facet order (filled row-wise, two per row)
    vars(factor(Demographic_variable,
                levels = c("Gender Identity", "Bioinformatics Training",
                           "Race/Ethnicity", "MSI Designation",
                           "Terminal Degree", "Carnegie Classification",
                           "Decade of Degree", "Institution Size"))),
    scales = "free_y", ncol = 2
  ) +
  theme_fivethirtyeight(base_size = 22, base_family = "sans") +
  # All overrides of the base theme in one call; every element is distinct,
  # so this is equivalent to adding them one theme() at a time.
  theme(
    panel.background = element_rect(fill = "white"),
    plot.background = element_rect(fill = "white"),
    legend.background = element_rect(fill = "white"),
    strip.text.x = element_text(size = 20, face = "bold"),
    strip.text.y = element_blank(),
    # the percent axis is hidden because each bar carries its own label
    axis.title.x = element_blank(),
    axis.text.x = element_blank(),
    axis.ticks.x = element_blank(),
    axis.line = element_line(colour = "black",
                             linewidth = 0.5, linetype = "solid"),
    axis.line.x = element_blank(),
    axis.text.y = element_text(hjust = 1),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank()
  )

# display the plot (auto-printed at top level, as the bare pipeline was before)
demographics_plot

# save exactly this plot as an 18 x 15 inch PNG
ggsave(
  filename = "figure_01_survey2.png",
  plot = demographics_plot,
  width = 18,
  height = 15,
  units = "in"
)
# Breakdown of CURE/SURE: count how often each option was chosen in the
# comma-separated, multi-select Q24CURE_SURE answers.
# separate_rows() gives one row per selected option however many were chosen.
# The earlier split into four fixed columns discarded any fifth-or-later
# selection and padded shorter answers with NA, which inflated the NA count;
# NA now appears only for respondents whose answer itself is missing.
# NOTE(review): options are split on "," exactly; if the raw answers contain
# ", " the pieces keep a leading space -- confirm against the data.
Merged_Data %>%
  select(response = Q24CURE_SURE) %>%
  separate_rows(response, sep = ",") %>%
  count(response)