In [2]:
library(data.table)
library(emmeans)
library(ggplot2)
library(dplyr)

In [3]:
# Load the aggregated data set used by every figure below.
df <- fread("../data/df.aggregate.csv")

## Fig. S4

In [4]:
# Fig. S4: histogram of per-listing row counts, filled by host race, written
# to a PDF file.
filename_pdf <- "../output/supp_fig_s4.pdf"

# Fill colours, applied in factor-level order (White, Asian, Black).
cc <- c("#2c2d4a", "#be5d05", "#f1aa00")

# One row per (listing_id, host_id, h.ethnicity) holding the number of rows of
# `df` in that group (presumably one row per review -- verify against the
# data). `.groups = "drop"` returns an ungrouped tibble and silences the
# "summarise() has grouped output" message.
rws.cnt <- df %>%
  dplyr::group_by(listing_id, host_id, h.ethnicity) %>%
  dplyr::summarise(rws.cnt = dplyr::n(), .groups = "drop")

# Title-case the labels and fix the level order used for the legend and the
# fill colours; any value outside these three levels becomes NA.
rws.cnt$h.ethnicity <- stringr::str_to_title(rws.cnt$h.ethnicity)
rws.cnt$h.ethnicity <- factor(
  rws.cnt$h.ethnicity,
  levels = c("White", "Asian", "Black")
)

# Build the plot before opening the PDF device so that a mistake in the plot
# specification cannot leave a half-written device open.
# NOTE(review): the scale limits discard anything outside [1, 360] on x and
# [0, 1250] on y (ggplot2 warns about removed rows) -- confirm this is intended.
g <- ggplot(rws.cnt, aes(x = rws.cnt, fill = h.ethnicity)) +
  geom_histogram(alpha = 0.7, position = "dodge", bins = 30) +
  scale_y_continuous(
    limits = c(0, 1250),
    breaks = seq(0, 1250, 250),
    labels = seq(0, 1250, 250)
  ) +
  scale_x_continuous(
    limits = c(1, 360),
    breaks = seq(0, 360, 60),
    labels = seq(0, 360, 60)
  ) +
  xlab("Number of Reviews") +
  ylab("Frequency") +
  labs(fill = "Host Race") +
  theme_classic() +
  # Spell out `legend.position`: the abbreviation `legend.pos` depends on
  # partial argument matching, which becomes ambiguous (an error) in ggplot2
  # versions that also define `legend.position.inside`.
  theme(
    legend.position = "top",
    legend.justification = "left",
    text = element_text(size = 12)
  ) +
  scale_fill_manual(values = cc)

pdf(filename_pdf, width = 9.5, height = 5)
plot(g)
dev.off()

`summarise()` has grouped output by 'listing_id', 'host_id'. You can override using the `.groups` argument.

“Removed 4 rows containing missing values (geom_bar).”


## Fig. S5

In [5]:
# Fig. S5: estimated mean of `rws.cnt` per host race with confidence limits,
# from a linear model fitted on all rows of `rws.cnt`.
filename_pdf <- "../output/supp_fig_s5.pdf"

res <- lm(rws.cnt ~ h.ethnicity, data = rws.cnt)

# Pairwise contrasts between the estimated marginal means. Wrapped in print()
# so the table is shown even when the code is run via source() rather than
# relying on top-level auto-printing.
print(pairs(emmeans(res, ~ h.ethnicity)))

# With plotit = FALSE, emmip() returns the plotting data instead of a plot;
# the columns used below are `yvar` (estimate), `LCL` and `UCL`.
# TRUE/FALSE are spelled out because T and F are ordinary, reassignable names.
p.df <- emmip(res, ~ h.ethnicity, CIs = TRUE, plotit = FALSE)

dodge <- position_dodge(width = 0.5)

# No colour aesthetic is mapped, so the former scale_color_grey() call had no
# effect and has been dropped.
# NOTE(review): the y limits are hard-coded to 16-22; an interval that extends
# beyond them would be removed from the plot by ggplot2.
p <- ggplot(p.df, aes(x = h.ethnicity, y = yvar)) +
  geom_point(size = 2.5, position = dodge) +
  geom_errorbar(
    aes(ymin = LCL, ymax = UCL),
    position = dodge, size = 0.9, width = 0.05
  ) +
  scale_y_continuous(
    limits = c(16, 22),
    breaks = seq(16, 22, 1),
    labels = seq(16, 22, 1)
  ) +
  xlab("Host Race") +
  ylab("Mean Reviews") +
  theme_classic() +
  theme(
    panel.grid.major = element_blank(),
    text = element_text(size = 12)
  )

pdf(filename_pdf, width = 3.17 * 2, height = 5)
plot(p)
dev.off()

 contrast      estimate    SE   df t.ratio p.value
 White - Asian    0.725 0.775 7900   0.935  0.6178
 White - Black   -1.439 0.760 7900  -1.894  0.1404
 Asian - Black   -2.164 0.922 7900  -2.346  0.0497

P value adjustment: tukey method for comparing a family of 3 estimates 

## Fig. S6

In [6]:
# Fig. S6: same plot as Fig. S5, but the model is fitted only on rows of
# `rws.cnt` whose count is at least 6.
filename_pdf <- "../output/supp_fig_s6.pdf"

res <- lm(rws.cnt ~ h.ethnicity, data = rws.cnt[rws.cnt$rws.cnt >= 6, ])

# With plotit = FALSE, emmip() returns the plotting data instead of a plot;
# the columns used below are `yvar` (estimate), `LCL` and `UCL`.
# TRUE/FALSE are spelled out because T and F are ordinary, reassignable names.
p.df <- emmip(res, ~ h.ethnicity, CIs = TRUE, plotit = FALSE)

dodge <- position_dodge(width = 0.5)

# No colour aesthetic is mapped, so the former scale_color_grey() call had no
# effect and has been dropped.
# NOTE(review): the y limits are hard-coded to 26-31; an interval that extends
# beyond them would be removed from the plot by ggplot2.
p <- ggplot(p.df, aes(x = h.ethnicity, y = yvar)) +
  geom_point(size = 2.5, position = dodge) +
  geom_errorbar(
    aes(ymin = LCL, ymax = UCL),
    position = dodge, size = 0.9, width = 0.05
  ) +
  scale_y_continuous(
    limits = c(26, 31),
    breaks = seq(26, 31, 1),
    labels = seq(26, 31, 1)
  ) +
  xlab("Host Race") +
  ylab("Mean Reviews") +
  theme_classic() +
  theme(
    panel.grid.major = element_blank(),
    text = element_text(size = 12)
  )

pdf(filename_pdf, width = 3.17 * 2, height = 5)
plot(p)
dev.off()