In [2]:
library(data.table)
library(Rmisc)
library(ggplot2)
library(dplyr)
library(reshape2)
library(bit)
library(zoo)

## Data Preparation

### 1. Read raw data (i.e., reviews)

In [3]:
# Folder holding the input CSV files; file paths below are built from it.
DIR <- "../data/"
# Load the review table.
rws <- fread(paste0(DIR, "rws.all.csv"))

### 2. Read facial attributes for both hosts and guests and merge those attributes with the review data

In [4]:
# Load the guest attribute table; its 8th column is renamed to g.ethnicity
# and only the join key plus that column are kept.
guest <- fread(paste0(DIR, "guest.attribute.csv"))
colnames(guest)[8] <- "g.ethnicity"
guest <- guest[, c("guest_id", "g.ethnicity")]
# Left-join the guest attribute onto the reviews by guest_id.
# The former `type='left'` argument was dropped: it is an argument of
# plyr::join(), not of dplyr::left_join(), which already performs a left join
# and may reject unknown arguments depending on the installed dplyr version.
rws.gh <- left_join(rws, guest, by = "guest_id")

In [5]:
#-h.num_face==0 marks hosts without any human face in their profile photo;
#-for those hosts h.ethnicity is an empty string.
#-Profile photos with more than one human face also have an empty h.ethnicity,
#-so additional filtering is required depending on the analysis.
host <- fread(paste0(DIR, "host.attribute.csv"))
names(host)[c(5, 8)] <- c("h.num_face", "h.ethnicity")
host <- host[, c("listing_id", "host_id", "h.num_face", "h.ethnicity")]
#-Attach the host attributes to the review table, matching on listing and host
rws.gh <- left_join(rws.gh, host, by = c("listing_id", "host_id"))

In [6]:
#-tmp_host starts out as the host table; hosts with zero faces in their photo
#-are relabelled "RA", everyone else keeps the existing h.ethnicity value
tmp_host <- host
tmp_host$h.ethnicity <- with(tmp_host, ifelse(h.num_face == 0, "RA", h.ethnicity))

In [7]:
#-Remove racially unidentified guests: an empty label, or no label at all (NA,
#-e.g. a review whose guest had no match in the attribute table).
#-The NA case is handled explicitly because `NA != ""` evaluates to NA; such
#-rows are only dropped when the table is a data.table, whereas a plain
#-data.frame / tibble would return all-NA rows for them.
rws.gh <- rws.gh[!is.na(rws.gh$g.ethnicity) & rws.gh$g.ethnicity != "", ]

In [8]:
#-Hosts whose profile photo contains no human face (h.num_face==0) get the
#-label "RA" (racially ambiguous); other rows keep their h.ethnicity
rws.gh$h.ethnicity <- with(rws.gh, ifelse(h.num_face == 0, "RA", h.ethnicity))

In [9]:
#-Remove racially unidentified hosts (multiple human faces in the profile
#-photo leave h.ethnicity empty), as well as rows with no host label at all (NA).
#-The NA case is handled explicitly because `NA != ""` evaluates to NA; such
#-rows are only dropped when the table is a data.table, whereas a plain
#-data.frame / tibble would return all-NA rows for them.
rws.gh <- rws.gh[!is.na(rws.gh$h.ethnicity) & rws.gh$h.ethnicity != "", ]

### 3. Create booking- / staying-level reputation and endorsement signals

#### 3.1. Create columns for # of previous reviews and race of previous reviewers for each booking/staying

In [10]:
#-Working table used to build the per-review reputation/endorsement columns
df <- rws.gh

In [11]:
#-Sort reviews by date and, within each listing, record
#-  cum.cnt : number of rows of the same listing that come earlier (0 for the first)
#-  rws.race: g.ethnicity of the immediately preceding row (NA for the first)
df <- mutate(
  arrange(group_by(df, listing_id), date),
  cum.cnt = row_number() - 1,
  rws.race = lag(g.ethnicity, n = 1, default = NA)
)

#### 3.2. Count the previous guest race for each booking

In [12]:
#-One 0/1 indicator per race for the previous reviewer of each row
#-(NA where there is no previous reviewer)
df$g.W.cnt <- as.numeric(df$rws.race == "WHITE")
df$g.B.cnt <- as.numeric(df$rws.race == "BLACK")
df$g.A.cnt <- as.numeric(df$rws.race == "ASIAN")

In [13]:
#-Sort reviews by date, then, within each listing, take a right-aligned rolling
#-sum of each race indicator over a window of 6 rows (fill=NA where no full
#-window is available).
#-The 6 most recent reviews serve as a proxy for the front-page reviews.
df <- mutate(
    arrange(group_by(df, listing_id), date),
    top6.W = rollsumr(g.W.cnt == 1, 6, fill = NA),
    top6.B = rollsumr(g.B.cnt == 1, 6, fill = NA),
    top6.A = rollsumr(g.A.cnt == 1, 6, fill = NA)
)

In [14]:
#-A listing's first review has no previous reviewer, so its rws.race is NA;
#-recode that NA as 0 so it matches none of the race labels compared below
df$rws.race[is.na(df$rws.race)] <- 0

#-Overwrite the per-row indicators with running totals: within each listing
#-(in date order) count all previous reviewers of each race so far
df <- mutate(
    arrange(group_by(df, listing_id), date),
    g.W.cnt = cumsum(rws.race == "WHITE"),
    g.B.cnt = cumsum(rws.race == "BLACK"),
    g.A.cnt = cumsum(rws.race == "ASIAN")
)

In [15]:
#-Convert the dplyr result back to a data.table for the steps that follow
df <- as.data.table(df)

In [16]:
#-Keep only rows whose instant_bookable flag equals "t"
#-(rows with a missing flag are excluded as well)
df <- df[df$instant_bookable %in% "t", ]

In [17]:
# First counterfactual copy of a pairing table.
#   df.x: table with a g.ethnicity column.
# Returns a copy of df.x with pairing.status set to 0 and the guest race
# swapped: "BLACK" and "ASIAN" become "WHITE", any other value becomes "BLACK"
# (NA stays NA).
create_ctf_1 <- function(df.x) {
    ctf <- df.x
    ctf$pairing.status <- 0
    race <- ctf$g.ethnicity
    to_white <- race == "BLACK" | race == "ASIAN"
    ctf$g.ethnicity <- ifelse(to_white, "WHITE", "BLACK")
    ctf
}

# Second counterfactual copy of a pairing table.
#   df.x: table with a g.ethnicity column.
# Returns a copy of df.x with pairing.status set to 0 and the guest race
# swapped: "BLACK" and "WHITE" become "ASIAN", any other value becomes "BLACK"
# (NA stays NA).
create_ctf_2 <- function(df.x) {
    ctf <- df.x
    ctf$pairing.status <- 0
    race <- ctf$g.ethnicity
    to_asian <- race == "BLACK" | race == "WHITE"
    ctf$g.ethnicity <- ifelse(to_asian, "ASIAN", "BLACK")
    ctf
}

# Stack both counterfactual copies of df.x (create_ctf_1 first, then
# create_ctf_2), so each input row appears twice with a different guest race.
create_ctf <- function(df.x) {
    first_swap <- create_ctf_1(df.x)
    second_swap <- create_ctf_2(df.x)
    rbind(first_swap, second_swap)
}

# ----------------------------------------------------------------------------------------

## Pairing probabilities between non-racially-identified hosts and guests with a racial identity conditional on same-race reviews

- Fig. 2
- No-human-face & no-person-name
- No-human-face & no-person-name + racially-unidentifiable-person-name
- No-human-face

### 1. Compute pairing probabilities across different race combinations

#### 1.1. Extract only relevant columns and create counterfactuals

In [18]:
#-Observed pairings: host/guest race plus the three top-6 counts
df.tmp.obs <- df[, c("h.ethnicity", "g.ethnicity", "top6.A", "top6.B", "top6.W"),
                 with = FALSE]
#-Observed rows are flagged with pairing.status 1
df.tmp.obs$pairing.status <- 1

In [19]:
#-Values above 5 are capped at 5 because N is small for Asian and Black guests
#-at SRE 5 and 6
df.tmp.obs$top6.A <- with(df.tmp.obs, ifelse(top6.A > 5, 5, top6.A))
df.tmp.obs$top6.B <- with(df.tmp.obs, ifelse(top6.B > 5, 5, top6.B))
df.tmp.obs$top6.W <- with(df.tmp.obs, ifelse(top6.W > 5, 5, top6.W))

In [20]:
#-Counterfactual rows (pairing.status 0) derived from the observed pairings
df.tmp.ctf <- create_ctf(df.tmp.obs)

#### 1.2. Combine counterfactuals with observations and compute average pairing probabilities 

In [21]:
#-Observed rows first, counterfactual rows appended below them
df.obs.ctf <- rbind(df.tmp.obs, df.tmp.ctf)

In [22]:
#-sre = number of same-race endorsements: for each row, pick the top-6 count
#-that corresponds to the row's own guest race (top6.W for WHITE guests,
#-top6.A for ASIAN, top6.B for BLACK).
#-The sre column is not among the columns selected earlier, so the first masked
#-assignment below is what creates it; rows not covered by any of the three
#-masks are left as NA.
df.obs.ctf$sre[df.obs.ctf$g.ethnicity=="WHITE"]=df.obs.ctf$top6.W[df.obs.ctf$g.ethnicity=="WHITE"]
df.obs.ctf$sre[df.obs.ctf$g.ethnicity=="ASIAN"]=df.obs.ctf$top6.A[df.obs.ctf$g.ethnicity=="ASIAN"]
df.obs.ctf$sre[df.obs.ctf$g.ethnicity=="BLACK"]=df.obs.ctf$top6.B[df.obs.ctf$g.ethnicity=="BLACK"]

In [23]:
#-Drop every row that has an NA in any column
df.obs.ctf <- df.obs.ctf[complete.cases(df.obs.ctf), ]

### 2. Compute the gaps between same- and other-race booking probabilities

In [24]:
#-For every (guest race x, SRE level y) cell: fit a one-way ANOVA of
#-pairing.status on host race, run Tukey's HSD, and keep only the contrasts
#-whose label contains the guest's own race. Results are collected in l_tukey,
#-one table per cell, named "h.pair.diff.<race>.<sre>".
start_time <- Sys.time()

l_tukey <- list()
l_g.ethnicity <- unique(df.obs.ctf$g.ethnicity)
l_h.ethnicity <- unique(df.obs.ctf$h.ethnicity)
l_sre <- unique(df.obs.ctf$sre)
for (x in l_g.ethnicity) {
    for (y in l_sre) {
        df.tmp <- df.obs.ctf[(df.obs.ctf$g.ethnicity == x) &
                             (df.obs.ctf$sre == y), ]
        #-The guest's own race is placed last in the level order -- presumably
        #-so that TukeyHSD labels its contrasts "<x>-<other race>"; the code
        #-below relies on the other race being the part after the hyphen.
        df.tmp$h.ethnicity <- factor(df.tmp$h.ethnicity,
                                     levels = c(setdiff(l_h.ethnicity, x), x),
                                     ordered = TRUE)
        a <- aov(pairing.status ~ h.ethnicity, data = df.tmp)
        df.tukey <- as.data.table(TukeyHSD(a)$h.ethnicity, keep.rownames = "h.ethnicity")
        #-x is a literal label, not a pattern, so match it as a fixed string
        df.tukey <- df.tukey[grepl(x, df.tukey$h.ethnicity, fixed = TRUE), ]
        #-Keep the second half of each contrast label. vapply (instead of
        #-sapply) guarantees a character vector even when no contrast is left,
        #-and the helper argument no longer shadows the loop variable x.
        df.tukey$h.ethnicity <- vapply(
            strsplit(df.tukey$h.ethnicity, split = "-", fixed = TRUE),
            function(pair) pair[2],
            character(1))
        df.tukey$g.ethnicity <- x
        df.tukey$sre <- y
        l_tukey[[paste("h.pair.diff", x, y, sep = ".")]] <- df.tukey
    }
}

end_time <- Sys.time()
end_time - start_time

Time difference of 1.027195 secs

In [25]:
#-Stack the per-cell Tukey tables into a single table
df.pairing.prob.diff <- do.call(rbind.data.frame, l_tukey)

In [26]:
#-Treat the SRE level as a categorical variable (used as a discrete x axis)
df.pairing.prob.diff$sre <- factor(df.pairing.prob.diff$sre)

In [27]:
#-Spell out the "RA" host label
df.pairing.prob.diff$h.ethnicity <- with(
    df.pairing.prob.diff,
    ifelse(h.ethnicity == "RA", "Racially Unidentified", h.ethnicity))

#-Title-case every word: first letter upper case, remaining letters lower case
re_from <- "\\b([[:alpha:]])([[:alpha:]]+)"
df.pairing.prob.diff$h.ethnicity <- gsub(re_from, "\\U\\1\\L\\2",
                                         df.pairing.prob.diff$h.ethnicity,
                                         perl = TRUE)

#-Fix the display order of the host-race categories
df.pairing.prob.diff$h.ethnicity <- factor(
    df.pairing.prob.diff$h.ethnicity,
    levels = c("Racially Unidentified", "White", "Asian", "Black"),
    ordered = TRUE)

In [28]:
#-Fix the order of the guest-race categories (used for the facet rows)
df.pairing.prob.diff$g.ethnicity <- factor(
    df.pairing.prob.diff$g.ethnicity,
    levels = c("WHITE", "ASIAN", "BLACK"),
    ordered = TRUE)

In [30]:
#-Facet strip labels keyed by guest race code, and the host-race colour palette
l_guest.race.label <- c(
    "BLACK" = "Black Guest",
    "WHITE" = "White Guest",
    "ASIAN" = "Asian Guest")
cc <- c("#899da4", "#2c2d4a", "#be5d05", "#f1aa00")

dodge <- position_dodge(width = 0.5)
pdf("../output/supp_fig_s2a.pdf", width = 4.5, height = 5.5)
g <- ggplot(df.pairing.prob.diff, aes(x = sre, y = diff)) +
  #-Layers, in drawing order: lines, points, error bars, zero reference line
  geom_line(aes(color = h.ethnicity, group = h.ethnicity),
            position = dodge, size = 1) +
  geom_point(aes(color = h.ethnicity, group = h.ethnicity),
             position = dodge, size = 2) +
  geom_errorbar(aes(ymin = lwr, ymax = upr, color = h.ethnicity),
                position = dodge, size = 0.75, width = 0.25) +
  geom_hline(yintercept = 0, linetype = "dashed", color = "#D55E00") +
  #-Scales, one facet row per guest race, and the visible y range
  scale_color_manual(values = cc) +
  scale_x_discrete(labels = c("0", "1", "2", "3", "4", "5+")) +
  facet_grid(g.ethnicity ~ ., margins = FALSE, switch = "y", scales = "free_y",
             labeller = as_labeller(l_guest.race.label)) +
  coord_cartesian(ylim = c(-0.12, 0.31)) +
  #-Axis and legend titles
  labs(x = "Number of Same-Race Endorsements\n(Out of 5 or More Front-Page Reviews)",
       y = "Difference in Pairing Probability for Same- vs. Other-Race Hosts",
       color = "Host Race") +
  guides(color = guide_legend(nrow = 2, byrow = TRUE)) +
  #-Theme: classic base, thin panel border, compact legend inside the panel
  theme_classic() +
  theme(panel.border = element_rect(fill = NA, size = 0.3),
        legend.direction = "vertical",
        legend.position = c(0.98, 0.94),
        legend.justification = "right",
        legend.margin = margin(0, unit = "cm"),
        legend.spacing.y = unit(0.1, "cm"),
        legend.key.size = unit(0.4, "lines"),
        panel.grid.major = element_blank(),
        text = element_text(size = 12),
        legend.title = element_text(size = 11))
plot(g)
dev.off()

In [31]:
#-Keep the plot object under its own name before g is reused
fig_s2_a <- g

### BINARY

In [32]:
#-Binary version: host race is collapsed to "SameRace" (equal to the guest's
#-race) vs "OtherRace". For every (guest race x, SRE level y) cell a one-way
#-ANOVA of pairing.status on that binary factor is fitted and the Tukey HSD
#-contrast is stored in l_tukey under "h.pair.diff.<race>.<sre>".
start_time <- Sys.time()

df.tmp.obs.ctf <- df.obs.ctf
#-Built directly from the comparison; the former intermediate copy of
#-h.ethnicity into this column was overwritten immediately and is dropped.
df.tmp.obs.ctf$h.ethnicity.binary <- ifelse(
    df.tmp.obs.ctf$h.ethnicity == df.tmp.obs.ctf$g.ethnicity,
    "SameRace", "OtherRace")

l_tukey <- list()
l_g.ethnicity <- unique(df.tmp.obs.ctf$g.ethnicity)
l_h.ethnicity <- unique(df.tmp.obs.ctf$h.ethnicity)
l_sre <- unique(df.tmp.obs.ctf$sre)
for (x in l_g.ethnicity) {
    for (y in l_sre) {
        df.tmp <- df.tmp.obs.ctf[(df.tmp.obs.ctf$g.ethnicity == x) &
                                 (df.tmp.obs.ctf$sre == y), ]
        df.tmp$h.ethnicity.binary <- factor(df.tmp$h.ethnicity.binary,
                                            levels = c("OtherRace", "SameRace"),
                                            ordered = TRUE)
        a <- aov(pairing.status ~ h.ethnicity.binary, data = df.tmp)
        df.tukey <- as.data.table(TukeyHSD(a)$h.ethnicity.binary,
                                  keep.rownames = "h.ethnicity")
        #-Keep the part of each contrast label after the hyphen. vapply (instead
        #-of sapply) guarantees a character vector, and the helper argument no
        #-longer shadows the loop variable x.
        df.tukey$h.ethnicity <- vapply(
            strsplit(df.tukey$h.ethnicity, split = "-", fixed = TRUE),
            function(pair) pair[2],
            character(1))
        df.tukey$g.ethnicity <- x
        df.tukey$sre <- y
        l_tukey[[paste("h.pair.diff", x, y, sep = ".")]] <- df.tukey
    }
}

end_time <- Sys.time()
end_time - start_time

Time difference of 1.035484 secs

In [33]:
#-Stack the per-cell Tukey tables of the binary analysis into a single table
df.pairing.prob.diff <- do.call(rbind.data.frame, l_tukey)

In [34]:
#-Treat the SRE level as a categorical variable (used as a discrete x axis)
df.pairing.prob.diff$sre <- factor(df.pairing.prob.diff$sre)

In [35]:
#-Fix the order of the guest-race categories (used for the facet rows)
df.pairing.prob.diff$g.ethnicity <- factor(
    df.pairing.prob.diff$g.ethnicity,
    levels = c("WHITE", "ASIAN", "BLACK"),
    ordered = TRUE)

In [38]:
#-Facet strip labels keyed by guest race code
l_guest.race.label <- c(
    "BLACK" = "Black Guest",
    "WHITE" = "White Guest",
    "ASIAN" = "Asian Guest")

dodge <- position_dodge(width = 0.5)
pdf("../output/supp_fig_s2b.pdf", width = 4.5, height = 5.5)
g <- ggplot(df.pairing.prob.diff, aes(x = sre, y = diff)) +
  #-Layers, in drawing order: zero reference line, line, points, error bars
  geom_hline(yintercept = 0, linetype = "dashed", color = "#D55E00") +
  geom_line(position = dodge, size = 1, group = 1) +
  geom_point(position = dodge, size = 2) +
  geom_errorbar(aes(ymin = lwr, ymax = upr),
                position = dodge, size = 0.75, width = 0.15) +
  #-Scales (the lowest y break is left unlabelled) and one facet row per guest race
  scale_x_discrete(labels = c("0", "1", "2", "3", "4", "5+")) +
  scale_y_continuous(limits = c(-0.05, 0.2), expand = c(0, 0),
                     breaks = seq(-0.05, 0.2, 0.05),
                     labels = c("", seq(0, 0.2, 0.05))) +
  facet_grid(g.ethnicity ~ ., margins = FALSE, switch = "y", scales = "free_y",
             labeller = as_labeller(l_guest.race.label)) +
  #-Axis titles (the y title is intentionally empty)
  labs(x = "Number of Same-Race Endorsements\n(Out of 5 or More Front-Page Reviews)",
       y = "") +
  #-Theme: classic base, thin panel border, extra spacing between facets
  theme_classic() +
  theme(panel.border = element_rect(fill = NA, size = 0.3),
        panel.spacing = unit(1, "lines"),
        panel.grid.major = element_blank(),
        text = element_text(size = 12))
plot(g)
dev.off()

In [39]:
#-Keep the plot object under its own name before g is reused
fig_s2_b <- g

In [40]:
library(cowplot)

#-Place both panels side by side (two columns), aligned horizontally and
#-vertically, with automatically generated panel labels
g <- plot_grid(fig_s2_a, fig_s2_b,
               ncol = 2,
               align = "hv",
               labels = "AUTO",
               label_size = 12)

In [42]:
#-Write the combined two-panel figure to a PDF file
pdf("../output/supp_fig_s2.pdf", width = 9, height = 6)
print(g)
dev.off()