In [37]:
## Install gender package and linked database
install.packages("gender")
install.packages("genderdata", repos = "https://dev.ropensci.org", type = "source")

## Packages
library(dplyr)
library(stringr)
library(broom)
library(ggplot2)
library(tidyr)
library(gender)
library(stringr)




Updating HTML index of packages in '.Library'

Making 'packages.html' ...
 done

Updating HTML index of packages in '.Library'

Making 'packages.html' ...
 done



In [40]:
## Read data 
CD_results_2018 <- read.csv("district_overall_2018.csv")
pres_results_2018 <- read.csv("pres_results_by_CD_2016.csv")


## Check data structure
head(CD_results_2018)

head(pres_results_2018)

pres_results_2018 <- pres_results_2018 %>%
    select(-c("X", "X.1", "X.2"))



Unnamed: 0_level_0,year,state,state_po,state_fips,state_cen,state_ic,office,district,stage,special,candidate,party,writein,mode,candidatevotes,totalvotes,unofficial,version
Unnamed: 0_level_1,<int>,<chr>,<chr>,<int>,<int>,<int>,<chr>,<chr>,<chr>,<lgl>,<chr>,<chr>,<lgl>,<chr>,<int>,<int>,<lgl>,<int>
1,2018,California,CA,6,93,71,U.S. Representative,District 1,gen,False,Audrey Denney,democrat,False,total,263096,583188,False,20190131
2,2018,California,CA,6,93,71,U.S. Representative,District 1,gen,False,Doug La Malfa,republican,False,total,320092,583188,False,20190131
3,2018,California,CA,6,93,71,U.S. Representative,District 10,gen,False,Jeff Denham,republican,False,total,211910,443800,False,20190131
4,2018,California,CA,6,93,71,U.S. Representative,District 10,gen,False,Josh Harder,democrat,False,total,231890,443800,False,20190131
5,2018,California,CA,6,93,71,U.S. Representative,District 11,gen,False,John Fitzgerald,republican,False,total,142624,551362,False,20190131
6,2018,California,CA,6,93,71,U.S. Representative,District 11,gen,False,Mark DeSaulnier,democrat,False,total,408738,551362,False,20190131


Unnamed: 0_level_0,CD,Incumbent,Party,Clinton,Trump,Winner,X,X.1,X.2
Unnamed: 0_level_1,<chr>,<chr>,<chr>,<dbl>,<dbl>,<chr>,<lgl>,<lgl>,<lgl>
1,AK-AL,"Young, Don",(R),37.6,52.8,Trump,,,
2,AL-01,"Byrne, Bradley",(R),34.1,63.5,Trump,,,
3,AL-02,"Roby, Martha",(R),33.0,64.9,Trump,,,
4,AL-03,"Rogers, Mike",(R),32.3,65.3,Trump,,,
5,AL-04,"Aderholt, Rob",(R),17.4,80.4,Trump,,,
6,AL-05,"Brooks, Mo",(R),31.3,64.7,Trump,,,


In [41]:
## Reformat data to wide

CD_results_2018_temp <- CD_results_2018 %>%
    filter(special == FALSE,
           writein == FALSE,
           stage == "gen",
           party %in% c("democrat", "republican")) %>%
    mutate(vote_share = candidatevotes / totalvotes) %>%
    select(-c("state", "state_fips", "state_cen", "state_ic", "special", "office", "writein", "stage", "mode", "unofficial", "version", "totalvotes")) %>%
    rename(total_votes = candidatevotes)

head(CD_results_2018_temp)

CD_results_2018_temp_2 <- CD_results_2018_temp %>%
    group_by(state_po, district, party, candidate) %>%
    summarize(total_votes = sum(total_votes),
              vote_share = sum(vote_share))

head(CD_results_2018_temp_2)


CD_results_2018_wide <- CD_results_2018_temp_2 %>%
    mutate(row = row_number()) %>%
    pivot_wider(id_cols = c("state_po", "district", "row"), names_from = party, values_from = c("candidate", "total_votes", "vote_share")) %>%
    mutate(candidate_democrat = replace_na(candidate_democrat, "No candidate"), 
            candidate_republican = replace_na(candidate_republican, "No candidate"),
            total_votes_democrat = replace_na(total_votes_democrat, 0),
            total_votes_republican = replace_na(total_votes_republican, 0),
            winner_party = case_when(total_votes_democrat > total_votes_republican ~ "Democrat",
                              total_votes_republican > total_votes_democrat ~ "Republican",
                              TRUE ~ "Other"),
            district = word(district, 2)) %>%
    select(-row) %>%
    rename(vote_share_dem = vote_share_democrat,
          vote_share_rep = vote_share_republican) %>%
    mutate(year = 2018)


head(CD_results_2018_wide)

glimpse(CD_results_2018_wide)


Unnamed: 0_level_0,year,state_po,district,candidate,party,total_votes,vote_share
Unnamed: 0_level_1,<int>,<chr>,<chr>,<chr>,<chr>,<int>,<dbl>
1,2018,CA,District 1,Audrey Denney,democrat,263096,0.4511341
2,2018,CA,District 1,Doug La Malfa,republican,320092,0.5488659
3,2018,CA,District 10,Jeff Denham,republican,211910,0.4774899
4,2018,CA,District 10,Josh Harder,democrat,231890,0.5225101
5,2018,CA,District 11,John Fitzgerald,republican,142624,0.2586758
6,2018,CA,District 11,Mark DeSaulnier,democrat,408738,0.7413242


`summarise()` regrouping output by 'state_po', 'district', 'party' (override with `.groups` argument)



state_po,district,party,candidate,total_votes,vote_share
<chr>,<chr>,<chr>,<chr>,<int>,<dbl>
AK,District 0,democrat,Alyse S. Galvin,131199,0.464971
AK,District 0,republican,Don Young,149779,0.5308187
AL,District 1,democrat,Robert Kennedy Jr.,89226,0.3677648
AL,District 1,republican,Bradley Byrne,153228,0.6315633
AL,District 2,democrat,Tabitha Isner,86931,0.3842594
AL,District 2,republican,Martha Roby,138879,0.6138841


state_po,district,candidate_democrat,candidate_republican,total_votes_democrat,total_votes_republican,vote_share_dem,vote_share_rep,winner_party,year
<chr>,<chr>,<chr>,<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<chr>,<dbl>
AK,0,Alyse S. Galvin,Don Young,131199,149779,0.464971,0.5308187,Republican,2018
AL,1,Robert Kennedy Jr.,Bradley Byrne,89226,153228,0.3677648,0.6315633,Republican,2018
AL,2,Tabitha Isner,Martha Roby,86931,138879,0.3842594,0.6138841,Republican,2018
AL,3,Mallory Hagan,Mike Rogers,83996,147770,0.3621844,0.6371731,Republican,2018
AL,4,Lee Auman,Robert Aderholt,46492,184255,0.2012911,0.7977477,Republican,2018
AL,6,Danner Kline,Gary Palmer,85644,192542,0.3077089,0.6917809,Republican,2018


Rows: 440
Columns: 10
Groups: state_po, district [429]
$ state_po               [3m[90m<chr>[39m[23m "AK", "AL", "AL", "AL", "AL", "AL", "AL", "AR"…
$ district               [3m[90m<chr>[39m[23m "0", "1", "2", "3", "4", "6", "7", "1", "2", "…
$ candidate_democrat     [3m[90m<chr>[39m[23m "Alyse S. Galvin", "Robert Kennedy Jr.", "Tabi…
$ candidate_republican   [3m[90m<chr>[39m[23m "Don Young", "Bradley Byrne", "Martha Roby", "…
$ total_votes_democrat   [3m[90m<dbl>[39m[23m 131199, 89226, 86931, 83996, 46492, 85644, 185…
$ total_votes_republican [3m[90m<dbl>[39m[23m 149779, 153228, 138879, 147770, 184255, 192542…
$ vote_share_dem         [3m[90m<dbl>[39m[23m 0.4649710, 0.3677648, 0.3842594, 0.3621844, 0.…
$ vote_share_rep         [3m[90m<dbl>[39m[23m 0.5308187, 0.6315633, 0.6138841, 0.6371731, 0.…
$ winner_party           [3m[90m<chr>[39m[23m "Republican", "Republican", "Republican", "Rep…
$ year                   [3m[90m<dbl>[39m[23m 2018, 2018, 2

In [42]:
## Gender fill
# Function to fill predicted gender
gender_fill <- function(x) { 
  n <- length(x)
  x <- word(x) ## removes last names, gender() only works on first name strings
  gender_rep <- rep("a", n)
  for (i in 1:n) {
      if (x[i] == "No") {
        gender_rep[i] <- "N/A" ## handles no candidate
      } else if (length(pull(gender(x[i]), var = gender)) == 0) {
        gender_rep[i] <- "Unknown" ## handles names the function can't predict
      }  else {
      gender_rep[i] <- pull(gender(as.character(x[i])), var = gender)
      }
    }
    gender_rep
  }


## First if checks for "No" because "No candidate" has been truncated to "No", needs to be changed if databases have different indicators for no candidate
## Second if checks for length == 0 because gender() generates an empty table if it is unable to predict a gender
## Else applies gender() to the rest of the cases and pulls the gender result out of the results (gender function creates a table with ancillary info), coercion to character 
## because gender() only works on character type




In [43]:
## Create gender columns, extremely extremely slow (like 20-30min at least), but works

CD_results_2018_wide$gender_dem <- gender_fill(CD_results_2018_wide$candidate_democrat)

CD_results_2018_wide$gender_rep <- gender_fill(CD_results_2018_wide$candidate_republican)

head(CD_results_2018_wide)

state_po,district,candidate_democrat,candidate_republican,total_votes_democrat,total_votes_republican,vote_share_dem,vote_share_rep,winner_party,year,gender_dem,gender_rep
<chr>,<chr>,<chr>,<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<chr>,<dbl>,<chr>,<chr>
AK,0,Alyse S. Galvin,Don Young,131199,149779,0.464971,0.5308187,Republican,2018,female,male
AL,1,Robert Kennedy Jr.,Bradley Byrne,89226,153228,0.3677648,0.6315633,Republican,2018,male,male
AL,2,Tabitha Isner,Martha Roby,86931,138879,0.3842594,0.6138841,Republican,2018,female,female
AL,3,Mallory Hagan,Mike Rogers,83996,147770,0.3621844,0.6371731,Republican,2018,female,male
AL,4,Lee Auman,Robert Aderholt,46492,184255,0.2012911,0.7977477,Republican,2018,male,male
AL,6,Danner Kline,Gary Palmer,85644,192542,0.3077089,0.6917809,Republican,2018,male,male


In [44]:
## Fill in winner gender
CD_results_2018_wide <- CD_results_2018_wide %>%
    mutate(district = as.numeric(district),
          winner_gender = case_when(winner_party == "Republican" ~ gender_rep,
                                   winner_party == "Democrat" ~ gender_dem,
                                   TRUE ~ "Other"))

head(CD_results_2018_wide)


state_po,district,candidate_democrat,candidate_republican,total_votes_democrat,total_votes_republican,vote_share_dem,vote_share_rep,winner_party,year,gender_dem,gender_rep,winner_gender
<chr>,<dbl>,<chr>,<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<chr>,<dbl>,<chr>,<chr>,<chr>
AK,0,Alyse S. Galvin,Don Young,131199,149779,0.464971,0.5308187,Republican,2018,female,male,male
AL,1,Robert Kennedy Jr.,Bradley Byrne,89226,153228,0.3677648,0.6315633,Republican,2018,male,male,male
AL,2,Tabitha Isner,Martha Roby,86931,138879,0.3842594,0.6138841,Republican,2018,female,female,female
AL,3,Mallory Hagan,Mike Rogers,83996,147770,0.3621844,0.6371731,Republican,2018,female,male,male
AL,4,Lee Auman,Robert Aderholt,46492,184255,0.2012911,0.7977477,Republican,2018,male,male,male
AL,6,Danner Kline,Gary Palmer,85644,192542,0.3077089,0.6917809,Republican,2018,male,male,male


In [45]:
## Match CD to presidential results

pres_results_2018 <- pres_results_2018 %>%
    mutate(Party = case_when(Party == "(R)" ~ "republican",
                            Party == "(D)" ~ "democrat",
                            TRUE ~ "other"),
          state = substr(CD, start = 1, stop = 2),
          district = str_sub(CD, -2, -1)) %>%
    select(-CD) %>%
    mutate(district = replace(district, district == "AL", "0"))

pres_results_2018$district <- as.numeric(pres_results_2018$district)
CD_results_2018_wide$district <- as.numeric(CD_results_2018_wide$district)


head(pres_results_2018)

pres_results_join <- pres_results_2018 %>%
    select(state, district, Winner)


CD_results_2018_wide <- CD_results_2018_wide %>%
    left_join(pres_results_join, by = c("state_po" = "state", "district" = "district")) %>%
    rename(pres_winner = Winner)

head(CD_results_2018_wide)




Unnamed: 0_level_0,Incumbent,Party,Clinton,Trump,Winner,state,district
Unnamed: 0_level_1,<chr>,<chr>,<dbl>,<dbl>,<chr>,<chr>,<dbl>
1,"Young, Don",republican,37.6,52.8,Trump,AK,0
2,"Byrne, Bradley",republican,34.1,63.5,Trump,AL,1
3,"Roby, Martha",republican,33.0,64.9,Trump,AL,2
4,"Rogers, Mike",republican,32.3,65.3,Trump,AL,3
5,"Aderholt, Rob",republican,17.4,80.4,Trump,AL,4
6,"Brooks, Mo",republican,31.3,64.7,Trump,AL,5


state_po,district,candidate_democrat,candidate_republican,total_votes_democrat,total_votes_republican,vote_share_dem,vote_share_rep,winner_party,year,gender_dem,gender_rep,winner_gender,pres_winner
<chr>,<dbl>,<chr>,<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<chr>,<dbl>,<chr>,<chr>,<chr>,<chr>
AK,0,Alyse S. Galvin,Don Young,131199,149779,0.464971,0.5308187,Republican,2018,female,male,male,Trump
AL,1,Robert Kennedy Jr.,Bradley Byrne,89226,153228,0.3677648,0.6315633,Republican,2018,male,male,male,Trump
AL,2,Tabitha Isner,Martha Roby,86931,138879,0.3842594,0.6138841,Republican,2018,female,female,female,Trump
AL,3,Mallory Hagan,Mike Rogers,83996,147770,0.3621844,0.6371731,Republican,2018,female,male,male,Trump
AL,4,Lee Auman,Robert Aderholt,46492,184255,0.2012911,0.7977477,Republican,2018,male,male,male,Trump
AL,6,Danner Kline,Gary Palmer,85644,192542,0.3077089,0.6917809,Republican,2018,male,male,male,Trump


In [46]:
## Identify flips

CD_results_2018_wide <- CD_results_2018_wide %>%
    mutate(flip = case_when(pres_winner == "Trump" & winner_party == "Democrat" ~ TRUE,
                           TRUE ~ FALSE)) %>%
    rename(state = state_po)

head(CD_results_2018_wide)



state,district,candidate_democrat,candidate_republican,total_votes_democrat,total_votes_republican,vote_share_dem,vote_share_rep,winner_party,year,gender_dem,gender_rep,winner_gender,pres_winner,flip
<chr>,<dbl>,<chr>,<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<chr>,<dbl>,<chr>,<chr>,<chr>,<chr>,<lgl>
AK,0,Alyse S. Galvin,Don Young,131199,149779,0.464971,0.5308187,Republican,2018,female,male,male,Trump,False
AL,1,Robert Kennedy Jr.,Bradley Byrne,89226,153228,0.3677648,0.6315633,Republican,2018,male,male,male,Trump,False
AL,2,Tabitha Isner,Martha Roby,86931,138879,0.3842594,0.6138841,Republican,2018,female,female,female,Trump,False
AL,3,Mallory Hagan,Mike Rogers,83996,147770,0.3621844,0.6371731,Republican,2018,female,male,male,Trump,False
AL,4,Lee Auman,Robert Aderholt,46492,184255,0.2012911,0.7977477,Republican,2018,male,male,male,Trump,False
AL,6,Danner Kline,Gary Palmer,85644,192542,0.3077089,0.6917809,Republican,2018,male,male,male,Trump,False


In [47]:
## Save wide dataset
# commented out to prevent overwrite
# write.csv(CD_results_2018_wide, "flipped_house_2018_wide.csv")


In [48]:
## Reformat to long dataset

Dem_data <- CD_results_2018_wide %>%
    select(state, district, candidate_democrat, total_votes_democrat, vote_share_dem, winner_party, winner_gender, pres_winner, flip, year) %>%
    rename(candidate = candidate_democrat,
           total_votes = total_votes_democrat,
          vote_share = vote_share_dem)

Rep_data <- CD_results_2018_wide %>%
    select(state, district, candidate_republican, total_votes_republican, vote_share_rep, winner_party, winner_gender, pres_winner, flip, year) %>%
    rename(candidate = candidate_republican,
    total_votes = total_votes_republican,
    vote_share = vote_share_rep)


CD_results_2018_long <- rbind(Dem_data, Rep_data)


head(CD_results_2018_long)


state,district,candidate,total_votes,vote_share,winner_party,winner_gender,pres_winner,flip,year
<chr>,<dbl>,<chr>,<dbl>,<dbl>,<chr>,<chr>,<chr>,<lgl>,<dbl>
AK,0,Alyse S. Galvin,131199,0.464971,Republican,male,Trump,False,2018
AL,1,Robert Kennedy Jr.,89226,0.3677648,Republican,male,Trump,False,2018
AL,2,Tabitha Isner,86931,0.3842594,Republican,female,Trump,False,2018
AL,3,Mallory Hagan,83996,0.3621844,Republican,male,Trump,False,2018
AL,4,Lee Auman,46492,0.2012911,Republican,male,Trump,False,2018
AL,6,Danner Kline,85644,0.3077089,Republican,male,Trump,False,2018


In [49]:
## Save long dataset
# commented out to prevent overwrite
# write.csv(CD_results_2018_long, "flipped_house_2018_long.csv")

`summarise()` ungrouping output (override with `.groups` argument)



office,n()
<chr>,<int>
U.S. Representative,1411


ERROR: Error in `$<-.data.frame`(`*tmp*`, district, value = numeric(0)): replacement has 0 rows, data has 435
