In [43]:
## Install gender package and linked database
install.packages("gender")
install.packages("genderdata", repos = "https://dev.ropensci.org", type = "source")

## Packages
library(dplyr)
library(stringr)
library(broom)
library(ggplot2)
library(tidyr)
library(gender)


## Data 
CD_results_2018 <- read.csv("district_overall_2018.csv")

Updating HTML index of packages in '.Library'

Making 'packages.html' ...
 done

Updating HTML index of packages in '.Library'

Making 'packages.html' ...
 done

PLEASE NOTE: The method provided by this package must be used cautiously
usage in the README or the package documentation.



In [34]:
## Check  data structure

head(CD_results_2018)

Unnamed: 0_level_0,year,state,state_po,state_fips,state_cen,state_ic,office,district,stage,special,candidate,party,writein,mode,candidatevotes,totalvotes,unofficial,version
Unnamed: 0_level_1,<int>,<chr>,<chr>,<int>,<int>,<int>,<chr>,<chr>,<chr>,<lgl>,<chr>,<chr>,<lgl>,<chr>,<int>,<int>,<lgl>,<int>
1,2018,California,CA,6,93,71,U.S. Representative,District 1,gen,False,Audrey Denney,democrat,False,total,263096,583188,False,20190131
2,2018,California,CA,6,93,71,U.S. Representative,District 1,gen,False,Doug La Malfa,republican,False,total,320092,583188,False,20190131
3,2018,California,CA,6,93,71,U.S. Representative,District 10,gen,False,Jeff Denham,republican,False,total,211910,443800,False,20190131
4,2018,California,CA,6,93,71,U.S. Representative,District 10,gen,False,Josh Harder,democrat,False,total,231890,443800,False,20190131
5,2018,California,CA,6,93,71,U.S. Representative,District 11,gen,False,John Fitzgerald,republican,False,total,142624,551362,False,20190131
6,2018,California,CA,6,93,71,U.S. Representative,District 11,gen,False,Mark DeSaulnier,democrat,False,total,408738,551362,False,20190131


In [46]:
## Reformat data to wide

CD_results_2018_wide <- CD_results_2018 %>%
    filter(special == FALSE, writein == FALSE, stage == "gen") %>%
    select(-c("state_fips", "state_cen", "state_ic", "special", "writein", "mode", "unofficial", "version")) %>%
    mutate(row = row_number(),
          party = replace(party, !party %in% c("democrat","republican"), "other"),
          vote_share = candidatevotes / totalvotes) %>%
    rename(total_votes = candidatevotes) %>%
    pivot_wider(id_cols = c("state", "district", "office", "row"), names_from = party, values_from = c("candidate", "total_votes", "vote_share")) %>%
    mutate(candidate_democrat = replace_na(candidate_democrat, "No candidate"), 
            candidate_republican = replace_na(candidate_republican, "No candidate"),
            candidate_other = replace_na(candidate_other, "No candidate"),
            total_votes_democrat = replace_na(total_votes_democrat, 0),
            total_votes_republican = replace_na(total_votes_republican, 0),
            total_votes_other = replace_na(total_votes_other, 0),
            winner_party = case_when(total_votes_democrat > total_votes_republican & total_votes_democrat > total_votes_other ~ "Democrat",
                              total_votes_republican > total_votes_democrat & total_votes_republican > total_votes_other ~ "Republican",
                              TRUE ~ "Other")) %>%
    select(-row) %>%
    rename(vote_share_dem = vote_share_democrat,
          vote_share_rep = vote_share_republican) %>%
    mutate(year = 2018)


head(CD_results_2018_wide)

glimpse(CD_results_2018_wide)



state,district,office,candidate_democrat,candidate_republican,candidate_other,total_votes_democrat,total_votes_republican,total_votes_other,vote_share_dem,vote_share_rep,vote_share_other,winner_party,year
<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<chr>,<dbl>
California,District 1,U.S. Representative,Audrey Denney,No candidate,No candidate,263096,0,0,0.4511341,,,Democrat,2018
California,District 1,U.S. Representative,No candidate,Doug La Malfa,No candidate,0,320092,0,,0.5488659,,Republican,2018
California,District 10,U.S. Representative,No candidate,Jeff Denham,No candidate,0,211910,0,,0.4774899,,Republican,2018
California,District 10,U.S. Representative,Josh Harder,No candidate,No candidate,231890,0,0,0.5225101,,,Democrat,2018
California,District 11,U.S. Representative,No candidate,John Fitzgerald,No candidate,0,142624,0,,0.2586758,,Republican,2018
California,District 11,U.S. Representative,Mark DeSaulnier,No candidate,No candidate,408738,0,0,0.7413242,,,Democrat,2018


Rows: 1,255
Columns: 14
$ state                  [3m[90m<chr>[39m[23m "California", "California", "California", "Cal…
$ district               [3m[90m<chr>[39m[23m "District 1", "District 1", "District 10", "Di…
$ office                 [3m[90m<chr>[39m[23m "U.S. Representative", "U.S. Representative", …
$ candidate_democrat     [3m[90m<chr>[39m[23m "Audrey Denney", "No candidate", "No candidate…
$ candidate_republican   [3m[90m<chr>[39m[23m "No candidate", "Doug La Malfa", "Jeff Denham"…
$ candidate_other        [3m[90m<chr>[39m[23m "No candidate", "No candidate", "No candidate"…
$ total_votes_democrat   [3m[90m<dbl>[39m[23m 263096, 0, 0, 231890, 0, 408738, 0, 550584, 52…
$ total_votes_republican [3m[90m<dbl>[39m[23m 0, 320092, 211910, 0, 142624, 0, 83560, 0, 0, …
$ total_votes_other      [3m[90m<dbl>[39m[23m 0, 0, 0, 0, 0, 0, 0, 0, 0, 68514, 0, 0, 0, 0, …
$ vote_share_dem         [3m[90m<dbl>[39m[23m 0.4511341, NA, NA, 0.5225101, NA, 0.7413242,

In [45]:
## Gender fill
# Function to fill predicted gender
gender_fill <- function(x) { 
  n <- length(x)
  x <- word(x) ## removes last names, gender() only works on first name strings
  gender_rep <- rep("a", n)
  for (i in 1:n) {
      if (x[i] == "No") {
        gender_rep[i] <- "N/A" ## handles no candidate
      } else if (length(pull(gender(x[i]), var = gender)) == 0) {
        gender_rep[i] <- "Unknown" ## handles names the function can't predict
      }  else {
      gender_rep[i] <- pull(gender(as.character(x[i])), var = gender)
      }
    }
    gender_rep
  }


## First if checks for "No" because "No candidate" has been truncated to "No", needs to be changed if databases have different indicators for no candidate
## Second if checks for length == 0 because gender() generates an empty table if it is unable to predict a gender
## Else applies gender() to the rest of the cases and pulls the gender result out of the results (gender function creates a table with ancillary info), coercion to character 
## because gender() only works on character type




In [50]:
## Create gender columns, extremely extremely slow (like 20-30min at least), but works

CD_results_2018_wide$gender_dem <- gender_fill(CD_results_2018_wide$candidate_democrat)

CD_results_2018_wide$gender_rep <- gender_fill(CD_results_2018_wide$candidate_republican)

head(CD_results_2018_wide)

state,district,office,candidate_democrat,candidate_republican,candidate_other,total_votes_democrat,total_votes_republican,total_votes_other,vote_share_dem,vote_share_rep,vote_share_other,winner_party,year,gender_dem,gender_rep
<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<chr>,<dbl>,<chr>,<chr>
California,District 1,U.S. Representative,Audrey Denney,No candidate,No candidate,263096,0,0,0.4511341,,,Democrat,2018,female,
California,District 1,U.S. Representative,No candidate,Doug La Malfa,No candidate,0,320092,0,,0.5488659,,Republican,2018,,male
California,District 10,U.S. Representative,No candidate,Jeff Denham,No candidate,0,211910,0,,0.4774899,,Republican,2018,,male
California,District 10,U.S. Representative,Josh Harder,No candidate,No candidate,231890,0,0,0.5225101,,,Democrat,2018,male,
California,District 11,U.S. Representative,No candidate,John Fitzgerald,No candidate,0,142624,0,,0.2586758,,Republican,2018,,male
California,District 11,U.S. Representative,Mark DeSaulnier,No candidate,No candidate,408738,0,0,0.7413242,,,Democrat,2018,male,


In [51]:
## Fill in winner gender



In [None]:
## Match CD to presidential results




In [48]:
## Save wide dataset

In [49]:
## Reformat to long dataset

In [None]:
## Save long dataset