In [1]:
## Install gender package and linked database
install.packages("gender")
install.packages("genderdata", repos = "https://dev.ropensci.org", type = "source")

## Packages
library(dplyr)
library(gender)
library(stringr)
library(broom)
library(tidyr)
library(ggplot2)


Updating HTML index of packages in '.Library'

Making 'packages.html' ...
 done

Updating HTML index of packages in '.Library'

Making 'packages.html' ...
 done

“package ‘dplyr’ was built under R version 4.0.2”

Attaching package: ‘dplyr’


The following objects are masked from ‘package:stats’:

    filter, lag


The following objects are masked from ‘package:base’:

    intersect, setdiff, setequal, union


PLEASE NOTE: The method provided by this package must be used cautiously
usage in the README or the package documentation.

“package ‘broom’ was built under R version 4.0.2”
“package ‘tidyr’ was built under R version 4.0.2”
“package ‘ggplot2’ was built under R version 4.0.1”


In [2]:
## Load the 2016 results (adjust the path to wherever the CSV lives)
elections_2016 <- read.csv(file = "2016_election_results.csv")

## Preview the first six rows
head(elections_2016, n = 6)

Unnamed: 0_level_0,X,Democrat,Republican,Other,State,Year,District,Democrat.Incumbent,Democrat.Votes,Republican.Incumbent,Republican.Votes,Other.Incumbent,Other.Votes
Unnamed: 0_level_1,<int>,<chr>,<chr>,<chr>,<chr>,<int>,<chr>,<chr>,<int>,<chr>,<int>,<chr>,<int>
1,0,Scott J. Kawasaki,No candidate,No candidate,Alaska,2016,1,True,1,False,0,0,0
2,1,Truno Holdaway,Steve M. Thompson,No candidate,Alaska,2016,2,False,1153,True,3268,0,0
3,2,Christina M. Sinclair,Tammie Wilson,Jeanne Olson,Alaska,2016,3,False,537,True,4291,False,2270
4,3,David Guttenberg,No candidate,No candidate,Alaska,2016,4,True,1,False,0,0,0
5,4,Adam Wool,Aaron Lojewski,No candidate,Alaska,2016,5,True,3812,False,3384,0,0
6,5,Jason T. Land,David M. Talerico,No candidate,Alaska,2016,6,False,2327,True,5126,0,0


In [3]:
## Function to fill predicted gender
##
## x:       vector of candidate names ("First [Middle] Last"); the string
##          "No candidate" marks a race the party did not contest.
## returns: character vector the same length as x holding, per element,
##            - the gender predicted by gender() for the first name,
##            - "N/A"     when the entry is "No candidate",
##            - "Unknown" when gender() returns no row for the first name,
##            - NA        when the input name itself is missing.
gender_fill <- function(x) {
  ## Empty input: nothing to look up
  if (length(x) == 0) {
    return(character(0))
  }

  ## gender() only works on first-name strings, so keep the first word only
  first_names <- word(as.character(x))
  missing_name <- is.na(first_names)

  ## "No candidate" has been truncated to "No" by word(); this needs to be
  ## changed if a data set uses a different indicator for no candidate
  no_candidate <- !missing_name & first_names == "No"

  ## Default is "Unknown"; the special cases are overwritten below
  gender_rep <- rep("Unknown", length(first_names))
  gender_rep[missing_name] <- NA_character_
  gender_rep[no_candidate] <- "N/A"

  ## Look every distinct first name up in a single gender() call instead of
  ## calling gender() once (or twice) per row, which is what made this slow
  needs_lookup <- !missing_name & !no_candidate
  to_predict <- unique(first_names[needs_lookup])

  if (length(to_predict) > 0) {
    predicted <- gender(to_predict)
    ## gender() returns a table with ancillary info and has no row for names
    ## it cannot predict; those keep the "Unknown" default. Compare in lower
    ## case so the match does not depend on how gender() cases its name column
    hit <- match(tolower(first_names), tolower(predicted[["name"]]))
    found <- needs_lookup & !is.na(hit)
    gender_rep[found] <- as.character(predicted[["gender"]][hit[found]])
  }

  gender_rep
}




In [4]:
## Sanity check for gender_fill(): one input per possible outcome
## (male name, female name, no candidate, unpredictable name)

test_string <- c("Michael B", "Jenny A", "No candidate", "xyzo")

## Every element should print TRUE
print(c("male", "female", "N/A", "Unknown") == gender_fill(test_string))





[1] TRUE TRUE TRUE TRUE


In [9]:
## Create gender columns on the 2016 results table read in above.
## This can be very slow on the full data set, so the result is saved to a
## CSV further down and normally reloaded from there instead of recomputed.
## The columns go on elections_2016 (the table that gets written out), not on
## `data`, which is never defined in this notebook.

elections_2016$gender_Dem <- gender_fill(elections_2016$Democrat)

elections_2016$gender_Rep <- gender_fill(elections_2016$Republican)

head(elections_2016)

Unnamed: 0_level_0,X,Democrat,Republican,Other,State,Year,District,Democrat.Incumbent,Democrat.Votes,Republican.Incumbent,Republican.Votes,Other.Incumbent,Other.Votes,gender_Dem,gender_Rep
Unnamed: 0_level_1,<int>,<chr>,<chr>,<chr>,<chr>,<int>,<chr>,<chr>,<int>,<chr>,<int>,<chr>,<int>,<chr>,<chr>
1,0,Scott J. Kawasaki,No candidate,No candidate,Alaska,2016,1,True,1,False,0,0,0,male,
2,1,Truno Holdaway,Steve M. Thompson,No candidate,Alaska,2016,2,False,1153,True,3268,0,0,Unknown,male
3,2,Christina M. Sinclair,Tammie Wilson,Jeanne Olson,Alaska,2016,3,False,537,True,4291,False,2270,female,female
4,3,David Guttenberg,No candidate,No candidate,Alaska,2016,4,True,1,False,0,0,0,male,
5,4,Adam Wool,Aaron Lojewski,No candidate,Alaska,2016,5,True,3812,False,3384,0,0,male,male
6,5,Jason T. Land,David M. Talerico,No candidate,Alaska,2016,6,False,2327,True,5126,0,0,male,male


In [75]:
## Created file with gender column so you don't have to run the function every time, commented out to prevent accidental overwrite
## NOTE: write.csv() also writes row names by default, so the reloaded table
## carries extra index columns; they are dropped in the next cell.
## write.csv(elections_2016, "2016_election_results_updated.csv")
## Reload the saved 2016 results, including the gender columns
elections_2016_gender <- read.csv("2016_election_results_updated.csv")

In [5]:
## Generate winner and gender of winner
## A major party is the winner only when it has strictly more votes than both
## other columns; every other row (including ties and rows where a comparison
## is NA) falls through to "Other".

elections_2016_gender <- elections_2016_gender %>%
  mutate(
    winner = case_when(
      Democrat.Votes > Republican.Votes & Democrat.Votes > Other.Votes ~ "Democrat",
      Republican.Votes > Democrat.Votes & Republican.Votes > Other.Votes ~ "Republican",
      TRUE ~ "Other"
    ),
    winner_gender = case_when(
      winner == "Democrat" ~ gender_Dem,
      winner == "Republican" ~ gender_Rep,
      TRUE ~ "Other candidate"
    )
  ) %>%
  ## Drop the leftover index columns from the CSV round-trips. any_of() keeps
  ## this from failing when one of them is not present in the file.
  select(!any_of(c("X.1", "X")))

## Generate vote share (didn't bother doing for Other party candidates)
elections_2016_gender <- elections_2016_gender %>%
  mutate(
    Democrat.Vote.Prop = Democrat.Votes / (Democrat.Votes + Republican.Votes + Other.Votes),
    Republican.Vote.Prop = Republican.Votes / (Democrat.Votes + Republican.Votes + Other.Votes)
  )


head(elections_2016_gender)

### !!! Important: In analysis, exclude vote shares of 1.0000, signals no opposing candidate !!!


ERROR: Error in eval(lhs, parent, parent): object 'elections_2016_gender' not found


In [6]:

## Build one table per party, each holding that party's candidate, the race
## identifiers, and the party's incumbency, votes, vote share and gender
Dem_data <- select(
  elections_2016_gender,
  Democrat, State, District, Year,
  Democrat.Incumbent, Democrat.Votes, Democrat.Vote.Prop, gender_Dem
)

Rep_data <- select(
  elections_2016_gender,
  Republican, State, District, Year,
  Republican.Incumbent, Republican.Votes, Republican.Vote.Prop, gender_Rep
)

head(Dem_data)

head(Rep_data)



ERROR: Error in eval(lhs, parent, parent): object 'elections_2016_gender' not found


In [7]:
## Basic descriptives

## Mean raw votes per year and predicted gender, Democrats
Dem_data %>%
  filter(gender_Dem %in% c("male", "female", "Unknown") & !is.na(Democrat.Votes)) %>%
  group_by(Year, gender_Dem) %>%
  summarize(mean_votes = mean(Democrat.Votes))

## Mean raw votes per year and predicted gender, Republicans
Rep_data %>%
  filter(gender_Rep %in% c("male", "female", "Unknown") & !is.na(Republican.Votes)) %>%
  group_by(Year, gender_Rep) %>%
  summarize(mean_votes = mean(Republican.Votes))


## Candidate count and mean vote share per gender, Democrats
## (vote share of 1 = no opposing candidate, so those rows are left out)
Dem_data %>%
  filter(
    gender_Dem %in% c("male", "female", "Unknown"),
    !is.na(Democrat.Vote.Prop),
    Democrat.Vote.Prop != 1
  ) %>%
  group_by(gender_Dem) %>%
  summarize(
    count = n(),
    mean_vote_share = mean(Democrat.Vote.Prop)
  )

## Candidate count and mean vote share per gender, Republicans
Rep_data %>%
  filter(
    gender_Rep %in% c("male", "female", "Unknown"),
    !is.na(Republican.Vote.Prop),
    Republican.Vote.Prop != 1
  ) %>%
  group_by(gender_Rep) %>%
  summarize(
    count = n(),
    mean_vote_share = mean(Republican.Vote.Prop)
  )


ERROR: Error in eval(lhs, parent, parent): object 'Dem_data' not found


In [8]:
## Graphs: average vote share by predicted gender, one plot per party.
## These plot vote share rather than raw vote totals so that the bars match
## the "Vote Share" titles. Uncontested races (vote share of exactly 1) and
## rows with a missing vote share are left out.

Dem_data %>%
  filter(
    gender_Dem %in% c("male", "female"),
    !is.na(Democrat.Vote.Prop),
    Democrat.Vote.Prop != 1
  ) %>%
  group_by(Year, gender_Dem) %>%
  summarize(
    count = n(),
    mean_vote_share = mean(Democrat.Vote.Prop)
  ) %>%
  ggplot(aes(x = gender_Dem, y = mean_vote_share, fill = gender_Dem)) +
  geom_col() +
  labs(
    title = "Vote Share by Gender, Democratic Party, 2016",
    y = "Average Vote Share",
    x = "Gender"
  ) +
  theme(legend.position = "none")

Rep_data %>%
  filter(
    gender_Rep %in% c("male", "female"),
    !is.na(Republican.Vote.Prop),
    Republican.Vote.Prop != 1
  ) %>%
  group_by(Year, gender_Rep) %>%
  summarize(
    count = n(),
    mean_vote_share = mean(Republican.Vote.Prop)
  ) %>%
  ggplot(aes(x = gender_Rep, y = mean_vote_share, fill = gender_Rep)) +
  geom_col() +
  labs(
    title = "Vote Share by Gender, Republican Party, 2016",
    y = "Average Vote Share",
    x = "Gender"
  ) +
  theme(legend.position = "none")


## Tables  (need to work on this whole section, not currently working)
## Every table below is restricted to races where both major-party candidates
## have a predicted gender of "male" or "female".

## Number of races per (Democrat gender, Republican gender, winning party)
elections_2016_gender %>%
  filter(gender_Dem %in% c("male", "female") & gender_Rep %in% c("male", "female")) %>%
  group_by(gender_Dem, gender_Rep, winner) %>%
  summarize(count = n())

## Share of winners by gender within each state (first rows only)
elections_2016_gender %>%
  filter(gender_Dem %in% c("male", "female") & gender_Rep %in% c("male", "female")) %>%
  group_by(State, winner_gender) %>%
  summarize(count = n()) %>%
  mutate(prop = count / sum(count)) %>%
  head()


##Vote Prop of winning candidates
## (currently repeats the winner-gender share table above)
elections_2016_gender %>%
  filter(gender_Dem %in% c("male", "female") & gender_Rep %in% c("male", "female")) %>%
  group_by(State, winner_gender) %>%
  summarize(count = n()) %>%
  mutate(prop = count / sum(count)) %>%
  head()

## Gap between the largest and smallest winner-gender share per state,
## keeping states where that gap exceeds 0.05
elections_2016_gender %>%
  filter(gender_Dem %in% c("male", "female") & gender_Rep %in% c("male", "female")) %>%
  group_by(State, winner_gender) %>%
  summarize(count = n()) %>%
  mutate(prop = count / sum(count)) %>%
  summarize(prop_diff = max(prop) - min(prop)) %>%
  filter(abs(prop_diff) > 0.05) %>%
  head()


ERROR: Error in eval(lhs, parent, parent): object 'Dem_data' not found


In [9]:
## 2018 data prep

# Load the raw 2018 state-level results
state_elections_2018 <- read.csv(file = "state_overall_2018.csv")

# List the available columns before filtering
names(state_elections_2018)

# "Candidates" appearing more than 25 times are treated as aggregates/NAs
# (the real candidate with the most appearances has 24)
filter_candidates <- state_elections_2018 %>%
  group_by(candidate) %>%
  summarize(count = n()) %>%
  filter(count > 25) %>%
  select(candidate)

# Keep state legislative races only; drop special elections, write-ins and
# the aggregate pseudo-candidates, then remove columns no longer needed
state_elections_2018_filtered <- state_elections_2018 %>%
  filter(
    office %in% c(
      "State Assembly Member",
      "State Representative",
      "State Senator",
      "State Representative A",
      "State Representative B"
    ),
    special == FALSE,
    writein == FALSE,
    !candidate %in% filter_candidates$candidate
  ) %>%
  select(!all_of(c("writein", "special", "unofficial", "version")))

# Row/column counts before and after filtering
dim(state_elections_2018)
dim(state_elections_2018_filtered)

head(state_elections_2018_filtered)



`summarise()` ungrouping output (override with `.groups` argument)



Unnamed: 0_level_0,year,state,state_po,state_fips,state_cen,state_ic,office,district,stage,candidate,party,mode,candidatevotes,totalvotes
Unnamed: 0_level_1,<int>,<chr>,<chr>,<int>,<int>,<int>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<int>,<int>
1,2018,Alabama,AL,1,63,41,State Representative,District 1,gen,Bobby James Dolan III,independent,absentee,125,11684
2,2018,Alabama,AL,1,63,41,State Representative,District 1,gen,Bobby James Dolan III,independent,election day,4175,11684
3,2018,Alabama,AL,1,63,41,State Representative,District 1,gen,Bobby James Dolan III,independent,provisional,36,11684
4,2018,Alabama,AL,1,63,41,State Representative,District 1,gen,Phillip Pettus,republican,absentee,266,11684
5,2018,Alabama,AL,1,63,41,State Representative,District 1,gen,Phillip Pettus,republican,election day,7034,11684
6,2018,Alabama,AL,1,63,41,State Representative,District 1,gen,Phillip Pettus,republican,provisional,48,11684


In [31]:
# Reformat data to single line per candidate and election

# Which vote-reporting modes exist in the data?
unique(state_elections_2018_filtered$mode)


# Candidates that have a "total" row ...
total_list <- state_elections_2018_filtered %>%
  filter(mode == "total") %>%
  select(candidate)

# ... and candidates that have rows for any other mode
non_total_list <- state_elections_2018_filtered %>%
  filter(mode != "total") %>%
  select(candidate)

# How many candidates show up in both lists?
sum(unique(total_list$candidate) %in% unique(non_total_list$candidate))

# Names of the candidates present in both lists
match_list <- unique(
  total_list$candidate[total_list$candidate %in% non_total_list$candidate]
)


# Sum votes across modes for each candidate, leaving out the candidates in
# match_list
test_case <- state_elections_2018_filtered %>%
  filter(!candidate %in% match_list) %>%
  group_by(state, district, office, party, candidate) %>%
  summarize(total_votes = sum(candidatevotes)) %>%
  arrange(state, district, office)

# One row per race with Democrat and Republican side by side; a party with no
# candidate in a race gets "No candidate" and 0 votes
state_elections_2018_formatted <- test_case %>%
  filter(party %in% c("democrat", "republican")) %>%
  mutate(row = row_number()) %>%
  pivot_wider(
    id_cols = c("state", "district", "office", "row"),
    names_from = party,
    values_from = c("candidate", "total_votes")
  ) %>%
  mutate(
    candidate_democrat = replace_na(candidate_democrat, "No candidate"),
    candidate_republican = replace_na(candidate_republican, "No candidate"),
    total_votes_democrat = replace_na(total_votes_democrat, 0),
    total_votes_republican = replace_na(total_votes_republican, 0)
  ) %>%
  select(-row)
## Prob best possible set up

head(state_elections_2018_formatted)


`summarise()` regrouping output by 'state', 'district', 'office', 'party' (override with `.groups` argument)



state,district,office,candidate_republican,candidate_democrat,total_votes_republican,total_votes_democrat
<chr>,<chr>,<chr>,<chr>,<chr>,<dbl>,<dbl>
Alabama,District 1,State Representative,Phillip Pettus,No candidate,7348,0
Alabama,District 1,State Senator,Tim Melson,Caroline Self,33141,15830
Alabama,District 10,State Representative,Mike Ball,J.B. King,11240,8565
Alabama,District 10,State Senator,Andrew Jones,No candidate,25902,0
Alabama,District 100,State Representative,Victor Gaston,No candidate,12086,0
Alabama,District 101,State Representative,Chris Pringle,No candidate,10274,0


In [32]:
## Predict gender for the 2018 candidates of both parties
## (can take a very long time; skip this cell and load the saved CSV instead
## unless the columns really need to be regenerated)

state_elections_2018_formatted[["gender_rep"]] <-
  gender_fill(state_elections_2018_formatted[["candidate_republican"]])

state_elections_2018_formatted[["gender_dem"]] <-
  gender_fill(state_elections_2018_formatted[["candidate_democrat"]])



In [36]:
## The 2018 table with gender columns was saved once so the slow gender
## prediction does not have to be rerun; the write stays commented out to
## prevent an accidental overwrite
##write.csv(state_elections_2018_formatted, "2018_election_results_updated.csv")
state_elections_2018_formatted <- read.csv(file = "2018_election_results_updated.csv")

## Preview the first ten rows
head(state_elections_2018_formatted, n = 10)



Unnamed: 0_level_0,X,state,district,office,candidate_republican,candidate_democrat,total_votes_republican,total_votes_democrat,gender_rep,gender_dem
Unnamed: 0_level_1,<int>,<chr>,<chr>,<chr>,<chr>,<chr>,<int>,<int>,<chr>,<chr>
1,1,Alabama,District 1,State Representative,Phillip Pettus,No candidate,7348,0,male,
2,2,Alabama,District 1,State Senator,Tim Melson,Caroline Self,33141,15830,male,female
3,3,Alabama,District 10,State Representative,Mike Ball,J.B. King,11240,8565,male,Unknown
4,4,Alabama,District 10,State Senator,Andrew Jones,No candidate,25902,0,male,
5,5,Alabama,District 100,State Representative,Victor Gaston,No candidate,12086,0,male,
6,6,Alabama,District 101,State Representative,Chris Pringle,No candidate,10274,0,male,
7,7,Alabama,District 102,State Representative,Shane Stringer,No candidate,11048,0,male,
8,8,Alabama,District 103,State Representative,No candidate,Barbara Drummond,0,8818,,female
9,9,Alabama,District 104,State Representative,Margie Wilcox,Arlene Cunningham Easley,10152,4695,female,female
10,10,Alabama,District 105,State Representative,Chip Brown,No candidate,10176,0,male,


In [81]:
## Random workspace

## Share of winners by gender within each state, for races where both
## major-party candidates have a predicted gender of male or female
elections_2016_gender %>%
  filter(gender_Dem %in% c("male", "female") & gender_Rep %in% c("male", "female")) %>%
  group_by(State, winner_gender) %>%
  summarize(count = n()) %>%
  mutate(prop = count / sum(count)) %>%
  head()

## Combine Rep/Dem dataframes vertically to check all genders at once? but then can't check vote prop ---> generate prop in original table and migrate that, then stack
## Give both party tables the same party-neutral column names plus a Party tag
Dem_data <- Dem_data %>%
  mutate(Party = "Democrat") %>%
  rename(
    Candidate = Democrat,
    Incumbent = Democrat.Incumbent,
    Votes = Democrat.Votes,
    Vote.Prop = Democrat.Vote.Prop,
    Gender = gender_Dem
  )

Rep_data <- Rep_data %>%
  mutate(Party = "Republican") %>%
  rename(
    Candidate = Republican,
    Incumbent = Republican.Incumbent,
    Votes = Republican.Votes,
    Vote.Prop = Republican.Vote.Prop,
    Gender = gender_Rep
  )


## Stack the two tables: one row per candidate
elections_2016_long <- rbind(Dem_data, Rep_data)

head(elections_2016_long)
# could add winner yes/no column too




`summarise()` regrouping output by 'State' (override with `.groups` argument)



State,winner_gender,count,prop
<chr>,<chr>,<int>,<dbl>
Alaska,female,9,0.4090909
Alaska,male,13,0.5909091
Arkansas,female,4,0.173913
Arkansas,male,19,0.826087
California,female,13,0.2280702
California,male,44,0.7719298


Unnamed: 0_level_0,Candidate,State,District,Year,Incumbent,Votes,Vote.Prop,Gender,Party
Unnamed: 0_level_1,<chr>,<chr>,<chr>,<int>,<chr>,<int>,<dbl>,<chr>,<chr>
1,Scott J. Kawasaki,Alaska,1,2016,True,1,1.0,male,Democrat
2,Truno Holdaway,Alaska,2,2016,False,1153,0.26080072,Unknown,Democrat
3,Christina M. Sinclair,Alaska,3,2016,False,537,0.07565511,female,Democrat
4,David Guttenberg,Alaska,4,2016,True,1,1.0,male,Democrat
5,Adam Wool,Alaska,5,2016,True,3812,0.52973874,male,Democrat
6,Jason T. Land,Alaska,6,2016,False,2327,0.31222327,male,Democrat


In [84]:
## Check the dimensions (rows, columns) of the 2016 table
dim(elections_2016_gender)

0,1
numeric {base},R Documentation

0,1
length,A non-negative integer specifying the desired length. Double values will be coerced to integer: supplying an argument of length other than one is an error.
x,object to be coerced or tested.
...,further arguments passed to or from other methods.
