In [1]:
## Install gender package and linked database
install.packages("gender")
install.packages("genderdata", repos = "https://dev.ropensci.org", type = "source")

## Packages
library(dplyr)
library(gender)
library(stringr)
library(broom)
library(tidyr)


Updating HTML index of packages in '.Library'

Making 'packages.html' ...
 done

Updating HTML index of packages in '.Library'

Making 'packages.html' ...
 done

“package ‘dplyr’ was built under R version 4.0.2”

Attaching package: ‘dplyr’


The following objects are masked from ‘package:stats’:

    filter, lag


The following objects are masked from ‘package:base’:

    intersect, setdiff, setequal, union


PLEASE NOTE: The method provided by this package must be used cautiously
usage in the README or the package documentation.

“package ‘broom’ was built under R version 4.0.2”
“package ‘tidyr’ was built under R version 4.0.2”


In [85]:
## Read data in, change path as necessary
data <- read.csv("2016_election_results.csv")

head(data)

Unnamed: 0_level_0,X,Democrat,Republican,Other,State,Year,District,Democrat.Incumbent,Democrat.Votes,Republican.Incumbent,Republican.Votes,Other.Incumbent,Other.Votes
Unnamed: 0_level_1,<int>,<chr>,<chr>,<chr>,<chr>,<int>,<chr>,<chr>,<int>,<chr>,<int>,<chr>,<int>
1,0,Scott J. Kawasaki,No candidate,No candidate,Alaska,2016,1,True,1,False,0,0,0
2,1,Truno Holdaway,Steve M. Thompson,No candidate,Alaska,2016,2,False,1153,True,3268,0,0
3,2,Christina M. Sinclair,Tammie Wilson,Jeanne Olson,Alaska,2016,3,False,537,True,4291,False,2270
4,3,David Guttenberg,No candidate,No candidate,Alaska,2016,4,True,1,False,0,0,0
5,4,Adam Wool,Aaron Lojewski,No candidate,Alaska,2016,5,True,3812,False,3384,0,0
6,5,Jason T. Land,David M. Talerico,No candidate,Alaska,2016,6,False,2327,True,5126,0,0


In [3]:
## Function to fill predicted gender
gender_fill <- function(x) { 
  n <- length(x)
  x <- word(x) ## removes last names, gender() only works on first name strings
  gender_rep <- rep("a", n)
  for (i in 1:n) {
  if (x[i] == "No") {
    gender_rep[i] <- "N/A" ## handles no candidate
  } else if (length(pull(gender(x[i]), var = gender)) == 0) {
    gender_rep[i] <- "Unknown" ## handles names the function can't predict
  }  else {
  gender_rep[i] <- pull(gender(as.character(x[i])), var = gender)
  }
  }
  gender_rep
  }


## First if checks for "No" because "No candidate" has been truncated to "No", needs to be changed if databases have different indicators for no candidate
## Second if checks for length == 0 because gender() generates an empty table if it is unable to predict a gender
## Else applies gender() to the rest of the cases and pulls the gender result out of the results (gender function creates a table with ancillary info), coercion to character 
## because gender() only works on character type




In [4]:
## Testing function

test_string <- c("Michael B", "Jenny A", "No candidate", "xyzo")


print(gender_fill(test_string) == c("male", "female", "N/A", "Unknown"))





[1] TRUE TRUE TRUE TRUE


In [9]:
## Create gender columns, extremely extremely slow (like 20-30min at least), but works

data$gender_Dem <- gender_fill(data$Democrat)

data$gender_Rep <- gender_fill(data$Republican)

head(data)

Unnamed: 0_level_0,X,Democrat,Republican,Other,State,Year,District,Democrat.Incumbent,Democrat.Votes,Republican.Incumbent,Republican.Votes,Other.Incumbent,Other.Votes,gender_Dem,gender_Rep
Unnamed: 0_level_1,<int>,<chr>,<chr>,<chr>,<chr>,<int>,<chr>,<chr>,<int>,<chr>,<int>,<chr>,<int>,<chr>,<chr>
1,0,Scott J. Kawasaki,No candidate,No candidate,Alaska,2016,1,True,1,False,0,0,0,male,
2,1,Truno Holdaway,Steve M. Thompson,No candidate,Alaska,2016,2,False,1153,True,3268,0,0,Unknown,male
3,2,Christina M. Sinclair,Tammie Wilson,Jeanne Olson,Alaska,2016,3,False,537,True,4291,False,2270,female,female
4,3,David Guttenberg,No candidate,No candidate,Alaska,2016,4,True,1,False,0,0,0,male,
5,4,Adam Wool,Aaron Lojewski,No candidate,Alaska,2016,5,True,3812,False,3384,0,0,male,male
6,5,Jason T. Land,David M. Talerico,No candidate,Alaska,2016,6,False,2327,True,5126,0,0,male,male


In [24]:
## Created file with gender column so you don't have to run the function every time, commented out to prevent accidental overwrite
## write.csv(data, "2016_election_results_updated.csv")
data <- read.csv("2016_election_results_updated.csv")

In [25]:
## Separate data
Dem_data <- data %>%
    select(Democrat, State, Year, Democrat.Incumbent, Democrat.Votes, gender_Dem)

Rep_data <- data %>%
    select(Republican, State, Year, Republican.Incumbent, Republican.Votes, gender_Rep)

head(Dem_data)

head(Rep_data)



Unnamed: 0_level_0,Democrat,State,Year,Democrat.Incumbent,Democrat.Votes,gender_Dem
Unnamed: 0_level_1,<chr>,<chr>,<int>,<chr>,<int>,<chr>
1,Scott J. Kawasaki,Alaska,2016,True,1,male
2,Truno Holdaway,Alaska,2016,False,1153,Unknown
3,Christina M. Sinclair,Alaska,2016,False,537,female
4,David Guttenberg,Alaska,2016,True,1,male
5,Adam Wool,Alaska,2016,True,3812,male
6,Jason T. Land,Alaska,2016,False,2327,male


Unnamed: 0_level_0,Republican,State,Year,Republican.Incumbent,Republican.Votes,gender_Rep
Unnamed: 0_level_1,<chr>,<chr>,<int>,<chr>,<int>,<chr>
1,No candidate,Alaska,2016,False,0,
2,Steve M. Thompson,Alaska,2016,True,3268,male
3,Tammie Wilson,Alaska,2016,True,4291,female
4,No candidate,Alaska,2016,False,0,
5,Aaron Lojewski,Alaska,2016,False,3384,male
6,David M. Talerico,Alaska,2016,True,5126,male


In [26]:
## Basic descriptive

Dem_data %>%
    group_by(Year, gender_Dem) %>%
    filter(gender_Dem %in% c("male", "female", "Unknown"), !is.na(Democrat.Votes)) %>%
    summarize(mean_votes = mean(Democrat.Votes))

Rep_data %>%
    group_by(Year, gender_Rep) %>%
    filter(gender_Rep %in% c("male", "female", "Unknown"), !is.na(Republican.Votes)) %>%
    summarize(mean_votes = mean(Republican.Votes))

## Visualize Dem names function couldn't categorize (can do same for GOP)
Dem_data %>%
    select(Democrat, gender_Dem) %>%
    filter(gender_Dem == "Unknown")


`summarise()` regrouping output by 'Year' (override with `.groups` argument)



Year,gender_Dem,mean_votes
<int>,<chr>,<dbl>
2016,female,13083.32
2016,male,12067.61
2016,Unknown,13180.8


`summarise()` regrouping output by 'Year' (override with `.groups` argument)



Year,gender_Rep,mean_votes
<int>,<chr>,<dbl>
2016,female,13882.47
2016,male,14177.42
2016,Unknown,10137.58


Democrat,gender_Dem
<chr>,<chr>
Truno Holdaway,Unknown
J.P. Bob Johnson,Unknown
Grimsley Graham,Unknown
Kansen Chu,Unknown
S. Monique Limon,Unknown
Mesbah Islam,Unknown
S. Quinton Johnson,Unknown
W. Charles Paradee,Unknown
S. Bradley Connor,Unknown
"Wengay M. ""Newt"" Newton Sr.",Unknown


In [84]:
## 2018 data prep

# Read data in
state_elections_2018 <- read.csv("state_overall_2018.csv")

# Filter out races for other positions, special elections, write-ins, NAs
names(state_elections_2018)

filter_candidates <- state_elections_2018 %>%
    group_by(candidate) %>%
    summarize(count = n()) %>%
    filter(count > 25) %>% # Candidate with most appearances has 24, above that are aggregates/NAs
    select(candidate)

state_elections_2018_filtered <- state_elections_2018 %>%
    filter(office %in% c("State Assembly Member", "State Representative", "State Senator", "State Representative A", "State Representative B"),
          special == FALSE,
           writein == FALSE,
           !candidate %in% filter_candidates$candidate,
          ) %>%
    select(!c("writein","special","unofficial","version"))

dim(state_elections_2018)
dim(state_elections_2018_filtered)

head(state_elections_2018_filtered)



`summarise()` ungrouping output (override with `.groups` argument)



Unnamed: 0_level_0,year,state,state_po,state_fips,state_cen,state_ic,office,district,stage,candidate,party,mode,candidatevotes,totalvotes
Unnamed: 0_level_1,<int>,<chr>,<chr>,<int>,<int>,<int>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<int>,<int>
1,2018,Alabama,AL,1,63,41,State Representative,District 1,gen,Bobby James Dolan III,independent,absentee,125,11684
2,2018,Alabama,AL,1,63,41,State Representative,District 1,gen,Bobby James Dolan III,independent,election day,4175,11684
3,2018,Alabama,AL,1,63,41,State Representative,District 1,gen,Bobby James Dolan III,independent,provisional,36,11684
4,2018,Alabama,AL,1,63,41,State Representative,District 1,gen,Phillip Pettus,republican,absentee,266,11684
5,2018,Alabama,AL,1,63,41,State Representative,District 1,gen,Phillip Pettus,republican,election day,7034,11684
6,2018,Alabama,AL,1,63,41,State Representative,District 1,gen,Phillip Pettus,republican,provisional,48,11684


In [94]:
# Reformat data to single line per candidate and election

unique(state_elections_2018_filtered$mode)


total_list <- state_elections_2018_filtered %>%
    filter(mode == "total") %>%
    select(candidate)

non_total_list <- state_elections_2018_filtered %>%
    filter(!mode == "total") %>%
    select(candidate)

sum(unique(total_list$candidate) %in% unique(non_total_list$candidate))

match_list <- unique(total_list[total_list$candidate %in% non_total_list$candidate,])


test_case <- state_elections_2018_filtered %>%
    filter(!candidate %in% match_list) %>%
    group_by(state, district, office, party, candidate) %>%
    summarize(total_votes = sum(candidatevotes)) %>%
    arrange(state, district, office)

state_elections_2018_formatted <- test_case %>%
    filter(party %in% c("democrat", "republican")) %>%
    pivot_wider(id_cols = c("state", "district", "office"), names_from = party, values_from = c("candidate", "total_votes")) %>%
    mutate(candidate_democrat = replace_na(candidate_democrat, "No candidate"), 
            candidate_republican = replace_na(candidate_republican, "No candidate"),
            total_votes_democrat = replace_na(total_votes_democrat, 0),
            total_votes_republican = replace_na(totalvotes_republican, 0))
## Prob best possible set up




state,district,office,party,candidate,total_votes
<chr>,<chr>,<chr>,<chr>,<chr>,<int>
Alabama,District 1,State Representative,independent,Bobby James Dolan III,4336
Alabama,District 1,State Representative,republican,Phillip Pettus,7348
Alabama,District 1,State Senator,democrat,Caroline Self,15830
Alabama,District 1,State Senator,republican,Tim Melson,33141
Alabama,District 10,State Representative,democrat,J.B. King,8565
Alabama,District 10,State Representative,libertarian,Elijah J. Boyd,1130


`summarise()` regrouping output by 'state', 'district', 'office', 'party' (override with `.groups` argument)

“Values are not uniquely identified; output will contain list-cols.
* Use `values_fn = length` to identify where the duplicates arise
* Use `values_fn = {summary_fun}` to summarise duplicates”
“Values are not uniquely identified; output will contain list-cols.
* Use `values_fn = length` to identify where the duplicates arise
* Use `values_fn = {summary_fun}` to summarise duplicates”


state,district,office,candidate_republican,candidate_democrat,total_votes_republican,total_votes_democrat
<chr>,<chr>,<chr>,<list>,<list>,<list>,<list>
Alabama,District 1,State Representative,Phillip Pettus,No candidate,7348,
Alabama,District 1,State Senator,Tim Melson,Caroline Self,33141,15830.0
Alabama,District 10,State Representative,Mike Ball,J.B. King,11240,8565.0
Alabama,District 10,State Senator,Andrew Jones,No candidate,25902,
Alabama,District 100,State Representative,Victor Gaston,No candidate,12086,
Alabama,District 101,State Representative,Chris Pringle,No candidate,10274,


In [None]:
## Generate gender for 2018 candidates (doesn't currently work due to complications with dataset)

state_elections_2018_formatted$gender_rep <- gender_fill(state_elections_2018_formatted$candidate_republican)

state_elections_2018_formatted$gender_dem <- gender_fill(state_elections_2018_formatted$candidate_democrat)



“argument is not an atomic vector; coercing”
“argument is not an atomic vector; coercing”


In [89]:
head(state_elections_2018_formatted)

logical(0)
