In [2]:
## Install gender package and linked database, but only when they are not
## already available, so re-running this cell does not reinstall every time
if (!requireNamespace("gender", quietly = TRUE)) {
  install.packages("gender")
}
if (!requireNamespace("genderdata", quietly = TRUE)) {
  install.packages(
    "genderdata",
    repos = "https://dev.ropensci.org",
    type = "source"
  )
}

## Packages
library(dplyr)
library(gender)
library(stringr)
library(broom)


Updating HTML index of packages in '.Library'

Making 'packages.html' ...
 done

Updating HTML index of packages in '.Library'

Making 'packages.html' ...
 done



In [4]:
## Load the 2016 election results; adjust the path if the CSV lives elsewhere
data <- read.csv(file = "2016_election_results.csv")



In [7]:
## Function to fill predicted gender
##
## x: vector of candidate names ("First [Middle] Last"); factors are accepted.
## Returns a character vector of the same length as x containing
##   "N/A"     - the entry is the "No candidate" placeholder,
##   "Unknown" - gender() could not predict a gender (or the name is missing),
##   otherwise the value of the `gender` column returned by gender().
##
## Notes:
## * Only the first word of each name is used, because gender() only works on
##   first-name strings. This also truncates "No candidate" to "No", which is
##   how the no-candidate rows are detected; change that check if a data set
##   uses a different indicator for "no candidate".
## * gender() returns no row for a name it cannot predict, so every name that
##   is absent from its result keeps the "Unknown" label.
## * gender() is called once on the unique first names instead of once (or
##   twice) per row, which is what made the row-by-row version so slow.
gender_fill <- function(x) {
  n <- length(x)
  if (n == 0) {
    ## Nothing to classify; also avoids the 1:0 pitfall of an index loop
    return(character(0))
  }

  ## Coerce to character because gender() only works on character input
  first_name <- word(as.character(x))

  gender_rep <- rep("Unknown", n)
  no_candidate <- !is.na(first_name) & first_name == "No"
  gender_rep[no_candidate] <- "N/A"

  ## Look up each distinct, non-empty first name exactly once
  lookup <- !is.na(first_name) & !no_candidate & nzchar(first_name)
  unique_names <- unique(first_name[lookup])

  if (length(unique_names) > 0) {
    predicted <- gender(unique_names)
    ## Case-insensitive match back to the rows; unmatched names stay "Unknown"
    idx <- match(tolower(first_name), tolower(predicted$name))
    found <- lookup & !is.na(idx)
    gender_rep[found] <- as.character(predicted$gender[idx[found]])
  }

  gender_rep
}




In [8]:
## Quick check of gender_fill(): one male name, one female name, the
## "No candidate" placeholder, and a string gender() cannot classify

test_string <- c("Michael B", "Jenny A", "No candidate", "xyzo")


print(
  c("male", "female", "N/A", "Unknown") == gender_fill(test_string)
)





[1] TRUE TRUE TRUE TRUE


In [9]:
## Add a predicted-gender column for each party's candidate.
## Works, but was reported as extremely slow (20-30 min at least) on this data.

data[["gender_Dem"]] <- gender_fill(data[["Democrat"]])

data[["gender_Rep"]] <- gender_fill(data[["Republican"]])

head(data)

Unnamed: 0_level_0,X,Democrat,Republican,Other,State,Year,District,Democrat.Incumbent,Democrat.Votes,Republican.Incumbent,Republican.Votes,Other.Incumbent,Other.Votes,gender_Dem,gender_Rep
Unnamed: 0_level_1,<int>,<chr>,<chr>,<chr>,<chr>,<int>,<chr>,<chr>,<int>,<chr>,<int>,<chr>,<int>,<chr>,<chr>
1,0,Scott J. Kawasaki,No candidate,No candidate,Alaska,2016,1,True,1,False,0,0,0,male,
2,1,Truno Holdaway,Steve M. Thompson,No candidate,Alaska,2016,2,False,1153,True,3268,0,0,Unknown,male
3,2,Christina M. Sinclair,Tammie Wilson,Jeanne Olson,Alaska,2016,3,False,537,True,4291,False,2270,female,female
4,3,David Guttenberg,No candidate,No candidate,Alaska,2016,4,True,1,False,0,0,0,male,
5,4,Adam Wool,Aaron Lojewski,No candidate,Alaska,2016,5,True,3812,False,3384,0,0,male,male
6,5,Jason T. Land,David M. Talerico,No candidate,Alaska,2016,6,False,2327,True,5126,0,0,male,male


In [11]:
## Save the data with the gender columns so the (slow) prediction step does not
## have to be re-run every time. row.names = FALSE keeps R's row index out of
## the file; otherwise it comes back as an extra unnamed column on re-read.
write.csv(data, "2016_election_results_updated.csv", row.names = FALSE)


In [6]:
## Split the combined table into one data frame per party, keeping only the
## candidate name, state, year, incumbency flag, vote count and predicted gender
Dem_data <- select(
    data,
    Democrat,
    State,
    Year,
    Democrat.Incumbent,
    Democrat.Votes,
    gender_Dem
)

Rep_data <- select(
    data,
    Republican,
    State,
    Year,
    Republican.Incumbent,
    Republican.Votes,
    gender_Rep
)

head(Dem_data)

head(Rep_data)



Unnamed: 0_level_0,Democrat,State,Year,Democrat.Incumbent,Democrat.Votes,gender_Dem
Unnamed: 0_level_1,<chr>,<chr>,<int>,<chr>,<int>,<chr>
1,Scott J. Kawasaki,Alaska,2016,True,1,male
2,Truno Holdaway,Alaska,2016,False,1153,Unknown
3,Christina M. Sinclair,Alaska,2016,False,537,female
4,David Guttenberg,Alaska,2016,True,1,male
5,Adam Wool,Alaska,2016,True,3812,male
6,Jason T. Land,Alaska,2016,False,2327,male


Unnamed: 0_level_0,Republican,State,Year,Republican.Incumbent,Republican.Votes,gender_Rep
Unnamed: 0_level_1,<chr>,<chr>,<int>,<chr>,<int>,<chr>
1,No candidate,Alaska,2016,False,0,
2,Steve M. Thompson,Alaska,2016,True,3268,male
3,Tammie Wilson,Alaska,2016,True,4291,female
4,No candidate,Alaska,2016,False,0,
5,Aaron Lojewski,Alaska,2016,False,3384,male
6,David M. Talerico,Alaska,2016,True,5126,male


In [16]:
## Basic descriptive: mean vote count per year and predicted gender, using only
## rows labelled male / female / Unknown that have a recorded vote count

Dem_data %>%
    filter(
        !is.na(Democrat.Votes),
        gender_Dem %in% c("male", "female", "Unknown")
    ) %>%
    group_by(Year, gender_Dem) %>%
    summarize(mean_votes = mean(Democrat.Votes))

Rep_data %>%
    filter(
        !is.na(Republican.Votes),
        gender_Rep %in% c("male", "female", "Unknown")
    ) %>%
    group_by(Year, gender_Rep) %>%
    summarize(mean_votes = mean(Republican.Votes))

## List the Dem names the function couldn't categorize (can do same for GOP)
Dem_data %>%
    filter(gender_Dem == "Unknown") %>%
    select(Democrat, gender_Dem)


`summarise()` regrouping output by 'Year' (override with `.groups` argument)



Year,gender_Dem,mean_votes
<int>,<chr>,<dbl>
2016,female,13083.32
2016,male,12067.61
2016,Unknown,13180.8


`summarise()` regrouping output by 'Year' (override with `.groups` argument)



Year,gender_Rep,mean_votes
<int>,<chr>,<dbl>
2016,female,13882.47
2016,male,14177.42
2016,Unknown,10137.58


Democrat,gender_Dem
<chr>,<chr>
Truno Holdaway,Unknown
J.P. Bob Johnson,Unknown
Grimsley Graham,Unknown
Kansen Chu,Unknown
S. Monique Limon,Unknown
Mesbah Islam,Unknown
S. Quinton Johnson,Unknown
W. Charles Paradee,Unknown
S. Bradley Connor,Unknown
"Wengay M. ""Newt"" Newton Sr.",Unknown
