In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the raw Senate results CSV into a DataFrame and preview the first rows.
# `senate_file` is kept as a named path so the source location is easy to spot/change.
senate_file = "data_original/senate_overall_2018.csv"
senate_df_original = pd.read_csv(filepath_or_buffer=senate_file)
senate_df_original.head(5)

Unnamed: 0,year,state,state_po,state_fips,state_cen,state_ic,office,district,stage,special,candidate,party,writein,mode,candidatevotes,totalvotes,unofficial,version
0,2018,Arizona,AZ,4,86,61,US Senate,statewide,gen,False,Martha McSally,republican,False,total,1135200,2384308,False,20190110
1,2018,Arizona,AZ,4,86,61,US Senate,statewide,gen,False,Kyrsten Sinema,democrat,False,total,1191100,2384308,False,20190110
2,2018,Arizona,AZ,4,86,61,US Senate,statewide,gen,False,Angela Green,green,False,total,57442,2384308,False,20190110
3,2018,Arizona,AZ,4,86,61,US Senate,statewide,gen,False,,,True,total,566,2384308,False,20190110
4,2018,California,CA,6,93,71,US Senate,statewide,gen,False,Dianne Feinstein,democrat,False,total,6019422,11113364,False,20190110


In [3]:
# Make an independent working copy of the raw data.
# Plain assignment (`senate_df = senate_df_original`) would only bind a second
# name to the same object, so any in-place edit would also alter the raw frame.
# `.copy()` (deep by default) keeps `senate_df_original` untouched for reference.
senate_df = senate_df_original.copy()
senate_df.head()

Unnamed: 0,year,state,state_po,state_fips,state_cen,state_ic,office,district,stage,special,candidate,party,writein,mode,candidatevotes,totalvotes,unofficial,version
0,2018,Arizona,AZ,4,86,61,US Senate,statewide,gen,False,Martha McSally,republican,False,total,1135200,2384308,False,20190110
1,2018,Arizona,AZ,4,86,61,US Senate,statewide,gen,False,Kyrsten Sinema,democrat,False,total,1191100,2384308,False,20190110
2,2018,Arizona,AZ,4,86,61,US Senate,statewide,gen,False,Angela Green,green,False,total,57442,2384308,False,20190110
3,2018,Arizona,AZ,4,86,61,US Senate,statewide,gen,False,,,True,total,566,2384308,False,20190110
4,2018,California,CA,6,93,71,US Senate,statewide,gen,False,Dianne Feinstein,democrat,False,total,6019422,11113364,False,20190110


In [4]:
# Remove the columns that are not used in the cleaned dataset; the remaining
# columns are state, state_po, special, candidate, party, candidatevotes, totalvotes.
senate_df = senate_df.drop(
    [
        "year",
        "state_fips",
        "state_cen",
        "state_ic",
        "office",
        "district",
        "stage",
        "writein",
        "mode",
        "unofficial",
        "version",
    ],
    axis="columns",
)
senate_df.head()

Unnamed: 0,state,state_po,special,candidate,party,candidatevotes,totalvotes
0,Arizona,AZ,False,Martha McSally,republican,1135200,2384308
1,Arizona,AZ,False,Kyrsten Sinema,democrat,1191100,2384308
2,Arizona,AZ,False,Angela Green,green,57442,2384308
3,Arizona,AZ,False,,,566,2384308
4,California,CA,False,Dianne Feinstein,democrat,6019422,11113364


In [6]:
# Give every remaining column a human-readable, title-cased name.
senate_df = senate_df.rename(
    {
        "state": "State",
        "state_po": "State Abbreviation",
        "special": "Special Election",
        "candidate": "Candidate",
        "party": "Party",
        "candidatevotes": "Candidate Votes",
        "totalvotes": "Total Votes",
    },
    axis="columns",
)
senate_df.head()

Unnamed: 0,State,State Abbreviation,Special Election,Candidate,Party,Candidate Votes,Total Votes
0,Arizona,AZ,False,Martha McSally,republican,1135200,2384308
1,Arizona,AZ,False,Kyrsten Sinema,democrat,1191100,2384308
2,Arizona,AZ,False,Angela Green,green,57442,2384308
3,Arizona,AZ,False,,,566,2384308
4,California,CA,False,Dianne Feinstein,democrat,6019422,11113364


In [7]:
# Add each candidate's share of the total vote in their contest.
# Note: despite the column name, the value is a fraction between 0 and 1
# (e.g. 0.476113), not a number on a 0-100 scale.
senate_df = senate_df.assign(
    **{"Candidate Percent": senate_df["Candidate Votes"] / senate_df["Total Votes"]}
)
senate_df.head()

Unnamed: 0,State,State Abbreviation,Special Election,Candidate,Party,Candidate Votes,Total Votes,Candidate Percent
0,Arizona,AZ,False,Martha McSally,republican,1135200,2384308,0.476113
1,Arizona,AZ,False,Kyrsten Sinema,democrat,1191100,2384308,0.499558
2,Arizona,AZ,False,Angela Green,green,57442,2384308,0.024092
3,Arizona,AZ,False,,,566,2384308,0.000237
4,California,CA,False,Dianne Feinstein,democrat,6019422,11113364,0.541638


In [8]:
# Collapse every party label that is not exactly "republican" or "democrat"
# into "other". Rows with a missing party (e.g. write-in totals) are not in the
# major-party list either, so they are relabelled "other" as well.
# NOTE(review): a major-party candidate's votes on another party line also end up
# as "other" (see the Christopher S Murphy row in the output) - confirm this is intended.
senate_df.loc[~senate_df["Party"].isin(["republican", "democrat"]), "Party"] = "other"

# Show the rows that now carry the "other" label.
senate_df[senate_df["Party"] == "other"]

Unnamed: 0,State,State Abbreviation,Special Election,Candidate,Party,Candidate Votes,Total Votes,Candidate Percent
2,Arizona,AZ,False,Angela Green,other,57442,2384308,0.024092
3,Arizona,AZ,False,,other,566,2384308,0.000237
8,Connecticut,CT,False,Christopher S Murphy,other,37894,1386840,0.027324
9,Connecticut,CT,False,Richard Lion,other,8838,1386840,0.006373
10,Connecticut,CT,False,Jeff Russell,other,6618,1386840,0.004772
...,...,...,...,...,...,...,...,...
139,Virginia,VA,False,,other,5125,3351373,0.001529
144,West Virginia,WV,False,Rusty Hollen,other,24231,582911,0.041569
147,Wisconsin,WI,False,,other,42,2657841,0.000016
150,Wyoming,WY,False,Joseph Porambo,other,5658,203420,0.027814


In [9]:
# Preview the first five rows of the cleaned DataFrame before exporting it.
senate_df.head(n=5)

Unnamed: 0,State,State Abbreviation,Special Election,Candidate,Party,Candidate Votes,Total Votes,Candidate Percent
0,Arizona,AZ,False,Martha McSally,republican,1135200,2384308,0.476113
1,Arizona,AZ,False,Kyrsten Sinema,democrat,1191100,2384308,0.499558
2,Arizona,AZ,False,Angela Green,other,57442,2384308,0.024092
3,Arizona,AZ,False,,other,566,2384308,0.000237
4,California,CA,False,Dianne Feinstein,democrat,6019422,11113364,0.541638


In [13]:
# Write the cleaned data to disk.
# The row index is written too (pandas' default, made explicit here), so the
# file's first column is an unnamed index column when it is read back.
senate_df.to_csv(path_or_buf="data_clean/senate_overall_2018_clean.csv", index=True)