In [1]:
# import dependencies
import numpy as np
import pandas as pd

In [2]:
# Location of the raw county-level presidential returns (2000-2016),
# relative to the notebook's working directory.
file_path = "data_original/countypres_2000-2016.csv"

# Load the raw file untouched, then preview the first five rows.
pres_df_original = pd.read_csv(filepath_or_buffer=file_path)
pres_df_original.head(5)

Unnamed: 0,year,state,state_po,county,FIPS,office,candidate,party,candidatevotes,totalvotes,version
0,2000,Alabama,AL,Autauga,1001.0,President,Al Gore,democrat,4942.0,17208,20191203
1,2000,Alabama,AL,Autauga,1001.0,President,George W. Bush,republican,11993.0,17208,20191203
2,2000,Alabama,AL,Autauga,1001.0,President,Ralph Nader,green,160.0,17208,20191203
3,2000,Alabama,AL,Autauga,1001.0,President,Other,,113.0,17208,20191203
4,2000,Alabama,AL,Baldwin,1003.0,President,Al Gore,democrat,13997.0,56480,20191203


In [3]:
# Work on an independent copy so the raw frame stays intact.
# Plain assignment would only bind a second name to the same object,
# so any in-place edit of pres_df would also change pres_df_original.
pres_df = pres_df_original.copy()

In [4]:
# Remove the columns the rest of the notebook never reads.
unused_columns = ['FIPS', 'version']
pres_df = pres_df.drop(unused_columns, axis="columns")
pres_df.head()

Unnamed: 0,year,state,state_po,county,office,candidate,party,candidatevotes,totalvotes
0,2000,Alabama,AL,Autauga,President,Al Gore,democrat,4942.0,17208
1,2000,Alabama,AL,Autauga,President,George W. Bush,republican,11993.0,17208
2,2000,Alabama,AL,Autauga,President,Ralph Nader,green,160.0,17208
3,2000,Alabama,AL,Autauga,President,Other,,113.0,17208
4,2000,Alabama,AL,Baldwin,President,Al Gore,democrat,13997.0,56480


In [5]:
# Drop the election years that are not needed.
# One isin() mask replaces four separate filters: the frame is scanned once,
# and the excluded years live in a single list that is easy to edit.
# Rows whose year is not in the list (including missing years) are kept,
# exactly as with the chained `!=` filters.
years_to_drop = [2000, 2004, 2008, 2012]
pres_df = pres_df[~pres_df["year"].isin(years_to_drop)]
pres_df.head()

Unnamed: 0,year,state,state_po,county,office,candidate,party,candidatevotes,totalvotes
40517,2016,Alabama,AL,Autauga,President,Hillary Clinton,democrat,5936.0,24973
40518,2016,Alabama,AL,Autauga,President,Donald Trump,republican,18172.0,24973
40519,2016,Alabama,AL,Autauga,President,Other,,865.0,24973
40520,2016,Alabama,AL,Baldwin,President,Hillary Clinton,democrat,18458.0,95215
40521,2016,Alabama,AL,Baldwin,President,Donald Trump,republican,72883.0,95215


In [6]:
# Give every column a human-readable, title-cased label.
column_labels = {
    "year": "Year",
    "state": "State",
    "state_po": "State Abbreviation",
    "county": "County",
    "office": "Office",
    "candidate": "Candidate",
    "party": "Party",
    "candidatevotes": "Candidate Votes",
    "totalvotes": "Total Votes",
}
pres_df = pres_df.rename(columns=column_labels)
pres_df.head()

Unnamed: 0,Year,State,State Abbreviation,County,Office,Candidate,Party,Candidate Votes,Total Votes
40517,2016,Alabama,AL,Autauga,President,Hillary Clinton,democrat,5936.0,24973
40518,2016,Alabama,AL,Autauga,President,Donald Trump,republican,18172.0,24973
40519,2016,Alabama,AL,Autauga,President,Other,,865.0,24973
40520,2016,Alabama,AL,Baldwin,President,Hillary Clinton,democrat,18458.0,95215
40521,2016,Alabama,AL,Baldwin,President,Donald Trump,republican,72883.0,95215


In [7]:
# Collapse every party that is neither democrat nor republican into "other".
# Rows with a missing party match neither comparison, so they are relabelled too.
is_major_party = (pres_df["Party"] == "republican") | (pres_df["Party"] == "democrat")
pres_df.loc[~is_major_party, "Party"] = "other"

# Show the rows that now carry the "other" label.
pres_df[pres_df["Party"] == "other"]

Unnamed: 0,Year,State,State Abbreviation,County,Office,Candidate,Party,Candidate Votes,Total Votes
40519,2016,Alabama,AL,Autauga,President,Other,other,865.0,24973
40522,2016,Alabama,AL,Baldwin,President,Other,other,3874.0,95215
40525,2016,Alabama,AL,Barbour,President,Other,other,144.0,10469
40528,2016,Alabama,AL,Bibb,President,Other,other,207.0,8819
40531,2016,Alabama,AL,Blount,President,Other,other,573.0,25588
...,...,...,...,...,...,...,...,...,...
50511,2016,Alaska,AK,District 37,President,Other,other,703.0,5062
50514,2016,Alaska,AK,District 38,President,Other,other,1194.0,5095
50517,2016,Alaska,AK,District 39,President,Other,other,1092.0,5639
50520,2016,Alaska,AK,District 40,President,Other,other,895.0,4610


In [9]:
# Add each candidate's share of the county's total vote.
# The value is a fraction between 0 and 1 (it is not multiplied by 100).
candidate_votes = pres_df["Candidate Votes"]
total_votes = pres_df["Total Votes"]
pres_df["Candidate Percent"] = candidate_votes / total_votes
pres_df.head()

Unnamed: 0,Year,State,State Abbreviation,County,Office,Candidate,Party,Candidate Votes,Total Votes,Candidate Percent
40517,2016,Alabama,AL,Autauga,President,Hillary Clinton,democrat,5936.0,24973,0.237697
40518,2016,Alabama,AL,Autauga,President,Donald Trump,republican,18172.0,24973,0.727666
40519,2016,Alabama,AL,Autauga,President,Other,other,865.0,24973,0.034637
40520,2016,Alabama,AL,Baldwin,President,Hillary Clinton,democrat,18458.0,95215,0.193856
40521,2016,Alabama,AL,Baldwin,President,Donald Trump,republican,72883.0,95215,0.765457


In [10]:
# Export the cleaned data to CSV; index=False leaves the row index out of the file.
pres_df.to_csv(path_or_buf="data_clean/pres_clean.csv", index=False)