In [1]:
import pandas as pd
import numpy as np

In [2]:
# Read the raw 2018 district-level results into a DataFrame and preview
# the first five rows.
house_file = "data_original/district_overall_2018.csv"
house_df_original = pd.read_csv(filepath_or_buffer=house_file)
house_df_original.head(5)

Unnamed: 0,year,state,state_po,state_fips,state_cen,state_ic,office,district,stage,special,candidate,party,writein,mode,candidatevotes,totalvotes,unofficial,version
0,2018,California,CA,6,93,71,U.S. Representative,District 1,gen,False,Audrey Denney,democrat,False,total,263096,583188,False,20190131
1,2018,California,CA,6,93,71,U.S. Representative,District 1,gen,False,Doug La Malfa,republican,False,total,320092,583188,False,20190131
2,2018,California,CA,6,93,71,U.S. Representative,District 10,gen,False,Jeff Denham,republican,False,total,211910,443800,False,20190131
3,2018,California,CA,6,93,71,U.S. Representative,District 10,gen,False,Josh Harder,democrat,False,total,231890,443800,False,20190131
4,2018,California,CA,6,93,71,U.S. Representative,District 11,gen,False,John Fitzgerald,republican,False,total,142624,551362,False,20190131


In [3]:
# Make a real copy so the raw frame loaded above stays untouched.
# A plain assignment (house_df = house_df_original) would only bind a second
# name to the same DataFrame object, so any in-place edit to house_df would
# also change house_df_original.
house_df = house_df_original.copy()
house_df.head()

Unnamed: 0,year,state,state_po,state_fips,state_cen,state_ic,office,district,stage,special,candidate,party,writein,mode,candidatevotes,totalvotes,unofficial,version
0,2018,California,CA,6,93,71,U.S. Representative,District 1,gen,False,Audrey Denney,democrat,False,total,263096,583188,False,20190131
1,2018,California,CA,6,93,71,U.S. Representative,District 1,gen,False,Doug La Malfa,republican,False,total,320092,583188,False,20190131
2,2018,California,CA,6,93,71,U.S. Representative,District 10,gen,False,Jeff Denham,republican,False,total,211910,443800,False,20190131
3,2018,California,CA,6,93,71,U.S. Representative,District 10,gen,False,Josh Harder,democrat,False,total,231890,443800,False,20190131
4,2018,California,CA,6,93,71,U.S. Representative,District 11,gen,False,John Fitzgerald,republican,False,total,142624,551362,False,20190131


In [4]:
# Remove the fields that are not carried into the cleaned dataset
unused_columns = [
    "year",
    "state_fips",
    "state_cen",
    "state_ic",
    "office",
    "stage",
    "writein",
    "mode",
    "unofficial",
    "version",
]
house_df = house_df.drop(columns=unused_columns)
house_df.head()

Unnamed: 0,state,state_po,district,special,candidate,party,candidatevotes,totalvotes
0,California,CA,District 1,False,Audrey Denney,democrat,263096,583188
1,California,CA,District 1,False,Doug La Malfa,republican,320092,583188
2,California,CA,District 10,False,Jeff Denham,republican,211910,443800
3,California,CA,District 10,False,Josh Harder,democrat,231890,443800
4,California,CA,District 11,False,John Fitzgerald,republican,142624,551362


In [5]:
# Give the remaining columns human-readable, title-cased labels
column_labels = {
    "state": "State",
    "state_po": "State Abbreviation",
    "district": "District Number",
    "special": "Special Election",
    "candidate": "Candidate",
    "party": "Party",
    "candidatevotes": "Candidate Votes",
    "totalvotes": "Total Votes",
}
house_df = house_df.rename(columns=column_labels)
house_df.head()

Unnamed: 0,State,State Abbreviation,District Number,Special Election,Candidate,Party,Candidate Votes,Total Votes
0,California,CA,District 1,False,Audrey Denney,democrat,263096,583188
1,California,CA,District 1,False,Doug La Malfa,republican,320092,583188
2,California,CA,District 10,False,Jeff Denham,republican,211910,443800
3,California,CA,District 10,False,Josh Harder,democrat,231890,443800
4,California,CA,District 11,False,John Fitzgerald,republican,142624,551362


In [6]:
# Clean up Congressional District column: keep only the district number,
# e.g. "District 10" -> "10".
# The second space-separated token is used whenever it exists (same result as
# before). Values with no space at all -- such as an already-cleaned "10" --
# are kept as they are instead of becoming NaN, so re-running this cell no
# longer wipes out the column.
district_tokens = house_df["District Number"].str.split(' ', expand=False)
house_df["District Number"] = district_tokens.str[1].fillna(district_tokens.str[0])

In [7]:
# Build a state-district identifier such as "CA-10" by joining the state
# abbreviation and the district number with a hyphen
state_codes = house_df["State Abbreviation"]
district_ids = house_df["District Number"]
house_df["Congressional District"] = state_codes + "-" + district_ids
house_df.head()

Unnamed: 0,State,State Abbreviation,District Number,Special Election,Candidate,Party,Candidate Votes,Total Votes,Congressional District
0,California,CA,1,False,Audrey Denney,democrat,263096,583188,CA-1
1,California,CA,1,False,Doug La Malfa,republican,320092,583188,CA-1
2,California,CA,10,False,Jeff Denham,republican,211910,443800,CA-10
3,California,CA,10,False,Josh Harder,democrat,231890,443800,CA-10
4,California,CA,11,False,John Fitzgerald,republican,142624,551362,CA-11


In [8]:
# Add each candidate's share of the district's total vote.
# The value is stored as a fraction (e.g. 0.451134), not multiplied by 100.
candidate_votes = house_df["Candidate Votes"]
district_votes = house_df["Total Votes"]
house_df["Candidate Percent"] = candidate_votes.div(district_votes)
house_df.head()

Unnamed: 0,State,State Abbreviation,District Number,Special Election,Candidate,Party,Candidate Votes,Total Votes,Congressional District,Candidate Percent
0,California,CA,1,False,Audrey Denney,democrat,263096,583188,CA-1,0.451134
1,California,CA,1,False,Doug La Malfa,republican,320092,583188,CA-1,0.548866
2,California,CA,10,False,Jeff Denham,republican,211910,443800,CA-10,0.47749
3,California,CA,10,False,Josh Harder,democrat,231890,443800,CA-10,0.52251
4,California,CA,11,False,John Fitzgerald,republican,142624,551362,CA-11,0.258676


In [10]:
# Put the identifier first, then the location fields, then the candidate and
# vote fields
column_order = [
    "Congressional District",
    "State",
    "State Abbreviation",
    "District Number",
    "Special Election",
    "Candidate",
    "Party",
    "Candidate Votes",
    "Total Votes",
    "Candidate Percent",
]
house_df = house_df[column_order]
house_df.head()

Unnamed: 0,Congressional District,State,State Abbreviation,District Number,Special Election,Candidate,Party,Candidate Votes,Total Votes,Candidate Percent
0,CA-1,California,CA,1,False,Audrey Denney,democrat,263096,583188,0.451134
1,CA-1,California,CA,1,False,Doug La Malfa,republican,320092,583188,0.548866
2,CA-10,California,CA,10,False,Jeff Denham,republican,211910,443800,0.47749
3,CA-10,California,CA,10,False,Josh Harder,democrat,231890,443800,0.52251
4,CA-11,California,CA,11,False,John Fitzgerald,republican,142624,551362,0.258676


In [12]:
# Write the cleaned table to CSV, leaving out the DataFrame index
clean_file = "data_clean/district_overall_2018_clean.csv"
house_df.to_csv(clean_file, index=False)