In [1]:
import pandas as pd
import numpy as np

In [2]:
# Location of the 2018 district-level results, relative to the notebook.
house_file = "Data/district_overall_2018.csv"

# Load the file untouched; later cells derive their working frames from it.
house_df_original = pd.read_csv(filepath_or_buffer=house_file)

# Preview the first five rows.
house_df_original.head(n=5)

Unnamed: 0,year,state,state_po,state_fips,state_cen,state_ic,office,district,stage,special,candidate,party,writein,mode,candidatevotes,totalvotes,unofficial,version
0,2018,California,CA,6,93,71,U.S. Representative,District 1,gen,False,Audrey Denney,democrat,False,total,263096,583188,False,20190131
1,2018,California,CA,6,93,71,U.S. Representative,District 1,gen,False,Doug La Malfa,republican,False,total,320092,583188,False,20190131
2,2018,California,CA,6,93,71,U.S. Representative,District 10,gen,False,Jeff Denham,republican,False,total,211910,443800,False,20190131
3,2018,California,CA,6,93,71,U.S. Representative,District 10,gen,False,Josh Harder,democrat,False,total,231890,443800,False,20190131
4,2018,California,CA,6,93,71,U.S. Representative,District 11,gen,False,John Fitzgerald,republican,False,total,142624,551362,False,20190131


In [3]:
# Make a copy
# Plain assignment would only bind a second name to the same DataFrame object,
# so .copy() is required for the working frame to be independent of the raw one.
house_df = house_df_original.copy()
house_df.head()

Unnamed: 0,year,state,state_po,state_fips,state_cen,state_ic,office,district,stage,special,candidate,party,writein,mode,candidatevotes,totalvotes,unofficial,version
0,2018,California,CA,6,93,71,U.S. Representative,District 1,gen,False,Audrey Denney,democrat,False,total,263096,583188,False,20190131
1,2018,California,CA,6,93,71,U.S. Representative,District 1,gen,False,Doug La Malfa,republican,False,total,320092,583188,False,20190131
2,2018,California,CA,6,93,71,U.S. Representative,District 10,gen,False,Jeff Denham,republican,False,total,211910,443800,False,20190131
3,2018,California,CA,6,93,71,U.S. Representative,District 10,gen,False,Josh Harder,democrat,False,total,231890,443800,False,20190131
4,2018,California,CA,6,93,71,U.S. Representative,District 11,gen,False,John Fitzgerald,republican,False,total,142624,551362,False,20190131


In [4]:
# Discard every column the analysis does not use; what remains is the state,
# district, candidate, party and the two vote counts.
house_df = house_df.drop(
    [
        "year",
        "state_po",
        "state_fips",
        "state_cen",
        "state_ic",
        "office",
        "stage",
        "special",
        "writein",
        "mode",
        "unofficial",
        "version",
    ],
    axis="columns",
)
house_df.head()

Unnamed: 0,state,district,candidate,party,candidatevotes,totalvotes
0,California,District 1,Audrey Denney,democrat,263096,583188
1,California,District 1,Doug La Malfa,republican,320092,583188
2,California,District 10,Jeff Denham,republican,211910,443800
3,California,District 10,Josh Harder,democrat,231890,443800
4,California,District 11,John Fitzgerald,republican,142624,551362


In [5]:
# Give the remaining columns readable, title-cased labels.
house_df = house_df.rename(
    {
        "state": "State",
        "district": "District Number",
        "candidate": "Candidate",
        "party": "Party",
        "candidatevotes": "Candidate Votes",
        "totalvotes": "Total Votes",
    },
    axis="columns",
)
house_df.head()

Unnamed: 0,State,District Number,Candidate,Party,Candidate Votes,Total Votes
0,California,District 1,Audrey Denney,democrat,263096,583188
1,California,District 1,Doug La Malfa,republican,320092,583188
2,California,District 10,Jeff Denham,republican,211910,443800
3,California,District 10,Josh Harder,democrat,231890,443800
4,California,District 11,John Fitzgerald,republican,142624,551362


In [6]:
# Keep only the second space-separated token of each district label,
# e.g. "District 10" -> "10". The result is still a string, not an integer.
district_tokens = house_df["District Number"].str.split(" ")
house_df["District Number"] = district_tokens.str.get(1)
house_df.head()

Unnamed: 0,State,District Number,Candidate,Party,Candidate Votes,Total Votes
0,California,1,Audrey Denney,democrat,263096,583188
1,California,1,Doug La Malfa,republican,320092,583188
2,California,10,Jeff Denham,republican,211910,443800
3,California,10,Josh Harder,democrat,231890,443800
4,California,11,John Fitzgerald,republican,142624,551362


In [7]:
# Share of the district's total vote won by each candidate.
# The value is a fraction between 0 and 1 (e.g. 0.45), not a 0-100 percentage.
house_df["Candidate Percent"] = house_df["Candidate Votes"].div(house_df["Total Votes"])
house_df.head()

Unnamed: 0,State,District Number,Candidate,Party,Candidate Votes,Total Votes,Candidate Percent
0,California,1,Audrey Denney,democrat,263096,583188,0.451134
1,California,1,Doug La Malfa,republican,320092,583188,0.548866
2,California,10,Jeff Denham,republican,211910,443800,0.47749
3,California,10,Josh Harder,democrat,231890,443800,0.52251
4,California,11,John Fitzgerald,republican,142624,551362,0.258676


In [8]:
# Build a "<State> <District Number>" key (e.g. "California 10") that
# identifies a district across states; it is the merge key used further down.
house_df["Congressional District"] = house_df["State"].str.cat(
    house_df["District Number"], sep=" "
)
house_df.head()

Unnamed: 0,State,District Number,Candidate,Party,Candidate Votes,Total Votes,Candidate Percent,Congressional District
0,California,1,Audrey Denney,democrat,263096,583188,0.451134,California 1
1,California,1,Doug La Malfa,republican,320092,583188,0.548866,California 1
2,California,10,Jeff Denham,republican,211910,443800,0.47749,California 10
3,California,10,Josh Harder,democrat,231890,443800,0.52251,California 10
4,California,11,John Fitzgerald,republican,142624,551362,0.258676,California 11


In [None]:
# TODO (discuss with group): should "other" parties be excluded? If they are kept, all "other" votes would have to be summed within each district.

# Change all parties other than democrat and republican to "other"
# house_df.loc[(house_df["Party"] != "republican") & (house_df["Party"] != "democrat"), "Party"] = "other"
# house_df.loc[house_df["Party"] == "other"]

In [9]:
# Republican-only view of the results: keep rows whose Party is exactly
# "republican", prefix the candidate-specific columns with the party name,
# and drop the now-constant Party column.
republican_df = (
    house_df
    .loc[house_df["Party"].eq("republican")]
    .rename(columns={
        "Candidate": "Republican Candidate",
        "Candidate Votes": "Republican Votes",
        "Candidate Percent": "Republican Percent",
    })
    .drop(columns="Party")
)
republican_df.head()

Unnamed: 0,State,District Number,Republican Candidate,Republican Votes,Total Votes,Republican Percent,Congressional District
1,California,1,Doug La Malfa,320092,583188,0.548866,California 1
2,California,10,Jeff Denham,211910,443800,0.47749,California 10
4,California,11,John Fitzgerald,142624,551362,0.258676,California 11
6,California,12,Lisa Remmer,83560,634144,0.131768,California 12
10,California,14,Cristina Osmena,110878,533646,0.207774,California 14


In [10]:
# Democrat-only view of the results: keep rows whose Party is exactly
# "democrat", prefix the candidate-specific columns with the party name,
# and drop the now-constant Party column.
# NOTE(review): the match is exact, so any differently spelled party label
# in the data would be left out -- confirm against the Party values.
democrat_df = (
    house_df
    .loc[house_df["Party"].eq("democrat")]
    .rename(columns={
        "Candidate": "Democrat Candidate",
        "Candidate Votes": "Democrat Votes",
        "Candidate Percent": "Democrat Percent",
    })
    .drop(columns="Party")
)
democrat_df.head()

Unnamed: 0,State,District Number,Democrat Candidate,Democrat Votes,Total Votes,Democrat Percent,Congressional District
0,California,1,Audrey Denney,263096,583188,0.451134,California 1
3,California,10,Josh Harder,231890,443800,0.52251,California 10
5,California,11,Mark DeSaulnier,408738,551362,0.741324,California 11
7,California,12,Nancy Pelosi,550584,634144,0.868232,California 12
8,California,13,Barbara Lee,521160,589674,0.88381,California 13


In [None]:
# DISCUSS WITH GROUP - EXCLUDE OTHER???

# Create a new DataFrame looking at Other candidates only
# (disabled until the question above is settled; every line, including the
# continuation lines of the rename call, must stay commented so the cell
# still parses if it is run)
#other_df = house_df.loc[house_df["Party"] == "other"]
#other_df = other_df.rename(columns={"Candidate": "Other Candidate",
#                                    "Candidate Votes": "Other Votes",
#                                    "Candidate Percent": "Other Percent"})
#other_df = other_df.drop(columns="Party")
#other_df.head()

In [11]:
# Merge the DataFrames

# Pair each district's Republican row with its Democrat row.
# merge() defaults to an inner join, so a district is kept only when it
# appears in BOTH party frames.
# Columns present in both frames come back suffixed _x (left, Republican) and
# _y (right, Democrat): drop the right-hand duplicates and restore the plain
# names on the left-hand ones.
house_party_df = (
    republican_df
    .merge(democrat_df, on="Congressional District")
    .drop(columns=["State_y", "District Number_y", "Total Votes_y"])
    .rename(columns={
        "State_x": "State",
        "District Number_x": "District Number",
        "Total Votes_x": "Total Votes",
    })
)


# DISCUSS WITH GROUP - EXCLUDE OTHER???
# Add Other DataFrame to merged DataFrame and clean up columns
#house_party_df = pd.merge(house_party_df, other_df, on="Congressional District")
#house_party_df = house_party_df.rename(columns={"State_x": "State", "District Number_x": "District Number", 
#                                                "Total Votes_x": "Total Votes"})
#house_party_df = house_party_df.drop(columns=["State_y", "District Number_y", "Total Votes_y"])

house_party_df.head()

Unnamed: 0,State,District Number,Republican Candidate,Republican Votes,Total Votes,Republican Percent,Congressional District,Democrat Candidate,Democrat Votes,Democrat Percent
0,California,1,Doug La Malfa,320092,583188,0.548866,California 1,Audrey Denney,263096,0.451134
1,California,10,Jeff Denham,211910,443800,0.47749,California 10,Josh Harder,231890,0.52251
2,California,11,John Fitzgerald,142624,551362,0.258676,California 11,Mark DeSaulnier,408738,0.741324
3,California,12,Lisa Remmer,83560,634144,0.131768,California 12,Nancy Pelosi,550584,0.868232
4,California,14,Cristina Osmena,110878,533646,0.207774,California 14,Jackie Speier,422768,0.792226


In [13]:
# Label each district with the party that received more votes.
# Both comparisons are strict, so a row where the two counts are equal is
# matched by neither mask and its Winner stays missing.
republican_ahead = house_party_df["Republican Votes"].gt(house_party_df["Democrat Votes"])
democrat_ahead = house_party_df["Democrat Votes"].gt(house_party_df["Republican Votes"])

house_party_df.loc[republican_ahead, "Winner"] = "republican"
house_party_df.loc[democrat_ahead, "Winner"] = "democrat"

house_party_df.head()

Unnamed: 0,State,District Number,Republican Candidate,Republican Votes,Total Votes,Republican Percent,Congressional District,Democrat Candidate,Democrat Votes,Democrat Percent,Winner
0,California,1,Doug La Malfa,320092,583188,0.548866,California 1,Audrey Denney,263096,0.451134,republican
1,California,10,Jeff Denham,211910,443800,0.47749,California 10,Josh Harder,231890,0.52251,democrat
2,California,11,John Fitzgerald,142624,551362,0.258676,California 11,Mark DeSaulnier,408738,0.741324,democrat
3,California,12,Lisa Remmer,83560,634144,0.131768,California 12,Nancy Pelosi,550584,0.868232,democrat
4,California,14,Cristina Osmena,110878,533646,0.207774,California 14,Jackie Speier,422768,0.792226,democrat


In [17]:
# Put the district identifiers first, then the Republican fields, then the
# Democrat fields, and finish with the winner.
house_party_df = house_party_df.loc[
    :,
    [
        "Congressional District",
        "State",
        "District Number",
        "Total Votes",
        "Republican Candidate",
        "Republican Votes",
        "Republican Percent",
        "Democrat Candidate",
        "Democrat Votes",
        "Democrat Percent",
        "Winner",
    ],
]
house_party_df.head()

Unnamed: 0,Congressional District,State,District Number,Total Votes,Republican Candidate,Republican Votes,Republican Percent,Democrat Candidate,Democrat Votes,Democrat Percent,Winner
0,California 1,California,1,583188,Doug La Malfa,320092,0.548866,Audrey Denney,263096,0.451134,republican
1,California 10,California,10,443800,Jeff Denham,211910,0.47749,Josh Harder,231890,0.52251,democrat
2,California 11,California,11,551362,John Fitzgerald,142624,0.258676,Mark DeSaulnier,408738,0.741324,democrat
3,California 12,California,12,634144,Lisa Remmer,83560,0.131768,Nancy Pelosi,550584,0.868232,democrat
4,California 14,California,14,533646,Cristina Osmena,110878,0.207774,Jackie Speier,422768,0.792226,democrat
