# 2016 Target Data Ballotpedia

The target variable is web-scraped from Ballotpedia (https://ballotpedia.org).

In [None]:
import pandas as pd
import requests
import warnings
warnings.filterwarnings("ignore")

In [None]:
def get_states_of_usa():
  """Return the state names from Wikipedia's list of US states, ready for use in URLs.

  Reads the second table (index 1) of the Wikipedia page "List of states and
  territories of the United States", strips a trailing "[D]" marker from the
  names that carry one and replaces spaces with underscores.

  Returns:
    list of str, one entry per row of that table.
  """
  tables = pd.read_html("https://en.wikipedia.org/wiki/List_of_states_and_territories_of_the_United_States")
  states = tables[1]

  # The table has a two-level header; keep only the lower level
  states.columns = states.columns.droplevel()
  # NOTE(review): the key must match the header text exactly, including the
  # footnote number "[13]" -- confirm it still matches the live page
  states.rename(columns={"Flag, name andpostal abbreviation[13]": "state"}, inplace=True)

  # Some names end with the Wikipedia marker "[D]"; cut those three characters off
  has_marker = states.state.str.endswith("[D]")
  states.loc[has_marker, "state"] = states.loc[has_marker, "state"].str[:-3]

  # Underscores instead of spaces so the names can be pasted into URLs
  return states.state.str.replace(" ", "_").tolist()

In [None]:
%%time
# Fetch the state names (spaces already replaced by underscores) used to build the Ballotpedia URLs
list_of_states_temp = get_states_of_usa()

CPU times: user 188 ms, sys: 6.19 ms, total: 194 ms
Wall time: 369 ms


In [None]:
def check_url(list_of_states_temp):
  """Split states into those with a 2016 lower-chamber results page on Ballotpedia and the rest.

  A state counts as available when its "House of Representatives" page or,
  failing that, its "State Assembly" page answers with HTTP 200. The second
  page is only requested when the first one is missing.

  Args:
    list_of_states_temp: iterable of state names with underscores instead of
      spaces, as they appear in Ballotpedia URLs.

  Returns:
    Tuple (list_of_states, not_available) of two lists of state names, each in
    input order.

  Raises:
    requests.exceptions.RequestException: on connection errors or when a
      request exceeds the timeout (errors are not swallowed, so a network
      problem is never mistaken for "no election page").
  """
  # Arizona, Idaho, North / South Dakota, Washington: all districts have more than 1 seat -> delete
  # New Hampshire: because of different reasons just about 30 of total 400 seats -> delete
  # New York is available, but data is not comfortable to scrape -> by hand
  excluded = {"Arizona", "Idaho", "North_Dakota", "South_Dakota", "New_Hampshire", "New_York", "Washington"}

  base_url = "https://ballotpedia.org/"
  page_suffixes = ("_House_of_Representatives_elections,_2016", "_State_Assembly_elections,_2016")
  # Without a timeout a single stalled connection would block the loop forever
  timeout_seconds = 30

  # List for states which held elections and states which didn't hold elections
  list_of_states = []
  not_available = []

  for state in list_of_states_temp:
    if state in excluded:
      not_available.append(state)
      continue

    # any() stops at the first page that exists, saving the second request
    page_exists = any(
      requests.get(base_url + state + suffix, timeout=timeout_seconds).status_code == 200
      for suffix in page_suffixes
    )
    if page_exists:
      list_of_states.append(state)
    else:
      not_available.append(state)

  return list_of_states, not_available

In [None]:
%%time
# Keep the states whose 2016 results page exists; everything else ends up in not_available
list_of_states, not_available = check_url(list_of_states_temp)

CPU times: user 1.62 s, sys: 115 ms, total: 1.74 s
Wall time: 1min 16s


In [None]:
def _parse_vote_cell(cell, suffix_sep):
  """Convert one candidate cell of a results table into a vote count.

  Args:
    cell: cell text; whatever follows the last ": " is taken as the vote part.
    suffix_sep: separator after which trailing text is discarded
      (presumably an incumbent or party marker -- confirm against the site).

  Returns:
    0 for "No candidate", the parsed number when the vote part is numeric,
    and 1 otherwise (no vote count means the election was unopposed).
  """
  tail = cell.split(": ")[-1]
  votes = tail.split(suffix_sep)[0].replace(",", "")
  # Test the untrimmed text as well: splitting on " " would shorten
  # "No candidate" to "No" and make it look like an unopposed win
  if tail == "No candidate" or votes == "No candidate":
    return 0
  if votes.isdigit():
    return int(votes)
  return 1


def get_election_data(list_of_states):
  """Scrape the 2016 lower-chamber election results of each state from Ballotpedia.

  For every state the results table is read, multi-seat districts are removed,
  the candidate cells are reduced to vote counts and a "target" column with the
  winning party is added (0=republican, 1=democrat, 2=other). Each state name
  is printed as it is processed.

  Args:
    list_of_states: list of state names with underscores instead of spaces.

  Returns:
    dict mapping each state name to its cleaned DataFrame.
  """
  vote_columns = ["democrat", "republican", "other"]
  suffix_separators = {"democrat": " (I)", "republican": " (I)", "other": " "}
  # Winner codes: 0=republican, 1=democrat, 2=independent
  target_codes = {"republican": 0, "democrat": 1, "other": 2}

  # Dictionary to save all the finished state dataframes
  df_dictionary = {}

  for state in list_of_states:
    # Nevada's page is named "State Assembly", all others "House of Representatives"
    chamber = "State_Assembly" if state == "Nevada" else "House_of_Representatives"
    url = "https://ballotpedia.org/" + state + "_" + chamber + "_elections,_2016"

    print(state)
    website = pd.read_html(url)
    # The results are taken from table 6 of the page, for Texas from table 7
    df = website[7 if state == "Texas" else 6].copy()

    # Drop the upper level of multi index (if present) to make indexing easier
    if df.columns.nlevels > 1:
      df.columns = df.columns.droplevel()
    df.columns = [col.lower() for col in df.columns]

    # Check if there are districts with more than one seat -> exclude
    if "no. of seats" in df:
      df["no. of seats"] = df["no. of seats"].str.split().str[0]
      # copy() so the assignments below act on an independent frame, not a slice
      df = df[df["no. of seats"] == "1"].copy()

    # Add state and year columns (the year is the last four characters of the URL)
    df["state"] = state
    df["year"] = url[-4:]

    # Missing cells count as 0 votes; everything else is parsed cell by cell
    for column in vote_columns:
      separator = suffix_separators[column]
      df[column] = df[column].fillna("0").apply(_parse_vote_cell, args=(separator,)).astype("int")

    # Party with the most votes wins the seat; on ties the first column listed wins
    df["target"] = df[vote_columns].idxmax(axis=1).map(target_codes)

    df_dictionary[state] = df

  return df_dictionary

In [None]:
# Scrape one results table per state; the result is a dict {state: DataFrame}
election_data = get_election_data(list_of_states)

Alaska
Arkansas
California
Colorado
Connecticut
Delaware
Florida
Georgia
Hawaii
Illinois
Indiana
Iowa
Kansas
Kentucky
Maine
Massachusetts
Michigan
Minnesota
Missouri
Montana
Nevada
New_Mexico
North_Carolina
Ohio
Oklahoma
Oregon
Pennsylvania
Rhode_Island
South_Carolina
Tennessee
Texas
Utah
Vermont
West_Virginia
Wisconsin
Wyoming


In [None]:
# Stack the per-state frames; the dict keys (state names) become the outer index level
df_target = pd.concat(election_data, axis=0)

In [None]:
def clean_target(df):
  """Clean the concatenated per-state election results.

  The input frame is left untouched, so the function can be called repeatedly
  on the same object.

  Steps: discard the index, lowercase district and state, drop the helper
  columns "unnamed: 4_level_1" and "no. of seats" (if present), remove rows
  without a district and rows whose district contains "notes", rewrite
  Massachusetts districts to "<ordinal> <second word>", strip footnote markers
  from one Vermont and one Minnesota district, cast year and target to int
  and replace underscores in state names with spaces.

  Args:
    df: DataFrame with at least the columns district, state, year, target.

  Returns:
    A new, cleaned DataFrame.
  """
  # reset_index(drop=True) discards every index level and returns a new frame,
  # so no in-place droplevel on the caller's object is needed
  df = df.reset_index(drop=True)

  # Change all characters to lowercase
  df[["district", "state"]] = df[["district", "state"]].apply(lambda x: x.str.lower())

  # Drop additional columns (ignored when a column is not there)
  df = df.drop(["unnamed: 4_level_1", "no. of seats"], axis=1, errors="ignore")

  # Drop erroneous rows: missing district first, then the "notes" rows
  df = df[df["district"].notna()]
  df = df[~df["district"].str.contains("notes")].copy()

  # Clean Massachusetts district names: spelled-out ordinal -> short form
  numbers = {"first": "1st", "second": "2nd", "third": "3rd", "fourth": "4th", "fifth": "5th", "sixth": "6th", "seventh": "7th",
           "eighth": "8th", "ninth": "9th", "tenth": "10th", "eleventh": "11th", "twelfth": "12th", "thirteenth": "13th",
           "fourteenth": "14th", "fifteenth": "15th", "sixteenth": "16th", "seventeenth": "17th", "eighteenth": "18th",
           "nineteenth": "19th", "twentieth": "20th", "twenty-first": "21st", "twenty-second": "22nd", "twenty-third": "23rd",
           "twenty-fourth": "24th", "twenty-fifth": "25th", "twenty-sixth": "26th", "twenty-seventh": "27th", "twenty-eighth": "28th",
           "twenty-ninth": "29th", "thirtieth": "30th", "thirty-first": "31st", "thirty-second": "32nd", "thirty-third": "33rd",
           "thirty-fourth": "34th","thirty-fifth": "35th","thirty-sixth": "36th","thirty-seventh": "37th"}

  is_massachusetts = df["state"] == "massachusetts"
  if is_massachusetts.any():
    words = df.loc[is_massachusetts, "district"].str.split()
    # Only the first two words are kept
    df.loc[is_massachusetts, "district"] = words.str[0].replace(numbers) + " " + words.str[1]

  # The two-word rule truncates this name to "barnstable, dukes,"; restore it
  df.loc[df["district"] == "barnstable, dukes,", "district"] = "barnstable, dukes & nantucket"

  # Clean one district each for Vermont and Minnesota (footnote markers)
  df.loc[(df["state"] == "vermont") & (df["district"] == "windsor-orange-1[1]"), "district"] = "windsor-orange-1"
  df.loc[(df["state"] == "minnesota") & (df["district"] == "32b[6]"), "district"] = "32b"

  # Change year and target to integer
  df["year"] = df["year"].astype("int")
  df["target"] = df["target"].astype("int")

  # Delete _ from state column
  df["state"] = df["state"].str.replace("_", " ")

  return df

In [None]:
# Normalise district and state names and drop helper columns and non-district rows
df = clean_target(df_target)

In [None]:
import numpy as np

# New York is not scraped; its 150 district winners were collected by hand.
# One entry per district in district order, 0=republican, 1=democrat.
target_list = [1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0,
               0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
               1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
               1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
               1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1,
               0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 0, 1, 0,
               1, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1,
               1, 1, 1, 0, 0, 0, 0, 0, 1, 0]

# Districts are numbered 1..150; state and year are constant for every row.
# NOTE(review): district is an integer here -- confirm this mixes as intended
# with the district values of the scraped states.
district_list = np.arange(1, len(target_list) + 1)
state_list = ["new york"] * len(target_list)
year_list = [2016] * len(target_list)

new_york = pd.DataFrame(
    {
        "district": district_list,
        "state": state_list,
        "year": year_list,
        "target": target_list,
    }
)
new_york.head()

Unnamed: 0,district,state,year,target
0,1,new york,2016,1
1,2,new york,2016,0
2,3,new york,2016,0
3,4,new york,2016,1
4,5,new york,2016,0


In [None]:
# Append the hand-coded New York rows, then order by state and district.
# new_york has no democrat/republican/other columns, so those are NaN for its rows.
df = pd.concat([df, new_york]).sort_values(by=["state", "district"])

In [None]:
df.head()

Unnamed: 0,district,democrat,republican,other,state,year,target
0,1,1.0,0.0,0.0,alaska,2016,1
9,10,2021.0,5901.0,0.0,alaska,2016,0
10,11,0.0,5752.0,2681.0,alaska,2016,0
11,12,2061.0,5597.0,949.0,alaska,2016,0
12,13,0.0,1.0,0.0,alaska,2016,0


In [None]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3821 entries, 0 to 3693
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   district    3821 non-null   object 
 1   democrat    3671 non-null   float64
 2   republican  3671 non-null   float64
 3   other       3671 non-null   float64
 4   state       3821 non-null   object 
 5   year        3821 non-null   int64  
 6   target      3821 non-null   int64  
dtypes: float64(3), int64(2), object(2)
memory usage: 238.8+ KB


In [None]:
# Save the final target table to CSV; index=False keeps the row index out of the file
# NOTE(review): relative path, presumably a mounted Google Drive folder -- confirm it exists before running
df.to_csv("drive/MyDrive/US Elections/data_target_2016.csv", index=False)