In [1]:
import os

os.chdir("..")

print(os.getcwd())

c:\Users\mquick\Documents\zeitgeist blog\projects\political_fragmentation


In [100]:
import re
import pandas as pd
import numpy as np
import seaborn as sns
import requests
from bs4 import BeautifulSoup as bs
from utils import gini
from typing import Sequence, Any

### Collect elections data from Wikipedia

* Elections from 1824 to 2020
* 2016 is only exception

In [149]:
class WikiScraper:
    
    def __init__(self, election_year: int) -> None:
        self.election_year = election_year
        
    def get_tables(self) -> list[pd.DataFrame]:
        url = f"https://en.wikipedia.org/wiki/{self.election_year}_United_States_presidential_election"
        return pd.read_html(url)
    
    def find_state_results_table(self, all_tables: list[pd.DataFrame]) -> pd.DataFrame:
        state_results_index = [
            i 
            for i in range(len(all_tables)) 
            if any(
                [re.search(r"state total|margin", str(col), re.IGNORECASE) 
                 for col in all_tables[i].columns
                 ])
            ]
        if not state_results_index:
            raise Exception
        return all_tables[state_results_index[-1]]
    
    @staticmethod
    def collapse_columns(columns: Sequence[str]) -> list[str]:
        collapsed_cols = []
        for col in columns:
             col = ' '.join(
                 set(
                     [
                         c 
                         for c in col 
                         if not c.startswith("Unnamed") 
                         and "Candidates with" not in c
                         ]
                     )
                 )
             col = col.strip()
             collapsed_cols.append(col)
        return collapsed_cols
    
    @staticmethod
    def get_candidate_names(columns: list[str]) -> list[str]:
        return [
            col.replace("%", "").strip() 
            for col in columns 
            if "%" in col
            and not re.search(r"margin", col, re.IGNORECASE)
            ]
        
    @staticmethod
    def get_candidate_results_cols(columns: list[str], candidates: list[str]) -> list[str]:
        return [
            col 
            for col in columns 
            if any([candidate in col for candidate in candidates]) 
            and ("%" in col or ("#" in col or "Vote" in col))
            ]
    
    def clean_df_columns(self, results_df: pd.DataFrame) -> pd.DataFrame:
        results_df.columns = self.collapse_columns(results_df.columns.values)
        candidates = self.get_candidate_names(results_df.columns)
        candidate_results_cols = self.get_candidate_results_cols(results_df.columns, candidates)
        results_df = results_df.rename(columns={results_df.columns[0]: "State"})
        return results_df[["State"] + candidate_results_cols]
    
    @staticmethod
    def clean_numeric_val(value: Any) -> float:
        numeric_val = re.sub(r"[^\d\.]", "", str(value))
        if not numeric_val:
            return 0
        return float(numeric_val)
    
    def clean_df_rows(self, results_df: pd.DataFrame) -> pd.DataFrame:
        results_df = results_df.dropna(subset=["State"])
        results_df = results_df[~results_df["State"].apply(lambda x: bool(re.search(r"total", x, re.IGNORECASE)) or x.isupper())]
        for col in results_df.columns[1:]:
            results_df[col] = results_df[col].apply(self.clean_numeric_val)
        return results_df
    
    def format_results_df(self, results_df: pd.DataFrame) -> pd.DataFrame:
        results_df = results_df.melt(id_vars="State", var_name="variable", value_name="value")
        results_df["result_type"] = results_df["variable"].apply(
            lambda x: 
                "votes_pct" if "%" in x 
                else "votes_total" if ("#" in x or "Vote" in x)
                else np.nan
                )
        if not results_df[results_df["result_type"].isna()].empty:
            raise Exception
        results_df["ticket"] = results_df["variable"].apply(lambda x: re.sub(r"[%#]+|Votes?\s?[cast]*", "", x).strip())
        results_df = results_df\
            .drop("variable", axis=1)\
            .pivot_table(index=["State", "ticket"], columns="result_type", values="value")\
            .reset_index()\
            .rename_axis(None, axis=1)\
            .assign(election_year = self.election_year)
        return results_df[["State", "election_year", "ticket", "votes_pct", "votes_total"]]
        
    def get_state_results(self) -> pd.DataFrame:
        all_tables = self.get_tables()
        raw_results_df = self.find_state_results_table(all_tables)
        clean_cols_df = self.clean_df_columns(raw_results_df)
        clean_df = self.clean_df_rows(clean_cols_df)
        formatted_df = self.format_results_df(clean_df)
        return formatted_df

In [151]:
elections_df = pd.concat([
    WikiScraper(year).get_state_results()
    for year in range(1824, 2024, 4)
    if year != 2016
    ])

In [159]:
elections_df.loc[(elections_df["votes_pct"]>=80) & (elections_df["election_year"]>=1948)]

Unnamed: 0,State,election_year,ticket,votes_pct,votes_total
128,Mississippi[e][91],1948,J. Strom Thurmond Dixiecrat,87.17,167538.0
92,Mississippi,1964,Barry Goldwater Republican,87.14,356528.0
153,Rhode Island,1964,Lyndon B. Johnson Democratic,80.87,315463.0
58,District of Columbia,2004,John Kerry Democratic,89.18,202970.0
56,District of Columbia,2008,Barack Obama Democratic,92.46,245800.0
40,District of Columbia,2012,Barack Obama Democratic,90.91,267070.0
40,District of Columbia,2020,Biden/Harris Democratic,92.15,317323.0


In [158]:
elections_df.to_csv("./data/usa/historical_election_state_results.csv", index=False)

### Calculate Gini coefficients

* Candidates by Gini score
* Elections by weighted average Gini score