In [1]:
import os
import pandas as pd

import sys; sys.path.append("..") # Adds parent directory to python modules path.
from topdown_parsers import *

import math
import random
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import scipy as sp
import geopandas as gpd
from sklearn.linear_model import LinearRegression

from ER_noise import *

In [2]:
# Directories that are walked for run csvs (see combine_csvs); one set of runs
# without household constraints and one with them.
csv_dir_without_hhs = "er_csvs/runs_without_hhs"
csv_dir_with_hhs = "er_csvs/runs_with_hhs"
# csv that add_runoff reads and joins on its "CNTYVTD" column.
# NOTE(review): "precicnts" looks like a typo, but it presumably matches the
# file name on disk -- confirm before renaming.
runoff_filepath = "reconstructed_dallas_precicnts_w_elects.csv"

# Values handed to the plotting grids. They are strings and match the labels
# that label_split_and_budget writes into the "epsilon" and "split" columns.
epsilon_values = ["0.25", "0.5", "1", "2"]
epsilon_splits = ["equal", "bottom-heavy", "mid-heavy", "top-heavy"]

In [9]:
def label_split_and_budget(df, eps_split, eps_budget):
    """ Labels every row of `df` with the epsilon split and budget of its run.

        `eps_split` is the split token from the csv filename, one of
        "eq", "top", "mid" or "bottom". `eps_budget` is the budget token,
        one of "0pt25", "0pt5", "1" or "2". The tokens are translated into the
        string labels stored in the new "split" and "epsilon" columns.

        `df` is modified in place and is also returned.

        Raises ValueError if either token is not recognised. Both tokens are
        checked before any column is written, so a failed call leaves `df`
        untouched.
    """
    split_labels = {
        "eq": "equal",
        "top": "top-heavy",
        "mid": "mid-heavy",
        "bottom": "bottom-heavy",
    }
    budget_labels = {
        "0pt25": "0.25",
        "0pt5": "0.5",
        "1": "1",
        "2": "2",
    }

    if eps_split not in split_labels:
        raise ValueError("Split value is {}, but was expecting one of [eq, top, mid, bottom]".format(eps_split))
    if eps_budget not in budget_labels:
        raise ValueError("Budget value is {}, but was expecting one of [0pt25, 0pt5, 1, 2]".format(eps_budget))

    df["split"] = split_labels[eps_split]
    df["epsilon"] = budget_labels[eps_budget]

    return df
    
def add_runoff(runoff_filepath, df, precinct_col="Precinct"):
    """ Adds the runoff vote data in runoff_filepath to `df`.

        The csv at `runoff_filepath` is read and joined on its "CNTYVTD"
        column against `df[precinct_col]`. The join is a left merge, so every
        row of `df` is kept and the runoff columns are appended to it.

        Returns the merged DataFrame; `df` itself is not modified.
    """
    runoff = pd.read_csv(runoff_filepath)
    # Keep only the runoff rows whose precinct id appears in `df`.
    runoff = runoff[runoff["CNTYVTD"].isin(df[precinct_col])]

    # Join on the same column that was used for the filter above, so that a
    # non-default `precinct_col` is honoured by the merge as well.
    df = df.merge(runoff, how="left", left_on=precinct_col, right_on="CNTYVTD")
    return df
    
def combine_csvs(csv_dir, runoff_filepath, with_hh):
    """ Combines all the csvs in `csv_dir`, labels their budget and split, and then
        merges the `runoff_filepath` file to it to combine vote data.
        `with_hh` is a Bool that is True if the csvs are runs with Household Constraints.

        `csv_dir` is walked recursively and only files ending in ".csv" are read.
        The split and budget tokens are taken from the "_"-separated file name
        (without extension): tokens 3 and 4 when `with_hh` is True, tokens 2
        and 3 otherwise. Directories and files are visited in sorted order so
        that the row order of the result does not depend on the filesystem.

        Returns the concatenation of all labelled and merged DataFrames.

        Raises ValueError if a csv name has too few "_"-separated tokens or if
        no csv is found under `csv_dir`.
    """
    # Index of the split token in the file stem; the budget token follows it.
    split_idx = 3 if with_hh else 2

    dfs = []

    for root, dirs, files in os.walk(csv_dir):
        dirs.sort()  # sorted in place so os.walk descends in a stable order
        for file in sorted(files):
            stem, ext = os.path.splitext(file)
            if ext != ".csv":
                continue

            tokens = stem.split("_")
            if len(tokens) <= split_idx + 1:
                raise ValueError("Cannot read split and budget from file name {}: expected at least {} '_'-separated tokens".format(file, split_idx + 2))
            eps_split = tokens[split_idx]
            eps_budget = tokens[split_idx + 1]

            df = pd.read_csv(os.path.join(root, file))
            df = label_split_and_budget(df, eps_split, eps_budget)
            df = add_runoff(runoff_filepath, df)

            dfs.append(df)

    if not dfs:
        # pd.concat would fail here with an opaque "No objects to concatenate".
        raise ValueError("No .csv files found under {}".format(csv_dir))

    main_df = pd.concat(dfs)

    return main_df

def plot_er_and_point_estimates(df, epsilon_values, epsilon_splits, race, with_hh=False):
    """ Draws and saves the ER grid and the point-estimate grid for `race`.

        For each of three configurations (weighted, filtered, neither) this
        calls `plot_elect_grid` and `plot_point_estimate_grid` with `df`,
        `epsilon_values` and `epsilon_splits`, and saves the current figure
        after each call as a PNG at dpi=300 in the working directory:
            TopDown_<race>_<config>_<with_hh|without_hh>_er_Valdez.png
            TopDown_<race>_<config>_<with_hh|without_hh>_point_estimates_Valdez.png
        All open figures are closed after each configuration.

        `with_hh` only selects the wording used in titles and file names.
        Nothing is returned.
    """
    if with_hh:
        save_hh_str, title_hh_str = "with_hh", "with HH constraints"
    else:
        save_hh_str, title_hh_str = "without_hh", "without HH constraints"

    # One row per configuration:
    # (title fragment, file name fragment, weight flag, filt flag)
    configs = (
        ("weighted", "weighted", True, False),
        ("filter at thresh=10", "filt_10", False, True),
        ("no_filter", "no_filt", False, False),
    )

    for title_sett_str, save_sett_str, weight, filt in configs:
        # ER
        er_title = "ER: Votes for Valdez: TX Statewide 2018 Dem Runoff Governor | TopDown Noise with {}, {}".format(title_sett_str, title_hh_str)
        fig, axs = plot_elect_grid(
            epsilon_values,
            epsilon_splits,
            df,
            "Valdez",
            race,
            "D_18R_Governor_pct",
            "18R_Governor_D_tot",
            figsize=(20,20),
            filt=filt,
            weight=weight,
            n_samps=20,
            title=er_title,
        )
        er_path = "TopDown_{}_{}_{}_er_Valdez.png".format(race, save_sett_str, save_hh_str)
        plt.savefig(er_path, dpi=300)

        # Point estimates
        pe_title = "Point Estimates: Votes for Valdez: TX Statewide 2018 Dem Runoff Governor | TopDown Noise with {}, {}".format(title_sett_str, title_hh_str)
        fig, axs = plot_point_estimate_grid(
            epsilon_values,
            epsilon_splits,
            df,
            "Valdez",
            race,
            "2018 Dem Governor Runoff",
            "D_18R_Governor_pct",
            "18R_Governor_D_tot",
            20,  # presumably the sample count, as n_samps=20 above -- TODO confirm
            figsize=(20,20),
            filt=filt,
            weight=weight,
            title=pe_title,
        )
        pe_path = "TopDown_{}_{}_{}_point_estimates_Valdez.png".format(race, save_sett_str, save_hh_str)
        plt.savefig(pe_path, dpi=300)

        # Release both figures before moving on to the next configuration.
        plt.close(fig="all")

In [10]:
# Build one combined frame per run type, then plot every race for each of them.
df_without_hhs = combine_csvs(csv_dir_without_hhs, runoff_filepath, False)
df_with_hhs = combine_csvs(csv_dir_with_hhs, runoff_filepath, True)

# All runs without household constraints are plotted first, then those with.
for hh_flag, runs_df in ((False, df_without_hhs), (True, df_with_hhs)):
    for race_col in ("HVAP", "BVAP", "WVAP"):
        plot_er_and_point_estimates(runs_df, epsilon_values, epsilon_splits, race_col, with_hh=hh_flag)