# Virginia Analysis 
### December 2019 
### Aaron Barden, Hannah Wheelan, Hope Johnson

In [1]:
# set up libraries
import pandas as pd
import numpy as np
import os
import gerrymetrics as g
import pathlib

path = pathlib.Path.home() / "Documents" / "GitHub" / "gerryspam" / "VA"


# set up gerrymetrics variables
metric_dict = {'t_test_diff':            g.t_test_diff,
               'mean_median_diff':       g.mean_median,
               'declination':            g.declination,
               'efficiency_gap':         g.EG,
               'partisan_bias':          g.partisan_bias}
min_districts = 5
min_year=1972
competitiveness_threshold = .65 # needs to be above .5

In [2]:
base_dat_path =  path / "2019 Hypothetical Results - Gerrymetrics data.csv"
dat = pd.read_csv(base_dat_path, header=1)
dat.head()
dat = dat.rename(columns={
    "District": "District",
    "D votes": "Dem Votes-2017",
    "Dshare": "Dshare-2017", 
    "Rvotes": "GOP Votes-2017",
    "Rshare": "Rshare-2017",
    "D votes.1": "Dem Votes-2019",
    "Dshare.1": "Dshare-2019",
    "R votes.1": "GOP Votes-2019",
    "Rshare.1": "Rshare-2019",
    "affected": "affected"})

dat['Dshare-2017'] = dat['Dshare-2017'].str.replace('%', '')
dat['Dshare-2019'] = dat['Dshare-2019'].str.replace('%', '')
dat['Rshare-2017'] = dat['Rshare-2017'].str.replace('%', '')
dat['Rshare-2019'] = dat['Rshare-2019'].str.replace('%', '')

dat.head()


Unnamed: 0,District,Dem Votes-2017,Dshare-2017,R votes,Rshare-2017,Dem Votes-2019,Dshare-2019,GOP Votes-2019,Rshare-2019,affected
0,1,4639,23.81,14848,76.19,0,0.0,16748,100.0,0
1,2,13366,63.14,7803,36.86,11828,61.0,7563,39.0,0
2,3,3759,21.69,13572,78.31,0,0.0,17099,100.0,0
3,4,0,0.0,15282,100.0,8461,37.03,14389,62.97,0
4,5,0,0.0,18402,100.0,0,0.0,18490,100.0,0


In [3]:
dat_long = pd.wide_to_long(dat, ['Dshare', 'Rshare', 'Dem Votes', 'GOP Votes'], i='District', j='Year', sep='-')
dat_long[['Dshare', 'Rshare']] = dat_long[['Dshare', 'Rshare']].apply(pd.to_numeric)
dat_long['D Voteshare'] = dat_long['Dshare']/100
dat_long['Rshare'] = dat_long['Rshare']/100
dat_long = dat_long.reset_index()
dat_long['Party'] = np.where(dat_long['Dshare'] > .5, 'D', 'R')
# dat_long['Dem Votes'] = 0
# dat_long['GOP Votes'] = 0
dat_long['State'] = 'VA'
dat_long.head()


Unnamed: 0,District,Year,affected,R votes,Dshare,Rshare,Dem Votes,GOP Votes,D Voteshare,Party,State
0,1,2017,0,14848,23.81,0.7619,4639,,0.2381,D,VA
1,1,2019,0,14848,0.0,1.0,0,16748.0,0.0,R,VA
2,2,2017,0,7803,63.14,0.3686,13366,,0.6314,D,VA
3,2,2019,0,7803,61.0,0.39,11828,7563.0,0.61,D,VA
4,3,2017,0,13572,21.69,0.7831,3759,,0.2169,D,VA


## Create data sets

In [4]:
affected = dat_long['affected'] == 1
unaffected = dat_long['affected'] == 0
year_2017 = dat_long['Year'] == 2017
year_2019 = dat_long['Year'] == 2019

dat_2017 = dat_long[year_2017]
dat_2017_affected = dat_long[year_2017 & affected]
dat_2017_unaffected = dat_long[year_2017 & unaffected]

dat_2019 = dat_long[year_2019]
dat_2019_affected = dat_long[year_2019 & affected]
dat_2019_unaffected = dat_long[year_2019 & unaffected]

dat_path_2017 = path / "dat_2017.csv"
dat_path_2017_affected = path / "dat_2017_affected.csv"
dat_path_2017_unaffected = path / "dat_2017_unaffected.csv"

dat_path_2019 = path / "dat_2019.csv"
dat_path_2019_affected = path / "dat_2019_affected.csv"
dat_path_2019_unaffected = path / "dat_2019_unaffected.csv"

dat_2017.to_csv(dat_path_2017)
dat_2017_affected.to_csv(dat_path_2017_affected)
dat_2017_unaffected.to_csv(dat_path_2017_unaffected)

dat_2019.to_csv(dat_path_2019)
dat_2019_affected.to_csv(dat_path_2019_affected)
dat_2019_unaffected.to_csv(dat_path_2019_unaffected)


I manually added and slightly modified [gerrymetric](https://github.com/PrincetonUniversity/gerrymandertests/blob/master/gerrymetrics/utils.py) functions due to some errors I was receiving.

In [5]:
def parse_results(input_filepath, start_year=1948, coerce_odd_years=False):
    '''
    Read CSV of election results, return a Pandas DataFrame.
    '''
    
    df = pd.read_csv(input_filepath)
    
    df = df[df['Year'] >= start_year]
    df = df[df['Party'] != 'I'] # only include D and R wins
    
    if coerce_odd_years:
        df.loc[df['Year'] % 2 == 1, 'Year'] += 1
    
    def str_to_int(x):
        if isinstance(x, str):
            return int(x.replace(',', ''))
        else:
            return x
            
    for col in ['Dem Votes', 'GOP Votes']:
        df[col] = df[col].apply(str_to_int)
            
    grouped = df.groupby(['Year', 'State'])
    
    new = pd.DataFrame(grouped['D Voteshare'].apply(list))
    new['District Numbers'] = grouped['District'].apply(list)
    
#     if df.columns.contains('Dem Votes'):
    if 'Dem Votes' in df.columns:
        new['Weighted Voteshare'] = grouped['Dem Votes'].apply(sum) / (grouped['Dem Votes'].apply(sum) +
                                                         grouped['GOP Votes'].apply(sum))
        
    else:
        new['Weighted Voteshare'] = grouped['D Voteshare'].apply(np.mean)

    return new

In [6]:
def yearstatedf():
    '''
    Create a Pandas MultiIndex DataFrame, indexed by year and state.
    '''
    
    index = pd.MultiIndex(levels=[[], []],
                          labels=[[], []],
                          names=['Year', 'State'])

    df = pd.DataFrame(index=index, dtype=object)
    return df
    

def tests_df(tests_dict):
    '''
    Return tests dict as MultiIndex DataFrame.
    '''

    df = yearstatedf()

    for year in tests_dict:
        for state in tests_dict[year]:
            for col, val in tests_dict[year][state].items():
                if not isinstance(val, list):
                    df.at[(year, state), col] = val

    return df

In [7]:
def yearstatedf():
    '''
    Create a Pandas MultiIndex DataFrame, indexed by year and state.
    '''
    
    index = pd.MultiIndex(levels=[[], []],
                          codes=[[], []],
                          names=['Year', 'State'])

    df = pd.DataFrame(index=index, dtype=object)
    return df

In [8]:
def run_all_tests(all_results,
                  impute_val=1,
                  clip_impute=False,
                  save_unimputed=False,
                  metrics={'t_test_diff': g.t_test_diff,
                           'mean_median': g.mean_median,
                           'partisan_bias': g.partisan_bias,
                           'efficiency_gap': g.EG},
                  multimembers=None):
    '''
    Run a number of tests with parameters about how to deal with uncontested elections, return a nested dict of the results.
    
    Choices made here are for the website, gerrymander.princeton.edu, but might not be preferable in all cases.
    '''
    from collections import defaultdict
    from tqdm import tqdm
    
    np.seterr(all='ignore') # ignore warnings that come up from computing with nans.

    assert impute_val > .5 and impute_val <= 1.0, "Imputed voteshare in uncontested races must be between .5 and 1"
    
    
    tests = defaultdict(lambda: defaultdict(list))

    for year in tqdm(all_results.index.levels[0]):
        list_of_lists = [i for i in all_results.loc[year, 'D Voteshare'].values]
        national_results = np.array(sum(list_of_lists, []))
        national_results[national_results == 1] = impute_val
        national_results[national_results == 0] = 1 - impute_val

        states = all_results.loc[year].index

        for state in states:
            vs = np.array(list(all_results.loc[(year, state), 'D Voteshare']))

            if impute_val != 1:
                if clip_impute:
                    imputed = np.clip(vs, 1 - impute_val, impute_val)
                else:
                    imputed = vs.copy()
                    imputed[vs == 1] = impute_val
                    imputed[vs == 0] = 1 - impute_val
                
                if not save_unimputed:
                    vs = imputed
            else:
                imputed = vs.copy()
            
            tests[year][state] = {
                "voteshare": sum(vs) / len(vs),
                "dseats": sum(vs > 0.5),
                "seats": sum(vs > 0.5), # redundant but maybe necessary for backword compatibility
                "results": list(vs),
                "ndists": len(vs),
                "state": state,
                "year": year,
                "weighted_voteshare": all_results.loc[(year, state), 'Weighted Voteshare'],
                "district_numbers": all_results.loc[(year, state), 'District Numbers']
            }
            if multimembers is not None:
                tests[year][state]['multimember'] = state in multimembers

            for metric, f in metrics.items():
                if f.__name__ == 'bootstrap':# TODO: figure out a way to do this with decorators?
                    score = f(vs, national_results)
                    if isinstance(score, dict) and 'seat_hist' in score:
                        score['sim_seats'] = list(score['seat_hist'].values())[:-1] # figure out a way to not do this, on the website end
                else:
                    score = f(imputed)

                tests[year][state][metric] = score
                

    return tests


In [9]:
def run_tests(data, title):
    parsed_res = parse_results(data)
    out_file = tests_df(run_all_tests(parsed_res, impute_val=1, metrics=metric_dict))
    file_name = title + ".csv"
    out_path = pathlib.Path.home() / "Documents" / "GitHub" / "gerryspam" / "VA" / file_name
    out_file.to_csv(out_path)

run_tests(dat_path_2017_affected, "dat_2017")
run_tests(dat_path_2017_affected, "affected_2017")
run_tests(dat_path_2017_unaffected, "unaffected_2017")

run_tests(dat_path_2019, "statewide_2019")
run_tests(dat_path_2019_affected, "affected_2019")
run_tests(dat_path_2019_unaffected, "unaffected_2019")

100%|██████████| 1/1 [00:00<00:00, 198.90it/s]
100%|██████████| 1/1 [00:00<00:00, 347.47it/s]
100%|██████████| 1/1 [00:00<00:00, 277.38it/s]
100%|██████████| 1/1 [00:00<00:00, 254.31it/s]
100%|██████████| 1/1 [00:00<00:00, 256.08it/s]
100%|██████████| 1/1 [00:00<00:00, 228.62it/s]
