In [9]:
import os
import glob
import pandas as pd
import networkx as nx
import geopandas as gpd
from gerrychain import Graph
from fiona.errors import DriverError

In [10]:
# States to process in this notebook. A longer state list from an earlier
# revision is kept here for reference:
#   'AR  AZ  CA  DE  FL  GA  IL  LA  MD  MS  NC  NM  NV  NY  SC  TN  TX  VA  NJ'
chen_all_states = 'CA  FL  NJ  NY'.split()
# States for which an EI join is attempted (only when `chen_data_enabled`).
chen_ei_states = 'AL  AZ  DE  GA  LA  MS  NM  NY  TN  VA AR  CA  FL  IL  MD  NC  NV  SC  TX'.split()

# Chen's data is not included in this repository due to its size.
# Data is available from http://www-personal.umich.edu/~jowei/race/
# (sections 3 and 4)
# Set the CHEN_DATA_ROOT environment variable to point at a local copy; the
# default is the original author's path, so existing setups keep working.
chen_data_root = os.environ.get('CHEN_DATA_ROOT', '/Users/pjrule/data/chen-data')
chen_votes_root = os.path.join(chen_data_root, 'votes')
chen_ei_root = os.path.join(chen_data_root, 'ei')

In [11]:
# We ultimately did not use Chen's EI and vote data for our experiments in Figures 2 and 3.
# While this flag is False, the processing loop below skips both the EI join
# and the Obama/Romney vote join, so the Chen data roots are never read.
chen_data_enabled = False

In [12]:
# Column names in Chen's EI files -> names used on graph nodes
# (the processing loop stores them with a `chen_` prefix).
# Insertion order matters: it fixes the column order selected from the EI file.
chen_columns = dict(
    BRvotes='black_repub',
    BDvotes='black_dem',
    HRvotes='hispanic_repub',
    HDvotes='hispanic_dem',
    ORvotes='other_repub',
    ODvotes='other_dem',
)

In [13]:
# Per-state GEOID10 values of nodes that are dropped from the graph
# before any data is joined.
remove_nodes = {
    'FL': {'120879801001'},  # island (pop. 20)
    'NY': {'360610001001'},  # island (pop. 2)
}

In [14]:
# Districting levels, in the order the seed graphs are looked up.
levels = ('CD', 'SD', 'HD')
# Output-file suffix per level: the level itself plus every level before it,
# i.e. 'CD', 'CD_SD', 'CD_SD_HD'.
suffixes = {
    level: '_'.join(levels[:position])
    for position, level in enumerate(levels, start=1)
}

In [15]:
# For these states the processing loop loads a second graph from
# Seeding-Division-Splits/Output/extra/<state>.json and copies its
# `SEED_STATE_HOUSE` value onto each node as `SPLITS_SEED_HD` (matched by GEOID10).
extra_hd_states = ['NY', 'CA', 'FL']  # use alternate seed plans for HD

In [16]:
# Build one processed graph per state:
#   1. load the seed graph for the last level in `levels` that has a file,
#   2. optionally join alternate HD seeds (`extra_hd_states`),
#   3. drop the nodes listed in `remove_nodes`,
#   4. renumber district assignments to 1..n_districts,
#   5. optionally join Chen's EI and Obama/Romney data (`chen_data_enabled`),
#   6. relabel nodes 0..n-1 and write `<state>_<suffix>.json`.
seed_root = os.path.join('Seeding-Division-Splits', 'Output')
output_dir = os.path.join('Seeding-Division-Splits', 'processed')
# Make sure the destination exists so the final write cannot fail on a
# fresh checkout.
os.makedirs(output_dir, exist_ok=True)

for state in chen_all_states:
    print(state)

    # Reset per-state results first. Without this, a state with no seed file
    # would silently reuse (and re-export under its own name) the graph left
    # over from the previous iteration.
    state_graph = None
    suffix = None
    # Search from the last level backwards and stop at the first file found.
    # This selects the same graph as loading every level in order and keeping
    # the last success, but parses only one file.
    for level in reversed(levels):
        try:
            state_graph = Graph.from_json(
                os.path.join(seed_root, f'{level}_seed', f'{state}.json')
            )
        except FileNotFoundError:
            continue
        suffix = suffixes[level]
        break
    if state_graph is None:
        print(f'No seed graph found for {state}. Skipping...')
        continue

    if state in extra_hd_states:
        print('Loading auxiliary graph...')
        suffix = suffixes['HD']
        state_hd_graph = Graph.from_json(
            os.path.join(seed_root, 'extra', f'{state}.json')
        )
        # Join HD column with existing state graph (keyed on GEOID10).
        geoid_to_hd = {
            attrs['GEOID10']: attrs['SEED_STATE_HOUSE']
            for _, attrs in state_hd_graph.nodes(data=True)
        }
        for _, attrs in state_graph.nodes(data=True):
            attrs['SPLITS_SEED_HD'] = geoid_to_hd[attrs['GEOID10']]

    # Drop the nodes explicitly excluded for this state.
    excluded_geoids = remove_nodes.get(state, set())
    nodes_to_remove = [
        node
        for node, attrs in state_graph.nodes(data=True)
        if attrs['GEOID10'] in excluded_geoids
    ]
    state_graph.remove_nodes_from(nodes_to_remove)
    if nodes_to_remove:
        print('removing', nodes_to_remove)

    # GEOID10 -> live node-attribute dict; writes below mutate the graph.
    nodes_by_geoid = {
        attrs['GEOID10']: attrs
        for _, attrs in state_graph.nodes(data=True)
    }
    graph_bg = set(nodes_by_geoid)

    # Map assignments to range [1, n_districts].
    for level in levels:
        if level in suffix:
            assign_col = f'SPLITS_SEED_{level}'
            districts = {
                attrs[assign_col] for _, attrs in state_graph.nodes(data=True)
            }
            district_map = {
                dist: idx + 1
                for idx, dist in enumerate(sorted(districts))
            }
            for _, attrs in state_graph.nodes(data=True):
                attrs[assign_col] = district_map[attrs[assign_col]]

    # Add EI data.
    if chen_data_enabled and state in chen_ei_states:
        try:
            chen_df = pd.read_csv(os.path.join(chen_ei_root, f'{state}.txt'), sep='\t')
            chen_df = chen_df.rename(columns=chen_columns)
            chen_df = chen_df[['geoid'] + list(chen_columns.values())]
            # The first 12 characters of the id are used as the join key
            # against GEOID10. `.assign` avoids writing into a sliced frame.
            chen_df = chen_df.assign(bg=chen_df['geoid'].astype(str).str.slice(stop=12))
            chen_bg_df = chen_df.groupby(by=['bg']).sum().drop(columns=['geoid'])
            chen_bg_df = chen_bg_df[~chen_bg_df.index.isin(excluded_geoids)]

            chen_bg = set(chen_bg_df.index)
            # Every EI unit must exist in the graph, and graph units missing
            # from the EI data must be unpopulated. Both checks run before
            # any node is modified, so a failed join leaves the graph clean.
            assert not chen_bg - graph_bg
            assert sum(nodes_by_geoid[geoid]['TOTPOP'] for geoid in graph_bg - chen_bg) == 0

            for bg, row in chen_bg_df.iterrows():
                for k, v in row.items():
                    nodes_by_geoid[bg][f'chen_{k}'] = v
            for bg in graph_bg - chen_bg:
                for col in list(chen_columns.values()):
                    nodes_by_geoid[bg][f'chen_{col}'] = 0
            suffix += '_ei'
        except AssertionError:
            print("Couldn't join EI data.")

    # Add Obama/Romney vote data.
    if chen_data_enabled:
        try:
            votes_df = gpd.read_file(os.path.join(chen_votes_root, f'{state}.dbf'))
            if 'Obama' in votes_df.columns and 'Romney' in votes_df.columns:
                votes_df = votes_df[['block', 'Obama', 'Romney']]
                votes_df = votes_df.assign(
                    bg=votes_df['block'].astype(str).str.slice(stop=12)
                )
                # Aggregate only the vote columns. Summing `block` as well
                # would attach a meaningless `block` attribute to matched
                # nodes that the zero-filled nodes below never receive.
                votes_df = votes_df.groupby(by=['bg'])[['Obama', 'Romney']].sum()
                votes_df = votes_df[~votes_df.index.isin(excluded_geoids)]

                votes_bg = set(votes_df.index)
                print('votes_bg', len(votes_bg))
                print('graph_bg', len(graph_bg))
                print('votes_bg - graph_bg', len(votes_bg - graph_bg))
                print('graph_bg - votes_bg', len(graph_bg - votes_bg))
                # Unlike the EI join, a mismatch here is not caught and
                # aborts the run.
                assert not votes_bg - graph_bg
                assert sum(nodes_by_geoid[geoid]['TOTPOP'] for geoid in graph_bg - votes_bg) == 0

                for bg, row in votes_df.iterrows():
                    for k, v in row.items():
                        nodes_by_geoid[bg][k] = v
                for bg in graph_bg - votes_bg:
                    for col in ('Obama', 'Romney'):
                        nodes_by_geoid[bg][col] = 0
                suffix += '_votes'
        except DriverError:
            print('Vote data not found. Skipping...')

    # Relabel nodes to consecutive integers (node removal can leave gaps).
    state_graph = nx.relabel_nodes(
        state_graph,
        {n: idx for idx, n in enumerate(state_graph.nodes)}
    )
    state_graph.to_json(os.path.join(output_dir, f'{state}_{suffix}.json'))

CA
Loading auxiliary graph...
FL


  "Found islands (degree-0 nodes). Indices of islands: {}".format(islands)


Loading auxiliary graph...
removing [914]
NJ
NY


  "Found islands (degree-0 nodes). Indices of islands: {}".format(islands)


Loading auxiliary graph...
removing [3053]
