mggg · pjrule · Dec 4, 2021 · Dec 4, 2021 · Dec 4, 2021 · Dec 10, 2021
diff --git a/evaltools/evaluation/__init__.py b/evaltools/evaluation/__init__.py
@@ -6,12 +6,14 @@
 from .splits import splits, pieces
 from .population import deviations, unassigned_population
 from .contiguity import unassigned_units, contiguous
+from .coi import block_level_coi_preservation
 
 __all__ = [
     "splits",
     "pieces",
     "deviations",
     "unassigned_population",
     "unassigned_units",
-    "contiguous"
+    "contiguous",
+    "block_level_coi_preservation"
 ]
diff --git a/evaltools/evaluation/coi.py b/evaltools/evaluation/coi.py
@@ -0,0 +1,109 @@
+"""Community of interest (COI) preservation scores."""
+from typing import Dict, Sequence, Any, Callable
+from scipy.sparse import csr_matrix
+from gerrychain import Partition
+import numpy as np
+
+
+def block_level_coi_preservation(
+    unit_blocks: Dict[Any, set],
+    coi_blocks: Dict[Any, set],
+    block_pops: Dict[Any, float],
+    thresholds: Sequence[float],
+    partial_districts: bool = False
+) -> Callable[[Partition], Dict[float, float]]:
+    """Makes a COI preservation score function.
+
+    We assume that dual graph units and communities of interest can both be
+    represented (ideally losslessly) with a smaller common unit
+    (typically Census blocks). For a given partitition :math:`P`, inclusion
+    threshold :math:`t` and communities of interest :math:`C_1, \dots, C_n`,
+    we compute the preservation score 
+        .. math:: f(P, t) = \sum_{i=1}^n (\max(f_1(C_i, P, t),f_2(C_i, P, t))
+    where:
+      * :math:`f_1(C_i, P, t) = 1` when :math:`100t\%%` of the population
+        of community of interest :math:`C_i` is inside of one district in
+        :math:`P` (0 otherwise). Intuitively, :math:`f_1` captures how much
+        a community of interest is split across districts. When the typical
+        COI population is much smaller than the typical district population,
+        it is relatively easier to satisfy this criterion.
+      * :math:`f_2(C_i, P, t) = 1` when :math:`100t\%%` of the population
+        of some (ideally sized) district in :math:`P` is inside of :math:`C_i`
+        (0 otherwise).  Intuitively, :math:`f_2` captures how districts are
+        split across communities of interest (though this notion is less easy
+        to interpret than :math:`f_1`). When the typical COI population is
+        much larger than the typical district population, it is easier to
+        satisfy this criterion.
+
+    When `partial_districts` is `True`, we use an alternative formula for
+    :math:`f2`. Specifically, :math:`f_2'(C_i, P, t)` is the number of
+    districts :math:`100t\%%` contained in :math:`C_i` divided by the
+    population of :math:`C_i` (in ideal districts).
+
+    COI-unit intersection populations are precomputed, so generating the
+    score function may be slow for large dual graphs and/or large collections
+    of COIs.
+
+    :param unit_blocks: A mapping from dual graph units to the blocks
+      contained in each unit. THe key must be the same as the nodes
+      in the dual graph.
+    :param coi_blocks: A mapping from COIs to the blocks contained in
+      each COI.
+    :param block_pops: The block populations.
+    :param thresholds: The threshold values to use (ranging in [0, 1]).
+    :param partial_districts: If `True`, an alternative (non-integer-valued)
+      formula is used to compute COI preservation scores.
+    :return: An updater that computes the COI preservation score for a
+      partition for each threshold in `thresholds`.
+    """
+    # We precompute a sparse COI-unit intersection matrix.
+    node_ordering = {k: idx for idx, k in enumerate(unit_blocks.keys())}
+    unit_coi_inter_pops = np.zeros((len(coi_blocks), len(unit_blocks)))
+    for unit_idx, (unit, blocks_in_unit) in enumerate(unit_blocks.items()):
+        for coi_idx, (coi, blocks_in_coi) in enumerate(coi_blocks.items()):
+            unit_coi_inter_pops[coi_idx, unit_idx] = sum(
+                block_pops[b] for b in blocks_in_coi & blocks_in_unit)
+    unit_coi_inter_pops = csr_matrix(unit_coi_inter_pops)
+
+    coi_pops = np.array(
+        [sum(block_pops[b] for b in blocks) for blocks in coi_blocks.values()])
+    unit_pops = np.array([
+        sum(block_pops[b] for b in blocks) for blocks in unit_blocks.values()
+    ])
+    total_pop = unit_pops.sum()
+
+    def score_fn(partition: Partition) -> Dict[float, float]:
+        # Convert the assignment to a matrix encoding.
+        dist_ordering = {
+            dist: idx
+            for idx, dist in enumerate(partition.parts.keys())
+        }
+        dist_mat = np.zeros((len(unit_blocks), len(dist_ordering)))
+        for node, dist in partition.assignment.items():
+            dist_mat[node_ordering[node], dist_ordering[dist]] = 1
+
+        coi_dist_pops = unit_coi_inter_pops @ dist_mat
+        max_district_pop_in_coi = np.max(coi_dist_pops, axis=1)
+        score_by_threshold = {}
+        ideal_dist_pop = total_pop / dist_mat.shape[1]
+        coi_ideal_districts = coi_pops / ideal_dist_pop
+
+        for threshold in thresholds:
+            # f_1: At least (100 * threshold)% of the population
+            # of the COI is contained within one district.
+            f1_scores = max_district_pop_in_coi >= threshold * coi_pops
+            if partial_districts:
+                # f_2: Number of districts (100 % threshold)% contained in
+                # in C_i, divided by the number of ideal districts in C_i.
+                over_threshold = (coi_dist_pops >=
+                                  threshold * ideal_dist_pop).sum(axis=1)
+                f2_scores = over_threshold / coi_ideal_districts
+            else:
+                # f_2: At least (100 * threshold)% of the population of
+                # an ideal district is contained within the COI.
+                f2_scores = max_district_pop_in_coi >= threshold * ideal_dist_pop
+            score_by_threshold[threshold] = np.maximum(f1_scores,
+                                                       f2_scores).sum()
+        return score_by_threshold
+
+    return score_fn
diff --git a/setup.py b/setup.py
@@ -3,7 +3,8 @@
 
 requirements = [
     "pandas", "scipy", "networkx", "geopandas", "shapely", "matplotlib",
-    "gerrychain", "sortedcontainers", "gurobipy", "jsonlines", "opencv-python"
+    "gerrychain", "sortedcontainers", "gurobipy", "jsonlines",
+    "opencv-python", "scipy"
 ]
 
 setup(

diff --git a/tests/test_coi.py b/tests/test_coi.py
@@ -0,0 +1,130 @@
+from evaltools.evaluation import block_level_coi_preservation
+from gerrychain.grid import Grid
+
+
+def test_block_level_coi_preservation_small_cois():
+    n = 10
+    grid = Grid((n, n))  # default plan is 4 squares
+
+    def coord_to_blocks(x, y):
+        return {(2 * n * (2 * x)) + (2 * y), (2 * n * (2 * x)) + (2 * y) + 1,
+                (2 * n * (2 * x + 1)) + (2 * y),
+                (2 * n * (2 * x + 1)) + (2 * y) + 1}
+
+    # Let the "blocks" be the 20x20 grid.
+    unit_blocks = {(x, y): coord_to_blocks(x, y) for x, y in grid.graph.nodes}
+    block_pops = {b: 1 / 4 for b in range(20**2)}
+
+    # Let the COIs be squares of size 3 tiling a 9x9 grid contained
+    # within the 10x10 grid.
+    coi_blocks = {
+        idx: set.union(*(coord_to_blocks(x + 3 * (idx // 3), y + 3 * (idx % 3))
+                         for x in range(3) for y in range(3)))
+        for idx in range(9)
+    }
+
+    thresholds = [0.6, 0.75, 1.]
+    # dist 1 is [0, 4] x [0, 4] (inclusive)
+    # dist 2 is [0, 4] x [5, 9]
+    # dist 3 is [5, 9] x [0, 4]
+    # dist 4 is [5, 9] x [5, 9]
+    # => [60% threshold, 75% threshold, 100% threshold]
+    # [1, 1, 1] COI 0 is [0, 2] x [0, 2] -> all pop in dist 1
+    # [1, 0, 0] COI 1 is [0, 2] x [3, 5] -> 2/3 pop in dist 1, 1/3 pop in dist 2
+    # [1, 1, 1] COI 2 is [0, 2] x [6, 8] -> all pop in dist 2
+    # [1, 0, 0] COI 3 is [3, 5] x [0, 2] -> 2/3 pop in dist 1, 1/3 pop in dist 3
+    # [0, 0, 0] COI 4 is [3, 5] x [3, 5] ->
+    #  [3, 4] x [3, 4] in dist 1 -> 4/9 pop in dist 1
+    #  [3, 4] x [5, 5] in dist 2 -> 2/9 pop in dist 2
+    #  [5, 5] x [3, 4] in dist 3 -> 2/9 pop in dist 3
+    #  [5, 5] x [5, 5] in dist 4 -> 1/9 pop in dist 4
+    # [1, 0, 0] COI 5 is [3, 5] x [6, 8] -> 2/3 pop in dist 2, 1/3 pop in dist 4
+    # [1, 1, 1] COI 6 is [6, 8] x [0, 2] -> all pop in dist 3
+    # [1, 0, 0] COI 7 is [6, 8] x [3, 5] -> 2/3 pop in dist 3, 1/3 pop in dist 4
+    # [1, 1, 1] COI 8 is [6, 8] x [6, 8] -> all pop in dist 4
+    expected_scores = {0.6: 8, 0.75: 4, 1.0: 4}
+    coi_score_fn = block_level_coi_preservation(unit_blocks, coi_blocks,
+                                                block_pops, thresholds)
+    assert coi_score_fn(grid) == expected_scores
+
+
+def test_block_level_coi_preservation_large_cois():
+    n = 10
+    grid = Grid((n, n))  # default plan is 4 squares
+
+    def coord_to_blocks(x, y):
+        return {(2 * n * (2 * x)) + (2 * y), (2 * n * (2 * x)) + (2 * y) + 1,
+                (2 * n * (2 * x + 1)) + (2 * y),
+                (2 * n * (2 * x + 1)) + (2 * y) + 1}
+
+    # Let the "blocks" be the 20x20 grid.
+    unit_blocks = {(x, y): coord_to_blocks(x, y) for x, y in grid.graph.nodes}
+    block_pops = {b: 1 / 4 for b in range(20**2)}
+
+    # Let the COIs be three 10x3 horizonal strips.
+    coi_blocks = {
+        idx: set.union(*(coord_to_blocks(x, y + 3 * idx) for x in range(10)
+                         for y in range(3)))
+        for idx in range(3)
+    }
+
+    thresholds = [0.5, 0.6, 0.75, 1]
+    # dist 1 is [0, 4] x [0, 4] (inclusive)
+    # dist 2 is [0, 4] x [5, 9]
+    # dist 3 is [5, 9] x [0, 4]
+    # dist 4 is [5, 9] x [5, 9]
+    #
+    # f_1 (COI containment within district):
+    # COI 0 is [0, 9] x [0, 2] -> 1/2 in dist 1, 1/2 in dist 2
+    # COI 1 is [0, 9] x [3, 5] ->
+    #   [0, 4] x [3, 4] -> 1/3 in dist 1
+    #   [5, 9] x [3, 4] -> 1/3 in dist 2
+    #   [0, 4] x [5, 5] -> 1/6 in dist 3
+    #   [5, 9] x [5, 5] -> 1/6 in dist 4
+    # COI 2 is [0, 9] x [6, 8] -> 1/2 in dist 3, 1/2 in dist 4
+    # Thus, f_1 contributes at most 2 points with a threshold of
+    # ≤50% and 0 points otherwise.
+
+    # f_2 (district containment within COI):
+    # COI 0 is [0, 9] x [0, 2] ->
+    #   [0, 4] x [0, 2] -> contains 60% of dist 1 pop
+    #   [5, 9] x [0, 2] -> contains 60% of dist 2 pop
+    # COI 1 is [0, 9] x [3, 5] ->
+    #   [0, 4] x [3, 4] -> contains 40% of dist 1 pop
+    #   [5, 9] x [3, 4] -> contains 40% of dist 2 pop
+    #   [0, 4] x [5, 5] -> contains 20% of dist 3 pop
+    #   [5, 9] x [5, 5] -> contains 20% of dist 4 pop
+    # COI 2 is [0, 9] x [6, 8] ->
+    #   [0, 4] x [6, 8] -> contains 60% of dist 3 pop
+    #   [5, 9] x [6, 8] -> contains 60% of dist 4 pop
+    # Thus, f_2 contributes at most 2 points with a threshold of
+    # ≤60% and 0 points otherwise.
+    expected_scores = {0.5: 2, 0.6: 2, 0.75: 0, 1.0: 0}
+    score_fn = block_level_coi_preservation(unit_blocks=unit_blocks,
+                                            coi_blocks=coi_blocks,
+                                            block_pops=block_pops,
+                                            thresholds=thresholds,
+                                            partial_districts=False)
+    assert score_fn(grid) == expected_scores
+
+    # For the alternative version of f_2, we have
+    #   2 districts ≤60% contained in COI 0 / 1.2 districts' pop in COI 0
+    #     => 2/1.2 for COI 0 when threshold ≤60%, 0 otherwise
+    #   ...and similarly for COI 2.
+    expected_scores_partial_dists = {
+        0.5: 4 / 1.2,
+        0.6: 4 / 1.2,
+        0.75: 0,
+        1.0: 0
+    }
+    score_fn_partial_dists = block_level_coi_preservation(
+        unit_blocks=unit_blocks,
+        coi_blocks=coi_blocks,
+        block_pops=block_pops,
+        thresholds=thresholds,
+        partial_districts=True)
+    scores_partial_dists = score_fn_partial_dists(grid)
+    assert scores_partial_dists.keys() == expected_scores_partial_dists.keys()
+    assert all(
+        abs(expected - scores_partial_dists[t]) < 1e-10
+        for t, expected in expected_scores_partial_dists.items())