From 44fb2e77b0bff24cc2c27d8ff3d3122802cbfd9b Mon Sep 17 00:00:00 2001
From: "Parker J. Rule" <parker.rule@tufts.edu>
Date: Fri, 3 Dec 2021 22:46:34 -0500
Subject: [PATCH 1/5] [WIP] COI preservation updater

---
 evaltools/evaluation/coi.py | 102 ++++++++++++++++++++++++++++++++++++
 setup.py                    |   3 +-
 tests/test_coi.py           |  48 +++++++++++++++++
 3 files changed, 152 insertions(+), 1 deletion(-)
 create mode 100644 evaltools/evaluation/coi.py
 create mode 100644 tests/test_coi.py

diff --git a/evaltools/evaluation/coi.py b/evaltools/evaluation/coi.py
new file mode 100644
index 00000000..39d6bbf6
--- /dev/null
+++ b/evaltools/evaluation/coi.py
@@ -0,0 +1,102 @@
+"""Community of interest (COI) preservation scores."""
+from typing import Dict, Sequence, Any, Callable
+from scipy.sparse import csr_matrix
+from gerrychain import Partition
+import numpy as np
+
+
+def block_level_coi_preservation(
+    unit_blocks: Dict[Any, Sequence[Any]],
+    coi_blocks: Dict[Any, Sequence[Any]],
+    block_pops: Dict[Any, float],
+    thresholds: Sequence[float],
+    partial_districts: bool = False
+) -> Callable[[Partition], Dict[float, float]]:
+    """Makes a COI preservation score function.
+
+    We assume that dual graph units and communities of interest can both be
+    represented (ideally losslessly) with a smaller common unit
+    (typically Census blocks). For a given partitition :math:`P`, inclusion
+    threshold :math:`t` and communities of interest :math:`C_1, \dots, C_n`,
+    we compute the preservation score 
+        .. math:: f(P, t) = \sum_{i=1}^n (\max(f_1(C_i, P, t),f_2(C_i, P, t))
+    where:
+      * :math:`f_1(C_i, P, t) = 1` when :math:`100t\%%` of the population
+        of community of interest :math:`C_i` is inside of one district in
+        :math:`P` (0 otherwise). Intuitively, :math:`f_1` captures how much
+        a community of interest is split across districts. When the typical
+        COI population is much smaller than the typical district population,
+        it is relatively easier to satisfy this criterion.
+      * :math:`f_2(C_i, P, t) = 1` when :math:`100t\%%` of the population
+        of some district in :math:`P` is inside of :math:`C_i` (0 otherwise).
+        Intuitively, :math:`f_2` captures how districts are split across
+        communities of interest (though this notion is less easy to interpret
+        than :math:`f_1`). When the typical COI population is much larger
+        than the typical district population, it is relatively easier to
+        satisfy this criterion.
+
+    When `partial_districts` is `True`, we use an alternative formula for
+    :math:`f2`. Specifically, :math:`f_2'(C_i, P, t)` is the number of
+    districts :math:`100t\%%` contained in :math:`C_i` divided by the
+    population of :math:`C_i` (in ideal districts).
+
+    COI-unit intersection populations are precomputed, so generating the
+    score function may be slow for large dual graphs and/or large collections
+    of COIs.
+
+    :param unit_blocks: A mapping from dual graph units to the blocks
+      contained in each unit. THe key must be the same as the nodes
+      in the dual graph.
+    :param coi_blocks: A mapping from COIs to the blocks contained in
+      each COI.
+    :param block_pops: The block populations.
+    :param thresholds: The threshold values to use (ranging in [0, 1]).
+    :param partial_districts: If `True`, an alternative (non-integer-valued)
+      formula is used to compute COI preservation scores.
+    :return: An updater that computes the COI preservation score for a
+      partition for each threshold in `thresholds`.
+    """
+    if min(thresholds) <= 0.5:
+        raise ValueError('Minimum inclusion threshold must be >50%.')
+
+    # We precompute a sparse COI-unit intersection matrix.
+    node_ordering = {k: idx for idx, k in enumerate(unit_blocks.keys())}
+    unit_coi_inter_pops = np.zeros((len(coi_blocks), len(unit_blocks)))
+    for unit_idx, (unit, blocks_in_unit) in enumerate(unit_blocks.items()):
+        for coi_idx, (coi, blocks_in_coi) in enumerate(coi_blocks.items()):
+            unit_coi_inter_pops[coi_idx, unit_idx] = sum(
+                block_pops[b] for b in blocks_in_coi & blocks_in_unit)
+    unit_coi_inter_pops = csr_matrix(unit_coi_inter_pops)
+
+    coi_pops = np.array(
+        [sum(block_pops[b] for b in blocks) for blocks in coi_blocks.values()])
+    unit_pops = np.array([
+        sum(block_pops[b] for b in blocks)
+        for vtd, blocks in unit_blocks.items()
+    ])
+    total_pop = unit_pops.sum()
+
+    def score_fn(partition: Partition) -> Dict[float, float]:
+        # Convert the assignment to a matrix encoding.
+        dist_ordering = {
+            dist: idx
+            for idx, dist in enumerate(partition.parts.keys())
+        }
+        dist_mat = np.zeros((len(unit_blocks), len(dist_ordering)))
+        for node, dist in partition.assignment.items():
+            dist_mat[node_ordering[node], dist_ordering[dist]] = 1
+
+        coi_dist_pops = unit_coi_inter_pops @ dist_mat
+        max_district_pop_in_coi = np.max(coi_dist_pops, axis=1)
+        score_by_threshold = {}
+        ideal_dist_pop = total_pop / dist_mat.shape[1]
+        for threshold in thresholds:
+            if not partial_districts:
+                score_by_threshold[threshold] = np.logical_or(
+                    max_district_pop_in_coi >= threshold * ideal_dist_pop,
+                    max_district_pop_in_coi >= threshold * coi_pops).sum()
+            else:
+                raise ValueError('Partial score not implemented yet.')
+        return score_by_threshold
+
+    return score_fn
diff --git a/setup.py b/setup.py
index e4f5707f..79ed880e 100644
--- a/setup.py
+++ b/setup.py
@@ -3,7 +3,8 @@
 
 requirements = [
     "pandas", "scipy", "networkx", "geopandas", "shapely", "matplotlib",
-    "gerrychain", "sortedcontainers", "gurobipy", "jsonlines", "opencv-python"
+    "gerrychain", "sortedcontainers", "gurobipy", "jsonlines",
+    "opencv-python", "scipy"
 ]
 
 setup(
diff --git a/tests/test_coi.py b/tests/test_coi.py
new file mode 100644
index 00000000..4ae85ec8
--- /dev/null
+++ b/tests/test_coi.py
@@ -0,0 +1,48 @@
+from evaltools.evaluation import block_level_coi_preservation
+from gerrychain.grid import Grid
+
+
+def test_block_level_coi_preservation_small_cois():
+    n = 10
+    grid = Grid((n, n))  # default plan is 4 squares
+
+    def coord_to_blocks(x, y):
+        return {(2 * n * (2 * x)) + (2 * y), (2 * n * (2 * x)) + (2 * y) + 1,
+                (2 * n * (2 * x + 1)) + (2 * y),
+                (2 * n * (2 * x + 1)) + (2 * y) + 1}
+
+    # Let the "blocks" be the 20x20 grid.
+    unit_blocks = {(x, y): coord_to_blocks(x, y) for x, y in grid.graph.nodes}
+    block_pops = {b: 1 / 4 for b in range(20**2)}
+
+    # As a contrived example, let the COIs be squares of size 3
+    # tiling a 9x9 grid contained within the 10x10 grid.
+    coi_blocks = {
+        idx: set.union(*(coord_to_blocks(x + 3 * (idx // 3), y + 3 * (idx % 3))
+                         for x in range(3) for y in range(3)))
+        for idx in range(9)
+    }
+
+    thresholds = [0.6, 0.75, 1.]
+    # dist 1 is [0, 4] x [0, 4] (inclusive)
+    # dist 2 is [0, 4] x [5, 9]
+    # dist 3 is [5, 9] x [0, 4]
+    # dist 4 is [5, 9] x [5, 9]
+    # => [60% threshold, 75% threshold, 100% threshold]
+    # [1, 1, 1] COI 0 is [0, 2] x [0, 2] -> all pop in dist 1
+    # [1, 0, 0] COI 1 is [0, 2] x [3, 5] -> 2/3 pop in dist 1, 1/3 pop in dist 2
+    # [1, 1, 1] COI 2 is [0, 2] x [6, 8] -> all pop in dist 2
+    # [1, 0, 0] COI 3 is [3, 5] x [0, 2] -> 2/3 pop in dist 1, 1/3 pop in dist 3
+    # [0, 0, 0] COI 4 is [3, 5] x [3, 5] ->
+    #  [3, 4] x [3, 4] in dist 1 -> 4/9 pop in dist 1
+    #  [3, 4] x [5, 5] in dist 2 -> 2/9 pop in dist 2
+    #  [5, 5] x [3, 4] in dist 3 -> 2/9 pop in dist 3
+    #  [5, 5] x [5, 5] in dist 4 -> 1/9 pop in dist 4
+    # [1, 0, 0] COI 5 is [3, 5] x [6, 8] -> 2/3 pop in dist 2, 1/3 pop in dist 4
+    # [1, 1, 1] COI 6 is [6, 8] x [0, 2] -> all pop in dist 3
+    # [1, 0, 0] COI 7 is [6, 8] x [3, 5] -> 2/3 pop in dist 3, 1/3 pop in dist 4
+    # [1, 1, 1] COI 8 is [6, 8] x [6, 8] -> all pop in dist 4
+    expected_scores = {0.6: 8, 0.75: 4, 1.0: 4}
+    coi_score_fn = block_level_coi_preservation(unit_blocks, coi_blocks,
+                                                block_pops, thresholds)
+    assert coi_score_fn(grid) == expected_scores

From b8ed6904763b5d43bcf8533ec1f6967ffec5356a Mon Sep 17 00:00:00 2001
From: "Parker J. Rule" <parker.rule@tufts.edu>
Date: Sat, 4 Dec 2021 16:24:31 -0500
Subject: [PATCH 2/5] Refine COI preservation score

* Implements partial districts variant
* Adds another test (for the case where COI size > district size)
* Refines documentation
---
 evaltools/evaluation/coi.py | 34 +++++++++------
 tests/test_coi.py           | 86 ++++++++++++++++++++++++++++++++++++-
 2 files changed, 105 insertions(+), 15 deletions(-)

diff --git a/evaltools/evaluation/coi.py b/evaltools/evaluation/coi.py
index 39d6bbf6..4507b336 100644
--- a/evaltools/evaluation/coi.py
+++ b/evaltools/evaluation/coi.py
@@ -28,11 +28,11 @@ def block_level_coi_preservation(
         COI population is much smaller than the typical district population,
         it is relatively easier to satisfy this criterion.
       * :math:`f_2(C_i, P, t) = 1` when :math:`100t\%%` of the population
-        of some district in :math:`P` is inside of :math:`C_i` (0 otherwise).
-        Intuitively, :math:`f_2` captures how districts are split across
-        communities of interest (though this notion is less easy to interpret
-        than :math:`f_1`). When the typical COI population is much larger
-        than the typical district population, it is relatively easier to
+        of some (ideally sized) district in :math:`P` is inside of :math:`C_i`
+        (0 otherwise).  Intuitively, :math:`f_2` captures how districts are
+        split across communities of interest (though this notion is less easy
+        to interpret than :math:`f_1`). When the typical COI population is
+        much larger than the typical district population, it is easier to
         satisfy this criterion.
 
     When `partial_districts` is `True`, we use an alternative formula for
@@ -56,9 +56,6 @@ def block_level_coi_preservation(
     :return: An updater that computes the COI preservation score for a
       partition for each threshold in `thresholds`.
     """
-    if min(thresholds) <= 0.5:
-        raise ValueError('Minimum inclusion threshold must be >50%.')
-
     # We precompute a sparse COI-unit intersection matrix.
     node_ordering = {k: idx for idx, k in enumerate(unit_blocks.keys())}
     unit_coi_inter_pops = np.zeros((len(coi_blocks), len(unit_blocks)))
@@ -90,13 +87,24 @@ def score_fn(partition: Partition) -> Dict[float, float]:
         max_district_pop_in_coi = np.max(coi_dist_pops, axis=1)
         score_by_threshold = {}
         ideal_dist_pop = total_pop / dist_mat.shape[1]
+        coi_ideal_districts = coi_pops / ideal_dist_pop
+
         for threshold in thresholds:
-            if not partial_districts:
-                score_by_threshold[threshold] = np.logical_or(
-                    max_district_pop_in_coi >= threshold * ideal_dist_pop,
-                    max_district_pop_in_coi >= threshold * coi_pops).sum()
+            # f_1: At least (100 * threshold)% of the population
+            # of the COI is contained within one district.
+            f1_scores = max_district_pop_in_coi >= threshold * coi_pops
+            if partial_districts:
+                # f_2: Number of districts (100 % threshold)% contained in
+                # in C_i, divided by the number of ideal districts in C_i.
+                over_threshold = (coi_dist_pops >=
+                                  threshold * ideal_dist_pop).sum(axis=1)
+                f2_scores = over_threshold / coi_ideal_districts
             else:
-                raise ValueError('Partial score not implemented yet.')
+                # f_2: At least (100 * threshold)% of the population of
+                # an ideal district is contained within the COI.
+                f2_scores = max_district_pop_in_coi >= threshold * ideal_dist_pop
+            score_by_threshold[threshold] = np.maximum(f1_scores,
+                                                       f2_scores).sum()
         return score_by_threshold
 
     return score_fn
diff --git a/tests/test_coi.py b/tests/test_coi.py
index 4ae85ec8..aeed1fea 100644
--- a/tests/test_coi.py
+++ b/tests/test_coi.py
@@ -15,8 +15,8 @@ def coord_to_blocks(x, y):
     unit_blocks = {(x, y): coord_to_blocks(x, y) for x, y in grid.graph.nodes}
     block_pops = {b: 1 / 4 for b in range(20**2)}
 
-    # As a contrived example, let the COIs be squares of size 3
-    # tiling a 9x9 grid contained within the 10x10 grid.
+    # Let the COIs be squares of size 3 tiling a 9x9 grid contained
+    # within the 10x10 grid.
     coi_blocks = {
         idx: set.union(*(coord_to_blocks(x + 3 * (idx // 3), y + 3 * (idx % 3))
                          for x in range(3) for y in range(3)))
@@ -46,3 +46,85 @@ def coord_to_blocks(x, y):
     coi_score_fn = block_level_coi_preservation(unit_blocks, coi_blocks,
                                                 block_pops, thresholds)
     assert coi_score_fn(grid) == expected_scores
+
+
+def test_block_level_coi_preservation_large_cois():
+    n = 10
+    grid = Grid((n, n))  # default plan is 4 squares
+
+    def coord_to_blocks(x, y):
+        return {(2 * n * (2 * x)) + (2 * y), (2 * n * (2 * x)) + (2 * y) + 1,
+                (2 * n * (2 * x + 1)) + (2 * y),
+                (2 * n * (2 * x + 1)) + (2 * y) + 1}
+
+    # Let the "blocks" be the 20x20 grid.
+    unit_blocks = {(x, y): coord_to_blocks(x, y) for x, y in grid.graph.nodes}
+    block_pops = {b: 1 / 4 for b in range(20**2)}
+
+    # Let the COIs be three 10x3 horizonal strips.
+    coi_blocks = {
+        idx: set.union(*(coord_to_blocks(x, y + 3 * idx) for x in range(10)
+                         for y in range(3)))
+        for idx in range(3)
+    }
+
+    thresholds = [0.5, 0.6, 0.75, 1]
+    # dist 1 is [0, 4] x [0, 4] (inclusive)
+    # dist 2 is [0, 4] x [5, 9]
+    # dist 3 is [5, 9] x [0, 4]
+    # dist 4 is [5, 9] x [5, 9]
+    #
+    # f_1 (COI containment within district):
+    # COI 0 is [0, 9] x [0, 2] -> 1/2 in dist 1, 1/2 in dist 2
+    # COI 1 is [0, 9] x [3, 5] ->
+    #   [0, 4] x [3, 4] -> 1/3 in dist 1
+    #   [5, 9] x [3, 4] -> 1/3 in dist 2
+    #   [0, 4] x [5, 5] -> 1/6 in dist 3
+    #   [5, 9] x [5, 5] -> 1/6 in dist 4
+    # COI 2 is [0, 9] x [6, 8] -> 1/2 in dist 3, 1/2 in dist 4
+    # Thus, f_1 contributes at most 2 points with a threshold of
+    # ≤50% and 0 points otherwise.
+
+    # f_2 (district containment within COI):
+    # COI 0 is [0, 9] x [0, 2] ->
+    #   [0, 4] x [0, 2] -> contains 60% of dist 1 pop
+    #   [5, 9] x [0, 2] -> contains 60% of dist 2 pop
+    # COI 1 is [0, 9] x [3, 5] ->
+    #   [0, 4] x [3, 4] -> contains 40% of dist 1 pop
+    #   [5, 9] x [3, 4] -> contains 40% of dist 2 pop
+    #   [0, 4] x [5, 5] -> contains 20% of dist 3 pop
+    #   [5, 9] x [5, 5] -> contains 20% of dist 4 pop
+    # COI 2 is [0, 9] x [6, 8] ->
+    #   [0, 4] x [6, 8] -> contains 60% of dist 3 pop
+    #   [5, 9] x [6, 8] -> contains 60% of dist 4 pop
+    # Thus, f_2 contributes at most 2 points with a threshold of
+    # ≤60% and 0 points otherwise.
+    expected_scores = {0.5: 2, 0.6: 2, 0.75: 0, 1.0: 0}
+    score_fn = block_level_coi_preservation(unit_blocks=unit_blocks,
+                                            coi_blocks=coi_blocks,
+                                            block_pops=block_pops,
+                                            thresholds=thresholds,
+                                            partial_districts=False)
+    assert score_fn(grid) == expected_scores
+
+    # For the alternative version of f_2, we have
+    #   2 districts ≤60% contained in COI 0 / 1.2 districts' pop in COI 0
+    #     => 2/1.2 for COI 0 when threshold ≤60%, 0 otherwise
+    #   ...and similarly for COI 2.
+    expected_scores_partial_dists = {
+        0.5: 4 / 1.2,
+        0.6: 4 / 1.2,
+        0.75: 0,
+        1.0: 0
+    }
+    score_fn_partial_dists = block_level_coi_preservation(
+        unit_blocks=unit_blocks,
+        coi_blocks=coi_blocks,
+        block_pops=block_pops,
+        thresholds=thresholds,
+        partial_districts=True)
+    scores_partial_dists = score_fn_partial_dists(grid)
+    assert scores_partial_dists.keys() == expected_scores_partial_dists.keys()
+    assert all(
+        abs(expected - scores_partial_dists[t]) < 1e-10
+        for t, expected in expected_scores_partial_dists.items())

From ce58ac848e690ba482b882014ad84718f5c7398a Mon Sep 17 00:00:00 2001
From: "Parker J. Rule" <parker.rule@tufts.edu>
Date: Sat, 4 Dec 2021 16:31:35 -0500
Subject: [PATCH 3/5] Tighten type annotations for unit_blocks and coi_blocks

---
 evaltools/evaluation/coi.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/evaltools/evaluation/coi.py b/evaltools/evaluation/coi.py
index 4507b336..bc2c8db4 100644
--- a/evaltools/evaluation/coi.py
+++ b/evaltools/evaluation/coi.py
@@ -6,8 +6,8 @@
 
 
 def block_level_coi_preservation(
-    unit_blocks: Dict[Any, Sequence[Any]],
-    coi_blocks: Dict[Any, Sequence[Any]],
+    unit_blocks: Dict[Any, set],
+    coi_blocks: Dict[Any, set],
     block_pops: Dict[Any, float],
     thresholds: Sequence[float],
     partial_districts: bool = False

From e37881980543d81012ca5b9f03d7a2bb99222b98 Mon Sep 17 00:00:00 2001
From: "Parker J. Rule" <parker.rule@tufts.edu>
Date: Fri, 10 Dec 2021 15:20:54 -0500
Subject: [PATCH 4/5] Update module for block_level_coi_processing

---
 evaltools/evaluation/__init__.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/evaltools/evaluation/__init__.py b/evaltools/evaluation/__init__.py
index af7ff93a..4b617328 100644
--- a/evaltools/evaluation/__init__.py
+++ b/evaltools/evaluation/__init__.py
@@ -6,6 +6,7 @@
 from .splits import splits, pieces
 from .population import deviations, unassigned_population
 from .contiguity import unassigned_units, contiguous
+from .coi import block_level_coi_preservation
 
 __all__ = [
     "splits",
@@ -13,5 +14,6 @@
     "deviations",
     "unassigned_population",
     "unassigned_units",
-    "contiguous"
+    "contiguous",
+    "block_level_coi_preservation"
 ]

From 6211b4ec56f6cbef80d4be37f11615dacf9d9812 Mon Sep 17 00:00:00 2001
From: "Parker J. Rule" <parker.rule@tufts.edu>
Date: Fri, 10 Dec 2021 16:08:33 -0500
Subject: [PATCH 5/5] Cleanup: replace .items() with .values()

---
 evaltools/evaluation/coi.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/evaltools/evaluation/coi.py b/evaltools/evaluation/coi.py
index bc2c8db4..92eb1fec 100644
--- a/evaltools/evaluation/coi.py
+++ b/evaltools/evaluation/coi.py
@@ -68,8 +68,7 @@ def block_level_coi_preservation(
     coi_pops = np.array(
         [sum(block_pops[b] for b in blocks) for blocks in coi_blocks.values()])
     unit_pops = np.array([
-        sum(block_pops[b] for b in blocks)
-        for vtd, blocks in unit_blocks.items()
+        sum(block_pops[b] for b in blocks) for blocks in unit_blocks.values()
     ])
     total_pop = unit_pops.sum()