In [1]:
import math
import random
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from toydown import ToyDown

# Build the ToyDown tree from the Dallas County block shapefile, using the
# "GEOID10" and "TOTPOP10" columns.
filename = "Blocks/dallas_county_blocks10.shp"
toy = ToyDown(filename, "GEOID10", "TOTPOP10")

# NOTE(review): allow_pickle=True unpickles arbitrary objects from the file;
# only load archives that come from a trusted source.
recom_archive = np.load("100_recom_tract_parts_comp.npy", allow_pickle=True)
tract_recom = recom_archive[0]["dicts"]

# Only the first plan of the single district type is evaluated here.
district_type, ds = "tract_recom", tract_recom
print("Starting {} calculations".format(district_type), flush=True)
var = toy.assign_district_tree_variance(ds[0])
print(var)

Starting tract_recom calculations
Node(tag=48, identifier=48, data=GeoUnit(name=48, unnoised_pop=2368139, parent=None, identifier=48, level=0))
400.48350313213473


In [None]:
# Display the payload stored on the root node of the tree.
root_node = toy.get_node(toy.root)
root_node.data

In [None]:
# This cell originally held only the unfinished expression `toy.` (a
# tab-completion stub), which is a SyntaxError under Restart & Run All.
# List the tree's public attributes explicitly instead.
[attr for attr in dir(toy) if not attr.startswith("_")]

In [None]:
# Four-entry synthetic hierarchy, one tuple per level.
# NOTE(review): the tuple fields look like (level name, branching factor,
# population per unit) -- confirm against ToyDown.from_hierarchy.
hierarchy = [
    ("Country", 1, 810.),
    ("State", 3, 270.),
    ("County", 3, 90.),
    ("Dist", 3, 30.),
]

eps_budget = 0.2
# One equal split entry per hierarchy tuple.
eps_splits = [0.25] * len(hierarchy)

toy = ToyDown.from_hierarchy(hierarchy, eps_budget, eps_splits)
toy.show()

In [3]:
# This is where everything from Proposition 1 happens.
# NOTE(review): five split entries are supplied here; confirm that this matches
# the number of levels of whichever tree `toy` is bound to when the cell runs.
eps_splits = [0.2] * 5
eps_budget = 0.2

toy.set_eps_budget_and_splits(eps_budget, eps_splits)
toy.noise_and_adjust()

In [None]:
# Inspect every attribute stored on a node's data object
# (here the node with identifier "CountryState3").
state_node = toy.get_node("CountryState3")
vars(state_node.data)

In [None]:
# Display the notebook's repr of the current `toy` tree object.
toy

In [None]:
# Display the first entry of `tract_recom` (the same entry passed to
# toy.assign_district_tree_variance in the first cell).
tract_recom[0]

In [None]:
# Numerical check of Proposition 1 for the node "CountryState1":
#   error(node) == noise(node) - mean(noise of the three states)
#                  + error("Country") / branching
branching = 3
state_noises = [
    toy.get_node(state_name).data.noise
    for state_name in ("CountryState1", "CountryState2", "CountryState3")
]
exp_1 = state_noises[0]
exp_2 = sum(state_noises) / branching
exp_3 = toy.get_node("Country").data.error / branching

# The cell's displayed value is True when both sides agree within abs_tol.
math.isclose(
    toy.get_node("CountryState1").data.error,
    exp_1 - exp_2 + exp_3,
    abs_tol=0.00001,
)

In [None]:
# 6-level tree: Country > State > County > Tract > BlockGroup > Block.
# NOTE(review): the tuple fields look like (level name, branching factor,
# population per unit) -- confirm against ToyDown.from_hierarchy.
hierarchy = [("Country", 1, 900000.),
             ("State", 3, 300000.),
             ("County", 3, 100000.),
             ("Tract", 10, 10000.),
             ("BlockGroup", 10, 1000.),
             ("Block", 10, 100.)]

eps_budget = 0.4998  # I picked a random number from one of Peter's runs
eps_splits = [0, 0, 0.25, 0.25, 0.25, 0.25]

# `ToyModel` is never imported or defined in this notebook, so the original
# call raised NameError on a fresh kernel. Build the tree with the same
# constructor the four-level example uses.
toy = ToyDown.from_hierarchy(hierarchy, eps_budget, eps_splits)
toy.noise_and_adjust()
toy.show()

In [None]:
# First, a sanity check that the equation at the end of Prop 4 works
# We will do this check on CountryState1County1Tract1
p = 0.5
dist_blocks = []
other_blocks = []
tract = toy.get_node("CountryState1County1Tract1")
block_groups = toy.children(tract.identifier)

def set_diff(a, b):
    """Return the elements of ``a`` that are not in ``b`` as a list (order not preserved)."""
    return list(set(a).difference(set(b)))

for bg in block_groups:
    blocks = toy.children(bg.identifier)
    # Draw a share p of *this* block group's blocks; the size is derived from
    # len(blocks) rather than a hard-coded 10 so the sampled fraction stays p
    # for any block-group size.
    chosen_blocks = np.random.choice(blocks, int(p * len(blocks)), replace=False)

    dist_blocks.extend(chosen_blocks)
    other_blocks.extend(set_diff(blocks, chosen_blocks))

dist_blk_noises = [blk.data.noise for blk in dist_blocks]
other_blk_noises = [blk.data.noise for blk in other_blocks]
blk_errors = [blk.data.error for blk in dist_blocks]

# Displayed value is True when
#   (1-p) * sum(noise of chosen) - p * sum(noise of the rest) + p * error(tract)
# matches the summed error of the chosen blocks within abs_tol.
math.isclose((1-p) * sum(dist_blk_noises) - p * sum(other_blk_noises) + p * tract.data.error,
             sum(blk_errors),
             abs_tol = 0.000001)

In [None]:
# get blk errors for p from 0 to 1
def get_district_errors_at_p(p, tract_name):
    """Total the errors of a randomly drawn share ``p`` of a tract's blocks.

    For every block group under the tract ``tract_name`` (looked up in the
    notebook-level ``toy`` tree), ``int(p * number_of_blocks_in_group)`` blocks
    are drawn without replacement via ``np.random.choice`` and the
    ``data.error`` values of all drawn blocks are accumulated.

    Args:
        p: Share of each block group's blocks to draw.
        tract_name: Identifier of the tract node, e.g.
            "CountryState1County1Tract1".

    Returns:
        Tuple ``(signed_total, absolute_total)``: the sum of the drawn blocks'
        errors and the sum of their absolute values.
    """
    tract = toy.get_node(tract_name)

    dist_blocks = []
    for bg in toy.children(tract.identifier):
        blocks = toy.children(bg.identifier)
        # Size the draw from the actual group size instead of a hard-coded 10,
        # which raised ValueError (replace=False) for smaller block groups.
        n_chosen = int(p * len(blocks))
        dist_blocks.extend(np.random.choice(blocks, n_chosen, replace=False))

    raw_errors = [blk.data.error for blk in dist_blocks]
    abs_errors = [abs(err) for err in raw_errors]

    return sum(raw_errors), sum(abs_errors)

In [None]:
# Evaluate the signed and absolute error totals at eleven evenly spaced
# shares between 0 and 1 for one fixed tract.
ps = np.linspace(0, 1, 11)
error_pairs = [
    get_district_errors_at_p(share, "CountryState1County1Tract1")
    for share in ps
]
raw_error_sums = [pair[0] for pair in error_pairs]
abs_error_sums = [pair[1] for pair in error_pairs]

In [None]:
# Signed error total of the sampled blocks against the sampled share p.
fig, ax = plt.subplots()
ax.plot(ps, raw_error_sums)

In [None]:
# We are probably more interested in the absolute value of the errors than in
# the signed sum of errors, so plot the absolute totals against p as well.
fig, ax = plt.subplots()
ax.plot(ps, abs_error_sums)