In [139]:
import matplotlib.pyplot as plt # for plotting maps
import maup # mggg's library for proration, see documentation here: https://github.com/mggg/maup
import pandas as pd # standard python data library
import geopandas as gp # the geo-version of pandas
import numpy as np 
from statistics import mean, median
from functools import reduce
import matplotlib.pyplot as plt
from textwrap import wrap
from collections import defaultdict
from tqdm.notebook import tqdm

from IPython.display import clear_output

%matplotlib inline

# Let pandas display up to 150 rows / 150 columns before truncating a frame.
pd.set_option("display.max_rows", 150)
pd.set_option("display.max_columns", 150)
# 0 disables matplotlib's warning about too many open figures.
plt.rcParams.update({'figure.max_open_warning': 0})

# Directory prefix under which the raw input files live.
DATA_PATH = "raw-from-source/"
# CRS assigned to the geometries in Part 2.
# NOTE(review): 3857 is a metre-based projection, while the coordinates displayed
# in this notebook look like lon/lat degrees — confirm this is the intended CRS.
CRS = 3857

### General Summary:


#### VEST Documentation

-----
Election results from IA Secretary of State: https://sos.iowa.gov/elections/results/index.html
Precinct shapefile from IA Secretary of State: https://sos.iowa.gov/shapefiles/

<font color="Coral">The following precincts were merged to match the 2016 election reports:</font>


Appanoose: Udell/Union

Black Hawk: Cedar Falls W2P2/Cedar Falls Twp

Des Moines: Burlington 1/Tama, Burlington 8/Concordia

Dickinson: Precinct 6/Precinct 7

Fremont: Hamburg/Washington, Farragut/Shenandoah1

Polk: Grimes 2/Urbandale 12

##### <font color="Coral">Offices:</font>


G16PRERTRU - Donald J. Trump (Republican Party)

G16PREDCLI - Hillary Clinton (Democratic Party)

G16PRELJOH - Gary Johnson (Libertarian Party)

G16PREGSTE - Jill Stein (Green Party)

G16PRECCAS - Darrell Castle (Constitution Party)

G16PREOVAC - Dan Vacek (Legal Marijuana Now Party)

G16PREOKAH - Lynn Kahn (New Independent Party Iowa)

G16PREIMCM - Evan McMullin (Independent)

G16PREIFUE - Rocky De La Fuente (Independent)

G16PREORIV - Gloria La Riva (Party for Socialism and Liberation)

G16PREOWRI - Write-in Votes


G16USSRGRA - Chuck Grassley (Republican Party)

G16USSDJUD - Patty Judge (Democratic Party)

G16USSLALD - Charles Aldrich (Libertarian Party)

G16USSOHEN - Jim Hennager (New Independent Party Iowa)

G16USSILUI - Michael Luick-Thrams (Independent)

G16USSOWRI - Write-in Votes



#### VEST Data

----

VEST dataframe, **ia_2016.shp**, downloaded [here](https://dataverse.harvard.edu/dataverse/electionscience/?q=vt_2018).


#### Raw Election Data

----
Election results pulled from .pdf file that was located by navigating to [this site](https://sos.iowa.gov/elections/results/index.html). 

#### Raw Shapefile Data

---- 
Raw shapefile data was found by navigating to [this site](https://sos.iowa.gov/shapefiles/) and downloading the data named "Statewide Precinct Layer". 

#### Summary

----
We are able to validate that all election results and all shapefile shapes match between VEST's reported data and the raw data. 

<font color="Coral">The following VEST precinct shapefiles are composed of 2 or more raw precinct shapefiles:</font>

Cedar Falls ward 2 precinct 2/CF Twp 
 
Burlington 1-T 
 
Burlington 8-C 
 
Dickinson 6/7 
 
Farragut 
 
Grimes Precinct 2 
 
Hamburg/Washington 
 
Union/Udell 

### Part 1: Election Results

#### Read in files:

In [2]:
# Load VEST's precinct shapefile and the Secretary of State results workbook.
# Paths are built from DATA_PATH so the data directory is configured in one place.
master_vest_df = gp.read_file(DATA_PATH + "vest/ia_2016.shp")
master_sos_df = pd.read_excel(DATA_PATH + "sos/statewide.xlsx")

In [3]:
# Work on a copy so the frame read from disk (master_vest_df) stays untouched.
vest_df = master_vest_df.copy()
vest_df

Unnamed: 0,COUNTY,DISTRICT,NAME,G16PRERTRU,G16PREDCLI,G16PRELJOH,G16PREGSTE,G16PRECCAS,G16PREOVAC,G16PREOKAH,G16PREIMCM,G16PREIFUE,G16PREORIV,G16PREOWRI,G16USSRGRA,G16USSDJUD,G16USSLALD,G16USSOHEN,G16USSILUI,G16USSOWRI,geometry
0,Dickinson,DICKINSON8,Dickinson 8,1281,512,61,12,6,3,1,9,2,0,18,1403,405,40,21,3,1,"POLYGON ((-95.15024 43.25548, -95.15790 43.255..."
1,Dickinson,DICKINSON9,Dicksinon 9,83,24,6,0,0,0,0,0,0,0,0,89,16,0,1,2,0,"POLYGON ((-95.22636 43.34245, -95.22637 43.342..."
2,Clay,LAFR,Lake/Freeman,204,83,9,1,1,0,0,3,0,0,0,212,73,11,0,1,0,"POLYGON ((-95.03285 43.25530, -95.02263 43.255..."
3,Clay,SRSM,Summit/Riverton/Sioux/Meadow,610,179,31,1,2,1,1,6,0,0,4,610,160,28,9,4,1,"POLYGON ((-95.26926 43.25537, -95.26140 43.255..."
4,Clay,WFLT,Waterford/Lone Tree,396,99,15,3,0,0,0,4,0,0,4,418,72,11,6,4,0,"POLYGON ((-95.38809 43.08415, -95.38808 43.084..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1675,Des Moines,BURL8,Burlington 8-C,823,808,52,10,9,3,3,14,2,2,18,931,726,43,6,4,2,"POLYGON ((-91.09161 40.78979, -91.09125 40.786..."
1676,Dickinson,DICKINSON6/7,Dickinson 6/7,272,125,14,2,1,0,0,1,0,0,5,294,100,16,4,0,0,"POLYGON ((-94.91399 43.34146, -94.91398 43.338..."
1677,Fremont,HAMBURG,Hamburg/Washington,432,189,16,2,7,1,3,7,1,0,4,475,138,15,10,3,0,"POLYGON ((-95.77414 40.64447, -95.77416 40.646..."
1678,Fremont,FAR,Farragut,450,184,17,3,2,2,1,2,2,0,11,475,151,23,13,1,0,"POLYGON ((-95.37392 40.58050, -95.37395 40.580..."


#### Pivot and rename raw election data to better match VEST:

In [4]:
# Work on a copy so master_sos_df stays exactly as read from disk.
sos_df = master_sos_df.copy()
# Keep only the first 17 rows.
# NOTE(review): presumably the President and U.S. Senate candidate rows — confirm against the workbook.
sos_df = sos_df.iloc[:17]
# Drop the polling/absentee breakdown columns, the party columns and the race title.
sos_df = sos_df[[col for col in sos_df if "Polling" not in col and "Absentee" not in col \
                 and col != "RaceTitle" and "Party" not in col]]

# Easy pivot, just flip rows and columns
sos_df = sos_df.T

# The first transposed row holds the candidate names: promote it to the header.
sos_df.columns = sos_df.iloc[0]
sos_df.drop(["CandidateName"], inplace=True)
sos_df.reset_index(inplace=True)
sos_df.rename(columns={"index" : "NAME", "Charles E. Grassley " : "Chuck E. Grassley"}, inplace=True)

display(sos_df)

# Each mapping line is "<VEST column code>,<candidate name>"; build name -> code.
# The file is opened in a `with` block so the handle is closed, and blank lines are skipped.
can2vest = {}
with open(DATA_PATH + "can2vest.txt", "r") as mapping_file:
    for line in mapping_file:
        if not line.strip():
            continue
        fields = line.split(',')
        can2vest[fields[1].strip()] = fields[0].strip()

columns = ["NAME"]
seen_before = False
for col in sos_df:
    if "Write-in" in col:
        # The first write-in column belongs to the presidential race, the second to the Senate race.
        if seen_before:
            columns.append("G16USSOWRI")
        else:
            seen_before = True
            columns.append("G16PREOWRI")

        continue

    # Match on the candidate's first name; stop at the first hit so a source column
    # can never contribute more than one VEST code.
    name = col.split()[0]
    for can in can2vest:
        if name in can:
            columns.append(can2vest[can])
            break

print(columns)

sos_df.columns = columns
# Literal (non-regex) replacements: strip the " Total" suffix and turn dashes into spaces.
sos_df["NAME"] = sos_df["NAME"].str.replace(" Total", "", regex=False)
sos_df["NAME"] = sos_df["NAME"].str.replace("-", " ", regex=False)

# Vote columns come out of the transpose as generic objects; cast them to integers.
for col in sos_df:
    if "G16" not in col:
        continue
    sos_df[col] = sos_df[col].astype('int')
display(sos_df.head())

CandidateName,NAME,Donald J. Trump and Michael R. Pence,Hillary Clinton and Tim Kaine,Darrell L. Castle and Scott N. Bradley,Jill Stein and Ajamu Baraka,Dan R. Vacek and Mark G. Elworth,Gary Johnson and Bill Weld,Lynn Kahn and Jay Stolba,Rocky Roque De La Fuente and Michael Steinberg,Evan McMullin and Nathan Johnson,Gloria La Riva and Dennis J. Banks,Write-in,Chuck E. Grassley,Patty Judge,Charles Aldrich,Jim Hennager,Michael Luick-Thrams,Write-in.1
0,Adair-1NW Total,549,174,1,3,2,21,0,1,1,0,6,562,145,29,11,5,0
1,Adair-2NE Total,455,250,3,6,3,26,1,0,3,0,11,508,198,27,8,1,1
2,Adair-3SW Total,545,200,3,0,2,23,2,2,2,0,10,600,156,21,7,2,0
3,Adair-4SE Total,433,226,3,0,3,27,2,0,3,0,3,490,170,18,12,2,1
4,Adair-5GF Total,479,283,0,5,0,30,0,0,1,0,8,548,225,16,8,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1675,Wright-Eagle Grove - Ward #2 Total,195,121,1,0,0,4,0,0,0,0,3,223,91,6,2,0,0
1676,Wright-Eagle Grove - Ward #3 Total,273,140,3,2,0,13,0,0,1,0,2,295,121,15,0,1,0
1677,Wright-Eagle Grove - Ward #4 Total,209,114,1,2,1,12,1,0,3,0,6,243,92,9,2,0,0
1678,Wright-Goldfield Precinct Total,323,99,2,0,0,10,0,0,1,0,2,342,72,18,5,0,0


['NAME', 'G16PRERTRU', 'G16PREDCLI', 'G16PRECCAS', 'G16PREGSTE', 'G16PREOVAC', 'G16PRELJOH', 'G16PREOKAH', 'G16PREIFUE', 'G16PREIMCM', 'G16PREORIV', 'G16PREOWRI', 'G16USSRGRA', 'G16USSDJUD', 'G16USSLALD', 'G16USSOHEN', 'G16USSILUI', 'G16USSOWRI']


Unnamed: 0,NAME,G16PRERTRU,G16PREDCLI,G16PRECCAS,G16PREGSTE,G16PREOVAC,G16PRELJOH,G16PREOKAH,G16PREIFUE,G16PREIMCM,G16PREORIV,G16PREOWRI,G16USSRGRA,G16USSDJUD,G16USSLALD,G16USSOHEN,G16USSILUI,G16USSOWRI
0,Adair 1NW,549,174,1,3,2,21,0,1,1,0,6,562,145,29,11,5,0
1,Adair 2NE,455,250,3,6,3,26,1,0,3,0,11,508,198,27,8,1,1
2,Adair 3SW,545,200,3,0,2,23,2,2,2,0,10,600,156,21,7,2,0
3,Adair 4SE,433,226,3,0,3,27,2,0,3,0,3,490,170,18,12,2,1
4,Adair 5GF,479,283,0,5,0,30,0,0,1,0,8,548,225,16,8,1,0


#### Let's compare lengths and vote sums between each dataframe! 

In [5]:
# Both sources must describe the same number of precincts.
assert len(sos_df) == len(vest_df)

# Every vote column of the SOS frame (name contains "G16").
e_cols = [name for name in sos_df.columns if "G16" in name]

sos_votes = sos_df.loc[:, e_cols]
vest_votes = vest_df.loc[:, e_cols]

assert sos_votes.shape[1] == vest_votes.shape[1]

# Statewide totals must agree for every candidate column.
for vote_col in sos_votes.columns:
    sos_total = sos_votes[vote_col].sum()
    vest_total = vest_votes[vote_col].sum()
    assert sos_total == vest_total

#### Try to find a unique axis to merge on:

In [6]:
# Build the merge key "<Trump votes> <Clinton votes> <Grassley votes>" on both frames.
# One loop guarantees the two keys are assembled identically; the original second line
# carried a stray unary "+" in front of a string Series, which newer NumPy/pandas reject.
for frame in (sos_df, vest_df):
    frame["UNIQUE"] = (
        frame["G16PRERTRU"].astype('str')
        + " " + frame["G16PREDCLI"].astype('str')
        + " " + frame["G16USSRGRA"].astype('str')
    )

In [7]:
# Row count vs. number of distinct keys, per frame: equal numbers mean the key is unique.
for frame in (sos_df, vest_df):
    keys = frame["UNIQUE"]
    print(len(keys), len(keys.unique()))

# Number of SOS rows whose key has no counterpart in the VEST frame.
sos_key_missing = ~sos_df["UNIQUE"].isin(vest_df["UNIQUE"])
print(len(sos_df[sos_key_missing]))

1680 1680
1680 1680
0


#### Merge election and VEST data:

In [8]:
# Outer join on the vote-derived key, so a precinct without a partner on the other
# side would survive as an extra row instead of being dropped silently.
# Columns present in both frames come back suffixed _x (VEST) and _y (SOS).
election_df = vest_df.merge(sos_df, on="UNIQUE", how="outer")
print(election_df.shape)

(1680, 40)


#### Compare Vote Totals:

In [9]:
def validater_row_vect(df, name_column, column_list, verbose=False):
    """Compare the ``_x`` and ``_y`` versions of each column, row by row.

    Parameters
    ----------
    df : DataFrame
        Must contain ``c + "_x"`` and ``c + "_y"`` for every ``c`` in ``column_list``.
    name_column : str
        Column whose values label the rows in the report.
    column_list : iterable of str
        Base names of the columns to compare.
    verbose : bool, default False
        Print the summary even when no row differs.

    Returns
    -------
    tuple (bool, numpy.ndarray)
        ``True`` when no row differs, and the ``name_column`` values of the
        rows that differ in at least one column.
    """
    row_names = df[name_column].to_numpy()

    # A row is flagged once, however many of its columns disagree.
    is_different = np.zeros(len(df), dtype=bool)
    for base in column_list:
        left_data = df[base + "_x"].to_numpy()
        right_data = df[base + "_y"].to_numpy()
        is_different |= np.asarray(left_data != right_data, dtype=bool)

    different_rows = int(is_different.sum())
    matching_rows = len(df) - different_rows

    # Names are reported as-is (no slicing), so a missing name on an unmatched
    # row is reported instead of raising.
    diff_list = row_names[is_different]

    if different_rows != 0 or verbose:
        print("There are ", len(df.index)," total rows")
        print(f"{different_rows} of these rows have election result differences")
        print(f"{matching_rows} of these rows are the same")
        print(diff_list)
        print("")

    return (different_rows == 0, diff_list)

print("Are the election results the same between files?")
# Compare every vote column of the VEST frame; the first element of the result is the verdict.
vote_columns = {col for col in vest_df.columns if "G16" in col}
validater_row_vect(election_df, "NAME_x", vote_columns)[0]

Are the election results the same between files?


True

### Part 2: Shapefile comparison

#### Load in raw shapefiles:

In [72]:
# Raw statewide precinct layer; the path is built from DATA_PATH so the data
# directory is configured in one place.
master_shape_df = gp.read_file(DATA_PATH + "raw_shapes/Precincts041714.shp")

In [91]:
# Work on a copy so the frame read from disk (master_shape_df) stays untouched.
shape_df = master_shape_df.copy()
shape_df

Unnamed: 0,OBJECTID,OBJECTID_1,ID,AREA,DISTRICT,MEMBERS,LOCKED,NAME,POPULATION,IDEAL_VALU,DEVIATION,House_Dist,Senate_Dis,Congressio,Shape_Leng,Shape_Le_1,Shape_Area,geometry
0,1,1,6.0,35.873722,DICKINSON6,1.0,,Dickinson 6,576.0,1.0,575.0,2.0,1,4,0.411027,0.411027,0.010325,"POLYGON ((-94.91399 43.34146, -94.91398 43.338..."
1,2,2,7.0,29.885220,DICKINSON7,1.0,,Dickinson 7,212.0,1.0,211.0,2.0,1,4,0.462693,0.462693,0.008601,"POLYGON ((-95.03285 43.25530, -95.03953 43.255..."
2,3,3,8.0,29.847010,DICKINSON8,1.0,,Dickinson 8,3366.0,1.0,3365.0,2.0,1,4,0.589906,0.589906,0.008592,"POLYGON ((-95.15024 43.25548, -95.15790 43.255..."
3,4,4,9.0,48.195625,DICKINSON9,1.0,,Dicksinon 9,162.0,1.0,161.0,2.0,1,4,0.535877,0.535877,0.013872,"POLYGON ((-95.22636 43.34245, -95.22637 43.342..."
4,5,5,2.0,71.427567,LAFR,1.0,,Lake/Freeman,595.0,1.0,594.0,2.0,1,4,0.582407,0.582407,0.020525,"POLYGON ((-95.03285 43.25530, -95.02263 43.255..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1684,0,0,5.0,2.317871,CLARINDA1,1.0,,Clarinda 1,2785.0,1.0,2784.0,0.0,,,0.000000,0.000000,0.000000,"POLYGON ((-95.05988 40.74228, -95.05962 40.742..."
1685,0,0,6.0,2.903984,CLARINDA2,1.0,,Clarinda 2,2787.0,1.0,2786.0,0.0,,,0.000000,0.000000,0.000000,"MULTIPOLYGON (((-95.05988 40.74228, -95.06105 ..."
1686,0,0,7.0,1.105263,SHENANDOAH1,1.0,,Shenandoah 1,1657.0,1.0,1656.0,0.0,,,0.000000,0.000000,0.000000,"POLYGON ((-95.38509 40.76406, -95.38509 40.764..."
1687,0,0,8.0,0.969236,SHENANDOAH2,1.0,,Shenandoah 2,1748.0,1.0,1747.0,0.0,,,0.000000,0.000000,0.000000,"POLYGON ((-95.38524 40.75082, -95.38519 40.752..."


In [92]:
# Count the precincts in each source; rows lacking a geometry are left out of the two shape counts.
n_vest = len(vest_df['geometry'].dropna())
n_raw_shapes = len(shape_df['geometry'].dropna())
n_raw_election = len(sos_df)

print(f"There are {n_vest} VEST precincts.")
print(f"There are {n_raw_shapes} raw shape precincts.")
print(f"There are {n_raw_election} raw election precincts.")

There are 1680 VEST precincts.
There are 1689 raw shape precincts.
There are 1680 raw election precincts.


#### There are too many naming differences between the raw shapefile and the election files, so we use VEST's geometries to join them.

In [149]:
# Assign every raw precinct shape to one VEST precinct (row of election_df).
# geoms[i] is the i-th raw geometry and vest_ids[i] the election_df index it was assigned to.
geoms = []
vest_ids = []

# NOTE(review): assigning .crs only labels the coordinates, it does not reproject them;
# the displayed coordinates look like lon/lat degrees — confirm CRS (3857) is intended here.
shape_df.crs = CRS
election_df.crs = CRS

# Filled only for raw shapes WITHOUT a perfect match:
#   idx2area:       VEST index -> share of that VEST precinct's area covered by each assigned raw shape
#   vestidx2rawidx: VEST index -> positions of the raw shapes assigned to it
idx2area = defaultdict(list)
vestidx2rawidx = defaultdict(list)

for raw_id, geom in enumerate(tqdm(shape_df["geometry"])):
    vest_matches = election_df[election_df["geometry"].geom_almost_equals(geom)].copy()
    geoms.append(geom)
    
    # we got a perfect match (exactly one VEST geometry is almost equal to the raw one)
    if len(vest_matches) == 1:
        vest_ids.append(vest_matches.index[0])
        continue
        
    # no perfect match, let's scan the surrounding area
    vest_matches = election_df[election_df["geometry"].intersects(geom)].copy()
    # remember the election_df index labels before the candidates are renumbered
    orig_idxs = vest_matches.index
    
    vest_matches.reset_index(inplace=True)
    
    # for each candidate: intersection area divided by the candidate VEST precinct's own area
    shared = vest_matches.intersection(geom).area.to_numpy() / vest_matches.area.to_numpy()
    
    # choose the precinct that overlapped the most with the raw shapefile
    # NOTE(review): if no VEST geometry intersects `geom`, `shared` is empty and
    # np.argmax would raise — confirm this cannot happen for this data.
    correct_idx = orig_idxs[np.argmax(shared)]
    
    idx2area[correct_idx].append(np.amax(shared))
    vestidx2rawidx[correct_idx].append(raw_id)
    
    
    vest_ids.append(correct_idx)

    


  0%|          | 0/1689 [00:00<?, ?it/s]

#### All raw precincts that are assigned to the same VEST precinct sum to the VEST precincts' area. (The lists sum to 1.)

In [179]:
# VEST precincts that received more than one raw precinct, with the share of the
# VEST area covered by each of them.
doubled_idxs = {}
for vest_idx, fractions in idx2area.items():
    if len(fractions) > 1:
        doubled_idxs[vest_idx] = fractions

# The shares of each group should add up to (about) 1.
for fractions in doubled_idxs.values():
    print(fractions, "Sum:", sum(float(frac) for frac in fractions))

[0.8638398261704965, 0.13616017382950432] Sum: 1.0000000000000009
[0.10579072362313927, 0.8942092763768601] Sum: 0.9999999999999993
[0.12931048791654223, 0.870689512083456] Sum: 0.9999999999999982
[0.5455458341138446, 0.45445416588615495] Sum: 0.9999999999999996
[0.9999501246485349, 4.987535146822659e-05] Sum: 1.000000000000003
[0.9597614752665348, 0.04023852473346539] Sum: 1.0000000000000002
[0.0042830702041175015, 0.009105434567440122, 0.9866114952284425] Sum: 1.0000000000000002
[0.5050685673789276, 0.49493143262107264] Sum: 1.0000000000000002


In [180]:
# assign raw geoms to vest geoms
to_join = gp.GeoDataFrame(data={"index" : vest_ids}, geometry=geoms)
to_join.sort_values(by="index", inplace=True)
to_join = to_join.dissolve("index")

display(to_join)

Unnamed: 0_level_0,geometry
index,Unnamed: 1_level_1
0,"POLYGON ((-95.15024 43.25548, -95.15790 43.255..."
1,"POLYGON ((-95.22636 43.34245, -95.22637 43.342..."
2,"POLYGON ((-95.03285 43.25530, -95.02263 43.255..."
3,"POLYGON ((-95.26926 43.25537, -95.26140 43.255..."
4,"POLYGON ((-95.38809 43.08415, -95.38808 43.084..."
...,...
1675,"POLYGON ((-91.11296 40.69661, -91.11355 40.696..."
1676,"POLYGON ((-95.03953 43.25532, -95.03954 43.255..."
1677,"POLYGON ((-95.77423 40.64869, -95.77431 40.652..."
1678,"POLYGON ((-95.38505 40.76683, -95.38505 40.766..."


In [181]:
# to_join is indexed by the election_df index each raw shape was assigned to, so this
# assignment lines the dissolved raw geometries up with their VEST rows by index.
election_df["geometry_y"] = to_join["geometry"]

#### Compare the geometries between the 2 files:

In [182]:
# Pull the two geometry columns out as GeoSeries: VEST's shapes and the dissolved raw shapes.
vest_shapes = gp.GeoSeries(election_df["geometry"])
raw_shapes = gp.GeoSeries(election_df["geometry_y"])

# Give both series the same CRS label.
# NOTE(review): this labels the coordinates as CRS without reprojecting them — confirm intended.
vest_shapes.crs = CRS
raw_shapes.crs = CRS

In [187]:
print(f"There are {len(election_df)} precincts.")

# Row-wise comparison of the VEST shape with the dissolved raw shape, at decimal=0 precision.
# The mask is computed once and reused, so the "different" subsets of both series
# always cover the same rows and their areas can be subtracted element-wise.
is_same = vest_shapes.geom_almost_equals(raw_shapes, decimal=0)
same_shapes = vest_shapes[is_same]

print(f"{len(same_shapes)} of those precincts' shapefiles match exactly.\n")

different_vest = vest_shapes[~is_same]
different_raw = raw_shapes[~is_same]

# The original read an undefined name (`different_shapes`) here; the VEST subset is what is meant.
different_vest_area = different_vest.area.to_numpy()
different_raw_area = different_raw.area.to_numpy()

area_diff = np.abs(different_vest_area - different_raw_area)

# NOTE(review): .area is expressed in squared units of the series' coordinates;
# confirm that the "km^2" wording below matches those units.
print(f"Of the remaining {len(area_diff)} precincts:")
print(f"{len(area_diff[area_diff < .1])} precincts contain a difference of less than 0.1 km^2.")


# NOTE(review): this conclusion is printed unconditionally; it is only justified
# when every remaining precinct falls under the threshold above.
print("\nThe shapefiles and election results match exactly between the raw files and VEST.")

There are 1680 precincts.
1644 of those precincts' shapefiles match exactly.

Of the remaining 36 precincts:
36 precincts contain a difference of less than 0.1 km^2.

The shapefiles and election results match exactly between the raw files and VEST.


In [191]:
### which VEST precincts were composed of multiple raw precincts?
# doubled_idxs is keyed by election_df index labels, so look the names up by label (.loc),
# not by position (.iloc).
print(election_df.loc[list(doubled_idxs.keys()), "NAME_x"].to_list())

['Cedar Falls ward 2 precinct 2/CF Twp', 'Burlington 1-T', 'Burlington 8-C', 'Dickinson 6/7', 'Farragut', 'Grimes Precinct 2', 'Hamburg/Washington', 'Union/Udell']
