In [None]:
import matplotlib.pyplot as plt # for plotting maps
#import maup # mggg's library for proration, see documentation here: https://github.com/mggg/maup
import pandas as pd # standard python data library
import geopandas as gp # the geo-version of pandas
import numpy as np 
from statistics import mean, median
from functools import reduce
import matplotlib.pyplot as plt
from textwrap import wrap
from collections import defaultdict
from tqdm.notebook import tqdm
import tabula
import os

from IPython.display import clear_output

%matplotlib inline

# Let pandas display up to 150 rows / columns before truncating output.
pd.set_option("display.max_rows", 150)
pd.set_option("display.max_columns", 150)
# Setting this to 0 turns off matplotlib's "too many open figures" warning.
plt.rcParams.update({'figure.max_open_warning': 0})

# Folder holding the raw inputs.
# NOTE(review): the cells below hard-code "raw-from-source/" rather than using this constant.
DATA_PATH = "raw-from-source/"
# Value assigned to the .crs attribute of every frame/series compared later in the notebook.
CRS = 3857

## Summary - VEST IA Democratic Caucus Results 2016

### VEST documentation:

---
Election results from IA Democratic Party: http://iowademocrats.org/final-precinct-results-for-2016-iowa-democratic-party-caucuses/ (original url)
Precinct shapefile from IA Secretary of State: https://sos.iowa.gov/shapefiles/

The Iowa Democratic Party did not report individual vote tallies for the 2016 Iowa caucuses. The caucus results are State Delegate Equivalents that represent the number of state convention delegates that the candidates received based on the caucus results. The SDE figures are multiplied by 100 following the standard practice of the Associated Press since precinct-level SDE figures are provided as very small fractions in the actual caucus reports.

The precinct results do not include the state delegates awarded via satellite locations for participants with hardship exceptions or via tele-caucus for voters abroad. The satellite locations awarded 2 state delegates to Hillary Clinton and 1 state delegate to Bernie Sanders. The tele-caucus awarded 1 state delegate to Hillary Clinton and 1 state delegate to Bernie Sanders.

The following precincts were merged to match the 2016 caucus results:

Appanoose: Udell/Union
Black Hawk: Cedar Falls W2P2/Cedar Falls Twp
Des Moines: Burlington 1/Tama, Burlington 8/Concordia
Fremont: Hamburg/Washington, Farragut/Shenandoah1
Polk: Grimes 2/Urbandale 12

C16PREDCLI - Hillary Clinton \
C16PREDSAN - Bernie Sanders \
C16PREDOMA - Martin O'Malley \
C16PREDUNC - Uncommitted


### VEST data:

---

**ia_2016_demcaucus.shp** 

Found on the standard [VEST page](https://dataverse.harvard.edu/dataset.xhtml?persistentId=doi:10.7910/DVN/NH5S2I).


### Election data:

----
**caucusresults020116d.pdf**

Election data taken from [this site](http://www.p2016.org/chrniowa/caucusresultsrxn.html) as the link provided by VEST no longer functions. 

The site above also describes the subsequent modifications that were made to the original caucus results, which are presented below:

_Marion County, Knoxville 3 Precinct:
Reported As: 5 county convention delegates for Clinton, 4 county convention delegates for Sanders
Confirmed As: 4 county convention delegates for Clinton, 5 county convention delegates for Sanders
Net Change: Sanders gains 0.13 state delegate equivalents (SDEs); Clinton loses 0.13 SDEs_
 
_Woodbury County, 43 Oto/Oto Township Precinct:
Reported As: 1 county convention delegate for Clinton
Confirmed As: 1 county convention delegate for Sanders
Net Change: Sanders gains 0.15 SDEs, Clinton loses 0.15 SDEs_
 
_Osceola County, Ashton Precinct:
Reported As: 3 county convention delegates for O’Malley, 4 county convention delegates for Sanders
Confirmed As: 4 county convention delegates for O’Malley, 3 county convention delegates for Sanders
Net Change: O’Malley gains 0.0167 SDEs, Sanders loses 0.0167 SDEs_
 
_Story County, Sherman Township Precinct:
Reported As: 1 county convention delegate for Sanders
Confirmed As: 1 county convention delegate for Clinton
Net Change: Clinton gains 0.23 SDEs, Sanders loses 0.23 SDEs_
 
_Poweshiek County, 1st Ward Grinnell:
Reported As: 18 county convention delegates for Sanders, 8 county convention delegates for Clinton
Confirmed As: 19 county convention delegates for Sanders, 7 county convention delegates for Clinton
Net Change: Sanders gains 0.072 SDEs, Clinton loses 0.072 SDEs_
 
_Total net Change:
Sanders gains 0.1053 SDEs
Clinton loses 0.122 SDEs
O’Malley gains 0.0167 SDEs_
 
Updated Results:\
Clinton: 700.47 SDEs (-0.122 SDEs) 49.84% \
Sanders: 696.92 SDEs (+0.1053 SDEs) 49.59% \
O’Malley: 7.63 SDEs (+0.0167 SDEs) 0.54% \
Uncommitted: 0.46 SDEs (unchanged) 0.03%


#### Raw Shapefile Data

---- 
**Precincts041714.shp**

Raw shapefile data was found by navigating to [this site](https://sos.iowa.gov/shapefiles/) and downloading the data named "Statewide Precinct Layer". 

#### Summary

----
We are able to validate that all 1680 precincts' election results and shapefile shapes match between VEST's reported data and the raw data. 

<font color="Coral">The following VEST precinct shapefiles are composed of 2 or more raw precinct shapefiles:</font>

Cedar Falls ward 2 precinct 2/CF Twp 
 
Burlington 1-T 
 
Burlington 8-C 
 
Dickinson 6/7 
 
Farragut 
 
Grimes Precinct 2 
 
Hamburg/Washington 
 
Union/Udell 

### Part 1 - Election data:

In [None]:
# Load VEST's published shapefile; later cells work on copies of this frame.
master_vest_df = gp.read_file("raw-from-source/vest/ia_2016_demcaucus.shp")

In [None]:
# Work on a copy so the frame loaded from disk is never modified.
vest_df = master_vest_df.copy()
# The result of this expression is discarded (it is neither assigned nor displayed).
vest_df.sort_values("DISTRICT").head()

# COUNTY and DISTRICT together form the key used to join with the raw results;
# the join only works if that key is unique.
vest_df["ID"] = vest_df["COUNTY"] + "<->" + vest_df["DISTRICT"]
assert vest_df["ID"].is_unique

#### Process the .pdf if it has not already been processed; otherwise load the saved version.

In [None]:
# Map the candidate names found in the caucus PDF to VEST's column names.
rename = {
    "Clinton": "C16PREDCLI",
    "Sanders": "C16PREDSAN",
    "O'Malley": "C16PREDOMA",
    "Uncommitted": "C16PREDUNC",
}

# Parsing the PDF is slow, so a processed copy is cached as CSV and reused when present.
if os.path.exists("raw-from-source/processed_election_data.csv"):
    master_election_df = pd.read_csv("raw-from-source/processed_election_data.csv")
else:
    # `file` used to be referenced without ever being defined, so this branch
    # raised NameError whenever the cached CSV was missing.
    # NOTE(review): assumes the PDF named in the documentation above sits
    # directly in raw-from-source/ -- confirm the location.
    file = "raw-from-source/caucusresults020116d.pdf"

    # Take the header from the first page, then apply it to every table in the document.
    column_tables = tabula.read_pdf(file, pages=1)
    columns = column_tables[0].columns

    tables = tabula.read_pdf(file, pages = "all", multiple_tables = True, pandas_options={'header': None, "columns":columns})

    master_election_df = pd.concat(tables, axis=0)
    master_election_df.reset_index(inplace=True)
    # Drop the "Unnamed: 0" column together with the row labelled 0.
    master_election_df.drop(columns=["Unnamed: 0"], index=[0], inplace=True)

    # Keep the first character as-is and turn every later "F" into a space
    # (presumably the PDF extraction renders spaces as "F" -- TODO confirm).
    master_election_df["Precinct'Name"] = master_election_df["Precinct'Name"].astype('str').str.slice(start=0, stop=1) + \
                                            master_election_df["Precinct'Name"].astype('str').str.slice(start=1).str.split("F").str.join(" ")

    # Collapse any candidate label containing "Clinton" to plain "Clinton".
    master_election_df["Candidate"] = master_election_df["Candidate"].apply(lambda name: \
                                        "Clinton" if "Clinton" in name else name)

    # create unique ID for merging with VEST
    master_election_df["ID"] = master_election_df["County'Name"] + "<->" + master_election_df["Precinct'Name"]

    # One row per precinct ID, one column per candidate.
    # NOTE(review): NaNs were observed to appear from this step onward; cause not yet identified.
    master_election_df = pd.pivot_table(master_election_df, index="ID", columns="Candidate",
                           values="Candidate'State'Delegate'Equivalents", aggfunc="first")

    # get ready for merge with VEST
    master_election_df.rename(columns=rename, inplace=True)
    master_election_df.reset_index(inplace=True)

    master_election_df.to_csv("raw-from-source/processed_election_data.csv")

#### Alright, so there is a bug occurring: the columns of the dataframe are being "renamed" when a duplicate value occurs

#### Per VEST documentation, multiply SDE by 100

In [None]:
# The four VEST result columns that get validated.
races = ["C16PREDCLI", "C16PREDSAN", "C16PREDOMA", "C16PREDUNC"]

# Keep only the join key and the result columns, leaving the master frame untouched.
election_df = master_election_df.copy()[["ID", *races]]
display(election_df.head())

# Scale each result column by 100 and round to two decimals.
for sde_col in rename.values():
    scaled = election_df[sde_col].astype('float').to_numpy() * 100
    election_df[sde_col] = np.round(scaled, 2)
    

#### Create county column to match VEST, we are going to need to validate on a county-by-county basis:

In [None]:
# The county is the part of the ID before "<->"; an "F" that has a word
# character on both sides is replaced by a space.
county_part = election_df["ID"].str.split('<->').str[0]
county_part = county_part.replace(r"\BF\B", " ", regex=True)
# Spell O'Brien the way the VEST COUNTY column is compared against below.
election_df["County"] = county_part.apply(lambda c: "Obrien" if c == "O'Brien" else c)

# Show county names that exist on only one side (raw-only first, then VEST-only).
raw_counties = set(election_df["County"].to_list())
vest_counties = set(vest_df["COUNTY"].to_list())
print(raw_counties.difference(vest_df["COUNTY"].to_list()))
print(vest_counties.difference(election_df["County"].to_list()))

# we may need to look at the caucus expansion results later, but for now we will get rid of them:
is_regular_county = election_df["County"] != 'Caucus Expansion Results'
election_df = election_df[is_regular_county].copy()

In [None]:
# Pair every VEST precinct with a raw-results precinct, county by county, using
# the vote columns alone. The two parallel lists are filled in lockstep:
# vest_ids[i] is the VEST ID matched to the raw ID raw_idxs[i].
vest_ids = []
raw_idxs = []

for county in tqdm(vest_df["COUNTY"].unique()):

    v = vest_df[vest_df["COUNTY"] == county].copy()
    e = election_df[election_df["County"] == county].copy()

    v.reset_index(inplace=True)
    e.reset_index(inplace=True)

    # Sort both sides by the same vote-derived key so matching rows line up.
    # The row total must be stringified per row: str(<Series>) yields one
    # constant repr for the whole county and never influenced the ordering.
    v["SORT"] = (v[races[0]].astype('str') + " " + v[races[1]].astype('str') + " "
                 + v[races].sum(axis=1).astype('str') + " " + v["ID"])
    e["SORT"] = (e[races[0]].astype('str') + " " + e[races[1]].astype('str') + " "
                 + e[races].sum(axis=1).astype('str') + " " + e["ID"])

    v = v.sort_values(by="SORT")
    e = e.sort_values(by="SORT")

    assert(len(v) == len(e))

    v_votes = v[races].to_numpy()
    e_votes = e[races].to_numpy()

    # Fast path: the sorted rows agree position by position.
    if np.array_equal(v_votes, e_votes):
        vest_ids.extend(v["ID"].to_list())
        raw_idxs.extend(e["ID"].to_list())
        continue

    print(county)
    print('-' * 20)
    print("Some errors in sorting, no further output means we sorted it out:\n")

    # Accept the positions that agree; remember the ones that do not.
    mismatched = []
    for idx, (v_row, e_row) in enumerate(zip(v_votes, e_votes)):
        if np.array_equal(v_row, e_row):
            vest_ids.append(v["ID"].iloc[idx])
            raw_idxs.append(e["ID"].iloc[idx])
        else:
            mismatched.append(idx)

    # Pair each leftover VEST row with the first unused raw row having the same
    # votes. Stop at the first hit: carrying on would let a single VEST row
    # claim several raw rows with identical votes and leave other VEST rows
    # unmatched without triggering the warning below.
    used_e = set()

    for v_idx in mismatched:
        for e_idx in mismatched:
            if e_idx in used_e:
                continue

            if np.array_equal(v_votes[v_idx], e_votes[e_idx]):
                vest_ids.append(v["ID"].iloc[v_idx])
                raw_idxs.append(e["ID"].iloc[e_idx])
                used_e.add(e_idx)
                break

    if len(used_e) != len(mismatched):
        print("Something has still gone awry :(")

    print()


In [None]:
# Number of matched VEST IDs, then number of matched raw IDs (one per line).
print(len(vest_ids), len(raw_idxs), sep="\n")

In [None]:
# Lookup from a raw-results ID to the VEST ID it was matched with.
raw2vest = dict(zip(raw_idxs, vest_ids))

# Keep the original raw ID, then switch "ID" to the VEST spelling wherever a
# match exists (unmatched IDs are left as they are).
election_df["ORIG_ID"] = election_df["ID"].to_numpy()
election_df["ID"] = election_df["ID"].apply(lambda name: raw2vest.get(name, name))

# Rows whose ID is absent from the other frame.
raw_only = ~election_df["ID"].isin(vest_df["ID"])
vest_only = ~vest_df["ID"].isin(election_df["ID"])
in_e = election_df[raw_only]
in_v = vest_df[vest_only]

print(len(in_e))
print(len(in_v))

# The raw precinct name is the part of the original ID after "<->".
election_df["ORIG_PRC"] = election_df["ORIG_ID"].str.split('<->').str[1]

#### Merge raw election results to VEST data.

In [None]:
# Outer join on ID keeps rows that exist on only one side; columns present in
# both frames get the suffixes _x (VEST, left) and _y (raw results, right).
recreated_df = vest_df.merge(election_df, on="ID", how="outer")
recreated_df.shape

### Part 2 - Shapefile data:

In [None]:
# Load the raw statewide precinct shapefile; later cells work on a copy.
master_shape_df = gp.read_file("raw-from-source/raw_shapes/Precincts041714.shp")

In [None]:
# Working copy, so the frame loaded from disk stays unmodified.
shape_df = master_shape_df.copy()

In [None]:
# Row counts of the three inputs; the two shapefile counts ignore rows with a missing geometry.
print(f"There are {len(vest_df['geometry'].dropna())} VEST precincts.")
print(f"There are {len(shape_df['geometry'].dropna())} raw shape precincts.")
print(f"There are {len(election_df)} raw election precincts.")

In [None]:
# Parallel lists, one entry per raw shape: its geometry, the index label of the
# VEST row it was assigned to, and its raw NAME.
geoms = []
vest_ids = []
geom_names = []

shape_df.crs = CRS
recreated_df.crs = CRS

# Only filled for raw shapes without a unique near-exact match:
# VEST index label -> overlap shares / raw positions assigned to it.
idx2area = defaultdict(list)
vestidx2rawidx = defaultdict(list)

raw_names = shape_df["NAME"].to_list()

for raw_idx, geom in enumerate(tqdm(shape_df["geometry"])):
    geoms.append(geom)
    geom_names.append(raw_names[raw_idx])

    exact = recreated_df["geometry"].geom_almost_equals(geom)

    if exact.sum() == 1:
        # we got a perfect match
        vest_ids.append(exact[exact].index[0])
    else:
        # no perfect match, let's scan the surrounding area
        candidates = recreated_df[recreated_df["geometry"].intersects(geom)]

        # Share of each candidate VEST precinct's area covered by the raw shape.
        overlap = candidates.intersection(geom).area.to_numpy() / candidates.area.to_numpy()

        # choose the precinct that overlapped the most with the raw shapefile
        best = np.argmax(overlap)
        chosen_label = candidates.index[best]

        idx2area[chosen_label].append(overlap[best])
        vestidx2rawidx[chosen_label].append(raw_idx)

        vest_ids.append(chosen_label)

In [None]:
# VEST rows that received more than one raw shape via the overlap fallback.
doubled_idxs = {}
for key, val in idx2area.items():
    if len(val) > 1:
        doubled_idxs[key] = val

# Print each row's overlap shares together with their total.
for lst in doubled_idxs.values():
    print(lst, "Sum:", sum(float(share) for share in lst))

In [None]:
# One row per VEST index label: raw shapes assigned to the same label are
# dissolved into a single geometry, keeping the first raw NAME of each group.
to_join = (
    gp.GeoDataFrame(data={"index" : vest_ids, "NAMES" : geom_names}, geometry=geoms)
    .sort_values(by="index")
    .dissolve("index", aggfunc="first")
)

In [None]:
# to_join is indexed by the recreated_df index labels collected during shape
# matching, so these assignments align row by row on that index.
recreated_df["geometry_y"] = to_join["geometry"]
recreated_df["RAW_GEOM_NAMES"] = to_join["NAMES"]

### Part 3: Validation

#### Check Vote Totals:

In [None]:
def validater_row_vect(df, name_column, column_list, verbose=False):
    """Compare the ``_x`` and ``_y`` versions of each column, row by row.

    Args:
        df: Frame that holds, for every name in ``column_list``, a
            ``<name>_x`` and a ``<name>_y`` column.
        name_column: Column whose values label the rows in the report.
        column_list: Base names of the columns to compare.
        verbose: Print the summary even when no row differs.

    Returns:
        A tuple ``(all_match, diff_list)``: ``all_match`` is True when no row
        differs in any compared column, and ``diff_list`` is a NumPy array of
        the ``name_column`` values of the rows that differ.

    Note:
        Values are compared with ``!=``, so a row with NaN on both sides is
        reported as different.
    """
    row_labels = df[name_column].to_numpy()

    # A row is flagged once, no matter how many of its columns disagree.
    differs = np.zeros(len(df), dtype=bool)
    for col in column_list:
        left_data = df[col + "_x"].to_numpy()
        right_data = df[col + "_y"].to_numpy()
        differs |= np.asarray(left_data != right_data, dtype=bool)

    different_rows = int(differs.sum())
    matching_rows = len(df) - different_rows
    diff_list = row_labels[differs]

    if different_rows != 0 or verbose:
        print("There are ", len(df.index)," total rows")
        print(f"{different_rows} of these rows have election result differences")
        print(f"{matching_rows} of these rows are the same")
        print(diff_list)
        print("")

    return (different_rows == 0, diff_list)

# Compare each results column from VEST (_x) against the raw results (_y);
# the first element of the returned tuple is True only when no row differs.
print("Are all vote totals correct?")
validater_row_vect(recreated_df, "ID", races)[0]

#### Check shapefile geometries:

In [None]:
# VEST geometry and the dissolved raw geometry, row-aligned via recreated_df.
vest_shapes = gp.GeoSeries(recreated_df["geometry"])
raw_shapes = gp.GeoSeries(recreated_df["geometry_y"])

for shapes in (vest_shapes, raw_shapes):
    shapes.crs = CRS

print(f"There are {len(election_df)} precincts.")

# Row-wise near-equality, evaluated once in each direction.
vest_matches_raw = vest_shapes.geom_almost_equals(raw_shapes, decimal=0)
raw_matches_vest = raw_shapes.geom_almost_equals(vest_shapes, decimal=0)

same_shapes = vest_shapes[vest_matches_raw]

print(f"{len(same_shapes)} of those precincts' shapefiles match exactly.\n")

# For the rows that did not match, compare the areas of the two versions.
different_vest = vest_shapes[~vest_matches_raw]
different_raw = raw_shapes[~raw_matches_vest]

area_diff = np.abs(different_vest.area.to_numpy() - different_raw.area.to_numpy())

print(f"Of the remaining {len(area_diff)} precincts:")
# NOTE(review): the 0.1 threshold is in squared CRS units, while the message
# says km^2 -- confirm the unit.
print(f"{len(area_diff[area_diff < .1])} precincts contain a difference of less than 0.1 km^2.")


# NOTE(review): this message is printed unconditionally, whatever the checks above found.
print("\nThe shapefiles and election results match exactly between the raw files and VEST.")


#### Dataframe containing the precinct names that were assigned to one another from the VEST, raw election, and raw shapefile data. 

In [None]:
# Side-by-side precinct names: VEST's NAME, the raw-results precinct name, and
# the raw shapefile name assigned to the same row.
name_comparison = recreated_df[["NAME", "ORIG_PRC", "RAW_GEOM_NAMES"]]
name_comparison.head()