In [1]:
import pandas as pd # standard python data library
import geopandas as gp # the geo-version of pandas
import numpy as np 
import os
import fiona
from statistics import mean, median
from pandas import read_csv
gp.io.file.fiona.drvsupport.supported_drivers['KML'] = 'rw' #To load KML files
import string
import xml.etree.ElementTree as et

# Alaska

## Import VEST File

In [2]:
vest_ak_20 = gp.read_file("./raw-from-source/VEST/ak_2020/ak_2020.shp")
print(vest_ak_20.shape)

(441, 15)


In [3]:
vest_ak_20.head(2)

Unnamed: 0,DISTRICT,NAME,G20PRERTRU,G20PREDBID,G20PRELJOR,G20PREGJAN,G20PRECBLA,G20PREIPIE,G20PREOFUE,G20USSRSUL,G20USSDGRO,G20USSOHOW,G20HALRYOU,G20HALDGAL,geometry
0,11-075,11-075 Palmer City No. 2,685,397,46,12,7,0,0,656,421,71,686,441,"POLYGON ((-149.13365 61.61026, -149.13361 61.6..."
1,12-220,12-220 Butte,1808,689,62,16,4,3,0,1761,673,96,1780,751,"POLYGON ((-148.52103 61.62682, -148.52020 61.6..."


## VEST Documentation

### Data Sources

> - Election results from Alaska Division of Elections (http://www.elections.alaska.gov/results/20GENR/index.php) 
> - Precinct shapefile from Alaska Division of Elections (http://www.elections.alaska.gov/Core/districtmaps.php)

### Processing

> Early, Absentee, and Questioned votes are only reported at the State House district level (since Alaska has portions of the state with no lower level of government, HDs serve a similar purpose as counties do in other states for the purpose of reporting votes). These votes are apportioned to precincts by candidate in the same shares that the Election Day vote was split among precincts within an HD. Similarly, federal-only ballots that are reported at the statewide level were apportioned to precincts by candidate based on their share of the precinct-level vote.


### Races 

> G20PRERTRU - Donald J. Trump (Republican Party)  
G20PREDBID - Joseph R. Biden (Democratic Party)  
G20PRELJOR - Jo Jorgensen (Libertarian Party)  
G20PREGJAN - James G. "Jesse Ventura" Janos (Green Party)  
G20PRECBLA - Don Blankenship (Conservative Party)  
G20PREIPIE - Brock Pierce (Independent)  
G20PREOFUE - Roque "Rocky" De La Fuente (Alliance Party)  
  
> G20USSRSUL - Dan Sullivan (Republican Party)  
G20USSDGRO - Al Gross (Democratic Party)  
G20USSOHOW - John Wayne Howe (Alaskan Independence Party)  
  
> G20HALRYOU - Don Young (Republican Party)  
G20HALDGAL - Alyse S. Galvin (Democratic Party)  

## Election Results

### Load and Clean Election Results File

In [4]:
ak_2020 = pd.read_csv("./raw-from-source/Election_Results/resultsbyprecinct.txt",header=None)
ak_2020.head(1)

Unnamed: 0,0,1,2,3,4,5,6,7,8
0,HD99 Fed Overseas Absentee,Race Statistics,NP,NP,Number of Precincts,NP,Total,0,


#### Name and Filter Down to Relevant Columns

In [5]:
ak_2020.columns = ["Precinct","Race","Blank_1","Blank_2","Choice","Party","Type","Votes","Extra"]
ak_2020 = ak_2020 [["Precinct","Race","Choice","Party","Votes"]]

#### Filter Down to Relevant Races

In [6]:
print(ak_2020["Race"].unique())
race_list = ['U.S. President / Vice President""', 'U.S. Senator""','U.S. Representative""']
ak_2020 = ak_2020[ak_2020["Race"].isin(race_list)]

['Race Statistics' 'U.S. President / Vice President""' 'U.S. Senator""'
 'U.S. Representative""' 'House District 1""' 'Supreme Court - Carney""'
 'Court of Appeals - Wollenberg""' 'Superior Court JD4 - Peters""'
 'District Court JD4 - Christian""' 'District Court JD4 - Montgomery""'
 'Ballot Measure No. 1 - 19OGTX""' 'Ballot Measure No. 2 - 19AKBE""'
 'House District 2""' 'Senate District B""' 'House District 3""'
 'House District 4""' 'House District 5""' 'House District 6""'
 'Superior Court JD3 - Crosby""' 'Superior Court JD3 - Guidi""'
 'Superior Court JD3 - Henderson""' 'Superior Court JD3 - Lamoureux""'
 'Superior Court JD3 - Miller""' 'Superior Court JD3 - Reigh""'
 'Superior Court JD3 - Wells""' 'Superior Court JD3 - Woodman""'
 'District Court JD3 - Dickson""' 'District Court JD3 - Franciosi""'
 'District Court JD3 - Hanley""' 'District Court JD3 - Logue""'
 'District Court JD3 - McCrea""' 'District Court JD3 - Wallace""'
 'District Court JD3 - Washington""' 'Senate District D

#### Filter Down to Relevant Candidates

In [7]:
print(ak_2020["Choice"].unique())
choice_list = ['Biden  Joseph R. Jr. / Harris  Kamala D.""',
       'Blankenship  Don / Mohr  William""',
       'De La Fuente  Rocque "Rocky" / Richardson  Darcy G.""',
       'Janos  James G. "Jesse Ventura" / McKinney  Cynthia""',
       'Jorgensen  Jo / Cohen  Jeremy "Spike"""',
       'Pierce  Brock / Ballard  Karla""',
       'Trump  Donald J. / Pence  Michael R.""', 'Gross  Al""',
       'Howe  John Wayne""', 'Sullivan  Dan""', 'Galvin  Alyse S.""',
       'Young  Don""']
ak_2020 = ak_2020[ak_2020["Choice"].isin(choice_list)]

['Number of Precincts for Race' 'Number of Precincts Reporting'
 'Registered Voters' 'Times Counted'
 'Biden  Joseph R. Jr. / Harris  Kamala D.""'
 'Blankenship  Don / Mohr  William""'
 'De La Fuente  Rocque "Rocky" / Richardson  Darcy G.""'
 'Janos  James G. "Jesse Ventura" / McKinney  Cynthia""'
 'Jorgensen  Jo / Cohen  Jeremy "Spike"""'
 'Pierce  Brock / Ballard  Karla""'
 'Trump  Donald J. / Pence  Michael R.""' 'Gross  Al""'
 'Howe  John Wayne""' 'Sullivan  Dan""' 'Galvin  Alyse S.""'
 'Young  Don""']


### Pivot Election Results File

In [8]:
#Perform the pivot
ak_2020_pvt = pd.pivot_table(ak_2020,index=["Precinct"],values=["Votes"],columns=["Choice"],aggfunc=sum)

#Clean up columns / index
ak_2020_pvt.columns=ak_2020_pvt.columns.droplevel(0)
ak_2020_pvt.reset_index(inplace=True,drop=False)

#Take a look
ak_2020_pvt.head(1)

Choice,Precinct,"Biden Joseph R. Jr. / Harris Kamala D.""""","Blankenship Don / Mohr William""""","De La Fuente Rocque ""Rocky"" / Richardson Darcy G.""""","Galvin Alyse S.""""","Gross Al""""","Howe John Wayne""""","Janos James G. ""Jesse Ventura"" / McKinney Cynthia""""","Jorgensen Jo / Cohen Jeremy ""Spike""""""","Pierce Brock / Ballard Karla""""","Sullivan Dan""""","Trump Donald J. / Pence Michael R.""""","Young Don"""""
0,01-446 Aurora,239,3,0,293,246,46,2,43,3,465,466,452


#### Clean up pivoted file columns

In [9]:
print(ak_2020_pvt.columns)
column_changes_dict = {'Biden  Joseph R. Jr. / Harris  Kamala D.""':"G20PREDBID",
                       'Jorgensen  Jo / Cohen  Jeremy "Spike"""':"G20PRELJOR",
                       'Trump  Donald J. / Pence  Michael R.""':"G20PRERTRU",
                       'Blankenship  Don / Mohr  William""':"G20PRECBLA",
       'De La Fuente  Rocque "Rocky" / Richardson  Darcy G.""':"G20PREOFUE",
        'Janos  James G. "Jesse Ventura" / McKinney  Cynthia""':"G20PREGJAN",
                       'Pierce  Brock / Ballard  Karla""':"G20PREIPIE", 
                        'Gross  Al""':"G20USSDGRO", 
                        'Howe  John Wayne""':"G20USSOHOW",
                         'Sullivan  Dan""':"G20USSRSUL",     
                           'Galvin  Alyse S.""':"G20HALDGAL",               
                            'Young  Don""':"G20HALRYOU"}
ak_2020_pvt = ak_2020_pvt.rename(columns=column_changes_dict)

Index(['Precinct', 'Biden  Joseph R. Jr. / Harris  Kamala D.""',
       'Blankenship  Don / Mohr  William""',
       'De La Fuente  Rocque "Rocky" / Richardson  Darcy G.""',
       'Galvin  Alyse S.""', 'Gross  Al""', 'Howe  John Wayne""',
       'Janos  James G. "Jesse Ventura" / McKinney  Cynthia""',
       'Jorgensen  Jo / Cohen  Jeremy "Spike"""',
       'Pierce  Brock / Ballard  Karla""', 'Sullivan  Dan""',
       'Trump  Donald J. / Pence  Michael R.""', 'Young  Don""'],
      dtype='object', name='Choice')


#### Re-order the columns to match what VEST has

In [10]:
ak_2020_pvt = ak_2020_pvt[["Precinct",'G20PRERTRU', 'G20PREDBID', 'G20PRELJOR',
       'G20PREGJAN', 'G20PRECBLA', 'G20PREIPIE', 'G20PREOFUE', 'G20USSRSUL',
       'G20USSDGRO', 'G20USSOHOW', 'G20HALRYOU', 'G20HALDGAL']]

## Check Races Totals Against VEST (before any allocation)

In [11]:
data_columns = ['G20PRERTRU', 'G20PREDBID', 'G20PRELJOR',
       'G20PREGJAN', 'G20PRECBLA', 'G20PREIPIE', 'G20PREOFUE', 'G20USSRSUL',
       'G20USSDGRO', 'G20USSOHOW', 'G20HALRYOU', 'G20HALDGAL']

print("Printing differences below:")
for race in data_columns:
    if (sum(vest_ak_20[race])-sum(ak_2020_pvt[race]) != 0):
        print(race+" has a difference of "+str(sum(vest_ak_16[race])-sum(ak_2020_pvt[race]))+" votes")
        print("\tVEST: "+str(sum(vest_ak_20[race]))+" votes")
        print("\tSOURCES: "+str(sum(ak_2020_pvt[race]))+" votes")
print("")
print("All other races are equal")

Printing differences below:

All other races are equal


## Allocate Votes (so we can check district totals and precinct totals against VEST)

### VEST's Documentation

> - Early, Absentee, and Questioned votes are only reported at the State House district level (since Alaska has portions of the state with no lower level of government, HDs serve a similar purpose as counties do in other states for the purpose of reporting votes). These votes are apportioned to precincts by candidate in the same shares that the Election Day vote was split among precincts within an HD. 
> - Similarly, federal-only ballots that are reported at the statewide level were apportioned to precincts by candidate based on their share of the precinct-level vote.

Note: Vote allocation will be broken into two separate steps:
1. Allocating early, absentee, and questioned votes (reported at state house district level)
2. Allocating federal-only ballots

In [12]:
#races=['G20PRERTRU', 'G20PREDBID', 'G20PRELJOR', 'G20PREGJAN',
       #'G20PRECBLA', 'G20PREIPIE', 'G20PREOFUE', 'G20USSRSUL', 'G20USSDGRO',
       #'G20USSOHOW', 'G20HALRYOU', 'G20HALDGAL']
#for race in races:
    #ak_2020_pvt[race] = ak_2020_pvt[race].astype(int)

### Early, Absentee and Questioned Ballots

#### Create DataFrames for Allocation

In [13]:
#A list of all Precincts that contain "District in the name", these are the votes to be allocated
to_allocate = list(ak_2020_pvt[ak_2020_pvt["Precinct"].str.contains("District")]["Precinct"])

# Early, absentee and questioned votes dataframe (reported at the district level)
ak_2020_to_allocate = ak_2020_pvt[ak_2020_pvt["Precinct"].isin(to_allocate)]

#Precinct level votes dataframe
ak_2020_precinct = ak_2020_pvt[~(ak_2020_pvt["Precinct"].isin(to_allocate)) & (ak_2020_pvt["Precinct"]!="HD99 Fed Overseas Absentee ")]

#State-level reported votes, these will not be allocated any votes in this step
ak_2020_fed_overseas = ak_2020_pvt[ak_2020_pvt["Precinct"]=="HD99 Fed Overseas Absentee "]

#### Check shapes to make sure all votes are being accounted for

In [14]:
#The first three numbers should add to the fourth
print(ak_2020_to_allocate.shape[0])
print(ak_2020_precinct.shape[0])
print(ak_2020_fed_overseas.shape[0])
print(ak_2020_pvt.shape[0])

138
441
1
580


#### Add identifying data to Allocation DataFrames

In [15]:
# Add in a district number to the precinct-level reported votes
ak_2020_precinct.loc[:,"District"] = ak_2020_precinct.loc[:,"Precinct"].apply(lambda x: x[0:2])

#Add in a district number to the district-level reported votes
ak_2020_to_allocate.loc[:,"col_names"]=ak_2020_to_allocate.loc[:,"Precinct"].str.split(" - ")
ak_2020_to_allocate.loc[:,"District"]=ak_2020_to_allocate.loc[:,"col_names"].apply(lambda x: x[0].strip()[9:].zfill(2))

#Confirm we are getting the right number of districts (should be 40)
print(len(ak_2020_to_allocate["District"].unique()))
print(len(ak_2020_precinct["District"].unique()))

40
40


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = value
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(ilocs[0], value, pi)


#### Create a dataframe of total votes to allocate by district

In [16]:
to_dole_out_totals = pd.DataFrame(ak_2020_to_allocate.groupby(["District"]).sum())
to_dole_out_totals.reset_index(inplace=True,drop=False)

#### Create a dataframe of total votes reported at the precinct-level by district

In [17]:
precinct_specific_totals = pd.DataFrame(ak_2020_precinct.groupby(["District"]).sum())
precinct_specific_totals.reset_index(inplace=True,drop=False)

#### Perform allocation

In [19]:
races=['G20PRERTRU', 'G20PREDBID', 'G20PRELJOR', 'G20PREGJAN',
       'G20PRECBLA', 'G20PREIPIE', 'G20PREOFUE', 'G20USSRSUL', 'G20USSDGRO',
       'G20USSOHOW', 'G20HALRYOU', 'G20HALDGAL']

#Create some new columns for each of these races to deal with the allocation
for race in races:
    add_var = race+"_add"
    rem_var = race+"_rem"
    floor_var = race+"_floor"
    ak_2020_precinct.loc[:,add_var]=0.0
    ak_2020_precinct.loc[:,rem_var]=0.0
    ak_2020_precinct.loc[:,floor_var]=0.0

#Iterate over the rows
#Note this function iterates over the dataframe two times so the rounded vote totals match the totals to allocate
for index, row in ak_2020_precinct.iterrows():
    for race in races:
        add_var = race+"_add"
        rem_var = race+"_rem"
        floor_var = race+"_floor"
        #Grab the district
        county_id = row["District"]
        #Get the denominator for the allocation (the precinct vote totals)
        denom = precinct_specific_totals.loc[precinct_specific_totals["District"]==county_id][race]
        #Get one of the numerators, how many districtwide votes to allocate
        numer = to_dole_out_totals.loc[to_dole_out_totals["District"]==county_id][race]
        #Get the vote totals for this race in this precinct
        val = ak_2020_precinct.at[index,race]
        #Get the vote share, the precincts % of total precinct votes in the district times votes to allocate
        if (float(denom==0)):
            vote_share = 0
        else:
            vote_share = (float(val)/float(denom))*float(numer)
        ak_2020_precinct.at[index,add_var] = vote_share
        #Take the decimal remainder of the allocation
        ak_2020_precinct.at[index,rem_var] = vote_share%1
        #Take the floor of the allocation
        ak_2020_precinct.at[index,floor_var] = np.floor(vote_share)

#After the first pass through, get the sums of the races by district to assist in the rounding            
first_allocation = pd.DataFrame(ak_2020_precinct.groupby(["District"]).sum())

#Now we want to iterate district by district to work on rounding
county_list = list(to_dole_out_totals["District"].unique()) 

#Iterate over the district
for county in county_list:
    for race in races:
        add_var = race+"_add"
        rem_var = race+"_rem"
        floor_var = race+"_floor"
        #County how many votes still need to be allocated (because we took the floor of all the initial allocations)
        to_go = int(np.round((int(to_dole_out_totals.loc[to_dole_out_totals["District"]==county][race])-first_allocation.loc[first_allocation.index==county,floor_var])))
        #Grab the n precincts with the highest remainders and round these up, where n is the # of votes that still need to be allocated
        for index in ak_2020_precinct.loc[ak_2020_precinct["District"]==county][rem_var].nlargest(to_go).index:
            ak_2020_precinct.at[index,add_var] = np.ceil(ak_2020_precinct.at[index,add_var])

#Iterate over every race again
for race in races:
    add_var = race+"_add"
    #Round every allocation down to not add fractional votes
    ak_2020_precinct.loc[:,add_var]=np.floor(ak_2020_precinct.loc[:,add_var])
    ak_2020_precinct.loc[:,race]+=ak_2020_precinct.loc[:,add_var]

#Print out any instances where the allocation, as written, won't work
for index, row in precinct_specific_totals.iterrows():
    for race in races:
        if (row[race]==0):
            race_district = row["District"]
            to_allocate = int(to_dole_out_totals.loc[to_dole_out_totals["District"]==race_district,race])
            if (to_allocate != 0):
                print("Weren't able to allocate "+str(to_allocate)+" votes in " +str(race)+" District "+str(race_district))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(loc, value, pi)


Weren't able to allocate 4 votes in G20PREOFUE District 21
Weren't able to allocate 3 votes in G20PREOFUE District 30


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(ilocs[0], value, pi)


#### Look at votes unable to be allocated

These handful of votes could not be allocated because there were 0 precinct-level votes and a non-zero number of votes to allocate

In [24]:
print(precinct_specific_totals[precinct_specific_totals["G20PREOFUE"]==0][["District","G20PREOFUE"]])

print(to_dole_out_totals[to_dole_out_totals["District"].isin(["21","30"])][["District","G20PREOFUE"]])

Choice District  G20PREOFUE
20           21           0
29           30           0
Choice District  G20PREOFUE
20           21           4
29           30           3


In [None]:
ak_2020_precinct["Total Votes"] = 0
for race in race:
    ak_2020_precinct.loc[:,"Total Votes"]

In [21]:
print(ak_2020_precinct[ak_2020_precinct["District"].isin(["21","30"])])

Choice                 Precinct  G20PRERTRU  G20PREDBID  G20PRELJOR  \
175     21-600 Turnagain No. 1        651.0      2799.0        35.0   
176     21-605 Sand Lake No. 1       1338.0      2705.0        52.0   
177     21-610 Sand Lake No. 2       1001.0      3143.0        68.0   
178     21-615 Sand Lake No. 3        937.0      2080.0        39.0   
179        21-620 Lake Spenard        736.0      1953.0        48.0   
180           21-625 Lake Hood        972.0      3175.0        48.0   
181     21-630 Turnagain No. 2        543.0      2001.0        35.0   
182     21-635 Turnagain No. 3        953.0      3551.0        68.0   
248             30-200 Central       2172.0      1291.0        75.0   
249             30-210 K-Beach       3031.0      1770.0       113.0   
250         30-220 Kenai No. 1       2476.0      2069.0        96.0   
251         30-230 Kenai No. 2       1396.0      1158.0        39.0   
252         30-240 Kenai No. 3       1183.0       896.0        52.0   
253   

In [None]:
totals = pd.DataFrame(ak_2020_precinct.sum())
precinct_vote_total_across_districts = totals.T

In [None]:
print(precinct_vote_total_across_districts)
print(ak_2020_fed_overseas)

In [None]:
for index, row in precinct_vote_total_across_districts.iterrows():
    for race in races:
        if (row[race]==0):
            print("yes")

In [None]:
#Create some new columns for each of these races to deal with the allocation
races=['G20PRERTRU', 'G20PREDBID', 'G20PRELJOR', 'G20USSRSUL', 'G20USSDGRO',
       'G20USSOHOW', 'G20HALRYOU', 'G20HALDGAL']

for race in races:
    add_var = race+"_fed_add"
    rem_var = race+"_fed_rem"
    floor_var = race+"_fed_floor"
    ak_2020_precinct.loc[:,add_var]=0.0
    ak_2020_precinct.loc[:,rem_var]=0.0
    ak_2020_precinct.loc[:,floor_var]=0.0

for index, row in ak_2020_precinct.iterrows():
    for race in races:
            add_var = race+"_fed_add"
            rem_var = race+"_fed_rem"
            floor_var = race+"_fed_floor"
            #Get the denominator for the allocation (the precinct vote totals)
            denom = precinct_vote_total_across_districts.iloc[0][race]
            #Get one of the numerators, how many countywide votes to allocate
            numer = ak_2020_fed_overseas.iloc[0][race]
            #Get the vote totals for this race in this precinct
            val = ak_2020_precinct.at[index,race]
            #Get the vote share, the precincts % of total precinct votes in the county times votes to allocate
            vote_share = (float(val)/float(denom))*float(numer)
            ak_2020_precinct.at[index,add_var] = vote_share
            #Take the decimal remainder of the allocation
            ak_2020_precinct.at[index,rem_var] = vote_share%1
            #Take the floor of the allocation
            ak_2020_precinct.at[index,floor_var] = np.floor(vote_share)
            
#After the first pass through, get the sums of the races by county 
totals = pd.DataFrame(ak_2020_precinct.sum())
first_allocation = totals.T
 
for race in races:
    add_var = race+"_fed_add"
    rem_var = race+"_fed_rem"
    floor_var = race+"_fed_floor"
    #County how many votes still need to be allocated (because we took the floor of all the initial allocations)
    to_go = int(np.round((int(ak_2020_fed_overseas.iloc[0][race])-first_allocation.iloc[0][floor_var])))
    #Grab the n precincts with the highest remainders and round these up, where n is the # of votes that still need to be allocated
    for index in ak_2020_precinct[rem_var].nlargest(to_go).index:
        ak_2020_precinct.at[index,add_var] = np.ceil(ak_2020_precinct.at[index,add_var])

#Iterate over every race again
for race in races:
    add_var = race+"_fed_add"
    #Round every allocation down
    ak_2020_precinct.loc[:,add_var]=np.floor(ak_2020_precinct.loc[:,add_var])

In [None]:
print("Printing issues below: ")
for race in races:
    add_var = race+"_fed_add"
    add_var_2 = race+"_add"
    diff_1 = int(ak_2020_fed_overseas.iloc[0][race]-ak_2020_precinct.loc[:,add_var].sum())
    diff_2 = int(to_dole_out_totals.loc[:,race].sum()-ak_2020_precinct.loc[:,add_var_2].sum())
    if((diff_1 != 0) & (diff_2 !=0)):
        print(diff_1)
        print(diff_2)
        print("Issue with "+ race)
print("")
print("All others match")

In [None]:
print("Printing issues below: ")
for race in races:
    add_var = race+"_fed_add"
    add_var_2 = race+"_add"
    diff_1 = int(ak_2020_fed_overseas.iloc[0][race]-ak_2020_precinct.loc[:,add_var].sum())
    diff_2 = int(to_dole_out_totals.loc[:,race].sum()-ak_2020_precinct.loc[:,add_var_2].sum())
    if((diff_1 != 0) & (diff_2 !=0)):
        print(diff_1)
        print(diff_2)
        print("Issue with "+ race)
print("")
print("All others match")

In [None]:
for race in races:
    add_var = race+"_fed_add"
    #add_var_2 = race+"_add"
    ak_2020_precinct.loc[:,race]+=ak_2020_precinct.loc[:,add_var]
    #ak_2020_precinct.loc[:,race]+=ak_2020_precinct.loc[:,add_var_2]

rem_races = ["G20PREGJAN","G20PRECBLA","G20PREIPIE","G20PREOFUE"]
    
#for race in rem_races:
    #add_var_2 = race+"_add"
    #ak_2020_precinct.loc[:,race]+=ak_2020_precinct.loc[:,add_var_2]  

In [None]:
print(sum(vest_ak_20["G20PREOFUE"]))

In [None]:
ak_2020_precinct.columns

In [None]:
ak_2020_precinct = ak_2020_precinct[['Precinct','District', 'G20PRERTRU', 'G20PREDBID', 'G20PRELJOR', 'G20PREGJAN',
       'G20PRECBLA', 'G20PREIPIE', 'G20PREOFUE', 'G20USSRSUL', 'G20USSDGRO',
       'G20USSOHOW', 'G20HALRYOU', 'G20HALDGAL']]

In [None]:
print("Printing differences below:")
for race in data_columns:
    if (sum(vest_ak_20[race])-sum(ak_2020_precinct[race]) != 0):
        print(race+" has a difference of "+str(sum(vest_ak_20[race])-sum(ak_2020_precinct[race]))+" votes")
        print("\tVEST: "+str(sum(vest_ak_20[race]))+" votes")
        print("\tSOURCES: "+str(sum(ak_2020_precinct[race]))+" votes")
print("")
print("All other races are equal")

In [None]:
print(vest_ak_20)

In [None]:
print(ak_2020_precinct["District"].unique())
vest_ak_20["District_mod"] = vest_ak_20["DISTRICT"].str[0:2]
print(vest_ak_20["District_mod"].unique())

In [None]:
diff_counties=[]
for i in data_columns:
    diff = ak_2020_precinct.groupby(["District"]).sum()[i]-vest_ak_20.groupby(["District_mod"]).sum()[i]
    for val in diff[diff != 0].index.values.tolist():
        if val not in diff_counties:
            diff_counties.append(val)
    if len(diff[diff != 0]!=0):
        print(i)
        print(diff[diff != 0].to_string(header=False))
print("All other races in all counties are equal")

## Shapefile

### Load the shapefile, take a look

In [None]:
ak_shp = gp.read_file("./raw-from-source/Shapefiles/SW Proc Shape Files/2013-SW-Proc-Shape-files.shp")
ak_shp.head(1)

### Look at election results and shapefile to try to find a unique ID

In [None]:
#Notice that the full Name has a bit of odd formatting in the shapefile, but the first 6 digits appear to match

print(ak_shp["NAME"])
print(ak_2020_precinct["Precinct"])

### Use just the first 6 digits of Name and Precinct as an alternate unique ID, check if unique

In [None]:
ak_2020_precinct["join_col"]=ak_2020_precinct["Precinct"].str[0:6]
ak_shp["join_col"]=ak_shp["NAME"].str[0:6]

print(ak_shp["join_col"].value_counts())
print(ak_2020_precinct["join_col"].value_counts())

### Attempt to join using this - looks good!

In [None]:
first_join = pd.merge(ak_2020_precinct,ak_shp,how="outer",on="join_col",indicator=True)
print(first_join["_merge"].value_counts())

## Joining to VEST file

### Use VEST's district as a join column

In [None]:
vest_ak_20["join_col"]=vest_ak_20["DISTRICT"]

### Attempt to join

In [None]:
final_merge = pd.merge(vest_ak_20,first_join,how="outer",on="join_col",indicator="final_merge")
print(final_merge["final_merge"].value_counts())

## Check election results precinct-by-precinct

In [None]:
def validater_row (df, column_List):
    matching_rows = 0
    different_rows = 0
    diff_list=[]
    diff_values = []
    max_diff = 0
    for j in range(0,len(df.index)):
        same = True
        for i in column_List:
            left_Data = i + "_x"
            right_Data = i + "_y"
            diff = abs(df.iloc[j][left_Data]-df.iloc[j][right_Data])
            if(diff >0):
                if(diff>0):
                    print(i, "{:.>72}".format(df.iloc[j]["join_col"]), "(V)","{:.>5}".format(int(df.iloc[j][left_Data]))," (S){:.>5}".format(int(df.iloc[j][right_Data])),"(D):{:>5}".format(int(df.iloc[j][right_Data])-int(df.iloc[j][left_Data])))           
                #print(df.iloc[j]['countypct'])
                
                diff_values.append(abs(diff))
                same = False
                if(np.isnan(diff)):
                    print("NaN value at diff is: ", df.iloc[j]["join_col"])
                    print(df.iloc[j][left_Data])
                    print(df.iloc[j][right_Data])
                if (diff>max_diff):
                    max_diff = diff
                    #print("New max diff is: ", str(max_diff))
                    #print(df.iloc[j]['cty_pct'])
        if(same != True):
            different_rows +=1
            diff_list.append(df.iloc[j]["join_col"])
        else:
            matching_rows +=1
    print("")
    print("There are ", len(df.index)," total rows")
    print(different_rows," of these rows have election result differences")
    print(matching_rows," of these rows are the same")
    print("")
    print("The max difference between any one shared column in a row is: ", max_diff)
    if(len(diff_values)!=0):
        print("The average difference is: ", str(sum(diff_values)/len(diff_values)))
    count_big_diff = len([i for i in diff_values if i > 10])
    print("There are ", str(count_big_diff), "precinct results with a difference greater than 10")
    diff_list.sort()
    print(diff_list)

In [None]:
validater_row(final_merge,races)

Note: If you scroll down in the above block, you can read the following:
        
> There are  441  total rows
439  of these rows have election result differences
2  of these rows are the same

> The max difference between any one shared column in a row is:  4.0
The average difference is:  1.183591795897949

## Check geometries

In [None]:
both = final_merge[final_merge["final_merge"]=="both"]
vest_geoms = gp.GeoDataFrame(both,geometry="geometry_x",crs=vest_ak_20.crs)
source_geoms = gp.GeoDataFrame(both,geometry="geometry_y",crs=vest_ak_20.crs)
source_geoms = source_geoms.to_crs(3857)
vest_geoms = vest_geoms.to_crs(3857)
source_geoms["geometry_x"]=source_geoms.buffer(0)
vest_geoms["geometry_y"]=vest_geoms.buffer(0)
vals = source_geoms.geom_almost_equals(vest_geoms,decimal=0)
print(vals.value_counts())

In [None]:
count = 0
area_list = []
for i in range(0,len(source_geoms)):
    diff = source_geoms.iloc[[i]].symmetric_difference(vest_geoms.iloc[[i]])
    intersection = source_geoms.iloc[[i]].intersection(vest_geoms.iloc[[i]])
    area = float(diff.area/10e6)
    area_list.append(area)
    #print("Area is " + str(area))

    if (area > .1):
        count += 1
        name = source_geoms.iat[i,2]
        
        print(str(count)+") For " + name + " difference in area is " + str(area))
        if (intersection.iloc[0].is_empty):
            base = diff.plot(color="red")
            source_geoms.iloc[[i]].plot(color="orange",ax=base)
            vest_geoms.iloc[[i]].plot(color="blue",ax=base)
            base.set_title(name)
        else:
            base = diff.plot(color="red")
            source_geoms.iloc[[i]].plot(color="orange",ax=base)
            vest_geoms.iloc[[i]].plot(color="blue",ax=base)
            intersection.plot(color="green",ax=base)
            base.set_title(name)

In [None]:
df = pd.DataFrame(area_list)
print(df.shape)

print(str(len(df[df[0]==0]))+" precincts w/ a difference of 0 km^2")
print(str(len(df[(df[0]<.1) & (df[0]>0)]))+ " precincts w/ a difference between 0 and .1 km^2")
print(str(len(df[(df[0]<.5) & (df[0]>=.1)]))+ " precincts w/ a difference between .1 and .5 km^2")
print(str(len(df[(df[0]<1) & (df[0]>=.5)]))+ " precincts w/ a difference between .5 and 1 km^2")
print(str(len(df[(df[0]<2) & (df[0]>=1)]))+ " precincts w/ a difference between 1 and 2 km^2")
print(str(len(df[(df[0]<5) & (df[0]>=2)]))+ " precincts w/ a difference between 2 and 5 km^2")
print(str(len(df[(df[0]>=5)]))+ " precincts w/ a difference greater than 5 km^2")