In [1]:
import matplotlib.pyplot as plt # for plotting maps
import maup # mggg's library for proration, see documentation here: https://github.com/mggg/maup
import pandas as pd # standard python data library
import geopandas as gp # the geo-version of pandas
import numpy as np 
from statistics import mean, median
from pandas import read_csv

**Load datasets**

In [2]:
#Load VTD shapefile
#NOTE: This has all the relevant data from the MGGG file except for '12 Presidential & '16 Election results, demographic data

vtds=gp.read_file("./raw-from-files/VTD _shapefile/R2016_Assignment_Layers.gdb")

#Load Block-VTD Key File to Aggregate Demographic Data to the VTD Level

block_keys=pd.read_csv("./raw-from-files/block_keys/Block_Level_GeoKeys.tab",sep="\t",engine='python')

#2012 Election Data 

election_data_2012=pd.read_csv("./raw-from-files/election_data/2012 _election/results_pct_20121106.txt",delimiter=",")

#2016 Election Data 

election_data_2016=pd.read_csv("./raw-from-files/election_data/2016_election/results_pct_20161108!.txt", sep="\t",engine='python')

#MGGG Assigns districts from 2011, 2016, and new judge enacted plan 
#Taken from NCLEG Congressional Districts Shapefile

newplan= gp.read_file('./raw-from-files/district_plan/newplan/shp (3)/2016_Contingent_Congressional_Plan_Corrected.shp')

oldplan=gp.read_file('./raw-from-files/district_plan/oldplan/shp (3)/Rucho_Lewis_Congress_3.shp')

judge=gp.read_file('./raw-from-files/district_plan/judge/HB1029 3rd Edition - Shapefile/C-Goodwin-A-1-TC.shp')

#MGGG's File (Loading now to perform a quick check that the vtd shapefile is the correct one)

final_mggg=gp.read_file('./raw-from-files/mggg_file/NC_VTD/NC_VTD.shp')


In [None]:
print(vtds.plot())

In [None]:
final_mggg.newplan.plot()

In [None]:
print(vtds.columns)

**Check to Make Sure Shapefile is the Right One**

In [None]:
final_mggg["geometry"]=final_mggg.buffer(0)
vtds["geometry"]=vtds.buffer(0)
proj = vtds.crs
final_mggg = final_mggg.to_crs(proj)
file = vtds.difference(final_mggg)
print(sum(file.area)/sum(vtds.area))
print(final_mggg.geom_almost_equals(vtds,decimal=6))

**Take a Look at VTD File as well as MGGG File to see what is present / missing**

In [3]:
#print(vtds.columns)
print(final_mggg.columns)

Index(['ALAND10', 'AWATER10', 'VTD', 'County', 'VTD_Key', 'VTD_Name',
       'PL10AA_TOT', 'PL10VA_TOT', 'EL08G_GV_D', 'EL08G_GV_R', 'EL08G_GV_L',
       'EL08G_GV_T', 'EL08G_USS_', 'EL08G_US_1', 'EL08G_US_2', 'EL08G_US_3',
       'EL08G_US_4', 'EL10G_USS_', 'EL10G_US_1', 'EL10G_US_2', 'EL10G_US_3',
       'EL10G_US_4', 'EL12G_GV_D', 'EL12G_GV_R', 'EL12G_GV_L', 'EL12G_GV_W',
       'EL12G_GV_1', 'EL12G_GV_T', 'EL14G_USS_', 'EL14G_US_1', 'EL14G_US_2',
       'EL14G_US_3', 'EL14G_US_4', 'Shape_Leng', 'Shape_Area', 'EL12G_PR_D',
       'EL12G_PR_R', 'EL12G_PR_L', 'EL12G_PR_W', 'EL12G_PR_1', 'EL12G_PR_T',
       'EL16G_PR_R', 'EL16G_PR_D', 'EL16G_PR_L', 'EL16G_PR_W', 'EL16G_PR_T',
       'EL16G_USS_', 'EL16G_US_1', 'EL16G_US_2', 'EL16G_US_3', 'EL16G_GV_D',
       'EL16G_GV_R', 'EL16G_GV_L', 'EL16G_GV_T', 'BPOP', 'nBPOP', 'judge',
       'newplan', 'oldplan', 'TOTPOP', 'NH_WHITE', 'NH_BLACK', 'NH_AMIN',
       'NH_ASIAN', 'NH_NHPI', 'NH_OTHER', 'NH_2MORE', 'HISP', 'H_WHITE',
       'H_BLACK

In [4]:
#pd.set_option('display.max_columns', None)
#display(vtds)
#display(final_mggg)

**Rename and Clean Columns in VTD File**

In [5]:
#Renaming the 2008 Election Data
vtds = vtds.rename(columns={"EL08G_GV_TOT": "EL08G_GV_T","EL08G_USS_D": "EL08G_USS_","EL08G_USS_R": "EL08G_US_1","EL08G_USS_L": "EL08G_US_2","EL08G_USS_W": "EL08G_US_3","EL08G_USS_TOT": "EL08G_US_4"})

#Renaming the 2010 Election Data
vtds = vtds.rename(columns={"EL10G_USS_D": "EL10G_USS_","EL10G_USS_R": "EL10G_US_1","EL10G_USS_L": "EL10G_US_2","EL10G_USS_W": "EL10G_US_3","EL10G_USS_TOT": "EL10G_US_4"})

#Renaming the 2012 Election Data Columns to match MGGG-NC
#(ASK ABOUT THIS) From merge_data: column EL12G_GV_W2 > MGGG-NC Column EL12G_GV_1 (Total number of votes for 2012 gubernatorial candidates)
#(ASK ABOUT THIS) From merge_data: column EL12G_GV_W1 > MGGG-NC Column EL12G_GV_W (Total number of votes for Donald Kreamer as 2012 gubernatorial candidates)
vtds = vtds.rename(columns={"EL12G_GV_W1": "EL12G_GV_W","EL12G_GV_W2": "EL12G_GV_1","EL12G_GV_TOT": "EL12G_GV_T"})

#Renaming the 2014 Election Data
vtds = vtds.rename(columns={"EL14G_USS_R": "EL14G_USS_","EL14G_USS_D": "EL14G_US_1","EL14G_USS_L": "EL14G_US_2","EL14G_USS_WI": "EL14G_US_3","EL14G_USS_TOT": "EL14G_US_4"})

#Rename shapefile data
vtds = vtds.rename(columns={"Shape_Length":"Shape_Leng"})

In [6]:
print(vtds.columns)

Index(['ALAND10', 'AWATER10', 'VTD', 'County', 'VTD_Key', 'VTD_Code',
       'VTD_Name', 'PL10AA_TOT', 'PL10VA_TOT', 'EL08G_AG_D', 'EL08G_AG_R',
       'EL08G_AG_TOT', 'EL08G_AD_D', 'EL08G_AD_R', 'EL08G_AD_TOT',
       'EL08G_CA_D', 'EL08G_CA_R', 'EL08G_CA_TOT', 'EL08G_CI_D', 'EL08G_CI_R',
       'EL08G_CI_L', 'EL08G_CI_W', 'EL08G_CI_TOT', 'EL08G_CL_D', 'EL08G_CL_R',
       'EL08G_CL_TOT', 'EL08G_GV_D', 'EL08G_GV_R', 'EL08G_GV_L', 'EL08G_GV_T',
       'EL08G_LG_D', 'EL08G_LG_R', 'EL08G_LG_L', 'EL08G_LG_TOT', 'EL08G_SPI_D',
       'EL08G_SPI_R', 'EL08G_SPI_TOT', 'EL08G_USS_', 'EL08G_US_1',
       'EL08G_US_2', 'EL08G_US_3', 'EL08G_US_4', 'EL10G_USS_', 'EL10G_US_1',
       'EL10G_US_2', 'EL10G_US_3', 'EL10G_US_4', 'EL12G_GV_D', 'EL12G_GV_R',
       'EL12G_GV_L', 'EL12G_GV_W', 'EL12G_GV_1', 'EL12G_GV_T', 'EL12G_LG_D',
       'EL12G_LG_R', 'EL12G_LG_TOT', 'EL12G_AD_D', 'EL12G_AD_R',
       'EL12G_AD_TOT', 'EL12G_CA_D', 'EL12G_CA_R', 'EL12G_CA_TOT',
       'EL12G_CI_D', 'EL12G_CI_R', 'EL12G

In [7]:
vtds = vtds[['ALAND10', 'AWATER10', 'VTD', 'County', 'VTD_Key', 'VTD_Name',
       'PL10AA_TOT', 'PL10VA_TOT', 'EL08G_GV_D', 'EL08G_GV_R', 'EL08G_GV_L',
       'EL08G_GV_T', 'EL08G_USS_', 'EL08G_US_1', 'EL08G_US_2', 'EL08G_US_3',
       'EL08G_US_4', 'EL10G_USS_', 'EL10G_US_1', 'EL10G_US_2', 'EL10G_US_3',
       'EL10G_US_4', 'EL12G_GV_D', 'EL12G_GV_R', 'EL12G_GV_L', 'EL12G_GV_W',
       'EL12G_GV_1', 'EL12G_GV_T', 'EL14G_USS_', 'EL14G_US_1', 'EL14G_US_2',
       'EL14G_US_3', 'EL14G_US_4', 'Shape_Leng', 'Shape_Area','geometry']]

Remaining work: Add in demographic data, district assignments from 3 plans and '12 presidential + '16 results.

**Load Census API**

Retrieve demographic data from Census API

To see the full list of census variables, look here: https://api.census.gov/data/2010/dec/sf1/variables.html

In [None]:
# Code chunk by Melisande Teng - VRDI 2019
variables = [
    # pop
    "P005001",
    "P005003",
    "P005004",
    "P005005",
    "P005006",
    "P005007",
    "P005008",
    "P005009",
    "P005010",
    "P005011",
    "P005012",
    "P005013",
    "P005014",
    "P005015",
    "P005016",
    "P005017",
    # vap
    "P011001",
    "P011002",
    "P011005",
    "P011006",
    "P011007",
    "P011008",
    "P011009",
    "P011010",
    "P011011",
]

keys = [
    # pop
    "TOTPOP",  # these follow from above, so TOTPOP = P005001
    "NH_WHITE",
    "NH_BLACK",
    "NH_AMIN",
    "NH_ASIAN",
    "NH_NHPI",
    "NH_OTHER",
    "NH_2MORE",
    "HISP",
    "H_WHITE",
    "H_BLACK",
    "H_AMIN",
    "H_ASIAN",
    "H_NHPI",
    "H_OTHER",
    "H_2MORE",
    # vap
    "VAP",
    "HVAP",
    "WVAP",
    "BVAP",
    "AMINVAP",
    "ASIANVAP",
    "NHPIVAP",
    "OTHERVAP",
    "2MOREVAP",
]

import requests

def counties(state_fips):
    """Inputs: state fips code
    Process: Retrieves a list of counties in the given state from the Census API.  
    Outputs: A list of county fips codes in the state. """
    resp = requests.get(
        "https://api.census.gov/data/2010/dec/sf1"
        "?get=NAME&for=county:*&in=state:{}".format(state_fips)
    )
    header, *rows = resp.json()
    county_column_index = header.index("county")
    county_fips_codes = set(row[county_column_index] for row in rows)
    return county_fips_codes


def block_data_for_county(state_fips, county_fips, variables=variables, keys=keys):
    """Inputs: state fips code, county fips code, list of variables and corresponding keys. 
    Process: formats the query to call the Census API. 
    Outputs: data for the county in a pandas dataframe. """
    url = (
        "https://api.census.gov/data/2010/dec/sf1"
        + "?get={},NAME&for=block:*".format(",".join(variables))
        + "&in=state:{}&in=county:{}&in=tract:*".format(state_fips, county_fips)
    )
    resp = requests.get(url)
    header, *rows = resp.json()
    variable_lookup = dict(zip(variables, keys))
    columns = [variable_lookup.get(column_name, column_name) for column_name in header]
    dtypes = {key: int for key in keys}
    dtypes.update({key: str for key in ["state", "county", "tract", "block"]})
    data = pd.DataFrame.from_records(rows, columns=columns).astype(dtypes)
    data["geoid"] = data["state"] + data["county"] + data["tract"] + data["block"]
    return data


def block_data_for_state(state_fips):
    """Input: state_fips
    Process: Retrieve a list of county fips codes in the state. 
        The Census API only lets you query one county at a time, so you first need the full list. 
        Then call block_data_for_county() to retrieve the data at the county level. 
    Outputs: block-level data for the state fips for the list of variables defined above. 
    """
    from tqdm.auto import tqdm
    county_fips_codes = counties(state_fips)
    return pd.concat(
        [
            block_data_for_county(state_fips, county_fips)
            for county_fips in tqdm(county_fips_codes)
        ]
    )

In [None]:
#if running script for the first time, use this chunk
nc = block_data_for_state('37')
nc = nc.rename(columns={"geoid": "GEOID10"})
nc.to_csv('nc_census_2010_blocks2.csv')

Example of how you would write the census data to a csv and then re-load it so you don't have to query it every time you re-run this script


In [8]:
nc = pd.read_csv('./raw-from-files/census_data/nc_census_2010_blocks2.csv')
nc = nc.astype({"GEOID10": str})

Take a look at the census data

Variables that aren't aren't added (BPOP and nBPOP) can be calculated from the data (Will see that this calculation is the same as the one the census performs)

In [9]:
nc["BPOP"]=nc["NH_BLACK"]+nc["H_BLACK"]
nc["nBPOP"]=nc["TOTPOP"]-nc["NH_BLACK"]-nc["H_BLACK"]

In [None]:
nc.head(2)

**Prorate Demographic and Voting Population Data from the Block Level to VTD using Block_Keys**

Using Block Keys because that was the method MGGG specified. 

In [10]:
print(block_keys.shape)
print(nc.shape)
nc = nc.rename(columns={"GEOID10": "Block_Key"})
nc.head(5)
block_keys.head(5)

(288987, 17)
(288987, 34)


Unnamed: 0,Block_Key,Block_Code,BG_Key,BG_Code,Tract_Key,Tract_Code,Tract_Name,VTD_Key,VTD_Code,VTD_Name,Cnty_Key,Cnty_Code,Cnty_Name,Place_Key,Place_Code,Place_Name,ZCTA5
0,370010201001000,1000,370010201001,1,37001020100,20100,201.0,3700112W,12W,12W,37001,1,Alamance,3709060.0,9060.0,Burlington,27215.0
1,370010201001001,1001,370010201001,1,37001020100,20100,201.0,3700112W,12W,12W,37001,1,Alamance,3709060.0,9060.0,Burlington,27215.0
2,370010201001002,1002,370010201001,1,37001020100,20100,201.0,3700112W,12W,12W,37001,1,Alamance,3709060.0,9060.0,Burlington,27215.0
3,370010201001003,1003,370010201001,1,37001020100,20100,201.0,3700112W,12W,12W,37001,1,Alamance,3709060.0,9060.0,Burlington,27217.0
4,370010201001004,1004,370010201001,1,37001020100,20100,201.0,3700112W,12W,12W,37001,1,Alamance,3709060.0,9060.0,Burlington,27215.0


Convert both "Block_Key" columns to integers so they can be joined

In [11]:
nc["Block_Key"]=nc["Block_Key"].astype(int)
block_keys["Block_Key"]=block_keys["Block_Key"].astype(int)

In [12]:
block_joined=pd.merge(nc,block_keys,on='Block_Key',how='outer',indicator=True)

In [13]:
print(block_joined.head(10))
print(block_joined.shape)

   Unnamed: 0  TOTPOP  NH_WHITE  NH_BLACK  NH_AMIN  NH_ASIAN  NH_NHPI  \
0           0      22         0        22        0         0        0   
1           1      10         2         8        0         0        0   
2           2       0         0         0        0         0        0   
3           3       0         0         0        0         0        0   
4           4       0         0         0        0         0        0   
5           5       0         0         0        0         0        0   
6           6       0         0         0        0         0        0   
7           7      27        25         1        0         0        0   
8           8      39        32         1        0         0        0   
9           9      18         2        16        0         0        0   

   NH_OTHER  NH_2MORE  HISP  ...  VTD_Code  VTD_Name  Cnty_Key  Cnty_Code  \
0         0         0     0  ...        C1        C1     37015         15   
1         0         0     0  ...        C1

In [14]:
block_joined.columns

Index(['Unnamed: 0', 'TOTPOP', 'NH_WHITE', 'NH_BLACK', 'NH_AMIN', 'NH_ASIAN',
       'NH_NHPI', 'NH_OTHER', 'NH_2MORE', 'HISP', 'H_WHITE', 'H_BLACK',
       'H_AMIN', 'H_ASIAN', 'H_NHPI', 'H_OTHER', 'H_2MORE', 'VAP', 'HVAP',
       'WVAP', 'BVAP', 'AMINVAP', 'ASIANVAP', 'NHPIVAP', 'OTHERVAP',
       '2MOREVAP', 'NAME', 'state', 'county', 'tract', 'block', 'Block_Key',
       'BPOP', 'nBPOP', 'Block_Code', 'BG_Key', 'BG_Code', 'Tract_Key',
       'Tract_Code', 'Tract_Name', 'VTD_Key', 'VTD_Code', 'VTD_Name',
       'Cnty_Key', 'Cnty_Code', 'Cnty_Name', 'Place_Key', 'Place_Code',
       'Place_Name', 'ZCTA5', '_merge'],
      dtype='object')

Both dataframes originally had 288,987 rows so join looks good

In [None]:
#list(block_joined.columns)

**Going to pivot this dataframe by the "VTD_Key" column

**Pivot Block Level Census Data to VTD Level by VTD Key**

In [15]:
census_vtd=pd.pivot_table(block_joined,index="VTD_Key",aggfunc=sum)
census_vtd.reset_index(inplace=True)

**Check to make sure that VTD row number makes sense**

In [16]:
print(census_vtd.shape)
print(vtds.shape)

(2692, 45)
(2692, 36)


**Filter Down to Relevant Census Info**

In [17]:
census_vtd.columns

Index(['VTD_Key', '2MOREVAP', 'AMINVAP', 'ASIANVAP', 'BG_Code', 'BG_Key',
       'BPOP', 'BVAP', 'Block_Code', 'Block_Key', 'Cnty_Code', 'Cnty_Key',
       'HISP', 'HVAP', 'H_2MORE', 'H_AMIN', 'H_ASIAN', 'H_BLACK', 'H_NHPI',
       'H_OTHER', 'H_WHITE', 'NHPIVAP', 'NH_2MORE', 'NH_AMIN', 'NH_ASIAN',
       'NH_BLACK', 'NH_NHPI', 'NH_OTHER', 'NH_WHITE', 'OTHERVAP', 'Place_Code',
       'Place_Key', 'TOTPOP', 'Tract_Code', 'Tract_Key', 'Tract_Name',
       'Unnamed: 0', 'VAP', 'WVAP', 'ZCTA5', 'block', 'county', 'nBPOP',
       'state', 'tract'],
      dtype='object')

In [18]:
column_List = ['VTD_Key','BPOP','nBPOP','TOTPOP', 'NH_WHITE', 'NH_BLACK', 'NH_AMIN','NH_ASIAN', 'NH_NHPI', 'NH_OTHER', 'NH_2MORE', 'HISP', 'H_WHITE','H_BLACK', 'H_AMIN', 'H_ASIAN', 'H_NHPI', 'H_OTHER', 'H_2MORE', 'VAP','HVAP', 'WVAP', 'BVAP', 'AMINVAP', 'ASIANVAP', 'NHPIVAP', 'OTHERVAP','2MOREVAP']

In [20]:
print(census_vtd.head())
census_vtd=census_vtd[column_List]
print(census_vtd.head())

    VTD_Key  2MOREVAP  AMINVAP  ASIANVAP  BG_Code          BG_Key  BPOP  BVAP  \
0   3700101        19        8         8      243  51431420305473   309   257   
1   3700102        15       17         9      237  56981573564267   104    81   
2  37001035        58       16        62      298  45511256123688   514   421   
3  3700103C        16        1        57      155  33300918762265   207   148   
4  3700103N        46        1        77      231  45141246366251   426   350   

   Block_Code          Block_Key  ...  Tract_Name  Unnamed: 0   VAP  WVAP  \
0      246909  51431420305476880  ...    30305.23      530268  3686  3309   
1      241978  56981573564271952  ...    33564.03      562893  3483  3321   
2      300625  45511256123690592  ...    26123.39      280251  5660  4927   
3      157056  33300918762267044  ...    18762.11      125764  2153  1884   
4      232844  45141246366252808  ...    26366.02      373950  4955  4378   

       ZCTA5   block  county  nBPOP  state    trac

**Merge Census VTD Data with the VTD Shapefile by "VTD Key"**

In [21]:
vtds_joined = pd.merge(census_vtd,vtds,on='VTD_Key',how='outer',indicator=True)

**Check to make sure that joined VTD row number makes sense**

In [22]:
print(vtds_joined.shape)

(2692, 64)


**Next Step: Tie VTDs to their District Assignment Across the 3 Plans**

MGGG Assigns districts from 2011, 2016, and new judge enacted plan 
Taken from NCLEG Congressional Districts Shapefile

"newplan" is the 2016 plan  
"oldplan" is the 2011 plan  
"judge" is the new judge plan  

**Set Shapefile Projections**

In [23]:
vtds_joined = gp.GeoDataFrame(vtds_joined, geometry='geometry')
vtds_joined["geometry"]=vtds_joined.buffer(0)
proj=vtds_joined.crs

***2016 plan - "newplan"***

In [24]:
import warnings; warnings.filterwarnings('ignore', 'GeoSeries.isna', UserWarning)
#print(newplan)
newplan=newplan.to_crs(proj)
newplan["geometry"] = newplan.buffer(0)
assignments=maup.assign(vtds_joined,newplan)
assignments=assignments+1
#There are 13 Congressional Districts in North Carolina, but there are no 0 districts, 
#We have to add 1.0 to each districts because Python indexing starts at 0.
vtds_joined["newplan"]=assignments

**2011 plan - "oldplan"**

In [25]:
import warnings; warnings.filterwarnings('ignore', 'GeoSeries.isna', UserWarning)
#print(oldplan)
oldplan=oldplan.to_crs(proj)
oldplan["geometry"] = oldplan.buffer(0)
assignments=maup.assign(vtds_joined,oldplan)
assignments=assignments+1
#There are 13 Congressional Districts in North Carolina, but there are no 0 districts, 
#We have to add 1.0 to each districts because Python indexing starts at 0.
vtds_joined["oldplan"]=assignments

**Judge plan -"judge"**

In [26]:
import warnings; warnings.filterwarnings('ignore', 'GeoSeries.isna', UserWarning)
#print(oldplan)
judge=judge.to_crs(proj)
judge["geometry"] = judge.buffer(0)
assignments=maup.assign(vtds_joined,judge)
assignments=assignments+1
#There are 13 Congressional Districts in North Carolina, but there are no 0 districts, 
#We have to add 1.0 to each districts because Python indexing starts at 0.
vtds_joined["judge"]=assignments

**Next Step: Joining '12 Presidential and '16 Election Data**

In [None]:
election_data_2012.head(2)
election_data_2016.head(2)

In [None]:
#filtering the 2012 data
office_list_2012=["PRESIDENT AND VICE PRESIDENT OF THE UNITED STATES"]
party_list=["REP","DEM","LIB","Write_In"]
election_data_2012=election_data_2012[election_data_2012['contest'].isin(office_list_2012)]
election_data_2012=election_data_2012[election_data_2012['party'].isin(party_list)]

#pivoting the 2012 data
election_data_2012_p=pd.pivot_table(election_data_2012,index="VTD_Key",columns=["party"],values='total votes',aggfunc=sum)
election_data_2012_p.columns=["EL12G_PR_D","EL12G_PR_L","EL12G_PR_R"]
election_data_2012_p.reset_index(inplace=True)

#filtering the 2016 data
office_list_2016=["US PRESIDENT","US SENATE","NC GOVERNOR"]
party_list=["REP","DEM","LIB"]
election_data_2016=election_data_2016[election_data_2016['Contest Name'].isin(office_list_2016)]
election_data_2016=election_data_2016[election_data_2016['Choice Party'].isin(party_list)]
#election_2016=election_2016_filter[election_2016_filter['Choice'].isin(write_in)]

#pivoting the 2016 data
election_data_2016_p=pd.pivot_table(election_data_2016,index="VTD_Key",columns=["Contest Name","Choice Party"],values='Total Votes',aggfunc=sum)
election_data_2016_p.columns=["EL16G_GV_D","EL16G_GV_L","EL16G_GV_R","EL16G_PR_D","EL16G_PR_L","EL16G_PR_R","EL16G_US_1","EL16G_US_2","EL16G_USS_"]
election_data_2016_p.reset_index(inplace=True)
election_data_2016_p.head(2)


In [None]:
#create dictionary of state + county fips

my_county= {
    'ALAMANCE':'37001', 'ALEXANDER':'37003','ANSON':'37007','ASHE':'37009', 'AVERY':'37011', 'BEAUFORT':'37013', 'BERTIE':'37015', 'BLADEN':'37017',  'BRUNSWICK':'37019',
    'BUNCOMBE':'37021','BURKE':'37023','CABARRUS':'37025','CALDWELL':'37027','CAMDEN':'37029','CARTERET':'37031','CASWELL':'37033','CATAWBA':'37035','CHATHAM':'37037','CHEROKEE':'37039','CHOWAN':'37041','CLAY':'37043','CLEVELAND':'37045',
    'COLUMBUS':'37047','CRAVEN':'37049','CUMBERLAND':'37051','CURRITICK':'37053','DARE':'37055','DAVIDSON':'37057','DAVIE':'37059','DUPLIN':'37061','DURHAM':'37063','EDGECOMBE':'37065',
    'FORSYTH':'37067','FRANKLIN':'37069','GASTON':'37071','GATES':'37073','GRAHAM':'37075','GRANVILLE':'37077',
    'GREENE':'37079','GUILFORD':'37081','HALIFAX':'37083','HARNETT':'37085','HAYWOOD':'37087','HENDERSON':'37089','HERTFORD':'37091','HOKE':'37093','HYDE':'37095',
    'IREDELL':'37097','JACKSON':'37099','JOHNSTON':'37101','JONES':'37103','LEE':'37105','LENOIR':'37107','LINCOLN':'37109','MCDOWELL':'37111','MACON':'37113','MADISON':'37115',
    'MERTIN':'37117','MECKLENBURG':'37119','MITCHELL':'37121','MONTGOMERY':'37123','MOORE':'37125','NASH':'37127','NEW HANOVER':'37129','NORTHAMPTON':'37131','ONSLOW':'37133','ORANGE':'37135','PAMLICO':'37137','PASQUOTANK':'37139','PENDER':'37141','PERQUIMANS':'37143','PERSON':'37145','PITT':'37147','POLK':'37149','RANDOLPH':'37151',
    'RICHMOND':'37153','ROCKINGHAM':'37157','ROWAN':'37159','RUTHERFORD':'37161','SAMPSON':'37163','SCOTLAND':'37165','STANLY':'37167','STOKES':'37169','SURRY':'37171','SWAIN':'37173','TRANSYLVANIA':'37175','TYRRELL':'37177','UNION':'37179','VANCE':'37181','WAKE':'37183','WARREN':'37185','WASHINGTON':'37187','WATAUGA':'37189','WAYNE':'37191','WILKES':'37193','WILSON':'37195','YADKIN':'37197',  'YANCEY':'37199'
}

*Election Data 2012*

In [None]:
election_data_2012['countyID']=election_data_2012['county'].map(my_county)
election_data_2016['countyID']=election_data_2012['county'].map(my_county)

#Preparation to string together VTD_Key
election_data_2012['countyID']=election_data_2012['countyID'].astype(str)
election_data_2012['precinct']=election_data_2012['precinct'].astype(str)
election_data_2012['precinctID']=election_data_2012['precinct'].str.zfill(2)
election_data_2012["precinctID"]=election_data_2012["precinct"].str.split("_", n=1, expand=True) 

#Combining state id, county fips, and precinct id to make VTD_Key
election_data_2012['VTD_Key']=election_data_2012['countyID']+election_data_2012['precinctID']

#Preparation to string together VTD_Key
election_data_2016['countyID']=election_data_2016['countyID'].astype(str)
election_data_2016['Precinct']=election_data_2016['Precinct'].astype(str)
election_data_2016['precinctID']=election_data_2016['Precinct'].str.zfill(2)

#Combining state id, county fips, and precinct id to make VTD_Key
election_data_2016['VTD_Key']=election_data_2016['countyID']+election_data_2016['precinctID']

In [None]:
#Removing rows that has 'ONE STOP', 'ABSENTEE BY MAIL','PROVISIONAL', 'TRANSFER','CURBSIDE', 'ACCUMULATED'
#create a dataframe that has the above words

searchfor=['ABSENTEE','PROVISIONAL','TRANSFER']
#nan=['nan']
#remove rows that contains words from searchfor dataframe

election_data_2012=election_data_2012[~election_data_2012.precinct.str.contains('|'.join(searchfor))]
#election_data_2012=election_data_2012[~election_data_2012.county.str.contains('|'.join(nan))]

#Filling in empty columns in party column to write in 
#election_data_2012.loc[election_data_2012["party"].isnull(),'party']="Write_In"

In [None]:
election_data_2012_p.shape

In [None]:
contain_values=election_data_2012[election_data_2012['VTD_Key'].str.contains('3715920')&election_data_2012['total votes']==None]
print(contain_values)

*Election Data 2016*

In [None]:
#Removing rows that has 'ONE STOP', 'ABSENTEE BY MAIL','PROVISIONAL', 'TRANSFER','CURBSIDE', 'ACCUMULATED'
#create a dataframe that has the above words

#searchfor=['ABSENTEE','PROVISIONAL','TRANSFER']
#nan=['nan']
#remove rows that contains words from searchfor dataframe

#election_data_2016=election_data_2016[~election_data_2016.Precinct.str.contains('|'.join(searchfor))]
#election_data_2016=election_data_2016[~election_data_2016.County.str.contains('|'.join(nan))]

In [None]:
contain_values=election_data_2012[election_data_2012['VTD_Key'].str.contains('3715920')]
print(contain_values)


In [None]:
election_data=pd.merge(vtd_election_data,election_data_2012_p,on='VTD_Key',how='outer',indicator='EL12')
election_data=pd.merge(election_data,election_data_2016_p,on='VTD_Key',how='outer',indicator='EL16')


In [None]:
election_data.shape

In [None]:
election_data.to_csv('election_data_joined.csv')

**Look at MGGG FINAL FILE (loaded above)**

In [27]:
final_mggg.head(2)

Unnamed: 0,ALAND10,AWATER10,VTD,County,VTD_Key,VTD_Name,PL10AA_TOT,PL10VA_TOT,EL08G_GV_D,EL08G_GV_R,...,VAP,HVAP,WVAP,BVAP,AMINVAP,ASIANVAP,NHPIVAP,OTHERVAP,2MOREVAP,geometry
0,1951716.0,32157.0,3700106W,37001,3700106W,06W,1973,1505,330,301,...,1505,211,899,338,4,32,0,0,21,"POLYGON ((572318.656 259054.912, 572364.420 25..."
1,2887286.0,0.0,3700112E,37001,3700112E,12E,3391,2503,586,263,...,2503,622,1085,712,19,38,0,2,25,"POLYGON ((572318.656 259054.912, 572310.834 25..."


In [28]:
final_mggg.shape
print(final_mggg.columns.tolist())
column_Order = final_mggg.columns.tolist()

['ALAND10', 'AWATER10', 'VTD', 'County', 'VTD_Key', 'VTD_Name', 'PL10AA_TOT', 'PL10VA_TOT', 'EL08G_GV_D', 'EL08G_GV_R', 'EL08G_GV_L', 'EL08G_GV_T', 'EL08G_USS_', 'EL08G_US_1', 'EL08G_US_2', 'EL08G_US_3', 'EL08G_US_4', 'EL10G_USS_', 'EL10G_US_1', 'EL10G_US_2', 'EL10G_US_3', 'EL10G_US_4', 'EL12G_GV_D', 'EL12G_GV_R', 'EL12G_GV_L', 'EL12G_GV_W', 'EL12G_GV_1', 'EL12G_GV_T', 'EL14G_USS_', 'EL14G_US_1', 'EL14G_US_2', 'EL14G_US_3', 'EL14G_US_4', 'Shape_Leng', 'Shape_Area', 'EL12G_PR_D', 'EL12G_PR_R', 'EL12G_PR_L', 'EL12G_PR_W', 'EL12G_PR_1', 'EL12G_PR_T', 'EL16G_PR_R', 'EL16G_PR_D', 'EL16G_PR_L', 'EL16G_PR_W', 'EL16G_PR_T', 'EL16G_USS_', 'EL16G_US_1', 'EL16G_US_2', 'EL16G_US_3', 'EL16G_GV_D', 'EL16G_GV_R', 'EL16G_GV_L', 'EL16G_GV_T', 'BPOP', 'nBPOP', 'judge', 'newplan', 'oldplan', 'TOTPOP', 'NH_WHITE', 'NH_BLACK', 'NH_AMIN', 'NH_ASIAN', 'NH_NHPI', 'NH_OTHER', 'NH_2MORE', 'HISP', 'H_WHITE', 'H_BLACK', 'H_AMIN', 'H_ASIAN', 'H_NHPI', 'H_OTHER', 'H_2MORE', 'VAP', 'HVAP', 'WVAP', 'BVAP', 'AMINVAP',

In [None]:
#TO DO: Have the columns match order

#vtds_joined.shape
#print(vtds_joined.columns.tolist())

**Validation**

In [29]:
validate=pd.merge(vtds_joined,final_mggg, on=['VTD_Key'],how='inner')

In [30]:
#rows before and after are the same
print(validate.shape)
print(final_mggg.shape)

(2692, 151)
(2692, 85)


In [32]:
#Column List should just be a list of columns to be compared with quantitative values

def validater(df,column_List):
    for i in column_List:
        left_Data = i + "_x"
        right_Data = i + "_y"
        if(sum(df[left_Data]-df[right_Data]) != 0):
            print("For " + i + " total difference is: " + str(sum(df[left_Data]-df[right_Data])))
        
validater(validate,['ALAND10', 'AWATER10', 'PL10AA_TOT', 'PL10VA_TOT', 'EL08G_GV_D', 'EL08G_GV_R', 'EL08G_GV_L', 'EL08G_GV_T', 'EL08G_USS_', 'EL08G_US_1', 'EL08G_US_2', 'EL08G_US_3', 'EL08G_US_4', 'EL10G_USS_', 'EL10G_US_1', 'EL10G_US_2', 'EL10G_US_3', 'EL10G_US_4', 'EL12G_GV_D', 'EL12G_GV_R', 'EL12G_GV_L', 'EL12G_GV_W', 'EL12G_GV_1', 'EL12G_GV_T', 'EL14G_USS_', 'EL14G_US_1', 'EL14G_US_2', 'EL14G_US_3', 'EL14G_US_4', 'Shape_Leng', 'Shape_Area','BPOP', 'nBPOP', 'judge', 'newplan', 'oldplan', 'TOTPOP', 'NH_WHITE', 'NH_BLACK', 'NH_AMIN', 'NH_ASIAN', 'NH_NHPI', 'NH_OTHER', 'NH_2MORE', 'HISP', 'H_WHITE', 'H_BLACK', 'H_AMIN', 'H_ASIAN', 'H_NHPI', 'H_OTHER', 'H_2MORE', 'VAP', 'HVAP', 'WVAP', 'BVAP', 'AMINVAP', 'ASIANVAP', 'NHPIVAP', 'OTHERVAP', '2MOREVAP'])       


For judge total difference is: 4001.0
For newplan total difference is: 2785.0
For oldplan total difference is: -788991.0


**Check population**

In [None]:
validate['pop_diff']=validate.TOTPOP_x-validate.TOTPOP_y
validate[validate.pop_diff.abs()<10].shape[0]/validate.shape[0]

**Check Voting age population**

In [None]:
validate['bvap_diff']=validate.BVAP_x-validate.BVAP_y
validate[validate.bvap_diff.abs()<10].shape[0]/validate.shape[0]

**Check 08 election data**

In [None]:
validate['election_08_diff']=validate.EL08G_GV_D_x-validate.EL08G_GV_D_y
validate[validate.election_08_diff.abs()<10].shape[0]/validate.shape[0]

**Validate Enacted Plans**

In [33]:
validate['newplan']=validate.newplan_x-validate.newplan_y
validate[validate.newplan.abs()<10].shape[0]/validate.shape[0]
print(validate.newplan_y.unique())
print(validate.newplan_x.unique())
#validate['newplan_y_reindex']=validate['newplan_y']+1
update_newplan= {11:6, 2:5,9:9,0:3,7:1,3:7, 4:10, 12:11,5:8,6:13,1:4,10:2,8:12}
validate['newplan_y_reindex']=validate['newplan_y'].map(update_newplan)
validate['newplan_diff_corrected']=validate.newplan_x-validate.newplan_y_reindex
print(sum(validate['newplan_diff_corrected']))

#print(validate[validate['newplan_diff_corrected'] != 0])

[11.  2.  9.  0.  7.  3.  4. 12.  5.  6.  1. 10.  8.]
[ 6  5  9  3  1  7 10 11  8 13  4  2 12]
0


In [37]:
validate['judge']=validate.judge_x-validate.judge_y
validate[validate.judge.abs()<10].shape[0]/validate.shape[0]
print(validate.judge_y.unique())
print(validate.judge_x.unique())
update_judge= {5:10, 7:11,3:8,4:13,6:3,1:4, 10:7, 9:6,11:12,2:5,12:9,8:1,0:2}
validate['judge_y_reindex']=validate['judge_y'].map(update_judge)
validate['judge_diff_corrected']=validate.judge_x-validate.judge_y_reindex
print(sum(validate['judge_diff_corrected']))

[ 5.  7.  3.  4.  6.  1. 10.  9. 11.  2. 12.  8.  0.]
[10 11  8 13  3  4  7  6 12  5  9  1  2]
901


In [None]:
validate['oldplan']=validate.oldplan_x-validate.oldplan_y
validate[validate.oldplan.abs()<10].shape[0]/validate.shape[0]
validate[['oldplan_x','oldplan_y']].head(10)
validate.oldplan_y.unique()
validate.oldplan_x.unique()
update_oldplan= {296:6, 297:10,300:8,301:9, 305:12,303:3, 298:7, 294:1,295:11,299:2,306:4,302:5,304:13}
validate['oldplan_y_reindex']=validate['oldplan_y'].map(update_oldplan)
print(validate.oldplan_y_reindex.unique())
validate['oldplan_diff_corrected']=validate.oldplan_x-validate.oldplan_y_reindex
print(sum(validate['oldplan_diff_corrected']))

MGGG's dataset was not reseted, there are 13 congressional districts here

**Check 16 election data**

In [None]:
validate[['VTD','EL16G_PR_R_x','EL16G_PR_R_y']].head(10)

In [None]:
validate[['VTD','EL16G_US_2_x','EL16G_US_2_y']].head(10)

Getting sum of election results by party from MGGG file

In [None]:
#total votes for presidential republican party based on MGGG file
total_r=final_mggg['EL16G_PR_R'].sum()
print(total_r)

In [None]:
total_d=final_mggg['EL16G_PR_D'].sum()
print(total_d)

In [None]:
total_l=final_mggg['EL16G_PR_L'].sum()
print(total_l)

Getting sum of election results by party for election data as obtained from state board of elections

In [None]:

total_r_p=election_data['EL16G_PR_R'].sum()
print(total_r_p)

In [None]:

total_d_p=election_data['EL16G_PR_D'].sum()
print(total_d_p)

In [None]:
total_l_p=election_data['EL16G_PR_L'].sum()
print(total_l_p)

Total Election Votes from 2016 from MGGG file does not match up with the data available from the state board of elections (validated on the website and from the State Board of Election datafil)

**Check 12 election data**

In [None]:
#validate['election_12_diff']=validate.EL12G_PR_D_x-validate.EL12G_PR_D_y
#validate[validate.election_12_diff.abs()<10].shape[0]/validate.shape[0]

In [None]:
validate[['VTD_Key','EL12G_PR_D_x','EL12G_PR_D_y']].head(10)

#Election Data Join Looks Good