# ``mggg-states`` Data QA

In [None]:
!pip install numpy
!pip install pandas
!pip install geopandas

In [1]:
import numpy as np
import pandas as pd
import geopandas as gpd
import json

import dataqa

## Data Collection

In [None]:
# This is going to take a while to complete
# Fetches the repositories of the 'mggg-states' account (queried as an
# organisation via account_type='orgs') into the local shps/ directory.
# NOTE(review): exact behavior lives in dataqa.clone_repos — confirm there.
dataqa.clone_repos(account='mggg-states', 
                   account_type='orgs', 
                   outpath='shps/')

In [None]:
# Collect the '.zip' files found under shps/ and display the resulting list.
mggg_shapefiles = dataqa.list_files_of_type('.zip', 'shps/')
mggg_shapefiles

## Data Standardization Check

### Generate Standards

In [None]:
# Load the naming convention. JSON is UTF-8 by specification, so the encoding
# is pinned instead of relying on the platform default.
with open('naming_convention.json', encoding='utf-8') as json_file:
    standards_raw = json.load(json_file)

standards_raw

In [None]:
# Keys of the naming convention for the 'geographies' category
# (presumably standard column names — confirm in dataqa.get_keys_by_category).
geographies = dataqa.get_keys_by_category(standards_raw, 'geographies')
geographies

In [None]:
# Keys of the naming convention for the 'offices' category; used below as the
# office prefix when generating election column names.
offices = dataqa.get_keys_by_category(standards_raw, 'offices')
offices

In [None]:
# Keys of the naming convention for the 'parties' category; used below as the
# party suffix when generating election column names.
parties = dataqa.get_keys_by_category(standards_raw, 'parties')
parties

In [None]:
# Keys of the naming convention for the 'demographics' category.
demographics = dataqa.get_keys_by_category(standards_raw, 'demographics')
demographics

In [None]:
# Keys of the naming convention for the 'districts' category.
districts = dataqa.get_keys_by_category(standards_raw, 'districts')
districts

In [None]:
# Keys of the naming convention for the 'counts' category; a later cell
# appends the two-digit years to each of them.
counts = dataqa.get_keys_by_category(standards_raw, 'counts')
counts

In [None]:
# Keys of the naming convention for the 'other' category.
other = dataqa.get_keys_by_category(standards_raw, 'other')
other

In [23]:
# Two-digit election years covered by the standards.
years = [18, 16, 14, 12]

# Expected election columns have the form <office><yy><party>.
# 'PRES' columns are generated only for years divisible by four.
elections = [
    office + str(year) + party
    for office in offices
    for year in years
    for party in parties
    if office != 'PRES' or year % 4 == 0
]

# NOTE: this rebinds `counts` to the year-suffixed names, so re-running this
# cell without first re-running the cell that defines `counts` appends the
# years a second time.
counts = [count + str(year) for count in counts for year in years]

In [None]:
# Full list of expected column names: the concatenation of every category,
# with the generated election and year-suffixed count names.
standards = geographies + elections + demographics + districts + counts + other
standards

### Compare Data with Standards

In [12]:
def gdf_from_zip(path: str) -> gpd.GeoDataFrame:
    """Read the zipped file at *path* into a GeoDataFrame.

    The path is prefixed with ``zip://`` before being handed to
    ``geopandas.read_file``.
    """
    zip_uri = 'zip://' + path
    return gpd.read_file(zip_uri)

In [15]:
# Display the list of zipped shapefiles collected earlier.
mggg_shapefiles

['shps/OR-shapefiles.git/OR_precincts.zip',
 'shps/PA-shapefiles.git/PA_VTDs.zip',
 'shps/MN-shapefiles.git/MN12.zip',
 'shps/MN-shapefiles.git/MN16.zip',
 'shps/MN-shapefiles.git/MN14.zip',
 'shps/MN-shapefiles.git/MN12_18.zip',
 'shps/MN-shapefiles.git/Archived/MN_16.zip',
 'shps/MN-shapefiles.git/Archived/MN_14.zip',
 'shps/MN-shapefiles.git/Archived/MN_10.zip',
 'shps/MN-shapefiles.git/Archived/MN_12.zip',
 'shps/MN-shapefiles.git/Archived/MN_08.zip',
 'shps/MN-shapefiles.git/Archived/mn_08_16.zip',
 'shps/DE-shapefiles.git/DE_precincts.zip',
 'shps/WI-shapefiles.git/WI_wards_12_16.zip',
 'shps/UT-shapefiles.git/UT_precincts.zip',
 'shps/MI-shapefiles.git/MI_precincts.zip',
 'shps/MA-shapefiles.git/MA_no_islands_02_10.zip',
 'shps/MA-shapefiles.git/MA_precincts_12_16.zip',
 'shps/MA-shapefiles.git/MA_no_islands_12_16.zip',
 'shps/MA-shapefiles.git/MA_precincts_02_10.zip',
 'shps/VA-shapefiles.git/VA_precincts.zip',
 'shps/MD-shapefiles.git/MD_precincts_abs.zip',
 'shps/MD-shapefile

#### Alaska

In [None]:
# Load the Alaska precinct shapefile from the cloned repository.
ak_gdf = gdf_from_zip('shps/AK-shapefiles.git/AK_precincts.zip')
ak_gdf.columns # for file validation

In [None]:
# Compare Alaska's column names against the standards list; the helper returns
# a pair that is unpacked as (matches, discrepancies).
(ak_matches, ak_discrepancies) = dataqa.compare_column_names(ak_gdf, standards)

In [None]:
# Display the first element returned by the comparison for Alaska.
ak_matches # manually check that the matches are good for summing

In [None]:
# Display the second element returned by the comparison for Alaska.
ak_discrepancies # manually check if the discrepancies are acceptable

#### Arizona

In [None]:
# Load the Arizona precinct shapefile from the cloned repository.
az_gdf = gdf_from_zip('shps/AZ-shapefiles.git/az_precincts.zip')
az_gdf.columns  # for file validation

In [None]:
# Compare Arizona's column names against the standards list; the helper
# returns a pair that is unpacked as (matches, discrepancies).
(az_matches, az_discrepancies) = dataqa.compare_column_names(az_gdf, standards)
az_matches  # manually check that the matches are good for summing

In [None]:
az_discrepancies  # manually check if the discrepancies are acceptable

#### 

## Election Data Summation

In [None]:
# NOTE(review): not referenced in the visible cells; presumably a path prefix
# for the shapefiles — confirm.
mggg_path_to_file = ''

# Totals keyed by state name, filled in by the cells below.
all_totals = {}

offices_to_sum = ['PRES', 'SEN']
years_to_sum = ['18', '16', '14', '12']
parties_to_sum = ['D', 'R']

# Every <office><yy><party> combination of the three lists above.
# NOTE(review): unlike the standards list, this also includes 'PRES' for the
# years '18' and '14' — confirm that is intended.
generated_columns = {
    ''.join((off, yr, pty))
    for off in offices_to_sum
    for yr in years_to_sum
    for pty in parties_to_sum
}
generated_columns

### Sums

Note: The sums are computed manually, one file at a time, because of inconsistencies across files.

In [None]:
# Alaska
mggg_gdf = get_mggg_gdf( 'AK-shapefiles/AK_precincts.zip')
mggg_gdf.columns # for file validation and column checking

In [None]:
# Sum over every row of the file: .head() would restrict the total to the
# first five rows only.
# NOTE(review): sum_values is not defined or imported in the visible cells —
# confirm where it comes from.
all_totals['Alaska'] = sum_values(generated_columns, mggg_gdf)
all_totals

In [None]:
# Arizona
mggg_gdf = get_mggg_gdf( 'AZ-shapefiles/az_precincts.zip')
mggg_gdf.columns

In [None]:
# Sum over every row of the file: .head() would restrict the total to the
# first five rows only.
# NOTE(review): sum_values is not defined or imported in the visible cells —
# confirm where it comes from.
all_totals['Arizona'] = sum_values(generated_columns, mggg_gdf)
all_totals

## Clean Directory

In [None]:
#!echo y | rm -r ./*-shapefiles/
# NOTE(review): the repositories were cloned with outpath='shps/', but the
# cleanup is pointed at '.' — confirm remove_repos locates them from there.
dataqa.remove_repos('.')