gdutils.dataqa
==============

``dataqa`` is a module in package ``gdutils`` that provides functions used in data quality assurance.

*Status:* In development

---

__Examples Setup__

In [None]:
# Install ``gdutils`` package
!pip install git+https://github.com/KeiferC/gdutils.git > /dev/null

In [None]:
import gdutils.dataqa as dq # imports the ``dataqa`` module

import geopandas as gpd
import pandas as pd
import json

import gdutils.datamine as dm
import gdutils.extract as et

In [None]:
# Gather example datasets
# Each call passes an account name, an account type ('orgs' or 'users') and a
# list of repository names to clone.
dm.clone_gh_repos('mggg-states', 'orgs', ['AK-shapefiles'])
# NOTE: official-precinct-returns is a very large dataset -- will take some time to clone
dm.clone_gh_repos('MEDSL', 'users', ['official-precinct-returns'])

In [None]:
# List available datasets to use

In [None]:
# Show the '.zip' files available under the AK-shapefiles directory
dm.list_files_of_type('.zip', 'AK-shapefiles/')

In [None]:
# Show the '.zip' files available under the official-precinct-returns directory
# (output not displayed since it takes up too much screen space)
dm.list_files_of_type('.zip', 'official-precinct-returns/')

In [None]:
# Extract applicable data subsets

In [None]:
# Read the Alaska precincts archive and extract it into ``mggg_gdf``.
# NOTE(review): column='NAME' presumably makes the precinct names the row
# labels used in Example 3 (e.g. '01-446 AURORA') -- confirm against gdutils.extract.
mggg_gdf = et.read_file('AK-shapefiles/AK_precincts.zip', column='NAME').extract()

In [None]:
# Read the 2016 presidential precinct returns, then configure the extractor
# with column 'state' and value 'Alaska' before extracting the subset.
medsl_zip_path = 'official-precinct-returns/2016-precinct-president/2016-precinct-president.zip'

medsl_et = et.read_file(medsl_zip_path)
medsl_et.column = 'state'
medsl_et.value = 'Alaska'

medsl_gdf = medsl_et.extract()

In [None]:
# Data-wrangle MEDSL data
# Reshape so that each precinct is a row and each (office, party) pair is a
# column holding its votes.
# ``aggfunc='sum'`` is explicit because ``pivot_table`` defaults to the mean,
# which would average (not total) the votes whenever several rows share the
# same precinct, office and party.
# NOTE: this cell overwrites ``medsl_gdf``; re-run the previous cell before
# re-running this one.
medsl_pvt = medsl_gdf.pivot_table(index='precinct',
                                  columns=['office', 'party'],
                                  values='votes',
                                  aggfunc='sum')
# Flatten the two-level (office, party) column labels into single strings
# such as 'US President democratic'.
medsl_pvt.columns = [' '.join(col).strip() for col in medsl_pvt.columns.values]
medsl_gdf = et.ExtractTable(medsl_pvt).extract()

In [None]:
# Load and generate naming convention standards
# NOTE: 'naming_convention.json' must already exist in the working directory;
# no cell shown in this notebook creates it.
with open('naming_convention.json') as json_file:
    standards_raw = json.load(json_file)

offices = dm.get_keys_by_category(standards_raw, 'offices')
parties = dm.get_keys_by_category(standards_raw, 'parties')
count_keys = dm.get_keys_by_category(standards_raw, 'counts')
others = dm.get_keys_by_category(standards_raw,
            ['geographies', 'demographics', 'districts', 'other'])

# Year suffixes 0 through 20, rendered as two digits ('00' ... '20').
# One shared range keeps election and count column names covering the same
# years (the counts previously stopped at 19 while elections ran to 20).
years = range(0, 21)

# Election columns are <office><yy><party>, matching names like 'PRES16D'
# used in Example 2.
elections = [office + format(year, '02') + party
                 for office in offices
                 for year in years
                 for party in parties
                 # 'PRES' entries are generated only for years divisible by 4
                 if not (office == 'PRES' and year % 4 != 0)]

# Count columns are <count><yy>.
counts = [count + format(year, '02') for count in count_keys
                                     for year in years]

standards = elections + counts + others

---

Example 1. Compare column names against a standard
-----------------------------------------------------------------

In [None]:
# Ex. 1.
# The call returns a pair: names in mggg_gdf that fit the standards, and names that do not.
in_standards, not_in_standards = dq.compare_column_names(mggg_gdf, standards)

In [None]:
in_standards # renders the set of mggg_gdf column names that fit the standards (first value returned by compare_column_names)

In [None]:
not_in_standards # renders the set of mggg_gdf column names that don't fit the standards (second value returned by compare_column_names)

Example 2. Aggregate column values
--------------------------------------------

__Example 2.1.__ Sum the values of one column

In [None]:
# Ex. 2.1.

# A single column name is still passed inside a list; the bare name on the
# last line displays the result.
aggregates = dq.sum_column_values(mggg_gdf, ['PRES16D'])
aggregates

__Example 2.2.__ Sum the values of multiple columns

In [None]:
# Ex. 2.2.
# Names of the mggg_gdf columns whose values are summed
columns_to_aggregate = ['PRES16D', 'PRES16G', 'PRES16R', 'PRES16L']
# ``aggregates`` iterates as (column name, sum) pairs
aggregates = dq.sum_column_values(mggg_gdf, columns_to_aggregate)
aggregates

In [None]:
# Render each (column name, sum) pair as one row of a two-column text table
row_template = '{:11} : {}'

print('Column name : Sum of column values')
print('----------------------------------')

for column_name, column_sum in aggregates:
    print(row_template.format(column_name, column_sum))

Example 3. Compare every value of a column against every value of another column
--------------------------------------------------------------------------------------------------

In [None]:
# Build the two small in-memory tables compared throughout Example 3.
# df1 has upper-case column names; df2 has lower-case names and lists
# 'col2' before 'col1'.
df1 = pd.DataFrame({
    'COL1': [1, 4],
    'COL2': [2, 5],
    'COL3': [3, 6],
})
df2 = pd.DataFrame({
    'col2': [4, 1],
    'col1': [5, 2],
})

__Example 3.1.__ Compare one column from one table against one column from another table

In [None]:
# Ex. 3.1.

# Compare df1's 'COL3' against df2's 'col2'; each column name is passed inside a list
results = dq.compare_column_values(df1, df2, ['COL3'], ['col2'])
results

__Example 3.2.__ Compare multiple columns from two tables against each other

In [None]:
# Ex. 3.2.

# Two columns are given for each table, in lists of equal length.
# NOTE(review): presumably paired by position (COL1 with col1, COL2 with col2) -- confirm against the dataqa docs.
results = dq.compare_column_values(df1, df2, 
                                   ['COL1', 'COL2'],
                                   ['col1', 'col2'])
results

In [None]:
# Show every column pairing as a header line followed by one line per
# (row pairing, absolute difference) entry
header_template = '{} ========'
row_template = '{:3} : {}'

for column_pair in results:
    print(header_template.format(column_pair))
    for row_pair, difference in results[column_pair]:
        print(row_template.format(row_pair, difference))

__Example 3.3.__ Compare a row in a column against a specific row in another column

In [None]:
# Ex. 3.3.
# The fifth and sixth arguments name the row to use from each table:
# '01-446 AURORA' in mggg_gdf and '01-446 Aurora' in medsl_gdf.
results = dq.compare_column_values(mggg_gdf, medsl_gdf, 
                                   ['PRES16D'], ['US President democratic'],
                                   ['01-446 AURORA'], ['01-446 Aurora'])
results

__Example 3.4.__ Compare specific rows in a column against specific rows in another column

In [None]:
# Ex. 3.4.
# Row labels to compare, listed in the same order for each table
mggg_rows = ['01-446 AURORA', '01-455 FAIRBANKS NO. 1', '01-465 FAIRBANKS NO. 2']
medsl_rows = ['01-446 Aurora', '01-455 Fairbanks No. 1', '01-465 Fairbanks No. 2']

results = dq.compare_column_values(
    mggg_gdf, medsl_gdf,
    ['PRES16D'], ['US President democratic'],
    mggg_rows, medsl_rows,
)
results

In [None]:
# Show every column pairing as a header line followed by one line per
# (row pairing, absolute difference) entry; the wide first field leaves room
# for the long precinct labels
header_template = '{} ========'
row_template = '{:45} : {}'

for column_pair in results:
    print(header_template.format(column_pair))
    for row_pair, difference in results[column_pair]:
        print(row_template.format(row_pair, difference))

---

__Examples Cleanup__

The following commands are used to reset and clean up the examples above.

In [None]:
# Remove cloned repos
# '.' is the current working directory, which is where the example repos were
# referenced from above (e.g. 'AK-shapefiles/...').
dm.remove_repos('.')

In [None]:
# Remove outputs
!rm -r outputs

In [None]:
# Uninstall Package
!echo y | pip uninstall gdutils

In [None]:
# Reset Jupyter Notebook IPython Kernel
# Displays an HTML snippet whose script asks the notebook front end to restart the kernel.
# NOTE(review): presumably requires a front end that exposes the ``Jupyter``
# JavaScript object -- confirm before relying on it in other front ends.
from IPython.display import HTML

restart_script = "<script>Jupyter.notebook.kernel.restart()</script>"
HTML(restart_script)