In [None]:
import us
import os
import json
import maup
import pickle
import pandas as pd
import geopandas as gpd
import copy
import pickle
import warnings
from ast import literal_eval
from collections import defaultdict
import submission_analysis.fetch as fetch
from submission_analysis.crosswalk import Crosswalk

In [None]:
# Silence the UserWarning whose message starts with 'GeoSeries.isna' so it
# does not flood the cell outputs below.
warnings.filterwarnings(
    action='ignore',
    message='GeoSeries.isna',
    category=UserWarning,
)

In [None]:
# --- Notebook configuration -------------------------------------------------
# Submission API endpoints for this state; passed to fetch.submissions below.
ids_url = "https://k61e3cz2ni.execute-api.us-east-2.amazonaws.com/prod/submissions/districtr-ids/missouri"
csv_url = "https://k61e3cz2ni.execute-api.us-east-2.amazonaws.com/prod/submissions/csv/missouri"
# Directory the final CSV is written to (joined with output_path at the end).
data_dir = '../../MO/data'
# File name (not a full path) of the exported table.
output_path = 'mo_submissions_20210902.csv'
# JSONL dump of labeled submissions, one JSON object per line (see load_labels).
labels_path = '../../MO/data/MO_dump_20210815.jsonl'
# Input handed to Crosswalk() for mapping 2010 geography onto 2020 blocks.
block_2010_to_block_2020_crosswalk_path = '../../MO/data/tab2010_tab2020_st29_mo.txt'
# 2010 block-group shapefile; must expose a GEOID10 column.
bg_2010_shapefile_path = '../../MO/data/tl_2010_29_bg10'
# Rows whose index id starts with any of these prefixes are dropped before export.
exclude_prefixes = []
# Rows whose 'unit' is listed here are dropped before export.
exclude_units = []
# CRS the block-group shapefile is reprojected to (EPSG:32615 is UTM zone 15N).
crs = 'EPSG:32615'
# Passed to new_pivot as the `state` argument.
state_name = 'Missouri'
# {from_unit: {to_unit: path_to_json}} crosswalk files for non-blockgroup units.
# NOTE(review): left empty here, so unit_mappings is empty and any submission
# drawn on a unit other than 'blockgroups' will raise KeyError in
# blocks_2020 / block_groups_2010 -- confirm no such submissions exist.
crosswalk_paths = {}

In [None]:
# One CSV query URL per submission type, each capped at 10000 rows; these are
# the arguments handed to fetch.submissions further down.
plan = f"{csv_url}?type=plan&length=10000"
cois = f"{csv_url}?type=coi&length=10000"
written = f"{csv_url}?type=written&length=10000"

In [None]:
def new_pivot(df, state):
    """Pivot raw Districtr submissions into one row per drawn area.

    Each input row is one submission whose ``districtr_data['plan']`` holds
    an ``assignment`` mapping tile id -> part id (or list of part ids). The
    result has one row for every distinct part id used by a submission.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide the columns ``districtr_data``, ``plan_id``, ``type``,
        ``id``, ``title`` and ``text``. The frame is only read; no column is
        added to it, so it is safe to pass a filtered slice.
    state : str
        State name. Only used for the Wisconsin special case below.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``'<first letter of type><id>-<part id + 1>'`` with the
        columns ``districtr_id``, ``submission_title``, ``submission_text``,
        ``area_name``, ``area_text``, ``unit`` and ``tiles`` (list of tile
        ids). Missing values are replaced by 0.

    Raises
    ------
    KeyError
        If a submission lacks ``plan``, ``units``, ``idColumn`` or ``parts``.
    TypeError
        If a part id is not numeric (``part_id + 1`` is used in the keys).
    """
    cols = ['districtr_id', 'submission_title','submission_text', 'area_name', 'area_text', 'unit', 'tiles']

    # Rows are collected in plain lists and turned into a frame once at the
    # end; appending a frame per submission is quadratic and relies on
    # DataFrame.append, which no longer exists in pandas >= 2.0.
    primary_keys = []
    records = []

    for _idx, row in df.iterrows():
        plan_data = row['districtr_data']['plan']
        unit = plan_data['units']['id']
        plan_id = row['plan_id']
        sub_type = row['type']
        sub_id = row['id']

        # Wisconsin ward plans keyed on GEOID10 are skipped.
        row_key = plan_data['idColumn']['key']
        if state == "Wisconsin" and row_key == "GEOID10" and unit == "wards":
            continue

        # A missing, null or empty assignment means nothing was drawn.
        asn = plan_data.get('assignment')
        if not asn:
            continue

        sub_title = row['title']
        sub_text = row['text']
        parts = plan_data['parts']
        titles = {p['id']: p.get('name', '') for p in parts}
        texts = {p['id']: p.get('description', '') for p in parts}

        # Group tiles by the area (part id) they belong to; a tile may be
        # assigned to a single part or to a list of parts.
        tiles_by_area = {}
        for tile, assigned in asn.items():
            if not isinstance(assigned, list):
                assigned = [assigned]
            for area in assigned:
                tiles_by_area.setdefault(area, []).append(tile)

        for area, tiles in tiles_by_area.items():
            # Part ids are 0-based in the data, 1-based in the published keys.
            primary_keys.append(f'{sub_type[0]}{sub_id}-{area+1}')
            records.append({
                'districtr_id': f'{plan_id}-{area+1}',
                'submission_title': sub_title,
                'submission_text': sub_text,
                'area_name': titles.get(area, ""),
                'area_text': texts.get(area, ""),
                'unit': unit,
                'tiles': tiles,
            })

    if records:
        # dtype=object keeps list-valued cells and mixed text/None columns as-is.
        pivot = pd.DataFrame(records, index=primary_keys, columns=cols, dtype=object)
    else:
        pivot = pd.DataFrame(columns=cols)

    return pivot.fillna(0)

In [None]:
# Load the 2010 block-group shapes, key them by GEOID10, and reproject them
# into the configured CRS.
bg_2010_gdf = gpd.read_file(bg_2010_shapefile_path)
bg_2010_gdf = bg_2010_gdf.set_index('GEOID10').to_crs(crs)
# Force string ids on the index.
bg_2010_gdf.index = bg_2010_gdf.index.astype(str)

In [None]:
def _load_unit_mapping(path):
  """Read one JSON crosswalk file into ``{source_id: set(target_ids)}``.

  The file is opened in a ``with`` block so the handle is closed right away
  instead of being left for the garbage collector.
  """
  with open(path) as mapping_file:
    return {source: set(targets) for source, targets in json.load(mapping_file).items()}


# Crosswalk object built from the 2010 -> 2020 block file.
block_crosswalk = Crosswalk(block_2010_to_block_2020_crosswalk_path)

# unit_mappings[from_unit][to_unit][source_id] -> set of target ids, with one
# JSON file per (from_unit, to_unit) pair listed in crosswalk_paths.
unit_mappings = {
  from_unit: {
    to_unit: _load_unit_mapping(path)
    for to_unit, path in paths.items()
  }
  for from_unit, paths in crosswalk_paths.items()
}

In [None]:
# Fetch plan and COI submissions; the third return value is discarded.
plan_df, coi_df, _ = fetch.submissions(ids_url, plan, cois, written)

# Remove problematic imported submissions (those with no 'plan' entry).
# .copy() makes coi_df an independent frame rather than a slice of the fetched
# one, so later column assignments cannot raise SettingWithCopyWarning.
has_plan = coi_df['districtr_data'].apply(lambda d: 'plan' in d)
coi_df = coi_df[has_plan].copy()

In [None]:
def pseudo_coi(data):
  """Return True when a plan's assignment uses exactly one distinct part.

  ``data`` is a submission's districtr payload. Its ``plan.assignment``
  mapping (tile -> part, or tile -> list of parts) is flattened into the set
  of part ids that occur; a missing plan or assignment counts as no parts.
  """
  assignment = data.get('plan', {}).get('assignment', {})
  distinct_parts = {
    part
    for value in assignment.values()
    for part in (value if isinstance(value, list) else [value])
  }
  return len(distinct_parts) == 1

In [None]:
# Flag plans whose assignment uses exactly one distinct part (see pseudo_coi).
plan_df['pseudo_coi'] = plan_df['districtr_data'].apply(pseudo_coi)

In [None]:
# Keep only the plans flagged as pseudo-COIs. .copy() makes the result an
# independent frame, so the column that new_pivot later assigns on it does not
# raise SettingWithCopyWarning.
plan_df = plan_df[plan_df['pseudo_coi']].copy()

In [None]:
def load_labels(labels_path):
  """Collect the annotation labels of every labeled COI-like submission.

  Parameters
  ----------
  labels_path : str
      Path to a JSONL file with one submission object per line. Blank lines
      are ignored.

  Returns
  -------
  collections.defaultdict
      Maps ``'<first letter of type><id>'`` to the set of all labels found
      in that submission's ``annotations``. A submission with no annotations
      maps to an empty set, and unknown keys default to an empty set.

  Only submissions with ``type == 'coi'`` or a true ``pseudo_coi`` flag are
  kept.
  """
  # The with-block closes the file promptly; blank lines (for example a
  # trailing newline) would otherwise make json.loads raise.
  with open(labels_path) as labels_file:
    labeled_submissions = [
      json.loads(line)
      for line in labels_file
      if line.strip()
    ]

  coi_labels_by_id = defaultdict(set)
  for submission in labeled_submissions:
    # `== True` is kept deliberately: only a literal true (or 1) flag counts.
    is_pseudo_coi = submission.get('pseudo_coi', False) == True  # noqa: E712
    if not (is_pseudo_coi or submission['type'] == 'coi'):
      continue

    # set().union(*iterables) also works for zero annotations, whereas
    # set.union(*()) raises TypeError.
    labels = set().union(
      *(annotation['labels'] for annotation in submission['annotations'].values())
    )
    # The f-string accepts numeric ids as well as string ids.
    coi_labels_by_id[f"{submission['type'][0]}{submission['id']}"] = labels

  return coi_labels_by_id

In [None]:
def blocks_2020(row):
    """Map a submission's tiles (wards/VTDs or 2010 block groups) to 2020 blocks.

    Parameters
    ----------
    row : mapping
        Needs ``row['unit']`` (unit type) and ``row['tiles']`` (tile ids).

    Returns
    -------
    list
        2020 block ids covering the tiles. Block-group tiles go through
        ``block_crosswalk``; every other unit is looked up in
        ``unit_mappings[unit]['block_2020']``.

    Raises
    ------
    KeyError
        If no 2020-block crosswalk is configured for the unit, or a tile id
        is missing from that crosswalk.
    """
    unit = row['unit']
    if unit == 'blockgroups':
        # Tile ids are passed to the crosswalk as strings.
        return list(block_crosswalk.map_2010_block_groups(str(t) for t in row['tiles']))

    try:
        to_blocks = unit_mappings[unit]['block_2020']
    except KeyError as err:
        raise KeyError(f"no 'block_2020' crosswalk configured for unit {unit!r}") from err

    # set().union(*iterables) returns an empty set for zero tiles, whereas
    # set.union(*()) raises TypeError.
    return list(set().union(*(to_blocks[vtd] for vtd in row['tiles'])))

In [None]:
def block_groups_2010(row):
  """Standardize a submission's tiles to 2010 block groups.

  Block-group submissions are passed through unchanged (as a new list);
  VTD/ward submissions are (approximately) mapped to 2010 block groups via
  ``unit_mappings[unit]['bg_2010']``.

  Parameters
  ----------
  row : mapping
      Needs ``row['unit']`` (unit type) and ``row['tiles']`` (tile ids).

  Returns
  -------
  list
      2010 block-group ids.

  Raises
  ------
  KeyError
      If no block-group crosswalk is configured for the unit, or a tile id
      is missing from that crosswalk.
  """
  unit = row['unit']
  if unit == 'blockgroups':
    # Return a copy so the new column does not share its list objects with
    # the 'tiles' column.
    return list(row['tiles'])

  try:
    to_block_groups = unit_mappings[unit]['bg_2010']
  except KeyError as err:
    raise KeyError(f"no 'bg_2010' crosswalk configured for unit {unit!r}") from err

  # set().union(*iterables) returns an empty set for zero tiles, whereas
  # set.union(*()) raises TypeError.
  return list(set().union(*(to_block_groups[vtd] for vtd in row['tiles'])))

In [None]:
# One row per drawn area for the COI submissions.
pivoted_coi = new_pivot(coi_df, state_name)

In [None]:
# Pivot the pseudo-COI plans too, if any were found.
# NOTE: pivoted_plan is only defined when plan_df is non-empty; the next cell
# guards on the same condition before using it.
if not plan_df.empty:
  pivoted_plan = new_pivot(plan_df, state_name)

In [None]:
# Combine COI areas with pseudo-COI plan areas. pd.concat replaces
# DataFrame.append, which was deprecated in pandas 1.4 and removed in 2.0.
if plan_df.empty:
  pivoted = pivoted_coi
else:
  pivoted = pd.concat([pivoted_coi, pivoted_plan])

In [None]:
# Sanity check: number of COI areas per unit type.
pivoted_coi.groupby('unit').size()

In [None]:
# Drop areas drawn on excluded unit types.
pivoted = pivoted[~pivoted['unit'].isin(exclude_units)]

# Drop excluded submissions by id prefix. str.startswith accepts a tuple and
# returns False for an empty one, which matches any(...) over the prefix list.
excluded_prefixes = tuple(exclude_prefixes)
is_excluded = pivoted.index.to_series().apply(
  lambda _id: _id.startswith(excluded_prefixes)
)
# .copy() gives an independent frame, so the column assignments below do not
# raise SettingWithCopyWarning on a filtered slice.
pivoted = pivoted[~is_excluded].copy()

coi_labels_by_id = load_labels(labels_path)

pivoted['blocks_2020'] = pivoted.apply(blocks_2020, axis=1)
pivoted['block_groups_2010'] = pivoted.apply(block_groups_2010, axis=1)
# .get avoids inserting every looked-up id into the defaultdict; sorting makes
# the exported label order reproducible (set iteration order varies by run).
pivoted['labels'] = pivoted.index.to_series().apply(
  lambda x: sorted(coi_labels_by_id.get(x, ()), key=str)
)
pivoted.to_csv(os.path.join(data_dir, output_path))