In [None]:
import io
import os
import sys
import json
import pickle
from submission_analysis import ccdb
sys.modules['ccdb'] = ccdb  # pickle compatibility

In [None]:
# All inputs live under one shared data directory; each path is derived from it.
data_dir = '../../MI/data'
db_path = data_dir + '/mi_cluster_db_20210809.pkl'
labeled_submissions_path = data_dir + '/MI_dump_20210815.jsonl'
submissions_index_path = data_dir + '/mi_submissions_index_20210816.json'

# Bare file name of the converted database; it is joined with `data_dir`
# when the result is written out.
output_filename = 'mi_cluster_db_20210809_new_format.pkl'

### Old database

In [None]:
# Load the old-format cluster database.
# NOTE: unpickling executes code embedded in the file, so only load database
# files from a trusted source.
# The `with` block closes the file handle as soon as loading finishes.
with open(db_path, 'rb') as db_file:
  db = pickle.load(db_file)

### Submission labels

In [None]:
# Parse the JSON-lines dump: one JSON document per non-blank line.
# The file is read as UTF-8 explicitly (rather than the platform default) and
# is closed when the block exits. Blank lines, such as a trailing newline at
# the end of the dump, are skipped instead of raising a JSONDecodeError.
with open(labeled_submissions_path, encoding='utf-8') as submissions_file:
  labeled_submissions = [
    json.loads(line)
    for line in submissions_file
    if line.strip()
  ]

In [None]:
# The 'id' of every labeled submission, collected into a set for membership tests.
labeled_submission_ids = {
  submission['id']
  for submission in labeled_submissions
}

In [None]:
def _is_coi(submission):
  """Return whether `submission` is flagged 'pseudo_coi' or has type 'coi'."""
  # The `== True` comparison is intentional: only values equal to True count,
  # not arbitrary truthy values. 'type' is only read when the flag check fails.
  return submission.get('pseudo_coi') == True or submission['type'] == 'coi'


# Labeled submissions that are COIs, in their original order.
labeled_cois = [sub for sub in labeled_submissions if _is_coi(sub)]

In [None]:
# Union of the labels from every annotation attached to each COI submission.
# `set().union(...)` is used instead of the unbound `set.union(...)` because the
# latter raises TypeError when a submission has no annotations at all; here
# such a submission simply gets an empty label set.
all_coi_labels_by_id = {
  submission['id']: set().union(
    *(annotation['labels']
      for annotation in submission['annotations'].values())
  )
  for submission in labeled_cois
}

# Only the COI submissions that ended up with at least one label.
coi_labels_by_id = {
  submission_id: labels
  for submission_id, labels in all_coi_labels_by_id.items()
  if labels
}

# Title of every COI submission, keyed by submission id.
all_titles_by_id = {
  submission['id']: submission['title']
  for submission in labeled_cois
}

### Submissions index

In [None]:
def _plan_id_from_link(link):
  """Return the last path segment of `link`, without query string or padding."""
  return link.split('/')[-1].split('?')[0].strip()


# Read the index as UTF-8 and close the file once it has been parsed.
with open(submissions_index_path, encoding='utf-8') as index_file:
  submissions_index = json.load(index_file)['submissionPreviews']

# Pair each entry that has a non-null link with the plan id parsed from it.
# The link is parsed once here and reused by both lookup tables below.
linked_submissions = [
  (_plan_id_from_link(sub['link']), sub)
  for sub in submissions_index
  if sub.get('link') is not None
]

# plan id -> submission id (as a string).
plan_id_to_submission_id = {
  plan_id: str(sub['id'])
  for plan_id, sub in linked_submissions
}
# plan id -> first character of the submission's 'type'.
plan_id_to_submission_prefix = {
  plan_id: sub['type'][0]
  for plan_id, sub in linked_submissions
}
# Reverse lookup: submission id -> plan id.
submission_id_to_plan_id = {
  submission_id: plan_id
  for plan_id, submission_id in plan_id_to_submission_id.items()
}

In [None]:
def pid_to_sid_with_prefix(plan_id):
  """Translate a '<plan id>-<part>' key into a prefixed submission id.

  The plan id is looked up in `plan_id_to_submission_prefix` and
  `plan_id_to_submission_id`. When the resulting submission id is one of
  `labeled_submission_ids`, the result is prefix + submission id; otherwise
  '-<part>' is appended as well.

  Raises:
    ValueError: if `plan_id` does not split into exactly two pieces on '-'.
    KeyError: if the plan id is missing from either lookup table.
  """
  base_id, part = (piece.strip() for piece in plan_id.strip().split('-'))
  type_prefix = plan_id_to_submission_prefix[base_id]
  submission_id = plan_id_to_submission_id[base_id]
  prefixed_id = type_prefix + submission_id
  if submission_id not in labeled_submission_ids:
    # Unlabeled submissions keep their part suffix.
    prefixed_id += '-' + part
  return prefixed_id

### New columns

In [None]:
def _labels_for(prefixed_id):
  """Return the labels (as a list) for an id, ignoring its first character."""
  # The first character is the type prefix added by pid_to_sid_with_prefix.
  return list(coi_labels_by_id.get(prefixed_id[1:], []))


def _title_for(prefixed_id):
  """Return the title for an id, ignoring its first character ('' if unknown)."""
  return all_titles_by_id.get(prefixed_id[1:], '')


# New prefixed id, computed from the current (plan-id based) index.
db.coi_data['id'] = db.coi_data.index.to_series().apply(pid_to_sid_with_prefix)

# Labels are stored as the string form of a list.
# NOTE(review): the labels come from a set, so their order inside that string
# is not guaranteed to be stable between runs.
db.coi_data['labels'] = db.coi_data['id'].apply(_labels_for).apply(str)

db.coi_data['submission_title'] = db.coi_data['id'].apply(_title_for).apply(str)

In [None]:
# Keep the current index values in a 'districtr_id' column before the
# table is re-indexed by the new 'id' column.
previous_index = db.coi_data.index.to_series()
db.coi_data['districtr_id'] = previous_index
db.coi_data = db.coi_data.set_index('id')

### New location data format 

In [None]:
# One list per row of the location table, holding the names of the columns
# whose value in that row equals 1.
coi_tiles = [
  [column for column, flag in record.items() if flag == 1]
  for record in db.coi_location_data.to_dict(orient='records')
]

In [None]:
# Store the same per-row lists under both column names.
db.coi_data['tiles'] = coi_tiles
db.coi_data['block_groups_2010'] = coi_tiles

In [None]:
# A horrible hack to get around moving the `ccdb` module.
# `object.__new__` allocates the instance without calling `__init__`; the
# attributes listed below are then copied over from the loaded database.
from submission_analysis import ccdb as ccdb_new
db_new = object.__new__(ccdb_new.coi_cluster_database)

for attribute in (
  'coi_data',
  'coi_total_dissimilarities',
  'dendrogram',
  'dual_graph',
):
  setattr(db_new, attribute, getattr(db, attribute))

# Write the converted database next to the input files.
output_path = os.path.join(data_dir, output_filename)
with open(output_path, 'wb') as output_file:
  pickle.dump(db_new, output_file)