In [None]:
import os
import json
import pickle
import pandas as pd

## Augmenting clusters with semantics
This notebook replaces selected geographical clusters with subclusters based on geography _and_ semantics (generated with the Markov chain algorithm in `GeoSemanticClusters`).

In [None]:
# Pickled cluster database that the notebook loads.
db_path = '../../MI/data/mi_cluster_db_20210825.pkl'

# Directory the augmented cluster table is written to at the end of the notebook.
data_dir = '../../MI/data'
output_filename = 'mi_cluster_db_20210825_augmented'

# Directory the per-cluster clustering JSON files (see `splits`) are read from.
output_dir = '../../MI/outputs'

# Number of geographical clusters requested from the database, i.e. before splitting.
num_clusters = 36

# One entry per geographical cluster to split:
#   target           -- id of the geographical cluster being replaced
#   cluster_filename -- JSON file (inside `output_dir`) holding its sub-cluster assignment
# NOTE(review): these files are dated 20210823 while the database is dated 20210825 --
# confirm they were generated from the same clustering.
splits = [
  dict(target=22, cluster_filename='mi_cluster_db_20210823_cluster_22_clustering.json'),
  dict(target=32, cluster_filename='mi_cluster_db_20210823_cluster_32_clustering.json'),
]

In [None]:
# Load the pickled cluster database. The context manager closes the file handle as soon as
# unpickling finishes (the bare `open(...)` call left it open until garbage collection).
# NOTE(review): unpickling can execute arbitrary code -- only load files from a trusted source.
with open(db_path, 'rb') as db_file:
  db = pickle.load(db_file)

In [None]:
# Ask the loaded database for its clustering with `num_clusters` clusters.
# NOTE(review): later cells use the result with ['clusters'], .loc, .iloc, .index and .to_csv,
# so it is presumably a pandas DataFrame with a 'clusters' column -- confirm.
# NOTE(review): later cells overwrite values of this object in place; if `clusters_from_number`
# hands back the database's own data rather than a copy, the database is modified too -- confirm.
geo_clusters = db.clusters_from_number(num_clusters)

In [None]:
# First unused cluster id: one past the largest value currently in the 'clusters' column.
# The splitting cell hands these ids out to new sub-clusters and advances this counter.
next_cluster = geo_clusters['clusters'].max() + 1

In [None]:
# Replace each targeted geographical cluster with the sub-clusters listed in its JSON file.
# NOTE(review): this cell overwrites `geo_clusters` in place and advances `next_cluster`, so
# running it a second time starts from already-modified state -- re-create both before re-running.
for split in splits:
  clustering_path = os.path.join(output_dir, split['cluster_filename'])
  with open(clustering_path) as clustering_file:
    split_data = json.load(clustering_file)

  # Distinct sub-cluster ids in ascending order; the smallest one is required to be 1.
  new_clusters = sorted(set(split_data.values()))
  assert new_clusters[0] == 1

  # Sub-cluster 1 keeps the id of the cluster being split; every other sub-cluster
  # receives the next unused id, in ascending order of sub-cluster id.
  target = split['target']
  cluster_map = {1: target}
  cluster_map.update(
    (sub_id, next_cluster + offset) for offset, sub_id in enumerate(new_clusters[1:])
  )
  next_cluster += len(new_clusters) - 1

  # Snapshot of the rows that carry the target id before any of them is relabelled;
  # the keys of `split_data` are used as row positions within this snapshot.
  cluster_df = geo_clusters[geo_clusters['clusters'] == target]
  for position, sub_id in split_data.items():
    row_label = cluster_df.index[int(position)]
    geo_clusters.loc[row_label, 'clusters'] = cluster_map[sub_id]

In [None]:
# Name the index 'id' so the CSV's index column gets that header, then write the table.
# NOTE(review): the file goes to `data_dir` (not `output_dir`) and `output_filename` carries
# no '.csv' extension -- confirm downstream readers expect exactly this path.
geo_clusters.index.name = 'id'
augmented_path = os.path.join(data_dir, output_filename)
geo_clusters.to_csv(augmented_path)