In [None]:
import os
import pickle
import numpy as np
import pandas as pd
from ast import literal_eval

## Per-cluster geo+semantic pipeline

Moon suggests:
* Leave clusters that are roughly the desired size alone.
* For clusters that are too large, run JN's Markov chain to split each cluster into $k$ parts. _(Question: do we fix $k$ for all clusters? Should $k$ vary linearly with the number of submissions in a cluster? Is this once again an ad-hoc thing?)_

We will need to recombine the split clusters with the untouched clusters to create a final data product. The flow for the `A` clusters (geographical clustering only) is `preprocess_submissions` → `generate_geo_clusters` → `cluster_outputs`, where `generate_geo_clusters` is a minimal wrapper around `ccdb`. A reasonable flow for this analysis might be:
* `preprocess_submissions` (downloads data and joins labels—same as before)
* `generate_geo_clusters` (call geographical clustering algorithm—same as before)
* this notebook (generates matrices from specified superclusters)
* for each cluster to split:
  * JN's notebook (generates labelings from the matrices)
  * another new notebook (generates a CSV or pickle file usable by `cluster_outputs` as is)

We save the matrices in `outputs`, as they are intermediate files used only by this pipeline and therefore not worth cataloguing in `data`.

In [None]:
# Pickled cluster database that this notebook loads.
db_path = '../../MI/data/mi_cluster_db_20210825.pkl'
# Directory the per-cluster matrix files are written to.
output_dir = '../../MI/outputs'
# Filename prefix shared by every matrix file written below.
# NOTE(review): the date here (20210823) differs from the one in db_path (20210825) — confirm this is intended.
output_prefix = 'mi_cluster_db_20210823'
# Cluster count passed to db.clusters_from_number.
num_clusters = 36
# Ids of the clusters whose matrices are exported.
clusters_to_split = (22, 32)

In [None]:
# Load the pickled cluster database. The context manager closes the file
# handle as soon as unpickling finishes instead of leaving it open.
# NOTE: unpickling can execute arbitrary code — only load trusted files.
with open(db_path, 'rb') as db_file:
  db = pickle.load(db_file)

In [None]:
# Tag every row of coi_data with its 0-based row position. The export loop at
# the end of the notebook reads an 'idx' column to index
# db.coi_total_dissimilarities.
# NOTE(review): presumably clusters_from_number carries this column through — confirm.
db.coi_data['idx'] = range(len(db.coi_data))

In [None]:
# Result of db.clusters_from_number for the configured cluster count. The
# cells below read its 'labels', 'clusters' and 'idx' columns.
clusters_df = db.clusters_from_number(num_clusters)

In [None]:
# Parse the 'labels' strings into Python literals. Values that are not strings
# (i.e. already parsed) are left untouched so re-running this cell is safe:
# literal_eval raises ValueError when handed an already-parsed list.
clusters_df['labels'] = clusters_df['labels'].apply(
  lambda raw: literal_eval(raw) if isinstance(raw, str) else raw
)

In [None]:
def jaccard_similarity_matrix(cluster_df):
  """Generates a Jaccard similarity matrix over unique labels.

  Args:
    cluster_df: DataFrame with a 'labels' column; each entry is an iterable
      of hashable labels (duplicates within an entry are ignored).

  Returns:
    An (n, n) float array, n = len(cluster_df), whose [i, j] entry is the
    size of the intersection of rows i and j's label sets divided by the
    size of their union. Pairs where both label sets are empty score 0
    (including on the diagonal). An empty frame yields a (0, 0) array.
  """
  # First-seen order keeps the column assignment deterministic across runs
  # and, unlike set.union(*...), also works when the frame has no rows.
  label_columns = {}
  for labels in cluster_df['labels']:
    for label in labels:
      label_columns.setdefault(label, len(label_columns))

  n = len(cluster_df)
  label_vectors = np.zeros((n, len(label_columns)), dtype=int)
  for row, labels in enumerate(cluster_df['labels']):
    for label in labels:
      label_vectors[row, label_columns[label]] = 1

  # For 0/1 vectors the dot product counts shared labels, and
  # union = size_a + size_b - intersection, so the pairwise Python loop
  # collapses into a single integer matrix product.
  intersections = label_vectors @ label_vectors.T
  set_sizes = label_vectors.sum(axis=1)
  unions = set_sizes[:, None] + set_sizes[None, :] - intersections
  # Clamp the denominator so two empty label sets give 0 rather than 0/0.
  return intersections / np.maximum(unions, 1)

In [None]:
# Make sure the destination exists; np.savetxt does not create directories.
os.makedirs(output_dir, exist_ok=True)

for cluster_id in clusters_to_split:
  cluster_df = clusters_df[clusters_df['clusters'] == int(cluster_id)]
  # A mistyped id would otherwise surface as an opaque error further down.
  if cluster_df.empty:
    raise ValueError(f'No rows found for cluster {cluster_id}; check clusters_to_split.')

  # Label-based similarity between the rows of this cluster.
  jaccard_sims = jaccard_similarity_matrix(cluster_df)
  np.savetxt(
    os.path.join(output_dir, f'{output_prefix}_cluster_{cluster_id}_jaccard_sims.txt'),
    jaccard_sims
  )

  # Restrict the full dissimilarity matrix to this cluster's rows and columns,
  # in the same row order as the Jaccard matrix above. A plain NumPy index
  # array is used rather than a pandas Series.
  indices = cluster_df['idx'].to_numpy()
  hausdorff_distances = db.coi_total_dissimilarities[indices][:, indices]
  np.savetxt(
    os.path.join(output_dir, f'{output_prefix}_cluster_{cluster_id}_hausdorff_dists.txt'),
    hausdorff_distances
  )