In [1]:
import hashlib
import numpy as np
import pandas as pd
import copy
import random
import pickle

class DistanceClusterModel():
    """Bundle of the four artefacts that make up a distance-based cluster model.

    Attributes (each defaults to ``None``):
        cluster_centers: cluster center data.
        key_map: keys associated with the model's points.
        nn_tree: nearest-neighbour lookup structure.
        distance_mat: distance matrix.

    The class only stores, pickles and restores these objects; it places no
    constraint on their types.
    """

    def __init__(self, cluster_centers=None, key_map=None, nn_tree=None, distance_mat=None):
        """Store deep copies of the arguments so later changes to the caller's
        objects do not leak into the model."""
        self.cluster_centers = copy.deepcopy(cluster_centers)
        self.key_map = copy.deepcopy(key_map)
        self.nn_tree = copy.deepcopy(nn_tree)
        self.distance_mat = copy.deepcopy(distance_mat)

    def save(self, name='12012020-distance-cluster-model.pkl'):
        """Pickle the four attributes, as one dict, to the file ``name``.

        Args:
            name: path of the pickle file to write (overwritten if present).
        """
        save_dict = {'cluster_centers': self.cluster_centers, 'key_map': self.key_map,
                     'nn_tree': self.nn_tree, 'distance_mat': self.distance_mat}

        with open(name, 'wb') as f:
            pickle.dump(save_dict, f)

    def load(self, name):
        """Replace this model's attributes with those stored in the pickle ``name``.

        The update is all-or-nothing: if the file lacks any expected key the
        lookup raises ``KeyError`` and the model keeps its previous attributes.

        Args:
            name: path of a pickle file previously written by :meth:`save`.

        Raises:
            KeyError: the unpickled dict is missing one of the four fields.
        """
        # SECURITY: unpickling can execute arbitrary code; only load files
        # that come from a trusted source.
        with open(name, 'rb') as f:
            save_dict = pickle.load(f)

        # Resolve every field before touching self, so a malformed file cannot
        # leave the model half-overwritten.
        loaded = tuple(save_dict[field] for field in
                       ('cluster_centers', 'key_map', 'nn_tree', 'distance_mat'))
        self.cluster_centers, self.key_map, self.nn_tree, self.distance_mat = loaded

In [2]:
"""read dcm into memory to re-write"""
# Start from an empty model (all attributes None), then fill its four
# attributes from the pickle on disk; the file name matches save()'s default.
# NOTE(review): load() unpickles the file, which can execute arbitrary code;
# only point this at a trusted file.
dcm = DistanceClusterModel()
dcm.load('12012020-distance-cluster-model.pkl')

In [46]:
"""Examine a cluster center, to see if it's the same as what's already been written"""
# cluster_num = 1
def all_cluster_ordered_keys(cluster_num, model=None):
    """Return every key of the model, ordered by a tree query on one cluster center.

    The tree is asked for as many neighbours as there are entries in
    ``key_map``, so the result contains one entry per key, in the order
    ``nn_tree.query`` returns them (presumably nearest first; confirm for
    the tree type in use).

    Args:
        cluster_num: index into ``model.cluster_centers`` of the center to query.
        model: object exposing ``cluster_centers``, ``key_map`` and ``nn_tree``.
            Defaults to the notebook-level ``dcm``, which keeps existing
            single-argument calls working.

    Returns:
        list[int]: the keys converted to plain Python ints.
    """
    if model is None:
        # Fall back to the model loaded earlier in the notebook.
        model = dcm
    center = model.cluster_centers[cluster_num]
    # The distances returned by the query are not needed here.
    _, n_idx = model.nn_tree.query([center], k=len(model.key_map))
    # n_idx has one row per query point; only one point was queried.
    return [int(k) for k in model.key_map[n_idx][0]]
# """NOTE IT IS NOT, SO WE NEED TO WRITE NEW DATA, AND PULL EXISTING DATA TO MAKE SURE WE'RE MATCHING"""

In [6]:
import sys
import os
# Put ../art_snob_primrose/ (resolved against the current working directory)
# at the front of sys.path, presumably so the `src.datastore_reader` import
# below resolves.
# NOTE(review): the path depends on where the kernel was started; confirm the
# notebook is run from the expected directory.
sys.path.insert(0,f'{os.getcwd()}/../art_snob_primrose/')
from src.datastore_reader import DataStoreReader

In [7]:
# get all the features from datastore
# `project` and `kind` are passed to DataStoreReader.execute; the result is
# kept in `entities`, whose 'reader_data' entry is read later when the
# clusters are rebuilt.
# NOTE(review): max_records=None presumably means "no limit"; confirm
# against DataStoreReader.
project='artsnob-1'
kind='11292020-inverse-cluster-index'

dsr = DataStoreReader()
entities = dsr.execute(project, kind, max_records=None)

2021-01-27 16:28:32,936 INFO datastore_reader.py execute: Starting datastore read from kind: 11292020-inverse-cluster-index
2021-01-27 16:28:34,326 INFO datastore_reader.py execute: Read down 100 records from kind: 11292020-inverse-cluster-index


In [25]:
"""get art cluster definitions to add to backend"""
# Map each CSV row index to the text in its 'Description' column.
art_cluster_def = pd.read_csv('art_cluster_descriptions.csv')['Description'].to_dict()

In [47]:
"""Now we loop across each cluster and make a new entry for the DB"""
updated_sorted_clusters = []
# Cluster ids run 1..100; the model's centers and the description table are
# addressed with cluster_key - 1.
updated_sorted_cluster_keys = list(range(1, 101))

for cluster_key in updated_sorted_cluster_keys:
    # Every key in the model, in the order returned for this cluster's center.
    all_ordered_keys = all_cluster_ordered_keys(cluster_key-1)
    cluster_unordered_keys = set(entities['reader_data'][cluster_key]['idx'])
    cluster_centroid = entities['reader_data'][cluster_key]['centroid']
    # Keep only this cluster's members, preserving the model's ordering.
    cluster_ordered_keys = [k for k in all_ordered_keys if k in cluster_unordered_keys]
    # cluster_ordered_keys is a subset of the stored keys by construction, so
    # any stored key that is left over was never returned by the model.
    missing_keys = cluster_unordered_keys.difference(cluster_ordered_keys)
    if missing_keys:
        # Fail loudly rather than stop early: a truncated list would no longer
        # line up with updated_sorted_cluster_keys.
        raise ValueError(
            f'Mismatch with cluster {cluster_key}: {len(missing_keys)} stored key(s) '
            f'not returned by the model, e.g. {list(missing_keys)[:5]}')
    updated_sorted_clusters.append({'centroid': cluster_centroid,
                                    'idx': cluster_ordered_keys,
                                    'description': art_cluster_def[cluster_key-1]})

In [55]:
"""Now upload new inverse index to ds"""
# write inverse cluster index to the db
from utilities.datastore_helpers import DataStoreInterface

# The records and ids are passed as two parallel lists (presumably paired by
# position; confirm against DataStoreInterface.update), so refuse to write
# when their lengths differ.
if len(updated_sorted_clusters) != len(updated_sorted_cluster_keys):
    raise ValueError(
        f'{len(updated_sorted_clusters)} cluster records but '
        f'{len(updated_sorted_cluster_keys)} ids; refusing to upload')

dsi = DataStoreInterface(project='artsnob-1')
dsi.update(data_list=updated_sorted_clusters, ids=updated_sorted_cluster_keys, kind='01272021-inverse-cluster-index')

