In [None]:
import sys
import os
sys.path.insert(0,f'{os.getcwd()}/../art_snob_primrose/')
from src.datastore_reader import DataStoreReader
from src.list_flattener import ListFlattener
import pandas as pd
from sklearn.feature_extraction.text import TfidfTransformer
import numpy as np
import pickle
import requests
from IPython.display import Image

In [None]:
# Read the `10232020-pca-nn` kind from the `artsnob-1` Datastore project using the
# project's DataStoreReader. max_records=None presumably means "no limit" -- confirm
# against DataStoreReader.execute.
project='artsnob-1'
kind='10232020-pca-nn'

dsr = DataStoreReader()
entities = dsr.execute(project, kind, max_records=None)

In [None]:
# Read the tag reverse index (`11202020-tag_reverse_index` kind) from the same
# Datastore project. Note this cell rebinds `project`, `kind` and `dsr`.
project='artsnob-1'
kind='11202020-tag_reverse_index'

dsr = DataStoreReader()
ri_entities = dsr.execute(project, kind, max_records=None)

In [None]:
# Cache the feature entities in a local pickle file (overwrites any existing file).
with open('10232020-vectors.pkl', 'wb') as f:
    pickle.dump(entities, f)

In [None]:
# Cache the tag reverse-index entities in a local pickle file (overwrites any existing file).
with open('11202020-reversetag.pkl', 'wb') as f:
    pickle.dump(ri_entities, f)

In [None]:
# Reload both objects from the local pickle caches written by the dump cells.
# NOTE: pickle.load can execute arbitrary code from the file; only load files
# that this notebook (or another trusted source) produced.
with open('10232020-vectors.pkl', 'rb') as f:
    entities = pickle.load(f)
with open('11202020-reversetag.pkl', 'rb') as f:
    ri_entities = pickle.load(f)

In [None]:
"""TODO: let's make some tag embeddings-- we can vectorize with tfidf scores across the tags, then we can 
    do some pca or umap dim reduction, and concat that vector to see if that helps make the clusters
    even better

"""
# Load the precomputed tag embeddings and the ids they are paired with (the two
# arrays are zipped together in the next cell, so they are presumably row-aligned).
# NOTE: allow_pickle=True unpickles object arrays on load; only use trusted files.
with open('ordered_aids.npy', 'rb') as f:
    # Fixed: the keyword was mistyped as `allow_pickle=kle=True` (a SyntaxError).
    ordered_aids = np.load(f, allow_pickle=True)

with open('tag_embeddings.npy', 'rb') as f:
    tag_embeddings = np.load(f, allow_pickle=True)

In [None]:
# Look-up table: each entry of `ordered_aids` -> the matching row of `tag_embeddings`.
tag_embed_map = {aid: embedding for aid, embedding in zip(ordered_aids, tag_embeddings)}

In [None]:
# Reorder the tag embeddings so that row i corresponds to all_keys[i].
# NOTE(review): `all_keys` is only assigned further down (from all_art_vectors()),
# so on a fresh kernel this cell raises NameError unless that cell has run first.
# Any key in `all_keys` that is missing from `tag_embed_map` raises KeyError here.
tag_embeddings_ordered = np.array([tag_embed_map[k] for k in all_keys])

In [None]:
# Pull the 'reader_data' payloads out of the loaded objects.
# As used by the helpers in this notebook: `vecs` is looked up by integer art id and
# each value carries a 'umap_data' vector; `ri` is looked up by tag name and each
# value carries a 'keys' list of art ids.
vecs = entities['reader_data']
ri = ri_entities['reader_data']

In [None]:
def tag_centroid(tag):
    """Return the mean 'umap_data' vector of the artworks listed under `tag`.

    Reads the tag's 'keys' from the global reverse index `ri`, converts each key
    to int, and skips keys that are not present in the global `vecs`.
    """
    member_vectors = []
    for raw_key in ri[tag]['keys']:
        art_id = int(raw_key)
        if art_id in vecs:
            member_vectors.append(vecs.get(art_id)['umap_data'])
    return np.array(member_vectors).mean(axis=0)

def close_art(tag, all_art, all_keys):
    """Display the 10 artworks whose vectors are nearest to the centroid of `tag`.

    Nearness is the sum of absolute coordinate differences (L1 distance) between
    each row of `all_art` and the tag centroid; `all_keys` maps row positions
    back to the ids handed to view_art.
    """
    centroid = tag_centroid(tag)
    l1_distance = np.abs(all_art - centroid).sum(axis=1)
    nearest_rows = np.argsort(l1_distance)[:10]
    view_art(all_keys[nearest_rows])

def dim_extrema(dim=0):
    """Return the keys of the 5 lowest and 5 highest artworks along column `dim`.

    Works on the notebook-level `all_art` / `all_keys` arrays; the result lists
    the five smallest values first, then the five largest.
    """
    order = np.argsort(all_art[:, dim])
    lowest, highest = order[:5], order[-5:]
    return all_keys[np.concatenate((lowest, highest))]
    

In [None]:
# Show the artworks at both extremes of column 6 of the art vectors.
# NOTE(review): `view_art` is defined, and `all_art` / `all_keys` are assigned, in
# later cells -- on a fresh top-to-bottom run this cell raises NameError.
view_art(dim_extrema(6))

In [None]:
def all_art_vectors():
    """Return (all_art, all_keys) built from the notebook-level `vecs` mapping.

    `all_art` stacks every entry's 'umap_data'; `all_keys` holds the matching
    dictionary keys in the same order, so row i of `all_art` belongs to
    all_keys[i].
    """
    records = list(vecs.items())
    all_keys = np.array([key for key, _ in records])
    all_art = np.array([record['umap_data'] for _, record in records])
    return all_art, all_keys

In [None]:
def view_art(ids, api_base='http://localhost:8000/art',
             image_base='https://storage.googleapis.com/artsnob-image-scrape/',
             timeout=10):
    """Print each art id and display its image inline.

    For every id, fetches ``{api_base}/{id}``, reads the ``'images'`` field of
    the JSON response and renders ``image_base + images`` at 400x400.

    Args:
        ids: iterable of art ids.
        api_base: base URL of the art-metadata service (defaults to the local
            server this notebook was written against).
        image_base: URL prefix prepended to the ``'images'`` value.
        timeout: seconds to wait for the metadata service. requests applies no
            timeout by default, so without one an unresponsive server would
            hang the cell indefinitely.

    Raises:
        requests.HTTPError: if the metadata service answers with an error status.
        requests.Timeout: if the service does not answer within ``timeout``.
    """
    for idx in ids:
        art = requests.get(f'{api_base}/{idx}', timeout=timeout)
        print(idx)
        # Surface the HTTP status rather than a less helpful JSON/KeyError below.
        art.raise_for_status()
        display(Image(image_base + art.json()['images'], width=400, height=400))
        

In [None]:
# Materialise the art vectors and their keys as parallel arrays (row i <-> all_keys[i]).
all_art, all_keys = all_art_vectors()

In [None]:
# Append the tag-embedding columns to the art-vector columns, row for row.
# Requires `tag_embeddings_ordered` to have been built in the same order as `all_keys`.
all_art_tags = np.concatenate((all_art, tag_embeddings_ordered), axis=1)

In [None]:
# Centroid of the 'Digital' tag. NOTE(review): this notebook-level `tc` is not read
# by any other visible cell (close_art computes its own local centroid).
tc = tag_centroid('Digital')

In [None]:
# Display the 10 artworks nearest to the 'Movies-tv' tag centroid in the art-vector space.
close_art('Movies-tv', all_art, all_keys)

In [None]:
"""Try some umap further dimensional reduction..."""
import numpy as np
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
import seaborn as sns
import umap
%matplotlib inline

In [None]:
# Global seaborn styling: white style, 'poster' context, default figure size 14x10.
sns.set(style='white', context='poster', rc={'figure.figsize':(14,10)})

In [None]:
# Fit UMAP on the art vectors with Manhattan distance and keep the embedding in `u`.
# NOTE(review): no random_state is passed, so the embedding may differ between runs.
fit = umap.UMAP(min_dist=0.5, n_neighbors=15, metric='manhattan')
u = fit.fit_transform(all_art)

In [None]:
# Optional override: replace the fitted UMAP embedding `u` with `all_art_tags`
# (art vectors concatenated with the tag embeddings) for everything downstream.
# NOTE(review): under Restart & Run All this cell always runs, so the UMAP fit from
# the previous cell is discarded; skip this cell to cluster on the UMAP output.
# prev_u = u
u = all_art_tags

In [None]:
# Joint distribution of the first two columns of `u`.
sns.jointplot(x=u[:,0], y=u[:,1])

In [None]:
from sklearn.neighbors import KDTree
from sklearn.cluster import Birch, MiniBatchKMeans
import matplotlib.pyplot as plt
import matplotlib.colors as colors
from itertools import cycle

In [None]:
# KD-tree over the rows of `u`; the cluster display cells query it for the
# artworks nearest to a centroid.
tree = KDTree(u)

In [None]:
# Configure mini-batch k-means with 100 clusters; random_state=0 makes the fit repeatable.
n_clusters=100
mbk = MiniBatchKMeans(init='k-means++', n_clusters=n_clusters, batch_size=1000,
                      n_init=10, max_no_improvement=10, verbose=0,
                      random_state=0)

In [None]:
# Fit the mini-batch k-means model on `u` and collect the distinct labels it assigned.
mbk.fit(u)
mbk_means_labels_unique = np.unique(mbk.labels_)

In [None]:
# Number of high-level centroids.
# NOTE(review): `high_centroids` is assigned in a later cell, so this raises
# NameError on a fresh top-to-bottom run.
len(high_centroids)

In [None]:
## how can we get someone to where they need to be quickest?
## do we need to make 10 clusters on top of these clusters? I think so 
## then we can iterate through those 10, diving into them after the fact 

## algorithm: iterate through the 10 centroid-based high-level clusters
## iterate through likes, then skips, then dislikes
# Cluster the 100 k-means centers again into 10 high-level centroids.
# NOTE(review): get_centroids and cluster_image are defined in a later cell, so this
# cell raises NameError on a fresh top-to-bottom run.
high_centroids, high_tree = get_centroids(mbk.cluster_centers_, n_clusters=10)

# get the images for each cluster item
# (neighbours are looked up in the global `tree` built over `u`; `high_tree` is
# returned above but not used in this cell)
for hc in high_centroids:
    cluster_image(hc, tree, k=3)


In [None]:
# get the central stuff
def get_centroids(data, n_clusters=10):
    """Cluster `data` with MiniBatchKMeans and return (cluster_centers, tree).

    The returned KD-tree is built over `data` itself (the input rows), not over
    the fitted cluster centers.
    """
    kmeans_params = dict(
        init='k-means++',
        n_clusters=n_clusters,
        batch_size=1000,
        n_init=10,
        max_no_improvement=10,
        verbose=0,
        random_state=0,
    )
    clusterer = MiniBatchKMeans(**kmeans_params)
    clusterer.fit(data)
    return clusterer.cluster_centers_, KDTree(data)

# get the example from each
def cluster_image(this_centroid, tree, k=1):
    """Display the `k` artworks nearest to `this_centroid` according to `tree`.

    Row indices returned by the tree are translated to art ids through the
    notebook-level `all_keys` array and passed to view_art.
    """
    neighbour_rows = tree.query([this_centroid], k=k)[1]
    nearest_keys = all_keys[neighbour_rows]
    view_art(nearest_keys[0])

In [None]:
## examples without the umap clustering
# For each k-means cluster: show the 5 artworks nearest to its centroid, scatter the
# cluster's members using the first two columns of `u`, and mark the centroid.
colors_ = cycle(colors.cnames.keys())

fig = plt.figure(figsize=(12, 4))
fig.subplots_adjust(left=0.04, right=0.98, bottom=0.1, top=0.9)
ax = fig.add_subplot(1, 3, 3)
for this_centroid, k, col in zip(mbk.cluster_centers_,
                                 range(n_clusters), colors_):

    print(f'CLUSTER {k}')
    dist, n_idx = tree.query([this_centroid], k=5)
    local_keys = all_keys[n_idx]
    view_art(local_keys[0])

    # Named `in_cluster` rather than `mask` so this loop does not rebind the
    # notebook-level `mask` that the preference-mask cells rely on.
    in_cluster = mbk.labels_ == k
    ax.scatter(u[in_cluster, 0], u[in_cluster, 1], marker='.',
               c='w', edgecolor=col, alpha=0.5)
    ax.scatter(this_centroid[0], this_centroid[1], marker='+',
               c='k', s=25)

ax.set_title("MiniBatchKMeans")
ax.set_autoscaley_on(False)
plt.show()

In [None]:
# algorithm should start somewhere (can be optimized)
# then after dislike, we hurt everything around it with a lower score and move to the unknown place
# FIRST let's make a distance matrix
# KD-tree over the k-means cluster centers, used to compute centroid-to-centroid distances.
cluster_tree = KDTree(mbk.cluster_centers_)


In [None]:
# Pairwise distance matrix between cluster centers:
# distance_mat[i, j] is the KD-tree distance from center i to center j.
centers = mbk.cluster_centers_

# One batched query: for every center, all centers sorted by increasing distance.
# Row i of `n_idx` holds the neighbour indices and row i of `dist` the distances.
dist, n_idx = cluster_tree.query(centers, k=len(centers))

# Each row of `n_idx` is a permutation of 0..n-1; its argsort is the inverse
# permutation, i.e. for every target index j the position at which j was returned.
# Gathering `dist` with it puts the distances back into center-index order.
distance_mat = np.take_along_axis(dist, np.argsort(n_idx, axis=1), axis=1)

In [None]:
def preference_mask(pos_idx=(), neg_idx=(), size=100):
    """Build a (size, size) weighting matrix that highlights the given indices.

    Starts from the identity matrix and, for every index in ``pos_idx`` and then
    in ``neg_idx``, adds 1 to that index's entire row and entire column (so the
    diagonal entry of a selected index gains 2 per occurrence). Both arguments
    are handled identically here; applying a sign for "negative" preferences is
    left to the caller.

    Args:
        pos_idx: iterable of integer indices; repeated indices are applied
            repeatedly.
        neg_idx: iterable of integer indices, treated exactly like ``pos_idx``.
        size: side length of the square matrix.

    Returns:
        numpy.ndarray of shape (size, size) with float entries.
    """
    blank = np.eye(size)
    # Immutable defaults replace the shared mutable `[]` defaults, and scalar
    # broadcasting replaces the per-iteration `np.array([1]*size)` allocations.
    for pid in (*pos_idx, *neg_idx):
        blank[pid, :] += 1
        blank[:, pid] += 1
    return blank

In [None]:
# Score every cluster from positive examples (clusters 0 and 1) and negative
# examples (clusters 5 and 11). Each mask is multiplied elementwise with the
# centroid distance matrix, passed through exp(-0.1 * x) and summed per row; the
# negative contribution is negated before the two are added.
# NOTE(review): entries zeroed by the mask contribute exp(0) = 1 to their row sum.
mask = preference_mask([0,1])
pos_vals = np.exp(np.multiply(mask, distance_mat)*-0.1).sum(axis=1)
neg_mask = preference_mask([], [5,11])
neg_vals = -1*np.exp(np.multiply(neg_mask, distance_mat)*-0.1).sum(axis=1)
pos_vals + neg_vals

In [None]:
# Inspect the negative contribution on its own.
neg_vals

In [None]:
# Inspect the masked distance matrix. When cells run in order, `mask` is the one
# built from indices [0, 1] in the scoring cell above.
np.multiply(mask, distance_mat)

In [None]:
# Repeat of the per-cluster display: show the 5 artworks nearest to each k-means
# centroid, scatter the cluster's members using the first two columns of `u`, and
# mark the centroid.
colors_ = cycle(colors.cnames.keys())

fig = plt.figure(figsize=(12, 4))
fig.subplots_adjust(left=0.04, right=0.98, bottom=0.1, top=0.9)
ax = fig.add_subplot(1, 3, 3)
for this_centroid, k, col in zip(mbk.cluster_centers_,
                                 range(n_clusters), colors_):

    print(f'CLUSTER {k}')
    dist, n_idx = tree.query([this_centroid], k=5)
    local_keys = all_keys[n_idx]
    view_art(local_keys[0])

    # Named `in_cluster` rather than `mask`: the preference-scoring cells keep their
    # matrix in the notebook-level `mask`, and rebinding it here to a boolean label
    # array would break those cells if they are re-run after this one.
    in_cluster = mbk.labels_ == k
    ax.scatter(u[in_cluster, 0], u[in_cluster, 1], marker='.',
               c='w', edgecolor=col, alpha=0.5)
    ax.scatter(this_centroid[0], this_centroid[1], marker='+',
               c='k', s=25)
# ax.set_xlim([-25, 25])
# ax.set_ylim([-25, 25])
ax.set_title("MiniBatchKMeans")
ax.set_autoscaley_on(False)
plt.show()