In [None]:
import copy
import os
import pickle
import random
import sys
from itertools import cycle

import matplotlib.colors as colors
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import requests
from IPython.display import Image
from sklearn.cluster import Birch, MiniBatchKMeans
from sklearn.decomposition import PCA
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import KDTree

# Project-local modules: the path must be extended before they are imported.
sys.path.insert(0,f'{os.getcwd()}/../art_snob_primrose/')
from src.datastore_reader import DataStoreReader
from src.list_flattener import ListFlattener

In [None]:
# Optionally preload a previously saved model snapshot.
# The file is only written by DistanceClusterModel.save() further down, so it does
# not exist on a first run; guard the load so a fresh Run All does not stop here.
# `dcm` is rebound to a freshly built model later in the notebook either way.
# NOTE(security): pickle.load can execute arbitrary code -- only load trusted files.
model_path = '12012020-distance-cluster-model.pkl'
if os.path.exists(model_path):
    with open(model_path, 'rb') as f:
        dcm = pickle.load(f)
else:
    dcm = None

In [None]:
# Read the scraped image metadata ('frames-scraped-image-data' kind) from Datastore.
project, kind = 'artsnob-1', 'frames-scraped-image-data'

dsr = DataStoreReader()
entities = dsr.execute(project, kind, max_records=None)

In [None]:
# Read the per-artwork feature vectors ('10232020-pca-nn' kind) from Datastore.
project, kind = 'artsnob-1', '10232020-pca-nn'

dsr = DataStoreReader()
feat_entities = dsr.execute(project, kind, max_records=None)

In [None]:
# Read the tag reverse index ('11202020-tag_reverse_index' kind) from Datastore.
project, kind = 'artsnob-1', '11202020-tag_reverse_index'

dsr = DataStoreReader()
ri_entities = dsr.execute(project, kind, max_records=None)

In [None]:
# Every reader result keeps its payload under 'reader_data'; pull the three payloads out.
#   vecs -> feature records, ri -> tag reverse index, data -> scraped image entities
vecs, ri, data = (
    result['reader_data'] for result in (feat_entities, ri_entities, entities)
)

In [None]:
# One row per artwork: its id plus all of its standard tags joined into a single
# space-separated, lower-cased string.
aids = list(data)
tags = [' '.join(entity['standard_tags']) for entity in data.values()]

tag_data = pd.DataFrame({'aids': aids, 'tags': tags})
tag_data['tags'] = tag_data['tags'].map(str.lower)

In [None]:
# Clean the tag strings before vectorising: drop embedded line breaks and collapse
# every run of two or more dashes into a single dash. (The previous chain of
# str.replace calls left '--' behind for some longer runs, e.g. 11 dashes.)
tag_data['tags'] = (
    tag_data['tags']
    .str.replace(r'[\r\n]', '', regex=True)
    .str.replace(r'-{2,}', '-', regex=True)
)

# Tokens are runs of word characters and '-', so hyphenated tags stay one term.
# max_features caps the vocabulary at 1000 terms.
vectorizer = TfidfVectorizer(token_pattern=r"(?u)\b[\w-]+\b", max_features=1000)
X = vectorizer.fit_transform(tag_data['tags'])

In [None]:
from sklearn import preprocessing

# Tags found on fewer than MIN_TAG_COUNT artworks have RARE_TAG_PENALTY subtracted
# from their weighted score, which pushes them down the ranking.
MIN_TAG_COUNT = 500
RARE_TAG_PENALTY = 5

# Per tag: how many documents contain it, and its summed TF-IDF weight.
# The column reductions may come back 2-D, so flatten them to 1-D arrays.
counts = np.asarray((X > 0).sum(axis=0)).ravel()
sums = np.asarray(X.sum(axis=0)).ravel()

# get_feature_names() was removed in scikit-learn 1.2; use its replacement when
# it exists and fall back to the old name on older installs.
if hasattr(vectorizer, 'get_feature_names_out'):
    feature_names = vectorizer.get_feature_names_out().tolist()
else:
    feature_names = list(vectorizer.get_feature_names())

# score = mean TF-IDF weight over the documents that carry the tag
tag_df = pd.DataFrame(
    {'score': sums / counts, 'count': counts.astype(int)},
    index=feature_names,
)

# Standardise the score and the log of the count so the two can be added.
tag_df['scaled_score'] = preprocessing.StandardScaler().fit_transform(
    tag_df['score'].values.reshape(-1, 1)).ravel()
tag_df['scaled_count'] = preprocessing.StandardScaler().fit_transform(
    np.log(tag_df['count']).values.reshape(-1, 1)).ravel()
tag_df['weighted_score'] = (
    tag_df['scaled_score']
    + tag_df['scaled_count']
    - (tag_df['count'] < MIN_TAG_COUNT) * RARE_TAG_PENALTY
)

# {tag: {'weighted_score', 'score', 'count'}}, best tag first
tag_dict = (
    tag_df[['weighted_score', 'score', 'count']]
    .sort_values('weighted_score', ascending=False)
    .to_dict(orient='index')
)

In [None]:
from utilities.datastore_helpers import DataStoreInterface

# Upload one record per tag to the '11122020-tag-scores' kind, using the tag as id.
# Dedicated names are used on purpose: rebinding `data` here would clobber the
# entity dict that the tag-building cell iterates, breaking any re-run of that cell.
dsi = DataStoreInterface(project='artsnob-1')
tag_score_ids = list(tag_dict.keys())
tag_score_rows = list(tag_dict.values())

dsi.update(data_list=tag_score_rows, ids=tag_score_ids, kind='11122020-tag-scores')

In [None]:
# Project the TF-IDF tag vectors onto 100 principal components.
# random_state is pinned because PCA's automatic solver choice can fall back to a
# randomized SVD, which would make the embeddings differ between runs.
pca = PCA(n_components=100, random_state=0)
fullX = X.toarray()
feature_fullX = pca.fit_transform(fullX)

# One embedding vector per row, assigned by position (row i of feature_fullX
# belongs to row i of tag_data) rather than by looking up index labels.
tag_data['embedding'] = list(feature_fullX)
ordered_aids = tag_data['aids'].values
tag_embeddings = tag_data['embedding'].values

In [None]:
def view_art(ids, api_base='http://localhost:8000',
             image_base='https://storage.googleapis.com/artsnob-image-scrape/',
             size=400, timeout=10):
    """Look up each artwork id on the art API and display its image inline.

    Args:
        ids: iterable of artwork ids; each is requested as ``{api_base}/art/{id}``.
        api_base: base URL of the art API (defaults to the local dev server).
        image_base: URL prefix prepended to the ``'images'`` field of the response.
        size: width and height, in pixels, passed to the displayed image.
        timeout: seconds to wait for the API before giving up, so that a dead
            server cannot hang the notebook indefinitely.

    Raises:
        requests.HTTPError: if the API answers with an error status.
    """
    for idx in ids:
        art = requests.get(f'{api_base}/art/{idx}', timeout=timeout)
        # Fail loudly on 4xx/5xx instead of tripping over a missing key below.
        art.raise_for_status()
        print(idx)
        display(Image(image_base + art.json()['images'], width=size, height=size))

def all_art_vectors():
    """Return the 'umap_data' vectors stored in ``vecs`` and their keys.

    Returns:
        (all_art, all_keys): two numpy arrays in the iteration order of ``vecs``;
        ``all_art[i]`` is the 'umap_data' entry of the record keyed ``all_keys[i]``.
    """
    all_keys = np.array(list(vecs.keys()))
    all_art = np.array([record['umap_data'] for record in vecs.values()])
    return all_art, all_keys


In [None]:
# Materialise the 'umap_data' matrix and the matching key array from vecs.
all_art, all_keys = all_art_vectors()

In [None]:
# NOTE(review): `tag_embeddings_ordered` is not defined anywhere in this notebook as
# written. It is rebuilt here on the assumption that it is `tag_embeddings`
# re-ordered to the row order of `all_keys` -- confirm against the original run.
embedding_by_aid = dict(zip(ordered_aids, tag_embeddings))
missing_aids = [key for key in all_keys if key not in embedding_by_aid]
if missing_aids:
    raise KeyError(
        f'{len(missing_aids)} artwork ids have a feature vector but no tag '
        f'embedding, e.g. {missing_aids[:5]}'
    )
tag_embeddings_ordered = np.array([embedding_by_aid[key] for key in all_keys])

# Combined feature space: the 'umap_data' vector followed by the tag embedding.
all_art_tags = np.concatenate((all_art, tag_embeddings_ordered), axis=1)

In [None]:
# KD-tree over the combined vectors; queried further down with cluster centroids.
tree = KDTree(all_art_tags)

In [None]:
# Mini-batch k-means configuration (not fitted yet); random_state is fixed at 0.
n_clusters = 100
mbk = MiniBatchKMeans(
    n_clusters=n_clusters,
    init='k-means++',
    n_init=10,
    batch_size=1000,
    max_no_improvement=10,
    random_state=0,
    verbose=0,
)

In [None]:
# NOTE(review): `u` is not defined anywhere in this notebook as written. The
# centroids fitted here are later queried against the KD-tree built on
# all_art_tags and plotted together with it, so the clustering input is assumed
# to be all_art_tags -- confirm against the original run.
u = all_art_tags

# Fit the clustering and record which cluster labels were actually assigned.
mbk.fit(u)
mbk_means_labels_unique = np.unique(mbk.labels_)

In [None]:
# Build the inverse index: one dict per cluster with its centroid and member ids.
predictions = mbk.predict(u)

# Record keys are 1-based while cluster numbers are 0-based.
inverse_cluster_keys = [cluster_no + 1 for cluster_no in range(len(mbk.cluster_centers_))]
inverse_cluster_index = [
    {
        'centroid': list(centroid),
        'idx': [int(member) for member in all_keys[predictions == cluster_no]],
    }
    for cluster_no, centroid in enumerate(mbk.cluster_centers_)
]

In [None]:
# Store the inverse cluster index records under the '11292020-inverse-cluster-index' kind.
from utilities.datastore_helpers import DataStoreInterface

dsi = DataStoreInterface(project='artsnob-1')
dsi.update(
    kind='11292020-inverse-cluster-index',
    ids=inverse_cluster_keys,
    data_list=inverse_cluster_index,
)

In [None]:
# Distance matrix between cluster centres: distance_mat[i, j] is the distance
# from centre i to centre j.
# NOTE(review): `cluster_tree` is not defined anywhere in this notebook as written.
# Distances are looked up by centre index for every centre, so the tree has to
# index exactly the cluster centres; it is built here accordingly.
cluster_tree = KDTree(mbk.cluster_centers_)
n_centers = len(mbk.cluster_centers_)

# One query for all centres; each result row is sorted by distance, so scatter
# the distances back into centre-index order.
dist, n_idx = cluster_tree.query(mbk.cluster_centers_, k=n_centers)
distance_mat = np.empty_like(dist)
distance_mat[np.arange(n_centers)[:, None], n_idx] = dist

In [None]:
## examples of clusters: show sample artworks per cluster and scatter its members
colors_ = cycle(colors.cnames.keys())

fig = plt.figure(figsize=(12, 4))
fig.subplots_adjust(left=0.04, right=0.98, bottom=0.1, top=0.9)
ax = fig.add_subplot(1, 3, 3)

for k, this_centroid, col in zip(range(n_clusters), mbk.cluster_centers_, colors_):
    print(f'CLUSTER {k}')

    # Display the five artworks that sit closest to this centroid.
    dist, n_idx = tree.query([this_centroid], k=5)
    view_art(all_keys[n_idx][0])

    # Members of the cluster on the first two dimensions, centroid marked with '+'.
    members = all_art_tags[mbk.labels_ == k]
    ax.scatter(members[:, 0], members[:, 1],
               marker='.', c='w', edgecolor=col, alpha=0.5)
    ax.scatter(this_centroid[0], this_centroid[1],
               marker='+', c='k', s=25)

ax.set_title("MiniBatchKMeans")
ax.set_autoscaley_on(False)
plt.show()

In [None]:
class DistanceClusterModel():
    """Container for the clustering artefacts, with pickle-based persistence.

    Holds four attributes -- ``cluster_centers``, ``key_map``, ``nn_tree`` and
    ``distance_mat`` -- each stored as a deep copy of the constructor argument.
    """

    # Attribute names, in the order they are written to the pickle file.
    _FIELDS = ('cluster_centers', 'key_map', 'nn_tree', 'distance_mat')

    def __init__(self, cluster_centers=None, key_map=None, nn_tree=None, distance_mat=None):
        """Deep-copy every argument so later changes by the caller do not leak in."""
        supplied = (cluster_centers, key_map, nn_tree, distance_mat)
        for field, value in zip(self._FIELDS, supplied):
            setattr(self, field, copy.deepcopy(value))

    def save(self, name='12012020-distance-cluster-model.pkl'):
        """Pickle the four attributes as a plain dict into the file ``name``."""
        save_dict = {field: getattr(self, field) for field in self._FIELDS}

        with open(name, 'wb') as f:
            pickle.dump(save_dict, f)

    def load(self, name):
        """Overwrite the four attributes from a dict pickled by ``save``.

        pickle.load can execute arbitrary code, so only load trusted files.
        """
        with open(name, 'rb') as f:
            save_dict = pickle.load(f)

        for field in self._FIELDS:
            setattr(self, field, save_dict[field])

In [None]:
class ExploreExploitClusters():
    """Pick the next cluster to show a user from the clusters they liked/disliked.

    Every cluster gets a score: the sum, over liked and disliked clusters, of an
    exponentially decaying function of the centroid distance (likes add,
    dislikes subtract). Each call then either exploits (highest score first) or
    explores (score closest to zero first).

    Args:
        distance_cluster_model: object exposing ``cluster_centers``, ``key_map``,
            ``nn_tree`` and ``distance_mat``.
        alpha: decay rate of the exponential drop-off.
        min_dist: distances below this value are clipped up to it before the decay.
        exp_exl: probability of taking the EXPLOIT branch on a call; the EXPLORE
            branch is taken otherwise.
    """

    def __init__(self, distance_cluster_model, alpha=1.0, min_dist=6.0, exp_exl=0.1):
        self.cluster_centers = distance_cluster_model.cluster_centers
        self.key_map = distance_cluster_model.key_map
        self.nn_tree = distance_cluster_model.nn_tree
        self.distance_mat = distance_cluster_model.distance_mat

        self.alpha = alpha
        self.min_dist = min_dist
        self.exp_exl = exp_exl
        self.exponential_drop = self.vectorized_drop()

    def vectorized_drop(self):
        """Return a callable mapping distances to ``exp(-alpha * max(d, min_dist))``.

        The callable works element-wise on arrays and reads ``alpha`` and
        ``min_dist`` from the instance each time it is called.
        """
        def drop(distances):
            clipped = np.maximum(np.asarray(distances, dtype=float), self.min_dist)
            return np.exp(-self.alpha * clipped)

        return drop

    def preference_mask(self, idx=[]):
        """Return a (n_clusters, n_clusters) matrix marking the clusters in ``idx``.

        For each id the whole row and the whole column are incremented by one,
        with the diagonal cell counted only once. ``idx`` is only read.
        """
        size = len(self.cluster_centers)
        blank = np.zeros([size, size])

        for pid in idx:
            blank[pid, :] += 1
            blank[:, pid] += 1
            blank[pid, pid] -= 1  # row and column both touched this cell

        return blank

    @staticmethod
    def _first_unliked(ranked, likes, skip_n):
        """Return the (skip_n + 1)-th entry of ``ranked`` not in ``likes``, else None."""
        for item in ranked:
            if item in likes:
                continue
            if skip_n == 0:
                return item
            skip_n -= 1
        return None

    def next_item(self, total_mask, likes, skip_n=0):
        """Choose the next cluster index.

        Args:
            total_mask: like mask minus dislike mask (see ``preference_mask``).
            likes: cluster ids that must not be returned.
            skip_n: number of eligible clusters to pass over before returning.

        Returns:
            The chosen cluster index, or None when fewer than ``skip_n + 1``
            clusters are eligible.
        """
        # Fixed: the drop function lives on the instance; the bare name raised NameError.
        masked_exp = np.multiply(self.exponential_drop(self.distance_mat), total_mask).sum(axis=0)

        if random.random() < self.exp_exl:
            print('EXPLOIT')
            ranked = np.argsort(-1*masked_exp)  # highest score first
        else:
            print('EXPLORE')
            ranked = np.argsort(np.abs(masked_exp))  # score closest to zero first

        return self._first_unliked(ranked, likes, skip_n)

    def predict_next(self, likes=[], dislikes=[], skip_n=0, art_ids=True, n_ids=5):
        """Return the next cluster, optionally with example artwork keys.

        Args:
            likes: ids of clusters the user liked (never returned again).
            dislikes: ids of clusters the user disliked.
            skip_n: number of eligible clusters to pass over.
            art_ids: when True also return keys of artworks near the centroid.
            n_ids: how many artwork keys to return when ``art_ids`` is True.

        Returns:
            ``(cluster, keys)`` when ``art_ids`` is True, where ``keys`` are the
            ``key_map`` entries of the ``n_ids`` nearest neighbours of the
            cluster centroid; otherwise just ``cluster`` (None if none is left).

        Raises:
            ValueError: if ``art_ids`` is True and no eligible cluster is left.
        """
        total_mask = self.preference_mask(likes) - self.preference_mask(dislikes)
        item = self.next_item(total_mask, likes, skip_n)

        if not art_ids:
            return item

        if item is None:
            raise ValueError('no eligible cluster left for the given likes and skip_n')

        dist, n_idx = self.nn_tree.query([self.cluster_centers[item]], k=n_ids)
        return item, self.key_map[n_idx][0]
    
        

In [None]:
# Bundle the fitted centroids, the key array, the KD-tree and the centroid
# distance matrix into one model object (the constructor deep-copies each of them).
dcm = DistanceClusterModel(mbk.cluster_centers_, all_keys, tree, distance_mat)
# exp_exl=1.0: `random.random() < 1.0` always holds, so every call takes the EXPLOIT branch.
eec = ExploreExploitClusters(dcm, exp_exl=1.0)

In [None]:
# Pickle the model to save()'s default file name, '12012020-distance-cluster-model.pkl'.
dcm.save()

In [None]:
# Smoke-test the model: ask for the next cluster given some liked and disliked
# cluster ids, then show the example artworks returned for it.
liked_clusters = [0, 38, 33, 94, 69, 36, 39, 75]
disliked_clusters = [89, 49, 45, 82, 81, 11, 21, 5, 26]

cluster, ex_art = eec.predict_next(likes=liked_clusters,
                                   dislikes=disliked_clusters,
                                   skip_n=0)
print(cluster)
view_art(ex_art)