In [1]:
# source: https://github.com/JustGlowing/minisom

from math import sqrt
import numpy as np
np.set_printoptions(suppress=True, precision=5)
import pandas as pd
pd.options.display.float_format = '{:.5f}'.format
from numpy import (array, unravel_index, nditer, linalg, random, subtract, max,
                   power, exp, pi, zeros, ones, arange, outer, meshgrid, dot,
                   logical_and, mean, std, cov, argsort, linspace, transpose,
                   einsum, prod, nan, sqrt, hstack, diff, argmin, multiply)
from numpy import sum as npsum
from numpy.linalg import norm
from collections import defaultdict, Counter
from warnings import warn
from sys import stdout
from time import time
from datetime import timedelta
import pickle
import os
from itertools import product

# for testing
from numpy.testing import assert_almost_equal, assert_array_almost_equal
from numpy.testing import assert_array_equal
import unittest

from IPython.display import display
import matplotlib.pyplot as plt
from PIL import Image

from sklearn.metrics import ndcg_score, dcg_score
from scipy.spatial import distance_matrix
from scipy.optimize import linear_sum_assignment

from munkres import Munkres

In [2]:
#### Plain SOM ####

def _build_iteration_indexes(data_len, num_iterations,
                             verbose=False, random_generator=None):
    """Returns an iterable with the indexes of the samples
    to pick at each iteration of the training.
    If random_generator is not None, it must be an instalce
    of numpy.random.RandomState and it will be used
    to randomize the order of the samples."""
    iterations = arange(num_iterations) % data_len
    if random_generator:
        random_generator.shuffle(iterations)
    if verbose:
        return _wrap_index__in_verbose(iterations)
    else:
        return iterations

def _wrap_index__in_verbose(iterations):
    """Yields the values in iterations printing the status on the stdout."""
    m = len(iterations)
    digits = len(str(m))
    progress = '\r [ {s:{d}} / {m} ] {s:3.0f}% - ? it/s'
    progress = progress.format(m=m, d=digits, s=0)
    stdout.write(progress)
    beginning = time()
    stdout.write(progress)
    for i, it in enumerate(iterations):
        yield it
        sec_left = ((m-i+1) * (time() - beginning)) / (i+1)
        time_left = str(timedelta(seconds=sec_left))[:7]
        progress = '\r [ {i:{d}} / {m} ]'.format(i=i+1, d=digits, m=m)
        progress += ' {p:3.0f}%'.format(p=100*(i+1)/m)
        progress += ' - {time_left} left '.format(time_left=time_left)
        stdout.write(progress)

def fast_norm(x):
    """Returns norm-2 of a 1-D numpy array.
    * faster than linalg.norm in case of 1-D arrays (numpy 1.9.2rc1).
    """
    return sqrt(dot(x, x.T))

def asymptotic_decay(learning_rate, t, max_iter):
    """Decay function of the learning process.
    Parameters
    ----------
    learning_rate : float
        current learning rate.
    t : int
        current iteration.
    max_iter : int
        maximum number of iterations for the training.
    """
    return learning_rate / (1+t/(max_iter/2))

class plain_SOM(object):
    def __init__(self, x, y, input_len, data, sigma=1.0, learning_rate=0.5,
                 decay_function=asymptotic_decay,
                 neighborhood_function='gaussian', topology='rectangular',
                 activation_distance='euclidean', random_seed=None):
        """Initializes a Self Organizing Maps.
        A rule of thumb to set the size of the grid for a dimensionality
        reduction task is that it should contain 5*sqrt(N) neurons
        where N is the number of samples in the dataset to analyze.
        E.g. if your dataset has 150 samples, 5*sqrt(150) = 61.23
        hence a map 8-by-8 should perform well.
        Parameters
        ----------
        x : int
            x dimension of the SOM.
        y : int
            y dimension of the SOM.
        input_len : int
            Number of the elements of the vectors in input.
        sigma : float, optional (default=1.0)
            Spread of the neighborhood function, needs to be adequate
            to the dimensions of the map.
            (at the iteration t we have sigma(t) = sigma / (1 + t/T)
            where T is #num_iteration/2)
        learning_rate : initial learning rate
            (at the iteration t we have
            learning_rate(t) = learning_rate / (1 + t/T)
            where T is #num_iteration/2)
        decay_function : function (default=None)
            Function that reduces learning_rate and sigma at each iteration
            the default function is:
                        learning_rate / (1+t/(max_iterarations/2))
            A custom decay function will need to to take in input
            three parameters in the following order:
            1. learning rate
            2. current iteration
            3. maximum number of iterations allowed
            Note that if a lambda function is used to define the decay
            MiniSom will not be pickable anymore.
        neighborhood_function : string, optional (default='gaussian')
            Function that weights the neighborhood of a position in the map.
            Possible values: 'gaussian', 'mexican_hat', 'bubble', 'triangle'
        topology : string, optional (default='rectangular')
            Topology of the map.
            Possible values: 'rectangular', 'hexagonal'
        activation_distance : string, optional (default='euclidean')
            Distance used to activate the map.
            Possible values: 'euclidean', 'cosine', 'manhattan', 'chebyshev'
        random_seed : int, optional (default=None)
            Random seed to use.
        """
        if sigma >= x or sigma >= y:
            warn('Warning: sigma is too high for the dimension of the map.')

        self.som_size = x
        
        self._random_generator = random.RandomState(random_seed)

        self._learning_rate = learning_rate
        self._sigma = sigma
        
        self._input_len = input_len
        self.data = data
        
        # random initialization
        self._weights = self._random_generator.rand(x, y, self._input_len)*2-1
        self._weights /= linalg.norm(self._weights, axis=-1, keepdims=True)
        
        self._activation_map = np.zeros((x, y))
        self._neigx = arange(x)
        self._neigy = arange(y)  # used to evaluate the neighborhood function

        self.topology = topology
        self._xx, self._yy = meshgrid(self._neigx, self._neigy)
        self._xx = self._xx.astype(float)
        self._yy = self._yy.astype(float)

        self._decay_function = decay_function

        neig_functions = {'gaussian': self._gaussian}

        self.neighborhood = neig_functions[neighborhood_function]

        distance_functions = {'euclidean': self._euclidean_distance}

        self._activation_distance = distance_functions[activation_distance]
    
        # empty dataframe with column names
        self.selected_images = pd.DataFrame(columns = ['filename', 'ID', 'BMU_x', 'BMU_y'])
        self.selected_vectors = np.zeros((self.som_size, self.som_size, 128))
    
    def _check_input_len(self):
        """Checks that the data in input is of the correct shape."""
        data_len = len(self.data[0])
        if self._input_len != data_len:
            msg = 'Received %d features, expected %d.' % (data_len, self._input_len)
            raise ValueError(msg)
    
    def get_weights(self):
        """Returns the weights of the neural network."""
        return self._weights

    def _activate(self, x):
        """Updates matrix activation_map, in this matrix
           the element i,j is the response of the neuron i,j to x."""   
            
        self._activation_map = self._activation_distance(x, self._weights)

    def activate(self, x):
        """Returns the activation map to x."""
        self._activate(x)
        return self._activation_map

    def _gaussian(self, c, sigma):
        """Returns a Gaussian centered in c."""
        d = 2*sigma*sigma
        ax = exp(-power(self._xx-self._xx.T[c], 2)/d)
        ay = exp(-power(self._yy-self._yy.T[c], 2)/d)
        return (ax * ay).T  # the external product gives a matrix

    def _euclidean_distance(self, x, w):
        return linalg.norm(subtract(x, w), axis=-1)

    def winner(self, x):
        """Computes the coordinates of the winning neuron for the sample x."""
        
        self._activate(x)
        
        # index of bmu
        return unravel_index(self._activation_map.argmin(),
                             self._activation_map.shape)

    def update(self, x, win, t, max_iteration):
        """Updates the weights of the neurons.
        Parameters
        ----------
        x : np.array
            Current pattern to learn.
        win : tuple
            Position of the winning neuron for x (array or tuple).
        t : int
            Iteration index
        max_iteration : int
            Maximum number of training itarations.
        """
        eta = self._decay_function(self._learning_rate, t, max_iteration)
        # sigma and learning rate decrease with the same rule
        sig = self._decay_function(self._sigma, t, max_iteration)
        # improves the performances
        g = self.neighborhood(win, sig)*eta
        # w_new = eta * neighborhood_function * (x-w)
        
        self._weights += einsum('ij, ijk->ijk', g, x-self._weights)

    def quantization(self):
        """Assigns a code book (weights vector of the winning neuron)
        to each sample in data."""
        self._check_input_len()
        winners_coords = argmin(self._distance_from_weights(), axis=1)
        return self._weights[unravel_index(winners_coords,
                                           self._weights.shape[:2])]
    
    def _distance_from_weights(self):
        """Returns a matrix d where d[i,j] is the euclidean distance between
        data[i] and the j-th weight.
        """
        input_data = array(self.data)
        weights_flat = self._weights.reshape(-1, self._weights.shape[2])
        input_data_sq = power(input_data, 2).sum(axis=1, keepdims=True)
        weights_flat_sq = power(weights_flat, 2).sum(axis=1, keepdims=True)
        cross_term = dot(input_data, weights_flat.T)
        return sqrt(-2 * cross_term + input_data_sq + weights_flat_sq.T)

    def random_weights_init(self):
        """Initializes the weights of the SOM
        picking random samples from data."""
        self._check_input_len()
        it = nditer(self._activation_map, flags=['multi_index'])
        while not it.finished:
            rand_i = self._random_generator.randint(len(self.data))
            self._weights[it.multi_index] = self.data[rand_i]
            it.iternext()

    def train(self, num_iteration, random_order=False, verbose=False):
        """Trains the SOM.
        Parameters
        ----------
        data : np.array or list
            Data matrix.
        num_iteration : int
            Maximum number of iterations (one iteration per sample).
        random_order : bool (default=False)
            If True, samples are picked in random order.
            Otherwise the samples are picked sequentially.
        verbose : bool (default=False)
            If True the status of the training
            will be printed at each iteration.
        """
        self._check_input_len()
        
        random_generator = None
        # if random_order:
        #    random_generator = self._random_generator

        iterations = _build_iteration_indexes(len(self.data), num_iteration,
                                              verbose, random_generator)

        for t, iteration in enumerate(iterations):
            self.update(self.data[iteration], self.winner(self.data[iteration]),
                        t, num_iteration)
        
        if verbose:
            print('\n quantization error:', self.quantization_error())

    def train_batch(self, num_iteration, verbose=False):
        """Trains the SOM using all the vectors in data sequentially.
        Parameters
        ----------
        data : np.array or list
            Data matrix.
        num_iteration : int
            Maximum number of iterations (one iteration per sample).
        verbose : bool (default=False)
            If True the status of the training
            will be printed at each iteration.
        """
        self.train(num_iteration, random_order=False, verbose=verbose)

    def distance_map(self):
        """Returns the distance map of the weights.
        Each cell is the normalised sum of the distances between
        a neuron and its neighbours. Note that this method uses
        the euclidean distance."""
        um = zeros((self._weights.shape[0],
                    self._weights.shape[1],
                    8))  # 2 spots more for hexagonal topology

        ii = [[0, -1, -1, -1, 0, 1, 1, 1]]*2
        jj = [[-1, -1, 0, 1, 1, 1, 0, -1]]*2

        for x in range(self._weights.shape[0]):
            for y in range(self._weights.shape[1]):
                w_2 = self._weights[x, y]
                e = y % 2 == 0   # only used on hexagonal topology
                for k, (i, j) in enumerate(zip(ii[e], jj[e])):
                    if (x+i >= 0 and x+i < self._weights.shape[0] and
                        y+j >= 0 and y+j < self._weights.shape[1]):
                        w_1 = self._weights[x+i, y+j]
                        um[x, y, k] = fast_norm(w_2-w_1)

        um = um.sum(axis=2)
        return um/um.max()

    def activation_response(self):
        """
            Returns a matrix where the element i,j is the number of times
            that the neuron i,j have been winner.
        """
        self._check_input_len()
        a = zeros((self._weights.shape[0], self._weights.shape[1]))
        for x in self.data:
            a[self.winner(x)] += 1
        return a

    def quantization_error(self):
        """Returns the quantization error computed as the average
        distance between each input sample and its best matching unit."""
        self._check_input_len()
        return norm(self.data - self.quantization(), axis=1).mean()

    def win_map(self, return_indices=False):
        """Returns a dictionary wm where wm[(i,j)] is a list with:
        - all the patterns that have been mapped to the position (i,j),
          if return_indices=False (default)
        - all indices of the elements that have been mapped to the
          position (i,j) if return_indices=True"""
        self._check_input_len()
        winmap = defaultdict(list)
        for i, x in enumerate(self.data):
            winmap[self.winner(x)].append(i if return_indices else x)
        return winmap

    def labels_map(self, labels):
        """Returns a dictionary wm where wm[(i,j)] is a dictionary
        that contains the number of samples from a given label
        that have been mapped in position i,j.
        Parameters
        ----------
        data : np.array or list
            Data matrix.
        label : np.array or list
            Labels for each sample in data.
        """
        self._check_input_len()
        if not len(self.data) == len(labels):
            raise ValueError('data and labels must have the same length.')
        winmap = defaultdict(list)
        for x, l in zip(self.data, labels):
            winmap[self.winner(x)].append(l)
        for position in winmap:
            winmap[position] = Counter(winmap[position])
        return winmap
    
    ## custom functions for displaying images ##
    
    def display_results(self):
        
        #start_time = time()
        # putting the 3 functions below together
        filepaths = pd.read_csv(r"C:\Users\KWP\bachelor_thesis\native-queries\frame-ID-to-filepath.csv",sep=' ',names=['filename', 'ID'])
        df = self.node2image(filepaths)
        self.select_images(df)
        self.display_images()

        #print("Time elapsed: %s seconds" % (time() - start_time))
        
    def node2image(self, filepaths):

        # run find_bmu to obtain BMU for each feature vector

        bmu_list = []
        for vec in self.data:
            bmu_list.append(np.asarray(self.winner(vec)))

        # storing the result in a dataframe. 
        # i-th row of the dataframe (ex: 4 1) is associated with the BMU coordinates of the i-th feature vector  
        data_ = pd.DataFrame(bmu_list, columns=['BMU_x','BMU_y'])

        # merge the 2 dataframes
        df = pd.concat([filepaths, data_], axis=1, join='inner')

        # now we have:

        # filename ID BMU_x BMU_y
        # ........ rows ........

        return df

    def select_images(self, df):

        # empty dataframe with column names
        selected_images = self.selected_images

        for i in range(self.som_size):
            for j in range(self.som_size):

                tmp = df.loc[(df.BMU_x == i) & (df.BMU_y == j)]

                if(len(tmp) != 0):

                    ## random selection (for now) ##
                    sample = tmp.sample()
                    selected_images = selected_images.append(sample)

        #print("The following images were selected: ")
        #display(selected_images)

        self.selected_images = selected_images
        
    def display_images(self):
        
        my_dir = "C:/Users/KWP/bachelor_thesis/native-queries/thumbs/"

        #fig, ax = plt.subplots(self.som_size, self.som_size, sharex='col', sharey='row', figsize=(320,180))

        for i in range(len(self.selected_images)):

            row = self.selected_images.iloc[i,:]
            x = row.BMU_x
            y = row.BMU_y
            id = row.ID
        
            # extracting selected vectors used for output - for evaluation purposes later
            self.selected_vectors[x, y] = self.data[id]
        
            #img = Image.open(my_dir + row.filename)
            #img = img.resize((320, 180))
            #img = np.asarray(img)

            #ax[x, y].imshow(img)

        #plt.tight_layout(pad=0.1, w_pad=0.1, h_pad=0.1)
        #plt.show()
        
    ## evaluation metrics ## 
    # comparing the distance of selected images on the output screen and in the feature space
    
    def mean_distance(self):
    
        # mean distance matrix[i,j] contains the mean distance of neighbors from selected_vectors[i, j]
        mean_distance_matrix = np.zeros((self.som_size, self.som_size))
        
        distance_sum = 0
        
        for i in range(self.som_size):
            for j in range(self.som_size):
                
                for index in list(direct_neighbours((i,j), self.som_size)):
                    
                    distance_sum += self._euclidean_distance(self.selected_vectors[index], self.selected_vectors[i, j])
                    
                distance_sum /= len(list(direct_neighbours((i,j), self.som_size)))
                
                mean_distance_matrix[i, j] = distance_sum
        
        return mean_distance_matrix
                
    def mean_distance_nextdoor(self):
        
        mean_distance_matrix = np.zeros((self.som_size, self.som_size))
        
        for i in range(self.som_size):
            for j in range(self.som_size):
                
                if j == 0:
                    mean_distance_matrix[i, j] = self._euclidean_distance(self.selected_vectors[i,j+1], self.selected_vectors[i, j])
                    
                elif j == self.som_size-1:
                    mean_distance_matrix[i, j] = self._euclidean_distance(self.selected_vectors[i, j-1], self.selected_vectors[i, j])
                
                else:
                    mean_distance_matrix[i, j] = self._euclidean_distance(self.selected_vectors[i, j-1], self.selected_vectors[i, j]) 
                    + self._euclidean_distance(self.selected_vectors[i, j+1], self.selected_vectors[i, j])
                    mean_distance_matrix[i, j] /= 2
                    
        return mean_distance_matrix
        
    ## Rank correlation metrics ## 
    # comparing the original ranking with SOM-induced ranking
    
    def add_ranking(self, df):
        
        selected = self.selected_images
        # merge the dataframes where ID matches
        selected = selected.merge(df, how='inner', on='ID')  

        selected['SOM_rank'] = np.arange(len(selected)) + 1
        selected['QS_rank'] = selected['query_scores'].rank(ascending=False).astype(int)
    
        self.selected_images = selected
        
    def tau(self):
            
        return self.selected_images['SOM_rank'].corr(self.selected_images['QS_rank'],method='kendall')

    def nDCG(self):
        ## use original scores, not rankings. ##
    
        # Releveance scores in actual order
        actual = self.selected_images['query_scores']
        # in ideal order
        ideal = actual.sort_values()
        
        actual = np.asarray(actual).reshape(1,100)
        ideal = np.asarray(ideal).reshape(1, 100)

        return ndcg_score(actual, ideal) 
              
# returns indexes of direct neighbors of a given position in matrix
def direct_neighbours(cell, size):
    for c in product(*(range(n-1, n+2) for n in cell)):
        if c != cell and all(0 <= n < size for n in c):
            yield c

In [145]:
#### Biased SOM ####

#### TODO: rebiasing ####

class biased_SOM(object):
    def __init__(self, x, y, input_len, data, sigma=1.0, learning_rate=0.5,
                 decay_function=asymptotic_decay,
                 neighborhood_function='gaussian', topology='rectangular',
                 activation_distance='euclidean', random_seed=None, auto_var_adjustment=False, var_param=0, 
                 num_images=20000, query_scores=np.zeros(shape=(20000,)), frame_ids=np.zeros(shape=(20000,))):
        """Initializes a Self Organizing Maps.
        A rule of thumb to set the size of the grid for a dimensionality
        reduction task is that it should contain 5*sqrt(N) neurons
        where N is the number of samples in the dataset to analyze.
        E.g. if your dataset has 150 samples, 5*sqrt(150) = 61.23
        hence a map 8-by-8 should perform well.
        Parameters
        ----------
        x : int
            x dimension of the SOM.
        y : int
            y dimension of the SOM.
        input_len : int
            Number of the elements of the vectors in input.
        sigma : float, optional (default=1.0)
            Spread of the neighborhood function, needs to be adequate
            to the dimensions of the map.
            (at the iteration t we have sigma(t) = sigma / (1 + t/T)
            where T is #num_iteration/2)
        learning_rate : initial learning rate
            (at the iteration t we have
            learning_rate(t) = learning_rate / (1 + t/T)
            where T is #num_iteration/2)
        decay_function : function (default=None)
            Function that reduces learning_rate and sigma at each iteration
            the default function is:
                        learning_rate / (1+t/(max_iterarations/2))
            A custom decay function will need to to take in input
            three parameters in the following order:
            1. learning rate
            2. current iteration
            3. maximum number of iterations allowed
            Note that if a lambda function is used to define the decay
            MiniSom will not be pickable anymore.
        neighborhood_function : string, optional (default='gaussian')
            Function that weights the neighborhood of a position in the map.
            Possible values: 'gaussian', 'mexican_hat', 'bubble', 'triangle'
        topology : string, optional (default='rectangular')
            Topology of the map.
            Possible values: 'rectangular', 'hexagonal'
        activation_distance : string, optional (default='euclidean')
            Distance used to activate the map.
            Possible values: 'euclidean', 'cosine', 'manhattan', 'chebyshev'
        random_seed : int, optional (default=None)
            Random seed to use.
        """
        if sigma >= x or sigma >= y:
            warn('Warning: sigma is too high for the dimension of the map.')

        self.som_size = x
        self.num_images = num_images
        
        self._random_generator = random.RandomState(random_seed)

        self._learning_rate = learning_rate
        self._sigma = sigma

        self.data = data
        self.query_scores=query_scores
        
        self._activation_map = np.zeros((x, y))
        
        ## using top 100 images only! ##
        if self.num_images != 20000:
            
            ids = pd.DataFrame(frame_ids, columns=['ID'])
            ids = ids[0:num_images]
            
            self.data = data[ids.ID]
        
        # random initialization
        if auto_var_adjustment == True or var_param > 0: 
            self._input_len = input_len + 1
            self._weights = self._random_generator.rand(x, y, self._input_len)*2-1
            self.data = np.append(self.data, np.zeros((self.num_images, 1)), axis=1)
            
            #self.random_weights_init()
            
            self.adjust_var()
            self.initial_bias()
            
        else:
            self._input_len = input_len
            self._weights = self._random_generator.rand(x, y, self._input_len)*2-1
            #self.random_weights_init()
            
        self._weights /= linalg.norm(self._weights, axis=-1, keepdims=True)

        self._neigx = arange(x)
        self._neigy = arange(y)  # used to evaluate the neighborhood function

        self.topology = topology
        self._xx, self._yy = meshgrid(self._neigx, self._neigy)
        self._xx = self._xx.astype(float)
        self._yy = self._yy.astype(float)

        self._decay_function = decay_function

        neig_functions = {'gaussian': self._gaussian}

        self.neighborhood = neig_functions[neighborhood_function]

        distance_functions = {'euclidean': self._euclidean_distance}

        self._activation_distance = distance_functions[activation_distance]
    
        # empty dataframe with column names
        self.selected_images = pd.DataFrame(columns = ['filename', 'ID', 'BMU_x', 'BMU_y'])
        self.selected_vectors = np.zeros((self.som_size, self.som_size, 128))
    
    ## custom functions for adding bias to SOM initialization ## 
    
    def adjust_var(self):
        # variance of randomly initialized SOM nodes (on original features)
        sum = 0
        for i in range(self.som_size):
            for j in range(self.som_size):
                sum += self._weights[i, j].var()
        #print("sum of variance over all SOM nodes' original features: {}".format(sum))
            
        # if the var difference is low enough, then we adjust the query scores
        for i in range(10000):
            
            tmp = self.query_scores * i

            if abs(sum - tmp.var()) < 0.1:
                
                #print("variance of adjusted query scores: {}".format(tmp.var()))
                break
                
        self.query_scores = tmp
        
    def initial_bias(self):

        # add query scores to SOM as another feature, considering the position of the nodes

        # we will divide query_scores into N intervals of equal size (where SOM has size n x n)
        # then from each interval corresponding to each row, pick N values randomly.

        # for example, for 5 x 5 SOM: divide query_score into 5 intervals
        # top 80~100% values, 60~80%, ..., 0~20%. Nodes in top row are assigned 5 random values from the first interval, and so on.

        cutpoints = []
        interval_size = 100 / self.som_size 
        for i in range(1, self.som_size):

            cutpoint = np.quantile(self.query_scores, interval_size * i * 0.01)
            cutpoints.append(cutpoint)
            
            if i == 1:

                bottom_interval = self.query_scores[self.query_scores < cutpoint]
                
                # assign to bottom nodes i.e. som[size-1, 0] ... som[size-1, size-1]
                for j in range(self.som_size):
                    self._weights[self.som_size-1, j, 128] = np.random.choice(bottom_interval, size=1)[0]

            else:
                interval = self.query_scores[np.logical_and(self.query_scores > cutpoints[i-2], self.query_scores < cutpoint)]
                
                # assign to middle row nodes e.g. som[1, 0] ... som[1, size-1]
                for j in range(self.som_size):
                    self._weights[self.som_size-i, j, 128] = np.random.choice(interval, size=1)[0]

                if i == self.som_size-1:

                    top_interval = self.query_scores[self.query_scores > cutpoint]         

                    # assign to top nodes i.e. som[0, 0] ... som[0, size-1]
                    for j in range(self.som_size):

                        self._weights[0, j, 128] = np.random.choice(top_interval, size=1)[0]

    # remove the extra dimension added to the data 
    def reset_data(self):
        # return? or modify via class directly?
        self.data = np.delete(self.data, -1, axis=1)
    
    def _check_input_len(self):
        """Checks that the data in input is of the correct shape."""
        data_len = len(self.data[0])
        if self._input_len != data_len:
            msg = 'Received %d features, expected %d.' % (data_len, self._input_len)
            raise ValueError(msg)

    def _activate(self, x):
        """Updates matrix activation_map, in this matrix
           the element i,j is the response of the neuron i,j to x."""   
            
        self._activation_map = self._activation_distance(x, self._weights)

    def activate(self, x):
        """Returns the activation map to x."""
        self._activate(x)
        return self._activation_map

    def _gaussian(self, c, sigma):
        """Returns a Gaussian centered in c."""
        d = 2*sigma*sigma
        ax = exp(-power(self._xx-self._xx.T[c], 2)/d)
        ay = exp(-power(self._yy-self._yy.T[c], 2)/d)
        return (ax * ay).T  # the external product gives a matrix

    def _euclidean_distance(self, x, w):
        return linalg.norm(subtract(x, w), axis=-1)

    def winner(self, x):
        """Computes the coordinates of the winning neuron for the sample x."""
        self._activate(x)
        
        # index of bmu
        return unravel_index(self._activation_map.argmin(),
                             self._activation_map.shape)

    def update(self, x, win, t, max_iteration):
        """Updates the weights of the neurons.
        Parameters
        ----------
        x : np.array
            Current pattern to learn.
        win : tuple
            Position of the winning neuron for x (array or tuple).
        t : int
            Iteration index
        max_iteration : int
            Maximum number of training itarations.
        """
        eta = self._decay_function(self._learning_rate, t, max_iteration)
        # sigma and learning rate decrease with the same rule
        sig = self._decay_function(self._sigma, t, max_iteration)
        # improves the performances
        g = self.neighborhood(win, sig)*eta
        # w_new = eta * neighborhood_function * (x-w)
        
        # change the update function so that the new dimension doesn't get updated
        
        #_weight_newdim = self._weights 
        #self._weights = self._weights[0:]
        self._weights += einsum('ij, ijk->ijk', g, x-self._weights)

    def quantization(self):
        """Assigns a code book (weights vector of the winning neuron)
        to each sample in data."""
        self._check_input_len()
        winners_coords = argmin(self._distance_from_weights(), axis=1)
        return self._weights[unravel_index(winners_coords,
                                           self._weights.shape[:2])]
    
    def _distance_from_weights(self):
        """Returns a matrix d where d[i,j] is the euclidean distance between
        data[i] and the j-th weight.
        """
        input_data = array(self.data)
        weights_flat = self._weights.reshape(-1, self._weights.shape[2])
        input_data_sq = power(input_data, 2).sum(axis=1, keepdims=True)
        weights_flat_sq = power(weights_flat, 2).sum(axis=1, keepdims=True)
        cross_term = dot(input_data, weights_flat.T)
        return sqrt(-2 * cross_term + input_data_sq + weights_flat_sq.T)

    def random_weights_init(self):
        """Initializes the weights of the SOM
        picking random samples from data."""
        self._check_input_len()
        it = nditer(self._activation_map, flags=['multi_index'])
        while not it.finished:
            rand_i = self._random_generator.randint(len(self.data))
            self._weights[it.multi_index] = self.data[rand_i]
            it.iternext()

    def train(self, num_iteration, random_order=False, verbose=False):
        """Trains the SOM.
        Parameters
        ----------
        data : np.array or list
            Data matrix.
        num_iteration : int
            Maximum number of iterations (one iteration per sample).
        random_order : bool (default=False)
            If True, samples are picked in random order.
            Otherwise the samples are picked sequentially.
        verbose : bool (default=False)
            If True the status of the training
            will be printed at each iteration.
        """
        self._check_input_len()
        random_generator = None
        
        iterations = _build_iteration_indexes(len(self.data), num_iteration,
                                              verbose, random_generator)

        for t, iteration in enumerate(iterations):
            self.update(self.data[iteration], self.winner(self.data[iteration]), t, num_iteration)
        
        if verbose:
            print('\n quantization error:', self.quantization_error())

    def train_batch(self, num_iteration, verbose=False):
        """Trains the SOM using all the vectors in data sequentially.
        Parameters
        ----------
        data : np.array or list
            Data matrix.
        num_iteration : int
            Maximum number of iterations (one iteration per sample).
        verbose : bool (default=False)
            If True the status of the training
            will be printed at each iteration.
        """
        self.train(num_iteration, random_order=False, verbose=verbose)

    def distance_map(self):
        """Returns the distance map of the weights.
        Each cell is the normalised sum of the distances between
        a neuron and its neighbours. Note that this method uses
        the euclidean distance."""
        um = zeros((self._weights.shape[0],
                    self._weights.shape[1],
                    8))  # 2 spots more for hexagonal topology

        ii = [[0, -1, -1, -1, 0, 1, 1, 1]]*2
        jj = [[-1, -1, 0, 1, 1, 1, 0, -1]]*2

        for x in range(self._weights.shape[0]):
            for y in range(self._weights.shape[1]):
                w_2 = self._weights[x, y]
                e = y % 2 == 0   # only used on hexagonal topology
                for k, (i, j) in enumerate(zip(ii[e], jj[e])):
                    if (x+i >= 0 and x+i < self._weights.shape[0] and y+j >= 0 and y+j < self._weights.shape[1]):
                        w_1 = self._weights[x+i, y+j]
                        um[x, y, k] = fast_norm(w_2-w_1)

        um = um.sum(axis=2)
        return um/um.max()

    def activation_response(self):
        """
            Returns a matrix where the element i,j is the number of times
            that the neuron i,j have been winner.
        """
        self._check_input_len()
        a = zeros((self._weights.shape[0], self._weights.shape[1]))
        for x in self.data:
            a[self.winner(x)] += 1
        return a

    def quantization_error(self):
        """Returns the quantization error computed as the average
        distance between each input sample and its best matching unit."""
        self._check_input_len()
        return norm(self.data - self.quantization(), axis=1).mean()

    def win_map(self, return_indices=False):
        """Returns a dictionary wm where wm[(i,j)] is a list with:
        - all the patterns that have been mapped to the position (i,j),
          if return_indices=False (default)
        - all indices of the elements that have been mapped to the
          position (i,j) if return_indices=True"""
        self._check_input_len()
        winmap = defaultdict(list)
        for i, x in enumerate(self.data):
            winmap[self.winner(x)].append(i if return_indices else x)
        return winmap

    def labels_map(self, labels):
        """Returns a dictionary wm where wm[(i,j)] is a dictionary
        that contains the number of samples from a given label
        that have been mapped in position i,j.
        Parameters
        ----------
        data : np.array or list
            Data matrix.
        label : np.array or list
            Labels for each sample in data.
        """
        self._check_input_len()
        if not len(self.data) == len(labels):
            raise ValueError('data and labels must have the same length.')
        winmap = defaultdict(list)
        for x, l in zip(self.data, labels):
            winmap[self.winner(x)].append(l)
        for position in winmap:
            winmap[position] = Counter(winmap[position])
        return winmap
    
    ## custom functions for displaying images ##
    
    def display_results(self, frameID_scores, original_data):
        
        #start_time = time()

        # putting the 3 functions below together
        filepaths = pd.read_csv(r"C:\Users\KWP\bachelor_thesis\native-queries\frame-ID-to-filepath.csv",sep=' ',names=['filename', 'ID'])
        
        self.hungarian_algo(filepaths, frameID_scores)
        
        #df = self.node2image(filepaths)
        #self.select_images(df, frameID_scores)
        
        self.display_images(original_data)

        # adding ranking to selected images 
        self.add_ranking()
    
    def hungarian_algo(self, filepaths, frameID_scores):
        
        frameID_scores = frameID_scores[0:100]
        # self.selected_images = pd.DataFrame(columns = ['filename', 'ID', 'BMU_x', 'BMU_y'])
        
        # merge with filepaths on top 100 IDs only 
        merged = frameID_scores.merge(filepaths, how='inner', on='ID') 
    
        # pairwise distance matrix for SOM weight vectors and input data 
        pairwise_dist = distance_matrix(np.concatenate(self._weights), self.data)
        m = Munkres()
        indexes = m.compute(pairwise_dist)

        my_pd = pd.DataFrame(columns = ['ID', 'BMU_x', 'BMU_y'])

        for row, column in indexes:

            x = row / 10
            y = row % 10
            my_pd = my_pd.append({'ID': merged.ID[column],'BMU_x': int(x), 'BMU_y': y}, ignore_index=True)

        merged = my_pd.merge(merged, on='ID')
        
        self.selected_images = merged
    """
    def node2image(self, filepaths):

        # run find_bmu to obtain BMU for each feature vector

        bmu_list = []
        for vec in self.data:
            bmu_list.append(np.asarray(self.winner(vec)))

        # storing the result in a dataframe. 
        # i-th row of the dataframe (ex: 4 1) is associated with the BMU coordinates of the i-th feature vector  
        data_ = pd.DataFrame(bmu_list, columns=['BMU_x','BMU_y'])

        # merge the 2 dataframes
        df = pd.concat([filepaths, data_], axis=1, join='inner')

        # now we have:

        # filename ID BMU_x BMU_y
        # ........ rows ........

        return df

    # image selection after training
    def select_images(self, df, id_scores):

        # empty dataframe with column names
        selected_images = self.selected_images

        for i in range(self.som_size):
            for j in range(self.som_size):

                tmp = df.loc[(df.BMU_x == i) & (df.BMU_y == j)]

                if(len(tmp) != 0):
                    
                    merged = tmp.merge(id_scores, how='inner', on='ID') 
                    
                    # select image with highest score
                    highest = merged.loc[merged['query_scores'].idxmax()]
                    
                    selected_images = selected_images.append(highest)

        #print("The following images were selected: ")
        #display(selected_images)

        self.selected_images = selected_images
    """
    
    def display_images(self, original_data):

        # this function not only displays images, but also removes the extra dimension we added to the data
        # and extracts selected vectors for each SOM node
        
        my_dir = "C:/Users/KWP/bachelor_thesis/native-queries/thumbs/"

        #fig, ax = plt.subplots(self.som_size, self.som_size, sharex='col', sharey='row', figsize=(320,180))

        # remove the extra dimension we initially added to the data
        if(self._input_len == 129):
            self.reset_data()
        for i in range(len(self.selected_images)):

            row = self.selected_images.iloc[i,:]
            x = row.BMU_x
            y = row.BMU_y
            id = row.ID
        
            # extracting selected vectors used for output - for evaluation purposes later
            self.selected_vectors[x, y] = original_data[id]
    
            #img = Image.open(my_dir + row.filename)
            #img = img.resize((320, 180))
            #img = np.asarray(img)

            #ax[x, y].imshow(img)

        #plt.tight_layout(pad=0.1, w_pad=0.1, h_pad=0.1)
        #plt.show()
    
    def add_ranking(self):
        
        selected = self.selected_images
        
        # merge the dataframes where ID matches
        selected = selected.merge(df, how='inner', on='ID')  

        selected['SOM_rank'] = np.arange(len(selected)) + 1
        selected['QS_rank'] = selected['query_scores'].rank(ascending=False).astype(int)
    
        self.selected_images = selected
    
    ## evaluation metrics ## 
    # comparing the distance of selected images on the output screen and in the feature space
    
    def mean_distance(self):
    
        # mean distance matrix[i,j] contains the mean distance of neighbors from selected_vectors[i, j]
        mean_distance_matrix = np.zeros((self.som_size, self.som_size))
        
        distance_sum = 0
        
        for i in range(self.som_size):
            for j in range(self.som_size):
                
                for index in list(direct_neighbours((i,j), self.som_size)):
                    
                    distance_sum += self._euclidean_distance(self.selected_vectors[index], self.selected_vectors[i, j])
                    
                distance_sum /= len(list(direct_neighbours((i,j), self.som_size)))
                
                mean_distance_matrix[i, j] = distance_sum
        
        return mean_distance_matrix
                
    def mean_distance_nextdoor(self):
        
        mean_distance_matrix = np.zeros((self.som_size, self.som_size))
        
        for i in range(self.som_size):
            for j in range(self.som_size):
                
                if j == 0:
                    mean_distance_matrix[i, j] = self._euclidean_distance(self.selected_vectors[i,j+1], self.selected_vectors[i, j])
                    
                elif j == self.som_size-1:
                    mean_distance_matrix[i, j] = self._euclidean_distance(self.selected_vectors[i, j-1], self.selected_vectors[i, j])
                
                else:
                    mean_distance_matrix[i, j] = self._euclidean_distance(self.selected_vectors[i, j-1], self.selected_vectors[i, j]) 
                    + self._euclidean_distance(self.selected_vectors[i, j+1], self.selected_vectors[i, j])
                    mean_distance_matrix[i, j] /= 2
                    
        return mean_distance_matrix
        
    ## Rank correlation metrics ## 
    # comparing the original ranking with SOM-induced ranking
        
    def tau(self):
            
        return self.selected_images['SOM_rank'].corr(self.selected_images['QS_rank'], method='kendall')

    def nDCG(self):
        ## use original scores, not rankings. ##
    
        # Releveance scores in actual order
        actual = self.selected_images['query_scores']
        # in ideal order
        ideal = actual.sort_values()
        
        actual = np.asarray(actual).reshape(1,100)
        ideal = np.asarray(ideal).reshape(1, 100)

        return ndcg_score(actual, ideal)

In [157]:
biased_som._weights

array([[[ 0.14172, -0.31115,  0.44848, ...,  0.02673,  0.00573,
          0.0052 ],
        [ 0.12778, -0.31073,  0.46666, ...,  0.03012, -0.00175,
          0.04312],
        [ 0.12322, -0.30231,  0.47196, ...,  0.02895,  0.00107,
          0.00882],
        ...,
        [ 0.12375, -0.26378,  0.34735, ...,  0.03953,  0.00868,
          0.13403],
        [ 0.12034, -0.22881,  0.29817, ...,  0.03024, -0.00234,
          0.19582],
        [ 0.09973, -0.25239,  0.3091 , ...,  0.05142,  0.03088,
          0.03282]],

       [[ 0.14149, -0.31794,  0.46132, ...,  0.02772, -0.00586,
          0.00122],
        [ 0.14449, -0.31916,  0.46955, ...,  0.02895, -0.00527,
          0.00049],
        [ 0.13817, -0.32239,  0.46663, ...,  0.02774,  0.00457,
          0.00077],
        ...,
        [ 0.13911, -0.28702,  0.35496, ...,  0.02733, -0.00412,
          0.00258],
        [ 0.11647, -0.27209,  0.30015, ...,  0.02418, -0.00271,
          0.0035 ],
        [ 0.10454, -0.24393,  0.29397, ...,  0.0

In [123]:
filepaths = pd.read_csv(r"C:\Users\KWP\bachelor_thesis\native-queries\frame-ID-to-filepath.csv",sep=' ',names=['filename', 'ID'])

In [124]:
query_scores = np.fromfile("native-queries/native-query-scores.bin", dtype='f')
query_scores = query_scores.reshape(327, 20000)

frame_ids = np.fromfile("native-queries/native-query-frame-IDs.bin", dtype='i')
frame_ids = frame_ids.reshape(327, 20000)

frame_features = np.fromfile("native-queries/frame-features.bin", dtype='f')
frame_features = frame_features.reshape(20000, 128)

In [140]:
# biased SOM pipeline with 100 images

biased_som = biased_SOM(10, 10, frame_features[0].shape[0], data=frame_features, query_scores=query_scores[0], frame_ids=frame_ids[0], sigma=8.0, learning_rate=0.5, random_seed=1903, auto_var_adjustment=True, num_images=100)
biased_som.train_batch(20, verbose=True)

# we can see that there are many empty nodes
print("Activation response: \n{}".format(biased_som.activation_response()))

a = pd.DataFrame(frame_ids[0], columns=['ID'])
b = pd.DataFrame(query_scores[0], columns=['query_scores'])
ab = pd.concat([a, b], axis=1, join='inner')

biased_som.display_results(ab, frame_features)

 [  0 / 20 ]   0% - ? it/s [  0 / 20 ]   0% - ? it/s [  1 / 20 ]   5% - 0:00:00 left  [  2 / 20 ]  10% - 0:00:00 left  [  3 / 20 ]  15% - 0:00:00 left  [  4 / 20 ]  20% - 0:00:00 left  [  5 / 20 ]  25% - 0:00:00 left  [  6 / 20 ]  30% - 0:00:00 left  [  7 / 20 ]  35% - 0:00:00 left  [  8 / 20 ]  40% - 0:00:00 left  [  9 / 20 ]  45% - 0:00:00 left  [ 10 / 20 ]  50% - 0:00:00 left  [ 11 / 20 ]  55% - 0:00:00 left  [ 12 / 20 ]  60% - 0:00:00 left  [ 13 / 20 ]  65% - 0:00:00 left  [ 14 / 20 ]  70% - 0:00:00 left  [ 15 / 20 ]  75% - 0:00:00 left  [ 16 / 20 ]  80% - 0:00:00 left  [ 17 / 20 ]  85% - 0:00:00 left  [ 18 / 20 ]  90% - 0:00:00 left  [ 19 / 20 ]  95% - 0:00:00 left  [ 20 / 20 ] 100% - 0:00:00 left 
 quantization error: 0.6876894031825559
Activation response: 
[[ 0.  5. 20.  5.  0.  2.  4.  1.  1.  0.]
 [ 1.  0.  0.  0.  0.  0.  1.  0.  2.  5.]
 [ 1.  0.  0.  0.  0.  0.  0.  0.  3.  2.]
 [ 1.  1.  0.  0.  0.  0.  0.  0.  0.  1.]
 [ 1.  0.  1.  0.  0.  0.  0.  

In [141]:
biased_som.selected_images

Unnamed: 0,ID,BMU_x,BMU_y,query_scores,filename,SOM_rank,QS_rank
0,13953,0,0,0.11298,v04990_s00064(f007026-f007189)_g00156_f007042.jpg,1,55
1,18845,0,1,0.22473,v06770_s00011(f001919-f002233)_g00046_f002067.jpg,2,18
2,12903,0,2,0.22309,v04625_s00061(f008128-f008274)_g00150_f008211.jpg,3,19
3,12902,0,3,0.09151,v04625_s00046(f004905-f005202)_g00100_f005124.jpg,4,66
4,4375,0,4,0.10107,v01596_s00025(f003043-f003170)_g00076_f003086.jpg,5,62
5,12334,0,5,0.07845,v04419_s00020(f003833-f004068)_g00055_f004015.jpg,6,85
6,19007,0,6,0.16621,v06817_s00165(f016454-f016575)_g00233_f016483.jpg,7,29
7,15811,0,7,0.08851,v05654_s00010(f002100-f002497)_g00036_f002350.jpg,8,68
8,10711,0,8,0.16804,v03882_s00063(f010423-f011199)_g00174_f010450.jpg,9,28
9,5697,0,9,1.00000,v02098_s00013(f002609-f003250)_g00087_f003116.jpg,10,1


In [142]:
biased_som.mean_distance()

array([[0.75452, 0.89801, 0.7843 , 0.8612 , 1.06268, 1.23612, 1.43167,
        1.52871, 1.65594, 1.90603],
       [1.06346, 0.80822, 0.68158, 0.77969, 0.91803, 1.12352, 1.27113,
        1.35257, 1.42626, 1.59691],
       [0.95468, 0.75918, 0.69565, 0.80308, 1.00165, 1.13649, 1.34825,
        1.33473, 1.42583, 1.47219],
       [1.02448, 0.81326, 0.89802, 0.93571, 1.06115, 1.24112, 1.346  ,
        1.40778, 1.34878, 1.24303],
       [0.87374, 0.91649, 1.00965, 1.14401, 1.13989, 1.19437, 1.32601,
        1.44714, 1.21328, 1.25665],
       [0.99711, 0.95026, 1.29058, 1.10026, 0.98275, 1.1208 , 1.17993,
        1.18025, 1.24447, 1.52905],
       [1.27339, 0.99672, 1.07441, 0.91287, 0.79237, 0.91454, 0.94327,
        0.9812 , 1.05977, 1.24986],
       [1.05126, 1.004  , 0.92607, 0.77179, 0.69382, 0.74781, 0.87071,
        0.80223, 1.01658, 1.30427],
       [1.03854, 1.05748, 0.8358 , 0.75021, 0.7206 , 0.74334, 0.70822,
        0.7845 , 0.87751, 1.24904],
       [1.22373, 1.00513, 0.89591, 0.

In [143]:
biased_som.mean_distance_nextdoor()

array([[0.8158 , 0.4079 , 0.34974, 0.32002, 0.41734, 0.47194, 0.53945,
        0.54543, 0.66381, 1.36382],
       [0.68681, 0.3434 , 0.26552, 0.32744, 0.41751, 0.36523, 0.50613,
        0.48796, 0.63316, 1.27399],
       [0.54098, 0.27049, 0.27963, 0.30401, 0.39286, 0.4581 , 0.54948,
        0.64341, 0.64427, 1.21535],
       [0.76646, 0.38323, 0.35604, 0.35481, 0.4748 , 0.51774, 0.54364,
        0.63134, 0.58439, 1.14904],
       [0.60661, 0.3033 , 0.36498, 0.53554, 0.45031, 0.43902, 0.61191,
        0.63092, 0.68207, 0.77594],
       [0.63751, 0.31876, 0.59695, 0.56982, 0.4133 , 0.41594, 0.54853,
        0.53462, 0.52604, 1.30962],
       [0.90656, 0.45328, 0.48433, 0.45922, 0.30309, 0.39981, 0.4075 ,
        0.38481, 0.40866, 0.88327],
       [0.78293, 0.39147, 0.43235, 0.36542, 0.25827, 0.26627, 0.40442,
        0.36563, 0.43073, 1.15703],
       [0.91697, 0.45849, 0.42361, 0.31548, 0.34932, 0.33327, 0.27028,
        0.33758, 0.30092, 1.02827],
       [0.72811, 0.36406, 0.31633, 0.

In [144]:
print(biased_som.tau())
print(biased_som.nDCG())

-0.07434343434343435
0.7500568083760983


In [120]:
# Running biased SOM on all 327 queries

mean_dist = np.zeros(shape=(10,10))
mean_distN = np.zeros(shape=(10,10))
tau = 0
nDCG = 0

start_time = time()

reps = 100

for i in range(0, reps):
    biased_som = biased_SOM(10, 10, frame_features[0].shape[0], data=frame_features, sigma=8.0, learning_rate=0.5, random_seed=1903, 
               query_scores=query_scores[i], frame_ids=frame_ids[i], auto_var_adjustment=True, num_images=100)
    
    biased_som.train_batch(15)
    
    a = pd.DataFrame(frame_ids[i], columns=['ID'])
    b = pd.DataFrame(query_scores[i], columns=['query_scores'])
    ab = pd.concat([a, b], axis=1, join='inner')
    
    biased_som.display_results(ab, frame_features)
    
    mean_dist += biased_som.mean_distance()
    mean_distN += biased_som.mean_distance_nextdoor()
    
    tau += biased_som.tau()
    nDCG += biased_som.nDCG()
    
    print("i: {}".format(i))

mean_dist /= reps
mean_distN /= reps

print("mean distance between surrounding neighbors: \n{}".format(mean_dist))
print("mean distance between next door neighbors: \n{}".format(mean_distN))

tau /= reps
nDCG /= reps

print("Kendall's Tau Coefficient: {}".format(tau))
print("nDCG: {}".format(nDCG))

print("Time elapsed: %s seconds" % (time() - start_time))

i: 0
i: 1
i: 2
i: 3
i: 4
i: 5
i: 6
i: 7
i: 8
i: 9
i: 10
i: 11
i: 12
i: 13
i: 14
i: 15
i: 16
i: 17
i: 18
i: 19
i: 20
i: 21
i: 22
i: 23
i: 24
i: 25
i: 26
i: 27
i: 28
i: 29
i: 30
i: 31
i: 32
i: 33
i: 34
i: 35
i: 36
i: 37
i: 38
i: 39
i: 40
i: 41
i: 42
i: 43
i: 44
i: 45
i: 46
i: 47
i: 48
i: 49
i: 50
i: 51
i: 52
i: 53
i: 54
i: 55
i: 56
i: 57
i: 58
i: 59
i: 60
i: 61
i: 62
i: 63
i: 64
i: 65
i: 66
i: 67
i: 68
i: 69
i: 70
i: 71
i: 72
i: 73
i: 74
i: 75
i: 76
i: 77
i: 78
i: 79
i: 80
i: 81
i: 82
i: 83
i: 84
i: 85
i: 86
i: 87
i: 88
i: 89
i: 90
i: 91
i: 92
i: 93
i: 94
i: 95
i: 96
i: 97
i: 98
i: 99
mean distance between surrounding neighbors: 
[[1.0273  1.23121 1.27064 1.26678 1.26168 1.25349 1.26483 1.27451 1.27619
  1.45194]
 [1.30225 1.16795 1.14015 1.14799 1.14034 1.14226 1.15538 1.15327 1.15908
  1.25775]
 [1.25823 1.16095 1.1432  1.13629 1.13222 1.15328 1.13809 1.14672 1.16739
  1.26944]
 [1.26289 1.15969 1.15708 1.14365 1.14427 1.15052 1.15655 1.16391 1.16856
  1.26318]
 [1.25547 1.16093 1.1422

In [121]:
biased_som._activation_map

array([[0.86223, 0.86203, 0.82291, 0.78304, 0.7912 , 0.77292, 0.79096,
        0.80193, 0.80223, 0.82448],
       [0.87234, 0.82542, 0.8331 , 0.77583, 0.77653, 0.78562, 0.77771,
        0.79105, 0.79681, 0.81227],
       [0.86726, 0.81453, 0.79419, 0.78263, 0.77422, 0.78361, 0.78047,
        0.78196, 0.7934 , 0.79165],
       [0.844  , 0.80758, 0.79404, 0.79155, 0.77267, 0.77589, 0.77345,
        0.77806, 0.779  , 0.79213],
       [0.81122, 0.81626, 0.81053, 0.79789, 0.7942 , 0.77357, 0.77202,
        0.76849, 0.77396, 0.7774 ],
       [0.82452, 0.81525, 0.81807, 0.79957, 0.7904 , 0.77873, 0.77307,
        0.76795, 0.76757, 0.77563],
       [0.82946, 0.83129, 0.82336, 0.81128, 0.79753, 0.78655, 0.77372,
        0.76735, 0.76705, 0.77052],
       [0.84711, 0.84775, 0.83398, 0.82015, 0.8048 , 0.7888 , 0.77708,
        0.77116, 0.76829, 0.77165],
       [0.8572 , 0.8485 , 0.84192, 0.8213 , 0.81182, 0.78965, 0.78171,
        0.77266, 0.76933, 0.75809],
       [0.85788, 0.8385 , 0.83343, 0.

In [158]:
# Plain SOM pipeline

plain_som = plain_SOM(10, 10, frame_features[0].shape[0], data=frame_features, sigma=8.0, learning_rate=0.5, random_seed=1903)
plain_som.train_batch(20, verbose=True)

#a = pd.DataFrame(frame_ids[0], columns=['ID'])
#b = pd.DataFrame(query_scores[0], columns=['query_scores'])
#ab = pd.concat([a, b], axis=1, join='inner')

plain_som.display_results()
#plain_som.add_ranking(ab)

 [ 20 / 20 ] 100% - 0:00:00 left 
 quantization error: 0.9833118713591011


In [None]:
plain_som.selected_images

In [None]:
plain_som.mean_distance()

In [None]:
plain_som.mean_distance_nextdoor()

In [None]:
print(plain_som.tau())
print(plain_som.nDCG())