In [1]:
# source: https://github.com/andremsouza/python-som

from collections import Counter
from typing import Union, Callable, Tuple, Iterable

import numpy as np
np.set_printoptions(suppress=True, precision=5)
import pandas as pd
pd.options.display.float_format = '{:.5f}'.format

import sklearn
import sklearn.decomposition
import sklearn.preprocessing

import matplotlib.pyplot as plt
from IPython.display import display
from PIL import Image

from sklearn.metrics import ndcg_score, dcg_score
from scipy.spatial import distance_matrix
from scipy.optimize import linear_sum_assignment

from time import time
from itertools import product
from munkres import Munkres

In [2]:
def _asymptotic_decay(x: float, t: int, max_t: int) -> float:
    """
    Asymptotic decay function. Can be used for both the learning_rate or the neighborhood_radius.
    :param x: float: Initial x parameter
    :param t: int: Current iteration
    :param max_t: int: Maximum number of iterations
    :return: float: Current state of x after t iterations
    """
    return x / (1 + t / (max_t / 2))


def _linear_decay(x: float, t: int, max_t: int) -> float:
    """
    Linear decay function. Can be used for both the learning_rate or the neighborhood_radius.
    :param x: float: Initial x parameter
    :param t: int: Current iteration
    :param max_t: int: Maximum number of iterations
    :return: float: Current state of x after t iterations
    """
    return x * (1.0 - t / max_t)

def _euclidean_distance(a: Union[float, np.ndarray], b: Union[float, np.ndarray]) -> Union[float, np.ndarray]:
    return np.linalg.norm(np.subtract(a, b), ord=2, axis=-1)

def direct_neighbours(cell, size):
    for c in product(*(range(n-1, n+2) for n in cell)):
        if c != cell and all(0 <= n < size for n in c):
            yield c

class SOM:   
    """
    Features:
        - Stepwise and batch training
        - Random weight initialization
        - Random sampling weight initialization
        - Linear weight initialization (with PCA)
        - Automatic selection of map size ratio (with PCA)
        - Gaussian and Bubble neighborhood functions
        - Support for custom decay functions
        - Support for visualization (U-matrix, activation matrix)
        - Support for supervised learning (label map)
        - Support for NumPy arrays, Pandas DataFrames and regular lists of values
    """
    
    def __init__(
            self,
            x: Union[int, None],
            y: Union[int, None],
            input_len: int,
            learning_rate: float = 0.5,
            learning_rate_decay: Callable[[float, int, int], float] = _asymptotic_decay,
            neighborhood_radius: float = 1.0,
            neighborhood_radius_decay: Callable[[float, int, int], float] = _asymptotic_decay,
            neighborhood_function: str = 'gaussian',
            distance_function: Callable[
                [Union[float, np.ndarray], Union[float, np.ndarray]], Union[float, np.ndarray]] = _euclidean_distance,
            random_seed: Union[int, None] = None,
            data: Union[np.ndarray, pd.DataFrame, list, None] = None,
        
            auto_var_adjustment=False, 
            var_param=0, 
            num_images=20000, 
            query_scores=np.zeros(shape=(20000,)), 
            frame_ids=np.zeros(shape=(20000,))
    ) -> None:
        """
        Constructor for the self-organizing map class.
        :param x: int or NoneType: X dimension of the self-organizing map
        :param y: int or NoneType: Y dimension of the self-organizing map
        :param input_len: int: Number of features of the training dataset, i.e.,
            number of elements of each node of the network.
        :param learning_rate: float: Initial learning rate for the training process. Defaults to 0.5.
            Note: The value of the learning_rate is irrelevant for the 'batch' training mode.
        :param learning_rate_decay: function: Decay function for the learning_rate variable.
            May be a predefined one from this package, or a custom function, with the same parameters and return type.
            Defaults to _asymptotic_decay.
        :param neighborhood_radius: float: Initial neighborhood radius for the training process. Defaults to 1.
        :param neighborhood_radius_decay: function: Decay function for the neighborhood_radius variable.
            May be a predefined one from this package, or a custom function, with the same parameters and return type.
            Defaults to _asymptotic_decay
        :param neighborhood_function: str: Neighborhood function name for the training process.
            May be either 'gaussian' or 'bubble'.
        :param distance_function: function: Function for calculating distances/dissimilarities between models of the
            network.
            May be a predefined one from this package, or a custom function, with the same parameters and return type.
            Defaults to _euclidean_distance.
        :param random_seed: int or None: Seed for NumPy random value generator. Defaults to None.
        :param data: array-like: dataset for performing PCA.
            Required when either x or y is None, for determining map size.
        """
        
        if (x, y) == (None, None):
            raise ValueError('At least one of the dimensions (x, y) must be specified')
        if x == None or y == None:
            # If a dataset was given through **kwargs, select missing dimension with PCA
            # The ratio of the (x, y) sizes will comply roughly with the ratio of the two largest principal components
            if 'data' == None:
                raise ValueError(
                    "If one of the dimensions is not specified, a dataset must be provided for automatic size "
                    "initialization.")
        # Convert data to numpy array
        if isinstance(data, pd.DataFrame):
            data_array = data.to_numpy()
        else:
            data_array = np.array(data)

        # Update missing size variable
        if x == None:
            x = y // ratio
        if y == None:
            y = x // ratio
            
        # Initializing private variables
        self.som_size = x
        
        self._shape = (np.uint(x), np.uint(y))
        self._input_len = np.uint(input_len)
        self._learning_rate = float(learning_rate)
        self._learning_rate_decay = learning_rate_decay
        self._neighborhood_radius = float(neighborhood_radius)
        self._neighborhood_radius_decay = neighborhood_radius_decay
        self._neighborhood_function = {
            'gaussian': self._gaussian}[neighborhood_function]
        self._distance_function = distance_function
        self._neigx, self._neigy = np.arange(self._shape[0]), np.arange(self._shape[1])
        
        self.data = data
        self.query_scores = query_scores
        self.num_images = num_images
        
        # empty dataframe with column names
        self.selected_images = pd.DataFrame(columns = ['filename', 'ID', 'BMU_x', 'BMU_y'])
        self.selected_vectors = np.zeros((x, y, 128))
        
        # Seed numpy random generator
        if random_seed == None:
            self._random_seed = np.random.randint(np.random.randint(np.iinfo(np.int32).max))
        else:
            self._random_seed = int(random_seed)
        np.random.seed(self._random_seed)

        ## using top N images only! ##
        if num_images != 20000:
            
            ids = pd.DataFrame(frame_ids, columns=['ID'])
            ids = ids[0:num_images]
            
            self.data = data[ids.ID]
        
        # Random weight initialization
        self._weights = np.random.standard_normal(size=(self._shape[0], self._shape[1], self._input_len))
        
        # adding extra dimension to the data
        # random initialization
        if auto_var_adjustment == True or var_param > 0: 
 
            self.data = np.append(self.data, np.zeros((num_images, 1)), axis=1)
            
            self.adjust_var()
            self.initial_bias()
        
    ## custom functions for adding bias to SOM initialization ## 
    def adjust_var(self):
        # variance of randomly initialized SOM nodes (on original features)
        sum = 0
        for i in range(self.som_size):
            for j in range(self.som_size):
                sum += self._weights[i, j].var()
        print("sum of variance over all SOM nodes' original features: {}".format(sum))
            
        # if the var difference is low enough, then we adjust the query scores
        for i in range(10000):
            
            tmp = self.query_scores * i

            if abs(sum - tmp.var()) < 0.1:
                
                print("variance of adjusted query scores: {}".format(tmp.var()))
                break
                
        self.query_scores = tmp
        
    def initial_bias(self):

        # add query scores to SOM as another feature, considering the position of the nodes

        # we will divide query_scores into N intervals of equal size (where SOM has size n x n)
        # then from each interval corresponding to each row, pick N values randomly.

        # for example, for 5 x 5 SOM: divide query_score into 5 intervals
        # top 80~100% values, 60~80%, ..., 0~20%. Nodes in top row are assigned 5 random values from the first interval, and so on.

        cutpoints = []
        interval_size = 100 / self.som_size 
        for i in range(1, self.som_size):

            cutpoint = np.quantile(self.query_scores, interval_size * i * 0.01)
            cutpoints.append(cutpoint)
            
            if i == 1:

                bottom_interval = self.query_scores[self.query_scores < cutpoint]
                
                # assign to bottom nodes i.e. som[size-1, 0] ... som[size-1, size-1]
                for j in range(self.som_size):
                    self._weights[self.som_size-1, j, 128] = np.random.choice(bottom_interval, size=1)[0]

            else:
                interval = self.query_scores[np.logical_and(self.query_scores > cutpoints[i-2], self.query_scores < cutpoint)]
                
                # assign to middle row nodes e.g. som[1, 0] ... som[1, size-1]
                for j in range(self.som_size):
                    self._weights[self.som_size-i, j, 128] = np.random.choice(interval, size=1)[0]

                if i == self.som_size-1:

                    top_interval = self.query_scores[self.query_scores > cutpoint]         

                    # assign to top nodes i.e. som[0, 0] ... som[0, size-1]
                    for j in range(self.som_size):

                        self._weights[0, j, 128] = np.random.choice(top_interval, size=1)[0]

    # remove the extra dimension added to the data 
    def reset_data(self):
        # return? or modify via class directly?
        self.data = np.delete(self.data, -1, axis=1)

    def get_weights(self) -> np.ndarray:
        """
        Gets the weight matrix of the network.
        :return: np.ndarray: Weight matrix of the network.
        """
        return self._weights

    def activate(self, x: Union[np.ndarray, pd.DataFrame, list]) -> np.ndarray:
        """
        Calculates distances between an instance x and the weights of the network.
        :param x: array-like: Instance to be compared with the weights of the network.
        :return: np.ndarray: Distances between x and each weight of the network.
        """
        return self._distance_function(x, self._weights)

    def winner(self, x: Union[np.ndarray, pd.DataFrame, list]) -> Union[Iterable, Tuple[int, int]]:
        """
        Calculates the best-matching unit of the network for an instance x
        :param x: array-like: Instance to be compared with the weights of the network.
        :return: (int, int): Index of the best-matching unit of x.
        """
        activation_map = self.activate(x)
        return np.unravel_index(activation_map.argmin(), activation_map.shape)

    def quantization(self, data: Union[np.ndarray, pd.DataFrame, list]) -> np.ndarray:
        """
        Calculates distances from each instance of 'data' to each of the weights of the network.
        :param data: array-like: Dataset to be compared with the weights of the network.
        :return: np.ndarray: array of lists of distances from each instance of the dataset
            to each weight of the network.
        """
        # Convert data to numpy array
        if isinstance(data, pd.DataFrame):
            data_array = data.to_numpy()
        else:
            data_array = np.array(data)
        return np.array([(self._distance_function(i, self._weights[self.winner(i)])) for i in data_array])

    def quantization_error(self, data: Union[np.ndarray, pd.DataFrame, list]) -> float:
        """
        Calculates average distance of the weights of the network to their assigned instances from data.
        This error is a quality measure for the training process.
        :param data: array-like: Dataset to be compared with the weights of the network.
        :return: float: Quantization error.
        """
        quantization = self.quantization(data)
        return quantization.mean()

    def activation_matrix(self) -> np.ndarray:
        """
        Calculates the activation matrix of the network for a dataset, i.e., for each node, the count of instances that
        have been assigned to it, in the current state.
        :param data: array-like: Dataset to be compared with the weights of the network.
        :return: np.ndarray: Activation matrix.
        """
        # Convert data to numpy array
        if isinstance(self.data, pd.DataFrame):
            data_array = self.data.to_numpy()
        else:
            data_array = np.array(self.data)

        activation_matrix = np.zeros(self._shape)
        for i in data_array:
            activation_matrix[self.winner(i)] += 1
        return activation_matrix
    
    def winner_map(self, data: Union[np.ndarray, pd.DataFrame, list]) -> dict:
        """
        Calculates, for each node (i, j) of the network,
        the list of all instances from 'data' that has been assigned to it.
        :param data: array-like: Dataset to be compared with the weights of the network.
        :return: dict: Winner map.
        """
        # Convert data to numpy array
        if isinstance(data, pd.DataFrame):
            data_array = data.to_numpy()
        else:
            data_array = np.array(data)
        winner_map = {(i, j): [] for i in range(self._shape[0]) for j in range(self._shape[1])}
        for i in data_array:
            winner_map[self.winner(i)].append(i)
        return winner_map

    def train(self, n_iteration: Union[int, None] = None,
              mode: str = 'sequential', verbose: bool = False) -> float:
        """
        Trains the self-organizing map, with the dataset 'data', and a certain number of iterations.
        :param data: array-like: Dataset for training.
        :param n_iteration: int or None: Number of iterations of training.
            If None, defaults to 1000 * len(data) for stepwise training modes,
            or 10 * len(data) for batch training mode.
        :param mode: str: Training mode name. May be either 'random', 'sequential', or 'batch'.
            For 'batch' mode, a much smaller number of iterations is needed, but a higher computation power is required
            for each individual iteration.
        :param verbose: bool: Activate to print useful information to the terminal/console, e.g.,
            the progress of the training process
        :return: float: Quantization error after training
        """
        # Convert data to numpy array for training
        if isinstance(self.data, pd.DataFrame):
            data_array = self.data.to_numpy()
        else:
            data_array = np.array(self.data)

        # If no number of iterations is given, select automatically
        if n_iteration == None:
            n_iteration = {'random': 1000, 'sequential': 1000, 'batch': 10}[mode] * len(data_array)

        if verbose:
            print("Training with", n_iteration,
                  "iterations.\nTraining mode:", mode, sep=' ')

        elif mode == 'sequential':
            # Sequential sampling from training dataset
            for it, i in enumerate(data_array):
                # Calculating decaying alpha and sigma parameters for updating weights
                alpha = self._learning_rate_decay(self._learning_rate, it, n_iteration)
                sigma = self._neighborhood_radius_decay(
                    self._neighborhood_radius, it, n_iteration)

                # Finding winner node (best-matching unit)
                winner = self.winner(i)

                # Updating weights, based on current neighborhood function
                self._weights += alpha * self._neighborhood_function(winner, sigma)[..., None] * (
                        i - self._weights)

                # Print progress, if verbose is activated
                if verbose:
                    print("Iteration:", it, "/", n_iteration, sep=' ', end='\r', flush=True)
                    
        elif mode == 'batch':
            # Batch training
            for it in range(n_iteration):

                # Calculating decaying sigma
                sigma = self._neighborhood_radius_decay(
                    self._neighborhood_radius, it, n_iteration)

                # For each node, create a list of instances associated to it
                winner_map = self.winner_map(data_array)

                # Calculate the weighted average of all instances in the neighborhood of each node
                new_weights = np.zeros(self._weights.shape)
                for i in winner_map.keys():
                    neig = self._neighborhood_function(i, sigma)
                    upper, bottom = np.zeros(self._input_len), 0.0
                    for j in winner_map.keys():
                        upper += neig[j] * np.sum(winner_map[j], axis=0)
                        bottom += neig[j] * len(winner_map[j])

                    # Only update if there is any instance associated with the winner node or its neighbors
                    if bottom != 0:
                        new_weights[i] = upper / bottom

                # Update all nodes concurrently
                self._weights = new_weights

                # Print progress, if verbose is activated
                if verbose:
                    print("Iteration:", it, "/", n_iteration, sep=' ', end='\r', flush=True)
        else:
            # Invalid training mode value
            raise ValueError(
                "Invalid value for 'mode' parameter. Value should be in " + str(['random', 'sequential', 'batch']))

        # Compute quantization error
        q_error = self.quantization_error(data_array)
        if verbose:
            print("Quantization error:", q_error, sep=' ')
        return q_error

    def weight_initialization(self, mode: str = 'random',
                              **kwargs: Union[np.ndarray, pd.DataFrame, list, str, int]) -> None:
        """
        Function for weight initialization of the self-organizing map. Calls other methods for each initialization mode.
        :param mode: str: Initialization mode. May be either 'random', 'linear', or 'sample'.
            Note: Each initialization method may require multiple additional arguments in kwargs.
        :param kwargs:
            For 'random' initialization mode, 'sample_mode': str may be provided to determine the sampling mode.
            'sample_mode' may be either 'standard_normal' (default) or 'uniform'.
            For 'random' and 'sample' modes, 'random_seed': int may be provided for the random value generator.
            For 'sample' and 'linear' modes, 'data': array-like must be provided for sampling/PCA.
        """
        modes = {'random': self._weight_initialization_random,
                 'sample': self._weight_initialization_sample}
        try:
            modes[mode](**kwargs)
        except KeyError:
            raise ValueError("Invalid value for 'mode' parameter. Value should be in " + str(modes.keys()))

    def _weight_initialization_random(self, sample_mode: str = 'standard_normal',
                                      random_seed: Union[int, None] = None) -> None:
        """
        Random initialization method. Assigns weights from a random distribution defined by 'sample_mode'.
        :param sample_mode: str: Distribution for random sampling. May be either 'uniform' or 'standard_normal'.
            Defaults to 'standard_normal'.
        :param random_seed: int or None: Seed for NumPy random value generator. Defaults to None.
        """
        sample_modes = {'uniform': np.random.random, 'standard_normal': np.random.standard_normal}

        # Seed numpy random generator
        if random_seed == None:
            random_seed = np.random.randint(
                np.random.randint(np.iinfo(np.int32).max))
        else:
            random_seed = int(random_seed)
        np.random.seed(random_seed)

        # Initialize weights randomly
        try:
            self._weights = sample_modes[sample_mode](size=self._weights.shape)
        except KeyError:
            raise ValueError(
                "Invalid value for 'sample_mode' parameter. Value should be in " + str(sample_modes.keys()))

    def _weight_initialization_sample(self, data: Union[np.ndarray, pd.DataFrame, list],
                                      random_seed: Union[int, None] = None) -> None:
        """
        Initialization method. Assigns weights to random samples from an input dataset.
        :param data: Dataset for weight initialization/sampling.
        :param random_seed: int or None: Seed for NumPy random value generator. Defaults to None.
        """
        # Seed numpy random generator
        if random_seed == None:
            random_seed = np.random.randint(
                np.random.randint(np.iinfo(np.int32).max))
        else:
            random_seed = int(random_seed)
        np.random.seed(random_seed)

        # Convert data to numpy array for training
        if isinstance(data, pd.DataFrame):
            data_array = data.to_numpy()
        else:
            data_array = np.array(data)

        # Assign weights to random samples from dataset
        sample_size = self._shape[0] * self._shape[1]
        sample = np.random.choice(len(data_array), size=sample_size,
                                  replace=(sample_size > len(data_array)))
        self._weights = data_array[sample].reshape(self._weights.shape)

    def _gaussian(self, c: Tuple[int, int], sigma: float) -> np.ndarray:
        """
        Gaussian neighborhood function, centered in c. 
        :param c: (int, int): Center coordinates for gaussian function.
        :param sigma: float: Spread variable for gaussian function.
        :return: np.ndarray: Gaussian, centered in c, over all the weights of the network.
        """
        # Calculate coefficient with sigma
        d = 2 * sigma * sigma
        # Calculate vertical and horizontal distances
        dx = self._neigx - c[0]
        dy = self._neigy - c[1]

        # Calculate gaussian centered in c
        ax = np.exp(-np.power(dx, 2) / d)
        ay = np.exp(-np.power(dy, 2) / d)
        return np.outer(ax, ay)
    
    
    ## custom functions for displaying images ##
    
    def display_results(self, frameID_scores, original_data, ranking=False):

        start_time = time()

        # putting the 3 functions below together
        filepaths = pd.read_csv(r"native-queries\frame-ID-to-filepath.csv",sep=' ',names=['filename', 'ID'])
        
        if self.num_images == (self.som_size * self.som_size):
            self.hungarian_algo(filepaths, frameID_scores)
        
        else:
            df = self.node2image(filepaths)
            self.select_images(df, frameID_scores)
        
        self.display_images(original_data)
        
        if ranking == True:
            self.add_ranking()

        print("Time elapsed: %s seconds" % (time() - start_time))
    
    def hungarian_algo(self, filepaths, frameID_scores):
        
        frameID_scores = frameID_scores[0:self.num_images]
        # self.selected_images = pd.DataFrame(columns = ['filename', 'ID', 'BMU_x', 'BMU_y'])
        
        # merge with filepaths on top 100 IDs only 
        merged = frameID_scores.merge(filepaths, how='inner', on='ID') 
    
        # pairwise distance matrix for SOM weight vectors and input data 
        pairwise_dist = distance_matrix(np.concatenate(self._weights), self.data)
        m = Munkres()
        indexes = m.compute(pairwise_dist)

        my_pd = pd.DataFrame(columns = ['ID', 'BMU_x', 'BMU_y'])

        for row, column in indexes:

            x = row / 10
            y = row % 10
            my_pd = my_pd.append({'ID': merged.ID[column],'BMU_x': int(x), 'BMU_y': y}, ignore_index=True)

        merged = my_pd.merge(merged, on='ID')
        
        print("The following images were selected by Hungarian Algo: ")
        display(merged)
        
        self.selected_images = merged
    
    def node2image(self, filepaths):

        # run find_bmu to obtain BMU for each feature vector

        bmu_list = []
        for vec in self.data:
            bmu_list.append(np.asarray(self.winner(vec)))

        # storing the result in a dataframe. 
        # i-th row of the dataframe (ex: 4 1) is associated with the BMU coordinates of the i-th feature vector  
        data_ = pd.DataFrame(bmu_list, columns=['BMU_x','BMU_y'])

        # merge the 2 dataframes
        df = pd.concat([filepaths, data_], axis=1, join='inner')

        # now we have:

        # filename ID BMU_x BMU_y
        # ........ rows ........

        return df

    # image selection after training
    def select_images(self, df, id_scores):

        # empty dataframe with column names
        ##selected_images = self.selected_images
        selected_images = pd.DataFrame(columns = ['filename', 'ID', 'BMU_x', 'BMU_y'])
        
        for i in range(self.som_size):
            for j in range(self.som_size):

                tmp = df.loc[(df.BMU_x == i) & (df.BMU_y == j)]

                if(len(tmp) != 0):
                    
                    merged = tmp.merge(id_scores, how='inner', on='ID') 
                    
                    # select image with highest score
                    highest = merged.loc[merged['query_scores'].idxmax()]
                    
                    selected_images = selected_images.append(highest)

        print("The following images were selected: ")
        display(selected_images)

        self.selected_images = selected_images
    
    def display_images(self, original_data):

        # this function not only displays images, but also removes the extra dimension we added to the data
        # and extracts selected vectors for each SOM node
        
        my_dir = "native-queries/thumbs/"

        
        
        # fig, ax = plt.subplots(self.som_size, self.som_size, sharex='col', sharey='row', figsize=(320,180))

        
        
        # remove the extra dimension we initially added to the data
        if(self._input_len == 129):
            self.reset_data()
        for i in range(len(self.selected_images)):

            row = self.selected_images.iloc[i,:]
            x = row.BMU_x
            y = row.BMU_y
            id = row.ID
        
            # extracting selected vectors used for output - for evaluation purposes later
            self.selected_vectors[x, y] = original_data[id]
    
        """
            img = Image.open(my_dir + row.filename)
            img = img.resize((320, 180))
            img = np.asarray(img)

            ax[x, y].imshow(img)

        plt.tight_layout(pad=0.1, w_pad=0.1, h_pad=0.1)
        plt.show()
        """
        
   # adding ranking to selected images 
    def add_ranking(self):
        
        selected = self.selected_images
        
        selected['SOM_rank'] = np.arange(len(selected)) + 1
        selected['QS_rank'] = selected['query_scores'].rank(ascending=False).astype(int)
    
        self.selected_images = selected
        
        print("ranking: ")
        display(selected)
 
    ## evaluation metrics ## 
    # comparing the distance of selected images on the output screen and in the feature space
    
    # here we use 2 different notions of "neighbor" - direct neighbors (neighbors in all cardinal directions) and next door neighbors.
    def mean_distance(self, print_result=False):
        # mean distance matrix[i,j] contains the mean distance of neighbors from selected_vectors[i, j]
        mean_distance_matrix = np.zeros((self.som_size, self.som_size))
        
        distance_sum = 0
        
        for i in range(self.som_size):
            for j in range(self.som_size):
                for index in list(direct_neighbours((i,j), self.som_size)):

                    distance_sum += self._distance_function(self.selected_vectors[index], self.selected_vectors[i, j])
                distance_sum /= len(list(direct_neighbours((i,j), self.som_size)))
                mean_distance_matrix[i, j] = distance_sum
        
        if print_result == True:
            print("mean_distance_matrix: \n")
            print(mean_distance_matrix)
            
        return mean_distance_matrix
                
    def mean_distance_nextdoor(self, print_result=False):
        
        mean_distance_matrix = np.zeros((self.som_size, self.som_size))
        
        for i in range(self.som_size):
            for j in range(self.som_size):
                if j == 0:
                    mean_distance_matrix[i, j] = self._distance_function(self.selected_vectors[i,j+1], self.selected_vectors[i, j])
                elif j == self.som_size-1:
                    mean_distance_matrix[i, j] = self._distance_function(self.selected_vectors[i, j-1], self.selected_vectors[i, j])
                else:
                    mean_distance_matrix[i, j] = self._distance_function(self.selected_vectors[i, j-1], self.selected_vectors[i, j]) 
                    + self._distance_function(self.selected_vectors[i, j+1], self.selected_vectors[i, j])
                    mean_distance_matrix[i, j] /= 2
                    
        if print_result == True:
            print("\nmean_distance_nextdoor: \n")
            print(mean_distance_matrix)
                    
        return mean_distance_matrix
    
    ## Rank correlation metrics ## 
    # comparing the original ranking with SOM-induced ranking
        
    def tau(self):
            
        return self.selected_images['SOM_rank'].corr(self.selected_images['QS_rank'], method='kendall')

    def nDCG(self):
        ## use original scores, not rankings. ##
    
        # Releveance scores in actual order
        actual = self.selected_images['query_scores']
        # in ideal order
        ideal = actual.sort_values()
        
        actual = np.asarray(actual).reshape(1,100)
        ideal = np.asarray(ideal).reshape(1, 100)

        return ndcg_score(actual, ideal)

In [3]:
query_scores = np.fromfile("native-queries/native-query-scores.bin", dtype='f')
query_scores = query_scores.reshape(327, 20000)

frame_ids = np.fromfile("native-queries/native-query-frame-IDs.bin", dtype='i')
frame_ids = frame_ids.reshape(327, 20000)
 
frame_features = np.fromfile("native-queries/frame-features.bin", dtype='f')
frame_features = frame_features.reshape(20000, 128)

In [4]:
som = SOM(x=10, y=10, input_len=128, learning_rate=0.5, neighborhood_radius=1.0,
        neighborhood_function='gaussian', data=frame_features, random_seed=1903)

In [5]:
biased_som = SOM(x=10, y=10, input_len=129, neighborhood_function='gaussian', data=frame_features, random_seed=1903, 
             auto_var_adjustment=True, num_images=100, query_scores=query_scores[0], frame_ids=frame_ids[0])

sum of variance over all SOM nodes' original features: 98.94712212304543
variance of adjusted query scores: 98.86817932128906


In [6]:
som.train(mode='sequential', n_iteration=40000) 

0.8303661286288043

In [7]:
biased_som.train(mode='sequential', n_iteration=40000) 

0.6389408935151962

In [8]:
som.activation_matrix()

array([[220., 207., 139., 130., 297., 470., 248., 308., 159., 172.],
       [258., 173., 328., 315., 264., 173., 158., 149.,  89., 148.],
       [ 69., 223.,  94., 110.,  56., 167., 132., 149., 119., 160.],
       [ 25., 301., 186., 117., 105., 176., 225., 190., 141., 160.],
       [353., 472., 172., 295., 187., 171., 129., 343., 309., 165.],
       [264., 172., 153.,  94., 182., 155.,  97., 209., 250., 375.],
       [157., 389., 109., 151., 174., 233., 124., 139., 173., 131.],
       [216., 188., 233., 182., 122., 134., 239., 199., 174., 175.],
       [315., 169., 381., 240., 243., 223., 456., 120., 152., 188.],
       [350., 341., 264., 144., 272., 165., 107.,  73., 177., 121.]])

In [9]:
biased_som.activation_matrix()

array([[ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  4.,  1.,  4.,  4.,  0.,  0.],
       [ 0.,  0.,  0.,  4.,  3.,  3.,  2.,  3.,  2.,  0.],
       [ 0.,  0.,  0.,  2.,  5.,  1.,  0.,  2.,  2.,  0.],
       [ 0.,  0.,  0.,  4., 10., 21.,  4.,  8., 11.,  0.]])

In [10]:
a = pd.DataFrame(frame_ids[0], columns=['ID'])
b = pd.DataFrame(query_scores[0], columns=['query_scores'])
ab = pd.concat([a, b], axis=1, join='inner')

som.display_results(ab, frame_features, ranking=True)

The following images were selected: 


Unnamed: 0,filename,ID,BMU_x,BMU_y,query_scores
133,v04582_s00192(f017354-f017429)_g00297_f017400.jpg,12773,0,0,0.00403
175,v06055_s00202(f036886-f037073)_g00290_f037072.jpg,16897,0,1,0.00766
127,v06692_s00061(f009716-f009776)_g00088_f009758.jpg,18661,0,2,0.00741
6,v00382_s00002(f003291-f006109)_g00050_f005544.jpg,1096,0,3,0.03613
214,v05326_s00001(f001841-f013458)_g00225_f008121.jpg,14890,0,4,0.11311
...,...,...,...,...,...
112,v04803_s00535(f028723-f028786)_g00639_f028771.jpg,13417,9,5,0.00164
17,v01419_s00073(f003647-f003677)_g00102_f003675.jpg,3885,9,6,0.00535
13,v01037_s00006(f001673-f001732)_g00011_f001708.jpg,2800,9,7,0.00341
122,v05033_s00074(f004550-f004599)_g00091_f004575.jpg,14099,9,8,0.12221


ranking: 


Unnamed: 0,filename,ID,BMU_x,BMU_y,query_scores,SOM_rank,QS_rank
133,v04582_s00192(f017354-f017429)_g00297_f017400.jpg,12773,0,0,0.00403,1,68
175,v06055_s00202(f036886-f037073)_g00290_f037072.jpg,16897,0,1,0.00766,2,49
127,v06692_s00061(f009716-f009776)_g00088_f009758.jpg,18661,0,2,0.00741,3,51
6,v00382_s00002(f003291-f006109)_g00050_f005544.jpg,1096,0,3,0.03613,4,27
214,v05326_s00001(f001841-f013458)_g00225_f008121.jpg,14890,0,4,0.11311,5,12
...,...,...,...,...,...,...,...
112,v04803_s00535(f028723-f028786)_g00639_f028771.jpg,13417,9,5,0.00164,96,83
17,v01419_s00073(f003647-f003677)_g00102_f003675.jpg,3885,9,6,0.00535,97,58
13,v01037_s00006(f001673-f001732)_g00011_f001708.jpg,2800,9,7,0.00341,98,76
122,v05033_s00074(f004550-f004599)_g00091_f004575.jpg,14099,9,8,0.12221,99,10


Time elapsed: 1.6463747024536133 seconds


In [11]:
a = pd.DataFrame(frame_ids[0], columns=['ID'])
b = pd.DataFrame(query_scores[0], columns=['query_scores'])
ab = pd.concat([a, b], axis=1, join='inner')

biased_som.display_results(ab, frame_features, ranking=True)

The following images were selected by Hungarian Algo: 


Unnamed: 0,ID,BMU_x,BMU_y,query_scores,filename
0,18358,0,0,0.08505,v06594_s00035(f004610-f004975)_g00071_f004900.jpg
1,15387,0,1,0.07843,v05509_s00052(f005042-f005141)_g00114_f005100.jpg
2,1,0,2,0.32380,v00000_s00017(f002103-f002959)_g00050_f002800.jpg
3,7890,0,3,0.14411,v02879_s00046(f005836-f006173)_g00127_f005994.jpg
4,18843,0,4,0.08343,v06769_s00069(f009408-f009738)_g00130_f009422.jpg
...,...,...,...,...,...
95,5645,9,5,0.86154,v02083_s00068(f006057-f006208)_g00171_f006083.jpg
96,10496,9,6,0.07137,v03798_s00030(f004646-f004785)_g00086_f004680.jpg
97,14911,9,7,0.07515,v05332_s00051(f004908-f005001)_g00130_f004975.jpg
98,19706,9,8,0.06889,v07055_s00085(f006951-f006993)_g00209_f006975.jpg


ranking: 


Unnamed: 0,ID,BMU_x,BMU_y,query_scores,filename,SOM_rank,QS_rank
0,18358,0,0,0.08505,v06594_s00035(f004610-f004975)_g00071_f004900.jpg,1,75
1,15387,0,1,0.07843,v05509_s00052(f005042-f005141)_g00114_f005100.jpg,2,86
2,1,0,2,0.32380,v00000_s00017(f002103-f002959)_g00050_f002800.jpg,3,12
3,7890,0,3,0.14411,v02879_s00046(f005836-f006173)_g00127_f005994.jpg,4,36
4,18843,0,4,0.08343,v06769_s00069(f009408-f009738)_g00130_f009422.jpg,5,77
...,...,...,...,...,...,...,...
95,5645,9,5,0.86154,v02083_s00068(f006057-f006208)_g00171_f006083.jpg,96,2
96,10496,9,6,0.07137,v03798_s00030(f004646-f004785)_g00086_f004680.jpg,97,93
97,14911,9,7,0.07515,v05332_s00051(f004908-f005001)_g00130_f004975.jpg,98,88
98,19706,9,8,0.06889,v07055_s00085(f006951-f006993)_g00209_f006975.jpg,99,98


Time elapsed: 3.6214120388031006 seconds


In [12]:
som.mean_distance(print_result=True)
som.mean_distance_nextdoor(print_result=True);

mean_distance_matrix: 

[[1.25411 1.51603 1.59204 1.57529 1.4399  1.46328 1.40872 1.5081  1.5813
  1.83708]
 [1.71943 1.56478 1.49685 1.49022 1.31228 1.26349 1.33087 1.38015 1.51089
  1.61019]
 [1.59889 1.52032 1.54495 1.43873 1.43249 1.48649 1.45806 1.44036 1.51219
  1.57885]
 [1.51867 1.36196 1.48011 1.4127  1.49898 1.50736 1.49316 1.52758 1.49903
  1.56421]
 [1.51423 1.36297 1.38529 1.47154 1.36847 1.48676 1.46426 1.40652 1.33559
  1.44217]
 [1.52583 1.45276 1.46872 1.48414 1.40044 1.26696 1.43554 1.27568 1.29346
  1.3214 ]
 [1.45115 1.36572 1.46057 1.48676 1.49242 1.35367 1.45495 1.46826 1.31631
  1.35719]
 [1.44806 1.33838 1.45071 1.49361 1.47978 1.48261 1.51834 1.46574 1.39343
  1.52572]
 [1.44531 1.324   1.39476 1.45609 1.41813 1.44363 1.51219 1.46246 1.40769
  1.35147]
 [1.56616 1.45702 1.47454 1.42189 1.53009 1.45245 1.41784 1.60718 1.54301
  1.64422]]

mean_distance_nextdoor: 

[[1.13104 0.56552 0.54763 0.65171 0.63781 0.60955 0.49061 0.55966 0.66851
  1.23814]
 [1.39279 0.69

In [13]:
biased_som.mean_distance(print_result=True)
biased_som.mean_distance_nextdoor(print_result=True);

mean_distance_matrix: 

[[1.20631 1.37352 1.2954  1.37234 1.22617 1.19929 1.25654 1.48416 1.38169
  1.64566]
 [1.59856 1.36694 1.31694 1.22291 1.29886 1.3864  1.39363 1.34347 1.40818
  1.39549]
 [1.35235 1.21258 1.24537 1.31245 1.50098 1.45908 1.44231 1.43467 1.41108
  1.40433]
 [1.53928 1.19255 1.1573  1.35115 1.3888  1.22893 1.31725 1.37544 1.36832
  1.38998]
 [1.3904  1.11283 1.12999 1.14914 1.10958 1.20335 1.09986 1.35535 1.41376
  1.38276]
 [1.54878 1.14855 1.00473 1.06065 1.29144 1.17501 1.111   1.3226  1.36972
  1.37115]
 [1.32166 1.40046 1.10218 1.32531 1.37698 1.11277 1.01744 1.08058 1.2923
  1.31785]
 [1.44387 1.20183 1.11512 1.05026 1.08059 1.19754 0.88507 1.02738 1.1842
  1.20527]
 [1.43203 1.40417 0.99513 1.01917 0.88601 0.88984 0.91518 0.88559 0.91614
  1.28316]
 [1.51822 1.19699 1.02719 0.96628 0.89964 0.83237 1.05821 0.98629 0.98809
  1.25126]]

mean_distance_nextdoor: 

[[1.24659 0.6233  0.40482 0.43145 0.63696 0.33638 0.3225  0.56605 0.65805
  1.05373]
 [1.30459 0.652

In [14]:
som.tau()

-0.02343434343434344

In [15]:
som.nDCG()

0.7486023513020942

In [16]:
som._input_len

128

In [17]:
biased_som.tau()

0.034747474747474756

In [18]:
biased_som.nDCG()

0.6800081016184968