## Vector Similarity Metrics:

* Data is represented as n-dimensional vectors, and the similarity between two vectors is determined by the distance between them: the smaller the distance, the greater the similarity (the two are inversely related).

* Most common metrics:

  * Euclidean distance: 
    * measures the straight line distance between 2 points in space.
    * heavily influenced by the magnitude of the vector.
    * calculated as the square root of the sum of the squared differences of the co-ordinates.
  
  * Cosine similarity:
    * measures the cosine of the angle between two vectors, so it is concerned only with the orientation of the vectors (1 = perfectly aligned).
    * it ignores the magnitude of the vectors.
    * calculated as the dot product of the vectors divided by the product of their magnitudes.
  
  * Manhattan distance:
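    * measures how close two points are when moving only along the co-ordinate axes (city-block distance); like euclidean distance, it is influenced by the vector magnitude.
    * less sensitive to outliers than euclidean distance, because the differences are not squared.
    * calculated by summing the absolute differences of their co-ordinates.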
  
  * Minkowski distance:
    * a generalized distance that includes manhattan and euclidean distance and is defined by a parameter p.
    * if p = 2 it is normal euclidean distance, p = 1 manhattan distance.
    * Chebyshev distance is the limit as p approaches infinity, which gives the maximum absolute co-ordinate difference.

In [None]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt

In [None]:
# Load the dataset; the cells below read a 'Name' column from it and treat the
# remaining non-object columns as the feature vector of each row.
data = pd.read_csv('compat.csv')

In [67]:
def get_cosine_similarity(data, targetId):
    """Print the row of ``data`` most similar to the target row by cosine similarity.

    Every row other than the target is compared against the target, and the
    'Name' of the best match is printed together with its similarity rounded
    to 4 decimals.

    Args:
        data: DataFrame with a 'Name' column and numeric feature columns.
            Index labels are assumed to be unique.
        targetId: Index label of the row to compare the others against.

    Raises:
        KeyError: If ``targetId`` is not an index label of ``data``.
        ValueError: If ``data`` has no row besides the target.
    """
    # keep only the numeric feature columns
    numdata = data.select_dtypes(include="number")

    # separate the target vector from the candidates; dropping the target by
    # label (rather than always skipping the first row) keeps it from being
    # reported as its own best match when targetId is not the first row
    targetrow = numdata.loc[targetId]
    others = numdata.drop(index=targetId)
    if others.empty:
        raise ValueError("need at least one row besides the target to compare against")

    # magnitude of each candidate multiplied by the target magnitude
    mag_data = np.sqrt((others ** 2).sum(axis=1)) * np.sqrt((targetrow ** 2).sum())

    # dot product of each candidate with the target
    dot_product = others.mul(targetrow).sum(axis=1)

    # cosine similarity = dot product / product of magnitudes
    results = dot_product.div(mag_data).round(4)

    # print the most similar to the target
    print(data.loc[results.idxmax()]['Name'], ": ", results.max())

# report the row most similar to the row labelled 0
get_cosine_similarity(data,0)

Ferrari :  0.9973


In [None]:
def get_euclidean_distance(data, target_id):
    """Print the row of ``data`` closest to the target row by Euclidean distance.

    Every row other than the target is compared against the target, and the
    'Name' of the nearest row is printed together with its distance rounded
    to 4 decimals.

    Args:
        data: DataFrame with a 'Name' column and numeric feature columns.
            Index labels are assumed to be unique.
        target_id: Index label of the row to compare the others against.

    Raises:
        KeyError: If ``target_id`` is not an index label of ``data``.
        ValueError: If ``data`` has no row besides the target.
    """
    # keep only the numeric feature columns
    numdata = data.select_dtypes(include="number")

    # separate the target from the candidates; dropping the target by label
    # (rather than always skipping the first row) keeps its zero distance to
    # itself from winning when target_id is not the first row
    target_row = numdata.loc[target_id]
    others = numdata.drop(index=target_id)
    if others.empty:
        raise ValueError("need at least one row besides the target to compare against")

    # straight-line distance between the target and every other point
    distance = np.sqrt(((others - target_row) ** 2).sum(axis=1)).round(4)

    # print results
    print(data.loc[distance.idxmin()]['Name'], ": ", distance.min())

# report the row nearest to the row labelled 0
get_euclidean_distance(data, 0)

In [None]:
def get_manhattan_distance(point1, point2):
    """Return the Manhattan distance between two points.

    Both points are indexed by label (e.g. a dict or a DataFrame row); only
    the six feature labels listed below are read from them.
    """
    features = ('Speed', 'Aggression', 'Adaptability', 'Technical Skill', 'Teamwork', 'Risk-taking')

    # total of the per-feature absolute differences
    return sum(abs(point1[feature] - point2[feature]) for feature in features)

# Manhattan distance from the row labelled 0 to every other row, keyed by name
results = {}

for row in range(1, len(data)):
    candidate = data.loc[row]
    results[candidate['Name']] = get_manhattan_distance(data.loc[0], candidate)

# report the name with the smallest distance
closest = min(results, key=results.get)
print(closest, ": " , results[closest])

In [None]:
def get_minkowski_distance(point1, point2, p, column_labels=None):
    """Return the Minkowski distance of order ``p`` between two points.

    ``p = 1`` gives the Manhattan distance, ``p = 2`` the Euclidean distance
    and ``p = float('inf')`` the Chebyshev distance (largest absolute
    co-ordinate difference).

    Args:
        point1: First point, indexed by label (e.g. a dict or a DataFrame row).
        point2: Second point, indexed the same way.
        p: Order of the distance; ``p = 0`` raises ``ZeroDivisionError``.
        column_labels: Labels of the co-ordinates to compare. Defaults to the
            six feature columns used throughout this notebook.

    Returns:
        The distance rounded to 2 decimals.
    """
    if column_labels is None:
        column_labels = ['Speed', 'Aggression', 'Adaptability', 'Technical Skill', 'Teamwork', 'Risk-taking']

    # absolute difference along each co-ordinate
    differences = [abs(point1[label] - point2[label]) for label in column_labels]

    # x ** inf followed by ** (1 / inf) collapses to 1.0 (or inf), so the
    # limiting case is computed directly as the largest difference
    if p == float('inf'):
        return round(max(differences, default=0), 2)

    return round(sum(difference ** p for difference in differences) ** (1/p), 2)

# Minkowski distance (p = 100) from the row labelled 0 to every other row, keyed by name
results = {}

for row in range(1, len(data)):
    candidate = data.loc[row]
    results[candidate['Name']] = get_minkowski_distance(data.loc[0], candidate, 100)

# report the name with the smallest distance
closest = min(results, key=results.get)
print(closest, ": " , results[closest])

In [74]:
from math import sqrt

class VectorSimilarity:
    """Find the row of a DataFrame that best matches a target row.

    The DataFrame must have a 'Name' column; its numeric columns are used as
    the feature vector of each row. Index labels are assumed to be unique.
    Supported metrics: 'cosine', 'euclidean', 'manhattan' and 'minkowski'.
    """

    def __init__(self, data):
        """Store ``data`` and build the metric dispatch table."""
        self.data = data

        # dispatch table mapping a metric name to the bound method computing it
        self.__metric_functions = {
            'euclidean': self.__get_euclidean_distance,
            'manhattan': self.__get_manhattan_distance,
            'minkowski': self.__get_minkowski_distance,
            'cosine': self.__get_cosine_similarity
        }

    def __split_target(self, target_id):
        """Return the numeric target vector and the numeric rows of all other entries."""
        # keep only the numeric feature columns
        numdata = self.data.select_dtypes(include="number")

        # drop the target by label (rather than always skipping the first row)
        # so it is never compared against itself
        others = numdata.drop(index=target_id)
        if others.empty:
            raise ValueError("need at least one row besides the target to compare against")
        return numdata.loc[target_id], others

    def __get_manhattan_distance(self, target_id):
        """Print the name and Manhattan distance of the row nearest to ``target_id``."""
        target_row, others = self.__split_target(target_id)

        # sum of absolute co-ordinate differences
        distance = (others - target_row).abs().sum(axis=1).round(4)

        # print results
        print(self.data.loc[distance.idxmin()]['Name'], ": ", distance.min())

    def __get_euclidean_distance(self, target_id):
        """Print the name and Euclidean distance of the row nearest to ``target_id``."""
        target_row, others = self.__split_target(target_id)

        # square root of the sum of squared co-ordinate differences
        distance = np.sqrt(((others - target_row) ** 2).sum(axis=1)).round(4)

        # print results
        print(self.data.loc[distance.idxmin()]['Name'], ": ", distance.min())

    def __get_minkowski_distance(self, target_id, p):
        """Print the name and order-``p`` Minkowski distance of the row nearest to ``target_id``."""
        target_row, others = self.__split_target(target_id)
        differences = (others - target_row).abs()

        if np.isinf(p):
            # limiting (Chebyshev) case: the power formula degenerates for an
            # infinite p, so take the largest co-ordinate difference directly
            distance = differences.max(axis=1)
        else:
            distance = (differences ** p).sum(axis=1) ** (1/p)
        distance = distance.round(4)

        # print results
        print(self.data.loc[distance.idxmin()]['Name'], ": ", distance.min())

    def __get_cosine_similarity(self, targetId):
        """Print the name and cosine similarity of the row most similar to ``targetId``."""
        # the target vector comes from self.data, not from a module-level variable
        targetrow, others = self.__split_target(targetId)

        # magnitude of each candidate multiplied by the target magnitude
        mag_data = np.sqrt((others ** 2).sum(axis=1)) * np.sqrt((targetrow ** 2).sum())

        # dot product of each candidate with the target
        dot_product = others.mul(targetrow).sum(axis=1)

        # cosine similarity = dot product / product of magnitudes
        results = dot_product.div(mag_data).round(4)

        # print the most similar to the target
        print(self.data.loc[results.idxmax()]['Name'], ": ", results.max())

    def similarity_metric(self, targetID, funct_type='cosine', p=3):
        """Print the best match for ``targetID`` under the chosen metric.

        Args:
            targetID: Index label of the row to compare the others against.
            funct_type: One of 'cosine', 'euclidean', 'manhattan', 'minkowski'.
            p: Order of the Minkowski distance; ignored by the other metrics.

        Raises:
            KeyError: If ``funct_type`` is not a supported metric, or
                ``targetID`` is not an index label of the data.
            ValueError: If the data has no row besides the target.
        """
        try:
            funct = self.__metric_functions[funct_type]
        except KeyError:
            supported = sorted(self.__metric_functions)
            raise KeyError(f"unknown metric {funct_type!r}; expected one of {supported}") from None

        # only the minkowski metric takes the extra order parameter
        if funct_type == 'minkowski':
            funct(targetID, p)
        else:
            funct(targetID)

In [75]:
# build the helper once, then query the row labelled 0 under each supported metric
metric = VectorSimilarity(data)

metric.similarity_metric(0, funct_type='cosine')
metric.similarity_metric(0, funct_type='euclidean')
metric.similarity_metric(0, funct_type='manhattan')
metric.similarity_metric(0, funct_type='minkowski', p=5)

Ferrari :  0.9973
Ferrari :  2.0
Ferrari :  4
Ferrari :  1.3195
