# Introduction to Clustering 

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sn

In [None]:
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Read the 'Customers' sheet of the Excel workbook hosted on GitHub.
customers_workbook_url = "https://raw.githubusercontent.com/manaranjanp/ISB_MLUL/main/distance/Distance_Datasets.xlsx"
custs_df = pd.read_excel(customers_workbook_url, sheet_name='Customers')

In [None]:
# Display the customers table that was just loaded.
custs_df

In [None]:
# Plot every customer as a point, Age on the x-axis and Income on the y-axis.
# The trailing ';' keeps the Axes repr out of the cell output.
sn.scatterplot(x='Age', y='Income', data=custs_df);

## Euclidean Distance

The distance between two customers, $customer_{1}$ and $customer_{2}$, is calculated as follows:

$\sqrt{(age_{1} - age_{2})^2 +  (income_{1} - income_{2})^2}$

In [None]:
from sklearn.metrics.pairwise import euclidean_distances

In [None]:
# Pairwise Euclidean distances between customers, computed on the raw
# (unscaled) Age and Income columns.
age_income_raw = custs_df[['Age', 'Income']]
euclidean_distances(age_income_raw)

#### Inference:

- Distance-wise, A and B are very different, whereas A and C are similar.
In reality, A and B are very similar, whereas A and C are very different because they differ hugely in age.

- This is because of the difference in the scales on which age and income are represented.

##  Min Max Scaling


In this technique, the minimum value of the feature is scaled to 0 and the maximum value is scaled to 1. All other values are scaled to a value between 0 and 1 based on their relative position to the minimum and maximum values.

$X_{norm} = \frac{X_{i} - X_{min}}{X_{max} - X_{min}}$

[Sklearn Source](https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.MinMaxScaler.html)

In [None]:
from sklearn.preprocessing import MinMaxScaler

In [None]:
# Rescale Age and Income with min-max scaling so both features contribute
# on a comparable scale.
age_income = custs_df[["Age", "Income"]]
scaler = MinMaxScaler()
scaled_custs_df = scaler.fit_transform(age_income)

# Preview the first ten scaled rows.
scaled_custs_df[:10]

In [None]:
# Pairwise Euclidean distances, recomputed on the min-max scaled features.
dist = euclidean_distances(scaled_custs_df)

In [None]:
# Show the distance matrix with rows and columns labelled by the customer index.
customer_labels = custs_df.index
pd.DataFrame(dist, index=customer_labels, columns=customer_labels)

## Cosine Similarity

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
# Note: this returns pairwise cosine *similarities* (not distances), computed
# on the raw Age and Income columns.
cosine_similarity(custs_df[['Age', 'Income']])

## Jaccard Distance for categorical features

In [None]:
# Read the "Movies" sheet of the same workbook, using the 'Users' column as the row index.
movies_workbook_url = "https://raw.githubusercontent.com/manaranjanp/ISB_MLUL/main/distance/Distance_Datasets.xlsx"
movies_df = pd.read_excel(movies_workbook_url, sheet_name="Movies", index_col='Users')

In [None]:
# Display the movies table (indexed by 'Users').
movies_df

In [None]:
from scipy.spatial.distance import jaccard

In [None]:
# Jaccard distance between the rows of User_1 and User_2.
user_1_row = movies_df.loc['User_1'].values
user_2_row = movies_df.loc['User_2'].values
jaccard(user_1_row, user_2_row)

In [None]:
from sklearn.metrics.pairwise import pairwise_distances 

In [None]:
# pairwise_distances yields Jaccard *distances* between every pair of rows;
# subtracting them from 1 turns them into similarities.
jaccard_distance = pairwise_distances(movies_df.values, metric="jaccard")
jaccard_similarity = 1 - jaccard_distance

In [None]:
# Display the raw (unlabelled) Jaccard similarity matrix.
jaccard_similarity

In [None]:
# Show the similarity matrix with rows and columns labelled by the movies_df index.
user_labels = movies_df.index
pd.DataFrame(jaccard_similarity, index=user_labels, columns=user_labels)

## Haversine

In [None]:
# Read the "Cities" sheet of the same workbook, using the 'City' column as the row index.
cities_workbook_url = "https://raw.githubusercontent.com/manaranjanp/ISB_MLUL/main/distance/Distance_Datasets.xlsx"
cities_df = pd.read_excel(cities_workbook_url, sheet_name="Cities", index_col='City')

In [None]:
# Display the cities table (indexed by 'City').
cities_df

In [None]:
from math import radians

In [None]:
# Add radian versions of the Latitude and Longitude columns (degrees -> radians);
# the haversine computation further down reads these two new columns.
# np.radians converts each whole column in one vectorised call instead of
# invoking a Python function once per row.
cities_df['lat_radians'] = np.radians(cities_df['Latitude'])
cities_df['long_radians'] = np.radians(cities_df['Longitude'])

In [None]:
# Display the cities table again, now including the lat_radians / long_radians columns.
cities_df

In [None]:
from sklearn.metrics.pairwise import haversine_distances

In [None]:
# Pairwise haversine distances between cities, from (latitude, longitude) in radians.
# NOTE(review): per the scikit-learn docs the result is a distance on a unit
# sphere; it is scaled by the Earth's radius in a later cell.
aerial_distance = haversine_distances(cities_df[['lat_radians', 'long_radians']])

In [None]:
# Display the unscaled haversine distance matrix.
aerial_distance

In [None]:
# Scale the haversine distances to kilometres: multiply by the Earth's radius
# in metres, then convert metres to kilometres. The constants are named so the
# arithmetic explains itself; evaluation order matches the original expression.
EARTH_RADIUS_M = 6371000  # approximate mean radius of the Earth, in metres
METRES_PER_KM = 1000
aerial_distance_kms = aerial_distance * EARTH_RADIUS_M / METRES_PER_KM
aerial_distance_kms

In [None]:
# Show the kilometre distances with rows and columns labelled by city.
city_labels = cities_df.index
pd.DataFrame(aerial_distance_kms, index=city_labels, columns=city_labels)