In [1]:
import pandas as pd 
import numpy as np 
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.cluster import DBSCAN 

In [2]:
# Load the wholesale-customers table from a CSV file; the relative path is
# resolved against the notebook's working directory.
df = pd.read_csv('Wholesale customers data.csv')

In [3]:
# Size of the loaded table as (rows, columns).
df.shape

(440, 8)

In [4]:
# Preview the first rows to check the column names and the magnitude of the values.
df.head()

Unnamed: 0,Channel,Region,Fresh,Milk,Grocery,Frozen,Detergents_Paper,Delicassen
0,2,3,12669,9656,7561,214,2674,1338
1,2,3,7057,9810,9568,1762,3293,1776
2,2,3,6353,8808,7684,2405,3516,7844
3,1,3,13265,1196,4221,6404,507,1788
4,2,3,22615,5410,7198,3915,1777,5185


In [5]:
# Count the missing entries in every column (isna is the canonical spelling
# of the isnull alias).
df.isna().sum()

Channel             0
Region              0
Fresh               0
Milk                0
Grocery             0
Frozen              0
Detergents_Paper    0
Delicassen          0
dtype: int64

In [6]:
# Density-based clustering of the raw table (every column, no scaling).
# eps is the neighbourhood radius in the data's own units; min_samples=2 means
# a row with a single neighbour inside that radius already seeds a cluster.
model = DBSCAN(min_samples=2, eps=1000)
model.fit(df)
# fit() returns the estimator itself, so despite its name this variable is the
# fitted model; the per-row cluster ids are read from its .labels_ attribute.
DBSCAN_labels = model

In [7]:
# Cluster id assigned to each row by the most recent fit; DBSCAN labels
# noise points (rows belonging to no cluster) with -1.
DBSCAN_labels.labels_

array([-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
       -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
        0, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
       -1, -1, -1, -1, -1, -1, -1,  1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
       -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,  2, -1, -1,  3, -1,
       -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,  4,  4, -1, -1, -1,
       -1, -1, -1, -1, -1, -1, -1, -1,  5, -1, -1, -1,  6, -1, -1, -1, -1,
       -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,  7, -1,
       -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,  7, -1, -1, -1, -1, -1,  8,
       -1,  4, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,  9, -1,
       -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
       -1, -1, -1, -1, -1, -1, -1,  2, -1, -1, -1, -1, 10, -1, -1, -1, 11,
       -1, -1,  9, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,  6, -1, -1, -1,
       -1, -1, -1, -1, -1

## Apply PCA

In [8]:
# Number of leading principal components to keep when projecting the data.
num_components = 3

In [9]:
# Centre every column on zero by subtracting its own mean; PCA works on the
# covariance of mean-centred data.
column_means = df.mean(axis=0)
X_meaned = df.sub(column_means, axis="columns")

In [10]:
# Covariance matrix of the centred data. rowvar=False tells numpy that each
# column is a variable and each row an observation.
cov_mat = np.cov(X_meaned , rowvar = False)

In [11]:
# Eigen-decomposition of the covariance matrix. eigh is the routine for
# symmetric matrices; it returns the eigenvalues in ascending order with the
# matching eigenvectors as columns.
eigen_values , eigen_vectors = np.linalg.eigh(cov_mat)

In [12]:
# Order the eigenpairs from largest to smallest eigenvalue so that the
# leading columns are the directions of greatest variance.
sorted_index = np.flip(np.argsort(eigen_values))

# Apply the same permutation to the eigenvalues and to the eigenvector
# columns so that each pair stays aligned.
sorted_eigenvalue = eigen_values[sorted_index]
sorted_eigenvectors = eigen_vectors[:, sorted_index]

In [13]:
# Keep only the first num_components columns, i.e. the eigenvectors with the
# largest eigenvalues.
eigenvector_subset = sorted_eigenvectors[:, :num_components]

In [14]:
# Project the centred samples onto the retained eigenvectors:
# (components x features) . (features x samples), transposed back so that
# each row is a sample and each column a principal component.
X_reduced = np.dot(eigenvector_subset.T, X_meaned.T).T


In [15]:
# After the projection each row has exactly num_components features,
# one per retained principal component.
X_reduced.shape

(440, 3)

In [16]:
# Inspect the projected coordinates (one row per sample, one column per
# retained component).
X_reduced

array([[   650.02211956,   1585.51910693,    -95.39065399],
       [ -4426.80498362,   4042.45152053,   1534.8047351 ],
       [ -4841.99870994,   2578.76218764,   3801.38478039],
       ...,
       [  4555.11497765,  26201.75861611,  -5887.43291762],
       [ -2734.37091187,  -7070.77534304,   -790.70302385],
       [-10370.12530647,  -6161.4649232 ,  -1017.14237907]])

In [17]:
# Repeat the clustering with the same parameters, this time on the
# PCA-projected data instead of the raw table.
# NOTE: this rebinds `model` and `DBSCAN_labels`, so the fit on the raw table
# is no longer reachable through those names once this cell has run.
model = DBSCAN(min_samples=2, eps=1000)
model.fit(X_reduced)
# fit() returns the estimator itself, so this name refers to the fitted model;
# the per-row cluster ids are read from its .labels_ attribute.
DBSCAN_labels = model

In [18]:
# Cluster id per row for the fit on the PCA-projected data; -1 marks rows that
# DBSCAN treats as noise.
DBSCAN_labels.labels_

array([-1, -1, -1, -1, -1, -1, -1,  0, -1, -1, -1, -1, -1, -1, -1, -1,  1,
       -1, -1,  2, -1,  3, -1, -1, -1,  4,  3,  5, -1, -1, -1,  6, -1, -1,
        7, -1, -1, -1, -1, -1,  8, -1, -1,  9, -1, -1, -1, -1, -1, -1, -1,
       -1, -1, -1, -1, -1, -1, -1, 10, -1,  2, -1, -1, -1, -1, -1, -1, -1,
       -1,  3, -1, -1, -1, -1,  0, -1, -1, -1,  3, -1,  3, 11, -1, 12, 13,
       -1, -1, -1, 14, -1, 15, -1, -1, -1, -1, -1, -1, 16, 16, -1, -1, -1,
       17, -1, -1,  5, -1, -1, -1, -1, 15, -1, -1, -1, 18,  3, 19, -1, -1,
       20, 21, 22, 23, -1, -1, -1, 24, -1, -1, -1, -1, -1,  5, 25,  3,  3,
       -1, 17, -1, -1, -1, -1, -1, -1, 18, -1,  3, 26, -1, -1, -1,  6, 18,
       -1, 16, -1, 27, 18, -1, -1, -1, 23, 28, -1, -1, -1, -1, 29,  3,  3,
       -1, -1, 30, -1, 31, 32, -1, -1, -1, -1, -1, -1,  1, -1, -1, -1, -1,
       -1, -1, -1, -1, 10,  3, -1,  3, -1, -1, -1, 13,  3, -1, -1, -1,  7,
       29,  9,  3, -1, -1, -1, 18, -1,  3, 17, 32, -1, -1, 18, -1, 33, 34,
       -1, -1,  6,  3, -1