In [11]:
import pandas as pd
from sklearn import datasets

data = datasets.load_wine()

X = pd.DataFrame(data["data"], columns=data["feature_names"])
#y = pd.Series(data["target"])

# Comment the different variables of this dataset
# Notice how they are all numerical - KMeans cannot deal with categorical variables
X.head()

# The scale of "proline" is much higher than the scale of many other variables!
# K-Means is a distance based algorithm: we need to scale / normalize:
from sklearn.preprocessing import StandardScaler
X_prep = StandardScaler().fit_transform(X)

# Now, all features will have the same weight.
pd.DataFrame(X_prep).head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12
0,1.518613,-0.56225,0.232053,-1.169593,1.913905,0.808997,1.034819,-0.659563,1.224884,0.251717,0.362177,1.84792,1.013009
1,0.24629,-0.499413,-0.827996,-2.490847,0.018145,0.568648,0.733629,-0.820719,-0.544721,-0.293321,0.406051,1.113449,0.965242
2,0.196879,0.021231,1.109334,-0.268738,0.088358,0.808997,1.215533,-0.498407,2.135968,0.26902,0.318304,0.788587,1.395148
3,1.69155,-0.346811,0.487926,-0.809251,0.930918,2.491446,1.466525,-0.981875,1.032155,1.186068,-0.427544,1.184071,2.334574
4,0.2957,0.227694,1.840403,0.451946,1.281985,0.808997,0.663351,0.226796,0.401404,-0.319276,0.362177,0.449601,-0.037874


In [12]:
X.head()

Unnamed: 0,alcohol,malic_acid,ash,alcalinity_of_ash,magnesium,total_phenols,flavanoids,nonflavanoid_phenols,proanthocyanins,color_intensity,hue,od280/od315_of_diluted_wines,proline
0,14.23,1.71,2.43,15.6,127.0,2.8,3.06,0.28,2.29,5.64,1.04,3.92,1065.0
1,13.2,1.78,2.14,11.2,100.0,2.65,2.76,0.26,1.28,4.38,1.05,3.4,1050.0
2,13.16,2.36,2.67,18.6,101.0,2.8,3.24,0.3,2.81,5.68,1.03,3.17,1185.0
3,14.37,1.95,2.5,16.8,113.0,3.85,3.49,0.24,2.18,7.8,0.86,3.45,1480.0
4,13.24,2.59,2.87,21.0,118.0,2.8,2.69,0.39,1.82,4.32,1.04,2.93,735.0


In [14]:
# Clustering:
from sklearn.cluster import KMeans

kmeans = KMeans(n_clusters=4, random_state=1234)
kmeans.fit(X_prep)

KMeans(n_clusters=3, random_state=1234)

In [15]:
kmeans.predict(X_prep)

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 1, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 0, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1], dtype=int32)

In [16]:
# Predicting / assigning the clusters:
clusters = kmeans.predict(X_prep)

# Check the size of the clusters
pd.Series(clusters).value_counts().sort_index()

# Explore the cluster assignment in the original dataset
X_df = pd.DataFrame(X)
X_df["cluster"] = clusters
X_df.head()

Unnamed: 0,alcohol,malic_acid,ash,alcalinity_of_ash,magnesium,total_phenols,flavanoids,nonflavanoid_phenols,proanthocyanins,color_intensity,hue,od280/od315_of_diluted_wines,proline,cluster
0,14.23,1.71,2.43,15.6,127.0,2.8,3.06,0.28,2.29,5.64,1.04,3.92,1065.0,0
1,13.2,1.78,2.14,11.2,100.0,2.65,2.76,0.26,1.28,4.38,1.05,3.4,1050.0,0
2,13.16,2.36,2.67,18.6,101.0,2.8,3.24,0.3,2.81,5.68,1.03,3.17,1185.0,0
3,14.37,1.95,2.5,16.8,113.0,3.85,3.49,0.24,2.18,7.8,0.86,3.45,1480.0,0
4,13.24,2.59,2.87,21.0,118.0,2.8,2.69,0.39,1.82,4.32,1.04,2.93,735.0,0


In [17]:
cluster_0 = X_df[X_df['cluster'] == 0]
cluster_0

Unnamed: 0,alcohol,malic_acid,ash,alcalinity_of_ash,magnesium,total_phenols,flavanoids,nonflavanoid_phenols,proanthocyanins,color_intensity,hue,od280/od315_of_diluted_wines,proline,cluster
0,14.23,1.71,2.43,15.6,127.0,2.80,3.06,0.28,2.29,5.64,1.04,3.92,1065.0,0
1,13.20,1.78,2.14,11.2,100.0,2.65,2.76,0.26,1.28,4.38,1.05,3.40,1050.0,0
2,13.16,2.36,2.67,18.6,101.0,2.80,3.24,0.30,2.81,5.68,1.03,3.17,1185.0,0
3,14.37,1.95,2.50,16.8,113.0,3.85,3.49,0.24,2.18,7.80,0.86,3.45,1480.0,0
4,13.24,2.59,2.87,21.0,118.0,2.80,2.69,0.39,1.82,4.32,1.04,2.93,735.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
57,13.29,1.97,2.68,16.8,102.0,3.00,3.23,0.31,1.66,6.00,1.07,2.84,1270.0,0
58,13.72,1.43,2.50,16.7,108.0,3.40,3.67,0.19,2.04,6.80,0.89,2.87,1285.0,0
73,12.99,1.67,2.60,30.0,139.0,3.30,2.89,0.21,1.96,3.35,1.31,3.50,985.0,0
95,12.47,1.52,2.20,19.0,162.0,2.50,2.27,0.32,3.28,2.60,1.16,2.63,937.0,0


In [None]:
np.mean(cluster_0['alcohol'])

In [None]:
np.mean(cluster_0['malic_acid'])

In [18]:
np.mean(cluster_0)

alcohol                           13.676774
malic_acid                         1.997903
ash                                2.466290
alcalinity_of_ash                 17.462903
magnesium                        107.967742
total_phenols                      2.847581
flavanoids                         3.003226
nonflavanoid_phenols               0.292097
proanthocyanins                    1.922097
color_intensity                    5.453548
hue                                1.065484
od280/od315_of_diluted_wines       3.163387
proline                         1100.225806
cluster                            0.000000
dtype: float64

In [None]:
cluster_1 = X_df[X_df['cluster'] == 1]
cluster_1

In [None]:
np.mean(cluster_1)

In [None]:
cluster_2 = X_df[X_df['cluster'] == 2]
cluster_2

In [None]:
np.mean(cluster_2)

In [None]:
song = {alcohol: 14, malic_acid: 2, ash: 12}

In [None]:
d1 = distance of the song with cluster 0

In [None]:
d2 = distance of the song with cluster 1

In [None]:
d3 = distance of the song with cluster 2

In [None]:
import math

In [None]:
d1 = math.sqrt((14-13.67677)**2 + (2-1.997903)**2. + (12-2.466290)**2)

In [None]:
d1

In [None]:
d2 = math.sqrt((14-13.134118)**2 + (2-3.307255)**2. + (12-2.417647)**2)

In [None]:
d2

In [None]:
d3 = math.sqrt((14-12.250923)**2 + (2-1.897385)**2. + (12-2.231231)**2)
d3

In [None]:
clusters

In [None]:
kmeans.inertia_ 

In [None]:
for i in range(4):
    print(i)

In [None]:
import numpy as np
K = range(2, 20)
inertia = []

for k in K:
    kmeans = KMeans(n_clusters=k,
                    random_state=1234)
    kmeans.fit(X_prep)
    inertia.append(kmeans.inertia_)

import matplotlib.pyplot as plt
%matplotlib inline

plt.figure(figsize=(16,8))
plt.plot(K, inertia, marker = '*')
plt.xlabel('k')
plt.ylabel('inertia')
plt.xticks(np.arange(min(K), max(K)+1, 1.0))
plt.title('Elbow Method showing the optimal k')

In [None]:
from sklearn.metrics import silhouette_score
K = range(2, 20)
silhouette = []

for k in K:
    kmeans = KMeans(n_clusters=k,
                    random_state=1234)
    kmeans.fit(X_prep)
    silhouette.append(silhouette_score(X_prep, kmeans.predict(X_prep)))


plt.figure(figsize=(16,8))
plt.plot(K, silhouette, 'bx-')
plt.xlabel('k')
plt.ylabel('silhouette score')
plt.xticks(np.arange(min(K), max(K)+1, 1.0))
plt.title('Elbow Method showing the optimal k')

In [1]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
import numpy as np

In [2]:
data = pd.read_csv('/Users/himanshuaggarwal/Downloads/tracks.csv')

In [3]:
data = data.select_dtypes(np.number)

In [4]:
data.head()

Unnamed: 0,popularity,duration_ms,explicit,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature
0,6,126903,0,0.645,0.445,0,-13.338,1,0.451,0.674,0.744,0.151,0.127,104.851,3
1,0,98200,0,0.695,0.263,0,-22.136,1,0.957,0.797,0.0,0.148,0.655,102.009,1
2,0,181640,0,0.434,0.177,1,-21.18,1,0.0512,0.994,0.0218,0.212,0.457,130.418,5
3,0,176907,0,0.321,0.0946,7,-27.961,1,0.0504,0.995,0.918,0.104,0.397,169.98,3
4,0,163080,0,0.402,0.158,3,-16.9,0,0.039,0.989,0.13,0.311,0.196,103.22,4


In [5]:
X_scaled = StandardScaler().fit_transform(data)

In [6]:
from sklearn.cluster import N

dbscan = DBSCAN(eps=1, min_samples=3, n_jobs=-1).fit(X_scaled)

In [7]:
dbscan.labels_[:100]

array([-1,  0, -1,  1,  2,  3, -1,  4,  2,  4,  5,  6, -1,  3,  3, -1, -1,
       -1,  3, -1,  4,  6,  4,  0, -1,  4,  4, -1,  2,  6,  2,  2,  4, -1,
       -1,  7,  8,  3,  2, -1,  7, -1,  2,  4,  9,  3, -1, -1, -1, -1,  7,
       -1,  4,  3, -1, -1,  3, -1, 38,  4, -1,  7, -1, -1,  9, 10, -1,  6,
        7,  6, -1, -1, -1,  4,  7,  7, 11, 44,  2,  3, -1, -1,  6,  7, -1,
        2, 11, 12, -1,  2,  3,  4, -1,  4,  5,  3, 13,  5,  3,  6])

In [8]:
pd.Series(dbscan.labels_).value_counts()

 3       252176
-1       149101
 2       117044
 4        20637
 980       5841
          ...  
 4310         3
 2773         3
 4822         3
 1236         3
 2047         3
Length: 4897, dtype: int64

In [9]:
len(dbscan.labels_)

586672

https://stackoverflow.com/questions/45313176/what-are-noisy-samples-in-scikits-dbscan-clustering-algorithm