https://scikit-learn.org/stable/modules/generated/sklearn.cluster.KMeans.html


https://scikit-learn.org/stable/modules/clustering.html

# libraries

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt
from scipy.cluster.hierarchy import dendrogram

from sklearn import datasets
from sklearn.cluster import AgglomerativeClustering
from sklearn.cluster import KMeans
from sklearn.datasets import load_iris
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

# wine dataset

Import the wine dataset and create a DataFrame with column names.

In [None]:

# load the wine dataset bundled with scikit-learn and display the returned object
wine = datasets.load_wine()
wine

## numpy array of wine data

In [None]:
# pull the raw feature values out of the dataset object
data = wine['data']

data

## convert to dataframe for ease of exploration

In [None]:

# wrap the raw feature array in a DataFrame, labelling each column with its feature name
feature_names = wine['feature_names']
df_wine = pd.DataFrame(data=data, columns=feature_names)
df_wine

In [None]:
# summary statistics of the unscaled features
df_wine.describe()

## standardize using standard scaler

In [None]:
# Standardize every feature with StandardScaler.
# Only the original feature columns are selected, so this cell can be re-run after
# the 'cultivar' column has been added to df_wine without a column-count mismatch.
feature_columns = list(wine['feature_names'])
scaled_values = StandardScaler().fit_transform(df_wine[feature_columns])
df_wine_scaled = pd.DataFrame(scaled_values, columns=feature_columns) # convert back to pandas from numpy
df_wine_scaled.head()

In [None]:
# summary statistics of the scaled features
df_wine_scaled.describe()

add in the cultivar for our data

In [None]:


# attach the cultivar label to both the raw and the scaled frame
for frame in (df_wine, df_wine_scaled):
  frame['cultivar'] = wine.target
df_wine

## wine target

if we were doing supervised learning these are the labels/target that we would be predicting. notice this is NOT a binary classification problem. we have 3 total classes (wine cultivars)

In [None]:
# the class label (cultivar) of every wine
wine['target']

# plot incremental increase in cluster numbers and results of that cluster by alcalinity and alcohol

In [None]:
# list the column names available for plotting
df_wine_scaled.columns.to_list()

In [None]:


# Experiment with different numbers of clusters (k = 1..10) and plot the results.
# The feature matrix is built once, outside the loop, and excludes both the
# 'cultivar' label and any 'cluster' column left by a previous iteration or run,
# so earlier cluster assignments never leak into the next KMeans fit.
kmeans_features = df_wine_scaled.drop(columns=['cultivar', 'cluster'], errors='ignore')

for i in range(1, 11):
  # n_init and random_state are pinned so re-running the cell gives the same clusters
  cluster = KMeans(n_clusters=i, n_init=10, random_state=42).fit_predict(kmeans_features)
  # the labels are written back to df_wine_scaled because the plots below read them
  df_wine_scaled['cluster'] = cluster
  print(f'cluster size {i}')
  sns.lmplot(x='alcohol', y='alcalinity_of_ash', data=df_wine_scaled, hue='cluster', fit_reg=False)

  plt.title(f"wines by alcohol and alcalinity of ash. {i} clusters highlight = cluster")
  plt.show()
  # one boxplot per column, grouped by cluster; 'cluster' itself is skipped
  # because plotting the grouping column against itself carries no information
  for column in df_wine_scaled.columns.drop('cluster'):
    df_wine_scaled.boxplot(column, by='cluster')
    plt.show()

## inertia

One strategy for finding the 'optimal' number of clusters is the elbow method, which uses the within-cluster sum of squared distances (inertia). Ideally the sum of squares drops significantly as clusters are added, up to some number of clusters n; beyond that point, creating more clusters reduces the sum only marginally, so n is taken as the optimal number.

In the example below this is probably around 5-7  

https://scikit-learn.org/stable/modules/generated/sklearn.cluster.KMeans.html

In [None]:
# preview the cluster counts that range(1,15) produces
list(range(1,15))

In [None]:

# Elbow method: fit KMeans for k = 1..14 and record the inertia
# (within-cluster sum of squares) of each fit.
# The models are fitted on the features only: 'cultivar' is the label, and
# 'cluster' (if present) holds assignments written by earlier cells.
elbow_features = df_wine_scaled.drop(columns=['cultivar', 'cluster'], errors='ignore')
cluster_sizes = list(range(1, 15))

inertia_lst = []
model_lst = []

for i in cluster_sizes:
  # n_init and random_state are pinned so the curve is reproducible
  model = KMeans(n_clusters=i, n_init=10, random_state=42).fit(elbow_features)
  inertia_lst.append(model.inertia_)
  model_lst.append(model)


# plot against the actual cluster counts so the x-axis starts at 1, not at list index 0
plt.plot(cluster_sizes, inertia_lst)
plt.title('elbow method to discern optimal cluster size')
plt.xlabel('cluster size')
plt.ylabel('sum of squares')

plt.show()

In [None]:
# number of wines per cultivar
df_wine_scaled.cultivar.value_counts()

## explore cultivar and cluster creation

here we see what the actual cultivars are for our data. notice there is overlap in alcohol content and alcalinity across cultivars, but there are patterns in the data that immediately are apparent.

cultivar 0 appears to have more alcohol, and also less alcalinity than the other cultivars, but not in every case.





#### cultivar

In [None]:
# scatter of scaled alcohol vs. alcalinity of ash, coloured by the true cultivar
# (fit_reg=False turns off the regression line, leaving only the points)
sns.lmplot(x='alcohol', y='alcalinity_of_ash', data=df_wine_scaled, hue='cultivar', fit_reg=False)
plt.title("wines by alcohol and alcalinity of ash. highlight = cultivar")
plt.show()

In [None]:
# Fit the 3-cluster model used in the rest of this section; n_init and random_state
# are pinned so the cluster numbering is reproducible across kernel restarts.
# NOTE(review): only 'cultivar' is dropped, so a 'cluster' column left over from the
# exploration loop earlier in the notebook is still part of the features here. The
# predict cell and the centroid table that follow rely on that column layout —
# change them together if this is cleaned up.
kmeans_3 = KMeans(3, n_init=10, random_state=42).fit(df_wine_scaled.drop('cultivar',axis=1))

In [None]:
# label every wine with its cluster from the fitted 3-cluster model;
# the cultivar target is dropped so it is never used by kmeans
features_for_predict = df_wine_scaled.drop(columns='cultivar')
cluster = kmeans_3.predict(features_for_predict)
df_wine_scaled['cluster'] = cluster

show the cluster center coordinates for a 3 cluster KMeans.

In [None]:
# table of cluster centres: one row per centroid, one column per model input
centroid_columns = df_wine_scaled.drop(columns='cultivar').columns
kmm_3_df = pd.DataFrame(data=kmeans_3.cluster_centers_, columns=centroid_columns)
# mark these rows as centroids so they get their own colour when plotted with the wines
kmm_3_df['cluster'] = 'centroid'
kmm_3_df

In [None]:


# Plot the centroids together with the wines. The two frames are stacked with
# pd.concat because DataFrame.append was removed in pandas 2.0; centroid rows
# carry cluster == 'centroid', so hue='cluster' draws them in their own colour.
wines_and_centroids = pd.concat([kmm_3_df, df_wine_scaled])
sns.lmplot(x='alcohol', y='alcalinity_of_ash', data=wines_and_centroids, hue='cluster', fit_reg=False)
plt.title("wines by alcohol and alcalinity of ash. highlight = cluster")
plt.show()

In [None]:
# same scatter as for the cultivars, but coloured by the cluster assigned by the 3-cluster KMeans
sns.lmplot(x='alcohol', y='alcalinity_of_ash', data=df_wine_scaled, hue='cluster', fit_reg=False)
plt.title("wines by alcohol and alcalinity of ash. highlight = cluster")
plt.show()

In [None]:
# number of wines per discovered cluster
df_wine_scaled.cluster.value_counts()

In [None]:
# number of wines per cultivar, for comparison with the cluster sizes
df_wine_scaled.cultivar.value_counts()

In [None]:
# contingency table: rows are clusters, columns are cultivars, cells are wine counts
pd.crosstab(df_wine_scaled['cluster'], df_wine_scaled['cultivar'])

# boxplot alcohol

here we show a boxplot of alcohol content by both the original label of cultivar and the discovered cluster.

do these plots look similar to you? how could it be that the kmeans algorithm has somehow found a cluster separation that appears to match the cultivar?

In [None]:
# distribution of scaled alcohol content within each cultivar
df_wine_scaled.boxplot('alcohol',by='cultivar')
plt.show()

In [None]:
# distribution of scaled alcohol content within each discovered cluster
df_wine_scaled.boxplot('alcohol',by='cluster')
plt.show()

# PCA

Use principal component analysis to reduce the number of dimensions (columns) of our data and allow us to more easily visualize our clusters by the new columns which are now principal components

First make a PCA object with 15 principal components — one per column of our current dataset, which has 15 columns at this point because it also includes the `cultivar` and `cluster` columns added earlier.

fit to our data to generate principal components

In [None]:
# Fit a PCA that keeps as many components as df_wine_scaled has columns, so the
# components together explain all of the variance. The count is read from the frame
# rather than hard-coded so the cell still runs if the number of columns changes.
# NOTE(review): the frame passed here still contains the 'cultivar' and 'cluster'
# columns, so they are decomposed together with the features — confirm this is intended.
pca = PCA(n_components=df_wine_scaled.shape[1], svd_solver='full')
pca.fit(df_wine_scaled)

Show what proportion of the variance in the data each component explains.

In [None]:
# fraction of the total variance explained by each principal component
print(pca.explained_variance_ratio_)

cumulative sum the PCA components so that we can show how many principal components are needed to explain what proportion of variance in the data

The way to read this output is that the first component explains 45% of the variance, including the second adds 22%, which brings us to 68% of the variance, and so on until all 15 components are used and we can explain all the variance, since we started with 15 original columns.


In [None]:
# running total of explained variance as components are added
np.cumsum(pca.explained_variance_ratio_)

by the 5th Principal component we have explained about 80% of the variance in our data

In [None]:
# total of the explained-variance ratios across all fitted components
sum(pca.explained_variance_ratio_)

note: 0 on this plot is actually the FIRST principal component

In [None]:
# Loadings table: each row is a principal component and each column is an input
# column of df_wine_scaled (the frame the PCA was fitted on). The PC labels are
# placed on the rows; the columns keep the input column names.
pca_comp = pd.DataFrame(
    pca.components_,
    columns=df_wine_scaled.columns,
    index=[f'PC{k}' for k in range(1, pca.n_components_ + 1)],
)
pca_comp

In [None]:
# line plot of the running total of explained variance; x is the 0-based component index
cumulative_variance = np.cumsum(pca.explained_variance_ratio_)
plt.plot(cumulative_variance)
plt.title('cumulative sum of variance explained by principal components')
plt.show()

In [None]:
# (rows, columns) of the scaled frame, including the added label/cluster columns
df_wine_scaled.shape

In [None]:
# preview the 0-based indices used to name the six principal-component columns
list(range(0,6))

In [None]:
# Project the data onto its first six principal components, then cluster in that space.
# NOTE(review): only 'cluster' is dropped, so the 'cultivar' label is still among the
# PCA inputs; the loadings table built from pca_six assumes this column layout —
# confirm this is intended.
pca_six = PCA(6)
pca_six_scores = pca_six.fit_transform(df_wine_scaled.drop('cluster',axis=1))

# columns are named PC0..PC5, i.e. PC0 is the FIRST principal component
df_wine_scaled_pca = pd.DataFrame(pca_six_scores, columns=[f'PC{k}' for k in range(6)])

# n_init and random_state are pinned so the cluster labels are reproducible
df_wine_scaled_pca['cluster'] = KMeans(3, n_init=10, random_state=42).fit_predict(df_wine_scaled_pca)

In [None]:
# display the principal-component scores together with their cluster labels
df_wine_scaled_pca#.head()

notice here how good of separation we have in our data when viewed through the lens of the first two principal components.

In [None]:
# columns are 0-based (PC0 is the first component), while the title counts from 1
sns.lmplot(x="PC0", y="PC1", data=df_wine_scaled_pca, hue='cluster', fit_reg=False)
plt.title("clusters by Principal Components 1 and 2")
plt.show()

In [None]:
# columns are 0-based (PC1 is the second component), while the title counts from 1
sns.lmplot(x="PC1", y="PC2", data=df_wine_scaled_pca, hue='cluster', fit_reg=False)
plt.title("clusters by Principal Components 2 and 3")
plt.show()

examine the principal component shape
in this case the rows are not observations, but are INSTEAD principal components. meaning row 0 is the FIRST principal component

In [None]:
# (number of components, number of input columns)
pca_six.components_.shape

In [None]:
# loadings of the six fitted components; the columns are the inputs the PCA was
# fitted on (every column of df_wine_scaled except 'cluster')
pca_input_columns = df_wine_scaled.drop(columns='cluster').columns
pca_six_df = pd.DataFrame(data=pca_six.components_, columns=pca_input_columns)
pca_six_df

In [None]:
# flip the loadings table so input columns are rows and components are columns,
# then label the component columns '1'..'6' (as strings)
pca_six_df_tp = pca_six_df.T
pca_six_df_tp.columns = [str(component) for component in range(1, 7)]
pca_six_df_tp

In [None]:
# confirm the component column labels
pca_six_df_tp.columns

In [None]:
# Rank the inputs by the magnitude of their loading on the first principal component
# (column '1'). A loading's sign only gives direction, so sorting on the signed value
# would push strongly negative, but equally influential, inputs to the bottom.
pca_six_df_tp.sort_values('1', key=lambda loadings: loadings.abs(), ascending=False)

# the top row is the most important feature in Principal Component 1
# (noted as flavanoids when this notebook was written — re-check against the output)


In [None]:
# copy the notebook file from the Drive folder into the current working directory
# NOTE(review): the path assumes a Colab session with Google Drive mounted at /content/drive — confirm
!cp "/content/drive/My Drive/Colab Notebooks/4482_KMeans_clustering.ipynb" ./

# run the second shell command, jupyter nbconvert --to html "file name of the notebook"
# create html from ipynb (operates on the copy made by the cp command above)

!jupyter nbconvert --to html "4482_KMeans_clustering.ipynb"