In [2]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
import hvplot.pandas
from path import Path

In [3]:
# Read the iris measurements from the CSV stored in the Resources folder
filePath = Path("../Resources/newIrisData.csv")
irisDF = pd.read_csv(filepath_or_buffer=filePath)
# Preview the first five rows
irisDF.head(n=5)

Unnamed: 0,sepal_length,petal_length,sepal_width,petal_width
0,5.1,1.4,3.5,0.2
1,4.9,1.4,3.0,0.2
2,4.7,1.3,3.2,0.2
3,4.6,1.5,3.1,0.2
4,5.0,1.4,3.6,0.2


In [4]:
# Fit a StandardScaler on the iris features, then apply it to the same frame
irisScaled = StandardScaler().fit(irisDF).transform(irisDF)
# Show the first five scaled rows
print(irisScaled[:5])

[[-0.90068117 -1.3412724   1.03205722 -1.31297673]
 [-1.14301691 -1.3412724  -0.1249576  -1.31297673]
 [-1.38535265 -1.39813811  0.33784833 -1.31297673]
 [-1.50652052 -1.2844067   0.10644536 -1.31297673]
 [-1.02184904 -1.3412724   1.26346019 -1.31297673]]


In [9]:
# Set up a PCA model that keeps the first two principal components
pca = PCA(n_components=2)
# Fit the model on the standardized features and project them onto those two components
irisPca = pca.fit_transform(irisScaled)

In [8]:
# Wrap the PCA output in a DataFrame with one named column per component
irisPcaDf = pd.DataFrame(
    irisPca,
    columns=["principal component 1", "principal component 2"],
)
# Preview the first rows
irisPcaDf.head()

Unnamed: 0,principal component 1,principal component 2
0,-2.264542,0.505704
1,-2.086426,-0.655405
2,-2.36795,-0.318477
3,-2.304197,-0.575368
4,-2.388777,0.674767


In [10]:
# Fraction of the total variance explained by each of the fitted principal components
pca.explained_variance_ratio_

array([0.72770452, 0.23030523])

The first principal component explains 72.77% of the variance and the second explains 23.03%. Together they retain about 95.8% of the information in the original features.

In [11]:
#find best value for K
# Cluster only on the two principal-component columns. A later cell adds a
# "class" column to irisPcaDf; selecting the features explicitly keeps this
# cell re-runnable without the cluster labels leaking in as a third feature.
pcaFeatures = irisPcaDf[["principal component 1", "principal component 2"]]

inertia = []
k = list(range(1, 11))

#calc inertia for range of k values
for i in k:
    # n_init is pinned because its default differs between scikit-learn
    # releases; fixing it keeps the inertia values reproducible.
    km = KMeans(n_clusters=i, n_init=10, random_state=0)
    km.fit(pcaFeatures)
    inertia.append(km.inertia_)

#create elbow curve
elbowData = {"k": k, "inertia": inertia}
elbowDf = pd.DataFrame(elbowData)
elbowDf.hvplot.line(x="k", y="inertia", xticks=k, title="Elbow Curve")

In [12]:
#initialize k-means model
# Cluster only on the two principal-component columns: this cell writes a
# "class" column back into irisPcaDf, so fitting on the whole frame would pick
# up the previous labels as a third feature whenever the cell is re-run.
pcaFeatures = irisPcaDf[["principal component 1", "principal component 2"]]

# n_init is pinned because its default differs between scikit-learn releases;
# fixing it keeps the cluster assignment reproducible.
model = KMeans(n_clusters=3, n_init=10, random_state=0)
model.fit(pcaFeatures)
predictions = model.predict(pcaFeatures)

#add predicted class columns
irisPcaDf["class"] = model.labels_
irisPcaDf.head()

Unnamed: 0,principal component 1,principal component 2,class
0,-2.264542,0.505704,1
1,-2.086426,-0.655405,1
2,-2.36795,-0.318477,1
3,-2.304197,-0.575368,1
4,-2.388777,0.674767,1


In [13]:
# Scatter the two principal components, one colour group per predicted class
irisPcaDf.hvplot.scatter(
    x="principal component 1",
    y="principal component 2",
    by="class",
    hover_cols=["class"],
)