In [59]:
# Import dependencies.
import pandas as pd
from sklearn.cluster import KMeans
import plotly.express as px
import hvplot.pandas

In [60]:
# Load the raw iris CSV from the Resources folder and preview the first rows.
file_path = "Resources/iris.csv"
iris_df = pd.read_csv(filepath_or_buffer=file_path)
iris_df.head(n=5)

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,class
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa


In [61]:
# Remove the "class" label column; the measurement columns are kept as they are.
new_iris_df = iris_df.drop(columns=["class"])
new_iris_df.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2


In [62]:
# Put the two length columns first, followed by the two width columns.
new_column_order = ["sepal_length", "petal_length", "sepal_width", "petal_width"]
new_iris_df = new_iris_df.loc[:, new_column_order]
new_iris_df.head()

Unnamed: 0,sepal_length,petal_length,sepal_width,petal_width
0,5.1,1.4,3.5,0.2
1,4.9,1.4,3.0,0.2
2,4.7,1.3,3.2,0.2
3,4.6,1.5,3.1,0.2
4,5.0,1.4,3.6,0.2


In [63]:
# Write the reordered frame to disk, leaving the row index out of the file.
output_file_path = "Resources/new_iris_data.csv"
new_iris_df.to_csv(path_or_buf=output_file_path, index=False)

In [64]:
import pandas as pd
import plotly.express as px
import hvplot.pandas
from sklearn.cluster import KMeans

In [65]:
# Read the cleaned measurements back in and show the first ten rows.
file_path = "Resources/new_iris_data.csv"
df_iris = pd.read_csv(filepath_or_buffer=file_path)
df_iris.head(n=10)

Unnamed: 0,sepal_length,petal_length,sepal_width,petal_width
0,5.1,1.4,3.5,0.2
1,4.9,1.4,3.0,0.2
2,4.7,1.3,3.2,0.2
3,4.6,1.5,3.1,0.2
4,5.0,1.4,3.6,0.2
5,5.4,1.7,3.9,0.4
6,4.6,1.4,3.4,0.3
7,5.0,1.5,3.4,0.2
8,4.4,1.4,2.9,0.2
9,4.9,1.5,3.1,0.1


In [66]:
# K-means with three clusters (one per known iris class); random_state is pinned to 5.
model = KMeans(random_state=5, n_clusters=3)
model

KMeans(n_clusters=3, random_state=5)

In [67]:
# Fitting model
# df_iris was just loaded from new_iris_data.csv, so at this point it holds only
# the four measurement columns (no "class" column has been added yet).
model.fit(df_iris)

KMeans(n_clusters=3, random_state=5)

In [68]:
# Get predictions
# Ask the fitted model for a cluster index per row of df_iris and print the
# resulting label array (values 0, 1 and 2 in the recorded output).
predictions = model.predict(df_iris)
print(predictions)

[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 2 0 2 2 2 2 0 2 2 2 2
 2 2 0 0 2 2 2 2 0 2 0 2 0 2 2 0 0 2 2 2 2 2 0 2 2 2 2 0 2 2 2 0 2 2 2 0 2
 2 0]


In [69]:
# Add a new class column to the df_iris
# The labels stored on the fitted model become a "class" column.
# NOTE: this changes df_iris in place, so any later cell that passes df_iris
# to a model will see the label column alongside the measurements.
df_iris["class"] = model.labels_
df_iris.head()

Unnamed: 0,sepal_length,petal_length,sepal_width,petal_width,class
0,5.1,1.4,3.5,0.2,1
1,4.9,1.4,3.0,0.2,1
2,4.7,1.3,3.2,0.2,1
3,4.6,1.5,3.1,0.2,1
4,5.0,1.4,3.6,0.2,1


In [70]:
import plotly.express as px
import hvplot.pandas

In [71]:
# Scatter of sepal length against sepal width, grouped by the "class" column.
df_iris.hvplot.scatter(
    by="class",
    x="sepal_length",
    y="sepal_width",
)

In [72]:
# 3-D scatter of the clusters: petal width, sepal length and petal length on the
# axes, "class" as both colour and marker symbol, sepal width as marker size.
scatter_3d_options = dict(
    x="petal_width",
    y="sepal_length",
    z="petal_length",
    color="class",
    symbol="class",
    size="sepal_width",
    width=800,
)
fig = px.scatter_3d(df_iris, **scatter_3d_options)
# Position the legend at x=0, y=1.
fig.update_layout(legend={"x": 0, "y": 1})
fig.show()

In [73]:
# Candidate cluster counts 1 through 10, plus an empty list to collect one
# inertia value per candidate.
k = [*range(1, 11)]
inertia = list()

In [74]:
# Looking for the best K
# df_iris gained a "class" column of K-means labels in an earlier cell. Exclude
# it here so inertia is measured on the flower measurements only and the
# previous clustering result does not act as a feature.
elbow_features = df_iris.drop(columns="class", errors="ignore")

# Start from an empty list so re-running this cell does not append a second
# set of values on top of the first.
inertia = []
for i in k:
    km = KMeans(n_clusters=i, random_state=0)
    km.fit(elbow_features)
    inertia.append(km.inertia_)


KMeans is known to have a memory leak on Windows with MKL, when there are less chunks than available threads. You can avoid it by setting the environment variable OMP_NUM_THREADS=1.



In [75]:
# Tabulate inertia against k and draw the elbow curve with hvPlot.
elbow_data = dict(k=k, inertia=inertia)
df_elbow = pd.DataFrame(data=elbow_data)
df_elbow.hvplot.line(
    title="Elbow Curve",
    x="k",
    y="inertia",
    xticks=k,
)

In [76]:
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

# Reload the cleaned measurements from disk (this replaces the df_iris that
# had the "class" column added) and show the first ten rows.
file_path = "Resources/new_iris_data.csv"
df_iris = pd.read_csv(filepath_or_buffer=file_path)
df_iris.head(n=10)

Unnamed: 0,sepal_length,petal_length,sepal_width,petal_width
0,5.1,1.4,3.5,0.2
1,4.9,1.4,3.0,0.2
2,4.7,1.3,3.2,0.2
3,4.6,1.5,3.1,0.2
4,5.0,1.4,3.6,0.2
5,5.4,1.7,3.9,0.4
6,4.6,1.4,3.4,0.3
7,5.0,1.5,3.4,0.2
8,4.4,1.4,2.9,0.2
9,4.9,1.5,3.1,0.1


In [77]:
# Scale every column of df_iris with StandardScaler and print the first five rows.
scaler = StandardScaler()
iris_scaled = scaler.fit_transform(df_iris)
print(iris_scaled[:5])

[[-0.90068117 -1.3412724   1.03205722 -1.31297673]
 [-1.14301691 -1.3412724  -0.1249576  -1.31297673]
 [-1.38535265 -1.39813811  0.33784833 -1.31297673]
 [-1.50652052 -1.2844067   0.10644536 -1.31297673]
 [-1.02184904 -1.3412724   1.26346019 -1.31297673]]


In [78]:
# Initialize PCA model
# n_components=2 keeps two principal components.
pca = PCA(n_components=2)

In [79]:
# Get two principal components for the iris data.
# The PCA is fitted on the standardized array (iris_scaled), not on the raw frame.
iris_pca = pca.fit_transform(iris_scaled)

In [80]:
# Wrap the PCA output array in a DataFrame with one named column per component.
pca_column_names = ["principal component 1", "principal component 2"]
df_iris_pca = pd.DataFrame(iris_pca, columns=pca_column_names)
df_iris_pca.head()

Unnamed: 0,principal component 1,principal component 2
0,-2.264542,0.505704
1,-2.086426,-0.655405
2,-2.36795,-0.318477
3,-2.304197,-0.575368
4,-2.388777,0.674767


In [81]:
# Fetch the explained variance
# One ratio per retained component; the recorded output shows roughly 0.728 and
# 0.230, i.e. about 0.958 for the two components combined.
pca.explained_variance_ratio_

array([0.72770452, 0.23030523])

In [82]:
# Find the best value for K on the PCA-reduced data.
k = [*range(1, 11)]
inertia = []

# Fit one K-means model per candidate cluster count and record its inertia.
for n_clusters in k:
    km = KMeans(n_clusters=n_clusters, random_state=0).fit(df_iris_pca)
    inertia.append(km.inertia_)

# Tabulate the results and draw the elbow curve.
elbow_data = dict(k=k, inertia=inertia)
df_elbow = pd.DataFrame(data=elbow_data)
df_elbow.hvplot.line(
    title="Elbow Curve",
    x="k",
    y="inertia",
    xticks=k,
)



KMeans is known to have a memory leak on Windows with MKL, when there are less chunks than available threads. You can avoid it by setting the environment variable OMP_NUM_THREADS=1.



In [83]:
# Build the K-means model (three clusters, random_state 0) and fit it on the
# PCA frame in one step; fit() hands back the fitted estimator.
model = KMeans(random_state=0, n_clusters=3).fit(df_iris_pca)

# Cluster index for every row of the PCA frame.
predictions = model.predict(df_iris_pca)

# Store the fitted labels as a "class" column and preview the result.
df_iris_pca["class"] = model.labels_
df_iris_pca.head()

Unnamed: 0,principal component 1,principal component 2,class
0,-2.264542,0.505704,0
1,-2.086426,-0.655405,0
2,-2.36795,-0.318477,0
3,-2.304197,-0.575368,0
4,-2.388777,0.674767,0


In [84]:
# Scatter of the two principal components, grouped by the "class" column,
# with the class value also shown on hover.
kmeans_scatter_options = {
    "x": "principal component 1",
    "y": "principal component 2",
    "hover_cols": ["class"],
    "by": "class",
}
df_iris_pca.hvplot.scatter(**kmeans_scatter_options)

In [85]:
from sklearn.cluster import AgglomerativeClustering
import plotly.figure_factory as ff

In [86]:
# Create the dendrogram
# df_iris_pca carries a "class" column of K-means labels by this point. Build
# the dendrogram from the principal components only, so the earlier cluster ids
# do not contribute to the distances between samples.
dendrogram_features = df_iris_pca.drop(columns="class", errors="ignore")
fig = ff.create_dendrogram(dendrogram_features, color_threshold=0)
fig.update_layout(width=800, height=500)
fig.show()

In [87]:
# Run agglomerative clustering with three clusters on the principal components.
# The "class" column (K-means labels added earlier) is left out of the input so
# the previous clustering result is not treated as a feature.
agg_features = df_iris_pca.drop(columns="class", errors="ignore")
agg = AgglomerativeClustering(n_clusters=3)
model = agg.fit(agg_features)

# Store the agglomerative labels in "class". Without this assignment the
# scatter plot that follows would still be showing the K-means labels, and the
# fit above would have no visible effect.
df_iris_pca["class"] = model.labels_

In [89]:
# Scatter of the two principal components, grouped by the current "class"
# column, with the class value also shown on hover.
final_scatter_options = {
    "x": "principal component 1",
    "y": "principal component 2",
    "hover_cols": ["class"],
    "by": "class",
}
df_iris_pca.hvplot.scatter(**final_scatter_options)