In [2]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.cluster import AgglomerativeClustering
import hvplot.pandas

In [3]:
# Read the iris CSV from the Resources folder into a DataFrame and preview it.
file = "../Resources/new_iris_data.csv"
df_iris = pd.read_csv(filepath_or_buffer=file)
df_iris.head(n=5)

Unnamed: 0,sepal_length,petal_length,sepal_width,petal_width
0,5.1,1.4,3.5,0.2
1,4.9,1.4,3.0,0.2
2,4.7,1.3,3.2,0.2
3,4.6,1.5,3.1,0.2
4,5.0,1.4,3.6,0.2


In [4]:
# First PCA step: standardize every column of df_iris with StandardScaler.
# The scaler is fitted on the full frame and then applied to that same frame.
iris_scaled = StandardScaler().fit(df_iris).transform(df_iris)

In [5]:
# Set up the PCA model; the number of components to keep is named explicitly.
N_COMPONENTS = 2
pca = PCA(n_components=N_COMPONENTS)

In [6]:
# Fit the PCA model on the standardized iris data and transform that same data
# in one call; `pca` was created with n_components=2, so the result holds the
# two principal components for each row.
iris_pca = pca.fit_transform(iris_scaled)

In [7]:
# Wrap the PCA output in a DataFrame with one named column per component,
# then preview the first rows.
pc_columns = ["principal component 1", "principal component 2"]
df_iris_pca = pd.DataFrame(iris_pca, columns=pc_columns)
df_iris_pca.head()

Unnamed: 0,principal component 1,principal component 2
0,-2.264542,0.505704
1,-2.086426,-0.655405
2,-2.36795,-0.318477
3,-2.304197,-0.575368
4,-2.388777,0.674767


In [8]:
# Import libraries for dendrogram
import plotly.figure_factory as ff

In [9]:
# Create the dendrogram from the two principal-component columns only.
# A later cell adds a "class" column to df_iris_pca in place; selecting the
# feature columns explicitly keeps this cell's result unchanged if it is
# re-run after that column exists (cluster labels must not be treated as
# features when computing distances).
dendrogram_features = df_iris_pca[["principal component 1", "principal component 2"]]
fig = ff.create_dendrogram(dendrogram_features, color_threshold=0)
# Fixed figure size so the plot renders consistently in the notebook.
fig.update_layout(width=800, height=500)
fig.show()

In [24]:
# Run the hierarchical (agglomerative) clustering algorithm with 3 clusters,
# the number chosen by inspecting the dendrogram.
# Fit on the principal-component columns only: a later cell writes the labels
# back into df_iris_pca as a "class" column, and re-running this cell on the
# whole frame would then cluster on the previous labels as well.
cluster_features = df_iris_pca[["principal component 1", "principal component 2"]]
agg = AgglomerativeClustering(n_clusters=3)
model = agg.fit(cluster_features)

In [25]:
# Add a new "class" column to df_iris_pca holding the cluster label the fitted
# model assigned to each row, then preview the first rows.
# NOTE(review): this mutates df_iris_pca in place, so any cell that reads the
# whole frame will see the extra column if it is re-run after this one.
df_iris_pca["class"] = model.labels_ 
df_iris_pca.head()

Unnamed: 0,principal component 1,principal component 2,class
0,-2.264542,0.505704,1
1,-2.086426,-0.655405,1
2,-2.36795,-0.318477,1
3,-2.304197,-0.575368,1
4,-2.388777,0.674767,1


In [26]:
# Finally, create a plot of the hierarchical clustering results: a scatter of
# the two principal components, grouped by the "class" column, with the class
# also included in the hover information.
scatter_options = {
    "x": "principal component 1",
    "y": "principal component 2",
    "hover_cols": ["class"],
    "by": "class",
}
df_iris_pca.hvplot.scatter(**scatter_options)