In [99]:
# Import dependencies/libraries
import pandas as pd
import plotly.express as px
import hvplot.pandas
from sklearn.cluster import KMeans

In [100]:
# Load the full selection of processed data from a path relative to the
# working directory, then preview the first rows.
file_path = "Data/processed_data.csv"
df = pd.read_csv(file_path)
df.head()

Unnamed: 0,age,sex_new,breed_new,likes_people_new,coat_new,size_new,month_found_new
0,0.17,1,0,0,0,0,12
1,0.17,0,0,0,0,0,12
2,0.17,1,0,0,0,0,12
3,0.17,0,0,0,0,0,12
4,1.5,0,1,0,0,1,12


## Try K-means clustering (centroid-based)

In [101]:
# Function to cluster a dataset and store the labels on it
def test_cluster_amount(df, clusters):
    """Fit K-means on ``df`` and write the cluster labels to ``df["class"]``.

    The frame is modified in place so that the plotting cells can group by
    the ``class`` column afterwards.

    Parameters
    ----------
    df : pandas.DataFrame
        Feature table to cluster. A ``class`` column, if present, is treated
        as the output of an earlier call and is excluded from the features.
    clusters : int
        Number of clusters passed to ``KMeans(n_clusters=...)``.

    Returns
    -------
    None
        The labels are stored in ``df["class"]``.
    """
    # A previous call leaves its labels in "class"; fitting on that column
    # would make each clustering depend on the previous one.
    features = df.drop(columns="class", errors="ignore")

    model = KMeans(n_clusters=clusters, random_state=5)

    # Fitting model
    model.fit(features)

    # Add (or overwrite) the class column on the caller's frame
    df["class"] = model.labels_

In [102]:
# Test 2 clusters: label df in place with 2 clusters, then scatter age against
# month_found_new grouped by the resulting "class" column.
test_cluster_amount(df, 2)
df.hvplot.scatter(x="age", y="month_found_new", by="class")

In [103]:
# Test 2 clusters (3D): age / month_found_new / size_new on the three axes,
# with the current "class" column driving both marker colour and symbol.
fig = px.scatter_3d(
    data_frame=df,
    x="age", y="month_found_new", z="size_new",
    color="class", symbol="class",
    width=800,
)
# Position the legend at x=0, y=1.
fig.update_layout(legend={"x": 0, "y": 1})
fig.show()

In [104]:
# Test 3 clusters: label df in place with 3 clusters, then scatter age against
# month_found_new grouped by the resulting "class" column.
test_cluster_amount(df, 3)
df.hvplot.scatter(x="age", y="month_found_new", by="class")

In [105]:
# Test 3 clusters (3D): age / month_found_new / size_new on the three axes,
# with the current "class" column driving both marker colour and symbol.
fig = px.scatter_3d(
    data_frame=df,
    x="age", y="month_found_new", z="size_new",
    color="class", symbol="class",
    width=800,
)
# Position the legend at x=0, y=1.
fig.update_layout(legend={"x": 0, "y": 1})
fig.show()

In [106]:
# Test 4 clusters: label df in place with 4 clusters, then scatter age against
# month_found_new grouped by the resulting "class" column.
test_cluster_amount(df, 4)
df.hvplot.scatter(x="age", y="month_found_new", by="class")

In [107]:
# Test 4 clusters (3D): age / month_found_new / size_new on the three axes,
# with the current "class" column driving both marker colour and symbol.
fig = px.scatter_3d(
    data_frame=df,
    x="age", y="month_found_new", z="size_new",
    color="class", symbol="class",
    width=800,
)
# Position the legend at x=0, y=1.
fig.update_layout(legend={"x": 0, "y": 1})
fig.show()

In [108]:
# Test 5 clusters: label df in place with 5 clusters, then scatter age against
# month_found_new grouped by the resulting "class" column.
test_cluster_amount(df, 5)
df.hvplot.scatter(x="age", y="month_found_new", by="class")

In [109]:
# Test 5 clusters (3D): age / month_found_new / size_new on the three axes,
# with the current "class" column driving both marker colour and symbol.
fig = px.scatter_3d(
    data_frame=df,
    x="age", y="month_found_new", z="size_new",
    color="class", symbol="class",
    width=800,
)
# Position the legend at x=0, y=1.
fig.update_layout(legend={"x": 0, "y": 1})
fig.show()

## Try using PCA: reduce number of dimensions

In [110]:
# Import dependencies
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
import hvplot.pandas

In [111]:
# Re-load the full selection of processed data. The K-means tests above added
# a "class" column to df in place; reading the file again gives PCA a frame
# without that column.
file_path = "Data/processed_data.csv"
df = pd.read_csv(file_path)
df.head()

Unnamed: 0,age,sex_new,breed_new,likes_people_new,coat_new,size_new,month_found_new
0,0.17,1,0,0,0,0,12
1,0.17,0,0,0,0,0,12
2,0.17,1,0,0,0,0,12
3,0.17,0,0,0,0,0,12
4,1.5,0,1,0,0,1,12


In [112]:
# Initialize PCA model, keeping two components so the result can be drawn
# on a 2D scatter plot.
pca = PCA(n_components=2)

In [113]:
# Get two principal components for the data.
# NOTE(review): df is passed to PCA as loaded; StandardScaler is imported in
# this section but not applied here — confirm that unscaled input is intended.
data_pca = pca.fit_transform(df)

In [114]:
# Wrap the PCA output in a DataFrame with one named column per component.
# These column names are what the later plotting cells refer to.
df_pca = pd.DataFrame(
    data=data_pca, columns=["principal component 1", "principal component 2"]
)
df_pca.head()

Unnamed: 0,principal component 1,principal component 2
0,-9.662468,3.273512
1,-9.659724,3.271599
2,-9.662468,3.273512
3,-9.659724,3.271599
4,-8.33917,3.646571


In [115]:
# Fetch the explained variance ratio of each of the two fitted components.
pca.explained_variance_ratio_

array([0.58303125, 0.36527085])

In [116]:
# Find the best value for K with the elbow method.
# Candidate cluster counts: 1 through 10.
k = list(range(1, 11))

# Fit one K-means model per candidate on the PCA output and keep its inertia.
inertia = [
    KMeans(n_clusters=n_clusters, random_state=0).fit(df_pca).inertia_
    for n_clusters in k
]

# Create the elbow curve: inertia as a function of k.
df_elbow = pd.DataFrame({"k": k, "inertia": inertia})
df_elbow.hvplot.line(x="k", y="inertia", xticks=k, title="Elbow Curve")

In [117]:
# Cluster on the two principal-component columns only. This cell adds a
# "class" column to df_pca, so fitting on the whole frame would feed the old
# labels back in as a feature whenever the cell is run again.
pca_features = df_pca[["principal component 1", "principal component 2"]]

# Initialize the K-means model
model = KMeans(n_clusters=3, random_state=0)

# Fit the model
model.fit(pca_features)

# Predict clusters
predictions = model.predict(pca_features)

# Add the predicted class columns
df_pca["class"] = model.labels_
df_pca.head()

Unnamed: 0,principal component 1,principal component 2,class
0,-9.662468,3.273512,2
1,-9.659724,3.271599,2
2,-9.662468,3.273512,2
3,-9.659724,3.271599,2
4,-8.33917,3.646571,2


In [118]:
# Scatter the two principal components, grouped by the "class" column;
# the class value is also included in the hover information.
df_pca.hvplot.scatter(
    x="principal component 1",
    y="principal component 2",
    hover_cols=["class"],
    by="class",
)

## Try Hierarchical Clustering

In [119]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.cluster import AgglomerativeClustering
import hvplot.pandas
import plotly.figure_factory as ff

In [120]:
# Re-load in the full selection of processed data
# NOTE(review): the dendrogram and agglomerative cells that follow operate on
# df_pca, not on this reloaded df — confirm whether this reload is still needed.
file_path = "Data/processed_data.csv"
df = pd.read_csv(file_path)
df.head()

Unnamed: 0,age,sex_new,breed_new,likes_people_new,coat_new,size_new,month_found_new
0,0.17,1,0,0,0,0,12
1,0.17,0,0,0,0,0,12
2,0.17,1,0,0,0,0,12
3,0.17,0,0,0,0,0,12
4,1.5,0,1,0,0,1,12


In [121]:
# Create the dendrogram from the two principal components only. df_pca also
# carries the "class" labels written by the K-means step, and those must not
# be treated as a third coordinate when distances are computed.
fig = ff.create_dendrogram(
    df_pca[["principal component 1", "principal component 2"]],
    color_threshold=0,
)
fig.update_layout(width=800, height=500)
fig.show()

In [122]:
# Run the hierarchical algorithm on the two principal components only.
# df_pca still holds the K-means "class" column at this point; excluding it
# keeps the earlier labels from influencing the hierarchical clusters.
agg = AgglomerativeClustering(n_clusters=3)
model = agg.fit(df_pca[["principal component 1", "principal component 2"]])

In [123]:
# Store the hierarchical labels in the "class" column of the DF. This
# overwrites the values written to the same column by the K-means cell above.
df_pca["class"] = model.labels_
df_pca.head()

Unnamed: 0,principal component 1,principal component 2,class
0,-9.662468,3.273512,0
1,-9.659724,3.271599,0
2,-9.662468,3.273512,0
3,-9.659724,3.271599,0
4,-8.33917,3.646571,0


In [124]:
# Scatter the two principal components, grouped by the "class" column
# (the hierarchical labels at this point); class is also shown on hover.
df_pca.hvplot.scatter(
    x="principal component 1",
    y="principal component 2",
    hover_cols=["class"],
    by="class",
)

In [125]:
# Export PCA data to .csv without the row index. The "class" column written
# out here holds the hierarchical-clustering labels assigned above.
df_pca.to_csv("PCA_data.csv", index=False)