In [3]:
import pandas as pd
from sklearn.cluster import AgglomerativeClustering
from sklearn.metrics import silhouette_score, calinski_harabasz_score, davies_bouldin_score
from sklearn.preprocessing import LabelEncoder

# --- Configuration: everything a reader might want to tune in this cell ---
DATA_PATH = "gun_deaths.csv"
SAMPLE_FRACTION = 0.1  # share of rows kept from the CSV (adjust as needed)
RANDOM_STATE = 42      # seed for the row sample so reruns select the same rows
N_CLUSTERS = 4         # number of clusters requested from the model

# Columns that are converted to integer codes before clustering
categorical_columns = ["sex", "race", "place", "education"]

# Select relevant features for clustering
features_for_clustering = ["year", "month", "sex", "age", "race", "place", "education"]

# Load a portion of the dataset
df = pd.read_csv(DATA_PATH).sample(frac=SAMPLE_FRACTION, random_state=RANDOM_STATE)

# Preprocess the sampled dataset: drop every row that has a missing value in
# any column (not only in the clustering features).
df = df.dropna()

# Encode categorical features.
# A separate encoder is fitted for each column and stored in `encoders`, so the
# integer codes can later be mapped back to the original category labels via
# `encoders[column].classes_` / `.inverse_transform(...)`. Refitting one shared
# encoder would keep only the mapping of the last column.
# After the loop `label_encoder` refers to the encoder of the last column
# ("education").
encoders = {}
for column in categorical_columns:
    label_encoder = LabelEncoder()
    df[column] = label_encoder.fit_transform(df[column])
    encoders[column] = label_encoder

X = df[features_for_clustering]

# The silhouette score needs strictly more samples than clusters; fail early
# with a readable message instead of an error from deep inside scikit-learn.
if len(X) <= N_CLUSTERS:
    raise ValueError(
        f"Need more than {N_CLUSTERS} complete rows to cluster and evaluate, "
        f"but only {len(X)} remain after dropna()."
    )

# Perform Agglomerative Clustering
# NOTE(review): X is passed to the model unscaled and the categorical columns
# are arbitrary integer codes, so columns with a wider numeric range presumably
# weigh more in the distance computation — confirm this is intended.
agg = AgglomerativeClustering(n_clusters=N_CLUSTERS)
agg_labels = agg.fit_predict(X)

# Evaluate clustering on the same feature matrix the model was fitted on
agg_sil = silhouette_score(X, agg_labels)
agg_ch = calinski_harabasz_score(X, agg_labels)
agg_db = davies_bouldin_score(X, agg_labels)

print("Agglomerative Clustering")
print("Silhouette score:", agg_sil)
print("Calinski-Harabasz score:", agg_ch)
print("Davies-Bouldin score:", agg_db)


Agglomerative Clustering
Silhouette score: 0.3960995597964092
Calinski-Harabasz score: 21419.78083405752
Davies-Bouldin score: 0.8032141666397277
