In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans


# Load the raw data. The file has no header row (the column names are assigned
# by hand in a later cell), so `header=None` is required: with the default
# `header=0`, pandas would consume the first data record as column names and
# the frame would silently be one row short.
# NOTE(review): the source file may mark missing values with "?" — confirm, and
# consider `na_values="?"` if those should be treated as NaN.
df = pd.read_csv('breast-cancer.data', header=None)

print(df.columns)
df.head()  # last expression of the cell, so the preview is actually displayed


In [None]:
df.info()

# Column names taken from the dataset's attribute information, in file order.
column_names = [
    "Class", "age", "menopause", "tumor_size", "inv_nodes",
    "node_caps", "deg_malig", "breast", "breast_quad", "irradiat",
]

# Replace the frame's existing column labels with the descriptive names.
df.columns = column_names
print(df.head(n=5))

In [None]:
# Per df.info() above, every column has entries (no missing values reported).
# Most of the data is categorical, though, so it has to be converted to numbers.
# This one-hot encoding of the whole frame is only here as a visual preview.
df_new = pd.get_dummies(data=df)
df_new.head(n=5)

In [None]:
# Build the clustering features:
#  * drop the target column 'Class' -- if it stayed in, the algorithm could
#    "cheat" by separating clusters on the label, distorting the distances;
#  * drop_first=True removes the first dummy level of each categorical column,
#    so redundant indicator pairs are not both kept.
df_kmeans = df.drop(columns=['Class']).pipe(pd.get_dummies, drop_first=True)
df_kmeans.head(n=5)


In [None]:
# Standardization
# Most columns are 0/1 dummies, but deg_malig spans a wider numeric range
# (noted by the author as 0-3), which could distort distance calculations.
# StandardScaler rescales every feature to zero mean and unit variance.

from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()  # create the scaler
scaled_df_kmeans = scaler.fit_transform(df_kmeans)  # fit and transform; returns a NumPy array

# Preview the scaled values. The original `df_kmeans.head` was missing its call
# parentheses, so it displayed the bound method instead of any data.
pd.DataFrame(scaled_df_kmeans, columns=df_kmeans.columns, index=df_kmeans.index).head()



In [None]:
#PART A create K-Means cluster model using scikit learn
# Categorical encoding and standardization were done in the cells above.
# KMeans is already imported in the first cell, so it is not re-imported here.
#
# Fit on the standardized array: fitting on the raw `df_kmeans` would leave
# `scaled_df_kmeans` computed but never used.
kmeans_model = KMeans(
    n_clusters=3,      # initial guess; checked with the elbow method afterwards
    n_init=10,         # explicit, since the library default differs between scikit-learn versions
    random_state=42,   # fixed seed so the cluster labels are reproducible
)
clusters = kmeans_model.fit_predict(scaled_df_kmeans)

# Store the label as the first column. Dropping any earlier "Cluster" column
# first keeps this cell re-runnable (DataFrame.insert raises on a duplicate name).
df_kmeans = df_kmeans.drop(columns=["Cluster"], errors="ignore")
df_kmeans.insert(0, "Cluster", clusters)

print(df_kmeans.head())

In [None]:
# Elbow method for choosing the number of clusters K.
# Fit K-Means for each K in 2..10 and record the SSD (sum of squared distances,
# which scikit-learn exposes as `inertia_`). SSD decreases as K *increases*;
# the useful K is where the steep drop flattens out (the "elbow").
#
# The models are fit on the standardized feature array, not on `df_kmeans`:
# by this point `df_kmeans` also holds the "Cluster" label column from the
# previous cell, so fitting on it would leak those labels into every SSD value.
k_values = range(2, 11)  # K = 2, 3, ..., 10 (the upper bound is exclusive)
ssd = []
for k in k_values:
    # Separate name so the fitted 3-cluster `kmeans_model` is not overwritten.
    elbow_model = KMeans(n_clusters=k, n_init=10, random_state=42)
    elbow_model.fit(scaled_df_kmeans)
    ssd.append(elbow_model.inertia_)
    print(f"K={k}, SSD={elbow_model.inertia_}")

# Visualize SSD against K.
plt.figure(figsize=(6, 4))
plt.plot(k_values, ssd, color="red", marker="o")
plt.xlabel("Number of clusters(K)")
plt.ylabel("SSD for K")
plt.show()

# The author's original run read the elbow at K=3 (no significant drop-off after 3).
# NOTE(review): re-read the plot after re-running -- that run was fit on data
# that included the "Cluster" label column, which may have favoured K=3.


In [None]:
#PART C 2D VISUALIZATION OF CLUSTERS USING PCA, and PLOT CENTROIDS
from sklearn.decomposition import PCA 

#I chose 3 clusters, and the elbow method supported 3 as the best choice for SSD and variability,
#so the data prepared in df_kmeans above can be reused for the PCA visualization.



In [None]:
# (rows, columns) of the frame before PCA.
# Note: the column count includes the "Cluster" label column inserted after the
# first K-Means fit, so the number of actual features is one less than shown.
df_kmeans.shape #author's run: 285 rows, 35 columns (34 features + "Cluster") before PCA


In [None]:
# Quick check that PCA reduces the data to two components.
# PCA is fit on the standardized feature array rather than on `df_kmeans`,
# because `df_kmeans` also carries the "Cluster" label column, which is an
# output of the clustering and not a feature.
pca = PCA(n_components=2)
X_pca = pca.fit_transform(scaled_df_kmeans)

X_pca.shape  # (number of rows, 2)

In [None]:

# 2D view of the clusters: project the features with PCA, then plot the points
# and the cluster centroids.

# 1. Feature matrix. Use the standardized array, NOT `df_kmeans.values`: that
#    frame also contains the "Cluster" label column, which would leak the
#    earlier labels into both PCA and K-Means.
X_array = scaled_df_kmeans

# 2. Apply PCA to reduce the features to 2 dimensions for plotting.
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X_array)

# 3. Fit K-Means on the same matrix PCA was fit on, so the centroids live in
#    the same feature space and can be projected with the same transform.
#    The fixed seed makes the labels reproducible from run to run.
n_clusters = 3  # chosen via the elbow method
kmeans_model = KMeans(n_clusters=n_clusters, n_init=10, random_state=42)
clusters = kmeans_model.fit_predict(X_array)
# scikit-learn stores the centroid coordinates in `cluster_centers_`.
centers_pca = pca.transform(kmeans_model.cluster_centers_)  # centroids in the 2D PCA space

# Keep the stored label column in step with the labels plotted here, so the
# later per-cluster summaries describe this same clustering.
df_kmeans["Cluster"] = clusters

# 4. Plot
plt.figure(figsize=(8, 6))

# One (color, marker) pair per cluster; clusters beyond this list are not drawn.
cluster_styles = [("blue", "o"), ("green", "s"), ("orange", "^")]
for cluster_id, (color, marker) in enumerate(cluster_styles[:n_clusters]):
    members = clusters == cluster_id  # boolean mask of the rows in this cluster
    plt.scatter(X_pca[members, 0], X_pca[members, 1],
                color=color, marker=marker, label=f'Cluster {cluster_id}')

# Plot centroids: x is the first principal component, y the second.
plt.scatter(centers_pca[:, 0], centers_pca[:, 1],
            s=200, c='black', marker='X', label='Centroids')

plt.xlabel('PCA 1')
plt.ylabel('PCA 2')
plt.title(f'K-Means Breast Cancer Data Clusters (k={kmeans_model.n_clusters})')
plt.legend()
plt.show()


In [None]:
# Per-cluster average of every column, transposed so the features run down the
# rows; used to look for similarities within each cluster.
cluster_means = df_kmeans.groupby(by='Cluster').mean()
print(cluster_means.transpose())

In [None]:
# Compare the cluster assignments with the 'Class' column: attach the labels
# to the original frame, then count the rows for each (Cluster, Class) pair.
df["Cluster"] = clusters
pd.crosstab(index=df["Cluster"], columns=df["Class"])