In [8]:
import numpy as np
import pandas as pd
from scipy.cluster.hierarchy import dendrogram, linkage
from sklearn.cluster import AgglomerativeClustering
from sklearn.cluster import KMeans

In [9]:
# Load the dataset (path is relative to the notebook's working directory)
file_path = 'wisc_bc_ContinuousVar 2.csv'
data = pd.read_csv(file_path)

# Column names, dtypes and non-null counts. `info()` prints its report and
# returns None, so it is called as a statement rather than bundled into the
# displayed value.
data.info()

# Preview the first few rows. Keeping this as the sole last expression lets
# Jupyter use the rich DataFrame table instead of the repr of a
# `(DataFrame, None)` tuple.
data.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 569 entries, 0 to 568
Data columns (total 32 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   id                       569 non-null    int64  
 1   diagnosis                569 non-null    object 
 2   radius_mean              569 non-null    float64
 3   texture_mean             569 non-null    float64
 4   perimeter_mean           569 non-null    float64
 5   area_mean                569 non-null    float64
 6   smoothness_mean          569 non-null    float64
 7   compactness_mean         569 non-null    float64
 8   concavity_mean           569 non-null    float64
 9   concave points_mean      569 non-null    float64
 10  symmetry_mean            569 non-null    float64
 11  fractal_dimension_mean   569 non-null    float64
 12  radius_se                569 non-null    float64
 13  texture_se               569 non-null    float64
 14  perimeter_se             5

(         id diagnosis  radius_mean  texture_mean  perimeter_mean  area_mean  \
 0    842302         M        17.99         10.38          122.80     1001.0   
 1    842517         M        20.57         17.77          132.90     1326.0   
 2  84300903         M        19.69         21.25          130.00     1203.0   
 3  84348301         M        11.42         20.38           77.58      386.1   
 4  84358402         M        20.29         14.34          135.10     1297.0   
 
    smoothness_mean  compactness_mean  concavity_mean  concave points_mean  \
 0          0.11840           0.27760          0.3001              0.14710   
 1          0.08474           0.07864          0.0869              0.07017   
 2          0.10960           0.15990          0.1974              0.12790   
 3          0.14250           0.28390          0.2414              0.10520   
 4          0.10030           0.13280          0.1980              0.10430   
 
    ...  radius_worst  texture_worst  perimeter_

In [10]:
# Selecting features for clustering. 'id' and 'diagnosis' are not measurements.
# The cluster-label columns that this notebook appends to `data` are excluded
# too, so re-running this cell (or running it after the k-means cell) never
# feeds earlier cluster labels back in as features. errors='ignore' covers the
# first run, when those columns do not exist yet.
non_feature_columns = ['id', 'diagnosis', 'hclust_cluster', 'kmeans_cluster']
features = data.drop(columns=non_feature_columns, errors='ignore')

# NOTE(review): the features are clustered on their raw scales; the preview
# above shows e.g. area_mean in the hundreds/thousands next to smoothness_mean
# around 0.1. Consider standardising first -- left unchanged here so the
# tabulated results stay comparable.

# Ward linkage matrix over the same features. It is not used further in the
# cells shown here; kept so it remains available under the name `linked`.
linked = linkage(features, 'ward')

# Agglomerative clustering with 2 clusters. Ward linkage works on Euclidean
# distances, which is also the estimator's default, so the distance keyword is
# simply omitted: `affinity` was deprecated in scikit-learn 1.2 and removed in
# 1.4 (renamed `metric`), and leaving it out runs on both old and new releases.
hclust = AgglomerativeClustering(n_clusters=2, linkage='ward')
hclust_labels = hclust.fit_predict(features)

# Adding the cluster labels to the dataset
data['hclust_cluster'] = hclust_labels

# Tabulate hclust_cluster against the 'diagnosis' column
hclust_table = pd.crosstab(data['diagnosis'], data['hclust_cluster'])
hclust_table




hclust_cluster,0,1
diagnosis,Unnamed: 1_level_1,Unnamed: 2_level_1
B,357,0
M,126,86


In [11]:
# k-means clustering into 2 groups on the same `features` frame used for the
# hierarchical clustering (KMeans is imported in the notebook's import cell).
# random_state fixes the centroid initialisation. n_init is pinned to 10 -- the
# default before scikit-learn 1.4, which changed it to 'auto' -- so the result
# does not depend on the installed version and no FutureWarning is emitted.
kmeans = KMeans(n_clusters=2, n_init=10, random_state=0)
kmeans_labels = kmeans.fit_predict(features)

# Adding the k-means cluster labels to the dataset
data['kmeans_cluster'] = kmeans_labels

# Tabulate kmeans_cluster against the 'diagnosis' column
kmeans_table = pd.crosstab(data['diagnosis'], data['kmeans_cluster'])
kmeans_table




kmeans_cluster,0,1
diagnosis,Unnamed: 1_level_1,Unnamed: 2_level_1
B,356,1
M,82,130
