# K-Means Clustering of Tumor Data

In [1]:

import pandas as pd
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA

# Read the tumor measurements from the CSV file into a DataFrame
data = pd.read_csv(filepath_or_buffer="data.csv")


In [None]:

# Keep the tumor identifiers aside so they can be re-attached to the results.
# Assumes the CSV has an 'ID' column; if it does not, this lookup raises KeyError.
tumor_ids = data['ID']

# Everything except the identifier/label columns is used as clustering input.
# 'Cluster' is excluded too: the clustering cell writes that column back into
# `data`, so re-running this cell afterwards would otherwise feed the previous
# cluster labels into the model as a feature.
# errors='ignore' keeps the drop working when 'diagnosis' or 'Cluster' is absent.
features = data.drop(columns=['ID', 'diagnosis', 'Cluster'], errors='ignore')


In [None]:

# Partition the tumors into two clusters (e.g., 2 for benign and malignant).
# n_init is pinned explicitly: scikit-learn changed its default between
# releases (10 -> 'auto'), so random_state alone does not guarantee the same
# clustering on every installed version.
# NOTE(review): features are clustered as-is; if the columns are on very
# different scales, consider standardizing them first -- confirm with the data.
kmeans = KMeans(n_clusters=2, n_init=10, random_state=42)

# fit_predict fits the model and returns one cluster label per row; the labels
# are stored on `data` so later cells can reuse them.
data['Cluster'] = kmeans.fit_predict(features)


In [None]:

# Pair every tumor ID with the cluster label it was assigned
result_columns = {
    'Tumor_ID': tumor_ids,
    'Cluster_Label': data['Cluster'],
}
cluster_results = pd.DataFrame(result_columns)

# Persist the ID/label table without the DataFrame index, then confirm
cluster_results.to_csv("tumor_clusters.csv", index=False)
print("Clustering results saved to tumor_clusters.csv")


In [None]:

# Project the feature matrix onto its first two principal components
# so the clusters can be drawn on a flat scatter plot
pca = PCA(n_components=2)
reduced_features = pca.fit_transform(features)
component_1 = reduced_features[:, 0]
component_2 = reduced_features[:, 1]

# Draw one point per tumor, colored by its cluster label
fig, ax = plt.subplots(figsize=(10, 6))
points = ax.scatter(
    component_1,
    component_2,
    c=data['Cluster'],
    cmap='viridis',
    edgecolor='k',
    s=50,
)
ax.set_title("K-Means Clustering of Tumor Data (PCA-Reduced)")
ax.set_xlabel("Principal Component 1")
ax.set_ylabel("Principal Component 2")
fig.colorbar(points, ax=ax, label="Cluster Label")
plt.show()
