In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
import matplotlib.pyplot as plt
import seaborn as sns
from kneed import KneeLocator

# 1. Load Data
# Read the CSV into a DataFrame and report its (rows, columns) shape.
csv_path = '/content/DARWIN_cleaned.csv'  # Replace with your file path/URL
df = pd.read_csv(csv_path)
print("Data shape:", df.shape)

In [None]:
# Split the frame into features (X) and the optional target (y).
# errors='ignore' lets the drop succeed when 'ID' and/or 'class' is absent,
# instead of raising KeyError on the missing label.
X = df.drop(columns=['class', 'ID'], errors='ignore')
y = df['class'] if 'class' in df.columns else None

# Label encode every non-numeric column in X.
# Checking "not numeric" rather than "dtype == 'object'" also catches
# categorical and dedicated string dtypes, which would otherwise reach the
# scaler as raw text. Boolean columns count as numeric and are left untouched.
for column in X.columns:
    if not pd.api.types.is_numeric_dtype(X[column]):
        X[column] = LabelEncoder().fit_transform(X[column])

In [None]:
# Standardise the feature matrix: learn the per-column statistics first,
# then apply them. `scaler` stays available for transforming other data.
scaler = StandardScaler().fit(X)
X_scaled = scaler.transform(X)

In [None]:
# Elbow method: fit KMeans for every candidate k and record the inertia
# (within-cluster sum of squared distances) of each fit.
K = range(2, 10)
inertias = []

for k in K:
    # fit() returns the estimator itself, so `kmeans` is the fitted model.
    kmeans = KMeans(n_clusters=k, random_state=42).fit(X_scaled)
    inertias.append(kmeans.inertia_)

# Inertia as a function of k.
fig, ax = plt.subplots(figsize=(8, 5))
ax.plot(K, inertias, marker='o', linestyle='--')
ax.set_title('Elbow Method for Optimal k')
ax.set_xlabel('Number of Clusters (k)')
ax.set_ylabel('Inertia')
ax.grid(True)
plt.show()

# Locate the elbow of the convex, decreasing curve programmatically.
# `knee.knee` is None when no knee point is detected.
knee = KneeLocator(K, inertias, curve='convex', direction='decreasing')
optimal_k = knee.knee
print(f"Optimal number of clusters detected by KneeLocator: {optimal_k}")

In [None]:
# Silhouette analysis: score the KMeans partition for every candidate k.
silhouette_scores = []
K = range(2, 10)

for k in K:
    kmeans = KMeans(n_clusters=k, random_state=42)
    cluster_labels = kmeans.fit_predict(X_scaled)
    score = silhouette_score(X_scaled, cluster_labels)
    silhouette_scores.append(score)
# NOTE: once the loop finishes, `kmeans` and `cluster_labels` hold the model
# and labels of the LAST k tried (k=9), not of `optimal_k`.

plt.figure(figsize=(8, 5))
plt.plot(K, silhouette_scores, marker='o', linestyle='--', color='orange')
plt.title('Silhouette Analysis for Optimal k')
plt.xlabel('Number of Clusters (k)')
plt.ylabel('Silhouette Score')
plt.grid(True)
plt.show()

# Get silhouette score for optimal_k (if it was found)
if optimal_k is not None and optimal_k in K:
    # silhouette_scores is aligned with K, so the position of optimal_k in K
    # is also the position of its score.
    silhouette_at_optimal_k = silhouette_scores[K.index(optimal_k)]
    print(f"Silhouette Score at optimal k={optimal_k}: {silhouette_at_optimal_k:.4f}")
else:
    # Only report the problem. Appending here (as the original did) would
    # duplicate the last score and break the alignment with K.
    print("Could not determine silhouette score: optimal_k not in tested range.")

In [None]:
# Analyze cluster centers
# One row per cluster, one column per feature. The model was fitted on
# X_scaled, so the centre coordinates are in the scaled feature space.
cluster_centers = pd.DataFrame(kmeans.cluster_centers_, columns=X.columns)

print("Distinctive features for each cluster:")
for cluster_id, center in cluster_centers.iterrows():
    print(f"\nCluster {cluster_id}:")

    # Order the features of this centre from smallest to largest value.
    sorted_features = center.sort_values()

    print("Lowest values:")
    print(sorted_features.head(5))  # 5 lowest features

    print("\nHighest values:")
    print(sorted_features.tail(5))  # 5 highest features

In [None]:
from sklearn.metrics import confusion_matrix

# Cross-tabulate the true classes against the current cluster assignment.
# `true_labels` was never defined anywhere in the notebook, so it is built
# here from `y` (the 'class' column). The classes are integer-encoded so they
# share a type with the integer cluster ids.
if y is None:
    print("No 'class' column available: cannot compare clusters with true labels.")
else:
    true_labels = LabelEncoder().fit_transform(y)

    # The matrix is indexed by the union of class codes and cluster ids, so it
    # is square; rows for ids that never occur as a true class are all zero.
    conf_mat = confusion_matrix(true_labels, cluster_labels)
    sns.heatmap(conf_mat, annot=True, fmt="d", cmap="Blues")
    plt.xlabel("Predicted Cluster")
    plt.ylabel("True Label")
    plt.title("Confusion Matrix (Clustering vs. True Labels)")
    plt.show()

In [None]:
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt

# Project the scaled features onto their first two principal components.
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X_scaled)

# Cluster assignment of every sample, taken from the fitted KMeans model.
cluster_labels = kmeans.labels_

# Map the cluster centres into the same 2-D PCA space as the samples.
centroids_pca = pca.transform(kmeans.cluster_centers_)

# Scatter the samples coloured by cluster, with the centroids drawn on top.
fig, ax = plt.subplots(figsize=(8, 6))
scatter = ax.scatter(
    X_pca[:, 0],
    X_pca[:, 1],
    c=cluster_labels,
    cmap='tab10',
    alpha=0.7,
    s=50,
)
ax.scatter(
    centroids_pca[:, 0],
    centroids_pca[:, 1],
    marker='X',
    c='black',
    s=200,
    label='Centroids',
)

ax.set_title('Cluster Visualization with PCA')
ax.set_xlabel('Principal Component 1')
ax.set_ylabel('Principal Component 2')
fig.colorbar(scatter, ax=ax, label='Cluster')
ax.legend()
ax.grid(True)
fig.tight_layout()
plt.show()

In [None]:
# Fit the final model used for the evaluation below.
# At this point `kmeans` is whatever the last model-selection loop left
# behind (its final iteration uses k=9). The evaluation that follows compares
# against two classes, so an explicit 2-cluster model is built here instead
# of refitting that leftover estimator.
kmeans = KMeans(n_clusters=2, random_state=42)
kmeans.fit(X_scaled)
y_pred = kmeans.labels_
print(y_pred)

In [None]:
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix

# y_true = actual labels, y_pred = predicted cluster labels
if y is not None:
    # Take the ground truth from the data instead of a hard-coded vector.
    # factorize() numbers the classes by order of first appearance, so a file
    # sorted as "89 rows of one class, then 85 of the other" yields exactly
    # the original [0]*89 + [1]*85, and other orderings are handled too.
    y_true, _ = pd.factorize(y)
else:
    # No 'class' column: fall back to the original placeholder, which assumes
    # 89 rows of class 0 followed by 85 rows of class 1.
    y_true = np.array([0]*89 + [1]*85)
y_pred = kmeans.labels_

# Everything below is a binary report (two target names, 2x2 matrix), so fail
# early with a clear message if the inputs do not match that assumption.
if kmeans.n_clusters != 2:
    raise ValueError(
        f"Binary evaluation expects a 2-cluster model, got n_clusters={kmeans.n_clusters}."
    )
if len(y_true) != len(y_pred):
    raise ValueError(
        f"Label count mismatch: {len(y_true)} true labels vs {len(y_pred)} cluster labels."
    )

# NOTE(review): KMeans cluster ids are arbitrary, so cluster 0 need not
# correspond to class 0. The scores below may describe the inverted mapping;
# confirm the alignment before interpreting them.

# Classification Report
print("Classification Report:")
print(classification_report(y_true, y_pred, target_names=['0', '1']))

# Accuracy Score
accuracy = accuracy_score(y_true, y_pred)
print(f"\nAccuracy Score: {accuracy:.3f}")

# Confusion Matrix
cm = confusion_matrix(y_true, y_pred)
print("\nConfusion Matrix (k=2):\n")
print("Actual Class")
print("    0    1")
print(f"0   {cm[0,0]}    {cm[0,1]}")  # TN | FP
print(f"1   {cm[1,0]}    {cm[1,1]}")  # FN | TP

# Visualize Confusion Matrix
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=['0', '1'], yticklabels=['0', '1'])
plt.xlabel('Predicted Cluster')
plt.ylabel('Actual Class')
plt.title('Confusion Matrix')
plt.show()