In [36]:
import numpy as np
import pandas as pd
from sklearn.datasets import load_iris

# Load the Iris dataset and split it into the feature matrix and target vector
iris = load_iris()
X, y = iris.data, iris.target

# Report the array shapes and the distinct target values
print("Shape of X:", X.shape)
print("Shape of y:", y.shape)
print("Unique classes:", np.unique(y))

# Wrap the features in a DataFrame with named columns and preview the first rows
df = pd.DataFrame(data=X, columns=iris.feature_names)
print("\nFirst 5 rows:", df.head(), sep="\n")


Shape of X: (150, 4)
Shape of y: (150,)
Unique classes: [0 1 2]

First 5 rows:
   sepal length (cm)  sepal width (cm)  petal length (cm)  petal width (cm)
0                5.1               3.5                1.4               0.2
1                4.9               3.0                1.4               0.2
2                4.7               3.2                1.3               0.2
3                4.6               3.1                1.5               0.2
4                5.0               3.6                1.4               0.2


In [37]:
def euclidean_distance(x1, x2):
    """Return the Euclidean (L2) distance between two points.

    Parameters
    ----------
    x1, x2 : array-like
        Coordinates of the two points. Lists, tuples and NumPy arrays are
        accepted; the shapes must be equal or broadcastable.

    Returns
    -------
    numpy.float64
        Square root of the sum of squared coordinate differences.
    """
    # Cast to float before subtracting: unsigned-integer arrays would wrap
    # around on negative differences, and plain lists do not support `-`.
    diff = np.asarray(x1, dtype=float) - np.asarray(x2, dtype=float)
    return np.sqrt(np.sum(diff ** 2))

def initialize_centroids(X, k, rng=None):
    """Pick k distinct rows of X at random to serve as initial centroids.

    Parameters
    ----------
    X : array-like
        Data set; rows (first axis) are the samples.
    k : int
        Number of centroids to draw.
    rng : numpy.random.Generator or numpy.random.RandomState, optional
        Random source to draw from. When omitted, the global ``np.random``
        state is used, so a prior ``np.random.seed(...)`` call still controls
        the result.

    Returns
    -------
    numpy.ndarray
        The k selected rows (a copy, produced by integer-array indexing).

    Raises
    ------
    ValueError
        If k is larger than the number of samples, since rows are drawn
        without replacement.
    """
    X = np.asarray(X)
    n_samples = X.shape[0]
    if k > n_samples:
        raise ValueError(
            f"Cannot pick k={k} distinct centroids from {n_samples} samples"
        )

    # Fall back to the legacy global RNG so existing seeded notebooks keep
    # producing exactly the same draw.
    sampler = np.random if rng is None else rng
    indices = sampler.choice(n_samples, k, replace=False)
    return X[indices]

# Fix the global NumPy seed first so the random centroid draw is repeatable
np.random.seed(42)

# Number of clusters to fit
k = 3

# Draw the k starting centroids from the data
centroids = initialize_centroids(X, k)

print("Initial Centroids:\n", centroids)

Initial Centroids:
 [[6.1 2.8 4.7 1.2]
 [5.7 3.8 1.7 0.3]
 [7.7 2.6 6.9 2.3]]


In [38]:
def assign_clusters(X, centroids):
    """Assign every data point to its nearest centroid (Euclidean distance).

    Parameters
    ----------
    X : array-like
        Data set; rows (first axis) are the points.
    centroids : array-like
        Current centroids; rows (first axis) are the centroids.

    Returns
    -------
    numpy.ndarray
        Integer array with one entry per point holding the index of the
        closest centroid. Ties go to the lowest centroid index. An empty X
        yields an empty *integer* array.

    Raises
    ------
    ValueError
        If X is non-empty but no centroids are supplied.
    """
    X = np.asarray(X, dtype=float)
    centroids = np.asarray(centroids, dtype=float)

    if X.shape[0] == 0:
        # Keep an integer dtype so the result still works with np.bincount
        # and as an index array.
        return np.empty(0, dtype=np.intp)
    if centroids.shape[0] == 0:
        raise ValueError("assign_clusters needs at least one centroid")

    # Flatten each point/centroid to a vector so 1-D data also works.
    points = X.reshape(X.shape[0], -1)
    centers = centroids.reshape(centroids.shape[0], -1)

    # Pairwise squared distances via broadcasting: result[p, c] is the squared
    # distance from point p to centroid c. The square root is skipped because
    # it is monotonic and does not change which centroid is nearest.
    sq_dist = np.sum((points[:, None, :] - centers[None, :, :]) ** 2, axis=2)
    return np.argmin(sq_dist, axis=1)

def update_centroids(X, labels, k, prev_centroids=None):
    """Recompute each centroid as the mean of the points assigned to it.

    Parameters
    ----------
    X : array-like
        Data set; rows (first axis) are the points.
    labels : array-like of int
        Cluster index of every point, aligned with the rows of X.
    k : int
        Number of clusters.
    prev_centroids : array-like, optional
        Centroids from the previous iteration. When given, a cluster that
        received no points keeps its previous centroid.

    Returns
    -------
    numpy.ndarray
        Float array with k rows, one centroid per cluster.

    Notes
    -----
    A cluster with no points is never moved to the origin. Without
    ``prev_centroids`` it is re-seeded on a data point far from the overall
    data mean (the farthest point for the first empty cluster, the next
    farthest for the second, and so on). A zero vector is only returned when
    X itself has no rows.
    """
    X = np.asarray(X, dtype=float)
    # Convert first: comparing a plain list with `== i` yields a single False
    # instead of a per-point mask, which would mark every cluster as empty.
    labels = np.asarray(labels)

    new_centroids = np.zeros((k,) + X.shape[1:])
    empty_clusters = []
    for i in range(k):
        members = X[labels == i]
        if len(members) > 0:
            new_centroids[i] = members.mean(axis=0)
        else:
            empty_clusters.append(i)

    if empty_clusters:
        if prev_centroids is not None:
            previous = np.asarray(prev_centroids, dtype=float)
            new_centroids[empty_clusters] = previous[empty_clusters]
        elif len(X) > 0:
            # Deterministic re-seed: rank points by squared distance from the
            # global mean and hand them out farthest-first.
            flat = X.reshape(len(X), -1)
            spread = np.sum((flat - flat.mean(axis=0)) ** 2, axis=1)
            farthest_first = np.argsort(-spread, kind="stable")
            for rank, i in enumerate(empty_clusters):
                new_centroids[i] = X[farthest_first[rank % len(X)]]

    return new_centroids

In [39]:
# Upper bound on the number of assign/update rounds
max_iters = 100

for i in range(max_iters):
    # 1. Assign points to nearest centroid
    labels = assign_clusters(X, centroids)

    # 2. Calculate new centroids
    new_centroids = update_centroids(X, labels, k)

    # 3. Check for convergence: stop once the centroids no longer move
    #    (np.allclose compares them within its default tolerances)
    if np.allclose(centroids, new_centroids):
        print(f"Converged at iteration {i}")
        break

    centroids = new_centroids
else:
    # The loop ran out of iterations without converging. The centroids were
    # updated after the last assignment, so recompute the labels to keep them
    # consistent with the centroids reported below (this also defines
    # `labels` when max_iters is 0).
    labels = assign_clusters(X, centroids)
    print(f"Did not converge within {max_iters} iterations")

print("Training completed.")
print("Final Centroids:\n", centroids)
print("First 20 labels:", labels[:20])

Converged at iteration 5
Training completed.
Final Centroids:
 [[5.9016129  2.7483871  4.39354839 1.43387097]
 [5.006      3.428      1.462      0.246     ]
 [6.85       3.07368421 5.74210526 2.07105263]]
First 20 labels: [1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1]


In [40]:
def get_reference_labels(clusters, true_labels):
    """Relabel every cluster with the most frequent true label among its members.

    Parameters
    ----------
    clusters : array-like of int
        Cluster id assigned to each sample.
    true_labels : array-like of non-negative int
        Ground-truth class of each sample, aligned with ``clusters``
        (non-negative integers are required by ``np.bincount``).

    Returns
    -------
    numpy.ndarray
        Array shaped like ``true_labels`` in which each sample carries the
        majority true label of the cluster it belongs to. When several labels
        are equally frequent, the smallest one wins (``argmax`` takes the
        first maximum).
    """
    clusters = np.asarray(clusters)
    true_labels = np.asarray(true_labels)
    reference_labels = np.zeros_like(true_labels)

    # Iterate over the cluster ids actually present instead of relying on a
    # notebook-global `k`; ids outside range(k) would otherwise stay mapped
    # to class 0.
    for cluster_id in np.unique(clusters):
        members = clusters == cluster_id
        reference_labels[members] = np.bincount(true_labels[members]).argmax()

    return reference_labels

def calculate_metrics_detailed(y_true, y_pred):
    """Compute accuracy, error rate and macro-averaged recall.

    Parameters
    ----------
    y_true : array-like
        Ground-truth class of each sample.
    y_pred : array-like
        Predicted class of each sample, aligned with ``y_true``.

    Returns
    -------
    tuple
        ``(accuracy, error, avg_recall)`` where ``accuracy`` is the fraction
        of samples predicted correctly, ``error`` is ``1 - accuracy`` and
        ``avg_recall`` is the unweighted mean of the per-class recalls over
        the classes present in ``y_true``.

    Raises
    ------
    ValueError
        If the inputs are empty or their shapes differ.
    """
    # Convert first: comparing a plain list with `== cls` yields a single
    # False, which would silently report zero accuracy and recall.
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)

    if y_true.shape != y_pred.shape:
        raise ValueError(
            f"y_true and y_pred must have the same shape, "
            f"got {y_true.shape} and {y_pred.shape}"
        )
    if y_true.size == 0:
        raise ValueError("Cannot compute metrics on empty label arrays")

    hits = y_true == y_pred

    # Accuracy: correct predictions over all samples.
    accuracy = hits.mean()
    error = 1 - accuracy

    # Recall of a class = TP / (TP + FN) = share of its true members that
    # were predicted correctly. Every class comes from y_true, so each
    # denominator is at least 1.
    class_recalls = [hits[y_true == cls].mean() for cls in np.unique(y_true)]
    avg_recall = np.mean(class_recalls)

    return accuracy, error, avg_recall

# Cluster ids are arbitrary, so first translate each cluster into the majority
# true class of its members; this defines `predicted_labels`, which the cell
# needs in order to run on a fresh kernel.
predicted_labels = get_reference_labels(labels, y)

# Score the mapped predictions against the ground truth
acc, err, rec = calculate_metrics_detailed(y, predicted_labels)

print(f"Accuracy: {acc * 100:.2f}%")
print(f"Error Rate: {err * 100:.2f}%")
print(f"Recall: {rec * 100:.2f}%")

Accuracy: 89.33%
Error Rate: 10.67%
Recall: 89.33%
