In [91]:
import numpy as np
import pandas as pd
import random as rd

In [92]:
class Kmeans:
    """K-means clustering (Lloyd's algorithm) seeded with k-means++.

    Attributes:
        X: ``(m, n_features)`` float array of samples.
        K: number of clusters.
        m: number of samples.
        Centroids: ``(n_features, K)`` array holding one centroid per column.
        Output: dict mapping cluster id ``1..K`` to the ``(n_k, n_features)``
            array of samples assigned to that cluster in the last iteration
            of :meth:`fit`. Empty until :meth:`fit` has run an iteration.
    """

    def __init__(self, X, K):
        """Store the data and the requested number of clusters.

        Args:
            X: 2-D array-like of shape ``(m, n_features)``.
            K: number of clusters, at least 1.

        Raises:
            ValueError: if ``X`` is not 2-D or ``K`` is smaller than 1.
        """
        # Work in float so that centroid means are never truncated to integers.
        self.X = np.asarray(X, dtype=float)
        if self.X.ndim != 2:
            raise ValueError("X must be a 2-D array of shape (n_samples, n_features)")
        if K < 1:
            raise ValueError("K must be at least 1")
        self.Output = {}
        self.Centroids = np.empty((self.X.shape[1], 0))
        self.K = K
        self.m = self.X.shape[0]

    def kmeanspp(self, X, K):
        """Pick ``K`` initial centroids from ``X`` with k-means++ seeding.

        The first centroid is drawn uniformly; every further centroid is drawn
        with probability proportional to the squared distance between a point
        and its *nearest* already-chosen centroid.

        Args:
            X: 2-D array-like of shape ``(m, n_features)``.
            K: number of centroids to pick.

        Returns:
            Array of shape ``(n_features, K)``, one centroid per column.
        """
        X = np.asarray(X, dtype=float)
        n_points = X.shape[0]

        first = rd.randint(0, n_points - 1)
        chosen = [first]
        # Squared distance from every point to its nearest chosen centroid.
        nearest_sq = np.sum((X - X[first]) ** 2, axis=1)

        for _ in range(1, K):
            cumulative = np.cumsum(nearest_sq)
            total = cumulative[-1]
            if total > 0:
                # Inverse-CDF sampling: first index whose cumulative weight
                # exceeds the draw. Points with zero weight are never hit.
                idx = int(np.searchsorted(cumulative, rd.random() * total, side="right"))
                idx = min(idx, n_points - 1)  # guard against float round-off
            else:
                # Every point coincides with a chosen centroid; any pick is equivalent.
                idx = rd.randint(0, n_points - 1)
            chosen.append(idx)
            nearest_sq = np.minimum(nearest_sq, np.sum((X - X[idx]) ** 2, axis=1))

        return X[chosen].T

    def fit(self, n_iter):
        """Run ``n_iter`` assignment/update iterations.

        Updates ``self.Centroids`` in place and stores the final assignment in
        ``self.Output``. A cluster that receives no points keeps its previous
        centroid.

        Args:
            n_iter: number of Lloyd iterations to perform.
        """
        # k-means++ initialisation of the centroids
        self.Centroids = self.kmeanspp(self.X, self.K)

        for _ in range(n_iter):
            # (m, K) matrix of squared Euclidean distances to every centroid.
            diffs = self.X[:, None, :] - self.Centroids.T[None, :, :]
            sq_dist = np.sum(diffs ** 2, axis=2)
            assignment = np.argmin(sq_dist, axis=1)

            # Group the samples per cluster (ids are 1-based) and move centroids.
            clusters = {}
            for k in range(self.K):
                members = self.X[assignment == k]
                clusters[k + 1] = members
                if members.shape[0] > 0:
                    self.Centroids[:, k] = members.mean(axis=0)

            self.Output = clusters

    def predict(self):
        """Return ``(Output, centroids)`` with centroids as a ``(K, n_features)`` array."""
        return self.Output, self.Centroids.T

    def WCSS(self):
        """Return the within-cluster sum of squared distances to the centroids.

        Raises:
            RuntimeError: if no clustering has been computed yet.
        """
        if not self.Output:
            raise RuntimeError("WCSS() requires fit() to have run at least one iteration")
        wcss = 0
        for k in range(self.K):
            wcss += np.sum((self.Output[k + 1] - self.Centroids[:, k]) ** 2)
        return wcss

In [93]:
# Load the Iris table and keep the measurements apart from the species labels.
data = pd.read_csv('iris.csv')
labels = data['Species']
features = data.drop(['Id', 'Species'], axis=1)
print(features.head())

In [94]:
# Standardise each feature column: subtract its mean, divide by its standard deviation.
features = features.sub(features.mean()).div(features.std())
print(features.describe())

In [95]:
# Kmeans draws its initial centroids from the stdlib `random` module (`rd`);
# seed it so the clustering, and everything derived from it, is reproducible.
rd.seed(42)

K = 3
kmeans = Kmeans(features.values, K)
kmeans.fit(n_iter=100)
print(kmeans.WCSS())

In [101]:
# `output` is the dict of per-cluster sample arrays keyed 1..K (Kmeans.Output);
# `centroids` is the transposed centroid matrix, i.e. one centroid per row.
output, centroids = kmeans.predict()
def select_closest_samples(X, output, centroids, n_samples=25):
    """Pick, for every cluster, the samples that lie closest to its centroid.

    Args:
        X: ``(m, n_features)`` array of all samples.
        output: the cluster assignment, in either of two forms:
            * a dict mapping the 1-based cluster id ``k + 1`` to the
              ``(n_k, n_features)`` array of that cluster's samples (the form
              returned by ``Kmeans.predict()``), or
            * a length-``m`` array of 0-based cluster labels aligned with ``X``.
        centroids: ``(K, n_features)`` array, one centroid per row.
        n_samples: maximum number of samples to keep per cluster.

    Returns:
        Array with the selected samples stacked cluster by cluster, each
        cluster ordered from nearest to farthest. Empty clusters contribute
        nothing.
    """
    X = np.asarray(X)
    centroids = np.asarray(centroids)
    selected_samples = []
    for k in range(len(centroids)):
        if isinstance(output, dict):
            # Comparing a dict with an int is always False, so the dict form
            # has to be looked up by its 1-based key instead of masked.
            members = output.get(k + 1)
            if members is None:
                cluster_points = np.empty((0, centroids.shape[1]))
            else:
                cluster_points = np.asarray(members)
        else:
            cluster_points = X[np.asarray(output) == k]
        if cluster_points.shape[0] == 0:
            continue
        distances = np.linalg.norm(cluster_points - centroids[k].reshape(1, -1), axis=1)
        # Stable sort keeps the original sample order among equidistant points.
        closest_indices = np.argsort(distances, kind="stable")[:n_samples]
        selected_samples.extend(cluster_points[closest_indices])
    return np.array(selected_samples)

In [104]:
selected_samples = select_closest_samples(features.values, output, centroids)

# Map every selected sample back to a row of `features`. Each row may be used
# only once: with a plain "first match" lookup, duplicate feature rows would all
# resolve to the same index and the duplicates would leak into the test split.
feature_matrix = features.values
row_available = np.ones(feature_matrix.shape[0], dtype=bool)
matched_rows = []
for sample in selected_samples:
    candidates = np.flatnonzero(row_available & (feature_matrix == sample).all(axis=1))
    if candidates.size == 0:
        raise ValueError("selected sample does not match any unused row of `features`")
    matched_rows.append(candidates[0])
    row_available[candidates[0]] = False
# Explicit integer dtype keeps positional indexing valid even for an empty selection.
selected_indices = np.array(matched_rows, dtype=int)

selected_labels = labels.iloc[selected_indices].values
print(f"Number of selected samples: {len(selected_samples)}")
print(f"Number of selected labels: {len(selected_labels)}")


In [98]:
from sklearn.model_selection import train_test_split

# A fixed random_state makes the train/validation partition reproducible.
X_train, X_val, y_train, y_val = train_test_split(
    selected_samples, selected_labels, test_size=0.3, random_state=42
)

# Every row that was not picked as a cluster representative becomes test data.
# setdiff1d returns the sorted complement without scanning the index array once per row.
remaining_indices = np.setdiff1d(
    np.arange(features.shape[0]), np.asarray(selected_indices, dtype=int)
).tolist()
X_test = features.values[remaining_indices]
y_test = labels.iloc[remaining_indices].values

In [76]:
class LogisticRegression:
    """Logistic regression trained with batch gradient descent.

    Two target formats are supported by :meth:`fit`:

    * numeric targets whose values all lie in ``[0, 1]`` train a single binary
      model; :meth:`predict` then returns a list of ``0``/``1``;
    * any other targets (strings, more than two classes, ...) train one
      one-vs-rest model per distinct label; :meth:`predict` then returns a list
      of the original labels.

    Attributes:
        learning_rate: gradient-descent step size.
        num_iterations: number of gradient-descent steps.
        weights: ``(n_features,)`` in binary mode, ``(n_features, n_classes)``
            in one-vs-rest mode; ``None`` before fitting.
        bias: scalar in binary mode, ``(n_classes,)`` in one-vs-rest mode;
            ``None`` before fitting.
        classes_: sorted distinct labels in one-vs-rest mode, else ``None``.
    """

    def __init__(self, learning_rate=0.01, num_iterations=1000):
        self.learning_rate = learning_rate
        self.num_iterations = num_iterations
        self.weights = None
        self.bias = None
        self.classes_ = None

    def sigmoid(self, z):
        """Element-wise logistic function ``1 / (1 + exp(-z))``."""
        # exp() overflowing to inf still yields the correct limit 0.0; only
        # the RuntimeWarning is suppressed.
        with np.errstate(over="ignore"):
            return 1 / (1 + np.exp(-z))

    def _gradient_descent(self, X, targets):
        """Fit weights/bias for ``targets`` of shape ``(n,)`` or ``(n, n_classes)``."""
        num_samples = X.shape[0]
        weights = np.zeros((X.shape[1],) + targets.shape[1:])
        bias = np.zeros(targets.shape[1:])
        for _ in range(self.num_iterations):
            error = self.sigmoid(np.dot(X, weights) + bias) - targets
            dw = (1 / num_samples) * np.dot(X.T, error)
            db = (1 / num_samples) * np.sum(error, axis=0)
            weights -= self.learning_rate * dw
            bias -= self.learning_rate * db
        return weights, bias

    def fit(self, X, y):
        """Train the model.

        Args:
            X: ``(n_samples, n_features)`` array-like of features.
            y: length-``n_samples`` array-like of targets (see class docstring).

        Raises:
            ValueError: if the shapes are inconsistent or there are no samples.
        """
        X = np.asarray(X, dtype=float)
        y = np.asarray(y)
        if X.ndim != 2:
            raise ValueError("X must be a 2-D array of shape (n_samples, n_features)")
        if y.ndim != 1 or y.shape[0] != X.shape[0]:
            raise ValueError("y must be 1-D with one target per row of X")
        if X.shape[0] == 0:
            raise ValueError("cannot fit on an empty data set")

        is_numeric = np.issubdtype(y.dtype, np.number) or np.issubdtype(y.dtype, np.bool_)
        if is_numeric and y.min() >= 0 and y.max() <= 1:
            # Targets are already usable as probabilities: single binary model.
            self.classes_ = None
            weights, bias = self._gradient_descent(X, y.astype(float))
            self.weights = weights
            self.bias = float(bias)
        else:
            # Arbitrary labels: one indicator column, and one model, per class.
            self.classes_ = np.unique(y)
            indicators = (y[:, None] == self.classes_[None, :]).astype(float)
            self.weights, self.bias = self._gradient_descent(X, indicators)

    def predict(self, X):
        """Predict a label for every row of ``X``.

        Returns:
            A list: ``0``/``1`` in binary mode (``1`` when the probability
            exceeds 0.5), otherwise the label of the highest-scoring
            one-vs-rest model.

        Raises:
            RuntimeError: if the model has not been fitted.
        """
        if self.weights is None:
            raise RuntimeError("predict() called before fit()")
        X = np.asarray(X, dtype=float)
        y_predicted = self.sigmoid(np.dot(X, self.weights) + self.bias)
        if self.classes_ is None:
            return [1 if p > 0.5 else 0 for p in y_predicted]
        return self.classes_[np.argmax(y_predicted, axis=1)].tolist()

In [77]:
# Train the classifier on the labelled cluster representatives.
model = LogisticRegression(num_iterations=1000, learning_rate=0.01)
model.fit(X=X_train, y=y_train)

In [79]:
# Fraction of validation samples whose predicted label equals the true label.
y_val_pred = model.predict(X_val)
val_accuracy = np.mean(np.asarray(y_val_pred) == y_val)
print(f'Validation Accuracy: {val_accuracy}')

In [99]:
# Fraction of held-out test samples whose predicted label equals the true label.
y_test_pred = model.predict(X_test)
test_accuracy = np.mean(np.asarray(y_test_pred) == y_test)
print(f'Test Accuracy: {test_accuracy}')