### 1. Data Generation:
   - Generate two normal distributions each of size 30,000 with feature dimension=100.
   - You may choose arbitrary mean and standard deviation.
   - Label the first set of feature vectors as "+1" and the second set of feature vectors as either "-1" or "0."


In [None]:
import numpy as np

# Synthetic binary-classification data: two Gaussian clouds with unit variance,
# one centred at 0 (labelled 1) and one centred at 3 (labelled 0).
n_per_class, n_features = 30000, 100

X_1 = np.random.normal(loc=0.0, size=(n_per_class, n_features))
X_2 = np.random.normal(loc=3.0, size=(n_per_class, n_features))
X = np.concatenate([X_1, X_2])

# Column vectors so the labels can be appended as the last column of `data`.
positive_y = np.ones((n_per_class, 1))
negative_y = np.zeros((n_per_class, 1))
y = np.concatenate([positive_y, negative_y])

# Each row of `data` is [feature_0, ..., feature_99, label].
data = np.concatenate([X, y], axis=1)

data.shape

(60000, 101)

### 2. Data Splitting
- Split the generated data into training, validation, and test sets in an 8:1:1 ratio.
- You are expected to write the code from scratch (i.e., do not use the `train_test_split` function from sklearn).

In [None]:
# YOUR CODE HERE
def split_train_val_test(data, val_ratio=0.1, test_ratio=0.1):
    """Randomly partition the rows of ``data`` into train/validation/test subsets.

    The rows are visited in a random order drawn from ``np.random.permutation``.
    The first ``int(len(data) * val_ratio)`` rows of that order form the
    validation set, the next ``int(len(data) * test_ratio)`` rows form the test
    set, and every remaining row goes to the training set.

    Returns:
        A ``(train, val, test)`` tuple of arrays indexed out of ``data``.
    """
    n_rows = len(data)
    order = np.random.permutation(n_rows)

    # Cut points inside the shuffled order: [0, val_end) -> validation,
    # [val_end, test_end) -> test, [test_end, n_rows) -> training.
    val_end = int(n_rows * val_ratio)
    test_end = val_end + int(n_rows * test_ratio)

    return (
        data[order[test_end:]],
        data[order[:val_end]],
        data[order[val_end:test_end]],
    )

train_set, val_set, test_set = split_train_val_test(data)

# Sanity check: print each subset's size and whether it matches the 8:1:1 split.
for subset, expected_share in ((train_set, 0.8), (val_set, 0.1), (test_set, 0.1)):
    print(len(subset), len(subset) == expected_share * len(data))

48000 True
6000 True
6000 True


### 3. Classifier Implementation
- Implement Linear Classifiers using three different methods.
  - Random Method
  - Perceptron Method
  - Gradient Descent

In [None]:
class RandomLinearClassifier:
    """Linear classifier chosen by pure random search.

    ``train`` draws ``k`` random (weights, bias) candidates and keeps the one
    with the lowest misclassification rate on the training set. Each data set
    is a 2-D array whose first ``dim`` columns are features and whose column
    ``dim`` holds the label; predictions are 1.0 / 0.0, so labels are expected
    to use the same 0/1 encoding.

    Args:
        train_set: Rows used to score the random candidates.
        val_set: Stored but not used by this classifier.
        test_set: Rows scored by ``test``.
        k: Number of random candidates to try.
        dim: Number of feature columns.
    """

    def __init__(self, train_set, val_set=None, test_set=None, k=100, dim=100):
        self.train_set = train_set
        self.val_set = val_set
        self.test_set = test_set
        self.k = k
        self.dim = dim
        self.best_weights_and_bias = None

    def _predict(self, features, weights_and_bias):
        """Return 1.0 where the weighted sum is positive and 0.0 elsewhere."""
        weighted_sum = np.dot(features, weights_and_bias[:self.dim]) + weights_and_bias[self.dim]
        return (weighted_sum > 0).astype(float)

    def train(self):
        """Try ``k`` random hyperplanes and remember the one with the lowest error."""
        best_loss_score = np.inf
        features = self.train_set[:, :self.dim]
        labels = self.train_set[:, self.dim]
        for i in range(self.k):
            # No heuristic is followed for choosing weights. They are sampled
            # uniformly from [-1, 1) rather than [0, 1) so that hyperplanes with
            # negative coefficients are reachable too.
            weights_and_bias = 2.0 * np.random.rand(self.dim + 1) - 1.0
            predictions = self._predict(features, weights_and_bias)
            # Loss is the fraction of misclassified rows, so lower is better.
            # (Scoring with the fraction of *correct* rows while minimising
            # would keep the worst candidate.)
            loss_score = np.mean(predictions != labels)
            print(f"Epoch: {i}, Loss: {loss_score}")
            if loss_score < best_loss_score:
                best_loss_score = loss_score
                self.best_weights_and_bias = weights_and_bias

    def test(self):
        """Return ``(predictions, labels)`` for the test set using the best candidate."""
        features = self.test_set[:, :self.dim]
        labels = self.test_set[:, self.dim]
        predictions = self._predict(features, self.best_weights_and_bias)
        return predictions, labels

In [None]:
class PerceptronClassifier:
    """Linear classifier trained with the classic perceptron update rule.

    Each data set is a 2-D array whose first ``dim`` columns are features and
    whose column ``dim`` holds the label. The positive class must have a label
    greater than zero; any other label (``0`` or ``-1``) is treated as the
    negative class during training.

    Args:
        train_set: Rows used for training.
        val_set: Stored but not used by this classifier.
        test_set: Rows scored by ``test``.
        tau: Maximum number of passes over the training set.
        dim: Number of feature columns.
    """

    def __init__(self, train_set, val_set=None, test_set=None, tau=100, dim=100):
        self.train_set = train_set
        self.val_set = val_set
        self.test_set = test_set
        self.tau = tau
        self.dim = dim
        self.weights_and_bias = np.zeros(self.dim + 1)

    def train(self):
        """Run up to ``tau`` passes, updating the weights on every misclassified row."""
        features = self.train_set[:, :self.dim]
        labels = self.train_set[:, self.dim]

        # The perceptron rule needs labels in {-1, +1}. With a 0 label the
        # product y * weighted_sum is always 0 and the update y * x is a no-op,
        # so negative examples would never move the decision boundary.
        signed_labels = 2.0 * (labels > 0) - 1.0

        for _ in range(self.tau):
            mistakes = 0
            # Weights are getting checked for each training instance
            for x, y in zip(features, signed_labels):
                weighted_sum = np.dot(x, self.weights_and_bias[:self.dim]) + self.weights_and_bias[self.dim]
                if (y * weighted_sum) <= 0.0:
                    self.weights_and_bias[:self.dim] += (y * x)
                    self.weights_and_bias[self.dim] += y
                    mistakes += 1
            # A pass without any update means later passes cannot change the
            # weights either, so stopping here gives the same result sooner.
            if mistakes == 0:
                break

    def test(self):
        """Return ``(predictions, labels)`` for the test set.

        Predictions are 1.0 where the weighted sum is positive and 0.0
        elsewhere; labels are returned exactly as stored in the test set.
        """
        features = self.test_set[:, :self.dim]
        labels = self.test_set[:, self.dim]
        weighted_sum = np.dot(features, self.weights_and_bias[:self.dim]) + self.weights_and_bias[self.dim]
        predictions = (weighted_sum > 0).astype(float)
        return predictions, labels

In [None]:
class GradientBasedClassifier:
    """Logistic-regression classifier trained with full-batch gradient descent.

    Each data set is a 2-D array whose first ``dim`` columns are features and
    whose column ``dim`` holds a 0/1 label. Training minimises the mean
    cross-entropy plus an L2 penalty ``(regularizer / 2) * ||w||^2`` on the
    weights (the bias is not penalised). It stops once the monitored loss
    changes by less than ``epsilon`` between two consecutive updates.

    Args:
        train_set: Rows used to compute the gradients.
        val_set: Rows used to monitor the loss for the stopping rule. When it
            is ``None`` the training set is monitored instead.
        test_set: Rows scored by ``test``.
        lr: Gradient-descent step size.
        epsilon: Stopping threshold on the absolute change of the monitored loss.
        regularizer: L2 penalty coefficient.
        dim: Number of feature columns.
        k_fold: Accepted for backward compatibility; it has no effect.
    """

    def __init__(self, train_set, val_set=None, test_set=None, lr=0.01, epsilon=0.0002, regularizer=0.1, dim=100, k_fold=False):
        self.train_set = train_set
        self.val_set = val_set
        self.test_set = test_set
        self.lr = lr
        self.epsilon = epsilon
        self.regularizer = regularizer
        self.dim = dim
        self.weights_and_bias = np.random.rand(self.dim + 1)

    def sigmoid(self, z):
        """Return the logistic function of ``z``, clipped to [1e-15, 1 - 1e-15].

        The output clip keeps ``log(p)`` and ``log(1 - p)`` finite in the loss.
        """
        # np.exp overflows to inf for z above roughly 709, which would turn
        # exp(z) / (1 + exp(z)) into inf / inf = nan. Bounding z first avoids
        # that. The bound does not change any result: the output clip already
        # saturates for |z| above about 34.5.
        exp_z = np.exp(np.clip(z, -500.0, 500.0))
        return np.clip(exp_z / (1 + exp_z), 1e-15, 1 - 1e-15)

    def train(self):
        """Run gradient descent until the monitored loss stops changing."""
        features = self.train_set[:, :self.dim]
        labels = self.train_set[:, self.dim]

        # Loss before the first update. Afterwards the "before" loss of an
        # iteration is the "after" loss of the previous one, so it is carried
        # over instead of being recomputed.
        previous_loss = self.eval()

        i = 0
        while True:
            weighted_sum = np.dot(features, self.weights_and_bias[:self.dim]) + self.weights_and_bias[self.dim]
            predictions = self.sigmoid(weighted_sum)
            residuals = predictions - labels
            weight_gradient = (np.dot(features.T, residuals) / len(labels)) + (self.regularizer * self.weights_and_bias[:self.dim])
            bias_gradient = np.mean(residuals)

            self.weights_and_bias[:self.dim] -= (self.lr * weight_gradient)
            self.weights_and_bias[self.dim] -= (self.lr * bias_gradient)

            updated_loss = self.eval()

            # For keeping better track of training progress
            if i % 100 == 0:
                print(f"Epoch: {i}, Loss: {updated_loss}")

            # Training should be stopped once the loss difference is close to zero and less than epsilon
            if abs(updated_loss - previous_loss) < self.epsilon:
                break
            previous_loss = updated_loss
            i += 1

    def test(self):
        """Return ``(predictions, labels)`` for the test set, thresholding at 0.5."""
        features = self.test_set[:, :self.dim]
        labels = self.test_set[:, self.dim]
        weighted_sum = np.dot(features, self.weights_and_bias[:self.dim]) + self.weights_and_bias[self.dim]
        predictions = self.sigmoid(weighted_sum)
        # Based on problem type, different thresholds can be used
        predictions = (predictions >= 0.5).astype(float)
        return predictions, labels

    def eval(self):
        """Return the regularised cross-entropy loss on the monitored set.

        The monitored set is ``val_set`` when one was given and ``train_set``
        otherwise, so a classifier built without validation data can still be
        trained.
        """
        monitored_set = self.val_set if self.val_set is not None else self.train_set
        features = monitored_set[:, :self.dim]
        labels = monitored_set[:, self.dim]
        weighted_sum = np.dot(features, self.weights_and_bias[:self.dim]) + self.weights_and_bias[self.dim]
        predictions = self.sigmoid(weighted_sum)
        loss = -np.mean((labels * np.log(predictions)) + ((1 - labels) * np.log(1 - predictions))) + ((self.regularizer / 2.0) * (np.linalg.norm(self.weights_and_bias[:self.dim]) ** 2))
        return loss

### 4. Performance Evaluation
- Evaluate the performance of each method on the test set using various evaluation metrics such as accuracy, precision, recall, and F1-score.

In [None]:
def measure_performance(preds, labels):
    """Compute accuracy, precision, recall and F1-score for binary predictions.

    The positive class is ``1.0`` and the negative class is ``0.0``.

    Args:
        preds: Iterable of predicted labels.
        labels: Sized iterable of true labels, aligned with ``preds``.

    Returns:
        An ``(accuracy, precision, recall, f1_score)`` tuple. A metric whose
        denominator is zero (no predicted positives, no actual positives, or
        empty input) is reported as ``0.0`` instead of raising
        ``ZeroDivisionError``.
    """
    correct = 0
    tp = 0
    fp = 0
    fn = 0

    for pred, label in zip(preds, labels):
        if pred == label:
            correct += 1
            if pred == 1.0:
                tp += 1
        elif pred == 1.0 and label == 0.0:
            fp += 1
        elif pred == 0.0 and label == 1.0:
            fn += 1

    total = len(labels)
    accuracy = correct / total if total else 0.0
    # A classifier that never predicts the positive class (or data without any
    # positives) would otherwise divide by zero here.
    precision = tp / (tp + fp) if (tp + fp) else 0.0
    recall = tp / (tp + fn) if (tp + fn) else 0.0
    if precision + recall:
        f1_score = 2 * (precision * recall) / (precision + recall)
    else:
        f1_score = 0.0

    return accuracy, precision, recall, f1_score

In [None]:
#Random Linear Classifier
# Train the random-search classifier, score it on the test split and report
# one line per metric.
rlc = RandomLinearClassifier(train_set, val_set=val_set, test_set=test_set)
rlc.train()
predictions, labels = rlc.test()
accuracy, precision, recall, f1_score = measure_performance(predictions, labels)
for metric_label, metric_value in (
    ("Accuracy", accuracy),
    ("Precision", precision),
    ("Recall", recall),
    ("F-1 score", f1_score),
):
    print(f"{metric_label} for Random Linear Classifier: {metric_value}")

Epoch: 0, Loss: 0.2684166666666667
Epoch: 1, Loss: 0.2528125
Epoch: 2, Loss: 0.2775625
Epoch: 3, Loss: 0.2596041666666667
Epoch: 4, Loss: 0.25514583333333335
Epoch: 5, Loss: 0.26195833333333335
Epoch: 6, Loss: 0.2581875
Epoch: 7, Loss: 0.27295833333333336
Epoch: 8, Loss: 0.2605625
Epoch: 9, Loss: 0.2795
Epoch: 10, Loss: 0.2696875
Epoch: 11, Loss: 0.282375
Epoch: 12, Loss: 0.2800625
Epoch: 13, Loss: 0.2748125
Epoch: 14, Loss: 0.2683125
Epoch: 15, Loss: 0.28227083333333336
Epoch: 16, Loss: 0.24983333333333332
Epoch: 17, Loss: 0.2766875
Epoch: 18, Loss: 0.2720208333333333
Epoch: 19, Loss: 0.2532083333333333
Epoch: 20, Loss: 0.2713541666666667
Epoch: 21, Loss: 0.2685625
Epoch: 22, Loss: 0.2622291666666667
Epoch: 23, Loss: 0.2623125
Epoch: 24, Loss: 0.2634166666666667
Epoch: 25, Loss: 0.277125
Epoch: 26, Loss: 0.28539583333333335
Epoch: 27, Loss: 0.2622291666666667
Epoch: 28, Loss: 0.25425
Epoch: 29, Loss: 0.24933333333333332
Epoch: 30, Loss: 0.25729166666666664
Epoch: 31, Loss: 0.274333333

In [None]:
# Perceptron Classifier
# Train the perceptron, score it on the test split and report one line per metric.
pc = PerceptronClassifier(train_set, val_set=val_set, test_set=test_set)
pc.train()
predictions, labels = pc.test()
accuracy, precision, recall, f1_score = measure_performance(predictions, labels)
for metric_label, metric_value in (
    ("Accuracy", accuracy),
    ("Precision", precision),
    ("Recall", recall),
    ("F-1 score", f1_score),
):
    print(f"{metric_label} for Perceptron Classifier: {metric_value}")

Accuracy for Perceptron Classifier: 0.5008333333333334
Precision for Perceptron Classifier: 0.5008333333333334
Recall for Perceptron Classifier: 1.0
F-1 score for Perceptron Classifier: 0.6674069961132704


In [None]:
# Gradient Descent Based Classifier
# Train the gradient-descent classifier (the validation split drives its
# stopping rule), score it on the test split and report one line per metric.
gbc = GradientBasedClassifier(train_set, val_set=val_set, test_set=test_set)
gbc.train()
predictions, labels = gbc.test()
accuracy, precision, recall, f1_score = measure_performance(predictions, labels)
report_lines = (
    f"Accuracy for Gradient Descent Based Classifier: {accuracy}",
    f"Precision for Gradient Descent Based Classifier: {precision}",
    f"Recall for Gradient Descent Based Classifier: {recall}",
    f"F-1 score for Gradient Descent based Classifier: {f1_score}",
)
print("\n".join(report_lines))

Epoch: 0, Loss: 20.275146815899042
Epoch: 100, Loss: 0.7968716746469977
Epoch: 200, Loss: 0.6115766447155084
Epoch: 300, Loss: 0.46826736688042414
Epoch: 400, Loss: 0.36012644300367863
Epoch: 500, Loss: 0.28061129490096737
Epoch: 600, Loss: 0.22323222772533236
Epoch: 700, Loss: 0.1821379678680006
Epoch: 800, Loss: 0.15257588644540213
Accuracy for Gradient Descent Based Classifier: 0.9946666666666667
Precision for Gradient Descent Based Classifier: 1.0
Recall for Gradient Descent Based Classifier: 0.989351081530782
F-1 score for Gradient Descent based Classifier: 0.9946470391435263


## NOT COMPLETED: Cross Validation
- Implement cross validation on gradient descent based classifier model

# Cross Validation
Implement cross validation on gradient descent based classifier model

In [None]:
import math

def cross_validation(data, k=10):
    """Estimate GradientBasedClassifier performance with k-fold cross-validation.

    The rows of ``data`` are cut into ``k`` contiguous folds. Each fold is held
    out once for scoring while a fresh classifier is trained on the remaining
    rows. The rows are NOT shuffled here, so pass data that is already
    shuffled; otherwise class-ordered data produces single-class folds.

    Args:
        data: 2-D array of feature columns followed by one label column.
        k: Number of folds; must satisfy ``2 <= k <= len(data)``.

    Returns:
        A dict with the keys ``'accuracy'``, ``'precision'``, ``'recall'`` and
        ``'f1_score'``, each holding the mean of that metric over the folds.

    Raises:
        ValueError: If ``k`` is outside ``2 <= k <= len(data)``.
    """
    n_rows = data.shape[0]
    if k < 2 or k > n_rows:
        raise ValueError(f"k must be between 2 and the number of rows ({n_rows}), got {k}")

    # Integer fold boundaries spread any remainder rows across the folds, so
    # every row is held out exactly once even when n_rows is not divisible by k.
    boundaries = [(fold * n_rows) // k for fold in range(k + 1)]

    fold_scores = {
        'accuracy': list(),
        'precision': list(),
        'recall': list(),
        'f1_score': list()
    }

    for fold_starting_index, fold_ending_index in zip(boundaries[:-1], boundaries[1:]):
        train_set = np.concatenate((data[:fold_starting_index], data[fold_ending_index:]), axis=0)
        held_out_set = data[fold_starting_index:fold_ending_index]

        # The classifier's stopping rule needs a set to monitor the loss on.
        # The training rows are used for that so the held-out fold only ever
        # serves for the final scoring.
        gd_classifier = GradientBasedClassifier(train_set, val_set=train_set, test_set=held_out_set)
        gd_classifier.train()
        predictions, labels = gd_classifier.test()
        accuracy, precision, recall, f1_score = measure_performance(predictions, labels)
        fold_scores['accuracy'].append(accuracy)
        fold_scores['precision'].append(precision)
        fold_scores['recall'].append(recall)
        fold_scores['f1_score'].append(f1_score)

    performance_metrics = {
        metric: sum(scores) / len(scores) for metric, scores in fold_scores.items()
    }

    return performance_metrics


# `data` is ordered by class (all positive rows first, then all negative rows),
# so contiguous folds cut from it would each contain a single class. Shuffle
# the rows once and cross-validate on the shuffled copy.
shuffled_indices = np.random.permutation(len(data))
shuffled_row = shuffled_indices
shuffled_data = data[shuffled_row]

print(cross_validation(shuffled_data))