### Machine Learning: Programming Exercise - 2


# Exercise 2.1

In [1]:
# 1. Write a function to generate an m+1 dimensional data set, of size n, consisting of m continuous independent
# variables (X) and one dependent binary variable (Y) defined as
# Y =
# (
# 1 if p(y = 1|~x) = 1
# 1+exp−~x.β~ > 0.5
# 0 otherwise
# where,
# • β is a random vector of dimensionality m + 1, representing the coefficients of the linear relationship
# between X and Y, and
# • ∀i ∈ [1, n], xi0 = 1
# To add noise to the labels (Y) generated, we assume a Bernoulli distribution with probability of success, θ,
# that determines whether or not the label generated, as above, is to be flipped. The larger the value of θ, the
# greater is the noise.
# The function should take the following parameters:
# • θ: The probability of flipping the label, Y
# • n: The size of the data set
# • m: The number of indepedent variables
# Output from the function should be:
# • X: An n × m numpy array of independent variable values (with a 1 in the first column)
# • Y : The n × 1 binary numpy array of output values
# • β: The random coefficients used to generate Y from X

In [2]:
import numpy as np

def generate_dataset(n, m, theta):
    # Step 1: Generate X (n × m array with first column as 1s)
    X = np.ones((n, m + 1))
    X[:, 1:] = np.random.randn(n, m)  # random values for independent variables
    
    # Step 2: Generate beta (random coefficients)
    beta = np.random.randn(m + 1)
    
    # Step 3: Calculate probability p(y=1|x)
    linear_combination = X.dot(beta)  # X.beta
    prob = 1 / (1 + np.exp(-linear_combination))  # Sigmoid function

    # Step 4: Generate binary Y using probability threshold of 0.5
    Y = np.where(prob > 0.5, 1, 0)
    
    # Step 5: Add noise (flip labels) using Bernoulli distribution
    noise = np.random.binomial(1, theta, n)
    Y = np.abs(Y - noise)  # Flip the labels based on noise
    
    return X, Y, beta

X, Y, beta = generate_dataset(n=100, m=3, theta=0.1)
print("X:", X)
print("Y:", Y)
print("Beta:", beta)


X: [[ 1.         -0.27329365 -1.05550119  0.15212318]
 [ 1.         -0.25045361  1.12212036 -1.19715069]
 [ 1.         -0.39695128  0.03661479 -1.33136218]
 [ 1.         -0.00808617 -0.04182455  2.82141115]
 [ 1.         -0.06940533 -0.12249763  0.02720888]
 [ 1.         -0.39008725  0.59292567 -0.14454713]
 [ 1.          0.44047834  0.64217533  0.61548929]
 [ 1.          1.31056003 -0.09592239 -0.38901027]
 [ 1.         -0.59931308  1.35583867  0.62423856]
 [ 1.         -0.97523765  1.08398632  0.47226829]
 [ 1.          0.15357714 -1.23240963  0.53901217]
 [ 1.          0.59662264  0.44799239  0.16329038]
 [ 1.          0.32427161 -0.82996629  1.23199848]
 [ 1.          0.77850915  1.00496193 -0.29395673]
 [ 1.         -1.18618473 -0.51910343 -0.73749938]
 [ 1.         -1.90554525  0.90299972  0.3902098 ]
 [ 1.         -0.5460051  -1.51301071 -0.70422384]
 [ 1.         -0.28971169 -0.01175759  0.8700295 ]
 [ 1.         -0.74243247 -0.20810552 -0.89117907]
 [ 1.          1.04424888  0

# Exercise 2.2

In [5]:
# 2 Write a function that learns the parameters of a logistic regression function given inputs
# • X: An n × m numpy array of independent variable values
# • Y : The n × 1 binary numpy array of output values
# • k: the number of iteractions (epochs)
# • τ : the threshold on change in Cost function value from the previous to current iteration
# • λ: the learning rate for Gradient Descent
# The function should implement the Gradient Descent algorithm as discussed in class that initialises β with
# random values and then updates these values in each iteraction by moving in the the direction defined by
# the partial derivative of the cost function with respect to each of the coefficients. The function should use
# only one loop that ends after a number of iterations (k) or a threshold on the change in cost function value
# (τ ).
# The output should be a m + 1 dimensional vector of coefficients and the final cost function value.


In [3]:
def sigmoid(z):
    return 1 / (1 + np.exp(-z))

def compute_cost(X, Y, beta):
    n = len(Y)
    pred = sigmoid(X.dot(beta))
    cost = - (1/n) * np.sum(Y * np.log(pred) + (1 - Y) * np.log(1 - pred))
    return cost

def logistic_regression(X, Y, k, t, lam): #t->tow lam->lambda
    n, m = X.shape
    beta = np.random.randn(m)  # Initialize β randomly
    prev_cost = float('inf')  # Set to a very large value initially
    
    for i in range(k):
        # Calculate predictions
        pred = sigmoid(X.dot(beta))
        
        # Compute gradient of the cost function
        gradient = (1/n) * X.T.dot(pred - Y)   #T refers to the transpose of the matrix X.
        
        # Update β using gradient descent rule
        beta = beta - lam * gradient
        
        # Calculate current cost
        cost = compute_cost(X, Y, beta)
        
        # Break if change in cost is smaller than the threshold
        if abs(prev_cost - cost) < t:
            break
        
        prev_cost = cost
    
    return beta, cost

beta_learned, final_cost = logistic_regression(X, Y, k=1000, t=1e-5, lam=0.01)
print("Learned beta:", beta_learned)
print("Final cost:", final_cost)


Learned beta: [ 0.70811004 -0.19954538  0.82317049  0.58766335]
Final cost: 0.502071261239914


# Exercise 2.3

In [9]:
# 3 Create a report investigating how different values of n and θ impact the ability for your logistic regression
# function to learn the coefficients, β, used to generate the output vector Y . Also include your derivation of
# the partial derivative of the cost function with respect to the parameters of the model.

In [4]:
n_values = [100, 500, 1000]
theta_values = [0.1, 0.2, 0.5]

for n in n_values:
    for theta in theta_values:
        X, Y, beta_true = generate_dataset(n, 3, theta)
        beta_learned, _ = logistic_regression(X, Y, k=1000, t=1e-5, lam=0.01)
        print(f"For n={n} and theta={theta}:")
        print("True beta:", beta_true)
        print("Learned beta:", beta_learned)
        print("\n")


For n=100 and theta=0.1:
True beta: [-0.32962422 -0.81516611 -0.83943129  0.99121101]
Learned beta: [-0.40591769 -0.39865733 -0.50437026  0.45344487]


For n=100 and theta=0.2:
True beta: [ 0.26547762  0.80167407 -0.20524437  1.11414271]
Learned beta: [-0.06120269  1.16178099  0.30361762  0.93823348]


For n=100 and theta=0.5:
True beta: [-1.09798409  0.00309739 -1.34560518 -0.37056647]
Learned beta: [ 0.21529847  0.33656673 -0.20763275  0.07624289]


For n=500 and theta=0.1:
True beta: [-1.61791698 -1.70085017 -1.6360979  -0.86240187]
Learned beta: [-0.82439696 -0.67168178 -0.81479798 -0.33838537]


For n=500 and theta=0.2:
True beta: [ 0.55562971  1.06385378  0.65189559 -0.18655418]
Learned beta: [ 0.25647295  0.78709422  0.5325284  -0.65575173]


For n=500 and theta=0.5:
True beta: [ 0.15664084 -0.31705474 -0.19251546 -0.47224244]
Learned beta: [-0.18851968  0.35432232 -0.36137937 -0.03835191]


For n=1000 and theta=0.1:
True beta: [-0.09317351 -0.02808011  0.53010252  0.54168171]
L

# Exercise 2.4

In [11]:
# 4 Add L1 and L2 regularization to the Logistic Regression cost function. How does this impact the models
# learnt? How does the choice of regularization constant impact the β vector learned?

In [5]:
def compute_cost_regularized(X, Y, beta, alpha, regularization='L2'):
    n = len(Y)
    pred = sigmoid(X.dot(beta))
    cost = - (1/n) * np.sum(Y * np.log(pred) + (1 - Y) * np.log(1 - pred))
    
    # L1 Regularization
    if regularization == 'L1':
        cost += alpha * np.sum(np.abs(beta))
    
    # L2 Regularization
    elif regularization == 'L2':
        cost += (alpha/2) * np.sum(beta**2)
    
    return cost

def logistic_regression_regularized(X, Y, k, t, lam, alpha, regularization='L2'):
    n, m = X.shape
    beta = np.random.randn(m)
    prev_cost = float('inf')
    
    for i in range(k):
        pred = sigmoid(X.dot(beta))
        gradient = (1/n) * X.T.dot(pred - Y)
        
        # Regularization terms added to gradient update
        if regularization == 'L1':
            gradient += alpha * np.sign(beta)  # Add L1 regularization gradient
        elif regularization == 'L2':
            gradient += alpha * beta  # Add L2 regularization gradient
        
        beta = beta - lam * gradient
        
        cost = compute_cost_regularized(X, Y, beta, alpha, regularization)
        
        if abs(prev_cost - cost) < t:
            break
        
        prev_cost = cost
    
    return beta, cost

beta_learned_L2, final_cost_L2 = logistic_regression_regularized(X, Y, k=1000, t=1e-5, lam=0.01, alpha=0.1, regularization='L2')
print("Learned beta with L2 regularization:", beta_learned_L2)


Learned beta with L2 regularization: [-0.10904501 -0.05453203  0.03698111  0.08082452]


# Exercise 2.5

In [None]:
# 5 Merge the linear regression code base created in Exercise 1 and the logistic regression code base created in
# this Excercise and create an object oriented code base that maximises reuse of code across the algorithms.


In [6]:
import numpy as np

# Base class for Regression
class BaseRegression:
    def __init__(self, learning_rate=0.01, epochs=1000, tolerance=1e-6):
        self.learning_rate = learning_rate
        self.epochs = epochs
        self.tolerance = tolerance
        self.beta = None
    
    def fit(self, X, Y):
        pass
    
    def predict(self, X):
        pass
    
    def gradient_descent(self, X, Y):
        n, m = X.shape
        self.beta = np.random.randn(m)
        prev_cost = float('inf')
        
        for iteration in range(self.epochs):
            Y_pred = self.predict(X)
            cost = self.compute_cost(Y, Y_pred)
            
            # Check for convergence
            if np.abs(prev_cost - cost) < self.tolerance:
                break
            prev_cost = cost
            
            # Compute the gradient
            gradient = (X.T @ (Y_pred - Y)) / n
            self.beta -= self.learning_rate * gradient
            
        return self.beta, cost

    def compute_cost(self, Y, Y_pred):
        pass

# Linear Regression Class inheriting BaseRegression
class LinearRegression(BaseRegression):
    def __init__(self, learning_rate=0.01, epochs=1000, tolerance=1e-6):
        super().__init__(learning_rate, epochs, tolerance)

    def predict(self, X):
        return X @ self.beta  # Linear prediction (dot product)

    def compute_cost(self, Y, Y_pred):
        # Mean squared error cost function for linear regression
        n = len(Y)
        cost = np.mean((Y_pred - Y) ** 2) / 2
        return cost

# Logistic Regression Class inheriting BaseRegression
class LogisticRegression(BaseRegression):
    def __init__(self, learning_rate=0.01, epochs=1000, tolerance=1e-6):
        super().__init__(learning_rate, epochs, tolerance)

    def sigmoid(self, z):
        return 1 / (1 + np.exp(-z))  # Sigmoid activation function

    def predict(self, X):
        return self.sigmoid(X @ self.beta)  # Logistic prediction

    def compute_cost(self, Y, Y_pred):
        # Cross-entropy cost function for logistic regression
        n = len(Y)
        cost = -np.mean(Y * np.log(Y_pred) + (1 - Y) * np.log(1 - Y_pred))
        return cost

# Generate dataset function (shared)
def generate_dataset_linear(sigma, n, m):
    X = np.hstack((np.ones((n, 1)), np.random.randn(n, m)))  # Add column for intercept
    beta = np.random.randn(m + 1)
    noise = np.random.normal(0, sigma, n)
    Y = X @ beta + noise
    return X, Y, beta

def generate_dataset_logistic(n, m):
    X = np.hstack((np.ones((n, 1)), np.random.randn(n, m)))
    beta = np.random.randn(m + 1)
    linear_combination = X @ beta
    prob = 1 / (1 + np.exp(-linear_combination))
    Y = np.where(prob > 0.5, 1, 0)
    return X, Y, beta

#Linear Regression
sigma = 1.0
n = 100
m = 5
X_lin, Y_lin, beta_true_lin = generate_dataset_linear(sigma, n, m)
linear_model = LinearRegression(learning_rate=0.01, epochs=1000, tolerance=1e-6)
beta_learned_lin, final_cost_lin = linear_model.gradient_descent(X_lin, Y_lin)
print("Linear Regression")
print("True Beta:", beta_true_lin)
print("Learned Beta:", beta_learned_lin)
print("Final Cost:", final_cost_lin)

#Logistic Regression
X_log, Y_log, beta_true_log = generate_dataset_logistic(n, m)
logistic_model = LogisticRegression(learning_rate=0.01, epochs=1000, tolerance=1e-6)
beta_learned_log, final_cost_log = logistic_model.gradient_descent(X_log, Y_log)
print("\nLogistic Regression")
print("True Beta:", beta_true_log)
print("Learned Beta:", beta_learned_log)
print("Final Cost:", final_cost_log)


Linear Regression
True Beta: [-0.58415645  1.07770459 -0.27891949 -1.1012186  -1.20653583 -0.81966296]
Learned Beta: [-0.59214292  1.03485403 -0.12117309 -1.09020499 -1.23258499 -0.75494973]
Final Cost: 0.37185026490464407

Logistic Regression
True Beta: [ 1.11219879  0.71721208 -0.10540705  0.96641242 -0.76874994  0.67392612]
Learned Beta: [ 1.53654913  0.86848006  0.1464446   1.08059923 -0.73115567  0.36276731]
Final Cost: 0.294610909780754
