In [156]:
import numpy as np
from scipy.linalg import inv
from scipy.stats import multivariate_normal
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
from sklearn.naive_bayes import  GaussianNB

# Task 1.
Implementation (from scratch) of LDA, QDA and NB (Naive Bayes) methods for binary classification (classes 0 and 1).
## LDA  *Linear Discriminant Analysis*

In [157]:
class LDA:
    def __init__(self):        
        self.mean_0 = None
        self.mean_1 = None
        self.covariance = None
        self.prior_0 = None
        self.prior_1 = None
        self.intercept = None
        self.coefs = None

    def fit(self, X, y):
        X_0 = X[y == 0]
        X_1 = X[y == 1]

        self.mean_0 = np.mean(X_0, axis=0)
        self.mean_1 = np.mean(X_1, axis=0)

        self.covariance = (
            (X_0 - self.mean_0).T @ (X_0 - self.mean_0) +
            (X_1 - self.mean_1).T @ (X_1 - self.mean_1)
            )/(len(y)-2)

        self.prior_0 = len(X_0) / len(X)
        self.prior_1 = len(X_1) / len(X)
        
        self.intercept = -0.5 * (
            (self.mean_1 - self.mean_0).T.dot(inv(self.covariance)).dot(self.mean_1 + self.mean_0)
            ) + np.log(self.prior_1 / self.prior_0)
        
        self.coefs = inv(self.covariance).dot(self.mean_1 - self.mean_0)

    def predict_proba(self, Xtest):
        prob_class_0 = multivariate_normal.pdf(Xtest, self.mean_0, self.covariance)
        prob_class_1 = multivariate_normal.pdf(Xtest, self.mean_1, self.covariance)
        return (prob_class_1 * self.prior_1) / (prob_class_1 * self.prior_1 + prob_class_0 * self.prior_0)

    def predict(self, Xtest):
        return [1 if self.predict_proba(x) > 0.5 else 0 for x in Xtest]
    
    def get_params(self):
        return [
            ("Mean of class 0", self.mean_0),
            ("Mean of class 1", self.mean_1),
            ("Covariance", self.covariance),
            ("Coefficients", self.coefs),
            ("Intercept", self.intercept)
        ]

LDA code validation

In [158]:
X, y = generate_scheme_1(1000, 2, 1)
sklearn_LDA = LinearDiscriminantAnalysis(store_covariance=True)
sklearn_LDA.fit(X, y)
my_LDA = LDA()
my_LDA.fit(X, y)
print(f"My implementation: {my_LDA.predict_proba(np.array([[8, 2], [2, 1], [1, 1], [0.5, 0.5], [0, 1], [3, 3], [0, 2]]))}")
print(f"Scikit-learn implementation: \n{sklearn_LDA.predict_proba(np.array([[8, 2], [2, 1], [1, 1], [0.5, 0.5], [0, 1], [3, 3], [0, 2]]))}")

My implementation: [0.99995065 0.90151895 0.75435837 0.50033818 0.50744213 0.99633245
 0.76473734]
Scikit-learn implementation: 
[[4.93454934e-05 9.99950655e-01]
 [9.84810473e-02 9.01518953e-01]
 [2.45641635e-01 7.54358365e-01]
 [4.99661822e-01 5.00338178e-01]
 [4.92557869e-01 5.07442131e-01]
 [3.66754559e-03 9.96332454e-01]
 [2.35262657e-01 7.64737343e-01]]


## QDA *Quadratic Discriminant Analysis*

In [159]:
class QDA:
    def __init__(self):        
        self.mean_0 = None
        self.mean_1 = None
        self.covariance_0 = None
        self.covariance_1 = None
        self.prior_0 = None
        self.prior_1 = None
        # self.intercept = None
        # self.coefs = None

    def fit(self, X, y):
        X_0 = X[y == 0]
        X_1 = X[y == 1]

        self.mean_0 = np.mean(X_0, axis=0)
        self.mean_1 = np.mean(X_1, axis=0)

        self.covariance_0 = (X_0 - self.mean_0).T.dot(X_0 - self.mean_0) / (len(X_0)-1)
        self.covariance_1 = (X_1 - self.mean_1).T.dot(X_1 - self.mean_1) / (len(X_1)-1)
        
        self.prior_0 = len(X_0) / len(X)
        self.prior_1 = len(X_1) / len(X)
        
        # self.intercept = -0.5 * (
        #     (self.mean_1 - self.mean_0).T.dot(inv(self.covariance)).dot(self.mean_1 + self.mean_0)
        #     ) + np.log(self.prior_1 / self.prior_0)
        # 
        # self.coefs = inv(self.covariance).dot(self.mean_1 - self.mean_0)

    def predict_proba(self, Xtest):
        prob_class_0 = multivariate_normal.pdf(Xtest, self.mean_0, self.covariance_0)
        prob_class_1 = multivariate_normal.pdf(Xtest, self.mean_1, self.covariance_1)
        return (prob_class_1 * self.prior_1) / (prob_class_1 * self.prior_1 + prob_class_0 * self.prior_0)

    def predict(self, Xtest):
        return [1 if self.predict_proba(x) > 0.5 else 0 for x in Xtest]
    
    def get_params(self):
        return [
            ("Mean of class 0", self.mean_0),
            ("Mean of class 1", self.mean_1),
            ("Covariance of class 0", self.covariance_0),
            ("Covariance of class 1", self.covariance_1),
            # ("Coefficients", self.coefs),
            # ("Intercept", self.intercept)
        ]

QDA code validation

In [160]:
X, y = generate_scheme_1(1000, 2, 1)
sklearn_QDA = QuadraticDiscriminantAnalysis(store_covariance=True)
sklearn_QDA.fit(X, y)
my_QDA = QDA()
my_QDA.fit(X, y)
print(f"My implementation: {my_QDA.predict_proba(np.array([[8, 2], [2, 1], [1, 1], [0.5, 0.5], [0, 1], [3, 3], [0, 2]]))}")
print(f"Scikit-learn implementation: \n{sklearn_QDA.predict_proba(np.array([[8, 2], [2, 1], [1, 1], [0.5, 0.5], [0, 1], [3, 3], [0, 2]]))}")

My implementation: [0.99999242 0.89474588 0.75160241 0.52813823 0.53994963 0.99446766
 0.71673663]
Scikit-learn implementation: 
[[7.57846157e-06 9.99992422e-01]
 [1.05254122e-01 8.94745878e-01]
 [2.48397586e-01 7.51602414e-01]
 [4.71861775e-01 5.28138225e-01]
 [4.60050366e-01 5.39949634e-01]
 [5.53233699e-03 9.94467663e-01]
 [2.83263373e-01 7.16736627e-01]]


## Naive Bayes

In [161]:
class NB:
    def __init__(self):        
        self.mean_0 = None
        self.mean_1 = None
        self.covariance_0 = None
        self.covariance_1 = None
        self.prior_0 = None
        self.prior_1 = None
        # self.intercept = None
        # self.coefs = None

    def fit(self, X, y):
        X_0 = X[y == 0]
        X_1 = X[y == 1]

        self.mean_0 = np.mean(X_0, axis=0)
        self.mean_1 = np.mean(X_1, axis=0)

        self.covariance_0 = (X_0 - self.mean_0).T.dot(X_0 - self.mean_0) / (len(X_0)-1)
        self.covariance_1 = (X_1 - self.mean_1).T.dot(X_1 - self.mean_1) / (len(X_1)-1)
        
        self.prior_0 = len(X_0) / len(X)
        self.prior_1 = len(X_1) / len(X)
        
        self.y = y

    def predict_proba(self, Xtest):
        prob_class_0 = multivariate_normal.pdf(Xtest, self.mean_0, self.covariance_0) * len(self.y[self.y == 0]) / len(self.y)
        prob_class_1 = multivariate_normal.pdf(Xtest, self.mean_1, self.covariance_1) * len(self.y[self.y == 1]) / len(self.y)
        return prob_class_1 / (prob_class_0 + prob_class_1)

    def predict(self, Xtest):
        return [1 if self.predict_proba(x) > 0.5 else 0 for x in Xtest]
    
    def get_params(self):
        return [
            ("Mean of class 0", self.mean_0),
            ("Mean of class 1", self.mean_1),
            ("Covariance of class 0", self.covariance_0),
            ("Covariance of class 1", self.covariance_1),
            # ("Coefficients", self.coefs),
            # ("Intercept", self.intercept)
        ]

Naive Bayes code validation

In [162]:
X, y = generate_scheme_1(1000, 2, 1)
sklearn_NB = GaussianNB()
sklearn_NB.fit(X, y)
my_NB = NB()
my_NB.fit(X, y)
print(f"My implementation: {my_NB.predict_proba(np.array([[8, 2], [2, 1], [1, 1], [0.5, 0.5], [0, 1], [3, 3], [0, 2]]))}")
print(f"Scikit-learn implementation: \n{sklearn_NB.predict_proba(np.array([[8, 2], [2, 1], [1, 1], [0.5, 0.5], [0, 1], [3, 3], [0, 2]]))}")

My implementation: [0.99999994 0.90155687 0.48309541 0.0945518  0.08332751 0.99981984
 0.44587142]
Scikit-learn implementation: 
[[7.38474493e-08 9.99999926e-01]
 [1.02355510e-01 8.97644490e-01]
 [5.03708771e-01 4.96291229e-01]
 [8.90139669e-01 1.09860331e-01]
 [9.04106336e-01 9.58936640e-02]
 [2.03534748e-04 9.99796465e-01]
 [5.53224128e-01 4.46775872e-01]]


# Task 2. 
## Generating training and testing data.

In [164]:
def generate_scheme_1(n, p, a):
    y = np.random.binomial(1, 0.5, size=n)
    X_0 = np.random.normal(0, 1, size=(n, p))
    X_1 = np.random.normal(a, 1, size=(n, p))
    X = X_0 * (1 - y[:, np.newaxis]) + X_1 * y[:, np.newaxis]
    return X, y

In [165]:
def generate_scheme_2(n, a, rho):
    variance = 1
    mean_0 = np.zeros(2)
    cov_matrix_0 = [[variance, rho], [rho, variance]]
    X_0 = np.random.multivariate_normal(mean_0, cov_matrix_0, n)
    
    mean_1 = [a] * 2
    cov_matrix_1 = [[variance, -rho], [-rho, variance]]
    X_1 = np.random.multivariate_normal(mean_1, cov_matrix_1, n)
    
    y = np.random.binomial(1, 0.5, size=n)
    X = X_0 * (1 - y[:, np.newaxis]) + X_1 * y[:, np.newaxis]
    return X, y

## Compare LDA, QDA, and NB for both schemes 
Compute accuracy on the testing set.
Test for fixed value a = 2 and different values of ρ = 0, 0.1, 0.3, 0.5, 0.7, 0.9. 
Repeat the experiment for different train/test splits and generate boxplots showing the values of accuracy for each method and each value of the parameter ρ. 

In [168]:
X_scheme1, y_scheme1 = generate_scheme_1(1000, 2, 1)
X_scheme2, y_scheme2 = generate_scheme_2(1000, 2, 1)