In [264]:
import numpy as np
from scipy.stats import multivariate_normal
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
from sklearn.naive_bayes import  GaussianNB
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
import pandas as pd
import plotly.express as px
import plotly.io as pio
import plotly.subplots as sp
import plotly.graph_objects as go
from pypdf import PdfWriter
from ucimlrepo import fetch_ucirepo 

#### Malwina Wojewoda
# Task 1.
Implementation (from scratch) of LDA, QDA and NB (Naive Bayes) methods for binary classification (classes 0 and 1).
## LDA  *Linear Discriminant Analysis*

In [265]:
class LDA:
    def __init__(self):        
        self.mean_0 = None
        self.mean_1 = None
        self.covariance = None
        self.prior_0 = None
        self.prior_1 = None

    def fit(self, X, y):
        X_0 = X[y == 0]
        X_1 = X[y == 1]

        self.mean_0 = np.mean(X_0, axis=0)
        self.mean_1 = np.mean(X_1, axis=0)

        self.covariance = (
            (X_0 - self.mean_0).T @ (X_0 - self.mean_0) +
            (X_1 - self.mean_1).T @ (X_1 - self.mean_1)
            )/(len(y)-2)

        self.prior_0 = len(X_0) / len(X)
        self.prior_1 = len(X_1) / len(X)

    def predict_proba(self, Xtest):
        prob_class_0 = multivariate_normal.pdf(Xtest, self.mean_0, self.covariance)
        prob_class_1 = multivariate_normal.pdf(Xtest, self.mean_1, self.covariance)
        return (prob_class_1 * self.prior_1) / (prob_class_1 * self.prior_1 + prob_class_0 * self.prior_0)

    def predict(self, Xtest):
        return [1 if self.predict_proba(x) > 0.5 else 0 for x in Xtest]
    
    def get_params(self):
        return [
            ("Mean of class 0", self.mean_0),
            ("Mean of class 1", self.mean_1),
            ("Covariance", self.covariance)
        ]

LDA code validation

In [266]:
X, y = generate_scheme_1(1000, 1)
sklearn_LDA = LinearDiscriminantAnalysis(store_covariance=True)
sklearn_LDA.fit(X, y)
my_LDA = LDA()
my_LDA.fit(X, y)
print(f"My implementation: {my_LDA.predict_proba(np.array([[8, 2], [2, 1], [1, 1], [0.5, 0.5], [0, 1], [3, 3], [0, 2]]))}")
print(f"Scikit-learn implementation: \n{sklearn_LDA.predict_proba(np.array([[8, 2], [2, 1], [1, 1], [0.5, 0.5], [0, 1], [3, 3], [0, 2]]))}")

My implementation: [0.99989752 0.88083673 0.72426362 0.48992168 0.48276751 0.99323917
 0.71267942]
Scikit-learn implementation: 
[[1.02476660e-04 9.99897523e-01]
 [1.19163272e-01 8.80836728e-01]
 [2.75736375e-01 7.24263625e-01]
 [5.10078317e-01 4.89921683e-01]
 [5.17232494e-01 4.82767506e-01]
 [6.76082930e-03 9.93239171e-01]
 [2.87320584e-01 7.12679416e-01]]


## QDA *Quadratic Discriminant Analysis*

In [267]:
class QDA:
    def __init__(self):        
        self.mean_0 = None
        self.mean_1 = None
        self.covariance_0 = None
        self.covariance_1 = None
        self.prior_0 = None
        self.prior_1 = None

    def fit(self, X, y):
        X_0 = X[y == 0]
        X_1 = X[y == 1]

        self.mean_0 = np.mean(X_0, axis=0)
        self.mean_1 = np.mean(X_1, axis=0)

        self.covariance_0 = (X_0 - self.mean_0).T.dot(X_0 - self.mean_0) / (len(X_0)-1)
        self.covariance_1 = (X_1 - self.mean_1).T.dot(X_1 - self.mean_1) / (len(X_1)-1)

        self.prior_0 = len(X_0) / len(X)
        self.prior_1 = len(X_1) / len(X)

    def predict_proba(self, Xtest):
        prob_class_0 = multivariate_normal.pdf(Xtest, self.mean_0, self.covariance_0)
        prob_class_1 = multivariate_normal.pdf(Xtest, self.mean_1, self.covariance_1)
        return (prob_class_1 * self.prior_1) / (prob_class_1 * self.prior_1 + prob_class_0 * self.prior_0)

    def predict(self, Xtest):
        return [1 if self.predict_proba(x) > 0.5 else 0 for x in Xtest]
    
    def get_params(self):
        return [
            ("Mean of class 0", self.mean_0),
            ("Mean of class 1", self.mean_1),
            ("Covariance of class 0", self.covariance_0),
            ("Covariance of class 1", self.covariance_1)
        ]

QDA code validation

In [268]:
X, y = generate_scheme_1(1000, 1)
sklearn_QDA = QuadraticDiscriminantAnalysis(store_covariance=True)
sklearn_QDA.fit(X, y)
my_QDA = QDA()
my_QDA.fit(X, y)
print(f"My implementation: {my_QDA.predict_proba(np.array([[8, 2], [2, 1], [1, 1], [0.5, 0.5], [0, 1], [3, 3], [0, 2]]))}")
print(f"Scikit-learn implementation: \n{sklearn_QDA.predict_proba(np.array([[8, 2], [2, 1], [1, 1], [0.5, 0.5], [0, 1], [3, 3], [0, 2]]))}")

My implementation: [0.99995657 0.86736854 0.72033457 0.50630147 0.5138332  0.99381749
 0.70622094]
Scikit-learn implementation: 
[[4.34267105e-05 9.99956573e-01]
 [1.32631464e-01 8.67368536e-01]
 [2.79665428e-01 7.20334572e-01]
 [4.93698528e-01 5.06301472e-01]
 [4.86166804e-01 5.13833196e-01]
 [6.18251150e-03 9.93817489e-01]
 [2.93779058e-01 7.06220942e-01]]


## Naive Bayes

In [269]:
class NB:
    def __init__(self):        
        self.mean_0 = None
        self.mean_1 = None
        self.covariance_0 = None
        self.covariance_1 = None
        self.prior_0 = None
        self.prior_1 = None

    def fit(self, X, y):
        X_0 = X[y == 0]
        X_1 = X[y == 1]

        self.mean_0 = np.mean(X_0, axis=0)
        self.mean_1 = np.mean(X_1, axis=0)

        self.covariance_0 = (X_0 - self.mean_0).T.dot(X_0 - self.mean_0) / (len(X_0)-1)
        self.covariance_1 = (X_1 - self.mean_1).T.dot(X_1 - self.mean_1) / (len(X_1)-1)

        self.prior_0 = len(X_0) / len(X)
        self.prior_1 = len(X_1) / len(X)
        
        self.y = y

    def predict_proba(self, Xtest):
        prob_class_0 = multivariate_normal.pdf(Xtest, self.mean_0, self.covariance_0) * len(self.y[self.y == 0]) / len(self.y)
        prob_class_1 = multivariate_normal.pdf(Xtest, self.mean_1, self.covariance_1) * len(self.y[self.y == 1]) / len(self.y)
        return prob_class_1 / (prob_class_0 + prob_class_1)

    def predict(self, Xtest):
        return [1 if self.predict_proba(x) > 0.5 else 0 for x in Xtest]
    
    def get_params(self):
        return [
            ("Mean of class 0", self.mean_0),
            ("Mean of class 1", self.mean_1),
            ("Covariance of class 0", self.covariance_0),
            ("Covariance of class 1", self.covariance_1)
        ]

Naive Bayes code validation

In [270]:
X, y = generate_scheme_1(1000, 1)
sklearn_NB = GaussianNB()
sklearn_NB.fit(X, y)
my_NB = NB()
my_NB.fit(X, y)

print(f"My implementation: {my_NB.predict_proba(np.array([[8, 2], [2, 1], [1, 1], [0.5, 0.5], [0, 1], [3, 3], [0, 2]]))}")
print(f"Scikit-learn implementation: \n{sklearn_NB.predict_proba(np.array([[8, 2], [2, 1], [1, 1], [0.5, 0.5], [0, 1], [3, 3], [0, 2]]))}")

My implementation: [0.99994143 0.88527537 0.73267049 0.50748422 0.49795311 0.99260896
 0.69975126]
Scikit-learn implementation: 
[[8.22118230e-05 9.99917788e-01]
 [1.17603608e-01 8.82396392e-01]
 [2.70063627e-01 7.29936373e-01]
 [4.94421362e-01 5.05578638e-01]
 [5.01992618e-01 4.98007382e-01]
 [8.93570198e-03 9.91064298e-01]
 [2.96729109e-01 7.03270891e-01]]


# Task 2. 
## Generating training and testing data.

In [271]:
def generate_scheme_1(n, a):
    y = np.random.binomial(1, 0.5, size=n)
    X_0 = np.random.normal(0, 1, size=(n, 2))
    X_1 = np.random.normal(a, 1, size=(n, 2))
    X = X_0 * (1 - y[:, np.newaxis]) + X_1 * y[:, np.newaxis]
    return X, y

In [272]:
def generate_scheme_2(n, a, rho):
    variance = 1
    mean_0 = np.zeros(2)
    cov_matrix_0 = [[variance, rho], [rho, variance]]
    X_0 = np.random.multivariate_normal(mean_0, cov_matrix_0, n)
    
    mean_1 = [a] * 2
    cov_matrix_1 = [[variance, -rho], [-rho, variance]]
    X_1 = np.random.multivariate_normal(mean_1, cov_matrix_1, n)
    
    y = np.random.binomial(1, 0.5, size=n)
    X = X_0 * (1 - y[:, np.newaxis]) + X_1 * y[:, np.newaxis]
    return X, y

## Compare LDA, QDA, and NB for both schemes for fixed value ρ=0.5 and different values of a = 0.1, 0.5, 1, 2, 3, 5
Compute accuracy on the testing set.
Repeat the experiment for different train/test splits and generate boxplots showing the values of accuracy for each method and each value of the parameter ρ. 

In [273]:
def first_experiment(a_values, scheme):
    results = {'classifier': [], 'a': [], 'accuracy': []}

    for a in a_values:
        if scheme == "scheme 1 for generating data":
            X, y = generate_scheme_1(1000, a)
        else:
            X, y = generate_scheme_2(1000, a, 0.5)
            
        for _ in range(20):
            X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

            lda = LDA()
            lda.fit(X_train, y_train)
            results['classifier'].append('LDA')
            results['a'].append(str(a))
            results['accuracy'].append(accuracy_score(y_test, lda.predict(X_test)))

            qda = QDA()
            qda.fit(X_train, y_train)
            results['classifier'].append('QDA')
            results['a'].append(str(a))
            results['accuracy'].append(accuracy_score(y_test, qda.predict(X_test)))

            nb = NB()
            nb.fit(X_train, y_train)
            results['classifier'].append('NB')
            results['a'].append(str(a))
            results['accuracy'].append(accuracy_score(y_test, nb.predict(X_test)))

    results_df = pd.DataFrame(results)
    fig = px.box(results_df, x='a', y='accuracy', color='classifier',
                 title=f'LDA, QDA, and NB accuracy for different values of paremeter a, <br>{scheme}',
                 labels={'a': 'value of parameter a', 'Accuracy': 'accuracy'},
                 category_orders={'classifier': ['LDA', 'QDA', 'NB']})
    fig.update_layout(legend_title_text='classifier', height=600, width=800)
    pio.write_image(fig, file=f'BayesianSimulatedData1-{scheme}.pdf', format='pdf')
    fig.show()

In [274]:
first_experiment(a_values = [0.1, 0.5, 1, 2, 3, 5], scheme='scheme 1 for generating data')

In [275]:
first_experiment(a_values = [0.1, 0.5, 1, 2, 3, 5], scheme='scheme 2 for generating data')

In [276]:
merger = PdfWriter()
merger.append("BayesianSimulatedData1-scheme 1 for generating data.pdf") 
merger.append("BayesianSimulatedData1-scheme 2 for generating data.pdf")
merger.write("BayesianSimulatedData1.pdf")
merger.close()

## Compare LDA, QDA, and NB for both schemes for fixed value a=2 and different values of ρ = 0, 0.1, 0.3, 0.5, 0.7, 0.9.
Compute accuracy on the testing set.  
Repeat the experiment for different train/test splits and generate boxplots showing the values of
accuracy for each method and each value of the parameter ρ. 
Save the results in the file BayesianSimulatedData2.pdf

In [277]:
def second_experiment(rho_values, scheme):
    results = {'classifier': [], 'rho': [], 'accuracy': []}

    for rho in rho_values:
        if scheme == "scheme 1 for generating data":
            X, y = generate_scheme_1(1000, 2)
        else:
            X, y = generate_scheme_2(1000, 2, rho)
            
        for _ in range(20):
            X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

            lda = LDA()
            lda.fit(X_train, y_train)
            results['classifier'].append('LDA')
            results['rho'].append(str(rho))
            results['accuracy'].append(accuracy_score(y_test, lda.predict(X_test)))

            qda = QDA()
            qda.fit(X_train, y_train)
            results['classifier'].append('QDA')
            results['rho'].append(str(rho))
            results['accuracy'].append(accuracy_score(y_test, qda.predict(X_test)))

            nb = NB()
            nb.fit(X_train, y_train)
            results['classifier'].append('NB')
            results['rho'].append(str(rho))
            results['accuracy'].append(accuracy_score(y_test, nb.predict(X_test)))

    results_df = pd.DataFrame(results)
    fig = px.box(results_df, x='rho', y='accuracy', color='classifier',
                 title=f'LDA, QDA, and NB accuracy for different values of paremeter rho,<br>{scheme}',
                 labels={'rho': 'value of parameter rho', 'Accuracy': 'accuracy'},
                 category_orders={'classifier': ['LDA', 'QDA', 'NB']})
    fig.update_layout(legend_title_text='classifier', height=600, width=800)
    pio.write_image(fig, file=f'BayesianSimulatedData2-{scheme}.pdf', format='pdf')
    fig.show()

In [278]:
second_experiment(rho_values = [0, 0.1, 0.3, 0.5, 0.7, 0.9], 
                  scheme='scheme 1 for generating data')

In [279]:
second_experiment(rho_values = [0, 0.1, 0.3, 0.5, 0.7, 0.9], 
                  scheme='scheme 2 for generating data')

In [280]:
merger = PdfWriter()
merger.append("BayesianSimulatedData2-scheme 1 for generating data.pdf") 
merger.append("BayesianSimulatedData2-scheme 2 for generating data.pdf")
merger.write("BayesianSimulatedData2.pdf")
merger.close()

## Scatter plot with curves that separate classes for LDA and QDA
For one chosen setting of parameters (e.g. a = 2, ρ = 0.5) generate a scatter plot showing observations from training set. 
Mark observations belonging to different classes using two different colors and two different symbols. 
Save the results in the file BayesianSimulatedData3.pdf


In [286]:
def third_experiment(X, y, scheme):
    lda = LDA()
    lda.fit(X, y)
    
    qda = QDA()
    qda.fit(X, y)
    
    fig = sp.make_subplots(rows=1, cols=2, subplot_titles=['LDA', 'QDA'])

    lda_trace = go.Scatter(
        x=X[y == 0, 0], y=X[y == 0, 1],
        mode='markers', marker=dict(color='deeppink', symbol='x', line=dict(color='black', width=1)),
        name='Class 0'
    )

    lda_trace_class_1 = go.Scatter(
        x=X[y == 1, 0], y=X[y == 1, 1],
        mode='markers', marker=dict(color='royalblue', symbol='diamond', line=dict(color='black', width=1)),
        name='Class 1'
    )

    xx_lda, yy_lda = np.meshgrid(np.linspace(X[:, 0].min(), X[:, 0].max(), 200),
                                np.linspace(X[:, 1].min(), X[:, 1].max(), 200))
    Z_lda = lda.predict(np.c_[xx_lda.ravel(), yy_lda.ravel()])
    Z_lda = np.array(Z_lda).reshape(xx_lda.shape)

    lda_decision_boundary = go.Contour(
        x=np.linspace(X[:, 0].min(), X[:, 0].max(), 200),
        y=np.linspace(X[:, 1].min(), X[:, 1].max(), 200),
        z=Z_lda,
        colorscale=[[0, 'pink'], [1, 'lightskyblue']],
        opacity=0.9,
        showscale=False,
        name='LDA'
    )

    qda_trace = go.Scatter(
        x=X[y == 0, 0], y=X[y == 0, 1],
        mode='markers', marker=dict(color='deeppink', symbol='x', line=dict(color='black', width=1)),
        name='Class 0'
    )

    qda_trace_class_1 = go.Scatter(
        x=X[y == 1, 0], y=X[y == 1, 1],
        mode='markers', marker=dict(color='royalblue', symbol='diamond', line=dict(color='black', width=1)),
        name='Class 1'
    )

    xx_qda, yy_qda = np.meshgrid(np.linspace(X[:, 0].min(), X[:, 0].max(), 200),
                                np.linspace(X[:, 1].min(), X[:, 1].max(), 200))
    Z_qda = qda.predict(np.c_[xx_qda.ravel(), yy_qda.ravel()])
    Z_qda = np.array(Z_qda).reshape(xx_qda.shape)

    qda_decision_boundary = go.Contour(
        x=np.linspace(X[:, 0].min(), X[:, 0].max(), 200),
        y=np.linspace(X[:, 1].min(), X[:, 1].max(), 200),
        z=Z_qda,
        colorscale=[[0, 'pink'], [1, 'lightskyblue']],
        opacity=0.9,
        showscale=False,
        name='QDA'
    )

    fig.add_trace(lda_trace, row=1, col=1)
    fig.add_trace(lda_trace_class_1, row=1, col=1)
    fig.add_trace(lda_decision_boundary, row=1, col=1)

    fig.add_trace(qda_trace, row=1, col=2)
    fig.add_trace(qda_trace_class_1, row=1, col=2)
    fig.add_trace(qda_decision_boundary, row=1, col=2)

    fig.update_layout(
        xaxis=dict(title='Feature 1'),
        yaxis=dict(title='Feature 2'),
        xaxis2=dict(title='Feature 1'),
        yaxis2=dict(title='Feature 2'),
        title=f'Classifiaction of observation using LDA and QDA with decision boundaries <br>'
              f'{scheme}',
        showlegend=False,
        height=500, 
        width=1000
    )

    pio.write_image(fig, file=f'BayesianSimulatedData3-{scheme}.pdf', format='pdf')
    fig.show()

In [296]:
X, y = generate_scheme_1(1000, 2)
third_experiment(X, y, 'scheme 1 for generating data')

In [297]:
X, y = generate_scheme_2(1000, 2, 2)
third_experiment(X, y, 'scheme 2 for generating data')


covariance is not symmetric positive-semidefinite.


covariance is not symmetric positive-semidefinite.



In [298]:
merger = PdfWriter()
merger.append("BayesianSimulatedData3-scheme 1 for generating data.pdf") 
merger.append("BayesianSimulatedData3-scheme 2 for generating data.pdf")
merger.write("BayesianSimulatedData3.pdf")
merger.close()

# 3.  Comparison of LDA, QDA and NB methods on real data

## Choose 3 datasets which are available online 
They should be related to binary classification problem. Please only focus on datasets with numerical features.

First dataset

In [None]:
banknote_authentication = fetch_ucirepo(id=267) 
  
X_ds1 = banknote_authentication.data.features 
y_ds1 = banknote_authentication.data.targets 
X_ds1 = X_ds1.values
y_ds1 = np.array(y_ds1).flatten()

Second dataset

In [None]:
monk_s_problems = fetch_ucirepo(id=70) 

X_ds2 = monk_s_problems.data.features 
y_ds2 = monk_s_problems.data.targets 
X_ds2 = X_ds2.values
y_ds2 = np.array(y_ds2).flatten()

Third dataset

In [None]:
statlog_australian_credit_approval = fetch_ucirepo(id=143) 

X_ds3 = statlog_australian_credit_approval.data.features 
y_ds3 = statlog_australian_credit_approval.data.targets 
X_ds3 = X_ds3.values
y_ds3 = np.array(y_ds3).flatten()

## Compare LDA, QDA, and NB on real data
Split data into training set and test set. 
Train the model on the train set and compute accuracy on the test set. 
Repeat the experiment for different train/test splits and generate boxplots showing the values of accuracy for each method. Save the results for three datasets in the file BayesianReal.pdf

In [None]:
def real_experiment(data):
    results = {'classifier': [], 'dataset': [], 'accuracy': []}

    for X, y, name in data:
        for _ in range(20):
            X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)
            
            lda = LDA()
            lda.fit(X_train, y_train)
            results['classifier'].append('LDA')
            results['dataset'].append(name)
            results['accuracy'].append(accuracy_score(y_test, lda.predict(X_test)))
            
            qda = QDA()
            qda.fit(X_train, y_train)
            results['classifier'].append('QDA')
            results['dataset'].append(name)
            results['accuracy'].append(accuracy_score(y_test, qda.predict(X_test)))
            
            nb = NB()
            nb.fit(X_train, y_train)
            results['classifier'].append('NB')
            results['dataset'].append(name)
            results['accuracy'].append(accuracy_score(y_test, nb.predict(X_test)))

    results_df = pd.DataFrame(results)
    fig = px.box(results_df, x='dataset', y='accuracy', color='classifier',
                 title=f'LDA, QDA, and NB accuracy for different datasets',
                 labels={'dataset': 'dataset', 'Accuracy': 'accuracy'},
                 category_orders={'classifier': ['LDA', 'QDA', 'NB']})
    fig.update_layout(legend_title_text='classifier', height=600, width=800)
    pio.write_image(fig, file=f'BayesianReal.pdf', format='pdf')
    fig.show()

In [None]:
data = [[X_ds1, y_ds1, 'banknote_authentication'], 
        [X_ds2, y_ds2, 'monk_s_problems'],
        [X_ds3, y_ds3, 'statlog_australian_credit_approval']]
real_experiment(data)