In [9]:
import numpy as np
from scipy import linalg as la
from sklearn import datasets
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis as QDA
from sklearn import model_selection as ms
import time

# Problem 1
Code up the Gaussian Discriminant Analysis algorithm.  Your code should have a `.fit` method that accepts a dataset $X,y$ where $y$ only takes on a finite number of values (classes), the `.fit` method should train the model (learn the parameters $π_c$, $μ_c$, and $Σ_c$ for each class $c$, using the standard Gaussian MLE for each $μ_c$, and $Σ_c$ and using the estimate $π_c$ = $\frac{\#y=c}{N}$.  Your code should also have a `.predict_proba method` that accepts a data set $X'$ and returns $p(y=c|x)$ for each $x$ in $X'$, and it should have a `.predict` method that accepts data $X'$ and returns the class prediction $\hat{y}$ for each $x$ in $X'$

In [38]:
class GaussianDiscriminantAnalysis:
    '''Do Gaussian Discriminant Analysis'''
    
    def fit(self, X, y):
        '''Train the model.'''
        # Train the dataset.
        self.classes = np.array(list(set(y)))
        self.mu = np.zeros((len(self.classes), X.shape[1]))
        self.sigma = np.zeros((len(self.classes), X.shape[1], X.shape[1]))
        self.sigma_inv = np.zeros((len(self.classes), X.shape[1], X.shape[1]))
        self.pi = np.zeros((len(self.classes)))
        for i in range(len(self.classes)):
            class_data = X[y==self.classes[i]]
            self.mu[i] = class_data.mean(axis=0)
            self.sigma[i] = np.cov(class_data, rowvar = False)
            self.sigma_inv[i] = la.inv(self.sigma[i])
            self.pi[i] = (y==self.classes[i]).mean()
        

    def predict_proba(self, Xprime):
        '''Predict probabilities for each class'''
        # Return P(y=c|x).
        probabilities = np.zeros((len(self.classes), Xprime.shape[0]))
        for i in range(len(self.classes)):
            det = la.det(.5*np.pi*self.sigma[i])**-.5
            centered = Xprime - self.mu[i]
            exponent = self.sigma_inv[i].dot(centered.T)
            exponent = centered.dot(exponent)
            exponent = -.5*exponent
            prob = det * np.exp(exponent) * self.pi[i]
            #print(np.diag(prob).shape)
            #print(probabilities[i].shape)
            probabilities[i] = np.diag(prob)
            '''
            probabilities[i] = la.det(.5*np.pi*self.sigma[i])**-.5 \
                    * np.exp(-.5*(Xprime - self.mu[i]).dot(self.sigma_inv[i].dot((Xprime - self.mu[i]).T))) \
                    * self.pi[i]'''
        return (probabilities / probabilities.sum(axis=0)).T

    def predict(self, Xprime):
        # Return y_hat for each x in X'. 
        return self.classes[np.argmax(self.predict_proba(Xprime), axis=1)]

# Problem 2
Apply your GDA code to the cancer dataset with an appropriate train-test split and compare the results (train and test speed and test accuracy) to logistic regression and Naive Bayes.  Is one of these much better than the others?  Explain.

In [42]:
models = ['Gaussian Discriminant', 'Logistic Regression', 'Naive Bayes\t']
train_times = []
test_times = []
accuracy = []

# Load data set.
cancer = datasets.load_breast_cancer()
X,y = cancer.data, cancer.target

# Train test split.
xtrain, xtest, ytrain, ytest = ms.train_test_split(X, y, test_size = .3)

# Run GDA.
gda = GaussianDiscriminantAnalysis()
start = time.clock()
gda.fit(xtrain, ytrain)
train_times.append(time.clock() - start)
start = time.clock()
gda_predict = gda.predict(xtest)
test_times.append(time.clock() - start)
accuracy.append((gda_predict == ytest).mean())

# Run Logistic Regression.
lr = LogisticRegression()
start = time.clock()
lr.fit(xtrain, ytrain)
train_times.append(time.clock() - start)
start = time.clock()
lr_predict = lr.predict(xtest)
test_times.append(time.clock() - start)
accuracy.append((lr_predict == ytest).mean())

# Run Naive Bayes.
gnb = GaussianDiscriminantAnalysis()
start = time.clock()
gnb.fit(xtrain, ytrain)
train_times.append(time.clock() - start)
start = time.clock()
gnb_predict = gnb.predict(xtest)
test_times.append(time.clock() - start)
accuracy.append((gnb_predict == ytest).mean())

In [43]:
# Compare.
print('Model\t\t\tTrain Time\tTest Time\tAccuracy')
for i in range(3):
    print('{}\t{}\t{}\t{}'.format(models[i], round(train_times[i], 7), 
                                  round(test_times[i], 7), round(accuracy[i], 7)))

Model			Train Time	Test Time	Accuracy
Gaussian Discriminant	0.0012476	0.0015918	0.9590643
Logistic Regression	0.0110732	0.0004135	0.9649123
Naive Bayes		0.0014441	0.0015126	0.9590643


# Problem 3
Compare your train and test speed and your test accuracy to the `discriminant_analysis.QuadraticDiscriminantAnalysis` method in scikit learn.

In [47]:
# Run sklearn's GDA
qda = QDA()
start = time.clock()
qda.fit(xtrain, ytrain)
train_times.append(time.clock() - start)
start = time.clock()
qda_predict = qda.predict(xtest)
test_times.append(time.clock() - start)
accuracy.append((qda_predict == ytest).mean())

# Compare test accuracy.
print('Model\t\t\tTrain Time\tTest Time\tAccuracy')
print('{}\t{}\t{}\t{}'.format(models[0], round(train_times[0], 7), 
                              round(test_times[0], 7), round(accuracy[0], 7)))
print('{}\t{}\t{}\t{}'.format(models[-1], round(train_times[-1], 7), 
                              round(test_times[-1], 7), round(accuracy[-1], 7)))

Model			Train Time	Test Time	Accuracy
Gaussian Discriminant	0.0012476	0.0015918	0.9590643
Naive Bayes		0.0022265	0.0005136	0.9590643
