In [1]:
from pandas import read_csv
import matplotlib.pyplot as plt #General Plotting
import numpy as np
from scipy.stats import multivariate_normal
from sklearn.metrics import confusion_matrix


In [28]:
# fixed acidity;"volatile acidity";"citric acid";"residual sugar";"chlorides";"free sulfur dioxide";"total sulfur dioxide";"density";"pH";"sulphates";"alcohol";"quality"
#
# Gaussian MAP classifier for wine quality: fit one Gaussian per quality score
# and assign every sample to the class with the largest prior * likelihood.
# The classifier is evaluated on the same samples it was fitted on, so the
# error printed at the end is a training-set estimate.
winedata = read_csv('winequality-white.csv', sep = ';')
print(winedata.shape) # (4898, 12)

# Columns 0-10 are the features, column 11 ("quality") is the class label.
X = winedata.iloc[:, 0:11].to_numpy()
labels = winedata.iloc[:, 11].to_numpy()
valid_classes = np.array([3, 4, 5, 6, 7, 8, 9])  # No labelled samples for 0, 1, 2, 10!
N, n_features = X.shape
print(N)

# One boolean row mask per class, computed once and reused for every statistic.
class_masks = [labels == c for c in valid_classes]

N_cl = np.array([np.count_nonzero(mask) for mask in class_masks])
print("Num Class Labels: ", N_cl)

# Class priors are the empirical class frequencies.
priors = N_cl / N
print("Priors: ", priors)
print(priors.shape)

# Sample-based estimate of each class mean vector.
mu_hat = np.array([X[mask].mean(axis=0) for mask in class_masks])
print(mu_hat.shape)

# Regularization
# `reg` (mean eigenvalue of the pooled covariance) is computed for reference
# only; this cell regularizes with the fixed strength below. The covariance is
# symmetric, so eigvalsh is used: it returns real eigenvalues by construction.
evals = np.linalg.eigvalsh(np.cov(X.T))
reg = np.mean(evals)

# Adding a multiple of the identity keeps every class covariance invertible,
# including classes with fewer samples than features (the rarest class has
# only 5 samples in the recorded run).
reg_strength = 0.1
Sigma_hat = np.array([
    np.cov(X[mask].T) + reg_strength * np.identity(n_features)
    for mask in class_masks
])
print(Sigma_hat.shape)

# Row c holds p(x | class c) for every sample.
class_cond_likelihoods = np.array([
    multivariate_normal.pdf(X, mean=mu_hat[c], cov=Sigma_hat[c])
    for c in range(len(valid_classes))
])

# Multiplying by diag(priors) scales row c by P(class c). The result is the
# unnormalized posterior; the evidence p(x) is the same for every class, so it
# does not change the argmax.
class_priors = np.diag(priors)
print(class_priors)
class_posteriors = class_priors.dot(class_cond_likelihoods)

# Map the winning row index back to its quality score.
decisions = valid_classes[np.argmax(class_posteriors, axis = 0)]
print(decisions.shape)
print(valid_classes.shape)

# Decisions are passed first on purpose: rows index the decided class and
# columns the true label. `labels=` pins the row/column order to valid_classes.
conf_matrix = confusion_matrix(decisions, labels, labels=valid_classes)
print(conf_matrix)

errors = np.count_nonzero(decisions != labels)
print('Errors', errors, "Est P(error)", errors/N)

(4898, 12)
4898
Num Class Labels:  [  20  163 1457 2198  880  175    5]
Priors:  [0.0040833  0.03327889 0.29746835 0.44875459 0.17966517 0.03572887
 0.00102082]
(7,)
(7, 11)
(7, 11, 11)
[[0.0040833  0.         0.         0.         0.         0.
  0.        ]
 [0.         0.03327889 0.         0.         0.         0.
  0.        ]
 [0.         0.         0.29746835 0.         0.         0.
  0.        ]
 [0.         0.         0.         0.44875459 0.         0.
  0.        ]
 [0.         0.         0.         0.         0.17966517 0.
  0.        ]
 [0.         0.         0.         0.         0.         0.03572887
  0.        ]
 [0.         0.         0.         0.         0.         0.
  0.00102082]]
(4898,)
(7,)
[[   7    2    5   12    2    4    0]
 [   1    3    7    6    0    0    0]
 [   5  106  929  615  102   16    0]
 [   7   50  496 1339  471   79    2]
 [   0    2   19  226  302   73    3]
 [   0    0    1    0    3    3    0]
 [   0    0    0    0    0    0    0]]
Errors 

In [2]:
# Visualization Dataset
# Sets up a 10x10 figure with 3D axes and loops over the class labels to plot
# per-class feature columns.
# NOTE(review): this cell does not run as written -- its recorded output is a
# SyntaxError, and the loop body below is incomplete.
fig = plt.figure(figsize=(10,10))
ax = plt.axes(projection = "3d")

# NOTE(review): `classes` is not assigned by any cell shown above this one; it
# is only defined in the later class-statistics cell -- confirm execution order.
for l in classes:
    # NOTE(review): the mask compares against the literal 1 rather than the
    # loop variable `l`, so every iteration selects the same rows -- presumably
    # a typo for `labels == l`; verify.
    X_class = X[(labels == 1),:]
    ft0 = X_class[:,0]
    ft1 = X_class ...


    # NOTE(review): these calls sit inside the loop, so they would run once per
    # class. The title mentions PCA, but no PCA step is visible in this cell.
    plt.ylabel("z2")
    plt.legend()
    plt.title("PCA projections to 3D space")
    plt.show()





SyntaxError: invalid syntax (4024822515.py, line 8)

In [None]:
# ...

cd = np.argwh

In [None]:
# fixed acidity;"volatile acidity";"citric acid";"residual sugar";"chlorides";"free sulfur dioxide";"total sulfur dioxide";"density";"pH";"sulphates";"alcohol";"quality"
#
# Gaussian MAP classifier for wine quality with eigenvalue-scaled covariance
# regularization. The classifier is evaluated on the same samples it was fitted
# on, so the error printed at the end is a training-set estimate.
winedata = read_csv('winequality-white.csv', sep = ';')
print(winedata.shape) # (4898, 12)

# Columns 0-10 are the features, column 11 ("quality") is the class label.
X = winedata.iloc[:, 0:11].to_numpy()
labels = winedata.iloc[:, 11].to_numpy()
N, n_features = X.shape

# Find Class Statistics
# `classes` holds the sorted distinct labels present in the data and `nclass`
# their sample counts, so nothing below depends on which scores occur.
classes, nclass = np.unique(labels, return_counts = True)
C = len(classes)
priors = nclass/N
# Boolean masks keep each class slice 2-D, so the means come out as
# (C, n_features) directly and need no reshape.
class_means = np.array([X[labels == c].mean(axis = 0) for c in classes])
class_cov = np.array([np.cov(X[labels == c].T) for c in classes])

# Regularization
# Strength = mean eigenvalue of the pooled covariance. The matrix is symmetric,
# so eigvalsh is used: it returns real eigenvalues by construction.
evals = np.linalg.eigvalsh(np.cov(X.T))
reg = np.mean(evals)
print(reg)

# Minimum Error Classification
mu = class_means
# The identity term broadcasts over the leading class axis of class_cov.
Sigma = class_cov + reg*np.identity(n_features)

# Row c holds p(x | class c) for every sample.
class_cond_likelihoods = np.array([
    multivariate_normal.pdf(X, mean=mu[c], cov=Sigma[c]) for c in range(C)
])
# Multiplying by diag(priors) scales row c by P(class c). The result is the
# unnormalized posterior; the evidence p(x) is the same for every class, so it
# does not change the argmax.
class_priors = np.diag(priors)
class_posteriors = class_priors.dot(class_cond_likelihoods)

# Index into `classes` rather than adding min(classes): this keeps the integer
# dtype of the labels and stays correct if the class values are not contiguous.
decisions = classes[np.argmax(class_posteriors, axis = 0)]

print(decisions.shape)
print(labels.shape)

# Decisions are passed first on purpose: rows index the decided class and
# columns the true label. `labels=` pins the row/column order to `classes`.
conf_matrix = confusion_matrix(decisions, labels, labels=classes)
print(conf_matrix)

errors = np.count_nonzero(decisions != labels)
print('Errors', errors, "Est P(error)", errors/N)
