In [None]:
import numpy as np
import pandas as pd
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB,  MultinomialNB

In [None]:
# Load the UCI Wine data. The first column (the class label) is already an
# integer class code, so no label encoding is needed before modelling.
# NOTE(review): 'alchohol' and 'OD208' look like typos of 'alcohol' and the
# dataset's 'OD280/OD315 of diluted wines' attribute. They are kept unchanged
# so that any code selecting columns by these names keeps working.
wine_columns = ['class label', 'alchohol', 'malic acid', 'ash',
                'alcalinity of ash', 'magnesium', 'total phenols',
                'flavanoids', 'nonflavanoid phenols',
                'proanthocyanins', 'color intensity', 'hue',
                'OD208', 'proline']
# The file has no header row, so the column names are supplied at read time
# instead of being patched onto the frame afterwards.
dat_wine = pd.read_csv('http://archive.ics.uci.edu/ml/machine-learning-databases/'
                       'wine/wine.data', header=None, names=wine_columns)
print('class label:', np.unique(dat_wine['class label']))  # distinct classes

# Split the full data set into training (70%) and test (30%) sets.
# stratify=y keeps the class proportions equal in both splits and
# random_state=1 makes the split reproducible.
X, y = dat_wine.iloc[:, 1:].values, dat_wine.iloc[:, 0].values
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=1, stratify=y)

class label: [1 2 3]


In [None]:
# Sanity-check the dimensions of the full feature matrix, the label vector
# and the held-out test features.
tuple(arr.shape for arr in (X, y, X_test))

((178, 13), (178,), (54, 13))

In [None]:

# Fit LDA on the training split. store_covariance=True makes the fitted
# covariance matrix available afterwards as cld.covariance_.
cld = LinearDiscriminantAnalysis(store_covariance=True).fit(X_train, y_train)
y_train_pred_lda, y_test_pred_lda = cld.predict(X_train), cld.predict(X_test)

# Report LDA accuracy on each split, rounded to two decimals.
lda_train_acc = accuracy_score(y_train, y_train_pred_lda)
lda_test_acc = accuracy_score(y_test, y_test_pred_lda)
print('LDA train accuracy:', np.around(lda_train_acc, 2))
print('LDA test accuracy:', np.around(lda_test_acc, 2))

LDA train accuracy: 1.0
LDA test accuracy: 0.98


In [None]:
# Fit QDA on the training split. store_covariance=True keeps the per-class
# covariance matrices available afterwards as cqd.covariance_.
cqd = QuadraticDiscriminantAnalysis(store_covariance=True).fit(X_train, y_train)
y_train_pred_qda, y_test_pred_qda = cqd.predict(X_train), cqd.predict(X_test)

# Report QDA accuracy on each split, rounded to two decimals.
qda_train_acc = accuracy_score(y_train, y_train_pred_qda)
qda_test_acc = accuracy_score(y_test, y_test_pred_qda)
print('QDA train accuracy:', np.around(qda_train_acc, 2))
print('QDA test accuracy:', np.around(qda_test_acc, 2))

QDA train accuracy: 1.0
QDA test accuracy: 0.96


In [None]:

# Fit Gaussian naive Bayes on the training split and predict both splits.
gnb = GaussianNB().fit(X_train, y_train)
y_train_pred_gnb, y_test_pred_gnb = gnb.predict(X_train), gnb.predict(X_test)

# Report Gaussian NB accuracy on each split, rounded to two decimals.
gnb_train_acc = accuracy_score(y_train, y_train_pred_gnb)
gnb_test_acc = accuracy_score(y_test, y_test_pred_gnb)
print('Gaussian NB train accuracy:', np.around(gnb_train_acc, 2))
print('Gaussian NB test accuracy:', np.around(gnb_test_acc, 2))

Gaussian NB train accuracy: 0.99
Gaussian NB test accuracy: 0.98


In [None]:

# Fit multinomial naive Bayes on the training split and predict both splits.
# NOTE(review): MultinomialNB is documented as suited to discrete count
# features, whereas these inputs are continuous measurements. That may be why
# it scores lower than the Gaussian-based models -- confirm the comparison is
# intentional.
mnb = MultinomialNB().fit(X_train, y_train)
y_train_pred_mnb, y_test_pred_mnb = mnb.predict(X_train), mnb.predict(X_test)

# Report multinomial NB accuracy on each split, rounded to two decimals.
mnb_train_acc = accuracy_score(y_train, y_train_pred_mnb)
mnb_test_acc = accuracy_score(y_test, y_test_pred_mnb)
print('Multi NB train accuracy:', np.around(mnb_train_acc, 2))
print('Multi NB test accuracy:', np.around(mnb_test_acc, 2))

Multi NB train accuracy: 0.85
Multi NB test accuracy: 0.74


In [None]:
# Compare the first few LDA test predictions with the class-membership
# probabilities they were derived from (one column per class).
preview_rows = slice(None, 5)
print(y_test_pred_lda[preview_rows])
cld.predict_proba(X_test)[preview_rows]

[3 3 3 2 2]


array([[4.82027630e-11, 9.53699966e-06, 9.99990463e-01],
       [4.16120014e-19, 1.20023483e-09, 9.99999999e-01],
       [4.13851085e-15, 2.75591141e-12, 1.00000000e+00],
       [7.51250315e-05, 9.99924835e-01, 3.97700359e-08],
       [5.24688464e-07, 9.99999475e-01, 9.27968095e-16]])

In [None]:
# Show the fitted LDA covariance matrix followed by the per-class
# coefficient vectors, one array per printed block.
print(cld.covariance_, cld.coef_, sep='\n')

[[ 2.41615068e-01  3.84592439e-02 -5.24867188e-03 -2.30283993e-03
  -7.86881482e-02  1.11713745e-02  1.33361710e-02 -4.67113583e-04
   1.07053192e-02  1.77454884e-01  4.91173257e-04 -1.62968428e-02
   8.54595097e+00]
 [ 3.84592439e-02  8.32381396e-01  2.83227199e-02  4.00338741e-01
  -5.87202072e-01 -9.42715080e-03 -9.52135930e-03  7.35759507e-03
   4.69856098e-02 -1.45330075e-01 -4.62261537e-02  5.60318655e-02
  -2.84926307e+01]
 [-5.24867188e-03  2.83227199e-02  6.80395376e-02  4.91619869e-01
   5.99821043e-01  2.86110643e-02  5.76375339e-02  7.22680996e-03
   4.46800139e-03  5.20767053e-02  3.63662334e-03  2.06618862e-02
  -5.77687326e-02]
 [-2.30283993e-03  4.00338741e-01  4.91619869e-01  8.43814527e+00
   4.38952042e+00  1.97406921e-01  4.41501428e-01  4.11469745e-02
   1.66482453e-01  5.67441496e-02  9.41799084e-04  3.51744582e-01
   4.43128671e+00]
 [-7.86881482e-02 -5.87202072e-01  5.99821043e-01  4.38952042e+00
   1.48134957e+02  8.43279534e-01  1.38061621e+00 -3.28155413e-01


In [None]:
# LDA coefficients rounded to three decimals, shown next to the median of
# each feature (position 0 is the class label, so it is skipped) to give a
# sense of each feature's scale.
print(cld.coef_.round(3))
dat_wine.median().iloc[1:]

[[ 3.314e+00 -1.500e-01  3.526e+00 -8.890e-01  1.500e-02 -2.173e+00
   5.032e+00  3.121e+00 -6.930e-01 -8.990e-01  1.133e+00  4.182e+00
   1.500e-02]
 [-2.461e+00 -5.430e-01 -5.843e+00  3.740e-01 -8.000e-03 -6.650e-01
   2.138e+00  5.347e+00  4.540e-01 -5.820e-01  3.805e+00 -3.070e-01
  -8.000e-03]
 [-3.890e-01  1.009e+00  4.472e+00  5.380e-01 -7.000e-03  3.708e+00
  -9.492e+00 -1.198e+01  1.730e-01  1.998e+00 -7.172e+00 -4.731e+00
  -6.000e-03]]


alchohol                 13.050
malic acid                1.865
ash                       2.360
alcalinity of ash        19.500
magnesium                98.000
total phenols             2.355
flavanoids                2.135
nonflavanoid phenols      0.340
proanthocyanins           1.555
color intensity           4.690
hue                       0.965
OD208                     2.780
proline                 673.500
dtype: float64

In [None]:
# Print the covariance matrix stored on the fitted LDA model
# (available because the model was created with store_covariance=True).
print(cld.covariance_)

[[ 2.41615068e-01  3.84592439e-02 -5.24867188e-03 -2.30283993e-03
  -7.86881482e-02  1.11713745e-02  1.33361710e-02 -4.67113583e-04
   1.07053192e-02  1.77454884e-01  4.91173257e-04 -1.62968428e-02
   8.54595097e+00]
 [ 3.84592439e-02  8.32381396e-01  2.83227199e-02  4.00338741e-01
  -5.87202072e-01 -9.42715080e-03 -9.52135930e-03  7.35759507e-03
   4.69856098e-02 -1.45330075e-01 -4.62261537e-02  5.60318655e-02
  -2.84926307e+01]
 [-5.24867188e-03  2.83227199e-02  6.80395376e-02  4.91619869e-01
   5.99821043e-01  2.86110643e-02  5.76375339e-02  7.22680996e-03
   4.46800139e-03  5.20767053e-02  3.63662334e-03  2.06618862e-02
  -5.77687326e-02]
 [-2.30283993e-03  4.00338741e-01  4.91619869e-01  8.43814527e+00
   4.38952042e+00  1.97406921e-01  4.41501428e-01  4.11469745e-02
   1.66482453e-01  5.67441496e-02  9.41799084e-04  3.51744582e-01
   4.43128671e+00]
 [-7.86881482e-02 -5.87202072e-01  5.99821043e-01  4.38952042e+00
   1.48134957e+02  8.43279534e-01  1.38061621e+00 -3.28155413e-01


In [None]:
# LDA stores a single covariance matrix, while QDA stores a sequence of
# them; print the LDA matrix shape, the number of QDA matrices and the
# shape of the first QDA matrix.
print(cld.covariance_.shape,
      len(cqd.covariance_),
      cqd.covariance_[0].shape,
      sep='\n')

(13, 13)
3
(13, 13)


In [None]:
# Element-wise difference between the first and third QDA covariance
# matrices, showing how far the stored matrices differ from one another.
np.subtract(cqd.covariance_[0], cqd.covariance_[2])

array([[-5.93640133e-02, -1.67088994e-01, -2.50573919e-02,
        -5.22759895e-01,  1.40094401e+00,  7.43178950e-02,
         1.04977907e-01, -7.14904379e-04,  2.93373531e-02,
         5.30392614e-02,  1.00917503e-02,  1.42653271e-03,
         5.14974200e+01],
       [-1.67088994e-01, -7.89484472e-01, -5.95489329e-03,
        -7.33706402e-01,  3.19822358e+00,  3.95807825e-02,
         4.71497612e-02, -3.09847053e-02,  6.55050152e-02,
        -5.71270833e-02, -2.27635366e-02,  4.55326829e-02,
        -3.43030701e+01],
       [-2.50573919e-02, -5.95489329e-03,  1.46049030e-02,
        -1.47130543e-01,  2.68673642e-01, -2.57607871e-02,
        -2.16981541e-02,  7.34601996e-03, -3.36724002e-02,
        -1.50652670e-01,  3.57866962e-03, -8.31885948e-03,
        -1.84337375e+00],
       [-5.22759895e-01, -7.33706402e-01, -1.47130543e-01,
         3.90843958e-01, -3.30702605e-01, -4.76028908e-01,
        -4.86320413e-01,  2.06234618e-02, -4.27495191e-01,
        -1.52334545e+00,  2.26212999e