## 라이브러리

In [56]:
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split

## 모델 생성

In [19]:
def model_select(X, y, m):
    x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2)   
    if m == 'lda':
        lda = LinearDiscriminantAnalysis(solver="svd", store_covariance=True)
        y_pred = lda.fit(x_train, y_train).predict(x_test)
        result = cross_val_score(lda, x_train, y_train, cv=5)
        return lda, result.mean()
    elif m == 'qda':    
        qda = QuadraticDiscriminantAnalysis(store_covariances=True)
        y_pred = qda.fit(x_train, y_train).predict(x_test)
        result = cross_val_score(qda, x_train, y_train, cv=5)
        return qda, result.mean()
    elif m == 'lr':
        logit = LogisticRegression()
        y_pred = logit.fit(x_train, y_train).predict(x_test)
        result = cross_val_score(logit, x_train, y_train, cv=5)
        return logit, result.mean()
    elif m == 'nb':
        nb = GaussianNB()
        y_pred = nb.fit(x_train, y_train).predict(x_test)
        result = cross_val_score(nb, x_train, y_train, cv=5)
        return nb, result.mean()

## 2 차원 같은 공분산

In [48]:
'''Generate 2 Gaussians samples with the same covariance matrix'''
n, dim = 300, 2
np.random.seed(0)
C = np.array([np.random.random(size=dim), np.random.random(size=dim)])
X = np.r_[np.dot(np.random.randn(n, dim), C),
          np.dot(np.random.randn(n, dim), C) + np.array([1, 1])]
y = np.hstack((np.zeros(n), np.ones(n)))

lda, res = model_select(X, y, 'lda')
print("model : ",lda)
print("coefficient : ",lda.coef_)
print("covariance : ", lda.covariance_)
print("result_mean : ", res)
print("--------------------------------------------------------------------------------------------------------------")
qda, res = model_select(X, y, 'qda')
print("model : ",qda)
print("covariance : ", qda.covariance_)
print("result_mean : ", res)
print("--------------------------------------------------------------------------------------------------------------")
lr, res = model_select(X, y, 'lr')
print("model : ",lr)
print("coefficient : ",lr.coef_)
print("result_mean : ", res)
print("--------------------------------------------------------------------------------------------------------------")
nb, res = model_select(X, y, 'nb')
print("model : ",nb)
print("theta : ",nb.theta_)
print("sigma_ : ", nb.sigma_)
print("result_mean : ", res)

model :  LinearDiscriminantAnalysis(n_components=None, priors=None, shrinkage=None,
              solver='svd', store_covariance=True, tol=0.0001)
coefficient :  [[ 4.00451834 -2.23818095]]
covariance :  [[ 0.66155654  0.71384413]
 [ 0.71384413  0.79551254]]
result_mean :  0.731217218303
--------------------------------------------------------------------------------------------------------------
model :  QuadraticDiscriminantAnalysis(priors=None, reg_param=0.0,
               store_covariance=False, store_covariances=True, tol=0.0001)
covariance :  [array([[ 0.66625894,  0.72621449],
       [ 0.72621449,  0.8200371 ]]), array([[ 0.61346938,  0.67125217],
       [ 0.67125217,  0.75905788]])]
result_mean :  0.756370274914
--------------------------------------------------------------------------------------------------------------
model :  LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1



## 2차원 다른 공분산

In [58]:
'''Generate 2 Gaussians samples with different covariance matrices'''
n, dim = 300, 2
np.random.seed(0)
C = np.array([np.random.random(size=dim), np.random.random(size=dim)]) * 2.
X = np.r_[np.dot(np.random.randn(n, dim), C),
              np.dot(np.random.randn(n, dim), C.T) + np.array([1, 4])]
y = np.hstack((np.zeros(n), np.ones(n)))

lda, res = model_select(X, y, 'lda')
print("model : ",lda)
print("coefficient : ",lda.coef_)
print("covariance : ", lda.covariance_)
print("result_mean : ", res)
print("--------------------------------------------------------------------------------------------------------------")
qda, res = model_select(X, y, 'qda')
print("model : ",qda)
print("covariance : ", qda.covariance_)
print("result_mean : ", res)
print("--------------------------------------------------------------------------------------------------------------")
lr, res = model_select(X, y, 'lr')
print("model : ",lr)
print("coefficient : ",lr.coef_)
print("result_mean : ", res)
print("--------------------------------------------------------------------------------------------------------------")
nb, res = model_select(X, y, 'nb')
print("model : ",nb)
print("theta : ",nb.theta_)
print("sigma_ : ", nb.sigma_)
print("result_mean : ", res)

model :  LinearDiscriminantAnalysis(n_components=None, priors=None, shrinkage=None,
              solver='svd', store_covariance=True, tol=0.0001)
coefficient :  [[-24.06099276  25.12175571]]
covariance :  [[ 2.94757662  2.86659396]
 [ 2.86659396  2.90979606]]
result_mean :  1.0
--------------------------------------------------------------------------------------------------------------
model :  QuadraticDiscriminantAnalysis(priors=None, reg_param=0.0,
               store_covariance=False, store_covariances=True, tol=0.0001)
covariance :  [array([[ 2.66503578,  2.90485796],
       [ 2.90485796,  3.28014841]]), array([[ 2.97792761,  2.66188272],
       [ 2.66188272,  2.46039838]])]
result_mean :  1.0
--------------------------------------------------------------------------------------------------------------
model :  LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty=



## 3차원 같은 공분산

In [68]:
'''Generate 3 Gaussians samples with the same covariance matrix'''
n, dim = 300, 3
np.random.seed(0)
C = np.array([np.random.random(size=dim), np.random.random(size=dim), np.random.random(size=dim)])
X = np.r_[np.dot(np.random.randn(n, dim), C),
          np.dot(np.random.randn(n, dim), C) + np.array([1, 1, 1])]
y = np.hstack((np.zeros(n), np.ones(n)))

lda, res = model_select(X, y, 'lda')
print("model : ",lda)
print("coefficient : ",lda.coef_)
print("covariance : ", lda.covariance_)
print("result_mean : ", res)
print("--------------------------------------------------------------------------------------------------------------")
qda, res = model_select(X, y, 'qda')
print("model : ",qda)
print("covariance : ", qda.covariance_)
print("result_mean : ", res)
print("--------------------------------------------------------------------------------------------------------------")
lr, res = model_select(X, y, 'lr')
print("model : ",lr)
print("coefficient : ",lr.coef_)
print("result_mean : ", res)
print("--------------------------------------------------------------------------------------------------------------")
nb, res = model_select(X, y, 'nb')
print("model : ",nb)
print("theta : ",nb.theta_)
print("sigma_ : ", nb.sigma_)
print("result_mean : ", res)

model :  LinearDiscriminantAnalysis(n_components=None, priors=None, shrinkage=None,
              solver='svd', store_covariance=True, tol=0.0001)
coefficient :  [[ 4.50511535  1.26850375 -3.25024911]]
covariance :  [[ 0.76458319  0.98856865  1.08661285]
 [ 0.98856865  1.45657792  1.5501226 ]
 [ 1.08661285  1.5501226   1.71148739]]
result_mean :  0.810538298065
--------------------------------------------------------------------------------------------------------------
model :  QuadraticDiscriminantAnalysis(priors=None, reg_param=0.0,
               store_covariance=False, store_covariances=True, tol=0.0001)
covariance :  [array([[ 0.77140998,  0.97550575,  1.07971675],
       [ 0.97550575,  1.40429755,  1.50401239],
       [ 1.07971675,  1.50401239,  1.66962701]]), array([[ 0.75706577,  0.99122098,  1.07128245],
       [ 0.99122098,  1.46761803,  1.53675133],
       [ 1.07128245,  1.53675133,  1.67791872]])]
result_mean :  0.798082157714
----------------------------------------------



## 3차원 다른 공분산

In [66]:
'''Generate 3 Gaussians samples with the different covariance matrix'''
n, dim = 300, 3
np.random.seed(0)
C = np.array([np.random.random(size=dim), np.random.random(size=dim), np.random.random(size=dim)]) * 2
X = np.r_[np.dot(np.random.randn(n, dim), C),
          np.dot(np.random.randn(n, dim), C) + np.array([1, 1, 4])]
y = np.hstack((np.zeros(n), np.ones(n)))

lda, res = model_select(X, y, 'lda')
print("model : ",lda)
print("coefficient : ",lda.coef_)
print("covariance : ", lda.covariance_)
print("result_mean : ", res)
print("--------------------------------------------------------------------------------------------------------------")
qda, res = model_select(X, y, 'qda')
print("model : ",qda)
print("covariance : ", qda.covariance_)
print("result_mean : ", res)
print("--------------------------------------------------------------------------------------------------------------")
lr, res = model_select(X, y, 'lr')
print("model : ",lr)
print("coefficient : ",lr.coef_)
print("result_mean : ", res)
print("--------------------------------------------------------------------------------------------------------------")
nb, res = model_select(X, y, 'nb')
print("model : ",nb)
print("theta : ",nb.theta_)
print("sigma_ : ", nb.sigma_)
print("result_mean : ", res)

model :  LinearDiscriminantAnalysis(n_components=None, priors=None, shrinkage=None,
              solver='svd', store_covariance=True, tol=0.0001)
coefficient :  [[ -4.56119507 -12.10026033  14.52567108]]
covariance :  [[ 3.05833277  3.9542746   4.34645139]
 [ 3.9542746   5.82631169  6.20049041]
 [ 4.34645139  6.20049041  6.84594957]]
result_mean :  1.0
--------------------------------------------------------------------------------------------------------------
model :  QuadraticDiscriminantAnalysis(priors=None, reg_param=0.0,
               store_covariance=False, store_covariances=True, tol=0.0001)
covariance :  [array([[ 3.08563992,  3.90202302,  4.31886698],
       [ 3.90202302,  5.61719018,  6.01604955],
       [ 4.31886698,  6.01604955,  6.67850803]]), array([[ 3.02826308,  3.96488394,  4.28512981],
       [ 3.96488394,  5.8704721 ,  6.14700533],
       [ 4.28512981,  6.14700533,  6.71167489]])]
result_mean :  1.0
-----------------------------------------------------------------



## 4차원 같은 공분산

In [67]:
'''Generate 4 Gaussians samples with the same covariance matrix'''
n, dim = 300, 4
np.random.seed(0)
C = np.array([np.random.random(size=dim), np.random.random(size=dim), np.random.random(size=dim), np.random.random(size=dim)])
X = np.r_[np.dot(np.random.randn(n, dim), C),
          np.dot(np.random.randn(n, dim), C) + np.array([1, 1, 1, 1])]
y = np.hstack((np.zeros(n), np.ones(n)))

lda, res = model_select(X, y, 'lda')
print("model : ",lda)
print("coefficient : ",lda.coef_)
print("covariance : ", lda.covariance_)
print("result_mean : ", res)
print("--------------------------------------------------------------------------------------------------------------")
qda, res = model_select(X, y, 'qda')
print("model : ",qda)
print("covariance : ", qda.covariance_)
print("result_mean : ", res)
print("--------------------------------------------------------------------------------------------------------------")
lr, res = model_select(X, y, 'lr')
print("model : ",lr)
print("coefficient : ",lr.coef_)
print("result_mean : ", res)
print("--------------------------------------------------------------------------------------------------------------")
nb, res = model_select(X, y, 'nb')
print("model : ",nb)
print("theta : ",nb.theta_)
print("sigma_ : ", nb.sigma_)
print("result_mean : ", res)

model :  LinearDiscriminantAnalysis(n_components=None, priors=None, shrinkage=None,
              solver='svd', store_covariance=True, tol=0.0001)
coefficient :  [[-1.39573954  0.81865571  2.37261865 -0.71410103]]
covariance :  [[ 1.57075255  1.38188629  1.20033453  1.10358367]
 [ 1.38188629  1.74118491  0.96652166  1.1381091 ]
 [ 1.20033453  0.96652166  1.07797143  1.02172245]
 [ 1.10358367  1.1381091   1.02172245  1.25580487]]
result_mean :  0.666747377464
--------------------------------------------------------------------------------------------------------------
model :  QuadraticDiscriminantAnalysis(priors=None, reg_param=0.0,
               store_covariance=False, store_covariances=True, tol=0.0001)
covariance :  [array([[ 1.44122415,  1.29934896,  1.05210008,  0.91980151],
       [ 1.29934896,  1.73636212,  0.86281534,  1.02265742],
       [ 1.05210008,  0.86281534,  0.9395569 ,  0.87156575],
       [ 0.91980151,  1.02265742,  0.87156575,  1.11981683]]), array([[ 1.65577532,  1



## 4차원 다른 공분산

In [62]:
'''Generate 4 Gaussians samples with the different covariance matrix'''
n, dim = 300, 4
np.random.seed(0)
C = np.array([np.random.random(size=dim), np.random.random(size=dim), np.random.random(size=dim), np.random.random(size=dim)]) *2
X = np.r_[np.dot(np.random.randn(n, dim), C),
          np.dot(np.random.randn(n, dim), C) + np.array([1, 1, 1, 4])]
y = np.hstack((np.zeros(n), np.ones(n)))

lda, res = model_select(X, y, 'lda')
print("model : ",lda)
print("coefficient : ",lda.coef_)
print("covariance : ", lda.covariance_)
print("result_mean : ", res)
print("--------------------------------------------------------------------------------------------------------------")
qda, res = model_select(X, y, 'qda')
print("model : ",qda)
print("covariance : ", qda.covariance_)
print("result_mean : ", res)
print("--------------------------------------------------------------------------------------------------------------")
lr, res = model_select(X, y, 'lr')
print("model : ",lr)
print("coefficient : ",lr.coef_)
print("result_mean : ", res)
print("--------------------------------------------------------------------------------------------------------------")
nb, res = model_select(X, y, 'nb')
print("model : ",nb)
print("theta : ",nb.theta_)
print("sigma_ : ", nb.sigma_)
print("result_mean : ", res)

model :  LinearDiscriminantAnalysis(n_components=None, priors=None, shrinkage=None,
              solver='svd', store_covariance=True, tol=0.0001)
coefficient :  [[ 4.69208926 -2.97893276 -7.51618061  5.45387256]]
covariance :  [[ 6.28301019  5.52754514  4.80133811  4.41433468]
 [ 5.52754514  6.96473963  3.86608662  4.5524364 ]
 [ 4.80133811  3.86608662  4.31188573  4.08688979]
 [ 4.41433468  4.5524364   4.08688979  5.02321946]]
result_mean :  0.983310047025
--------------------------------------------------------------------------------------------------------------
model :  QuadraticDiscriminantAnalysis(priors=None, reg_param=0.0,
               store_covariance=False, store_covariances=True, tol=0.0001)
covariance :  [array([[ 5.76489661,  5.19739585,  4.20840031,  3.67920606],
       [ 5.19739585,  6.94544848,  3.45126137,  4.09062966],
       [ 4.20840031,  3.45126137,  3.75822761,  3.486263  ],
       [ 3.67920606,  4.09062966,  3.486263  ,  4.47926734]]), array([[ 6.62310126,  5



## 2차원 같은 공분산, 비슷한 속성

In [71]:
'''Generate 2 Gaussians samples with the same covariance matrix, similar attribute'''
n, dim = 300, 2
np.random.seed(0)
a = np.random.random(size=dim)
b = np.random.randn(n, dim)

C = np.array([a, a])
X = np.r_[np.dot(b, C),
          np.dot(b, C) + np.array([1, 1])]
y = np.hstack((np.zeros(n), np.ones(n)))

lda, res = model_select(X, y, 'lda')
print("model : ",lda)
print("coefficient : ",lda.coef_)
print("covariance : ", lda.covariance_)
print("result_mean : ", res)
print("--------------------------------------------------------------------------------------------------------------")
qda, res = model_select(X, y, 'qda')
print("model : ",qda)
print("covariance : ", qda.covariance_)
print("result_mean : ", res)
print("--------------------------------------------------------------------------------------------------------------")
lr, res = model_select(X, y, 'lr')
print("model : ",lr)
print("coefficient : ",lr.coef_)
print("result_mean : ", res)
print("--------------------------------------------------------------------------------------------------------------")
nb, res = model_select(X, y, 'nb')
print("model : ",nb)
print("theta : ",nb.theta_)
print("sigma_ : ", nb.sigma_)
print("result_mean : ", res)

model :  LinearDiscriminantAnalysis(n_components=None, priors=None, shrinkage=None,
              solver='svd', store_covariance=True, tol=0.0001)
coefficient :  [[ 0.73559588  0.56447281]]
covariance :  [[ 0.61843835  0.80592137]
 [ 0.80592137  1.05024092]]
result_mean :  0.691595225176
--------------------------------------------------------------------------------------------------------------
model :  QuadraticDiscriminantAnalysis(priors=None, reg_param=0.0,
               store_covariance=False, store_covariances=True, tol=0.0001)
covariance :  [array([[ 0.5814727 ,  0.75774938],
       [ 0.75774938,  0.98746531]]), array([[ 0.60653609,  0.79041087],
       [ 0.79041087,  1.03002832]])]
result_mean :  1.0
--------------------------------------------------------------------------------------------------------------
model :  LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
         



## 결론

* 차원이 높아질 수록 정확도가 떨어짐
* 다른 공분산으로 했을 때 정확도 거의 100%
* 비슷한 속성으로 했을 경우 NB와 LDA가 정확도가 떨어짐