In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import sklearn as sk
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis as QDA
from sklearn.naive_bayes import GaussianNB as NB
from scipy.stats import norm
import urllib
from random import sample

Wczytanie danych i nazwanie kolumn.

In [2]:
link = "https://archive.ics.uci.edu/ml/machine-learning-databases/wine/wine.data"
f = urllib.request.urlopen(link)
data = pd.read_csv(f, sep=",", header=None)
data.columns = ["class", "Alcohol", "Malic.acid", "Ash", "Alcalinity.of.ash", "Magnesium", "Total.phenols", "Flavanoids", "Nonflavanoid.phenols", "Proanthocyanins", "Color.intensity", "Hue", "OD280OD315.of.diluted.wines", "Proline"]

In [3]:
LDA_discriminant = LDA()
LDA_pred = LDA_discriminant.fit(data[data.columns[1:]], data["class"]).predict(data[data.columns[1:]])
QDA_discriminant = QDA()
QDA_pred = QDA_discriminant.fit(data[data.columns[1:]], data["class"]).predict(data[data.columns[1:]])
NB_discriminant = NB()
NB_pred = NB_discriminant.fit(data[data.columns[1:]], data["class"]).predict(data[data.columns[1:]])

Funkcja zwracająca tablicę z parametrami klasyfikatora na danym zbiorze.

In [4]:
def CM_large(first, pred):
    
    CM = np.matrix(pd.crosstab(first, pred))
    
    ACC = np.sum(np.diag(CM))/np.sum(CM)
    
    TP = CM[1,1]
    TN = CM[0,0]
    
    #suma wartości w poszczególnych wierszach
    sums = CM.sum(axis = 1)
    
    TPR = TP / np.asscalar(sums[1])
    FPR = 1 - TN / np.asscalar(sums[0])
    
    return np.array([round(ACC, 4), TP, TN, round(TPR, 4), round(FPR, 4)])

Wyznaczenie parametrów klasyfikatorów dla pełnego zbioru danych.

In [5]:
resultLDA = CM_large(data["class"], LDA_pred)
resultQDA = CM_large(data["class"], QDA_pred)
resultNB = CM_large(data["class"], NB_pred)
result = pd.DataFrame(np.array([resultLDA, resultQDA, resultNB]), columns = ["ACC", "TP", "TN", "TPR", "FPR"], index = ["LDA", "QDA", "Bayes"])
print(result)

          ACC    TP    TN     TPR     FPR
LDA    1.0000  71.0  59.0  1.0000  0.0000
QDA    0.9944  70.0  59.0  0.9859  0.0000
Bayes  0.9888  70.0  58.0  0.9859  0.0169


Wyznaczenie parametrów klasyfikatorów dla 2 pierwszych składowych obserwacji.

In [6]:
LDA_pred_2 = LDA_discriminant.fit(data[data.columns[1:3]], data["class"]).predict(data[data.columns[1:3]])
QDA_pred_2 = QDA_discriminant.fit(data[data.columns[1:3]], data["class"]).predict(data[data.columns[1:3]])
NB_pred_2 = NB_discriminant.fit(data[data.columns[1:3]], data["class"]).predict(data[data.columns[1:3]])
resultLDA_2 = CM_large(data["class"], LDA_pred_2)
resultQDA_2 = CM_large(data["class"], QDA_pred_2)
resultNB_2 = CM_large(data["class"], NB_pred_2)
result_2 = pd.DataFrame(np.array([resultLDA_2, resultQDA_2, resultNB_2]), columns = ["ACC", "TP", "TN", "TPR", "FPR"], index = ["LDA", "QDA", "Bayes"])
print(result_2["ACC"])

LDA      0.8090
QDA      0.8146
Bayes    0.8090
Name: ACC, dtype: float64


Wyznaczenie parametrów klasyfikatorów dla 5 pierwszych składowych obserwacji.


In [7]:
LDA_pred_5 = LDA_discriminant.fit(data[data.columns[1:6]], data["class"]).predict(data[data.columns[1:6]])
QDA_pred_5 = QDA_discriminant.fit(data[data.columns[1:6]], data["class"]).predict(data[data.columns[1:6]])
NB_pred_5 = NB_discriminant.fit(data[data.columns[1:6]], data["class"]).predict(data[data.columns[1:6]])
resultLDA_5 = CM_large(data["class"], LDA_pred_5)
resultQDA_5 = CM_large(data["class"], QDA_pred_5)
resultNB_5 = CM_large(data["class"], NB_pred_5)
result_5 = pd.DataFrame(np.array([resultLDA_5, resultQDA_5, resultNB_5]), columns = ["ACC", "TP", "TN", "TPR", "FPR"], index = ["LDA", "QDA", "Bayes"])
print(result_5["ACC"])

LDA      0.8764
QDA      0.8876
Bayes    0.8539
Name: ACC, dtype: float64


Wyznaczenie parametrów klasyfikatorów dla 10 pierwszych składowych obserwacji.


In [8]:
LDA_pred_10 = LDA_discriminant.fit(data[data.columns[1:11]], data["class"]).predict(data[data.columns[1:11]])
QDA_pred_10 = QDA_discriminant.fit(data[data.columns[1:11]], data["class"]).predict(data[data.columns[1:11]])
NB_pred_10 = NB_discriminant.fit(data[data.columns[1:11]], data["class"]).predict(data[data.columns[1:11]])
resultLDA_10 = CM_large(data["class"], LDA_pred_10)
resultQDA_10 = CM_large(data["class"], QDA_pred_10)
resultNB_10 = CM_large(data["class"], NB_pred_10)
result_10 = pd.DataFrame(np.array([resultLDA_10, resultQDA_10, resultNB_10]), columns = ["ACC", "TP", "TN", "TPR", "FPR"], index = ["LDA", "QDA", "Bayes"])
print(result_10["ACC"])

LDA      0.9888
QDA      0.9944
Bayes    0.9607
Name: ACC, dtype: float64


Podzielenie zbioru danych na podzbiory w stosunku 50/25/25 (80/40/40)

In [9]:
c = range(160)
indexes = sample(c, 160)
PU = data.loc[indexes[0:80]]
PW = data.loc[indexes[81:120]]
PT = data.loc[indexes[121:160]]

Predykcja zbioru PW

In [10]:
LDA_pred_2_PW = LDA_discriminant.fit(PU[PU.columns[1:3]], PU["class"]).predict(PW[PW.columns[1:3]])
QDA_pred_2_PW = QDA_discriminant.fit(PU[PU.columns[1:3]], PU["class"]).predict(PW[PW.columns[1:3]])
NB_pred_2_PW = NB_discriminant.fit(PU[PU.columns[1:3]], PU["class"]).predict(PW[PW.columns[1:3]])
resultLDA_2_PW = CM_large(PW["class"], LDA_pred_2_PW)
resultQDA_2_PW = CM_large(PW["class"], QDA_pred_2_PW)
resultNB_2_PW = CM_large(PW["class"], NB_pred_2_PW)
result_2_PW = pd.DataFrame(np.array([resultLDA_2_PW, resultQDA_2_PW, resultNB_2_PW]), columns = ["ACC", "TP", "TN", "TPR", "FPR"], index = ["LDA", "QDA", "Bayes"])
print(result_2_PW["ACC"])

LDA      0.6923
QDA      0.7436
Bayes    0.7436
Name: ACC, dtype: float64


Predykcja zbioru PT metodą Bayes

In [11]:
NB_pred_2_PT = NB_discriminant.fit(PU[PU.columns[1:3]], PU["class"]).predict(PT[PT.columns[1:3]])
resultNB_2_PT = CM_large(PT["class"], NB_pred_2_PT)
result_2_PT = pd.DataFrame(resultNB_2_PT.reshape(1,5), columns = ["ACC", "TP", "TN", "TPR", "FPR"], index = ["Bayes"])
print(result_2_PT["ACC"])

Bayes    0.8205
Name: ACC, dtype: float64


Funkcja główna kroswalidacji, tworzy macierz pomyłek

In [139]:
def CV_main(learn, test):
    learn_classifier = LDA()
    test_pred = learn_classifier.fit(learn[learn.columns[1:3]], learn["class"]).predict(test[test.columns[1:3]])
    CM = np.matrix(pd.crosstab(test["class"], test_pred))
    return (np.sum(CM) - np.sum(np.diag(CM)))

Funkcja wykonuje kroswalidację na podanym zbiorze danych dzieląc go na K pseudoprób

In [144]:
def CV(data, K):
    N = len(data)
    data_shuffled = data.sample(frac = 1)
    elements_in_group = np.floor(N/K)
    a = np.zeros(K+1)
    for i in range(K):
        a[i+1] = elements_in_group*(i+1)+1
    a = a.astype(int)
    res = np.zeros(K)  
    for i in range(K):
        res[i] = CV_main(data_shuffled.drop(range(a[i],a[i+1])),data_shuffled[a[i]:a[i+1]])
    return res, (K-1)*elements_in_group

Porównanie wyników metody LDA dla 2 pierwszych składowych

In [148]:
ar, PUcrossval = CV(data,5)
print("Skuteczność LDA (ACC):", "\npowtórne podstawienie: ", result_2.loc["LDA", "ACC"], "\npodzielenie na grupy: ", result_2_PW.loc["LDA", "ACC"], "\nkroswalidacja: ", np.round(1-sum(ar/PUcrossval),3))

Skuteczność LDA (ACC): 
powtórne podstawienie:  0.809 
podzielenie na grupy:  0.6923 
kroswalidacja:  0.729
