Import các thư viện cần thiết

In [1]:
import os
import numpy as np
import sklearn
import time
from sklearn.decomposition import PCA
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix

Khai báo đường dẫn file dữ liệu

In [2]:
# set names to the paths because they're too long
data_path = 'D:/Code/python/mat3533/practice04/data/'
# train path
train_images_path = os.path.join(data_path, 'train-images-idx3-ubyte.gz')
train_labels_path = os.path.join(data_path, 'train-labels-idx1-ubyte.gz')
# test path
test_images_path = os.path.join(data_path, 't10k-images-idx3-ubyte.gz')
test_labels_path = os.path.join(data_path, 't10k-labels-idx1-ubyte.gz')

Xây dựng phương thức đọc dữ liệu từ tệp gzip, giải nén và đưa về định dạng là một dãy ảnh (một dãy ma trận
nguyên)

In [3]:
def get_mnist_data(images_path, labels_path, num_images, shuffle=False, _is=True, image_size=28):
    """
    This shuffle param is active when .gz is downloaded at:
    - 'http://yann.lecun.com/exdb/mnist/'
    - This function return random num_images in 60000 or 10000
    """
    # read data
    import gzip # to decompress gz (zip) file
    # open file training to read training data
    f_images = gzip.open(images_path,'r')
    # skip 16 first bytes because these are not data, only header infor
    f_images.read(16)
    # general: read num_images data samples if this parameter is set;
    # if not, read all (60000 training or 10000 test)
    real_num = num_images if not shuffle else (60000 if _is else 10000)
    # read all data to buf_images (28x28xreal_num)
    buf_images = f_images.read(image_size * image_size * real_num)
    # images
    images = np.frombuffer(buf_images, dtype=np.uint8).astype(np.float32)
    images = images.reshape(real_num, image_size, image_size,)
    # Read labels
    f_labels = gzip.open(labels_path,'r')
    f_labels.read(8)
    labels = np.zeros((real_num)).astype(np.int64)
    # rearrange to correspond the images and labels
    for i in range(0, real_num):
        buf_labels = f_labels.read(1)
        labels[i] = np.frombuffer(buf_labels, dtype=np.uint8).astype(np.int64)

    # shuffle to get random images data
    if shuffle is True:
        rand_id = np.random.randint(real_num, size=num_images)
        images = images[rand_id, :]
        labels = labels[rand_id,]
    # change images data to type of vector 28x28 dimentional
    images = images.reshape(num_images, image_size * image_size)
    return images, labels

Lấy 5000 ảnh và kiểm tra dữ liệu

In [6]:
train_images, train_labels = get_mnist_data(
    train_images_path, train_labels_path, 5000, shuffle=True)
test_images, test_labels = get_mnist_data(
    test_images_path, test_labels_path, 10000, _is=False, shuffle=True)
print(train_images.shape, train_labels.shape)
print(test_images.shape, test_labels.shape)

(5000, 784) (5000,)
(10000, 784) (10000,)


Đưa dữ liệu về 2 chiều và thực hiện mô hình Naive Bayes

In [7]:
pca = PCA(n_components=2)
train_images_2d = pca.fit_transform(train_images)

X_train, X_test, y_train, y_test = train_test_split(train_images_2d, train_labels, test_size=0.3, random_state=42)

start_time = time.time()

model_pca_002_nb = GaussianNB()
model_pca_002_nb.fit(X_train, y_train)

y_pred_pca_nb = model_pca_002_nb.predict(X_test)
print("Prediction: ", y_pred_pca_nb)

accuracy_pca_nb = accuracy_score(y_test, y_pred_pca_nb)
print("Accuracy: ", accuracy_pca_nb)

train_time = time.time() - start_time

print("Time: ", train_time)

Prediction:  [2 3 7 ... 4 0 7]
Accuracy:  0.41533333333333333
Time:  0.007994651794433594


Sử dụng dữ liệu nguyên bản, chạy mô hình Naive Bayes

In [8]:
X_train_full, X_test_full, y_train_full, y_test_full = train_test_split(train_images, train_labels, test_size=0.3, random_state=42)

start_time = time.time()

model_full_nb = GaussianNB()
model_full_nb.fit(X_train_full, y_train_full)

y_pred_full_nb = model_full_nb.predict(X_test_full)
print("Prediction: ", y_pred_full_nb)

accuracy_full_nb = accuracy_score(y_test_full, y_pred_full_nb)
print("Accuracy: ", accuracy_full_nb)

train_time_full = time.time() - start_time

print("Time: ", train_time_full)

Prediction:  [8 6 1 ... 8 0 9]
Accuracy:  0.5866666666666667
Time:  0.12240767478942871


Sử dungjtaajp dữ liệu 500 ảnh, đưa về 100 chiều và thực hiện mô hình Naive Bayes

In [9]:
pca_100 = PCA(n_components=100)
train_images_100d = pca_100.fit_transform(train_images)

X_train_pca, X_test_pca, y_train_pca, y_test_pca = train_test_split(train_images_100d, train_labels, test_size=0.3, random_state=42)

start_time = time.time()

model_pca_100_nb = GaussianNB()
model_pca_100_nb.fit(X_train_pca, y_train_pca)

y_pred_pca_nb = model_pca_100_nb.predict(X_test_pca)
print("Prediction: ", y_pred_pca_nb)

accuracy_pca_nb = accuracy_score(y_test_pca, y_pred_pca_nb)
print("Accuracy: ", accuracy_pca_nb)

train_time = time.time() - start_time

print("Time: ", train_time)

Prediction:  [8 6 9 ... 4 0 9]
Accuracy:  0.8573333333333333
Time:  0.02588963508605957
