In [1]:
from tensorflow.keras.datasets import mnist
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Activation
from tensorflow.keras.regularizers import l2
from tensorflow.keras.layers import Input
import numpy as np
import time
import util

In [2]:
(X_train, Y_train), (X_test, Y_test) = mnist.load_data()
# Ban đầu X_train (60000, 28, 28)
X_train = X_train.reshape(60000, 784)
X_test = X_test.reshape(10000, 784)

In [3]:
num_classes, smtk = 10, 0
Y_train_nocat = Y_train
Y_train = to_categorical(Y_train, num_classes)
Y_test = to_categorical(Y_test, num_classes)

# Ví dụ
# Y_train = [2, 5, 9]  # Nhãn gốc
# Y_train = to_categorical(Y_train, num_classes=10)
# Kết quả:
# [[0. 0. 1. 0. 0. 0. 0. 0. 0. 0.]  # 2
#  [0. 0. 0. 0. 0. 1. 0. 0. 0. 0.]  # 5
#  [0. 0. 0. 0. 0. 0. 0. 0. 0. 1.]] # 9


In [4]:
batch_size = 32
# subset, random = False, False  # all
subset, random = True, False  # greedy
# subset, random = True, True  # random
subset_size = .4 if subset else 1.0
epochs = 15
reg = 1e-4
runs = 5
save_subset = False

folder = f'/tmp/mnist'

In [5]:
model = Sequential([
    Input(shape=(784,)),
    Dense(100, activation='sigmoid', kernel_regularizer=l2(reg)),
    Dense(10, activation='softmax', kernel_regularizer=l2(reg)) # num_class = 10
])

model.compile(
    loss='categorical_crossentropy',
    metrics=['accuracy'],
    optimizer='sgd'
)

In [6]:
train_loss, test_loss = np.zeros((runs, epochs)), np.zeros((runs, epochs))
train_acc, test_acc = np.zeros((runs, epochs)), np.zeros((runs, epochs))
train_time = np.zeros((runs, epochs))
grd_time, sim_time, pred_time = np.zeros((runs, epochs)), np.zeros((runs, epochs)), np.zeros((runs, epochs))
not_selected = np.zeros((runs, epochs)) # Số lượng mẫu không được chọn (nếu có cơ chế lựa chọn dữ liệu)
times_selected = np.zeros((runs, len(X_train))) # Đếm số lần mỗi mẫu trong X_train được chọn qua các runs
best_acc = 0 # Lưu accuracy tốt nhất trong tất cả các runs

In [7]:
if save_subset:
    B = int(subset_size * len(X_train))
    selected_ndx = np.zeros((runs, epochs, B)) # Chỉ số của các mẫu được chọn
    selected_wgt = np.zeros((runs, epochs, B)) # Trọng số tương ứng


In [8]:
for run in range(runs):
    X_subset = X_train
    Y_subset = Y_train
    W_subset = np.ones(len(X_subset))
    ordering_time,similarity_time, pre_time = 0, 0, 0
    loss_vec, acc_vec, time_vec = [], [], []
    for epoch in range(0, epochs):
        print('Epoch {}/{}'.format(epoch, epochs - 1))
        num_batches = int(np.ceil(X_subset.shape[0] / float(batch_size)))

        for index in range(num_batches):
            X_batch = X_subset[index * batch_size:(index + 1) * batch_size]
            Y_batch = Y_subset[index * batch_size:(index + 1) * batch_size]
            W_batch = W_subset[index * batch_size:(index + 1) * batch_size]

            start = time.time()
            loss, accuracy  = model.train_on_batch(X_batch, Y_batch, sample_weight=W_batch)
            train_time[run][epoch] += time.time() - start

            train_loss[run][epoch] = loss
            train_acc[run][epoch] = accuracy

        if subset:
            if random:
                # indices = np.random.randint(0, len(X_train), int(subset_size * len(X_train)))
                indices = np.arange(0, len(X_train))
                np.random.shuffle(indices)
                indices = indices[:int(subset_size * len(X_train))]
                W_subset = np.ones(len(indices))
            else:
                start = time.time()
                _logits = model.predict(X_train)
                pre_time = time.time() - start
                features = _logits - Y_train

                indices, W_subset, _, _, ordering_time, similarity_time = util.get_orders_and_weights(
                    int(subset_size * len(X_train)), features, 'euclidean', smtk, 0, False, Y_train_nocat)

                W_subset = W_subset / np.sum(W_subset) * len(W_subset)  # todo

            # if save_subset:
            #     selected_ndx[run, epoch], selected_wgt[run, epoch] = indices, W_subset

            grd_time[run, epoch], sim_time[run, epoch], pred_time[run, epoch] = ordering_time, similarity_time, pre_time
            times_selected[run][indices] += 1
            not_selected[run, epoch] = np.sum(times_selected[run] == 0) / len(times_selected[run]) * 100
        else:
            pred_time = 0
            indices = np.arange(len(X_train))

        X_subset = X_train[indices, :]
        Y_subset = Y_train[indices]

        start = time.time()
        score = model.evaluate(X_test, Y_test, verbose=1)
        eval_time = time.time()-start

        start = time.time()
        score_loss = model.evaluate(X_train, Y_train, verbose=1)
        print(f'eval time on training: {time.time()-start}')

        test_loss[run][epoch], test_acc[run][epoch] = score[0], score[1]
        train_loss[run][epoch], train_acc[run][epoch] = score_loss[0], score_loss[1]
        best_acc = max(test_acc[run][epoch], best_acc)

        grd = 'random_wor' if random else 'grd_normw'
        print(f'run: {run}, {grd}, subset_size: {subset_size}, epoch: {epoch}, test_acc: {test_acc[run][epoch]}, '
              f'loss: {train_loss[run][epoch]}, best_prec1_gb: {best_acc}, not selected %:{not_selected[run][epoch]}')

    if save_subset:
        print(
            f'Saving the results to {folder}_{subset_size}_{grd}_{runs}')

        np.savez(f'{folder}_{subset_size}_{grd}_{runs}',
                 # f'_{grd}_{args.lr_schedule}_start_{args.start_subset}_lag_{args.lag}_subset',
                 train_loss=train_loss, test_acc=test_acc, train_acc=train_acc, test_loss=test_loss,
                 train_time=train_time, grd_time=grd_time, sim_time=sim_time, pred_time=pred_time,
                 not_selected=not_selected, times_selected=times_selected,
                 subset=selected_ndx, weights=selected_wgt)
    else:
        print(
            f'Saving the results to {folder}_{subset_size}_{grd}_{runs}')

        np.savez(f'{folder}_{subset_size}_{grd}_{runs}',
                 # f'_{grd}_{args.lr_schedule}_start_{args.start_subset}_lag_{args.lag}',
                 train_loss=train_loss, test_acc=test_acc, train_acc=train_acc, test_loss=test_loss,
                 train_time=train_time, grd_time=grd_time, sim_time=sim_time, pred_time=pred_time,
                 not_selected=not_selected, times_selected=times_selected)


Epoch 0/14
[1m1875/1875[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 1ms/step
not equal_num
0
[    1    21    34 ... 59952 59972 59987]
5923
1
[    3     6     8 ... 59979 59984 59994]
6742
2
[    5    16    25 ... 59983 59985 59991]
5958
3
[    7    10    12 ... 59978 59980 59996]
6131
4
[    2     9    20 ... 59943 59951 59975]
5842
5
[    0    11    35 ... 59968 59993 59997]
5421
6
[   13    18    32 ... 59982 59986 59998]
5918
7
[   15    29    38 ... 59963 59977 59988]
6265
8
[   17    31    41 ... 59989 59995 59999]
5851
9
[    4    19    22 ... 59973 59990 59992]
5949
Selecting with ratios [0.09871667 0.11236667 0.0993     0.10218333 0.09736667 0.09035
 0.09863333 0.10441667 0.09751667 0.09915   ]
Class proportions [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.8888 - loss: 0.4357
[1m1875/1875[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 2ms/step - accuracy: 0.8831 - loss: 0.4467
eval t

KeyboardInterrupt: 