In [1]:
import tensorflow as tf
import keras
import numpy as np
import os

In [2]:
BATCH_SIZE = 256

In [3]:
def load_data_artifact(dir_path):
    train, valid = tf.keras.preprocessing.image_dataset_from_directory(
        dir_path,
        label_mode='binary',
        image_size=[200, 200],
        validation_split=0.2,
        subset='both',
        shuffle=True,
        seed=42,
        interpolation='area',
        batch_size=BATCH_SIZE
    )

    return train, valid

In [4]:
train, valid = load_data_artifact("../../datasets/artifact")

Found 1934329 files belonging to 2 classes.
Using 1547464 files for training.
Using 386865 files for validation.


In [5]:
train = train.map(lambda x, y: (tf.keras.applications.vgg19.preprocess_input(x), y))
valid = valid.map(lambda x, y: (tf.keras.applications.vgg19.preprocess_input(x), y))

In [6]:
model = keras.applications.VGG19(False, 'imagenet', input_shape=(200, 200, 3))
model.trainable = False

In [7]:
model.summary()

Model: "vgg19"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 200, 200, 3)]     0         
                                                                 
 block1_conv1 (Conv2D)       (None, 200, 200, 64)      1792      
                                                                 
 block1_conv2 (Conv2D)       (None, 200, 200, 64)      36928     
                                                                 
 block1_pool (MaxPooling2D)  (None, 100, 100, 64)      0         
                                                                 
 block2_conv1 (Conv2D)       (None, 100, 100, 128)     73856     
                                                                 
 block2_conv2 (Conv2D)       (None, 100, 100, 128)     147584    
                                                                 
 block2_pool (MaxPooling2D)  (None, 50, 50, 128)       0     

In [8]:
def extract_features(dataset, model, sample_size, filename, to_process):
    all_features = []
    all_labels = []
    i = 0
    processed = 0
    
    for images, labels in dataset:
        batch_features = model.predict(images)
        all_features.append(batch_features)
        all_labels.append(labels.numpy())
        i += 1
        processed += 1
        if i >= sample_size:
            files = os.listdir("../../datasets/artifact_features/vgg/")
            file_count = len([file for file in files if filename+"_x" in file])
            np.save("../../datasets/artifact_features/vgg/"+filename+"_x_"+str(file_count)+".npy", np.concatenate(all_features, axis=0))
            np.save("../../datasets/artifact_features/vgg/"+filename+"_y_"+str(file_count)+".npy", np.concatenate(all_labels, axis=0))
            all_features.clear()
            all_labels.clear()
            print("Total processed images: ", processed*BATCH_SIZE)
            if processed >= to_process:
                break
            i = 0

In [9]:
extract_features(train, model, (2**15)//BATCH_SIZE, filename='train', to_process=500000//BATCH_SIZE)

Total processed images:  32768
Total processed images:  65536
Total processed images:  98304
Total processed images:  131072
Total processed images:  163840
Total processed images:  196608
Total processed images:  229376
Total processed images:  262144
Total processed images:  294912
Total processed images:  327680
Total processed images:  360448
Total processed images:  393216
Total processed images:  425984
Total processed images:  458752
Total processed images:  491520
Total processed images:  524288


In [14]:
extract_features(valid, model, (2**15)//BATCH_SIZE, filename='valid', to_process=98304//BATCH_SIZE)

Total processed images:  32768
Total processed images:  65536
Total processed images:  98304


ValueError: need at least one array to concatenate