VGG16 s feature extraction


In [1]:
#!unzip -qq '/content/drive/MyDrive/Datafiles/it4n/cwe3categ.zip'
#!unzip -qq '/content/drive/MyDrive/Datafiles/digits/digits_small.zip'
#!unzip -qq '/content/drive/MyDrive/Datafiles/it4n/cwe3categ_augmented.zip'
#!unzip -qq '/content/drive/MyDrive/Datafiles/it4n/it4n_train_reduced.zip'

# TEST DATA
# (the CWE images are additionally somewhat protected and must not leave the university grounds)
!wget -qq 'https://github.com/lukyfox/Datafiles/raw/master/digits/digits.zip'  
!unzip -qq '/content/digits.zip'

unzip:  cannot find or open /content/drive/MyDrive/Datafiles/it4n/cwe3categ.zip, /content/drive/MyDrive/Datafiles/it4n/cwe3categ.zip.zip or /content/drive/MyDrive/Datafiles/it4n/cwe3categ.zip.ZIP.


In [None]:
from keras.layers import Conv2D, MaxPooling2D, Flatten, Dense, Dropout
from keras.models import Sequential
from keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.optimizers import Adam
from keras.callbacks import EarlyStopping, Callback
from keras import backend
from tensorflow.keras.applications import VGG16

from keras.metrics import Precision, Recall
from sklearn.metrics import classification_report

from matplotlib import pyplot as plt
import pandas as pd
import numpy as np
from collections import Counter


class AccuracyCallback(Callback):
    """Print overall and per-class accuracy on a fixed data set after each epoch.

    Args:
        test_data: ``(x_data, y_data)`` pair. ``x_data`` is passed to
            ``self.model.predict``; every row of ``y_data`` is reduced with
            ``np.argmax`` to obtain the true class index, so one-hot encoded
            rows are expected.
        classes: iterable of class identifiers. Only its length and the
            printed form of each item are used.
    """

    def __init__(self, test_data, classes):
        # Let the Keras base class initialise its own bookkeeping attributes.
        super().__init__()
        self.test_data = test_data
        self.classes = list(classes)

    def on_epoch_end(self, epoch, logs=None):
        """Predict on ``self.test_data`` and print hit/miss statistics.

        A class without any sample, and an empty evaluation set as a whole,
        are reported with the sentinel accuracy ``-1`` instead of raising
        ``ZeroDivisionError``.
        """
        x_data, y_data = self.test_data

        predictions = self.model.predict(x_data, verbose=0)

        class_correct = [0] * len(self.classes)
        class_incorrect = [0] * len(self.classes)

        # Hits and misses are booked under the *true* class of each sample.
        for y, res in zip(y_data, predictions):
            actual_label = np.argmax(y)
            pred_label = np.argmax(res)

            if pred_label == actual_label:
                class_correct[actual_label] += 1
            else:
                class_incorrect[actual_label] += 1

        correct = sum(class_correct)
        incorrect = sum(class_incorrect)

        print('\tclass_correct =', class_correct)
        print('\tclass_incorrect =', class_incorrect)
        print("\tCorrect: %d" %(correct))
        print("\tIncorrect: %d" %(incorrect))

        for name, hits, misses in zip(self.classes, class_correct, class_incorrect):
            tot = float(hits + misses)
            print(f'tot({tot}) = float(class_correct[i]({hits}) + class_incorrect[i]({misses}))')
            class_acc = -1
            if tot > 0:
                class_acc = float(hits) / tot
                print(f'class_acc({class_acc}) = float(class_correct[i])({hits}) / tot({tot})')

            print("\tself.classes[i] = %s: class_acc = %.3f" %(name, class_acc))

        total = correct + incorrect
        # Same -1 sentinel as the per-class accuracy when nothing was evaluated.
        acc = float(correct) / float(total) if total > 0 else -1
        print(f'acc({acc}) = float(correct)({correct}) / float(correct + incorrect)({total})')

        print("\tCurrent Network Accuracy: %.3f" %(acc))



# Reset all Keras state (possibly unnecessary, kept as a precaution).
backend.clear_session()

# Input shape fed to VGG16.
# VGG16 requires at least (32, 32, 3), so the "mnist" sanity-check set
# (10k training images of digits 0-2, 3k validation images, 28x28 pixels in a
# single grayscale channel) has to be resized to that minimum anyway.
# Author's notes: with this network setup the mnist set reaches over 99 %
# accuracy with under 1 % overfitting; testing on this high-contrast set shows
# whether the model works at all, and it already improves on the simple CNN.
# Accuracy on the capsule images is 75-80 %, practically the same as the
# simple CNN.
# An earlier setting was (330, 330, 3); it used to be assigned here and was
# overwritten before any use, so only the effective value is kept.
image_shape = (32, 32, 3)

# conv_base is the convolutional base of VGG16. include_top=False means the
# ImageNet classifier is not loaded; a custom classifier is attached instead.
# Author's rationale: the training data are not from ImageNet (its 1000
# classes probably do not cover intestines), so the classifier has to learn
# new patterns, but with so little data it still pays off to reuse the
# general features of the pretrained convolutional layers.
conv_base = VGG16(weights='imagenet', include_top=False, input_shape=image_shape)
# Freeze every layer of the convolutional base.
for layer in conv_base.layers:
    layer.trainable = False
# Print the layers of the convolutional base.
conv_base.summary()

# 2. Load the data and run it through the VGG16 convolutional base.
# A single ImageDataGenerator without augmentation serves both data sets.
# Author's rationale: the features come from the already trained convolutional
# part of VGG16, so there is nowhere to apply custom augmentation.
# NOTE(review): no VGG16-specific input preprocessing is configured here —
# confirm that feeding raw pixel values is intended.
imagedatagen = ImageDataGenerator()
batch_size = 20


def extract_features_from_conv_layers(source_dir):
    """Run every image below ``source_dir`` through ``conv_base``.

    The images are read with ``imagedatagen.flow_from_directory`` in batches
    of ``batch_size`` and resized to ``image_shape[:2]``.

    Args:
        source_dir: directory passed to ``flow_from_directory``.

    Returns:
        Tuple ``(features, labels, sample_count, counter)`` where ``features``
        has shape ``(sample_count,) + conv_base.output_shape[1:]``, ``labels``
        holds the categorical label rows, ``sample_count`` is the number of
        images found and ``counter`` is a ``Counter`` of the class indices.
    """
    # Stream the directory content as categorical batches. RGB colour mode is
    # kept for VGG16 even for the grayscale mnist images.
    imageflow = imagedatagen.flow_from_directory(
        source_dir,
        color_mode='rgb',
        batch_size=batch_size,
        class_mode='categorical',
        target_size=image_shape[:2])

    counter = Counter(imageflow.classes)
    print(counter.items())
    class_dict = imageflow.class_indices
    sample_count = imageflow.samples

    # Size the buffer from the real output of the convolutional base. A
    # hard-coded (10, 10, 512) only fits a 330x330 input; for any other input
    # size NumPy would silently broadcast the smaller feature maps into it.
    feature_shape = tuple(conv_base.output_shape[1:])
    features = np.zeros(shape=(sample_count,) + feature_shape)
    labels = np.zeros(shape=(sample_count, len(class_dict)))

    # Fill features/labels batch by batch; the new features (and unchanged
    # labels) are the input of the custom classifier.
    # predict returns the activations of the last convolutional block, see
    # https://www.youtube.com/watch?v=ZJRPTBVBV5c&feature=youtu.be&t=1407
    for batch_index, (input_batch, label_batch) in enumerate(imageflow):
        start = batch_index * batch_size
        stop = start + batch_size
        features[start:stop] = conv_base.predict(input_batch, verbose=0)
        labels[start:stop] = label_batch
        # The generator keeps yielding batches forever, so stop explicitly
        # once the last sample has been processed.
        if stop >= sample_count:
            print('sample processing finished: i*batch_size >= sample_count =', stop, '>=', sample_count)
            break

    return features, labels, sample_count, counter


# Convert the sample images of the input folders with the VGG16 convolutional
# layers; the result is the input of the custom classifier.
#train_path = '/content/_splitted/train'
train_path = '/content/digits/train'
#train_path = '/content/cwe3categ_augmented/train'
#train_path = '/content/it4n_balanced'
#train_path = '/content/drive/MyDrive/Datafiles/cwe_dataset/train'
#train_path = '/content/drive/MyDrive/Datafiles/it4n_2categs/train'

train_features, train_labels, train_count, train_classes = extract_features_from_conv_layers(train_path)
# Flatten each feature map to one vector (the equivalent of a Flatten layer);
# -1 lets NumPy derive the vector length from the actual feature shape.
print('sample_count:', train_count)
print(train_features.shape)
train_features = np.reshape(train_features, (train_count, -1))
print(train_features.shape)

#validation_path = '/content/_splitted/validation'
validation_path = '/content/digits/validation'
#validation_path = '/content/cwe3categ_augmented/validation'
#validation_path = '/content/drive/MyDrive/Datafiles/cwe_dataset/validation'
#validation_path = '/content/drive/MyDrive/Datafiles/it4n_2categs/validation'
val_features, val_labels, val_count, val_classes = extract_features_from_conv_layers(validation_path)
print(val_features.shape)
val_features = np.reshape(val_features, (val_count, -1))
print(val_features.shape)

# Network dimensions come from the training data, not from whichever
# extraction call happened to run last.
feature_dim = int(train_features.shape[1])
num_classes = int(train_labels.shape[1])

# 3. Custom classifier on top of the VGG16 features (the same classifier the
# author used for the simple CNN). Dense needs a 1-D input per sample, which
# is why the features were flattened above.
model = Sequential()
model.add(Dense(256, activation='relu', input_dim=feature_dim))
model.add(Dropout(0.5))
model.add(Dense(num_classes, activation='softmax'))

# 4. Compile the model and start training.
opt = Adam(learning_rate=0.0001)
model.compile(optimizer=opt, loss='categorical_crossentropy', metrics=['accuracy'])

early_stopping = EarlyStopping(monitor='val_loss', patience=6)
accuracy_callback = AccuracyCallback((val_features, val_labels), range(num_classes))

# Uniform weights for every class index; adjust individual entries to
# counter class imbalance (e.g. {0: 100.0, 1: 1.0, 2: 1000.0}).
class_weight = {class_index: 1.0 for class_index in range(num_classes)}

epochs = 20
history = model.fit(
    train_features, train_labels,
    epochs=epochs,
    validation_data=(val_features, val_labels),
    callbacks=[early_stopping, accuracy_callback],
    class_weight=class_weight
    )

# 5. Print the results and visualise them.
accuracy = history.history['accuracy']
val_accuracy = history.history['val_accuracy']
loss = history.history['loss']
val_loss = history.history['val_loss']
xepochs = range(1, len(accuracy)+1)

# One figure per metric; each figure is opened before it is drawn on so no
# empty trailing figure is left, and the legend makes the labels visible.
plt.figure()
plt.plot(xepochs, accuracy, 'bo', label='Training accuracy')
plt.plot(xepochs, val_accuracy, 'b', label='Validation accuracy')
plt.title('Accuracy')
plt.legend()

plt.figure()
plt.plot(xepochs, loss, 'bo', label='Training loss')
plt.plot(xepochs, val_loss, 'b', label='Validation loss')
plt.title('Loss')
plt.legend()

df = pd.DataFrame(history.history)
print(df)

plt.show()