In [0]:
# Inspect the Python interpreter this kernel is running on.
import sys
sys.version
# A notebook cell echoes only its final expression, so this is the value shown.
sys.version_info

In [0]:
from keras.utils import to_categorical
import numpy as np
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.model_selection import train_test_split

import keras
from keras.models import Sequential, load_model
from keras.layers import Dense, Dropout, Flatten, Activation, ZeroPadding2D
from keras.layers import Conv2D, MaxPooling2D, GlobalMaxPooling2D
from keras.layers.normalization import BatchNormalization
from sklearn.ensemble import RandomForestClassifier
from keras import regularizers
import csv
import matplotlib.pyplot as plt
import time

In [0]:
# Locations of the zipped datasets that the next cell extracts into ./data.
# The trailing comment on each line keeps the alternative Google Drive path.
ZIPPED_ORIGINAL = "./data/processedData.zip" # alt: "/content/drive/My Drive/processedData.zip"
ZIPPED_DATA = "./data/extendedData.zip" # alt: "/content/drive/My Drive/extendedData.zip"
ZIPPED_KAGGLE = "./data/processed_kaggle.zip" # alt: "/content/drive/My Drive/processed_kaggle.zip"

In [0]:
import zipfile

# Unpack every archive into ./data, in the same order as before:
# extended data first, then the Kaggle set, then the original data.
for archive_path in (ZIPPED_DATA, ZIPPED_KAGGLE, ZIPPED_ORIGINAL):
    with zipfile.ZipFile(archive_path, "r") as archive:
        archive.extractall("./data")

In [0]:
# Paths of the files the notebook reads after extraction.
# TRAIN_LAB_PATH / PREPROCESSED_TRAINING point at the "extended" files; the
# trailing comments keep the non-extended alternatives.
TRAIN_LAB_PATH = "./data/extendedTrainLabels.csv" # alt: "./data/train_labels.csv"
PREPROCESSED_TRAINING = "./data/extendedTrainData.npy" # alt: "./data/processedData.npy"
# Images that predictions are generated for in the submission cell.
PREPROCESSED_KAGGLE = "./data/processed_kaggle.npy"
# Non-extended images and labels (not referenced by the cells visible here).
PREPROCESSED_ORIGINAL = "./data/processedData.npy"
ORIG_LAB_PATH = "./data/train_labels.csv"

In [0]:
# Class names; a name's position in this list is its integer class index.
CATEGORIES = [
    'apple', 'empty', 'moustache', 'mouth', 'mug', 'nail', 'nose', 'octagon',
    'paintbrush', 'panda', 'parrot', 'peanut', 'pear', 'pencil', 'penguin',
    'pillow', 'pineapple', 'pool', 'rabbit', 'rhinoceros', 'rifle',
    'rollerskates', 'sailboat', 'scorpion', 'screwdriver', 'shovel', 'sink',
    'skateboard', 'skull', 'spoon', 'squiggle',
]

def getIndexOf(category):
    """Return the position of `category` in CATEGORIES.

    Raises ValueError (from list.index) when the name is not listed.
    """
    position = CATEGORIES.index(category)
    return position

def getCategoryOf(index):
    """Return the category name stored at position `index` of CATEGORIES."""
    name = CATEGORIES[index]
    return name

def load(infile):
    """Read an array with np.load and index its entries.

    Returns a list with one `[position, entry[0]]` pair per entry of the
    loaded array, where `position` is the entry's 0-based position.
    """
    stored = np.load(infile, encoding='bytes')
    return [[position, entry[0]] for position, entry in enumerate(stored)]

def formatXData(X, xDimension = 40):
    """Turn flat pixel rows into a float32 image tensor scaled by 1/255.

    X is converted to an array, reshaped to
    (-1, xDimension, xDimension, 1), cast to float32 and divided by 255.
    """
    tensorShape = (-1, xDimension, xDimension, 1)
    pixels = np.asarray(X).reshape(tensorShape).astype('float32')
    scaled = pixels / 255
    return scaled.astype('float32')

def addRotations(X,y):
    """Augment a dataset with 90/180/270 degree rotations of every sample.

    For each sample the output holds, in order, the sample itself followed by
    np.rot90 applied with k=1, 2 and 3; the sample's label y[i] is repeated
    for all four. Returns (images, labels) as numpy arrays.
    """
    augmentedX = []
    augmentedY = []
    for position, sample in enumerate(X):
        label = y[position]
        augmentedX.extend([
            sample,
            np.rot90(sample, 1),
            np.rot90(sample, 2),
            np.rot90(sample, 3),
        ])
        augmentedY.extend([label] * 4)
    return np.asarray(augmentedX), np.asarray(augmentedY)

def formatData(images, labels, xDimension = 40, onehot=False):
    """Build the (X, y) arrays fed to the network.

    Parameters
    ----------
    images : iterable
        Items whose element at position 1 is the pixel data (the layout
        produced by `load`).
    labels :
        With onehot=False, an object supporting `labels.at[i, 'Category']`
        for i = 0..len(images)-1 that yields category names (presumably a
        pandas DataFrame with a default integer index -- confirm against the
        caller). With onehot=True, labels are returned unchanged as `y`.
    xDimension : int
        Side length passed to `formatXData`.
    onehot : bool
        True when `labels` is already one-hot encoded.

    Returns
    -------
    (X, y) : X is the float32 tensor from `formatXData`; y is either `labels`
    itself or a one-hot matrix with one column per entry of CATEGORIES.
    """
    X = []
    classIndices = []
    for i, img in enumerate(images):
        X.append(img[1])
        if not onehot:
            classIndices.append(getIndexOf(labels.at[i, 'Category']))
    X = formatXData(X, xDimension)

    if onehot:
        y = labels
    else:
        # Pin the width: without num_classes, to_categorical sizes the output
        # from the largest index present, so a subset missing the last
        # categories would produce fewer columns than the model outputs.
        y = to_categorical(classIndices, num_classes=len(CATEGORIES))
    return X, y

def split(X,y):
    """Split (X, y) into train / test / validation parts.

    15% of the samples are held out (random_state=1) and that hold-out is then
    divided in half (random_state=1) into the test and validation parts.
    Returns X_train, y_train, X_test, y_test, X_val, y_val.
    """
    X_train, X_holdout, y_train, y_holdout = train_test_split(
        X, y, test_size=0.15, random_state=1)
    X_test, X_val, y_test, y_val = train_test_split(
        X_holdout, y_holdout, test_size=0.5, random_state=1)
    return X_train, y_train, X_test, y_test, X_val, y_val

def visualizePredictions(images, predictions, indices, shape=(40, 40)):
    """Plot class probabilities next to the image for selected samples.

    One row is drawn per entry of `indices`: a bar chart of predictions[i]
    over CATEGORIES (4 of 5 grid columns) and images[i] reshaped to `shape`
    (last column), labelled with the most probable category.

    Nothing is drawn when `indices` is empty.
    """
    numRows = len(indices)
    if numRows == 0:
        # A zero-row grid / zero-height figure is invalid.
        return

    tickPositions = np.arange(len(CATEGORIES))
    # plt.figure rather than plt.subplots: the latter also creates a
    # full-figure axes that would sit underneath the grid built below.
    plt.figure(figsize=(len(CATEGORIES) + 1, numRows * 3))

    for spot, i in enumerate(indices):

        # plot probabilities:
        plt.subplot2grid((numRows, 5), (spot, 0), colspan=4)
        plt.bar(tickPositions, predictions[i], 0.35, align='center')
        plt.xticks(tickPositions, CATEGORIES)
        # Booleans are required here; the string 'off' is truthy and would
        # leave the tick marks switched on.
        plt.tick_params(axis='x', bottom=False, top=False)
        plt.ylabel('Probability')
        plt.ylim(0,1)
        plt.subplots_adjust(hspace = 0.5)

        # plot picture:
        plt.subplot2grid((numRows, 5), (spot, 4))
        plt.imshow(images[i].reshape(shape),cmap='gray_r', interpolation='nearest')
        plt.xlabel(getCategoryOf(np.argmax(predictions[i]))) # predicted category name
        plt.xticks([])
        plt.yticks([])

def visualizePredictionsJustWrong(images, predictions, actual, start = 0, end = 10, shape=(40, 40)):
    """Plot only the misclassified samples, scanning from index `start`.

    Samples are examined from `start` up to len(images); `end` does not stop
    the scan, it only caps the number of rows drawn at `end - start`. Each row
    shows the probability bar chart and the image, labelled
    "<index>:<predicted>/<actual>".

    Nothing is drawn when `end - start` is not positive.
    """
    maxRows = end - start
    if maxRows <= 0:
        # No rows requested; a grid with <= 0 rows cannot be created.
        return

    tickPositions = np.arange(len(CATEGORIES))
    # plt.figure rather than plt.subplots: the latter also creates a
    # full-figure axes that would sit underneath the grid built below.
    plt.figure(figsize=(len(CATEGORIES) + 1, 30))
    numFound = 0
    for i in range(start, len(images)):
        predictedIdx = np.argmax(predictions[i])
        actualIdx = np.argmax(actual[i])
        if predictedIdx == actualIdx:
            continue

        # plot probabilities:
        plt.subplot2grid((maxRows, 5), (numFound, 0), colspan=4)
        plt.bar(tickPositions, predictions[i], 0.35, align='center')
        plt.xticks(tickPositions, CATEGORIES)
        # Booleans are required here; the string 'off' is truthy and would
        # leave the tick marks switched on.
        plt.tick_params(axis='x', bottom=False, top=False)
        plt.ylabel('Probability')
        plt.ylim(0,1)
        plt.subplots_adjust(hspace = 0.5)

        # plot picture:
        plt.subplot2grid((maxRows, 5), (numFound, 4))
        plt.imshow(images[i].reshape(shape),cmap='gray_r', interpolation='nearest')
        plt.xlabel(str(i) + ":" + getCategoryOf(predictedIdx) + "/" + getCategoryOf(actualIdx)) # predicted/actual names
        plt.xticks([])
        plt.yticks([])

        numFound += 1
        if numFound >= maxRows:
            return
            
def plotHistory(history):
    """Plot training vs. validation accuracy and loss from a Keras History.

    `history.history` must contain 'loss' and 'val_loss' plus the accuracy
    series under either 'acc'/'val_acc' or 'accuracy'/'val_accuracy'
    (the key name depends on the Keras version when metrics=['accuracy']).
    Two figures are shown, accuracy first and then loss.
    """
    recorded = history.history
    # Older Keras reports 'acc'; Keras >= 2.3 reports the name as given.
    accKey = 'acc' if 'acc' in recorded else 'accuracy'

    panels = (
        (accKey, 'val_' + accKey, 'model accuracy', 'accuracy'),
        ('loss', 'val_loss', 'model loss', 'loss'),
    )
    for trainKey, valKey, title, yLabel in panels:
        plt.plot(recorded[trainKey])
        plt.plot(recorded[valKey])
        plt.title(title)
        plt.ylabel(yLabel)
        plt.xlabel('epoch')
        plt.legend(['train', 'test'], loc='upper left')
        plt.show()

def loadModel(path):
    """Load and return the Keras model saved at `path` (delegates to load_model)."""
    restored = load_model(path)
    return restored

def savePredictions(outfile, predictions):
    """Write the predicted category of every sample to a CSV file or stdout.

    Parameters
    ----------
    outfile : str
        Destination path. An empty string prints "<row> <category>" lines
        instead of writing a file.
    predictions : iterable
        One score vector per sample; the arg-max position is mapped to a
        category name with `getCategoryOf`.

    The CSV has the header `Id,Category`, with Id being the 0-based row
    position of the prediction.
    """
    rows = [(i, getCategoryOf(np.argmax(prediction)))
            for i, prediction in enumerate(predictions)]

    if outfile == '':
        for i, category in rows:
            print(i, category)
        return

    # newline='' is what the csv module expects; without it the writer's own
    # '\r\n' line endings get translated again on Windows, yielding blank rows.
    with open(outfile, 'w', newline='') as csvfile:
        fieldnames = ['Id', 'Category']
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()
        for i, category in rows:
            writer.writerow({'Id': i, 'Category': category})


In [0]:
def paddingAndNormalization(input_shape=(40, 40, 1), num_classes=None):
    """Build and compile the zero-padded, batch-normalised CNN classifier.

    Parameters
    ----------
    input_shape : tuple
        Shape of one input sample.
    num_classes : int, optional
        Width of the softmax output. Defaults to len(CATEGORIES), so the
        function no longer depends on a global that is only assigned in a
        later notebook cell.

    Returns
    -------
    The compiled Sequential model (categorical cross-entropy loss, Adam
    optimizer, accuracy metric). The model summary is printed as a side
    effect.
    """
    if num_classes is None:
        num_classes = len(CATEGORIES)

    model = Sequential()

    # Convolutional feature extractor.
    model.add(ZeroPadding2D(padding=(1, 1), input_shape=input_shape))
    model.add(Conv2D(32, kernel_size=(3, 3),
                 activation='relu',
                 kernel_initializer='he_normal'))
    model.add(MaxPooling2D((2, 2)))

    model.add(ZeroPadding2D(padding=(1, 1)))
    model.add(Conv2D(64, (3, 3), activation='relu'))
    model.add(MaxPooling2D(pool_size=(2, 2)))

    model.add(ZeroPadding2D(padding=(1, 1)))
    model.add(Dropout(0.25))
    model.add(Conv2D(128, (3, 3), activation='relu'))
    model.add(BatchNormalization())

    model.add(ZeroPadding2D(padding=(1, 1)))
    model.add(Dropout(0.4))
    model.add(Conv2D(512, (3, 3), activation='relu'))
    model.add(BatchNormalization())
    model.add(MaxPooling2D(pool_size=(2, 2)))

    # Last convolution is applied without padding.
    model.add(Dropout(0.5))
    model.add(Conv2D(512, (3, 3), activation='relu'))
    model.add(BatchNormalization())
    model.add(Dropout(0.5))

    # Dense classification head.
    model.add(Flatten())
    model.add(Dense(512, activation='relu'))
    model.add(BatchNormalization())
    model.add(Dropout(0.5))
    model.add(Dense(num_classes, activation='softmax'))

    model.compile(loss=keras.losses.categorical_crossentropy,
              optimizer=keras.optimizers.Adam(),
              metrics=['accuracy'])
    model.summary()
    return model

In [0]:
# Training hyperparameters shared by the cells below.
batch_size = 128
epochs = 40
# One output unit per entry of CATEGORIES.
num_classes = len(CATEGORIES)

In [0]:
# Separate originals from transformed images for sanity
# Entries at positions 0, 60, 120, ... are pulled out as the "originals"
# (presumably each original is followed by its 59 transformed copies --
# confirm against the script that generated the extended data).
GROUP_SIZE = 60

training_imgs = load(PREPROCESSED_TRAINING)
labels = np.genfromtxt(TRAIN_LAB_PATH, skip_header=0)
assert len(labels) == len(training_imgs), "images and labels must line up one-to-one"

# Number of complete groups; entries in a trailing partial group stay in the
# training data.
idx = int(len(training_imgs)/GROUP_SIZE)
orig_positions = slice(0, GROUP_SIZE * idx, GROUP_SIZE)

# Strided slicing extracts/removes all originals in a single pass instead of
# one O(n) list.pop per original.
origs_img = training_imgs[orig_positions]
del training_imgs[orig_positions]

is_original = np.zeros(len(labels), dtype=bool)
is_original[orig_positions] = True
origs_lab = labels[is_original]
labels = labels[~is_original]

In [0]:
# X, y are the extended data minus the originals; X_val, y_val are the originals.
# onehot=True makes formatData return the label arrays unchanged: they are
# already one-hot encoded because they come from concurrent_rotate.py.
X,y = formatData(training_imgs, labels, onehot=True)
X_val, y_val = formatData(origs_img, origs_lab, onehot=True)

In [0]:
# Memory is tight on Colab: drop the raw image list and label array now that
# X and y have been built from them, then force a garbage-collection pass.
del training_imgs
del labels
import gc
gc.collect()

In [0]:
# Build a fresh, untrained model (this call also prints the layer summary).
model5 = paddingAndNormalization()

In [0]:
# Keep these across training runs: each fit cell appends its per-epoch
# training loss and accuracy here so the full history can be saved and plotted.
losses = []
accs = []

In [0]:
# Initial training run on the extended data (`epochs` is set to 40 in the
# hyperparameter cell), validated against the held-out originals.
history = model5.fit(X, y,
          batch_size=batch_size,
          epochs=epochs,
          verbose=1,
          validation_data=(X_val, y_val))

# Older Keras records the metric as 'acc'; Keras >= 2.3 records 'accuracy'.
acc_key = 'acc' if 'acc' in history.history else 'accuracy'
losses.extend(history.history['loss'])
accs.extend(history.history[acc_key])

# Persist the model and the accumulated training curves so a later session
# can resume from here.
model5.save('model.h5')
np.savetxt('modelacc.txt', accs)
np.savetxt('modelloss.txt', losses)

In [0]:
# Run this cell only when resuming from the model saved by the initial
# training cell above; skip it otherwise.
# Restores the model together with the accuracy/loss curves saved next to it.
model5 = load_model('model.h5')
accs = np.loadtxt('modelacc.txt').tolist()
losses = np.loadtxt('modelloss.txt').tolist()

In [0]:
# If continuing from saved extra-trained model from below, start here, else ignore.
model5 = load_model('model-extra.h5')
# Load saved loss/acc history for extra-trained model.
# Each list is read from the file it was written to by the extra-training
# cell: accuracy from modelacc-extra.txt, loss from modelloss-extra.txt.
accs = np.loadtxt('modelacc-extra.txt').tolist()
losses = np.loadtxt('modelloss-extra.txt').tolist()

In [0]:
# 5 epochs extended data / 10 epochs original data (for submission to Kaggle, so train on all data).
# No validation set is passed: the originals used for validation earlier are
# trained on in the second fit below.
history = model5.fit(X, y,
          batch_size=batch_size,
          epochs=5,
          verbose=1)

# Older Keras records the metric as 'acc'; Keras >= 2.3 records 'accuracy'.
acc_key = 'acc' if 'acc' in history.history else 'accuracy'
losses.extend(history.history['loss'])
accs.extend(history.history[acc_key])

history = model5.fit(X_val, y_val,
          batch_size=batch_size,
          epochs=10,
          verbose=1)

losses.extend(history.history['loss'])
accs.extend(history.history[acc_key])

# Predict on the Kaggle images (element 1 of each loaded entry is the pixel data).
preprocessed_kaggle = load(PREPROCESSED_KAGGLE)
X_kaggle = formatXData([entry[1] for entry in preprocessed_kaggle])
kaggle_predictions = model5.predict(X_kaggle)

# Save predictions
savePredictions('model-extra.csv',kaggle_predictions)
# Save model with extra training
model5.save('model-extra.h5')  # creates the HDF5 file 'model-extra.h5'
# Save loss/accuracy history
np.savetxt('modelacc-extra.txt', accs)
np.savetxt('modelloss-extra.txt', losses)

In [0]:
# Plot the accumulated training curves: accuracy first, then loss.
# Each entry is (values, figure title, y-axis label).
for curve, heading, axis_label in (
        (accs, 'model accuracy', 'accuracy'),
        (losses, 'model loss', 'loss'),
):
    plt.plot(curve)
    plt.title(heading)
    plt.ylabel(axis_label)
    plt.xlabel('epoch')
    plt.legend(['train'], loc='upper left')
    plt.show()