In [None]:
# CELL 0 : import 

from os import walk 
from keras.preprocessing.image import ImageDataGenerator
from matplotlib.pyplot import imshow, subplots, title, xticks, legend, show, figure
from numpy import zeros, arange, where
from keras import Sequential
from keras.layers import RandomZoom, RandomRotation, Conv2D, MaxPool2D, Dropout, Flatten, Dense
from random import randint
from tensorflow import gather  
from keras.utils import to_categorical

In [None]:
# First part : training the model 

#-----------------------------------------------------------------------------

# CELL 1.1 : accessing to data 
# Needs [ CELL 0 : import ]

# defines variables 

#-----------------------------------------------------------------------------

# Locations of the two dataset folders.
# Forward slashes are used because Python accepts them on every platform,
# whereas a backslash inside a normal string literal starts an escape
# sequence ('\T' is an invalid one) and is not a path separator on POSIX.
path_training = 'BelgiumTSC_Training/Training'
path_testing = 'BelgiumTSC_Testing/Testing'

# Number of classes; it can be recounted with len(next(walk(path_training))[1])
nbr_class = 62
print('number of classes : ', nbr_class)

#-----------------------------------------------------------------------------

# resol is the size every image is resized to, for convenience

resol = (50,50) # Square image only ! (data augmentation flip)
print("Resolution of images : ", resol)

In [None]:
# CELL 1.2 : accessing to data 
# Needs [ CELL 1.1 ]

#-----------------------------------------------------------------------------

# retrieving the number of images to be treated in the training folder

# Count the .ppm / .jpeg files of the training folder; the count is used as
# batch size so that one single batch holds the whole folder.
DIR = path_training
counter = sum(
    file.endswith(('.ppm', '.jpeg'))
    for root, dirs, files in walk(DIR)
    for file in files
)

print("number of images in the training folder : ", counter)

# walk() silently yields nothing for a missing folder, and a batch size of 0
# would only fail later with an obscure error, so stop here with a clear one.
if counter == 0:
    raise FileNotFoundError("no .ppm or .jpeg image found under " + repr(DIR))

#-----------------------------------------------------------------------------

# Creating an image.DirectoryIterator to work over the images of the training folder
# Pixel values are multiplied by 1/255.
# class_mode='sparse' yields one integer class index per image, which is the
# documented mode for more than two classes ('binary' is meant for two).

train_datagen = ImageDataGenerator(rescale=1./255)
training_set = train_datagen.flow_from_directory(
    path_training,
    target_size=resol,
    batch_size=counter,
    class_mode='sparse',
    color_mode='rgb',
)

In [None]:
# CELL 1.3 : accessing to data 
# Needs [ CELL 1.1 ]

#-----------------------------------------------------------------------------

# retrieving the number of images to be treated in the testing folder

# Count the .ppm / .jpeg files of the testing folder; the count is used as
# batch size so that one single batch holds the whole folder.
DIR = path_testing
counter = sum(
    file.endswith(('.ppm', '.jpeg'))
    for root, dirs, files in walk(DIR)
    for file in files
)

print("number of images in the testing folder : ", counter)

# walk() silently yields nothing for a missing folder, and a batch size of 0
# would only fail later with an obscure error, so stop here with a clear one.
if counter == 0:
    raise FileNotFoundError("no .ppm or .jpeg image found under " + repr(DIR))

#-----------------------------------------------------------------------------

# Creating an image.DirectoryIterator to work over the images of the testing folder
# Pixel values are multiplied by 1/255.
# class_mode='sparse' yields one integer class index per image, which is the
# documented mode for more than two classes ('binary' is meant for two).

test_datagen = ImageDataGenerator(rescale=1./255)
test_set = test_datagen.flow_from_directory(
    path_testing,
    target_size=resol,
    batch_size=counter,
    class_mode='sparse',
    color_mode='rgb',
)

In [None]:
# CELL 1.4 : storing in arrays
# Needs [ CELL 1.2, CELL 1.3 ]

#-----------------------------------------------------------------------------

# Storing all the information in arrays for convenience 

# Each iterator was created with a batch size equal to the number of images
# counted in its folder, so one batch is expected to hold the whole set.
# The builtin next() is used instead of the iterator's .next() method, which
# recent Keras releases no longer provide.
X_train, y_train = next(training_set)
X_test, y_test = next(test_set)

print("Shape of X_train : ", X_train.shape)
print("Shape of y_train : ", y_train.shape)
print("Shape of X_test : ", X_test.shape)
print("Shape of y_test : ", y_test.shape)

In [None]:
# CELL 1.5 : visualisation of the initial sets
# Needs [ CELL 1.4 ] 

#-----------------------------------------------------------------------------

# Print the number of signs of each type in the initial sets

# Per-class image counts of the initial training and test sets
initial_nbr_train = zeros(nbr_class, dtype=int)
initial_nbr_test = zeros(nbr_class, dtype=int)

for class_id in range(nbr_class):
    # Summing the boolean mask gives the number of images labelled class_id
    initial_nbr_train[class_id] = int((y_train == class_id).sum())
    initial_nbr_test[class_id] = int((y_test == class_id).sum())

#-----------------------------------------------------------------------------

# Report for the training set (each line is followed by an empty line)
print("Number of each sign in the train set : ", end="\n\n")
print(initial_nbr_train, end="\n\n")
print("Total of signs : ", initial_nbr_train.sum(), end="\n\n")

# Report for the test set
print("Number of each sign in the test set : ", end="\n\n")
print(initial_nbr_test, end="\n\n")
print("Total of signs : ", initial_nbr_test.sum())

In [None]:
# CELL 1.6 :  visualisation of the dataset
# Needs [ CELL 1.5 ]

# Defines a function that plots a histogram of the classes present in each set
# It also prints the same information as CELL 1.5

#-----------------------------------------------------------------------------

def graphs(nrb_train, y_train):
    """Print the per-class counts of both sets and plot their class histogram.

    nrb_train : array of per-class image counts of the training set to
                report; it is printed together with its sum.
    y_train   : class labels of that training set, drawn in the histogram.

    The test-set figures always come from the module-level initial_nbr_test
    and y_test, and the number of bins from the module-level nbr_class.
    """
    # Textual report, training set first (each line is followed by an empty line)
    print("Number of each sign in the train set : ", end="\n\n")
    print(nrb_train, end="\n\n")
    print("Total of signs : ", nrb_train.sum(), end="\n\n")

    print("Number of each sign in the test set : ", end="\n\n")
    print(initial_nbr_test, end="\n\n")
    print("Total of signs : ", initial_nbr_test.sum())

    #-----------------------------------------------------------------------------

    # Histogram of how many signs of each class each set contains

    fig, ax = subplots(figsize=(20, 7))
    # Bin edges at -0.5, 0.5, ..., nbr_class - 0.5: one bin centred on each class index
    bins = [edge - 0.5 for edge in range(nbr_class + 1)]
    ax.hist(
        [y_train, y_test],
        range=(0, nbr_class-1),
        bins=bins,
        edgecolor='white',
        color=['blueviolet', 'black'],
        label=['y_train', 'y_test'],
    )
    ax.set_title("Visualisation of the number of signs of each class in each set")
    ax.set_xticks(arange(nbr_class))
    ax.legend()
    show()

In [None]:
# CELL 1.7 :  visualisation of the dataset
# Needs [ CELL 1.6 :  visualisation of the dataset ]

# Visualisation of the initial sets: per-class counts and class histogram of
# the original training labels, next to those of the test set

#-----------------------------------------------------------------------------

graphs(initial_nbr_train, y_train)

In [None]:
# CELL 1.8 : randomization for augmentation 
# Needs [ CELL 1.4 ]

# This cell builds the data augmentation pipeline (random zoom and rotation)
# and applies it to the initial sets; the following cells rely on the result

#-----------------------------------------------------------------------------

# Augmentation pipeline: a small random zoom followed by a small random rotation
data_augmentation = Sequential([
    RandomZoom(0.05),
    RandomRotation(0.05),
])

#-----------------------------------------------------------------------------

# We apply the augmentation on our datasets
# training=True is passed explicitly: the random layers are documented to
# transform their input in training mode only, and this call happens outside
# of fit(). The inputs are not modified, so no defensive copy is needed.
augmented_image_train = data_augmentation(X_train, training=True)
augmented_image_test = data_augmentation(X_test, training=True)

#-----------------------------------------------------------------------------

print("Shape of the randomized test set : ", augmented_image_test.shape)
print("Shape of the randomized training set : ", augmented_image_train.shape)


In [None]:
# CELL 1.9 : test of the randomization 
# Needs [ CELL 1.8 ]

# A little test to see the result of the augmentation 

#-----------------------------------------------------------------------------

# Pick one random training image and show it before and after augmentation.
# randint() includes both bounds, so the upper bound must be len(X_train) - 1;
# len(X_train) itself would be an out-of-range index.
index = randint(0, len(X_train) - 1)
figure()
imshow(X_train[index])
figure()
imshow(augmented_image_train[index])

In [None]:
# CELL 1.10 : augmentation
# Needs [ CELL 1.8 ] 

#-----------------------------------------------------------------------------

# Second method of data augmentation 
# All classes are represented with the same number of sign, the max already in 

# Every class is topped up to the size of the largest class with augmented
# copies of its own images.

lim = max(initial_nbr_train) #500

# memory[i] is the number of extra images to generate for class i
memory = lim - initial_nbr_train
# A class without any source image cannot be topped up (nothing to sample from)
memory[initial_nbr_train == 0] = 0
counter = int(memory.sum())

#-----------------------------------------------------------------------------

# We create new sets that we'll fill with the data of the initial sets + the augmented data
# X_train_second keeps the dtype of X_train instead of the float64 default of zeros()

X_train_second = zeros((len(X_train) + counter, resol[0], resol[1], 3), dtype=X_train.dtype)
y_train_second = zeros(len(y_train) + counter)

X_train_second[:len(X_train)] = X_train
y_train_second[:len(y_train)] = y_train

#-----------------------------------------------------------------------------

# We'll start adding values at this index

index = len(X_train)

nbr_train_second = initial_nbr_train + memory

#-----------------------------------------------------------------------------

for i in range(nbr_class) :

    extra = int(memory[i])
    if extra == 0 :
        continue

    # Positions of the original images of class i
    indices = where(y_train == i)[0]

    # Draw, with replacement, the source image of every extra sample
    picked = [indices[randint(0, len(indices) - 1)] for _ in range(extra)]

    # The random zoom / rotation is applied to the drawn images themselves, so
    # the extra samples are augmented images and not plain duplicates of the
    # originals (the previous code gathered from the untouched X_train).
    augmented_image = data_augmentation(gather(X_train, indices=picked), training=True)

    X_train_second[index:index + extra] = augmented_image
    y_train_second[index:index + extra] = i
    index += extra

#-----------------------------------------------------------------------------

print("Shape of the augmented training set with second method : ", X_train_second.shape)
print("Shape of the augmented training target with second method : ", y_train_second.shape)

In [None]:
# CELL 1.11 :  visualisation of the dataset
# Needs [ CELL 1.10 ]

# Visualisation of the augmented sets: per-class counts and class histogram of
# the augmented training labels, next to those of the test set

#-----------------------------------------------------------------------------

graphs(nbr_train_second,y_train_second )

In [None]:
# CELL 1.12 : categorical
# Needs [ CELL 1.10 ]

# We put our results to categorical

#-----------------------------------------------------------------------------

# One-hot encoding of the labels of the initial sets (nbr_class columns)
y_train_tc = to_categorical(y_train, nbr_class)
y_test_tc = to_categorical(y_test, nbr_class)

print("Shape of y_train without augmentation : ", y_train_tc.shape)
print("Shape of y_test without augmentation : ", y_test_tc.shape)

#-----------------------------------------------------------------------------

# Same encoding for the labels of the augmented training set; the test
# labels are shared with the non-augmented case
y_train_second_tc = to_categorical(y_train_second, nbr_class)

print("Shape of y_train with second method of augmentation : ", y_train_second_tc.shape)
print("Shape of y_test with second method of augmentation : ", y_test_tc.shape)

In [None]:
# CELL 1.13 : construction of the model 
# Needs [ CELL 0 ]

# We construct our model

#-----------------------------------------------------------------------------

def construct_model (array) :
    """Build and compile the convolutional classifier.

    array : samples the model is going to be trained on. Only array.shape[1:]
            is read, to set the input shape of the first layer (the function
            previously ignored this argument and read the global
            X_train_second instead).

    Returns the compiled Sequential model. Its output layer has nbr_class
    (module-level) softmax units.
    """

    model = Sequential()

    # Three blocks of convolution -> max pooling -> dropout

    model.add(Conv2D(kernel_size=(6,6),filters=112, activation='relu', input_shape=array.shape[1:]))
    model.add(MaxPool2D(pool_size=(3,3)))
    model.add(Dropout(rate=0.25))

    model.add(Conv2D(kernel_size=(3,3),filters=208, activation='relu'))
    model.add(MaxPool2D(pool_size=(3,3)))
    model.add(Dropout(rate=0.25))

    model.add(Conv2D(kernel_size=(2,2),filters=256, activation='relu'))
    model.add(MaxPool2D(pool_size=(2,2)))
    model.add(Dropout(rate=0.25))

    model.add(Flatten())

    # Classification head: one hidden dense layer, then one unit per class

    model.add(Dense(82, activation = 'relu'))
    model.add(Dense(nbr_class, activation = 'softmax'))

    # categorical_crossentropy expects one-hot encoded targets
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

    return model

In [None]:
# CELL 1.14 : construction of the model 
# Needs [ CELL 1.13, CELL 1.10 ]

# We construct our model based on the augmented dataset 

#-----------------------------------------------------------------------------

# The array is only used to describe the input of the model, so it is passed
# as is: copying the whole augmented training set here would only waste memory.
model_second = construct_model(X_train_second)
model_second.summary()

In [None]:
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau

# We train here the model on the training set balanced with the second method
# of augmentation

eps = 12 # The maximum number of times the model trains on the entire training set
# steps = 300 # the number of iteration for each epochs

#-----------------------------------------------------------------------------

# NOTE(review): the test set is used as validation data, so early stopping and
# the restored best weights are selected on the very data the model is later
# scored on; a separate validation split would give an unbiased test score.
validation_X_second = X_test
validation_y_second = y_test_tc

#-----------------------------------------------------------------------------

# The learning rate reduction needs a smaller patience than the early
# stopping: with the same patience for both, training stops at the very epoch
# where the learning rate would be reduced, so the reduction never acts.
mycallbacks = [EarlyStopping(restore_best_weights=True, patience=4), ReduceLROnPlateau(patience=2)]

#-----------------------------------------------------------------------------

# The arrays are passed as is; copying them would only duplicate the whole
# dataset in memory.
anc_second = model_second.fit(X_train_second,
                              y_train_second_tc,
                              validation_data=(validation_X_second,validation_y_second),
                              epochs=eps,
                              # steps_per_epoch=steps,
                              callbacks = mycallbacks)

#-----------------------------------------------------------------------------

#added aug.flow to (X_train, y_train) to do data augmentation
# validation_data=(X_test_new_first, y_test_new_first)
# validation_data=(X_test, y_test)

# talk : aug.flow ? 
# talk : validation data strange 

In [None]:
from matplotlib.pyplot import figure, subplot, plot, title, xlabel, ylabel, legend, tight_layout, show

def perf(anc):
    """Plot the training and validation curves stored in a training history.

    anc : object whose .history mapping holds the 'accuracy', 'val_accuracy',
          'loss' and 'val_loss' series (one value per epoch).

    Draws one figure with two stacked panels, accuracy on top and loss below.
    """
    history = anc.history

    figure()

    # One panel per metric: (key in the history, title of the panel)
    panels = (('accuracy', 'Accuracy'), ('loss', 'Loss'))

    for position, (metric, heading) in enumerate(panels, start=1):
        subplot(2, 1, position)
        plot(history[metric], label='training ' + metric, color='darkblue')
        plot(history['val_' + metric], label='test ' + metric, color='magenta')
        title(heading)
        xlabel('epochs')
        ylabel(metric)
        legend()

    tight_layout()
    show()

In [None]:
# Plot the accuracy and loss curves recorded while fitting model_second
perf(anc_second)

In [None]:
# Share of the test images whose predicted class equals the true class

# True class index of every test image, recovered from the one-hot labels
true = y_test_tc.argmax(axis=1)

print("True codes : ", true)
print("Number of true codes : ", len(true))

# Predicted class index: position of the highest score of every prediction
predict = model_second.predict(X_test).argmax(axis=1)

print("Predictions : ", predict)
print("Number of predictions : ", len(predict))

# Number of positions where prediction and truth agree
right = int((predict == true).sum())

print("Number of right : ", right)
print("Number of elements : ", len(true))

print("Ratio : ", right/len(true))

In [None]:
# Names of the signs: second comma-separated field of every line of dict.csv.
# The file is read inside a with block so that its handle is closed.
with open("dict.csv") as dict_file:
    label_names = dict_file.read().strip().split("\n")
label_names = [l.split(",")[1] for l in label_names]

In [None]:
from sklearn.metrics import classification_report

# Per-class evaluation of the network on the test set

print("[INFO] evaluating network...")
predictions_second = model_second.predict(X_test)

# Class indices: position of the highest value of every row
true_classes = y_test_tc.argmax(axis=1)
predicted_classes = predictions_second.argmax(axis=1)

print(classification_report(true_classes, predicted_classes,
                            target_names=label_names, labels=range(nbr_class)))

In [None]:
from os import listdir
from numpy import asarray, append, array
from PIL import Image

# We store all of the images from the kaggle folder in an array, together
# with their file names (without extension)

images = []
names = []

# get the path/directory
folder_dir = 'eval_kaggle1'

# sorted() makes the order of the images, and therefore of the rows written
# later, independent of the order in which the file system lists them
for image in sorted(listdir(folder_dir)):
    # check if the image ends with ppm
    if image.endswith(".ppm"):
        # The with block closes the file once the pixels have been read
        with Image.open(folder_dir + '/' + image) as raw:
            # 3 channels are forced because the model input has 3 channels
            img = raw.convert('RGB').resize(resol) # (30,30) as an example
        # Same scaling as the training data (ImageDataGenerator(rescale=1./255));
        # without it the model would be fed values in [0, 255] instead of [0, 1]
        images.append(asarray(img, dtype='float32') / 255.)
        names.append(image.replace('.ppm',''))

images = array(images)

print("Number of images and their resolution in the kaggle dataset : ", images.shape)

In [None]:
# Saves every ppm image of the kaggle folder as a jpeg in a directory called
# names, which is created when it does not exist yet

from os import makedirs

# get the path/directory
folder_dir = "eval_kaggle1"
output_dir = "names"

makedirs(output_dir, exist_ok=True)

for image in listdir(folder_dir):
    # check if the image ends with ppm
    if image.endswith(".ppm"):
        # The with block closes the source file once it has been converted
        with Image.open(folder_dir + '/' + image) as img:
            # JPEG cannot store every image mode, hence the conversion to RGB
            img.convert('RGB').save(output_dir + '/' + image.replace('.ppm','.jpg'), format = 'JPEG')

In [None]:
from csv import DictReader

# dict.csv is loaded into a dict mapping its first column to its second one

with open('dict.csv', 'r') as f:
    # The field names are given explicitly, so every line of the file,
    # the first one included, is read as a data row
    d_reader = DictReader(f, fieldnames=["num", "sign"])
    data = {row['num']: row['sign'] for row in d_reader}

In [None]:
from matplotlib.pyplot import figure, show, title, imshow 
from csv import DictWriter

# Predicted class index of every kaggle image

predictions = model_second.predict(images).argmax(axis=1)

# We then show the first images (at most 20, fewer when the folder holds
# fewer images) with the predicted number and sign as a title

for i in range(min(20, len(images))):
    figure(figsize = (10,10))
    imshow(images[i])
    # data is keyed by the class number as a string
    sign = data[str(predictions[i])]
    title(str(predictions[i]) + " : " + str(sign))

# Here is the code to write the results in a CSV for kaggle:
# one row per image, with its name and its predicted class

with open('final_' + '.csv', 'w', newline='') as csvfile:
    fieldnames = ['Id', 'Category']
    writer = DictWriter(csvfile, fieldnames=fieldnames)

    writer.writeheader()
    for name, category in zip(names, predictions):
        writer.writerow({'Id' : name, 'Category' : category})