In [62]:
#### Dependencies

from PIL import Image
from skimage.feature import greycomatrix, greycoprops

import glob
import math
import matplotlib.pyplot as plt
import numpy as np
import os
import random
import statistics


In [63]:
#### Global variables

# Feature rows shared by the functions below; every row has the layout
# (class_name, feature_1, ..., feature_n).
training_data = list()
validation_data = list()
# Class labels, filled in by load_class_names().
class_names = list()
# Lookup table that quantises an 8-bit grey level (0-255) to 3 bits (0-7).
map_8bit_to_3bit = [grey_level >> 5 for grey_level in range(256)]

# Scaling
# Per-feature statistics written by the get_*_scaled_model() functions and
# read back by get_scaled_img_features().
glcm_mean_values = list()
glcm_stdev_values = list()
glcm_min_values = list()
glcm_max_values = list()
glcm_max_subs_min_values = list()
glcm_q1_values = list()
glcm_q3_values = list()
glcm_q3_subs_q1_values = list()

In [82]:
#### Functions

def load_img(img_path):
    """Open the image file at ``img_path`` and return it in greyscale.

    Args:
        img_path: Path of the image file to read.

    Returns:
        A new PIL image in mode ``'L'`` (8-bit greyscale).
    """
    # Image.open() keeps the underlying file open; convert() produces a
    # separate image object, so the source handle can be released right away
    # instead of waiting for garbage collection.
    with Image.open(img_path) as source_img:
        return source_img.convert('L')

def get_img_size(img):
    """Return the ``size`` attribute of ``img``."""
    dimensions = img.size
    return dimensions

def print_img(img):
    """Show ``img`` through ``plt.imshow`` with the ``'gray'`` colour map."""
    render_options = {'cmap': 'gray'}
    plt.imshow(img, **render_options)
    
def get_resized_img(img, dimension):
    """Return the result of ``img.resize(dimension)``.

    Args:
        img: Object exposing a ``resize`` method (a PIL image in this notebook).
        dimension: Target size handed to ``resize`` unchanged.
    """
    resized_img = img.resize(dimension)
    return resized_img
    
def get_img_colors(img):
    """Return the values yielded by ``img.getdata()`` as a flat list."""
    return [pixel for pixel in img.getdata()]

def get_3bit_img_colors(img):
    """Return the pixels of ``img`` quantised through ``map_8bit_to_3bit``.

    Each value from ``get_img_colors(img)`` is used as an index into the
    ``map_8bit_to_3bit`` lookup table; the mapped values are returned as a
    flat list in the same order.
    """
    return [map_8bit_to_3bit[grey_level] for grey_level in get_img_colors(img)]

def get_img_matrix(img_colors):
    """Reshape a flat pixel list into a square list of rows.

    The side length is ``int(sqrt(len(img_colors)))``; values that do not fit
    into that square are left out.

    Args:
        img_colors: Flat, sliceable sequence of pixel values in row order.

    Returns:
        A list of ``side`` rows, each a list of ``side`` values.
    """
    side = int(math.sqrt(len(img_colors)))

    return [
        list(img_colors[row_index * side:(row_index + 1) * side])
        for row_index in range(side)
    ]

def get_img_features(img, scale_mode='non_scaled', glcm_components=['contrast', 'correlation', 'energy', 'homogeneity', 'ASM', 'dissimilarity']):
    """Extract the GLCM texture features of ``img``.

    The image is quantised with ``get_3bit_img_colors``, arranged into a
    square matrix with ``get_img_matrix`` and turned into one co-occurrence
    matrix (distance 1, angle 0). One property per entry of
    ``glcm_components`` is read from it.

    Args:
        img: Image accepted by ``get_3bit_img_colors``.
        scale_mode: Scaling mode forwarded to ``get_scaled_img_features``.
        glcm_components: Property names passed to ``greycoprops``, in the
            order the features are returned. The default list is only read,
            never mutated.

    Returns:
        A tuple of plain Python floats, one per requested component.
    """
    img_3bit_colors = get_3bit_img_colors(img)
    img_matrix = get_img_matrix(img_3bit_colors)

    # map_8bit_to_3bit only produces the values 0..7, so the levels above 7
    # simply never occur in the matrix.
    glcm_matrix = greycomatrix(img_matrix, distances=[1], angles=[0], levels=12, symmetric=False, normed=False)

    img_features = [
        greycoprops(glcm_matrix, glcm_component)[0][0]
        for glcm_component in glcm_components
    ]

    scaled_img_features = get_scaled_img_features(img_features, scale_mode)

    # Rows built from these features are serialized with str(row) and parsed
    # back with float(). NumPy scalars print as "np.float64(...)" since
    # NumPy 2, which float() cannot parse, so hand out built-in floats only.
    return tuple(float(value) for value in scaled_img_features)

def get_scaled_img_features(img_features, scale_mode='non_scaled'):
    """Scale one feature vector with the module-level scaling statistics.

    Args:
        img_features: Sequence of feature values.
        scale_mode: ``'standard_scaled'`` uses ``glcm_mean_values`` and
            ``glcm_stdev_values``; ``'min_max_scaled'`` uses
            ``glcm_min_values`` and ``glcm_max_subs_min_values``;
            ``'robust_scaled'`` uses ``glcm_q1_values`` and
            ``glcm_q3_subs_q1_values``. Any other value, including
            ``'non_scaled'``, leaves the features untouched.

    Returns:
        A tuple with ``(value - offset) / divisor`` per feature, or the
        unchanged values for an unscaled mode.
    """
    if scale_mode == 'non_scaled':
        return tuple(img_features)

    # Plain comparisons (not a dict lookup) so that a non-string, unhashable
    # scale_mode still falls through to the "unchanged" case.
    if scale_mode == 'standard_scaled':
        offsets, divisors = glcm_mean_values, glcm_stdev_values
    elif scale_mode == 'min_max_scaled':
        offsets, divisors = glcm_min_values, glcm_max_subs_min_values
    elif scale_mode == 'robust_scaled':
        offsets, divisors = glcm_q1_values, glcm_q3_subs_q1_values
    else:
        return tuple(img_features)

    return tuple(
        (img_features[position] - offsets[position]) / divisors[position]
        for position in range(len(img_features))
    )
    
# Modelling

def init_model():
    """Reset the global ``training_data`` and ``validation_data`` to new empty lists."""
    global training_data, validation_data

    training_data, validation_data = [], []
    
def load_class_names(training_path):
    """Fill the global ``class_names`` from the sub-directories of ``training_path``.

    Only directories are kept, because every class name is later used as a
    folder to glob images from; a stray file in the training folder is not a
    class. The names are sorted so the order does not depend on the file
    system.

    Args:
        training_path: Directory holding one sub-directory per class.
    """
    global class_names

    class_names = sorted(
        entry
        for entry in os.listdir(training_path)
        if os.path.isdir(os.path.join(training_path, entry))
    )
            
def insert_img_features_into_training_data(class_name, img_features):
    """Append the row ``(class_name, *img_features)`` to the global ``training_data``.

    Args:
        class_name: Label stored as the first element of the row.
        img_features: Tuple of feature values (it is concatenated to a tuple).
    """
    training_data.append((class_name,) + img_features)
        
def insert_img_features_into_validation_data(class_name, img_features):
    """Append the row ``(class_name, *img_features)`` to the global ``validation_data``.

    Args:
        class_name: Label stored as the first element of the row.
        img_features: Tuple of feature values (it is concatenated to a tuple).
    """
    validation_data.append((class_name,) + img_features)

def load_preprocessed_img(img_path, dimension=(250, 250)):
    """Load the image at ``img_path`` with ``load_img`` and resize it to ``dimension``."""
    return get_resized_img(load_img(img_path), dimension)

def get_single_glcm_feature(model, glcm_index):
    """Return the values at position ``glcm_index`` of every row in ``model``."""
    return [row[glcm_index] for row in model]

def get_features_only_model(model):
    """Return every row of ``model`` without its first element (the class name)."""
    return [row[1:] for row in model]

def get_standard_scaled_model(model):
    """Standard-scale every feature column of ``model``.

    The per-column mean and sample standard deviation are stored in the
    globals ``glcm_mean_values`` / ``glcm_stdev_values`` (so later calls to
    ``get_scaled_img_features`` reuse them) and every row is rescaled with
    them.

    Args:
        model: List of rows ``(class_name, feature_1, ..., feature_n)``.

    Returns:
        A new list of tuples with the same class names and scaled features;
        an empty list when ``model`` is empty (the globals are then left
        untouched).
    """
    global glcm_mean_values
    global glcm_stdev_values

    if len(model) == 0:
        return []

    feature_columns = [
        [row[column_index] for row in model]
        for column_index in range(1, len(model[0]))
    ]

    glcm_mean_values = [statistics.mean(column) for column in feature_columns]
    # statistics.stdev() needs at least two samples, and a constant column
    # has a deviation of 0. In both cases 1.0 is used as the divisor so the
    # column scales to 0.0 instead of raising.
    glcm_stdev_values = [
        (statistics.stdev(column) if len(column) > 1 else 0.0) or 1.0
        for column in feature_columns
    ]

    return [
        (row[0],) + tuple(get_scaled_img_features(row[1:], 'standard_scaled'))
        for row in model
    ]
        
        
def get_min_max_scaled_model(model):
    """Min-max scale every feature column of ``model``.

    The per-column minimum, maximum and range are stored in the globals
    ``glcm_min_values``, ``glcm_max_values`` and ``glcm_max_subs_min_values``
    (so later calls to ``get_scaled_img_features`` reuse them) and every row
    is rescaled with them.

    Args:
        model: List of rows ``(class_name, feature_1, ..., feature_n)``.

    Returns:
        A new list of tuples with the same class names and scaled features;
        an empty list when ``model`` is empty (the globals are then left
        untouched).
    """
    global glcm_min_values
    global glcm_max_values
    global glcm_max_subs_min_values

    if len(model) == 0:
        return []

    # The extremes must be taken column by column. min(model) / max(model)
    # would compare whole rows (class name first) and return the features of
    # a single row instead.
    feature_columns = [
        [row[column_index] for row in model]
        for column_index in range(1, len(model[0]))
    ]

    glcm_min_values = [min(column) for column in feature_columns]
    glcm_max_values = [max(column) for column in feature_columns]
    # A constant column has a range of 0; divide by 1.0 instead so it scales
    # to 0.0 rather than raising ZeroDivisionError.
    glcm_max_subs_min_values = [
        (highest - lowest) or 1.0
        for lowest, highest in zip(glcm_min_values, glcm_max_values)
    ]

    return [
        (row[0],) + tuple(get_scaled_img_features(row[1:], 'min_max_scaled'))
        for row in model
    ]

def get_robust_scaled_model(model):
    """Scale every feature column of ``model`` by its quartiles.

    The per-column first quartile, third quartile and inter-quartile range
    are stored in the globals ``glcm_q1_values``, ``glcm_q3_values`` and
    ``glcm_q3_subs_q1_values`` (so later calls to ``get_scaled_img_features``
    reuse them) and every row is rescaled with them.

    Args:
        model: List of rows ``(class_name, feature_1, ..., feature_n)``.

    Returns:
        A new list of tuples with the same class names and scaled features;
        an empty list when ``model`` is empty (the globals are then left
        untouched).
    """
    global glcm_q1_values
    global glcm_q3_values
    global glcm_q3_subs_q1_values

    if len(model) == 0:
        return []

    features_only_model = [row[1:] for row in model]

    # tolist() turns the NumPy results into built-in floats. Rows are later
    # written with str(row) and read back with float(); a NumPy scalar prints
    # as "np.float64(...)" since NumPy 2 and could not be parsed again.
    glcm_q1_values = np.quantile(features_only_model, .25, axis=0).tolist()
    glcm_q3_values = np.quantile(features_only_model, .75, axis=0).tolist()
    # A zero inter-quartile range is replaced by 1.0 so the column scales to
    # finite values instead of inf/nan.
    glcm_q3_subs_q1_values = [
        (third_quartile - first_quartile) or 1.0
        for first_quartile, third_quartile in zip(glcm_q1_values, glcm_q3_values)
    ]

    # Rows are tuples, like the rows of the other two scaling functions.
    return [
        (row[0],) + tuple(get_scaled_img_features(row[1:], 'robust_scaled'))
        for row in model
    ]
    
def _read_data_rows(data_file_path):
    """Parse a ``*.data`` file into a list of ``(class_name, float, ...)`` tuples."""
    rows = []

    with open(data_file_path, 'r') as data_file:
        for line in data_file.read().split('\n'):
            # An empty file or a trailing newline must not become a bogus row.
            if not line.strip():
                continue

            # Drop the surrounding brackets written by str(row).
            fields = line[1:len(line) - 1].split(', ')
            rows.append(tuple(fields[:1] + [float(value) for value in fields[1:]]))

    return rows


def _write_data_rows(data_file_path, rows):
    """Write one ``str(row)`` per line (quotes removed), without a trailing newline."""
    with open(data_file_path, 'w+') as file_writer:
        file_writer.write('\n'.join(str(row).replace('\'', '') for row in rows))


def _extract_class_features(images_path, img_type, glcm_components, insert_row):
    """Extract the unscaled features of every image below ``images_path``.

    ``insert_row(class_name, img_features)`` is called once per image found
    in ``images_path + class_name + '/'`` for every known class name.
    """
    for class_name in class_names:
        # sorted(): glob order is file-system dependent.
        for img_path in sorted(glob.glob(images_path + class_name + '/' + img_type)):
            img = load_preprocessed_img(img_path)
            # scale_mode must be given explicitly: passed as the second
            # positional argument, glcm_components would land in scale_mode
            # and the requested components would be ignored.
            img_features = get_img_features(img, 'non_scaled', glcm_components)

            insert_row(class_name, img_features)


def load_data(training_path, validation_path, data_path, model_name, img_type='*.jpg', scale_mode='non_scaled', glcm_components=['contrast', 'correlation', 'energy', 'homogeneity', 'ASM', 'dissimilarity'], is_skip=False):
    """Fill the global ``training_data`` / ``validation_data`` feature tables.

    When both cache files ``<data_path><model_name>_<scale_mode>_training.data``
    and ``..._validation.data`` exist and ``is_skip`` is false, the rows are
    read from them. Otherwise the features are extracted from the images,
    scaled according to ``scale_mode`` and written to those cache files.

    Args:
        training_path: Directory with one sub-directory of images per class.
        validation_path: Directory with the same layout for validation images.
        data_path: Directory of the cache files (created when missing).
        model_name: Prefix of the cache file names.
        img_type: Glob pattern of the image files inside a class directory.
        scale_mode: ``'non_scaled'``, ``'standard_scaled'``,
            ``'min_max_scaled'`` or ``'robust_scaled'``.
        glcm_components: GLCM property names to extract. The default list is
            only read, never mutated.
        is_skip: When true the cache files are ignored and rebuilt.

    NOTE(review): when the rows come from the cache files, the global scaling
    statistics (``glcm_mean_values`` etc.) are not restored, so scaling a new
    image afterwards has no statistics to work with. Persisting them would
    need a new file and is left out here.
    """
    global training_data
    global validation_data

    print('Loading data...')

    load_class_names(training_path)

    training_data_path   = data_path + model_name + '_' + scale_mode + '_training.data'
    validation_data_path = data_path + model_name + '_' + scale_mode + '_validation.data'

    if(os.path.exists(training_data_path) and os.path.exists(validation_data_path) and not is_skip):
        training_data.extend(_read_data_rows(training_data_path))
        validation_data.extend(_read_data_rows(validation_data_path))
    else:
        _extract_class_features(training_path, img_type, glcm_components, insert_img_features_into_training_data)
        _extract_class_features(validation_path, img_type, glcm_components, insert_img_features_into_validation_data)

        scale_model = {
            'standard_scaled': get_standard_scaled_model,
            'min_max_scaled': get_min_max_scaled_model,
            'robust_scaled': get_robust_scaled_model,
        }.get(scale_mode)

        if(scale_model is not None and len(training_data) > 0):
            # Fits the statistics on the training rows and stores them in the
            # module globals.
            training_data = scale_model(training_data)
            # Validation rows are scaled with those same training statistics.
            # Fitting a second time on the validation rows would put them on
            # a different scale and overwrite the statistics needed to scale
            # new images.
            validation_data = [
                (row[0],) + tuple(get_scaled_img_features(row[1:], scale_mode))
                for row in validation_data
            ]

        os.makedirs(data_path, exist_ok=True)

        _write_data_rows(training_data_path, training_data)
        _write_data_rows(validation_data_path, validation_data)

    print('--> Done\n')
    
def train(training_rate=0.8):
    """Randomly split the global ``training_data`` into two samples.

    One ``random.uniform(0, 1)`` draw is made per row; the row goes to the
    training sample when the draw is ``<= training_rate`` and to the testing
    sample otherwise.

    Args:
        training_rate: Threshold compared against each draw.

    Returns:
        ``(training_data_sample, testing_data_sample)`` as two lists.
    """
    print('Training...')
    training_data_sample, testing_data_sample = [], []

    for row in training_data:
        keep_for_training = random.uniform(0, 1) <= training_rate
        target_sample = training_data_sample if keep_for_training else testing_data_sample
        target_sample.append(row)

    print('--> Done')
    return training_data_sample, testing_data_sample
    
def get_euclidean_distance(training_img_features, img_features):
    """Return the Euclidean distance between two feature vectors.

    All paired components are used, so the function works for any number of
    GLCM components instead of exactly six. If the vectors differ in length,
    the extra components of the longer one are ignored.

    Args:
        training_img_features: Feature values of a stored row.
        img_features: Feature values of the image being classified.

    Returns:
        The distance as a float (``0.0`` for empty vectors).
    """
    distance = 0.0

    for training_value, value in zip(training_img_features, img_features):
        distance += (training_value - value) ** 2

    return math.sqrt(distance)

def get_img_features_class(img_features, training_data_sample, k_neighbors=1):
    """Classify a feature vector with a k-nearest-neighbours majority vote.

    Args:
        img_features: Feature values of the image to classify.
        training_data_sample: Rows ``(class_name, feature_1, ..., feature_n)``
            to search for neighbours.
        k_neighbors: Number of nearest rows that take part in the vote.

    Returns:
        The class name with the most votes among the ``k_neighbors`` nearest
        rows. A tie is decided in favour of the tied class whose nearest
        member is closest, so the result is deterministic. ``'unknown'`` is
        returned when nothing can vote (empty sample or ``k_neighbors < 1``).
    """
    scored_rows = [
        (get_euclidean_distance(row[1:], img_features), row[0])
        for row in training_data_sample
    ]

    # sorted() is stable: rows at the same distance keep their sample order.
    nearest_rows = sorted(scored_rows, key=lambda scored_row: scored_row[0])[:max(k_neighbors, 0)]

    if not nearest_rows:
        return 'unknown'

    votes = {}
    for _, class_name in nearest_rows:
        votes[class_name] = votes.get(class_name, 0) + 1

    # dicts keep insertion order and max() returns the first maximum, so among
    # tied classes the one encountered first (the closest one) wins.
    return max(votes, key=votes.get)
    
def get_img_class(img, training_data_sample, k_neighbors=1, scale_mode='non_scaled', glcm_components=['contrast', 'correlation', 'energy', 'homogeneity', 'ASM', 'dissimilarity']):
    """Predict the class name of ``img``.

    The features come from ``get_img_features(img, scale_mode,
    glcm_components)`` and are classified by ``get_img_features_class``
    against ``training_data_sample`` with ``k_neighbors`` neighbours.
    """
    return get_img_features_class(
        get_img_features(img, scale_mode, glcm_components),
        training_data_sample,
        k_neighbors,
    )
    
def test(testing_data_sample, training_data_sample, k_neighbors=1):
    print('Testing...')
    total_correct_answer = 0
    total_guess = 0
    
    for row in testing_data_sample:
        expected_class_name = row[0]
        test_img_features = row[1:]
        
        test_img_class_name = get_img_features_class(test_img_features, training_data_sample, k_neighbors)
            
        if(expected_class_name == test_img_class_name):
            total_correct_answer += 1
                
        total_guess += 1
    
    accuracy = (total_correct_answer / total_guess) * 100
    
    print('--> Done, accuracy = ' + str(accuracy) + ' %')
    
    return accuracy
    
def validate(training_data_sample, k_neighbors=1):
    """Measure how many rows of the global ``validation_data`` are classified correctly.

    Args:
        training_data_sample: Rows used as neighbours for the classification.
        k_neighbors: Number of neighbours passed to ``get_img_features_class``.

    Returns:
        The accuracy in percent; ``0.0`` when ``validation_data`` is empty.
    """
    print('Validating...')
    total_correct_answer = 0
    total_guess = 0

    for row in validation_data:
        expected_class_name = row[0]
        img_class_name = get_img_features_class(row[1:], training_data_sample, k_neighbors)

        if(expected_class_name == img_class_name):
            total_correct_answer += 1

        total_guess += 1

    print('--> Done\n')

    # Guard against dividing by zero when there was nothing to classify.
    accuracy = (total_correct_answer / total_guess) * 100 if total_guess else 0.0

    return accuracy
    
def save_model(k_neighbors, epochs, training_model, testing_model, model_path, model_name, scale_mode='non_scaled'):
    """Write a model (two row samples plus its epoch count) to disk.

    Three files are written below ``<model_path><k_neighbors>/``, all
    prefixed with ``<model_name>_<scale_mode>``: ``_training.model``,
    ``_validation.model`` (holding ``testing_model``) and ``_epochs.model``.
    Rows are stored one per line as ``str(row)`` with the quotes removed.

    Args:
        k_neighbors: Used as the name of the sub-directory.
        epochs: Value written to the epochs file.
        training_model: Rows of the training sample (may be empty).
        testing_model: Rows of the testing sample (may be empty).
        model_path: Root directory of the saved models, ending with ``/``.
        model_name: Prefix of the file names.
        scale_mode: Second part of the file names.
    """
    knn_model_path = model_path + str(k_neighbors) + '/'
    file_prefix = knn_model_path + model_name + '_' + scale_mode
    training_model_path = file_prefix + '_training.model'
    testing_model_path  = file_prefix + '_validation.model'
    model_epochs_path   = file_prefix + '_epochs.model'

    # Creates model_path as well when it is missing, at any nesting depth.
    os.makedirs(knn_model_path, exist_ok=True)

    for target_path, rows in ((training_model_path, training_model), (testing_model_path, testing_model)):
        with open(target_path, 'w+') as file_writer:
            # join() gives the same layout (no trailing newline) and also
            # copes with an empty sample.
            file_writer.write('\n'.join(str(row).replace('\'', '') for row in rows))

    with open(model_epochs_path, 'w+') as file_writer:
        file_writer.write(str(epochs))

    print('The best model has been saved.')
        
def _read_model_rows(model_file_path):
    """Parse a ``*.model`` file into a list of ``(class_name, float, ...)`` tuples."""
    rows = []

    with open(model_file_path, 'r') as model_file:
        for line in model_file.read().split('\n'):
            # An empty file or a trailing newline must not become a bogus row.
            if not line.strip():
                continue

            # Drop the surrounding brackets written by str(row).
            fields = line[1:len(line) - 1].split(', ')
            rows.append(tuple(fields[:1] + [float(value) for value in fields[1:]]))

    return rows


def get_saved_model(k_neighbors, model_path, model_name, scale_mode='non_scaled'):
    """Load the model files written by ``save_model`` for ``k_neighbors``.

    Args:
        k_neighbors: Name of the sub-directory below ``model_path``.
        model_path: Root directory of the saved models, ending with ``/``.
        model_name: Prefix of the file names.
        scale_mode: Second part of the file names.

    Returns:
        ``(training_model, testing_model, model_epochs)``. When any of the
        three files is missing the result is ``([], [], 0)``.
    """
    print('Load previous best model in the same k_neighbors (k=' + str(k_neighbors) + ')...')
    knn_model_path = model_path + str(k_neighbors) + '/'
    training_model_path = knn_model_path + model_name + '_' + scale_mode + '_training.model'
    testing_model_path  = knn_model_path + model_name + '_' + scale_mode + '_validation.model'
    model_epochs_path   = knn_model_path + model_name + '_' + scale_mode + '_epochs.model'

    training_model = []
    testing_model  = []
    model_epochs   = 0

    if(os.path.exists(training_model_path) and os.path.exists(testing_model_path) and os.path.exists(model_epochs_path)):
        training_model = _read_model_rows(training_model_path)
        testing_model  = _read_model_rows(testing_model_path)

        with open(model_epochs_path, 'r') as model_epochs_file:
            model_epochs = int(model_epochs_file.read())

        print('--> Done, k=' + str(k_neighbors) + ', epochs=' + str(model_epochs) + '.\n')
    else:
        print('--> Failed, model not found in the same k_neighbors (k=' + str(k_neighbors) + ')\n')

    return training_model, testing_model, model_epochs


In [87]:
#### Main

# Props
# iterations: number of candidate models built by the learning loop.
# epochs: maximum number of train/test rounds per candidate.
iterations           = 5
epochs               = 20
training_rate        = 0.8
img_type             = '*.jpg'
k_neighbors          = 3
glcm_components      = ['contrast', 'correlation', 'energy', 'homogeneity', 'ASM', 'dissimilarity']
# Number of epochs with 100 % test accuracy after which an iteration stops early.
perfect_test_overlap = k_neighbors + (k_neighbors // 2)
scale_modes          = ['non_scaled', 'standard_scaled', 'min_max_scaled', 'robust_scaled']
scale_mode_index     = 0
# Passed to load_data() as is_skip: True ignores existing .data cache files
# and extracts the features from the images again.
is_skip_load         = True        

# Defining model name
model_name = 'cap_tulis_glcm_knn'

# Defining paths
root_path            = './'
test_path            = root_path + 'test/'
training_path        = root_path + 'training/'
validation_path      = root_path + 'validation/'
data_path            = root_path + 'data/'
model_path           = root_path + 'model/'

# Init
init_model()
load_data(training_path, validation_path, data_path, model_name, img_type, scale_modes[scale_mode_index], glcm_components, is_skip_load)
best_model_accuracy  = 0.0
best_model_index     = 0
best_model_epochs    = 0
# One entry per candidate model; each entry is a list of rows.
training_data_sample = []
testing_data_sample  = []
is_previous_best_model_loaded = False

# Load the saved previous best model
print('\n')
previous_best_training_data_sample, previous_best_testing_data_sample, previous_best_model_epochs = get_saved_model(k_neighbors, model_path, model_name, scale_modes[scale_mode_index])
if(len(previous_best_training_data_sample) > 0):
    # The previously saved model becomes candidate 0 and sets the accuracy to beat.
    is_previous_best_model_loaded = True
    best_model_accuracy = validate(previous_best_training_data_sample, k_neighbors)
    best_model_epochs   = previous_best_model_epochs
    training_data_sample.append(previous_best_training_data_sample)
    testing_data_sample.append(previous_best_testing_data_sample)
    
    print('Best model accuracy')
    print('Epochs   : ' + str(best_model_epochs))
    print('Accuracy : ' + str(best_model_accuracy))
    print('\n\n')
    
# Learning
for iteration in range(iterations):
    print('=========================================')
    print('Iteration ' + str(iteration + 1))
    print('=========================================')
    # The bool counts as 0/1: candidates are shifted by one when a previous
    # model occupies index 0.
    data_sample_index = iteration + is_previous_best_model_loaded
    training_data_sample.append([])
    testing_data_sample.append([])
    perfect_accuracy_count = 0
    
    epoch = 1
    while(epoch <= epochs):
        print('Epoch ' + str(epoch))
        # NOTE(review): train() splits the complete training_data again on
        # every epoch and the results are accumulated with "+=", so a row can
        # be added several times and can end up in both the training and the
        # testing sample of this candidate. The test accuracy below is
        # therefore measured partly on rows the candidate already contains —
        # confirm this is intended.
        new_training_data_sample, new_testing_data_sample = train(training_rate)
        training_data_sample[data_sample_index] += new_training_data_sample
        testing_data_sample[data_sample_index]  += new_testing_data_sample
        testing_accuracy = test(testing_data_sample[data_sample_index], training_data_sample[data_sample_index], k_neighbors)
        print()
        
        if(testing_accuracy == 100.0):
            # The counter is never reset within an iteration, so the perfect
            # epochs do not have to be consecutive.
            perfect_accuracy_count += 1
            if(perfect_accuracy_count == perfect_test_overlap):
                break
                
        epoch += 1

    # NOTE(review): when the loop ends without "break", epoch equals
    # epochs + 1 here, so the epoch count recorded below is one higher than
    # the number of epochs that actually ran.
    model_accuracy = validate(training_data_sample[data_sample_index], k_neighbors)
    
    # A candidate wins with a higher validation accuracy, or with the same
    # accuracy reached in fewer epochs.
    if(model_accuracy > best_model_accuracy):
        best_model_accuracy = model_accuracy
        best_model_epochs   = epoch
        best_model_index    = data_sample_index
    elif(model_accuracy == best_model_accuracy and epoch < best_model_epochs):
        best_model_accuracy = model_accuracy
        best_model_epochs   = epoch
        best_model_index    = data_sample_index

    print('Accuracy: ' + str(model_accuracy) + ' %')
    print('\n\n')
    
    
# Print the best model
print('\nBest Model')
print('Name     : ' + model_name + '_' + scale_modes[scale_mode_index])
print('Index    : ' + str(best_model_index - is_previous_best_model_loaded))
print('Epochs   : ' + str(best_model_epochs))
print('Accuracy : ' + str(best_model_accuracy))

if(best_model_index == 0 and is_previous_best_model_loaded):
    print('The best model still the previous best model.\n')
else:
    print('NEW BEST MODEL!!!\n')

# Save the best model
# The files are rewritten even when the previous best model is still the best.
save_model(k_neighbors, best_model_epochs, training_data_sample[best_model_index], testing_data_sample[best_model_index], model_path, model_name, scale_modes[scale_mode_index])

Loading data...
--> Done



Load previous best model in the same k_neighbors (k=3)...
--> Done, k=3, epochs=7.

Validating...
--> Done

Best model accuracy
Epochs   : 7
Accuracy : 95.703125



Iteration 1
Epoch 1
Training...
--> Done
Testing...
--> Done, accuracy = 91.75257731958763 %

Epoch 2
Training...
--> Done
Testing...
--> Done, accuracy = 96.51741293532339 %

Epoch 3
Training...
--> Done
Testing...
--> Done, accuracy = 100.0 %

Epoch 4
Training...
--> Done
Testing...
--> Done, accuracy = 100.0 %

Epoch 5
Training...
--> Done
Testing...
--> Done, accuracy = 100.0 %

Epoch 6
Training...
--> Done
Testing...
--> Done, accuracy = 100.0 %

Validating...
--> Done

Accuracy: 95.703125 %



Iteration 2
Epoch 1
Training...
--> Done
Testing...
--> Done, accuracy = 95.28301886792453 %

Epoch 2
Training...
--> Done
Testing...
--> Done, accuracy = 95.37037037037037 %

Epoch 3
Training...
--> Done
Testing...
--> Done, accuracy = 97.23926380368098 %

Epoch 4
Training...
--> Done
Testing...
--> 

In [88]:
# load_preprocessed_img() resizes the image to the same 250x250 size the
# training features were extracted from; load_img() alone keeps the original
# size, which get_img_matrix() would cut into misaligned rows for a
# non-square image.
img = load_preprocessed_img(test_path + 'cap.jpg')
print(get_img_class(img, training_data_sample[best_model_index], k_neighbors, scale_modes[scale_mode_index], glcm_components))
# Expected class: cap

cap


In [89]:
# load_preprocessed_img() resizes the image to the same 250x250 size the
# training features were extracted from; load_img() alone keeps the original
# size, which get_img_matrix() would cut into misaligned rows for a
# non-square image.
img = load_preprocessed_img(test_path + 'cap2.jpg')
print(get_img_class(img, training_data_sample[best_model_index], k_neighbors, scale_modes[scale_mode_index], glcm_components))
# Expected class: presumably cap (going by the file name). The recorded run,
# made without resizing, printed 'tulis', i.e. a misclassification.

tulis


In [90]:
# load_preprocessed_img() resizes the image to the same 250x250 size the
# training features were extracted from; load_img() alone keeps the original
# size, which get_img_matrix() would cut into misaligned rows for a
# non-square image.
img = load_preprocessed_img(test_path + 'tulis.jpg')
print(get_img_class(img, training_data_sample[best_model_index], k_neighbors, scale_modes[scale_mode_index], glcm_components))
# Expected class: tulis

tulis
