In [None]:
from pathlib import Path
import numpy as np 
import xml.etree.ElementTree as ET
from sklearn.svm import SVC
from skimage.io import imread
from skimage import img_as_float
from skimage.transform import resize
# from concurrent.futures import ThreadPoolExecutor as PoolExecutor
# from concurrent.futures import as_completed
from joblib import Parallel, delayed
from time import time

from pillclassification.feature_extraction import feature_extraction
from pillclassification.functions import crop_center

# Directory holding the images together with the XML label file.
images_dir = Path('dataset/merge')

# Keep regular files only (a sub-directory is not an image) and sort them:
# iterdir() yields entries in arbitrary order, and the per-class train/test
# split further down depends on sample order, so sorting makes runs repeatable.
filenames = sorted(
    x for x in images_dir.iterdir()
    if x.is_file() and x.suffix != '.xml'
)

# One row of the feature matrix per candidate image file.
samples_num = len(filenames)
# Length of the feature vector stored per image.
feature_number = 10

In [None]:
# calculating labels
labels_xml = images_dir / 'images.xml'
try:
    tree = ET.parse(labels_xml)
except ET.ParseError as err:
    # Raise instead of calling exit(): inside a notebook exit() ends the
    # kernel session, which discards all state and hides the actual error.
    raise RuntimeError('Parse error on {}'.format(labels_xml)) from err

# First child of the root; its children are the per-image entries that carry
# the NDC9 code read below.
se = tree.getroot()[0]

# Distinct NDC9 codes found in the XML.
labels_set = {e.find('NDC9').text for e in se}

# Sorted so that a code's position in `labels` is a stable integer class id.
labels = sorted(labels_set)
class_num = len(labels)


In [None]:
def extract_features(f):
    """Compute the feature vector and the class label of one image file.

    Parameters
    ----------
    f : pathlib.Path
        Path of the image. ``f.name`` is compared with the ``File/Name`` text
        of the entries in the module-level XML element ``se``.

    Returns
    -------
    tuple or None
        ``(features, label)`` where ``features`` is ``hu`` followed by
        ``rgb_val`` (both returned by ``feature_extraction``) and ``label`` is
        the position of the entry's NDC9 code in the module-level ``labels``
        list, or ``-1`` when no XML entry matches the file name.
        ``None`` when reading the image or extracting the features raises
        ``ValueError``.
    """
    # loading the image
    try:
        img = imread(f)
    except ValueError:
        return None

    # Drop the alpha channel. The ndim check keeps a 2-D image that happens to
    # be 4 pixels wide from being indexed with three subscripts.
    if img.ndim == 3 and img.shape[-1] == 4:
        img = img[:, :, :3]

    # cropping in the center
    img = crop_center(img, crop_scale=0.65)

    # rescaling with fixed width, height follows the aspect ratio
    width = 600
    height = int(img.shape[0] * (width / img.shape[1]))
    img = resize(img, (height, width), anti_aliasing=True)

    # the img must be in float format
    img = img_as_float(img)

    # feature extraction
    try:
        hu, rgb_val = feature_extraction(img)
    except ValueError:
        return None

    # Look up the label of this file. findtext() yields None for an entry that
    # lacks File/Name, so such an entry is skipped instead of raising.
    label = -1
    for e in se:
        if e.findtext('File/Name') == f.name:
            label = labels.index(e.find('NDC9').text)
            break

    return np.append(hu, rgb_val), label

In [None]:
# Feature extraction over every file, run in parallel worker processes.
# Row k of x_data / entry k of y_data belongs to filenames[k].
x_data = np.zeros((samples_num, feature_number))
y_data = np.zeros(samples_num, dtype=np.int32)

def test(f):
    """Return the argument unchanged."""
    return f

with Parallel(n_jobs=12) as parallel:
    started_at = time()
    outcomes = parallel(delayed(extract_features)(path) for path in filenames)
    for idx, outcome in enumerate(outcomes):
        if outcome is None:
            # extraction failed: fill the row and the label with -1
            print('No data from image', idx)
            x_data[idx, :], y_data[idx] = [-1,-1]
            continue
        print('Data from image', idx)
        x_data[idx, :], y_data[idx] = outcome
    # elapsed wall-clock seconds
    print(time() - started_at)

# Persist both arrays (np.save appends the .npy extension).
np.save('x_data_saved', x_data)
np.save('y_data_saved', y_data)

In [None]:
# Reload the feature matrix and the label vector saved by the extraction cell.
x_data, y_data = (np.load(name) for name in ('x_data_saved.npy', 'y_data_saved.npy'))

print(x_data.shape)
print(y_data.shape)

print(y_data)

samples_num = len(x_data)
# Largest label plus two.
# NOTE(review): presumably this counts the -1 "no data" label on top of 0..max - confirm.
class_num = y_data.max() + 2

print(class_num)

In [None]:
# dividing dataset
from math import ceil

# division factor: share of every class that goes into the train set
factor = 2/3

# Only non-negative labels are real classes; -1 marks samples for which no
# features were extracted (or no label was found), so they are left out of
# both sets.
class_ids, class_counts = np.unique(y_data[y_data >= 0], return_counts=True)

# Per-class sizes: ceil(factor * n) samples for training, the rest for testing.
train_sizes = [ceil(factor * int(count)) for count in class_counts]
test_sizes = [int(count) - n_train for count, n_train in zip(class_counts, train_sizes)]

train_n = sum(train_sizes)
test_n = sum(test_sizes)

# samples used by the split vs. samples available (they differ by the number
# of -1 samples)
print(train_n + test_n, x_data.shape[0])

# Allocate exactly as many rows as will be filled, so that no all-zero
# placeholder rows (with label 0) end up in either set.
x_train = np.zeros((train_n, x_data.shape[1]), dtype=x_data.dtype)
y_train = np.zeros(train_n, dtype=y_data.dtype)

x_test = np.zeros((test_n, x_data.shape[1]), dtype=x_data.dtype)
y_test = np.zeros(test_n, dtype=y_data.dtype)

train_idx, test_idx = 0, 0
for i, n_train in zip(class_ids, train_sizes):
    # all samples of the current class, in their original order
    x = x_data[y_data == i, :]
    y = y_data[y_data == i]

    n_test = len(y) - n_train

    # first n_train samples of the class -> train, remaining ones -> test
    x_train[train_idx:train_idx + n_train, :] = x[:n_train, :]
    y_train[train_idx:train_idx + n_train] = y[:n_train]

    x_test[test_idx:test_idx + n_test, :] = x[n_train:, :]
    y_test[test_idx:test_idx + n_test] = y[n_train:]

    train_idx += n_train
    test_idx += n_test

In [None]:
# SVC training

from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import LinearSVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.multiclass import OneVsRestClassifier
from sklearn.multiclass import OneVsOneClassifier

def _scaled_linear_svc():
    """Return a new StandardScaler + LinearSVC pipeline."""
    return make_pipeline(
        StandardScaler(),
        LinearSVC(random_state=0, tol=1e-6, max_iter=100000),
    )

# Two multiclass wrappers around the same linear-SVC pipeline, plus a
# 1-nearest-neighbour pipeline.
OVO = OneVsOneClassifier(_scaled_linear_svc())
OVR = OneVsRestClassifier(_scaled_linear_svc())
knn = make_pipeline(StandardScaler(), KNeighborsClassifier(n_neighbors=1))

# fitting, in the order OVO, OVR, knn
clcOVO, clcOVR, clc_knn = (model.fit(x_train, y_train) for model in (OVO, OVR, knn))


In [None]:
# Single call of extract_features on the first file, outside the parallel loop.
# The result (None, or a (features, label) tuple) is bound to `x`.
x = extract_features(filenames[0])

In [None]:

# Score each fitted classifier on the held-out test set (order: OVO, OVR, knn)
# and print the three scores multiplied by 100.
s1, s2, s3 = (fitted.score(x_test, y_test) for fitted in (clcOVO, clcOVR, clc_knn))
print(*(100 * value for value in (s1, s2, s3)))