In [8]:
from pathlib import Path
import numpy as np 
import xml.etree.ElementTree as ET
from sklearn.svm import SVC
from skimage.io import imread
from skimage import img_as_float
from skimage.transform import resize
# from concurrent.futures import ThreadPoolExecutor as PoolExecutor
# from concurrent.futures import as_completed
from joblib import Parallel, delayed
from time import time

from pillclassification.feature_extraction import feature_extraction
from pillclassification.functions import crop_center

images_dir = Path('utils/Dataset/merge')
filenames = [x for x in images_dir.iterdir() if x.suffix != '.xml']

samples_num = len(filenames)
feature_number = 10

In [9]:
# calculating labels 
try:
    tree = ET.parse(images_dir / 'images.xml')
except ET.ParseError:
    print('Parse error on {}'.format(images_dir / 'images.xml'))
    exit(-1)

se = list(tree.getroot())[0]

labels_set = set()
for e in list(se):
    labels_set.add(e.find('NDC9').text)

labels = sorted(list(labels_set))
class_num = len(labels)


In [14]:
# extracting features
x_data = np.zeros((samples_num, feature_number))
y_data = np.zeros(samples_num, dtype=np.int32)

def extract_features(f):
    # loading the image 
    try:
        img = imread(f)
    except ValueError as e:
        return None
    
    if img.shape[-1] == 4:
        img = img[:,:,:3]
    
    # cropping in the center
    img = crop_center(img, crop_scale=0.65)

    # rescaling with fixed width
    width = 600
    img = resize(img, (int(img.shape[0] * (width / img.shape[1])), width), anti_aliasing=True)

    # the img must be in float format 
    img = img_as_float(img)

    # feature extraction 
    try:
        hu, rgb_val = feature_extraction(img)
    except ValueError:
        return None
    
    label = -1
    for e in list(se):
        if e.find('File').find('Name').text == f.name:
            label = labels.index(e.find('NDC9').text)
            break
    
    return np.append(hu, rgb_val), label

def test(f):
    return f

with Parallel(n_jobs=20) as parallel:
    s = time()
    for i, r in enumerate(parallel(delayed(extract_features)(f) for f in filenames)):
        if r is not None:
            print('Data from image', i)
            x_data[i, :], y_data[i] = r
        else:
            print('No data from image', i)
            x_data[i, :], y_data[i] = [-1,-1]
    print(time() - s)

# printing and saving the features as npy file 
np.save('x_data_saved', x_data)
np.save('y_data_saved', y_data)

Data from image 0
No data from image 1
Data from image 2
Data from image 3
Data from image 4
Data from image 5
Data from image 6
Data from image 7
Data from image 8
Data from image 9
Data from image 10
Data from image 11
Data from image 12
Data from image 13
Data from image 14
Data from image 15
Data from image 16
Data from image 17
Data from image 18
Data from image 19
Data from image 20
Data from image 21
Data from image 22
Data from image 23
Data from image 24
Data from image 25
Data from image 26
Data from image 27
Data from image 28
Data from image 29
Data from image 30
Data from image 31
Data from image 32
Data from image 33
Data from image 34
Data from image 35
Data from image 36
Data from image 37
Data from image 38
Data from image 39
Data from image 40
Data from image 41
Data from image 42
Data from image 43
Data from image 44
Data from image 45
Data from image 46
Data from image 47
Data from image 48
Data from image 49
Data from image 50
Data from image 51
Data from image 52


In [None]:
# loading features if already extracted
x_data = np.load('x_data_saved.npy')
y_data = np.load('y_data_saved.npy')

print(x_data.shape)
print(y_data.shape)

print(y_data)

In [49]:
# SVC training

from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# print(y_data)

kernel = 'linear'
max_iteration = -1

models = []
for i in range(class_num):
    models.append(make_pipeline(StandardScaler(), SVC(kernel=kernel, max_iter=max_iteration, probability=True)))

for i in range(len(labels)):
    models[i].fit(x_data, y_data == i) # training 


In [51]:
x = extract_features(filenames[0])

  kmeans = KMeans(n_clusters=n_colors, random_state=0).fit(samples)


In [52]:
print(x[0])
print(y_data)
for i in range(len(labels)):
    r = models[i].predict([x[0]])
    if r[0]:
        print(models[i].decision_function([x[0]]))
        print(i)

[ 6.18188981e-01  2.51910163e+00  5.57609557e+00  4.90991052e+00
 -1.01529576e+01 -6.16950292e+00  1.19991557e+01  5.40519960e-04
  5.45654015e-04  5.46700388e-04]
[ 1 -1 14 21 15 22  2  3 11 20  6 20 14 15 12 12 20  0 21  5 10  0 15  8
  5 17  6 24  1  2 13 23  9 24  3 22 17 18 21 14 24 17 19  3  8  3 20  1
 23  7 19  2 24 18  5 12  9  6 10 19  9  5 17 22  8 22  7 12 16 19 20  6
 16 11 13 11 23 19  4 24  6 24 19 11  8  9 19  0  8 10 15 10 17 20 15 23
 21 21 17  0 18 10 19  3  8 13 12 21 15 19 11  5 16 11 24 20  7 20  8  8
  7 14 25 13 25 13 12 23  9  6 14 21 10 21 21  6  9  4 14 14  1  6 24  3
  8  2  1 11 16  0  2 13 12  8  2 18 25  2  4 14  8 23 18 16 10  4  0 24
 10 22  8  7 21  4 17  5 25  5 24 15 21 17 19 13 12 10 13 17 13 22 12  2
  5 13 12 24 20 11  6 17  0  7 21  7  5 10 16  1 12 14  4 19 11 21 25 22
 11  2  1 20 17 20 16 12 10 23 25 13 24  2  7 20 17 20  9 18 22 17 11  5
 15 22  2  6  1  4 23 20 21 20 19  4 14  4 19  5 13 11 17  0  8  0 14 17
 10  8  3  2 14 22 12  9  1  5  3