In [1]:
from pathlib import Path
from time import time
import xml.etree.ElementTree as ET

import matplotlib.pyplot as plt
import numpy as np
from joblib import Parallel, delayed
from skimage import img_as_float
from skimage.io import imread
from skimage.transform import resize
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC, LinearSVC
from tqdm import tqdm
# from concurrent.futures import ThreadPoolExecutor as PoolExecutor
# from concurrent.futures import as_completed

from pillclassification.feature_extraction import feature_extraction
from pillclassification.functions import crop_center, generate_image
from utils.utils import tqdm_joblib

# Dataset locations, relative to the notebook's working directory.
images_dir = Path('utils/Dataset/merge')
bg_dir = Path('utils/Dataset/backgrounds')

# Every entry of the image folder except the XML files is kept as a sample file.
# NOTE: Path.iterdir() yields entries in arbitrary order, so the order of these
# lists is not guaranteed to be the same on every machine.
filenames = [entry for entry in images_dir.iterdir() if entry.suffix != '.xml']

# Every entry of the backgrounds folder is kept.
bgs = list(bg_dir.iterdir())

# Number of sample files found, and number of columns of the feature matrix.
samples_num = len(filenames)
feature_number = 10

In [2]:
# Calculating labels: parse the dataset index (images.xml) and collect the
# distinct NDC9 codes, which become the class labels.
try:
    tree = ET.parse(images_dir / 'images.xml')
except ET.ParseError:
    print('Parse error on {}'.format(images_dir / 'images.xml'))
    # Re-raise instead of calling exit(-1): inside a Jupyter kernel exit() asks
    # the kernel to shut down, which throws away every variable of the session
    # and hides the traceback that explains what is wrong with the file.
    raise

# First child of the XML root; its children are the per-image entries.
se = tree.getroot()[0]

labels_set = set()
segmented = 0

for entry in se:
    labels_set.add(entry.find('NDC9').text)
    # Count the entries whose layout marks them as the segmented reference set.
    layout = entry.find('Layout')
    if layout is not None and layout.text == "MC_C3PI_REFERENCE_SEG_V1.6":
        segmented += 1

# Sorted so that a label's index (used as the class id) is deterministic.
labels = sorted(labels_set)
class_num = len(labels)

# Number of synthetic images generated from each 4-channel source image.
generate_n = 10

# Expected number of samples after augmentation.
# NOTE(review): assumes every "segmented" XML entry corresponds to exactly one
# 4-channel file in `filenames` -- TODO confirm.
final_samples = segmented * generate_n + len(filenames) - segmented
print(final_samples)

3406


In [3]:
def extract_features(f):
    """Build the feature rows and class labels for one image file.

    The file is read with ``imread``. A 3-D image whose last axis has 4
    channels is expanded into ``generate_n`` images through ``generate_image``,
    each with a background picked at random from ``bgs``. Any other image is
    center-cropped, resized to a fixed width of 600 pixels (height scaled by the
    same factor) and converted with ``img_as_float``. ``feature_extraction`` is
    then applied to every resulting image; images for which it raises
    ``ValueError`` are skipped.

    Args:
        f: ``pathlib.Path`` of the image. ``f.name`` is matched against the
            ``File/Name`` entries of the XML index ``se`` to find the label.

    Returns:
        ``None`` when ``imread`` raises ``ValueError``. Otherwise a tuple
        ``(features, labels_)``:

        * ``features`` -- array of shape ``(n, feature_number)``, one row per
          image that was processed successfully (``n`` may be 0);
        * ``labels_`` -- ``int32`` array of length ``n`` holding the index of
          the file's NDC9 code in ``labels``, or ``-1`` when the file has no
          entry in the XML index.
    """
    # loading the image
    try:
        img = imread(f)
    except ValueError:
        # Unreadable file: reported to the caller as None.
        return None

    images = []
    # ndim guard: a 2-D (grayscale) image that happens to be 4 pixels wide must
    # not be mistaken for a 4-channel image.
    if img.ndim == 3 and img.shape[-1] == 4:
        for _ in range(generate_n):
            images.append(generate_image(img, bgs[np.random.randint(0, len(bgs))]))
    else:
        # cropping in the center
        img = crop_center(img, crop_scale=0.65)

        # rescaling with fixed width, keeping the aspect ratio
        width = 600
        img = resize(img, (int(img.shape[0] * (width / img.shape[1])), width), anti_aliasing=True)

        # the img must be in float format
        images.append(img_as_float(img))

    # The label depends only on the file, not on the generated image, so it is
    # looked up once instead of once per image.
    label = -1
    for e in se:
        if e.find('File').find('Name').text == f.name:
            label = labels.index(e.find('NDC9').text)
            break

    # feature extraction
    rows = []
    for img in images:
        try:
            hu, rgb_val = feature_extraction(img)
        except ValueError:
            continue
        rows.append(np.append(hu, rgb_val))

    # Stack once. The leading empty block fixes the column count, so the result
    # is (0, feature_number) when nothing was extracted and a row of the wrong
    # length still raises, as it did with the incremental np.append.
    features = np.vstack([np.empty((0, feature_number))] + rows)
    # Always an integer array, including the empty case, so that concatenating
    # results does not silently promote the class ids to float.
    labels_ = np.full(len(rows), label, dtype=np.int32)

    return features, labels_

In [4]:
# Extracting features: run extract_features on every file with joblib workers.
# NOTE(review): the recorded run of this cell ended with TerminatedWorkerError
# (workers received SIGKILL); joblib's message names excessive memory usage as
# one possible cause. Lowering n_jobs may help -- unverified.
with tqdm_joblib(tqdm(desc="Feature extraction", total=len(filenames))):
    with Parallel(n_jobs=10) as parallel:
        results = parallel(delayed(extract_features)(f) for f in filenames)

# extract_features returns None for files that could not be read; indexing such
# a result would raise TypeError, so those files are skipped.
results = [r for r in results if r is not None]

# Concatenate once instead of re-copying the accumulated arrays per file. The
# leading empty arrays keep the shapes valid when there are no results.
x_data = np.concatenate([np.empty((0, feature_number))] + [r[0] for r in results])
# Class ids are integers; cast back because concatenating with float (or empty
# list) label chunks promotes the result to float64.
y_data = np.concatenate([np.array([], dtype=np.int32)] + [r[1] for r in results]).astype(np.int32)

# printing the features (they are saved to .npy files in the next cell)
print(x_data)
print(y_data)

Feature extraction:  24%|██▍       | 178/742 [08:56<28:19,  3.01s/it]  


TerminatedWorkerError: A worker process managed by the executor was unexpectedly terminated. This could be caused by a segmentation fault while calling the function or by an excessive memory usage causing the Operating System to kill the worker.

The exit codes of the workers are {SIGKILL(-9)}

In [None]:
# Persist the feature matrix and the label vector; they are read back from
# 'x_data_saved.npy' / 'y_data_saved.npy' by the loading cell.
np.save(file='x_data_saved', arr=x_data)
np.save(file='y_data_saved', arr=y_data)

In [86]:
# Reload previously extracted features so the extraction cell can be skipped.
x_data = np.load(file='x_data_saved.npy')
y_data = np.load(file='y_data_saved.npy')

# Drop the sample at index 1 (second row) from both arrays so they stay aligned.
# NOTE(review): the reason this particular sample is removed is not recorded in
# the notebook -- TODO confirm and document.
x_data = np.delete(x_data, obj=1, axis=0)
y_data = np.delete(y_data, obj=1, axis=0)

# print(y_data)

In [87]:
# Size of the training split: two thirds of the samples, rounded down.
n_train = (2 * len(x_data)) // 3
# Show how many samples are left over for evaluation.
print(len(x_data) - n_train)

247


In [88]:
# SVC training
# Standardize the features, then fit a linear SVM on the first n_train samples.
lin_clf = make_pipeline(
    StandardScaler(),
    LinearSVC(random_state=0, tol=1e-6, verbose=1, max_iter=10000),
)

lin_clf.fit(x_data[:n_train], y_data[:n_train])

[LibLinear]

Pipeline(steps=[('standardscaler', StandardScaler()),
                ('linearsvc',
                 LinearSVC(max_iter=10000, random_state=0, tol=1e-06,
                           verbose=1))])

In [10]:
# Spot check on a single file: x is the (features, labels) tuple returned by
# extract_features, or None if the image could not be read.
x = extract_features(f=filenames[3])

  kmeans = KMeans(n_clusters=n_colors, random_state=0).fit(samples)


In [105]:
# Evaluate the classifier on the samples that were not used for training
# (everything from index n_train onwards).

# Decision-function values of the held-out samples.
df = lin_clf.decision_function(x_data[n_train:])

# Mean accuracy on the held-out samples.
print(lin_clf.score(x_data[n_train:], y_data[n_train:]))

0.31983805668016196
