In [1]:
import os
import sys
import joblib
import numpy as np
import matplotlib.pyplot as plt
from skimage.io import imread
from skimage.transform import resize
from sklearn import svm, metrics

In [None]:
def resize_all(src, pklname, include=None, width=64, height=None):
    """
    load images from path, resize them and write them as arrays
    to a dictionary together with labels and metadata.
    The dictionary is written to a pickle file named
    '{pklname}_{width}x{height}px.pkl'
    
    
    Parameter
    ---------
    src: str
        path to data
    pklname: str
        path to output file
    width: int
        target width of image in pixels
    include: set[str]
        set containing str
    """
    
    height = height if height is not None else width
    
    data = dict()
    data['description'] = f'resized ({int(width)}x{int(height)}) character images in greyscale'
    data['label'] = []
    data['filename'] = []
    data['data'] = []
    
    pklname = f"{pklname}_{width}x{height}px.pkl"
    
    i = 0
    part = 1
    for subdir in os.listdir(src):
        if include is None or subdir in include:
            print(f"{subdir}: {i}")
            i += 1
            current_path = os.path.join(src, subdir)
            
            for file in os.listdir(current_path):
                if file[-3:] in {'jpg', 'png'}:
                    im = imread(os.path.join(current_path, file))
                    im = resize(im, (width, height))
                    data['label'].append(subdir)
                    data['filename'].append(file)
                    data['data'].append(im)
        if i % 759 == 0:
            print("Dump")
            joblib.dump(data, f"{pklname}.part{part}", compress=True)
            part += 1
            data = dict()
            data['description'] = f'resized ({int(width)}x{int(height)}) character images in greyscale'
            data['label'] = []
            data['filename'] = []
            data['data'] = []
    # joblib.dump(data, pklname, compress=True)

In [4]:
data_path = f'{sys.path[0]}/Input/Images'

base_name = 'all_characters'
width = 64

# resize_all(src=data_path, pklname=base_name, width=width)
print("Done!")

Done!


In [5]:
from collections import Counter

data = joblib.load(f'{base_name}_{width}x{width}px.pkl.part1')
data2 = joblib.load(f'{base_name}_{width}x{width}px.pkl.part2')

data['data'] += data2['data']
data['label'] += data2['label']
data['filename'] += data2['filename']

data2 = []

print('Number of samples: ', len(data['data']))
print('keys: ', list(data.keys()))
print('description: ', data['description'])
print('image shape: ', data['data'][0].shape)
print('labels:', np.unique(data['label']))

Counter(data['label'])

Number of samples:  151801
keys:  ['description', 'label', 'filename', 'data']
description:  resized (64x64) character images in greyscale
image shape:  (64, 64)


  return array(a, dtype, copy=False, order=order, subok=True)


TypeError: '<' not supported between instances of 'list' and 'str'

In [None]:
from sklearn.model_selection import train_test_split

n_samples = len(data['data'])


X = np.array(data['data']).reshape(n_samples, -1)
y = np.array(data['label'])

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    shuffle=True,
    random_state=42)

In [None]:
#clf = svm.SVC(gamma=0.001)

clf.fit(X_train, y_train)

predicted = clf.predict(X_test)

In [None]:
print(f"Classification report for classifier {clf}:\n"
      f"{metrics.classification_report(y_test, predicted)}\n")

In [None]:
plt.rcParams["font.family"] = 'Noto Serif CJK JP'

_, axes = plt.subplots(nrows=1, ncols=4, figsize=(10, 3))
for ax, image, prediction in zip(axes, X_test, predicted):
    ax.set_axis_off()
    image = image.reshape(64, 64)
    ax.imshow(image, cmap=plt.cm.gray_r, interpolation='nearest')
    ax.set_title(f'Prediction: {prediction}')