In [None]:
%%time
# Update the dataset in case it has changed.
! cd ../input; ./update-dataset.sh >/dev/null

# Recreate the working directories from scratch: cmp23/{female,male},
# preprocessed/{female,male} and features.
# NOTE(review): the brace expansion here and the `function`/`let` syntax below
# are bash features -- confirm that the shell used by `!` is bash.
! rm -rf cmp23 preprocessed features
! mkdir -p {cmp23,preprocessed}/{fe,}male features

# Copy the raw male and female samples into the working cmp23 directory.
! cp ../input/cmp23-handwritten-males-vs-females/Males/Males/* cmp23/male
! cp ../input/cmp23-handwritten-males-vs-females/Females/Females/* cmp23/female

# Rename all the dataset files to simple zero-padded sequential names
# (001.jpg, 002.jpg, ...) inside cmp23/female and cmp23/male.
# The unquoted $(/bin/ls) is word-split, so file names containing whitespace
# would be split into several words.
! function rename { for path in "$@"; do cd $path; c=1; for file in $(/bin/ls); do /bin/mv $file $(printf "%03d" $c).jpg; let c=c+1; done; cd - >/dev/null; done; }; rename cmp23/{fe,}male

# Check the directory tree.
! tree --filelimit 100

In [1]:
# Imports.
from utils import *
from helpers import *
from processing import *
from features import *
#from model import *

In [2]:
# Constants.
ALL_IMAGES = get_all_images()
LABELS = load_labels()

# Paths of a few hand-picked images, used for visual observation.
TST_IMAGES = [
    cmp(gender, index)
    for gender, index in (
        ('female', 9),
        ('male', 67),
        ('male', 89),
        ('female', 3),
        ('male', 12),
        ('female', 4),
    )
]

In [None]:
# Preview the test images before preprocessing them.
# TST_IMAGES is the hand-picked sample list from the constants cell.
preview(TST_IMAGES)

In [None]:
%%time
# Preprocess the images and write them to the disk.
for image_path in ALL_IMAGES:
    imwrite(pre(image_path), preprocess(image_path))

In [None]:
# Preview the test images after preprocessing.
# pre() maps each raw path to the path its preprocessed copy was written to.
# Note that map() hands preview a one-shot iterator, not a list.
preview(map(pre, TST_IMAGES))

In [None]:
%%time
# GLCM features.
cmp_features = []

for image_path in map(pre, ALL_IMAGES):
    cmp_features.append(glcm(imread_bw(image_path)))

save_feature('glcm', cmp_features)

In [None]:
# Run svm_test on the loaded 'glcm' features with C=10 and an RBF kernel.
features = load_feature('glcm')
svm_test(features, LABELS, C=10, kernel='rbf')

In [None]:
%%time
# LBP features.
cmp_features = []

for image_path in map(pre, ALL_IMAGES):
    cmp_features.append(lbp(imread_bw(image_path)))

save_feature('lbp', cmp_features)

In [None]:
# Run svm_test on the loaded 'lbp' features with C=10 and an RBF kernel.
features = load_feature('lbp')
svm_test(features, LABELS, C=10, kernel='rbf')

In [None]:
%%time
# HoG features.
cmp_features = []

# Get the best resize ratio.
resize_w, resize_h = get_avg_image_shape()
# Upscale the resize parameters so not to lose any resolution.
resize_factor = int(resize_w * 1.15), int(resize_h * 1.15)

for image_path in map(pre, ALL_IMAGES):
    cmp_features.append(hog(imread_bw(image_path), resize_factor))

# Run PCA first to shrink the number of features.
cmp_features = pca(cmp_features)
save_feature('hog', cmp_features)

In [None]:
# Run svm_test on the loaded 'hog' features with C=10 and an RBF kernel.
features = load_feature('hog')
svm_test(features, LABELS, C=10, kernel='rbf')

In [None]:
%%time
# Chain Code features.
cmp_features = []

for image_path in map(pre, ALL_IMAGES):
    cmp_features.append(chain_codes_and_pairs(imread(image_path)))

save_feature('chain_codes_and_pairs', cmp_features)

In [None]:
# Run svm_test on the loaded 'chain_codes_and_pairs' features with C=10 and an RBF kernel.
features = load_feature('chain_codes_and_pairs')
svm_test(features, LABELS, C=10, kernel='rbf')

In [None]:
%%time
# Distribution of segment slopes and distribution of curvature features.
cmp_features = []

for image_path in map(pre, ALL_IMAGES):
    cmp_features.append(slopes_and_curves(imread(image_path)))

save_feature('slopes_and_curves', cmp_features)

In [None]:
# Run svm_test on the loaded 'slopes_and_curves' features with C=10 and an RBF kernel.
features = load_feature('slopes_and_curves')
svm_test(features, LABELS, C=10, kernel='rbf')

In [None]:
# Run svm_test on several stored feature sets joined by combine_features.
# 'hog' is deliberately left out of the combination.
features = combine_features(*[
    load_feature(feature_name)
    for feature_name in (
        'chain_codes_and_pairs',
        'slopes_and_curves',
        'lbp',
        'glcm',
    )
])

svm_test(features, LABELS, C=10, kernel='rbf')

In [None]:
%%time
# Hinge features.
cmp_features = []

for image_path in map(pre, ALL_IMAGES):
    cmp_features.append(hinge(imread(image_path)))

save_feature('hinge', cmp_features)

In [None]:
# Run svm_test on the loaded 'hinge' features with C=10 and an RBF kernel.
features = load_feature('hinge')
svm_test(features, LABELS, C=10, kernel='rbf')

In [None]:
%%time
# COLD features.
cmp_features = []
import time
for image_path in map(pre, ALL_IMAGES):
    s = time.time()
    cmp_features.append(cold(imread(image_path), max_cnts=1000))
    # print(f'took {time.time() - s}')

save_feature('cold', cmp_features)

In [None]:
# Run svm_test on the loaded 'cold' features with C=10 and an RBF kernel.
# Unlike the other evaluation cells, this one also passes times=100.
features = load_feature('cold')
svm_test(features, LABELS, C=10, kernel='rbf', times=100)

# Learning

In [3]:
from features import FEATURES

import pickle
import sklearn.ensemble
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, BaggingClassifier, StackingClassifier, VotingClassifier



class SVM(SVC):
    """Project-local name for scikit-learn's ``SVC``; overrides nothing."""


class RFC(RandomForestClassifier):
    """Project-local name for scikit-learn's ``RandomForestClassifier``; overrides nothing."""


class DTC(DecisionTreeClassifier):
    """Project-local name for scikit-learn's ``DecisionTreeClassifier``; overrides nothing."""


class ANN(MLPClassifier):
    """Project-local name for scikit-learn's ``MLPClassifier``; overrides nothing."""


class KNN(KNeighborsClassifier):
    """Project-local name for scikit-learn's ``KNeighborsClassifier``; overrides nothing."""


class GenderClassifier(sklearn.ensemble.AdaBoostClassifier):
    """AdaBoost classifier with a feature-name check on predict and a pickling helper."""

    def __init__(self):
        """Create the classifier with AdaBoostClassifier's default hyper-parameters."""
        # A bare `pass` here skipped the parent initialiser, so none of the
        # AdaBoost hyper-parameter attributes existed and fit() failed as soon
        # as it read one of them.
        super().__init__()

    def predict(self, features: dict) -> int:
        """Validate the feature names, then delegate to AdaBoostClassifier.predict.

        Args:
            features: Mapping whose keys must all be members of FEATURES.
                FEATURES is looked up in the module/kernel globals at call
                time, so a later rebinding of that name changes this check.

        Returns:
            Whatever the parent ``predict`` returns for ``features``.

        Raises:
            AssertionError: If any key of ``features`` is not in FEATURES.
        """
        assert all(feature_name in FEATURES
                   for feature_name in features.keys()), "Encountered an unknown feature!"
        # NOTE(review): the dict is forwarded unchanged, while the parent predict
        # is documented to take an array-like sample matrix -- confirm how the
        # feature mapping is meant to be converted before this call.
        return super().predict(features)

    def pickle(self, file_name='classifier.pkl'):
        """Serialise this classifier to ``file_name`` with the pickle module.

        Args:
            file_name: Destination path; defaults to 'classifier.pkl'.
        """
        with open(file_name, 'wb') as clf_file:
            pickle.dump(self, clf_file)


In [None]:
# Base estimators.
svm = SVM(kernel='rbf', C=10, probability=True)
rfc = RFC()
ann = ANN(solver='lbfgs', hidden_layer_sizes=(30,), max_iter=10000)
knn = KNN()

# (name, estimator) pairs handed to the stacking ensemble.
clfs = [
    ('svm', svm),
    ('ann', ann),
    ('rfc', rfc),
    ('knn', knn),
]
stk = StackingClassifier(clfs)

# Run c_test on the 'hinge' features: first the stacked ensemble with count=3,
# then the SVM on its own.
c_test(load_feature('hinge'), LABELS, stk, count=3)
c_test(load_feature('hinge'), LABELS, svm)

****************************************************************** [66.67%]                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                             

In [None]:
# Feature names currently in use. This rebinds the FEATURES name that the
# "Learning" cell imported from the features module.
FEATURES = {
    'lbp',
    # 'hog',
    'glcm',
    # 'cold',
    'hinge',
    'slopes_and_curves',
    'chain_codes_and_pairs',
}

# Join the two feature sets named in the load_feature calls.
chain_features = combine_features(
    load_feature('slopes_and_curves'),
    load_feature('chain_codes_and_pairs'),
)

# Keyword arguments for the ANN (MLPClassifier subclass).
ann_kwargs = dict(
    solver='lbfgs',
    hidden_layer_sizes=(30,),
    max_iter=10000,
)
c_test(chain_features, LABELS, ANN(**ann_kwargs))