# Data Preparation

In [None]:
%%time
# Update the dataset incase it has changed.
! cd ../input; ./update-dataset.sh >/dev/null

# Clean the input directory structure.
! rm -rf cmp23 preprocessed features meta
! mkdir -p {cmp23,preprocessed}/{fe,}males features meta

# Copy the images.
! cp ../input/cmp23-handwritten-males-vs-females/Males/Males/* cmp23/males
! cp ../input/cmp23-handwritten-males-vs-females/Females/Females/* cmp23/females

# Remove some image which we know will break our preprocessing pipeline.
! rm -f cmp23/males/{M61,M69,M73,M96}.jpg cmp23/females/{F38,F64}.jpg

# Rename all the dataset files to simple names.
! function rename { for path in "$@"; do cd $path; c=1; for file in $(/bin/ls); do /bin/mv $file $(printf "%03d" $c).jpg; let c=c+1; done; cd - >/dev/null; done; }; rename cmp23/{fe,}males

# Check the directory tree.
! tree --filelimit 100

In [None]:
# Imports and Constants.
from utils import *
from helpers import *
from processing import *
from features import *
from model import *


# Dataset-wide inputs shared by the cells below.
ALL_IMAGES = get_all_images()
LABELS = load_labels()

# Paths of a few sample images used for visual observation.
TST_IMAGES = [
    cmp(gender, index)
    for gender, index in (
        ('female', 9),
        ('male', 67),
        ('male', 89),
        ('female', 3),
        ('male', 12),
        ('female', 4),
    )
]

def svm_test(xs, ys, count=100, test_size=0.2, **kwargs):
    """Run ``g_test`` with a classifier built by ``svm(**kwargs)``.

    ``xs``, ``ys``, ``count`` and ``test_size`` are forwarded to ``g_test``
    unchanged; any extra keyword arguments are passed to ``svm``.
    Returns whatever ``g_test`` returns.
    """
    classifier = svm(**kwargs)
    return g_test(xs, ys, classifier, count, test_size)

In [None]:
# Preview the sample images (TST_IMAGES) as they are before preprocessing.
preview(TST_IMAGES)

# Preprocessing

In [None]:
%%time
# Run every dataset image through preprocess() and write the result to the
# path that pre() gives for it.
for image_path in ALL_IMAGES:
    target_path = pre(image_path)
    raw_image = imread(image_path, apply_tresh=False)
    imwrite(target_path, preprocess(raw_image))

In [None]:
# Preview the same sample images after preprocessing (paths mapped lazily
# through pre()).
preview(pre(path) for path in TST_IMAGES)

# Feature Extraction

## GLCM

In [None]:
%%time
# GLCM features: one glcm() result per preprocessed image, in ALL_IMAGES order.
cmp_features = [
    glcm(imread_np(pre(path))) for path in ALL_IMAGES
]

save_feature('glcm', cmp_features)

In [None]:
# Load the stored 'glcm' features and run svm_test on them against LABELS.
features = load_feature('glcm')
svm_test(xs=features, ys=LABELS)

## Local Binary Pattern

In [None]:
%%time
# LBP features: one lbp() result per preprocessed image, in ALL_IMAGES order.
cmp_features = [
    lbp(imread_np(pre(path))) for path in ALL_IMAGES
]

save_feature('lbp', cmp_features)

In [None]:
# Load the stored 'lbp' features and run svm_test on them against LABELS.
features = load_feature('lbp')
svm_test(xs=features, ys=LABELS)

## Histogram of Oriented Gradients

In [None]:
%%time
# HoG features.
# get_avg_image_shape() supplies the base width/height for resizing.
resize_w, resize_h = get_avg_image_shape()
# Scale both dimensions up by 15% so that no resolution is lost.
resize_factor = (int(resize_w * 1.15), int(resize_h * 1.15))

cmp_features = [
    hog(imread_np(pre(path)), resize_factor) for path in ALL_IMAGES
]

# Shrink the number of features with PCA before storing them.
cmp_features = pca(cmp_features, 'hog')
save_feature('hog', cmp_features)

In [None]:
# Load the stored 'hog' features and run svm_test on them against LABELS.
features = load_feature('hog')
svm_test(xs=features, ys=LABELS)

## Dist. of Chain Codes & Dist. of Chain Code Pairs

In [None]:
%%time
# Chain code features: one chain_codes_and_pairs() result per preprocessed
# image, in ALL_IMAGES order.
cmp_features = [
    chain_codes_and_pairs(imread(pre(path))) for path in ALL_IMAGES
]

save_feature('chain_codes_and_pairs', cmp_features)

In [None]:
# Load the stored 'chain_codes_and_pairs' features and run svm_test on them
# against LABELS.
features = load_feature('chain_codes_and_pairs')
svm_test(xs=features, ys=LABELS)

## Dist. of Segment Slopes & Dist. of Curvatures

In [None]:
%%time
# Segment-slope and curvature distribution features: one slopes_and_curves()
# result per preprocessed image, in ALL_IMAGES order.
cmp_features = [
    slopes_and_curves(imread(pre(path))) for path in ALL_IMAGES
]

save_feature('slopes_and_curves', cmp_features)

In [None]:
# Load the stored 'slopes_and_curves' features and run svm_test on them
# against LABELS.
features = load_feature('slopes_and_curves')
svm_test(xs=features, ys=LABELS)

## Hinge

In [None]:
%%time
# Hinge features: one hinge() result per preprocessed image, in ALL_IMAGES order.
cmp_features = [
    hinge(imread(pre(path))) for path in ALL_IMAGES
]

save_feature('hinge', cmp_features)

In [None]:
# Load the stored 'hinge' features and run svm_test on them against LABELS.
features = load_feature('hinge')
svm_test(xs=features, ys=LABELS)

## COLD

In [None]:
%%time
# COLD features: one cold() result per preprocessed image, in ALL_IMAGES order.
cmp_features = [
    cold(imread(pre(path))) for path in ALL_IMAGES
]

save_feature('cold', cmp_features)

In [None]:
# Load the stored 'cold' features and run svm_test on them against LABELS.
features = load_feature('cold')
svm_test(xs=features, ys=LABELS)

# Learning

In [None]:
from sklearn.model_selection import train_test_split
import random

# Load every feature set named in FEATURES exactly once.
ALL_FEATURES = {name: load_feature(name) for name in FEATURES}

TRAIN_FEATURES = {}
TEST_FEATURES = {}
# One seed per run, shared by every split below.
rnd = int(random.random() * 93024949)

for feature, values in ALL_FEATURES.items():
    # Split the values already held in ALL_FEATURES instead of calling
    # load_feature() a second time for each feature.
    # Every call uses the same random_state, so the samples stay aligned
    # across features and TRAIN_LABELS / TEST_LABELS come out the same on each
    # iteration (this relies on all feature sets having len(LABELS) rows).
    (TRAIN_FEATURES[feature], TEST_FEATURES[feature],
     TRAIN_LABELS, TEST_LABELS) = train_test_split(values, LABELS, random_state=rnd)

# Classifier configurations handed to GenderClassifier, keyed by name.
# NOTE(review): each value is presumably a dict of keyword arguments for that
# classifier — confirm against GenderClassifier in `model`.
clfs = {
    'svm_1': {},
    #'svm_2': {'C': 50000},
    #'ann_1': {},
    #'ann_2': {'hidden_layer_sizes': (20, 15), 'max_iter': 1000},
    'knn_1': {},
    'rfc_1': {},
    'dtc_1': {},
}

In [None]:
# Build a GenderClassifier from the `clfs` configuration and fit it on the
# training split only.
# NOTE(review): the meaning of `booster=4` is defined by GenderClassifier.fit — confirm there.
gc = GenderClassifier(clfs)
gc.fit(TRAIN_FEATURES, TRAIN_LABELS, booster=4)

In [None]:
# Turn TEST_FEATURES ({feature name: values indexed by sample}) into one
# {feature name: value} dict per test sample.
feature_inverse = [
    {name: values[idx] for name, values in TEST_FEATURES.items()}
    for idx in range(len(TEST_LABELS))
]

# NOTE(review): the third positional argument (True) is interpreted by
# GenderClassifier.score — confirm its meaning there. Only the value of the
# last expression is displayed by the notebook.
gc.score(feature_inverse, TEST_LABELS, True)
gc.score(feature_inverse, TEST_LABELS)

In [None]:
# Create a fresh GenderClassifier, fit it on every sample (ALL_FEATURES with
# the full LABELS) and then call its pickle() method.
# NOTE(review): pickle() presumably persists the fitted model — confirm; also
# note booster=3 here, whereas the train-split fit uses booster=4.
gc = GenderClassifier(clfs)
gc.fit(ALL_FEATURES, LABELS, booster=3)
gc.pickle()