# Calculate image features

## Image files

In [4]:
import os
from collections import namedtuple

imgTuple = namedtuple('imgTuple', ['file', 'img_class', 'img_dir'])

data_dir = '../../downloads/hw6_data/'

def get_immediate_subdirectories(some_dir):
    return [name for name in os.listdir(some_dir)
           if os.path.isdir(os.path.join(some_dir, name))]

categories = get_immediate_subdirectories(data_dir)

f0 = os.listdir(os.path.join(data_dir, categories[4]))[0]
f0_path = os.path.join(data_dir, categories[4]) + '/' + f0

n_categories = 20
n_images_per_category = 10

imgs = []
for cat in categories:
    for f in os.listdir(os.path.join(data_dir, cat)):
        imgs.append(imgTuple(file=f, img_class=cat, img_dir=os.path.join(data_dir, cat)))

## Feature-calculating functions

In [27]:
def isColorImg(some_color_func):
    def wrapper(*args, **kwargs):
        try:
            return some_color_func(*args, **kwargs)
        except:
            return 0
    return wrapper

In [6]:
import cv2
from skimage.filters import sobel
from skimage.color import rgb2gray
from skimage.feature import blob_dog, blob_log, blob_doh

@isColorImg
def calc_avgRGB(img_ndarray, color):
    "Calculate pixel average value of one color channel image"
    color_dict={'red':0, 'green':1, 'blue':2}
    tmp_img = np.zeros(img_ndarray.shape)
    tmp_img[:, :, color_dict[color]] = img_ndarray[:, :, color_dict[color]]
    width, height, _ = tmp_img.shape
    return np.sum(tmp_img/(width*height))

@isColorImg
def calc_stdRGB(img_ndarray, color):
    "Calculate standard deviation of one color channel image"
    color_dict={'red':0, 'green':1, 'blue':2}
    return np.std(img_ndarray[:, :, color_dict[color]])

def calc_colorStd_div(img_ndarray, color, extremum):
    "Calculate segmented standard deviation of a color"
    n_Xdivs = 4
    n_Ydivs = 4
    stds = np.zeros((n_Xdivs, n_Ydivs))
    for i, xBlock in enumerate(np.array_split(list(range(img_ndarray.shape[0])), n_Xdivs)):
        for j, yBlock in enumerate(np.array_split(list(range(img_ndarray.shape[1])), n_Ydivs)):
            stds[i,j] = calc_stdRGB(img_ndarray[xBlock.min():xBlock.max(), yBlock.min():yBlock.max()], color)
    if extremum == 'max':
        return np.max(stds)
    elif extremum == 'min':
        return np.min(stds)
    else:
        print(f'I don\'t know the extremum: {extremum}')
        return -99

@isColorImg
def calc_graysclSumAvg(img_ndarray):
    "Calculate pixel average value of grayscale image"
    width, height, _ = img_ndarray.shape
    return np.sum(rgb2gray(img_ndarray))/(width*height)

def calc_numFASTcorners(img_ndarray):
    "Calculate number of 'features' (corners) using the FAST algorithm"
    # Initiate FAST object with default values
    fast = cv2.FastFeatureDetector_create(threshold=100, nonmaxSuppression=True)
    kp = fast.detect(img_ndarray, None)
    return len(kp)

@isColorImg
def calc_num_ShiTomasi_corners(img_ndarray):
    "Calculate number of corners above a certain threshold with Shi-Tomasi algorithm"
    corners = cv2.goodFeaturesToTrack(image=cv2.cvtColor(img_ndarray,cv2.COLOR_BGR2GRAY),
                                      maxCorners=100, qualityLevel=0.6, minDistance=10)
    return len(corners)

def calc_meanOfEdgePixelSum(img_ndarray):
    "Really rough calculation of how much of the image is edges"
    edge_sobel = sobel(rgb2gray(img_ndarray))
    width, height = edge_sobel.shape
    return (100*np.sum(edge_sobel)/(width*height))

def calc_numBlobs(img_ndarray, blob_type):
    "How many blobs?"
    gray = rgb2gray(img_ndarray)
    if blob_type == 'blobs_dog':
        blobs = blob_dog(gray, max_sigma=40, threshold=0.2)
    elif blob_type == 'blobs_doh':
        blobs = blob_doh(gray, max_sigma=30, threshold=.01)
    elif blob_type == 'blobs_log':
        blobs = blob_log(gray, min_sigma=10, max_sigma=30, num_sigma=5, threshold=.1)
    else:
        print(f'I don\'t know that kind of blob: {blob_type}')
        return -99
    return len(blobs)

@isColorImg
def calc_colorCorr(img_ndarray, feature_name):
    "Calculate color correlation"
    color_dict={'red':0, 'green':1, 'blue':2}
    if feature_name == 'RG_corr':
        c1 = 'red'
        c2 = 'green'
    elif feature_name == 'GB_corr':
        c1 = 'red'
        c2 = 'green'
    elif feature_name == 'RB_corr':
        c1 = 'red'
        c2 = 'green'
    else:
        print(f'I don\'t know that kind color pair: {feature_name}')
        return -99
    a = img_ndarray[:, :, color_dict[c1]].flatten()
    v = img_ndarray[:, :, color_dict[c2]].flatten()
    # Normalize
    a = (a - np.mean(a)) / (np.std(a) * len(a))
    v = (v - np.mean(v)) /  np.std(v)
    return np.correlate(a, v)[0]

## What to calculate

In [7]:
# change to dictionary?
def calc_feature(img_ndarray, feature_name):
    
    # Corners and edges    
    if feature_name == 'num_FASTcorners':
        return calc_numFASTcorners(img_ndarray)
    
    elif feature_name == 'num_ShiTomasi_corners':
        return calc_num_ShiTomasi_corners(img_ndarray)
    
    elif feature_name == 'edgeSumAvg':
        return calc_meanOfEdgePixelSum(img_ndarray)
    
    # blobs
    elif feature_name in ['blobs_dog', 'blobs_doh', 'blobs_log']:
        return calc_numBlobs(img_ndarray, feature_name)
      
    # circle-find: hough circle transform in skimage
    # Daisy
    # color cross-correlation
    elif feature_name in ['RG_corr', 'GB_corr', 'RB_corr']:
        return calc_colorCorr(img_ndarray, feature_name)
    
    # Simple color channel stuff
    elif feature_name == 'avg_R':
        return calc_avgRGB(img_ndarray, 'red')
    elif feature_name == 'avg_G':
        return calc_avgRGB(img_ndarray, 'green')
    elif feature_name == 'avg_B':
        return calc_avgRGB(img_ndarray, 'blue')
    
    elif feature_name == 'R_std_total':
        return calc_stdRGB(img_ndarray, 'red')
    elif feature_name == 'G_std_total':
        return calc_stdRGB(img_ndarray, 'green')
    elif feature_name == 'B_std_total':
        return calc_stdRGB(img_ndarray, 'blue')
    
    elif feature_name == 'R_std_div_max':
        return calc_colorStd_div(img_ndarray, 'red', 'max')
    elif feature_name == 'G_std_div_max':
        return calc_colorStd_div(img_ndarray, 'green', 'max')
    elif feature_name == 'B_std_div_max':
        return calc_colorStd_div(img_ndarray, 'blue', 'max')
    
    elif feature_name == 'R_std_div_min':
        return calc_colorStd_div(img_ndarray, 'red', 'min')
    elif feature_name == 'G_std_div_min':
        return calc_colorStd_div(img_ndarray, 'green', 'min')
    elif feature_name == 'B_std_div_min':
        return calc_colorStd_div(img_ndarray, 'blue', 'min')
    

    elif feature_name == 'grayscl_sumAvg':
        return calc_graysclSumAvg(img_ndarray)

    
    else:
        print(f'sorry, don\'t know how to calculate {feature_name}.')
        return -99


## Finally calculate!

In [9]:
from skimage import io
import pandas as pd
import numpy as np
import time

#dumb features to set up pipeline
features = [
            # Corners and edges
            'num_FASTcorners', 'edgeSumAvg', 'num_ShiTomasi_corners',
            # blobs
            #'blobs_dog', 'blobs_doh', 'blobs_log',
            # color cross-correlation
            'RG_corr', 'GB_corr', 'RB_corr',
            # Simple color channel stuff
            'avg_R', 'avg_G', 'avg_B', 'grayscl_sumAvg',
            #'R_std_total', 'G_std_total', 'B_std_total',
            'R_std_div_max', 'G_std_div_max', 'B_std_div_max',
            'R_std_div_min', 'G_std_div_min', 'B_std_div_min',

            # grayscale std_dev?
            # hough_circle transform? computationally expensive....
            # Daisy - seems complicated
           # Investigate fast ICA on categories for feature inspiration?
           ]
t = time.process_time()
df = pd.DataFrame()
print('Processing images:')
dot_line_thresh = 100
count = 0
for img in imgs:
    this_img_ndarray = io.imread(os.path.join(img.img_dir, img.file))
    feature_vals = [img.img_class]
    for feature in features:
        feature_vals.append(calc_feature(this_img_ndarray, feature))
    feature_vals.append(os.path.join(img.img_dir, img.file))
    
    ddd = ['img_class'] + features + ['full_path']
    this_df = pd.DataFrame([feature_vals], index=[img.file],
                          columns=['img_class'] + features + ['full_path'])
    df = df.append(this_df)
    count = count+1
    print('.', end = '' if (count%dot_line_thresh) else '\n')
elapsed_time = time.process_time() - t
print(f'\nCalculated {len(features)} features for {len(imgs)} images in {int(elapsed_time*100)/100} s')

df.to_pickle('./img_features.pkl')
df.head(10)

# Learning from the image features

In [55]:
df = pd.read_pickle('./img_features.pkl')

In [59]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score


clf = RandomForestClassifier()
X = df.iloc[:,1:-1].values
Y = df['img_class']
clf.fit(X,Y)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

# Important features

In [139]:
n_top = 3
top_features = (-clf.feature_importances_).argsort()[0:n_top]

col_w = 1 + max([len(name) for name in df.columns.values[top_features]])
precision = 3
print(f'Most important {n_top} features')
print('-'*25)
print(f'{"Feature":<{col_w}}  :   {"Importance"}')
print('-'*32)
for idx in top_features:
    print(f'{df.columns.values[idx+1]:<{col_w}}  :   {clf.feature_importances_[idx]:.{precision}}')

Most important 3 features
-------------------------
Feature           :   Importance
--------------------------------
edgeSumAvg        :   0.0811
num_FASTcorners   :   0.0775
B_std_div_max     :   0.0733


# Cross-validation

In [140]:
scores = cross_val_score(clf, X, Y, cv=5)
print(scores)

[ 0.18390805  0.21486643  0.20518868  0.21454112  0.20823245]


About 10x better than random guessing!

In [28]:
# Pickle the classifier so it can be loaded without calculating all the training image features,
# and without training the classifier
import pickle

fname = "jaffe_classifier.p"
pickle.dump(clf, open(fname, "wb"))

# Function to run on validation directory:

In [26]:
import pickle
import os
from skimage import io

clf = pickle.load(open(fname, "rb"))

def predict_img_class(img_fname):
        # Read in image
        this_img_ndarray = io.imread(img_fname)
        
        # Calculate image features
        feature_vals = []
        for feature in features:
            feature_vals.append(calc_feature(this_img_ndarray, feature))
        
        # Predict class
        feature_vals = np.asarray(feature_vals).reshape(1,-1)
        prediction = clf.predict(feature_vals)
        return prediction[0]



def run_final_classifier(dir_path_name):
    col_w = 22
    print(f'{"filename":<{col_w}}   {"predicted_class":<{col_w}}')
    print('-'*(2*col_w-4))
    for f in os.listdir(dir_path_name):
        prediction = predict_img_class(os.path.join(dir_path_name, f))
        print(f'{f:<{col_w}}   {prediction:<{col_w}}')
    
    
run_final_classifier('../../downloads/hw6_data/gorilla')

filename                 predicted_class       
----------------------------------------
gorilla_0016.jpg         gorilla               
gorilla_0002.jpg         gorilla               
gorilla_0003.jpg         gorilla               
gorilla_0017.jpg         gorilla               
gorilla_0001.jpg         gorilla               
gorilla_0015.jpg         gorilla               
gorilla_0029.jpg         gorilla               
gorilla_0028.jpg         gorilla               
gorilla_0014.jpg         gorilla               
gorilla_0038.jpg         gorilla               
gorilla_0004.jpg         gorilla               
gorilla_0010.jpg         gorilla               
gorilla_0011.jpg         gorilla               
gorilla_0005.jpg         gorilla               
gorilla_0039.jpg         gorilla               
gorilla_0013.jpg         gorilla               
gorilla_0007.jpg         gorilla               
gorilla_0006.jpg         gorilla               
gorilla_0012.jpg         gorilla               