In [8]:
import os, sys

import numpy   as np
import pandas  as pd
import cPickle as pkl

from scipy import stats
from PIL   import Image, ImageFilter

from sklearn.decomposition   import PCA
from sklearn.linear_model    import LogisticRegression
from sklearn.model_selection import RandomizedSearchCV

import matplotlib.pyplot as plt

import generate_image_lists as giList
import generate_image_labels as giLabels
import edge_feature_generation as efg

%matplotlib inline

# Notebook-wide plotting defaults: dark theme and a larger default figure size.
plt.style.use('dark_background')
plt.rcParams['figure.figsize'] = (14,10)

In [87]:
# Load the fitted models used by the prediction helpers below.
#
# Pickles are binary data, so the files are opened in 'rb': text mode can
# corrupt the stream on platforms that translate newlines, and Python 3's
# pickle refuses text-mode files outright.
#
# NOTE: unpickling can execute arbitrary code -- only load files you trust.

# Each *_CLF object is indexed by class-label name and every entry must
# expose predict_proba (see _get_predict).
with open('data/height_logistic_clf.pkl','rb') as f:
    _LOGISTIC_HEIGHT_CLF = pkl.load( f )
with open('data/short_logistic_clf.pkl','rb') as f:
    _LOGISTIC_SHORT_CLF  = pkl.load( f )
with open('data/long_logistic_clf.pkl','rb') as f:
    _LOGISTIC_LONG_CLF   = pkl.load( f )

# Fitted transformer applied to the concatenated row/column profile
# (only its .transform method is used in this notebook).
with open('data/row_col_pca.pkl'      ,'rb') as f:
    _RC_PCA              = pkl.load( f )

# Numeric suffixes of the 'short_<n>' / 'long_<n>' class labels.
_SHORT_NUMS = [1,2,4,6,8]
_LONG_NUMS  = [1,2,3,4,6,8,10,12]

In [90]:
# Get the features from an image
def generate_features( img_path ):
    """Build the 1-D feature vector for a single image.

    The vector is laid out as [row/column ratio, PCA components...], which
    matches the column order used for `feature_cols` in this notebook
    ('row_col_ratio' first, then the PCA columns).

    Parameters
    ----------
    img_path : path of the image, passed straight to
        efg.get_img_edge_data (called with blur=3).

    Returns
    -------
    1-D numpy array: the ratio followed by the PCA-transformed
    row/column profile.
    """

    # Returns relative size of axes, and normalized sum of the rows and columns
    # (presumably rc_ratio is a scalar and row_avg / col_avg are 1-D --
    #  TODO confirm against edge_feature_generation)
    rc_ratio, row_avg, col_avg    = efg.get_img_edge_data( img_path, blur=3 )

    # Combine the latter two into one profile
    row_col_arr = np.concatenate( ( row_avg, col_avg ) )

    # Run pca to collapse the profile (original note: 1/20 the size, 85% variance).
    # scikit-learn transformers expect (n_samples, n_features), so present
    # the single profile as a one-row matrix.
    pca_vals = _RC_PCA.transform( np.asarray( row_col_arr ).reshape( 1, -1 ) )

    # np.concatenate cannot join 0-d arrays or arrays of different rank:
    # promote the ratio to 1-D and flatten the (1, k) PCA output first.
    return np.concatenate( ( np.atleast_1d( rc_ratio ), np.ravel( pca_vals ) ) )
    
    
# Will generate predictions for provided classes
# Can return raw probabilities of being the class,
#  or return the expected label
def _get_predict( 
                    inp_arr,
                    class_list,
                    clf_dict,
                    return_prob,
                ):
    
    # Get an idea of how many things we are passing
    inp_shape = len( inp_arr.shape )

    # If only one element, have to adjust format
    if ( inp_shape == 1 ):
        pred_arr_format = inp_arr.reshape(1,-1)
    else:
        pred_arr_format = inp_arr
        
    # Get the probability of a given class
    prob_dict = {}
    for classif in class_list:
        prob_dict[classif] = clf_dict[classif].predict_proba( pred_arr_format )[:,1]

    # If we are just returning the probabilities,
    #  can stop here and return a dict
    if ( return_prob ):
        return prob_dict
    
    
    # Otherwise, go through, find best prediction,
    #  and return that
    
    
    out_list = []
    
    # Compare each prediction, and 
    #  locate largest values
    # Populate the out array with these classes
    
    # Loop over each element
# LATER MODIFY TO CONSIDER THRESHOLD
    for i in range( 0, inp_arr.shape[0] ):
        
        # Loop over classes, finding the best
        best_str = class_list[0]
        for classif in class_list[1:]:
            if ( prob_dict[best_str][i] < prob_dict[classif][i] ):
                best_str = classif            
        out_list.append( best_str )
        
    return out_list
    
# Predict the height category of each sample
def get_height_predict( inp_arr, return_prob=False ):
    """Classify samples into the height classes.

    Delegates to _get_predict with the height classifiers; returns the
    best label per sample, or the per-class scores when return_prob is true.
    """
    # Candidate labels, in tie-break priority order
    height_classes = ['height_brick','height_plate','height_other']

    return _get_predict(
        inp_arr,
        height_classes,
        _LOGISTIC_HEIGHT_CLF,
        return_prob,
    )

# Predict the short-side category of each sample
def get_short_predict( inp_arr, return_prob=False ):
    """Classify samples into the 'short_<n>' classes.

    One class is evaluated per entry of _SHORT_NUMS. Delegates to
    _get_predict with the short-side classifiers; returns the best label
    per sample, or the per-class scores when return_prob is true.
    """
    # Candidate labels, in the order of _SHORT_NUMS
    short_classes = []
    for num in _SHORT_NUMS:
        short_classes.append( 'short_' + str(num) )

    return _get_predict(
        inp_arr,
        short_classes,
        _LOGISTIC_SHORT_CLF,
        return_prob,
    )

# Predict the long-side category of each sample
def get_long_predict( inp_arr, return_prob=False ):
    """Classify samples into the 'long_<n>' classes.

    One class is evaluated per entry of _LONG_NUMS. Delegates to
    _get_predict with the long-side classifiers; returns the best label
    per sample, or the per-class scores when return_prob is true.
    """
    # Candidate labels, in the order of _LONG_NUMS
    long_classes = []
    for num in _LONG_NUMS:
        long_classes.append( 'long_' + str(num) )

    return _get_predict(
        inp_arr,
        long_classes,
        _LOGISTIC_LONG_CLF,
        return_prob,
    )

In [30]:
# Read the labelled feature table and discard the stray index column
full_df = pd.read_csv('data/white_labels_edge_pca_50.csv')
full_df = full_df.drop( 'Unnamed: 0', axis=1 )

# Model inputs: the row/column ratio followed by every PCA column
feature_cols = ['row_col_ratio']
for col in full_df.columns.values:
    if ( 'PCA' in col ):
        feature_cols.append( col )

# Split the table into features and everything else (the labels)
feature_df = full_df.loc[:, feature_cols]
label_df   = full_df.drop( feature_cols, axis=1 )

In [83]:
# Peek at six consecutive labels.
# The start row is set here so the cell does not rely on an `i` left over
# from a cell that runs later in the notebook (fails on Restart & Run All).
i = 22
label_df.loc[i:i+5, 'label']

22               plate_2x4
23        roof_tile_2x2_45
24    roof_tile_1x3_25_inv
25    roof_tile_1x3_25_inv
26    roof_tile_1x3_25_inv
27               plate_4x8
Name: label, dtype: object

In [92]:
# Show the labels for rows i..i+5, then the predictions for rows i..i+1
i = 15
print( label_df.loc[i:i+5, 'label'] )

feat_vals = feature_df.loc[i:i+1].values
print( get_height_predict( feat_vals ) )
print( get_short_predict( feat_vals ) )
print( get_long_predict( feat_vals ) )

15    plate_2x4
16    plate_2x4
17    plate_2x4
18    plate_2x4
19    plate_2x4
20    plate_2x4
Name: label, dtype: object
['height_plate', 'height_plate']
['short_2', 'short_2']
['long_2', 'long_4']
