# Utilities

**Author**: Maleakhi Agung Wijaya  
**Email**: maw219@cam.ac.uk  
**Description**: This file contains utility functions and constants used in other notebooks.

## Constants

In [None]:
# Domain knowledge filenames
# Each constant is the path of a MATLAB .mat file stored under ../results/.
# Judging by the names, there is one file per feature set: all features
# combined, color only, shape only and texture only (TODO confirm).
# The paths are relative, so they resolve against the current working
# directory of the notebook that uses them.
all_domain_knowledge = "../results/new_shell_comp.mat"
color_domain_knowledge = "../results/shell_color.mat"
shape_domain_knowledge = "../results/shell_shape.mat"
texture_domain_knowledge = "../results/shell_texture.mat"

In [1]:
# RAW data folder
# Directory holding the shell image files. The histogram builders in this
# file join it with each image filename before reading the image.
# The path is relative, so it resolves against the current working directory.
SHELL_IMAGES_DATA = "../data/shell_species_134_data/"

## Functions

In [1]:
def load_domain_knowledge_data(filepath):
    """
    Read a MATLAB .mat file of preprocessed domain-knowledge data and
    return its features and labels.

    Note: the domain knowledge feature extraction follows the steps
    discussed by Zhang et al.
    (https://www.nature.com/articles/s41597-019-0230-3.pdf); the code was
    adapted from Matlab to Python.

    :param filepath: path of the .mat file. It must contain the
        variables "X" (features) and "Y" (labels).

    :return: tuple (X, y) with the content of the "X" and "Y" variables
    """

    mat_content = scipy.io.loadmat(filepath)

    return mat_content["X"], mat_content["Y"]

In [None]:
def plot_confusion_matrix(cm, cmap="coolwarm"):
    """
    Draw a confusion matrix as a colour-coded image with a colorbar.

    :param cm: the confusion matrix, handed to plt.imshow as is
    :param cmap: name of the matplotlib colormap used for the cells
    """

    # Every tick and tick label is switched off; only axis titles remain.
    hidden_ticks = {
        "bottom": False,
        "top": False,
        "labelbottom": False,
        "right": False,
        "left": False,
        "labelleft": False,
    }

    plt.figure(figsize=(6, 6))
    plt.imshow(cm, interpolation="nearest", cmap=cmap)
    plt.colorbar()

    plt.xlabel("Predicted", fontsize=13)
    plt.ylabel("Actual", fontsize=13)
    plt.tick_params(axis="both", which="both", **hidden_ticks)

    plt.show()

In [None]:
def nested_cv_sklearn(classifier, param_grids, X, y, n_iter=5):
    """
    Perform nested cross-validation (for sklearn ML models).

    The outer loop is Monte-Carlo based: every iteration draws a new
    shuffled, stratified 80/20 train/test split. The inner loop is a
    5-fold grid search on the training part. The best estimator it finds
    is then evaluated on the held-out test part.

    :param classifier: classifier of interest
    :param param_grids: dictionary combination of parameters to be
        tried for hyperparameter search.
    :param X: the X feature
    :param y: the y feature
    :param n_iter: the number of iteration in the nested cv

    :return: tuple (list_acc, list_cm, list_f1, list_cv_results). Each
        element is a list with one entry per outer iteration:
        test accuracy, confusion matrix (rows = actual labels,
        columns = predicted labels), macro-averaged F1 score and the
        cv_results_ of the hyperparameter search.
    """

    list_acc = []
    list_f1 = []
    list_cv_results = []
    list_cm = []

    ## Perform nested cross-validation (outer CV = monte-carlo based)
    for _ in range(n_iter):
        # Split into training and testing
        X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y,
                                                           shuffle=True, test_size=0.2)

        # Inner CV: search for the best model
        clf = GridSearchCV(classifier, param_grids, cv=5)
        clf.fit(X_train, y_train)
        cv_results = clf.cv_results_

        # Evaluate on testing using the best model.
        # The sklearn metrics take (y_true, y_pred) in that order. The order
        # matters for the confusion matrix: swapping the arguments transposes
        # it, which would put predictions on the rows instead of the columns.
        y_pred = clf.best_estimator_.predict(X_test)
        acc = accuracy_score(y_test, y_pred)
        f1 = f1_score(y_test, y_pred, average="macro")
        cm = confusion_matrix(y_test, y_pred)

        # Store result
        list_acc.append(acc)
        list_cm.append(cm)
        list_f1.append(f1)
        list_cv_results.append(cv_results)

    return list_acc, list_cm, list_f1, list_cv_results

In [None]:
def generate_dict_results():
    """
    Generate the dictionary used to store results for each feature set.

    :return: dictionary mapping "accuracy", "f1", "cv_results" and "cm"
        to a new empty list each
    """

    result_names = ("accuracy", "f1", "cv_results", "cm")

    # A comprehension gives every key its own list object.
    return {name: [] for name in result_names}

In [None]:
def preprocess_x_y(filenames):
    """
    Given the format from the dataset, this function split
    the image into X and y respectively. Note that we combine
    front and rear view of the shells as a single image.

    The filenames are consumed in consecutive pairs (positions 0-1, 2-3,
    ...), so the two views of a shell must be adjacent in the list.
    The label is everything in the first filename of a pair that comes
    before the separator character preceding its first digit.

    :param filenames: list of filenames inside the list with
        format = species_idx_A/B.jpg

    :return X: the feature (tuple A, B)
    :return y: the label

    :raises ValueError: if the number of filenames is odd, or if the first
        filename of a pair has no species prefix before its first digit.
    """

    if len(filenames) % 2 != 0:
        raise ValueError(
            "filenames must contain an even number of entries (A/B pairs), "
            "got {}".format(len(filenames))
        )

    X_species = []
    y_species = []

    for view_a, view_b in zip(filenames[::2], filenames[1::2]):
        # Find the digit index
        first_digit = re.search(r"\d", view_a)
        if first_digit is None or first_digit.start() == 0:
            raise ValueError(
                "filename {!r} does not match the format "
                "species_idx_A/B.jpg".format(view_a)
            )

        X_species.append((view_a, view_b))

        # The character right before the index is the separator ("_"),
        # so it is excluded from the label as well.
        y_species.append(view_a[:first_digit.start() - 1])

    return X_species, y_species

In [None]:
def apply_mask(grayscale_img):
    """
    Compute a binary mask of the shell from a grayscale image.

    The image is thresholded with cv2.THRESH_BINARY using a threshold
    of 1 and a maximum value of 255. Since the background of the image
    is already black due to the flood fill algorithm, masking might not
    be necessary.

    :param grayscale_img: the grayscale image to threshold

    :return: the mask produced by cv2.threshold
    """

    threshold, max_value = 1, 255
    thresholded = cv2.threshold(grayscale_img, threshold, max_value,
                                cv2.THRESH_BINARY)

    # cv2.threshold returns a pair; the mask is its second element.
    return thresholded[1]

In [2]:
"""
Save and load pickle objects.
"""

def save_object(obj, filename):
    """
    Pickle an object into a file.

    :param obj: the object to serialize
    :param filename: path of the file to write (opened in binary mode,
        an existing file is overwritten)
    """

    with open(filename, "wb") as pickle_file:
        pickle.dump(obj, pickle_file)

def load_object(filename):
    """
    Load an object from a pickle file.

    Unpickling can execute arbitrary code, so this must only be used on
    files from a trusted source.

    :param filename: path of the pickle file to read

    :return: the unpickled object
    """

    with open(filename, "rb") as pickle_file:
        restored = pickle.load(pickle_file)

    return restored

In [None]:
def build_histograms(dictionary, vocabulary_size, X, feature_extractor=None):
    """
    Build histogram depending on the vocabulary (dictionary)
    and data given.

    For every sample the descriptors of both images are pooled, each
    descriptor is assigned to its closest vocabulary entry (Euclidean
    distance) and the number of assignments per entry is counted.

    :param dictionary: the vocabulary, one row per visual word, with the
        same number of columns as the descriptors
    :param vocabulary_size: length of each histogram
    :param X: iterable of samples; each sample holds the filenames of its
        two images (positions 0 and 1), relative to SHELL_IMAGES_DATA
    :param feature_extractor: object providing detectAndCompute(image, mask).
        When None (default) a SIFT extractor is created.

    :return: list with one histogram (array of length vocabulary_size)
        per sample
    """

    if feature_extractor is None:
        # The extractor is created here rather than in the signature: a
        # default argument is evaluated when the function is defined, which
        # would break loading this file on OpenCV builds lacking the
        # requested factory. Newer OpenCV releases expose SIFT in the main
        # module; older contrib builds only have it under xfeatures2d.
        if hasattr(cv2, "SIFT_create"):
            feature_extractor = cv2.SIFT_create()
        else:
            feature_extractor = cv2.xfeatures2d.SIFT_create()

    # Storage
    histograms = [] # list of bovw features for each image

    for x in tqdm(X):
        sample_descriptors = []

        # x[0] is the A position, x[1] the B position
        for filename in (x[0], x[1]):
            # Open the image in gray scale
            image = cv2.imread(os.path.join(SHELL_IMAGES_DATA, filename),
                               cv2.IMREAD_GRAYSCALE)

            _, descriptors = feature_extractor.detectAndCompute(image, None)

            # No descriptors are returned (None) when no keypoint is found
            if descriptors is not None:
                sample_descriptors.append(descriptors)

        # Build histogram (stays all zeros if the sample has no descriptor)
        features = np.zeros(vocabulary_size)

        if sample_descriptors:
            descriptors = np.concatenate(sample_descriptors)

            # Calculate distance to each vocab symbol
            dist = cdist(descriptors, dictionary, "euclidean")

            # get the symbolno for the closest symbol
            cluster_assignment = np.argmin(dist, axis=1)

            for assign in cluster_assignment:
                features[assign] += 1

        histograms.append(features)

    return histograms

In [None]:
def build_histograms_brief(dictionary, vocabulary_size, X):
    """
    Build histogram depending on the vocabulary (dictionary)
    and data given, using STAR keypoints and BRIEF descriptors.

    For every sample the descriptors of both images are pooled, each
    descriptor is assigned to its closest vocabulary entry (Euclidean
    distance) and the number of assignments per entry is counted.

    :param dictionary: the vocabulary, one row per visual word, with the
        same number of columns as the descriptors
    :param vocabulary_size: length of each histogram
    :param X: iterable of samples; each sample holds the filenames of its
        two images (positions 0 and 1), relative to SHELL_IMAGES_DATA

    :return: list with one histogram (array of length vocabulary_size)
        per sample
    """

    # Storage
    histograms = [] # list of bovw features for each image

    # OpenCV 3+ removed the string-based factories and moved STAR/BRIEF
    # to the contrib module xfeatures2d. The legacy calls are kept as a
    # fallback for OpenCV 2.4.
    if hasattr(cv2, "xfeatures2d"):
        star = cv2.xfeatures2d.StarDetector_create()
        brief = cv2.xfeatures2d.BriefDescriptorExtractor_create()
    else:
        star = cv2.FeatureDetector_create("STAR")
        brief = cv2.DescriptorExtractor_create("BRIEF")

    for x in tqdm(X):
        sample_descriptors = []

        # x[0] is the A position, x[1] the B position
        for filename in (x[0], x[1]):
            # Open the image in gray scale
            image = cv2.imread(os.path.join(SHELL_IMAGES_DATA, filename),
                               cv2.IMREAD_GRAYSCALE)

            # Detect keypoints with STAR, then describe them with BRIEF
            keypoints = star.detect(image, None)
            _, descriptors = brief.compute(image, keypoints)

            # No descriptors are returned (None) when no keypoint is found
            if descriptors is not None:
                sample_descriptors.append(descriptors)

        # Build histogram (stays all zeros if the sample has no descriptor)
        features = np.zeros(vocabulary_size)

        if sample_descriptors:
            descriptors = np.concatenate(sample_descriptors)

            # Calculate distance to each vocab symbol
            dist = cdist(descriptors, dictionary, "euclidean")

            # get the symbolno for the closest symbol
            cluster_assignment = np.argmin(dist, axis=1)

            for assign in cluster_assignment:
                features[assign] += 1

        histograms.append(features)

    return histograms

In [None]:
def extract_canny_edge(path, sigma=0.33):
    """
    Extract the edges of an image with the zero-parameter, automatic
    Canny edge detector.

    The two Canny thresholds are not tuned by hand: they are placed
    around the median intensity of the blurred grayscale image, at
    (1 - sigma) * median and (1 + sigma) * median, clamped to [0, 255].

    :param path: the image path.
    :param sigma: the tightness level of threshold.

    :return: the edge image produced by cv2.Canny
    """

    # Read image, convert it to grayscale, and apply Gaussian blur
    grayscale = cv2.cvtColor(cv2.imread(path), cv2.COLOR_BGR2GRAY)
    blurred = cv2.GaussianBlur(grayscale, (3, 3), 0)

    # Derive both thresholds from the median of the blurred image
    median_intensity = np.median(blurred)
    low_threshold = int(max(0, (1.0 - sigma) * median_intensity))
    high_threshold = int(min(255, (1.0 + sigma) * median_intensity))

    return cv2.Canny(blurred, low_threshold, high_threshold)

In [None]:
def plot_canny(path, sigma=0.33):
    """
    Show the edges that the automatic Canny detector finds in an image.

    :param path: the image path.
    :param sigma: the tightness level of threshold, forwarded to
        extract_canny_edge.
    """

    # Small 3x3 figure showing the edge image in grayscale
    plt.figure(figsize=(3, 3))
    plt.imshow(extract_canny_edge(path, sigma=sigma), cmap="gray")

    plt.show()