In [None]:
import sys
sys.path.append('../utils/')

In [None]:
from ImageUtils import *

In [None]:
import numpy as np
import pandas as pd # Needs the package Pandas to be installed. Check Anaconda Environments and Packages.
from sklearn.decomposition import PCA # Needs SciKit Learn package to be installed. Check Anaconda Environments and Packages.4
from sklearn.covariance import LedoitWolf
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, classification_report
import matplotlib.pyplot as plt
from scipy.spatial.distance import mahalanobis
from collections import Counter
from sklearn.preprocessing import label_binarize
import time
from scipy.cluster.hierarchy import linkage, dendrogram
import seaborn as sns

# DATASET FACES 94

## Data preprocessing

Before the modeling and evaluation stages, the image dataset has to be prepared. The image features must be standardized, both for the initial dataset (codename faces94) and for the external images used later in testing, under the following conditions:

**General Images Characteristics**:

* File format: `*.jpg`.
* Images in grayscale.
* Size 180x200 for the images in the dataset (codename faces94).

**Activities**:

* Organize the images into a short list to prepare a new dataset.
* Exclude from the dataset all files that are not in `*.jpg` format.
* With the OpenCV library for Python, load the images and store their pixel matrices in arrays for numeric processing.
* With the OpenCV library for Python, convert the images to grayscale and resize them to 180x200.
* The outcome is a new dataset of images suitable for testing and modeling face recognition. The Eigenfaces model then applies Principal Component Analysis (PCA) to represent the face images in a lower-dimensional space.


In [None]:
face94_male = readFaces94MaleFaces(gray=True)
plt.imshow(face94_male[0], plt.cm.gray);

In [None]:
N, height, width = face94_male.shape

In [None]:
labels_faces = np.ones(N)

# Mean face

In [None]:
# Pixel-wise mean over all male faces: flatten each image to a row, average
# down the rows, then fold the result back into image shape for display.
mean_face = face94_male.reshape(N, -1).mean(axis=0).reshape(height, width)
plt.imshow(mean_face, cmap=plt.cm.gray);

# Median face

In [None]:
# Pixel-wise median over all male faces, computed on the flattened images and
# reshaped back to (height, width) for display.
median_face = np.median(face94_male.reshape(N, -1), axis=0).reshape(height, width)
plt.imshow(median_face, cmap=plt.cm.gray);

# Images of natural landscapes

The landscape images were obtained from the [ImageNet database](http://image-net.org/); 
the image URLs are listed [online](http://image-net.org/api/text/imagenet.synset.geturls?wnid=n13104059). We use the cv2 package to read and resize the images, and then build a NumPy array of the grayscale images.

In [None]:
landscapes = np.array(readLandsCapeImage(gray=True)) # Read dataset
plt.imshow(landscapes[45], plt.cm.gray); # show image

In [None]:
labels_landscapes = np.zeros(landscapes.shape[0])

In [None]:
dataset = np.vstack((face94_male, landscapes))
plt.imshow(dataset[-1], plt.cm.gray);

In [None]:
labels = np.concatenate((labels_faces, labels_landscapes))

In [None]:
dataset_N, height, width = dataset.shape

In [None]:
mean_with_noise = np.mean(dataset.reshape(dataset_N, height*width), axis=0).reshape(height, width)
plt.imshow(mean_with_noise, cmap=plt.cm.gray);

In [None]:
# Show the mean of faces+landscapes next to the mean of the faces alone and
# print how far apart the two images are.
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(8, 10))
ax1.set_title("mean with noise")
ax1.imshow(mean_with_noise, cmap=plt.cm.gray)
ax2.set_title("mean")
ax2.imshow(mean_face, cmap=plt.cm.gray)
# Both operands are 2-D images. For a 2-D input, ord=2 is the spectral norm
# (largest singular value), not a pixel-wise distance. The Frobenius norm is
# the Euclidean distance between the two flattened images.
Dis = np.linalg.norm(mean_with_noise - mean_face, ord='fro')
print("Distance " + str(Dis))

In [None]:
# Pixel-wise median of faces and landscapes together, reshaped for display.
median_with_noise = np.median(dataset.reshape(dataset_N, -1), axis=0).reshape(height, width)
plt.imshow(median_with_noise, cmap=plt.cm.gray);

In [None]:
# Show the median of faces+landscapes next to the median of the faces alone
# and print how far apart the two images are.
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(8, 10))
ax1.set_title("median with noise")
ax1.imshow(median_with_noise, cmap=plt.cm.gray)
ax2.set_title("median")
ax2.imshow(median_face, cmap=plt.cm.gray)
# Both operands are 2-D images. For a 2-D input, ord=2 is the spectral norm
# (largest singular value), not a pixel-wise distance. The Frobenius norm is
# the Euclidean distance between the two flattened images.
Dis = np.linalg.norm(median_with_noise - median_face, ord='fro')
print("Distance " + str(Dis))

# Median face as a real image

In [None]:
# One flattened image per row.
A = dataset.reshape(dataset_N, -1)

In [None]:
# For every image, add up its Euclidean distances to all images in the
# dataset. The image with the smallest total is the "median face as a real
# image" selected in the next cell.
dist_1 = []
start_time = time.time()
for i in range(A.shape[0]):
    # Compute the differences directly in float64. If the pixels are stored as
    # unsigned 8-bit integers (TODO confirm against the ImageUtils readers),
    # an integer subtraction would wrap around modulo 256 and corrupt every
    # distance.
    diff = np.subtract(A[i], A, dtype=np.float64)
    # Row norms = distance from image i to each image; their sum is the score.
    dist_1.append(np.linalg.norm(diff, ord=2, axis=1).sum())

total_time = time.time() - start_time

print("Time elapsed for operation: {}".format(total_time))

In [None]:
Min_1=np.argmin(np.array(dist_1))

In [None]:
plt.imshow(dataset[Min_1],plt.cm.gray)

## Show atypical data distances

In [None]:
distance_info = getNormsAndDistanceInfoFromBaseImage(base_image=mean_with_noise, array_images=dataset, labels=labels)
visualizeOutlierInfo(distance_info,labels)

In [None]:
print(distance_info['falsitude_metrics'])

# Show atypical data distances (outliers interquartile range)

In [None]:
visualizeOutlierInfo2(distance_info,dataset,labels)

In [None]:
print(distance_info['falsitude_metrics_iqr'])

# Norms: Norm1.0  Norm2.0  Norm3.0  Norminf  Norm2.5  Norm0.71

In [None]:
# Draw every image flagged as an outlier under the selected norm in a grid
# that is `cols` images wide.
selected_norm = "Norminf"
cols = 6
outlier_indices = distance_info["outliers"][selected_norm]["indices"]
rows = int(np.ceil(outlier_indices.shape[0] / cols))
# NOTE(review): matplotlib's figsize is given in inches, so (180, 200) asks
# for a very large canvas. Confirm this was not meant as the pixel size.
plt.figure(figsize=(180,200))
for i, image_index in enumerate(outlier_indices):
    plt.subplot(rows, cols, i + 1)
    plt.imshow(dataset[image_index], plt.cm.gray)

In [None]:
# Draw the IQR outliers of the selected norm, largest distance first.
selected_norm = "Norm1.0"
selected_outliers = "outliersiqr"
Ind = distance_info[selected_outliers][selected_norm]['indices']
# Sort (distance, index) pairs ascending. An empty outlier set is handled
# explicitly because unpacking zip(*[]) into two names raises ValueError.
pairs = sorted(zip(distance_info["norms"][selected_norm][Ind], Ind)) if len(Ind) else []
if pairs:
    Distance, Ind = zip(*pairs)
else:
    Distance, Ind = (), ()
cols = 6
rows = int(np.ceil(len(Ind)/cols))
# NOTE(review): matplotlib's figsize is given in inches, so (180, 200) asks
# for a very large canvas. Confirm this was not meant as the pixel size.
plt.figure(figsize=(180,200))
# Walk the ascending tuple backwards so the largest distance is plotted first.
for i, image_index in enumerate(reversed(Ind)):
    plt.subplot(rows, cols, i+1)
    plt.imshow(dataset[image_index], plt.cm.gray)

In [None]:
# Rebuild the working set with four classes. From here on `dataset`, `labels`
# and `dataset_N` refer to this larger collection and replace the
# faces-vs-landscapes versions defined earlier in the notebook.
faces94_male = readFaces94MaleFaces(gray=True)
faces94_female = readFaces94FemaleFaces(gray=True)
faces94_malestaff = readFaces94MaleStaffFaces(gray=True)
landscapes = np.array(readLandsCapeImage(gray=True))

# (image stack, class code) pairs, in the order the rows land in `dataset`:
# 1 = male, 2 = female, 3 = male staff, 0 = landscape.
labelled_groups = [
    (faces94_male, 1.0),
    (faces94_female, 2.0),
    (faces94_malestaff, 3.0),
    (landscapes, 0.0),
]

dataset = np.vstack([images for images, _ in labelled_groups])

labels = np.concatenate(
    [np.full(images.shape[0], code) for images, code in labelled_groups]
)

dataset_N, height, width = dataset.shape

In [None]:
# Pixel-wise mean over the four-class dataset, shown as an image.
mean_all = dataset.reshape(dataset_N, -1).mean(axis=0).reshape(height, width)
plt.imshow(mean_all, cmap=plt.cm.gray)

# Principal component analysis (PCA)

In [None]:
# Scale the pixel values by 1/255.
dataset_norm = np.true_divide(dataset, 255)
# np.cov treats each ROW as a variable by default (rowvar=True). With one
# flattened image per row, the result is a dataset_N x dataset_N matrix of
# covariances between images, each image centred on its own pixel mean.
# NOTE(review): the PCA fitted further down centres per pixel instead, so the
# spectrum of this matrix may not match PCA's explained variance. Confirm
# that this is the intended matrix.
dataset_norm_cov = np.cov(dataset_norm.reshape(dataset_N, -1))

# Determinant of that matrix (displayed as the cell output).
np.linalg.det(dataset_norm_cov)

In [None]:
u,s,vh = np.linalg.svd(dataset_norm_cov)

In [None]:
s

In [None]:
representation_percentage = 0.80

In [None]:
# Pick the smallest number of leading singular values whose share of the
# total reaches `representation_percentage`.
sum_eig = np.sum(s)
percentage_variance = np.divide(s, sum_eig)
cumulative_variance = np.cumsum(percentage_variance)
# searchsorted gives the first position where the running total is >= the
# threshold; +1 turns that position into a count. A step-by-step loop that
# tests the threshold only at the start of each iteration leaves the count at
# 0 when the threshold is reached only by the very last value. Clamping to
# the number of values covers that case, and rounding just below the
# threshold, by keeping everything.
num_var = int(np.searchsorted(cumulative_variance, representation_percentage, side="left")) + 1
num_var = min(num_var, percentage_variance.shape[0])

num_var

In [None]:
# Fit a PCA with `num_var` components on the flattened images. Note that the
# fit uses the unscaled `dataset`, not `dataset_norm`.
pca = PCA(n_components=num_var, svd_solver='full')
pca.fit(dataset.reshape(dataset_N, -1))
# One row per principal component, one column per pixel.
pca.components_.shape

In [None]:
# Show every principal component (eigenface) as an image in a grid that is
# `cols` wide.
cols = 6
n_eigenfaces = pca.components_.shape[0]
# Round the row count UP so the last, partially filled row is kept. Rounding
# down would drop the trailing components and show nothing at all when there
# are fewer than `cols` of them.
rows = int(np.ceil(n_eigenfaces / cols))
# NOTE(review): matplotlib's figsize is given in inches, so (180, 200) asks
# for a very large canvas. Confirm this was not meant as the pixel size.
plt.figure(figsize=(180,200))
for i, component in enumerate(pca.components_):
    plt.subplot(rows, cols, i + 1)
    plt.imshow(component.reshape(height, width), plt.cm.gray)

In [None]:
# Sanity check: the mean stored by the fitted PCA should coincide with the
# pixel-wise mean computed directly with numpy. Note that `mean_face` is
# re-bound here to the PCA mean of the four-class dataset.
mean_face = pca.mean_.reshape(height, width)
mean_face2 = np.mean(dataset.reshape(dataset_N, height*width), axis=0).reshape(height, width)
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(8, 10))
ax1.set_title("PCA mean")
ax1.imshow(mean_face, cmap=plt.cm.gray)
ax2.set_title("np mean")
ax2.imshow(mean_face2, cmap=plt.cm.gray)
# Both operands are 2-D images. For a 2-D input, ord=2 is the spectral norm
# (largest singular value), not a pixel-wise distance. The Frobenius norm is
# the Euclidean distance between the two flattened images.
Dis = np.linalg.norm(mean_face - mean_face2, ord='fro')
print("Distance " + str(Dis))

In [None]:
dataset_projected = pca.transform(dataset.reshape(dataset_N, height*width))

In [None]:
dataset_projected.shape

In [None]:
# Pick one image at random and compare it with its reconstruction from the
# PCA projection.
image_index = np.random.randint(0, high=dataset_N, size=1)[0]
# PCA.transform subtracts pca.mean_ before projecting, so mapping back to
# pixel space needs the mean added again. Without it the plot shows only the
# deviation from the mean face, not the reconstructed image.
example_image = np.matmul(dataset_projected[image_index], pca.components_) + pca.mean_
original_image = dataset[image_index]
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(8, 10))
ax1.set_title("Original Image")
ax1.imshow(original_image, cmap=plt.cm.gray)
ax2.set_title("Reconstructed Image")
ax2.imshow(example_image.reshape(height, width), cmap=plt.cm.gray)

In [None]:
# Stratified 70/30 split of the projected images. random_state is fixed so
# the split, and every metric computed from it below, is the same on each
# "Restart & Run All".
X_train, X_test, y_train, y_test = train_test_split(
    dataset_projected, labels, test_size=0.3, stratify=labels, random_state=0
)

In [None]:
classifier = LogisticRegression(solver='newton-cg', multi_class='multinomial')
classifier.fit(X_train, y_train)

In [None]:
y_train_pred = classifier.predict(X_train)
y_test_pred = classifier.predict(X_test)

In [None]:
accuracy_score(y_true=y_test, y_pred=y_test_pred)

In [None]:
precision_score(y_true=y_test, y_pred=y_test_pred, average=None)

In [None]:
print(classification_report(y_true=y_test, y_pred=y_test_pred, target_names=["landscape", "man", "woman", "man_staff"]))

In [None]:
# Confusion matrix of actual vs. predicted classes, drawn as a heatmap.
plt.figure()
plt.title("Heatmap")
classes_df = pd.DataFrame({
    'Actual': y_test.tolist(),
    'Predicted': y_test_pred.tolist(),
})
conf_matrix = pd.crosstab(
    classes_df['Actual'],
    classes_df['Predicted'],
    rownames=['Actual'],
    colnames=['Predicted'],
)
ax = sns.heatmap(conf_matrix, annot=True, cmap='Blues', fmt='.0f');
# Flip both axes after drawing.
ax.invert_yaxis()
ax.invert_xaxis()

In [None]:
label_binarize(labels, classes=[0,1,2,3])

In [None]:
linkage_matrix = linkage(dataset_projected)

In [None]:
# Dendrogram of the clustering, truncated with truncate_mode='level' and p=120.
plt.figure(figsize=(25, 10))
plt.title('Hierarchical Clustering Dendrogram')
plt.xlabel('sample index')
plt.ylabel('distance')
dendrogram(linkage_matrix, p=120, truncate_mode='level')
plt.show()

# Check parametric distribution of distances


We evaluate whether the distances can be fitted with a parametric distribution. For this purpose we compute the chi-squared statistic (smaller values indicate a better fit) and run the Kolmogorov-Smirnov test (p-values greater than 0.05 indicate a good fit).

In [None]:
import warnings
warnings.filterwarnings("ignore")

In [None]:
dist=distance_info['norms']['Norm1.0']
check_parametricDistribu_distances(dist)

# Mahalanobis distance

The Mahalanobis distance has the form $d_{i,j}=[(x_i-x_j)'M^{-1}(x_i-x_j)]^{\frac{1}{2}}$, where $M^{-1}$ is the inverse of the covariance matrix. To obtain $M^{-1}$, the Ledoit-Wolf estimator is used.

In [None]:
# Standardise every pixel (column) to zero mean and unit variance across images.
X = dataset.reshape(dataset_N, height*width)
pixel_mean = X.mean(axis=0)
pixel_std = X.std(axis=0)
# A pixel with the same value in every image has zero standard deviation;
# dividing by it would give 0/0 = NaN. Dividing by 1 instead leaves that
# column at its centred value of 0.
pixel_std[pixel_std == 0] = 1.0
X_norm = np.divide(np.subtract(X, pixel_mean), pixel_std)

In [None]:
# Ledoit-Wolf shrinkage estimate of the covariance of the standardised pixels.
# assume_centered=True tells the estimator not to re-centre the data; X_norm
# already had its column means subtracted when it was built.
lw = LedoitWolf(store_precision=True, assume_centered=True)
# NOTE(review): scikit-learn's fit() returns the estimator itself, so `cov_all`
# is the fitted LedoitWolf object rather than a covariance matrix. Confirm
# how later cells use it.
cov_all = lw.fit(X_norm)
# Inverse covariance (precision) matrix, kept because store_precision=True.
inv_cov_all = lw.precision_