In [30]:
import ima_utils
from sklearn.model_selection import train_test_split
import numpy as np
from sklearn.decomposition import PCA, FastICA, NMF, DictionaryLearning

In [16]:
# Globals

# Forwarded as `shuffle=` when picking which training images to load.
_SHUFFLE = True
# Seed handed to estimators as `random_state` (e.g. the NMF fits).
_SEED = 0
# When True, the labels are restricted to an age window before any further processing.
_LIMITS = True

In [17]:
# Locations of the raw inputs (face images, label table, file-name list)
# and of the pre-computed arrays.

_data_root = "../data/"
faces_path = _data_root + "Faces/"
labels_path = _data_root + "labels.csv"
names_path = _data_root + "filenames.txt"

# All of the observed data saved in a .npy file. See from_face_images_to_npy.py
data_set_gray_npy = "data_gray.npy"
# Unused
data_set_rgb_npy = "data_rgb.npy"

In [18]:
# Build the labels table (a pandas object) from the labels CSV and the
# file-name list via the project helper.
labels = ima_utils.get_labels_df(
    labels_path=labels_path,
    names_path=names_path,
)

In [19]:
# Keep only subjects inside an age window (both bounds inclusive).
# The restriction is there for legal reasons and to leave out babies;
# reducing the amount of data is not the goal.

if _LIMITS:
    age_min, age_max = 18, 80
    labels = labels.loc[labels.age.between(age_min, age_max)]

In [20]:
# Put age in bins

age_bin_width_years = 25
# Integer division: the number of complete bin widths lived (0, 1, 2, ...).
ages = labels.age.values // age_bin_width_years
# Build a new frame instead of mutating in place. `labels` may be a `.loc`
# slice of the original table (when the age limits are applied), and in-place
# drop/assignment on such a slice raises SettingWithCopyWarning.
# The resulting columns are the same: `age` removed, `age_bin` appended last.
labels = labels.drop(columns=['age']).assign(age_bin=ages)

In [25]:
# Split the whole image data set into train and test sets. Only the train set is used afterwards.

# Split the label index values; the split is seeded with _SEED so that the
# same train/test partition is produced on every run of the notebook.
train_dataset_idx, test_dataset_idx = train_test_split(
    labels.index.to_list(),
    random_state=_SEED,
)

X = np.load(data_set_gray_npy)  # Load my dataset (full)

# NOTE(review): the label index values are used here as positional row
# indices into X — this assumes the labels' index matches the row order of
# the saved array; confirm against from_face_images_to_npy.py.
data_train, data_test = X[train_dataset_idx], X[test_dataset_idx]
labels_train, labels_test = labels.loc[train_dataset_idx], labels.loc[test_dataset_idx]
# After the reset, row i of labels_train lines up with row i of data_train
# (the previous index is kept as a regular column).
labels_train = labels_train.reset_index()
labels_test = labels_test.reset_index()
del X, labels  # to avoid accessing the test set and also to save ram

In [26]:
# Read the image height and width from one image, loaded as grayscale.
# NOTE(review): the second argument (0) presumably selects which image to
# open — confirm in ima_utils.

h, w = ima_utils.get_dimensions_from_an_image(
    faces_path,
    0,
    as_gray=True,
)

In [27]:
# Choose which rows of the training set to work with.
# NOTE(review): "random" is passed where a label name is expected; presumably
# the helper treats it as "ignore labels and sample at random" — confirm in ima_utils.

n_elements_from_label = 1000
label_to_choose_from = "random"
images_to_load = ima_utils.pick_n_from_label(
    labels_train,
    n_elements_from_label,
    label_to_choose_from,
    shuffle=_SHUFFLE,
)


In [28]:
# Restrict the training data and its labels to the rows chosen in images_to_load.

# The name X is reused for this working subset; the full array that used to
# carry the name was deleted right after the train/test split.
X = data_train[images_to_load, :]

labels_loaded = labels_train.loc[images_to_load].reset_index()

# One target vector per attribute, in the same row order as X.
y_age, y_race, y_gender = (
    labels_loaded[column].values for column in ("age_bin", "race", "gender")
)

In [31]:
# Find, for a decomposition method (here NMF), the number of components at
# which adding one more component changes the reconstruction loss by less
# than `tolerance` (relative to the previous loss).

max_iter = 500
tolerance = 0.01
# Hard stop for the search so the loop cannot run forever if the relative
# change never drops below the tolerance.
# NOTE(review): assumes X is 2-D (samples x features), so min(X.shape) is the
# largest useful number of components — confirm.
max_num_comp = min(X.shape)

rel_loss = 100
# Baseline the first fit is compared against.
# NOTE(review): presumably the score is normalised so that 1 means "nothing
# reconstructed" — confirm in ima_utils.
loss = 1

num_comp = 1
i = 1
while rel_loss >= tolerance and num_comp < max_num_comp:
    print(f'\nRun {i}')
    num_comp += 1
    i += 1
    nmf = NMF(n_components=num_comp, random_state=_SEED, init='random', max_iter=max_iter).fit(X)
    new_loss = ima_utils.get_decomposition_reconstruction_error_score(nmf, X)
    print(f"Loss {new_loss}")
    if loss == 0:
        # The previous fit was already exact; avoid dividing by zero and stop.
        rel_loss = 0.0
    else:
        rel_loss = np.abs(new_loss - loss) / loss
    print(f"Relative change in loss {rel_loss}")
    loss = new_loss

if rel_loss >= tolerance:
    # The loop ended because of the cap, not because the loss levelled off.
    print(f'Stopped at the maximum of {max_num_comp} components without reaching tolerance {tolerance}')

print(f'NMF number of components {num_comp} wit loss: {loss}')



Run 1


KeyboardInterrupt: 