# PCA

In [1]:
# Mount Google Drive in the Colab runtime; the registry and image paths
# configured in this notebook all live under /content/gdrive/MyDrive.
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


params.py

In [None]:
# Root folder on the mounted drive that holds every artefact of this pipeline.
_OUTPUT_ROOT = '/content/gdrive/MyDrive/Bootcamp_ENAP_2022/new_output'

# Where pickled PCA models and their parameters are written / read back.
LOCAL_REGISTRY_PATH = _OUTPUT_ROOT + '/model'
# Where the preprocessed images used for fitting are read from.
LOCAL_DATA_PATH_OUTPUT_IMG = _OUTPUT_ROOT + '/processed_img'

# Number of images per chunk (passed as batch_size to the image dataset loader).
CHUNK_SIZE = 10000
# n_components of the IncrementalPCA; chunks smaller than this are skipped.
PCA_COMPONENTS = 400
# batch_size given to the IncrementalPCA constructor.
PCA_BATCH_SIZE = 256
# Target image size, used as image_size=(AUTOENCODER_HEIGHT, AUTOENCODER_WIDTH).
AUTOENCODER_WIDTH = 160
AUTOENCODER_HEIGHT = 192

registry.py

In [None]:
import os
import time
import pickle
import glob
from sklearn.decomposition import IncrementalPCA

def load_pca(elected: bool, bw: bool, save_copy_locally=False) -> "IncrementalPCA | None":
    """
    load the latest saved pca from the local registry, return None if no model found

    The model is looked up in
    LOCAL_REGISTRY_PATH/<bw|color>/<elected|not_elected>/models/pca and the
    file whose path sorts last is unpickled (save_pca names files with a
    "%Y%m%d-%H%M%S" timestamp, so the last path is the most recent save).

    Args:
        elected: select the 'elected' (True) or 'not_elected' (False) registry branch.
        bw: select the 'bw' (True) or 'color' (False) registry branch.
        save_copy_locally: accepted for interface compatibility; not used here.

    Returns:
        The unpickled object, or None when the directory holds no file.
    """

    print("\nLoad pca from local disk...")

    # get latest model version
    model_directory = os.path.join(LOCAL_REGISTRY_PATH,
        'bw' if bw else 'color',
        'elected' if elected else 'not_elected',
        'models', 'pca')

    # keep regular files only: a stray sub-directory cannot be opened/unpickled
    candidates = sorted(
        path for path in glob.glob(f"{model_directory}/*") if os.path.isfile(path)
    )
    if not candidates:
        # nothing to load: show where we looked so the caller can check the path
        print(model_directory)
        return None

    model_path = candidates[-1]
    print(f"- path: {model_path}")

    # NOTE(security): pickle.load executes arbitrary code embedded in the file;
    # only load registry files written by this project (see save_pca).
    with open(model_path, "rb") as file:
        pca = pickle.load(file)

    # announce success only once the model has actually been deserialized
    print("\n✅ model loaded from disk")
    return pca

def save_pca(pca: IncrementalPCA, params: dict, elected: bool, bw: bool) -> None:
    """
    persist the trained pca and its params to the local registry

    Both objects are pickled under
    LOCAL_REGISTRY_PATH/<bw|color>/<elected|not_elected>/, in the
    'params/pca' and 'models/pca' sub-folders respectively. The two files
    share the same "<%Y%m%d-%H%M%S>.pickle" name so they can be matched later.

    Args:
        pca: fitted model to pickle; skipped when None.
        params: parameters to pickle; skipped when None.
        elected: select the 'elected' (True) or 'not_elected' (False) registry branch.
        bw: select the 'bw' (True) or 'color' (False) registry branch.
    """

    timestamp = time.strftime("%Y%m%d-%H%M%S")
    file_name = timestamp + ".pickle"

    print("\nSave pca to local disk...")

    # registry branch shared by the params and the model
    registry_branch = os.path.join(LOCAL_REGISTRY_PATH,
        'bw' if bw else 'color',
        'elected' if elected else 'not_elected')

    # save params
    if params is not None:
        params_path = os.path.join(registry_branch, 'params', 'pca')
        # exist_ok avoids the check-then-create race of os.path.exists + makedirs
        os.makedirs(params_path, exist_ok=True)
        print(f"- params path: {params_path}")
        with open(os.path.join(params_path, file_name), "wb") as file:
            pickle.dump(params, file)

    # save model
    if pca is not None:
        model_path = os.path.join(registry_branch, 'models', 'pca')
        os.makedirs(model_path, exist_ok=True)
        print(f"- model path: {model_path}")
        with open(os.path.join(model_path, file_name), "wb") as file:
            pickle.dump(pca, file)

    print("\n✅ data saved locally")

    return None


model.py

In [None]:
from sklearn.decomposition import IncrementalPCA

def initialize_pca(PCA_BATCH_SIZE:int, PCA_COMPONENTS:int) -> IncrementalPCA:
    """
    build a new, unfitted IncrementalPCA

    Args:
        PCA_BATCH_SIZE: forwarded as the estimator's batch_size.
        PCA_COMPONENTS: forwarded as the estimator's n_components.

    Returns:
        The freshly constructed IncrementalPCA.
    """
    estimator_settings = {
        "n_components": PCA_COMPONENTS,
        "batch_size": PCA_BATCH_SIZE,
    }
    return IncrementalPCA(**estimator_settings)


main.py

In [None]:
import os
from tensorflow.keras.utils import image_dataset_from_directory
from tensorflow.errors import NotFoundError
from tensorflow.keras.layers import Rescaling, Flatten

def fit_pca(elected=True, bw=True) -> None:
    """
    Fit a new pca on the full (already preprocessed) image set ITERATIVELY, by loading it
    chunk-by-chunk and updating the pca after each chunk, then save the final pca.

    Images are read from LOCAL_DATA_PATH_OUTPUT_IMG/<elected|not_elected>/<bw|color>,
    resized to (AUTOENCODER_HEIGHT, AUTOENCODER_WIDTH), rescaled by 1/255 and flattened
    to one row per image. Chunks hold up to CHUNK_SIZE images. No validation metrics
    are computed.

    Args:
        elected: use the 'elected' (True) or 'not_elected' (False) image folder.
        bw: use the 'bw' (True) or 'color' (False) image folder.

    Returns:
        None. Nothing is saved when the folder cannot be read or when no chunk
        is large enough to be used for fitting.
    """
    print(f"\n⭐️ use case: fit pca on {'elected' if elected else 'not elected'} candidates with {'b&w' if bw else 'color'}-images")

    print("\nLoading preprocessed data...")

    folder = os.path.join(
        LOCAL_DATA_PATH_OUTPUT_IMG,
        'elected' if elected else 'not_elected',
        'bw' if bw else 'color')

    # load a train set; only the directory scan is guarded, so that unrelated
    # errors raised further down are not mistaken for "no data"
    # NOTE(review): color_mode is left at the loader's default even when bw=True —
    # confirm this is intended.
    try:
        images_dataset = image_dataset_from_directory(folder,
                                                      label_mode=None,
                                                      batch_size=CHUNK_SIZE,
                                                      image_size=(AUTOENCODER_HEIGHT,AUTOENCODER_WIDTH),
                                                      shuffle=True,
                                                      crop_to_aspect_ratio=True)
    except NotFoundError:
        print("\n✅ no data to train")
        return None

    # scale pixel values by 1/255 and flatten every image to a single row
    normalization_layer = Rescaling(1./255)
    flatten_layer = Flatten()
    normalized_images_dataset = images_dataset.map(lambda x: flatten_layer(normalization_layer(x)))

    # start from scratch; to resume from the latest saved model instead, use
    # pca = load_pca(elected, bw)
    pca = None

    # iterate on the full dataset per chunks
    row_count = 0

    for chunk_id, image_batch in enumerate(normalized_images_dataset):

        chunk_row_count = image_batch.shape[0]

        # partial_fit needs at least n_components samples per call; a smaller
        # chunk is treated as the trailing remainder and ends the loop
        if chunk_row_count < PCA_COMPONENTS:
            print(f"\nLast batch ({chunk_row_count}) is smaller than pca components ({PCA_COMPONENTS}). It will be skipped.")
            break

        print(f"\n✅ Loading and training on preprocessed chunk n°{chunk_id}...")

        # increment trained row count
        row_count += chunk_row_count

        # initialize pca lazily, on the first usable chunk
        if pca is None:
            pca = initialize_pca(PCA_BATCH_SIZE, PCA_COMPONENTS)

        # train the pca incrementally
        # NOTE(review): the whole chunk goes into a single partial_fit call; per the
        # scikit-learn docs batch_size is only used by fit() — confirm CHUNK_SIZE
        # rows fit in memory.
        pca = pca.partial_fit(image_batch)

    if row_count == 0:
        print("\n✅ no new data for the training 👌")
        return None

    print(f"\n✅ trained on {row_count} rows")

    # save pca
    # NOTE(review): the params key is 'PCA_COMPONENT' (singular) while the setting is
    # PCA_COMPONENTS; kept as-is so previously saved params files stay comparable.
    save_pca(pca=pca, params=dict(PCA_BATCH_SIZE=PCA_BATCH_SIZE, PCA_COMPONENT=PCA_COMPONENTS), elected=elected, bw=bw)

    return None

def fit():
    """
    fit and save a pca for every (elected, bw) combination

    Color images are processed first (elected, then not elected), followed by
    the b&w images in the same order.
    """
    for use_bw in (False, True):
        for is_elected in (True, False):
            fit_pca(is_elected, use_bw)


In [None]:
# Fit and save the pca for all four (elected, bw) combinations.
# To fit a single combination instead, call e.g. fit_pca(elected=True, bw=False)
fit()