# Déployer un modèle dans le cloud
# Notebook - Créer un jeu de données réduit pour les tests en local

# Objectifs du notebook

Pour accélérer les tests sur notre application en local, les données sont stockées dans le même répertoire que le notebook et seul un extrait aléatoire de **3 images par catégorie** (=> 3 x 131 = 393 images) sera utilisé dans la version locale.<br>
Les données de ce dataset échantillon seront stockées dans le dossier **test_local_sample**.

<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Déployer-un-modèle-dans-le-cloud" data-toc-modified-id="Déployer-un-modèle-dans-le-cloud-1">Déployer un modèle dans le cloud</a></span></li><li><span><a href="#Notebook---Créer-un-jeu-de-données-réduit-pour-les-tests-en-local" data-toc-modified-id="Notebook---Créer-un-jeu-de-données-réduit-pour-les-tests-en-local-2">Notebook - Créer un jeu de données réduit pour les tests en local</a></span></li><li><span><a href="#Objectifs-du-notebook" data-toc-modified-id="Objectifs-du-notebook-3">Objectifs du notebook</a></span></li><li><span><a href="#I)-Importation-des-librairies" data-toc-modified-id="I)-Importation-des-librairies-4">I) Importation des librairies</a></span></li><li><span><a href="#II)-Définition-des-chemins" data-toc-modified-id="II)-Définition-des-chemins-5">II) Définition des chemins</a></span></li><li><span><a href="#III)-Définition-des-fonctions" data-toc-modified-id="III)-Définition-des-fonctions-6">III) Définition des fonctions</a></span></li><li><span><a href="#IV)-Création-de-la-liste-des-échantillons-d'image-sélectionnés" data-toc-modified-id="IV)-Création-de-la-liste-des-échantillons-d'image-sélectionnés-7">IV) Création de la liste des échantillons d'image sélectionnés</a></span></li><li><span><a href="#V)-Création-du-dataset-échantillon" data-toc-modified-id="V)-Création-du-dataset-échantillon-8">V) Création du dataset échantillon</a></span></li></ul></div>

# I) Importation des librairies

In [2]:
# For matrix calculation performances.
import pandas as pd

# For files and directories managements.
import os
import shutil
import pathlib

# For the sampling.
import random

# II) Définition des chemins

In [3]:
# Get the absolute current folder path of the notebook.
PROJ_PATH = os.getcwd()
    
# Get the path of the datasets directory.
DATASETS_DIR_PATH = os.path.join(PROJ_PATH, "datasets")
    
# Get the path of the directory to sample.
TESTSET_DIR_PATH = os.path.join(DATASETS_DIR_PATH, "test")

# Path of the folder of the test sample for local use.
LOCAL_FOLDER_NAME = "test_local_sample"
LOCAL_SP_DATASET_PATH = os.path.join(DATASETS_DIR_PATH, LOCAL_FOLDER_NAME)

# III) Définition des fonctions

In [4]:
def create_directories (dir_path, new_dir_names):
    
    """ Creates new directories. """
    
    # If a string was passed as directory name store it in a list.
    if type(new_dir_names) == type(''):
        new_dir_names = [new_dir_names]
    
    # Associates the directory's path to its names.
    for new_dir_name in new_dir_names:
        new_dir_path = os.path.join(dir_path, new_dir_name)
  
        # Creates the directory.
        os.mkdir(new_dir_path)


def move_selected_files (selected_files, source_folder, destination_folder):
    
    """ Moves the selected files from a path to another. """
    
    # If a string was passed as directory name store it in a list.
    if type(selected_files) == type(''):
        selected_files = [selected_files]
    
    # Iterates on all selected files to copy and paste them to the destination folder
    for file in selected_files: #all_folder_files = os.listdir(source_folder)
        
        # Associates the source and destination folders' paths to the file's name.
        src_path = os.path.join(source_folder, file)
        dst_path = os.path.join(destination_folder, file)
        
        # Moves the file to the destination folder.
        os.rename(src_path, dst_path)


def copy_and_paste_selected_files (selected_files, source_folder, destination_folder):
    
    """ Copies/Pastes the selected file. """
    
    # If a string was passed as file name store it in a list.
    if type(selected_files) == type(''):
        selected_files = [selected_files]
    
    # Iterates on all selected files to copy and paste them to the destination folder
    for file in selected_files: #all_folder_files = os.listdir(source_folder)
        
        # Associates the source and destination folders' paths to the file's name.
        src_path = os.path.join(source_folder, file)
        dst_path = os.path.join(destination_folder, file)
        
        # Copies and pastes the file to the destination folder.
        shutil.copy(src_path, dst_path) #os.rename(src_path, dst_path) # For moving file without copying them.
        
        
def remove_selected_files (dir_path, selected_files):

    """ Removes the selected file. """
    
    # If a string was passed as file name store it in a list.
    if type(selected_files) == type(''):
        selected_files = [selected_files]
    
    for file_name in selected_files: #os.listdir(dir_path):
        file_path = os.path.join(dir_path, file_name)
        try:
            if os.path.isfile(file_path) or os.path.islink(file_path):
                os.unlink(file_path)
            elif os.path.isdir(file_path):
                shutil.rmtree(file_path)
        except Exception as e:
            print('Failed to delete %s. Reason: %s' % (file_path, e))

# IV) Création de la liste des échantillons d'image sélectionnés

In [5]:
# Get the list of subdirectories containing all images.
# NB: subdir_names = cat_names.
l_subdirs_names = os.listdir(TESTSET_DIR_PATH)

# Generate a dictionary with a sample of images per subfolder.
d_sps_per_cat = dict()
for subdir_name in l_subdirs_names:
    
    # Get the path of the subfolder.
    subdir_path = pathlib.Path(os.path.join(TESTSET_DIR_PATH, subdir_name))
    
    # Get the paths of a sample of images within the subfolder (3 images by subfolder).
    random.seed(0) # Set the random sampling such as the sampling is reproducible.
    l_subdir_sp_paths = random.sample(list(subdir_path.glob('*.jpg')), 3)
    
    # (Faculative) For better readability, names will be stored rather than paths.
    l_subdir_sp_names = [os.path.basename(subdir_sp_path) for subdir_sp_path in l_subdir_sp_paths]
    
    # Store both objects within the dictionary.
    d_sps_per_cat[subdir_name] = l_subdir_sp_names

# Convert the dictionary as a pandas dataframe.
df = pd.DataFrame(d_sps_per_cat)

# Show our sampling.
df.head()

Unnamed: 0,Apple Braeburn,Apple Crimson Snow,Apple Golden 1,Apple Golden 2,Apple Golden 3,Apple Granny Smith,Apple Pink Lady,Apple Red 1,Apple Red 2,Apple Red 3,...,Tomato 2,Tomato 3,Tomato 4,Tomato Cherry Red,Tomato Heart,Tomato Maroon,Tomato not Ripened,Tomato Yellow,Walnut,Watermelon
0,r_40_100.jpg,r_20_100.jpg,r_44_100.jpg,r_40_100.jpg,r_43_100.jpg,r_40_100.jpg,r_234_100.jpg,r_40_100.jpg,r_40_100.jpg,r_59_100.jpg,...,r_33_100.jpg,r_34_100.jpg,r_171_100.jpg,r_218_100.jpg,r_88_100.jpg,r_27_100.jpg,r_244_100.jpg,r2_8_100.jpg,r_294_100.jpg,r_172_100.jpg
1,r_49_100.jpg,r_29_100.jpg,r_52_100.jpg,r_49_100.jpg,r_51_100.jpg,r_49_100.jpg,r_243_100.jpg,r_49_100.jpg,r_49_100.jpg,r_67_100.jpg,...,r2_21_100.jpg,r2_24_100.jpg,r_182_100.jpg,r_229_100.jpg,r2_261_100.jpg,285_100.jpg,r_281_100.jpg,r_110_100.jpg,r2_115_100.jpg,r_47_100.jpg
2,35_100.jpg,110_100.jpg,110_100.jpg,35_100.jpg,38_100.jpg,35_100.jpg,237_100.jpg,35_100.jpg,35_100.jpg,53_100.jpg,...,r_27_100.jpg,r_209_100.jpg,130_100.jpg,204_100.jpg,r_145_100.jpg,r_19_100.jpg,155_100.jpg,16_100.jpg,r_200_100.jpg,137_100.jpg


# V) Création du dataset échantillon

In [6]:
# Remove the obsolet folder if required then, create a new one.
remove_selected_files(DATASETS_DIR_PATH, "test_local_sample")              
create_directories(DATASETS_DIR_PATH, "test_local_sample")                   

# Create all subfolders.
create_directories(LOCAL_SP_DATASET_PATH, df.columns)

# Copies and pastes the selected images to the folder.
for subdir_name in df.columns:
    source_folder = os.path.join(TESTSET_DIR_PATH, subdir_name)
    dest_folder = os.path.join(LOCAL_SP_DATASET_PATH, subdir_name)
    copy_and_paste_selected_files(df[subdir_name], source_folder, dest_folder)