# Convert mp4/mov video data to jpg/png images and npy binary format for computer vision applications

This notebook contains the functions and workflow to process mp4/mov video data into conventional imagery datasets (jpg/png) for deep learning computer vision model training. 

### pull data from aws_s3 to aws_sagemaker

In [None]:
# $ aws s3 cp s3://bucket-name/folder-name/ ./folder-name-destination/  --recursive

### pull data from aws_s3 to local location

In [None]:
# assumes aws env is configured with 2 keys, using aws_cli, boto3
# ! aws s3 --no-verify-ssl sync s3://bucket-name ./folder-name-destination

### general utility fxns

In [None]:
import os # for folder creation

In [None]:
def create_folder_ifnotexist(name_folder):
    '''
    Makes sure a folder exists, creating it (and any missing parent folders)
    when it is not there yet.

    Parameters
    ----------
    name_folder: String. The folder name or folder path to create. A relative
                 path is created starting from the executing location.

    Notes
    -----
    A confirmation message is printed only when the folder is actually created.
    Nothing happens when the path already exists.
    '''
    if os.path.exists(name_folder):
        # already present, nothing to create
        return

    os.makedirs(name_folder)
    print('Created directory: ' + name_folder)

In [None]:
#create_folder_ifnotexist('test4/test5/test_folder_3')

### mp4 to jpg

In [None]:
import numpy as np
import cv2
import os
from PIL import Image

In [None]:
def convert_mp4_to_jpg(mp4_folder , mp4_files, jpg_folder, label_to_prepend):
    '''
    Converts the specified video files into jpg images (one image per frame)
    and prepends a class label to each image file name.

    Each image is written as
        <jpg_folder>/<label_to_prepend>_frame<N>_<video name>.jpg
    where N is the zero-based frame index within its video and <video name> is
    the video file name without its extension.

    Parameters
    ----------
    mp4_folder: String. The path to the folder containing the video files to be converted.
                A trailing path separator is optional.

    mp4_files: List of Strings (or a single String). The video file names inside mp4_folder
               to be converted to jpg images.

    jpg_folder: String. The destination folder for the created jpg files (relative or
                absolute). It is created if it does not exist.

    label_to_prepend: String. The class label to add at the start of each jpg file's name.
                      The '_' separator is added automatically, so pass '1' (not '1_')
                      to indicate the images belong to class 1.

    Raises
    ------
    OSError: if jpg_folder cannot be created.

    Notes
    -----
    Files that OpenCV cannot open as a video are skipped with a printed warning,
    and a warning is printed for every frame that could not be written.
    '''
    # A bare string would otherwise be iterated character by character.
    if isinstance(mp4_files, str):
        mp4_files = [mp4_files]

    # Create the destination once, up front. A failure here is raised instead of
    # being swallowed, because every later write would silently fail anyway.
    os.makedirs(jpg_folder, exist_ok=True)

    for mp4 in mp4_files:
        video_path = os.path.join(mp4_folder, mp4)
        mp4_name = os.path.splitext(mp4)[0]  # drop the extension, whatever its length

        cam = cv2.VideoCapture(video_path)
        try:
            if not cam.isOpened():
                print('Warning: could not open video, skipping: ' + video_path)
                continue

            currentframe = 0
            while True:
                ret, frame = cam.read()
                if not ret:
                    # no frames left in this video
                    break

                file_name = (str(label_to_prepend) + '_' + 'frame' + str(currentframe)
                             + '_' + mp4_name + '.jpg')
                # os.path.join keeps absolute destination folders intact
                name = os.path.join(jpg_folder, file_name)
                print('Creating...' + name)

                # cv2.imwrite reports failure through its return value only
                if not cv2.imwrite(name, frame):
                    print('Warning: failed to write ' + name)

                # advance even after a failed write so N stays the true frame index
                currentframe += 1
        finally:
            # always free the capture handle, even if reading/writing raised.
            # No GUI windows are opened here, so cv2.destroyAllWindows() is not
            # called (it raises on headless OpenCV builds).
            cam.release()

In [None]:

# Folder holding the source videos
mp4_folder = '../data_mp4/'

# To convert only selected videos, list them explicitly instead:
#mp4_files = ['mp4_file1.mp4',
#             'mp4_file2.mp4'
#            ]

# Convert every video in the folder. os.listdir() returns entries in arbitrary
# order and also returns non-video entries (e.g. .ipynb_checkpoints), so filter
# by extension and sort to make mp4_files[0] -- and therefore jpg_folder --
# deterministic.
mp4_files = sorted(f for f in os.listdir(mp4_folder)
                   if f.lower().endswith(('.mp4', '.mov')))

if not mp4_files:
    raise FileNotFoundError('No .mp4/.mov files found in ' + mp4_folder)

# All frames are written to one folder named after the first video
jpg_folder = mp4_files[0] + '_jpgs'
#jpg_folder = 'practice1' + '_jpgs'

# Class label prepended to every image name
label_to_prepend = '0'
#label_to_prepend = '1'

print(mp4_folder)
print(mp4_files)
print(" ")
print(jpg_folder)
print(label_to_prepend)

In [None]:
# Extract every frame of the selected videos into jpg_folder
convert_mp4_to_jpg(
    mp4_folder=mp4_folder,
    mp4_files=mp4_files,
    jpg_folder=jpg_folder,
    label_to_prepend=label_to_prepend,
)

### prepend class label to already existing images

In [None]:
import os

In [None]:
def rename_pics(ls_files, path_old, path_new, label='1_'):
    '''
    Prepends (adds) a class label before the name of each image file while moving
    the file to a new folder location.

    Parameters
    ----------
    ls_files: List. List of image file names (inside path_old) to rename.

    path_old: String. Path to the folder containing the images with their original names.
              A trailing path separator is optional.

    path_new: String. Path to the folder where the renamed image files will be placed.
              It is created if it does not exist.

    label: String. The prefix to prepend to every file name, including its separator.
           Defaults to '1_'; pass '0_' for class 0.

    Notes
    -----
    Files are moved with os.rename, not copied: after the call they no longer
    exist under path_old.
    '''
    # os.rename fails when the destination folder is missing
    os.makedirs(path_new, exist_ok=True)

    for file in ls_files:
        os.rename(os.path.join(path_old, file),
                  os.path.join(path_new, label + file))

In [None]:
# Raw image folders, one per class (presumably 0 = tray miss, 1 = tray full,
# going by the folder names -- confirm)
data_0 = '../data_raw_synth_tray_miss_SK-J350-BKT-9080-bad/'
data_1 = '../data_raw_synth_tray_full_SK-J350-BKT-9080/'

ls_files_0, ls_files_1 = os.listdir(data_0), os.listdir(data_1)

# number of files found in each folder
print(len(ls_files_0), len(ls_files_1), sep='\n')

In [None]:
# os.rename() fails when the destination folder is missing, so create it first
path_renamed = './temp_rename/'
create_folder_ifnotexist(path_renamed)

# class-0 variant (rename_pics must be set to prepend '0_' for this call):
#rename_pics(ls_files_0, data_0, path_renamed)
rename_pics(ls_files_1, data_1, path_renamed)

### resize images

In [None]:
import numpy as np
import cv2
import os
from PIL import Image

# 12/12/2022
from PIL import ImageOps # for greyscale, 4 color channel, issue 

In [None]:
def resize_images(path_orig_img, path_smaller_img, x_dim, y_dim, img_quality):
    '''
    Changes the resolution of jpg or png images and writes the resized copies,
    under their original file names, to another folder.

    Parameters
    ----------
    path_orig_img: String. Path to the folder containing the full resolution images (jpg or png).
                   A trailing path separator is optional.

    path_smaller_img: String. Path (relative or absolute) to the folder where the
                      reduced-resolution images will be written.

    x_dim: Integer. The x-dimension (width) of the reduced resolution images.

    y_dim: Integer. The y-dimension (height) of the reduced resolution images.

    img_quality: Integer. The value of the 'quality' parameter from the Pillow (PIL) Python library.
                 See: https://pillow.readthedocs.io/en/stable/handbook/image-file-formats.html
                 The image quality, on a scale from 0 (worst) to 95 (best), or the string keep.
                 The default is 75. Values above 95 should be avoided; 100 disables portions of
                 the JPEG compression algorithm, and results in large files with hardly any gain in image quality.
                 The value keep is only valid for JPEG files and will retain the original image quality level,
                 subsampling, and qtables.

    Notes
    -----
    Creates path_smaller_img if it does not exist.
    Files are selected by extension (.jpeg, .png, .jpg), ignoring upper/lower case.
    Images are resized to exactly (x_dim, y_dim); the original aspect ratio is not preserved.
    '''
    extensions = ('.jpeg', '.png', '.jpg')
    images = [file for file in os.listdir(path_orig_img)
              if file.lower().endswith(extensions)]
    print('image count for size reduction is: ' + str(len(images)))

    create_folder_ifnotexist(path_smaller_img)

    for image in images:
        # the context manager closes the source file handle once it has been read
        with Image.open(os.path.join(path_orig_img, image)) as img:
            # img.thumbnail((x_dim, y_dim)) would keep the aspect ratio; resize does not
            resized = img.resize((x_dim, y_dim))

        # os.path.join keeps absolute destination folders intact
        resized.save(os.path.join(path_smaller_img, image),
                     optimize=True, quality=img_quality)

In [None]:
# Target resolution (pixels) and the Pillow 'quality' value used when saving
x_dim, y_dim = 299, 299
img_quality = 95

path_orig_img = './path_to_full_resolution_images/'

# Output folder: the input folder name (trailing '/' dropped) + '_<x_dim>/'
path_smaller_img = '{}_{}/'.format(path_orig_img[:-1], x_dim)
path_smaller_img

In [None]:
# Write resized copies of every image in path_orig_img to path_smaller_img
resize_images(
    path_orig_img=path_orig_img,
    path_smaller_img=path_smaller_img,
    x_dim=x_dim,
    y_dim=y_dim,
    img_quality=img_quality,
)

### Create combined folder of data

In [None]:
import random
import time
import shutil

In [None]:
def combine_data_sources(ls_source, ls_files_cnt, folder_combined):
    '''
    A convenience function to copy image files from several folders into 1 folder location.
    Saves the need to copy and combine the image files by hand.
    Also allows easy specification of the number of image files (jpg or png) to copy from each folder,
        allowing the composition of the training dataset to be customized as needed.

    Parameters
    ----------

    ls_source: List of Strings. The paths to the folders containing the original images that are to be combined.
               A trailing path separator on each path is optional.

    ls_files_cnt: List of Integers. Integers specifying how many files to take from each folder specified in ls_source.
                  The specified number of files will be randomly selected from the source folder, without replacement.

    folder_combined: String. The base name of the folder the images will be copied to, the folder containing the combined
                     image dataset. A timestamp is appended to the final folder name.

    Returns
    -------
    folder_combined_name. String. The name of the folder containing the combined image dataset, with epoch timestamp appended.

    Raises
    ------
    ValueError: if len(ls_source) != len(ls_files_cnt), or if more files are requested
                from a folder than it contains.

    Notes
    -----
    Note: Only regular files are considered; sub-folders inside a source folder are ignored.

    Note: All inputs are validated before the combined folder is created, so a bad
          request does not leave a partially filled folder behind.

    Note: Files with the same name in different source folders overwrite each other
          in the combined folder.
    '''
    if len(ls_source) != len(ls_files_cnt):
        raise ValueError('ls_source has ' + str(len(ls_source)) + ' entries but ls_files_cnt has '
                         + str(len(ls_files_cnt)) + '; they must match')

    # Pass 1: gather and validate the candidates of every source folder.
    ls_candidates = []
    for source, files_cnt in zip(ls_source, ls_files_cnt):
        # sorted so that a seeded random module gives a reproducible selection
        ls_files = sorted(name for name in os.listdir(source)
                          if os.path.isfile(os.path.join(source, name)))
        if files_cnt > len(ls_files):
            raise ValueError('Requested ' + str(files_cnt) + ' files from ' + str(source)
                             + ' but it only contains ' + str(len(ls_files)))
        ls_candidates.append(ls_files)

    folder_combined_name = folder_combined  + '_' + str(round(time.time()))
    create_folder_ifnotexist(folder_combined_name)

    # Pass 2: randomly select (without replacement) and copy.
    for source, files_cnt, ls_files in zip(ls_source, ls_files_cnt, ls_candidates):
        for file_name in random.sample(ls_files, files_cnt):
            shutil.copy(os.path.join(source, file_name),
                        os.path.join(folder_combined_name, file_name))

    return folder_combined_name

In [None]:
# Source folders and, position by position, how many files to draw from each
ls_source = ['./source_folder_1/', './source_folder_2/', './source_folder_3/']
ls_files_cnt = [100, 50, 50]

# Base name of the combined folder (a timestamp is appended by combine_data_sources)
folder_combined = 'combined_practice1'

In [None]:
# Build the combined dataset folder and show its (timestamped) name
folder_combined_name = combine_data_sources(ls_source, ls_files_cnt, folder_combined)
folder_combined_name

In [None]:
# could use a dictionary too, like below (but seemed slightly more complex)
# # https://stackoverflow.com/questions/4326658/how-to-index-into-a-dictionary
#dict_source = {'./source_folder_1/': 100,
#               './source_folder_2/': 100,
#             }
#print(list(dict_source)[0])
#print(list(dict_source.values())[0])

### jpg to npy

In [None]:
# (imports below work in base_env on dsk3)
import glob
import numpy as np
import os.path as path
import imageio

In [None]:
def img_to_npy(IMAGE_PATH, TRAIN_TEST_SPLIT_RATIO, seed=None):
    '''
    Converts the jpg images of a folder to numpy binary numeric format, splits them
    randomly into a training and a test set, and saves both sets with their labels.

    Parameters
    ----------

    IMAGE_PATH: String. Path to the folder containing the images. Only files matching
                '*.jpg' are read.

    TRAIN_TEST_SPLIT_RATIO: Float between 0 and 1. Fraction of the images assigned to the
                            training set; the remainder forms the test set.

    seed: Integer or None. When given, the shuffle before the split is reproducible.
          When None (default) the global numpy random state is used.

    Raises
    ------
    ValueError: if IMAGE_PATH contains no '*.jpg' files.

    Notes
    -----
    The class label of an image is read from the first character of its file name,
    so file names must start with a single digit (e.g. '0_frame12_clip.jpg').

    Pixel values are divided by 255 before saving.

    Currently writes npy data to the local (current working) folder as
    X_train_<t>.npy, y_train_<t>.npy, X_test_<t>.npy and y_test_<t>.npy, where <t> is
    the epoch timestamp of the call.
    '''
    # Local import: the import cell of this section does not import time.
    import time

    # sorted so that a given seed always produces the same split
    file_paths = sorted(glob.glob(path.join(IMAGE_PATH, '*.jpg')))
    #file_paths = sorted(glob.glob(path.join(IMAGE_PATH, '*.png')))
    if not file_paths:
        raise ValueError('No .jpg images found in ' + str(IMAGE_PATH))

    # Load the images (loop variable deliberately not named `path`, which is
    # the os.path module alias used above and below)
    images = np.asarray([imageio.imread(file_path) for file_path in file_paths])
    print(images.shape)

    # Scale
    images = images / 255

    # Read the labels from the filenames
    n_images = images.shape[0]
    labels = np.zeros(n_images)
    for i, file_path in enumerate(file_paths):
        labels[i] = int(path.basename(file_path)[0])

    ## Split into test and training sets

    # Split at the given index
    split_index = int(TRAIN_TEST_SPLIT_RATIO * n_images)
    rng = np.random if seed is None else np.random.RandomState(seed)
    shuffled_indices = rng.permutation(n_images)
    train_indices = shuffled_indices[0:split_index]
    test_indices = shuffled_indices[split_index:]

    # Split the images and the labels. Indexing the first axis only works for
    # both colour (N, H, W, C) and greyscale (N, H, W) image stacks.
    X_train = images[train_indices]
    y_train = labels[train_indices]

    X_test = images[test_indices]
    y_test = labels[test_indices]

    print(X_train.shape, y_train.shape, X_test.shape, y_test.shape)
    print(y_train[0:10])

    time_stamp = str(round(time.time()))

    np.save('X_train_' + time_stamp + '.npy', X_train)
    np.save('y_train_' + time_stamp + '.npy', y_train)
    np.save('X_test_' + time_stamp + '.npy', X_test)
    np.save('y_test_' + time_stamp + '.npy', y_test)

In [None]:
#folder_combined_name = 'combined_practice1_1676255021'
# Folder of images to convert: the combined folder returned by combine_data_sources
# in the previous step (or set folder_combined_name by hand, as in the line above)
path_img_combined = folder_combined_name # from previous step
path_img_combined

In [None]:
# 80% of the images go to the training set, the remaining 20% to the test set
img_to_npy(IMAGE_PATH=path_img_combined, TRAIN_TEST_SPLIT_RATIO=0.80)

## Automate the whole process, jpg or png to npy

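To ease user effort and save time, a streamlined pipeline is provided below. It uses the mp4 to jpg/npy conversion and processing functions shown above, imported here from the `utility_fxns_mp4_to_npy_v5` module. The `_2` variants of the combine and npy functions additionally take the `path_unique` working folder, so that all outputs of one run are kept under a single timestamped folder.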

#### Imports

In [None]:
import os

import numpy as np
import cv2
import os
from PIL import Image

import numpy as np
import cv2
import os
from PIL import Image

# 12/12/2022
from PIL import ImageOps # for greyscale, 4 color channel, issue 

import random
import time
import shutil

import time

In [None]:
from utility_fxns_mp4_to_npy_v5 import create_folder_ifnotexist
from utility_fxns_mp4_to_npy_v5 import convert_mp4_to_jpg
from utility_fxns_mp4_to_npy_v5 import rename_pics
from utility_fxns_mp4_to_npy_v5 import resize_images
# from utility_fxns_mp4_to_npy_v1 import combine_data_sources
from utility_fxns_mp4_to_npy_v5 import combine_data_sources_2 # needed to add path_unique into fxn code
# from utility_fxns_mp4_to_npy_v1 import img_to_npy
from utility_fxns_mp4_to_npy_v5 import img_to_npy_2

In [None]:
# Timestamped working folder that collects every output of this pipeline run
path_unique = './createnpy_{}/'.format(round(time.time()))
create_folder_ifnotexist(path_unique)

#### define variables, call fxns

##### mp4 to jpg

In [None]:
# mp4 to jpg

# Folder holding the source videos
mp4_folder = '../data_mp4/'

# Convert every video in the folder. os.listdir() returns entries in arbitrary
# order and also returns non-video entries (e.g. .ipynb_checkpoints), so filter
# by extension and sort to make mp4_files[0] -- and therefore jpg_folder --
# deterministic.
mp4_files = sorted(f for f in os.listdir(mp4_folder)
                   if f.lower().endswith(('.mp4', '.mov')))

if not mp4_files:
    raise FileNotFoundError('No .mp4/.mov files found in ' + mp4_folder)

# All frames are written to one folder named after the first video
jpg_folder = mp4_files[0] + '_jpgs'

# Class label prepended to every image name
label_to_prepend = '0'
#label_to_prepend = '1'

#print(mp4_folder)
#print(mp4_files)
#print(" ")
#print(jpg_folder)
#print(label_to_prepend)

In [None]:
# convert_mp4_to_jpg(mp4_folder, mp4_files, jpg_folder=jpg_folder, label_to_prepend=label_to_prepend)
# Same conversion as before, but the jpg folder is placed inside path_unique
convert_mp4_to_jpg(
    mp4_folder,
    mp4_files,
    jpg_folder=path_unique + jpg_folder,
    label_to_prepend=label_to_prepend,
)

##### Resize images

In [None]:
# resize images

# Target resolution (pixels) and the Pillow 'quality' value used when saving
x_dim, y_dim = 299, 299
img_quality = 95

# NOTE(review): the folder name below is hard-coded; it presumably has to match
# path_unique + jpg_folder from the mp4-to-jpg step -- confirm before running.
#path_orig_img = path_unique + '1_kit.mp4_jpgs/'
#path_orig_img = path_unique + '0_orig_img.mp4_jpgs/'
path_orig_img = path_unique + '1_orig_img.mp4_jpgs/'

# Output folder: the input folder name (trailing '/' dropped) + '_<x_dim>/'
path_smaller_img = '{}_{}/'.format(path_orig_img[:-1], x_dim)
path_smaller_img

In [None]:
# Write x_dim-by-y_dim copies of the images in path_orig_img to path_smaller_img
resize_images(path_orig_img, path_smaller_img, x_dim, y_dim, img_quality)

##### Combine classes

In [None]:
# Resized image folders, one per class.
# NOTE(review): these names do not include path_unique; presumably
# combine_data_sources_2 prepends it -- verify against the utility module.
ls_source = ['0_orig_img.mp4_jpgs_299/', '1_orig_img.mp4_jpgs_299/']

# Number of images to draw from each folder in ls_source (same order)
# ls_files_cnt = [5856,5320]
ls_files_cnt = [1000, 1000]

folder_combined = 'data_combined_all_1'

In [None]:
# Combine the per-class folders; path_unique is passed as the extra 4th argument
folder_combined_name = combine_data_sources_2(
    ls_source, ls_files_cnt, folder_combined, path_unique
)
folder_combined_name

##### jpg to npy

In [None]:
#folder_combined_name = './data_combined'
# NOTE(review): assumes combine_data_sources_2 returns a folder name relative to
# path_unique (otherwise path_unique would be prefixed twice) -- confirm.
path_img_combined = path_unique + folder_combined_name # from previous step
path_img_combined

In [None]:
# Convert the combined images to npy; 0.80 is passed in the train/test split ratio
# position. NOTE(review): path_unique is presumably the output location -- confirm
# against img_to_npy_2 in the utility module.
img_to_npy_2(path_img_combined, 0.80, path_unique)