In [1]:
# Colab-only setup: mount Google Drive at /content/drive so that the Drive
# paths used by the later cells of this notebook can be read and written.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [17]:
import glob
import os
from typing import List, Tuple

import cv2 as cv
import numpy as np
import pandas as pd
from tqdm import tqdm

In [18]:
def read_photo_path_data(path: str = '/content/drive/My Drive/aueb/') -> pd.DataFrame:
  """
  Collect the file paths of every .jpg file below a directory.

  The directory is searched recursively, so images stored in nested
  subdirectories are included as well.

  Args:
    path: Root directory to search. A trailing slash is optional.

  Returns:
    A single-column DataFrame (column label 0) holding one file path per
    row, sorted so the row order is the same on every run.

  Raises:
    ValueError: If no .jpg file is found below ``path``.
  """
  # os.path.join only inserts a separator where one is missing, so the pattern
  # is ".../**/*.jpg" whether or not the caller passed a trailing slash.
  # Plain string concatenation would turn "dir" into "dir**/*.jpg", which no
  # longer recurses and can also match sibling directories sharing the prefix.
  pattern = os.path.join(path, '**', '*.jpg')
  # glob gives no ordering guarantee; sorting keeps the dataset order stable.
  file_paths = sorted(glob.glob(pattern, recursive = True))
  if not file_paths: # If glob didn't find any .jpg files
    raise ValueError(f"Did not find any .jpg files on path {pattern}")
  return pd.DataFrame(file_paths)


def extract_label_from_dataframe(df: pd.DataFrame) -> pd.Series:
  """
  Derive the class label of every image from its file path.

  Only the FIRST COLUMN of ``df`` is read; it must contain the paths.
  File names are expected to look like {class_label}_{photo_id}.jpg, so the
  label is everything in the file name before the first underscore.
  """
  paths = df.iloc[:, 0]
  # Keep only the part after the last "/" (the file name).
  file_names = paths.str.rsplit("/", n=1).str[-1]
  # The label is the part of the file name in front of the first "_".
  labels = file_names.str.split("_", n=1).str[0]
  return labels


def images_to_ndarray(photo_path : str, image_size: Tuple[int, int])-> np.ndarray:
  """
  Read a single image from disk and resize it.

  Args:
    photo_path: Path of the image file to load with cv.imread.
    image_size: Target size, forwarded to cv.resize as its ``dsize``
      argument (OpenCV documents ``dsize`` as (width, height)).

  Returns:
    The resized image array produced by cv.resize.

  Raises:
    ValueError: If cv.imread could not load the file at ``photo_path``.
  """
  image = cv.imread(photo_path)
  # cv.imread does not raise for a missing or unreadable file; it returns
  # None, which would otherwise surface as an opaque assertion error inside
  # cv.resize without telling which file was the culprit.
  if image is None:
    raise ValueError(f"Could not read image at path {photo_path}")
  return cv.resize(image, image_size)

In [21]:
# Target size for every image: passed as image_size to images_to_ndarray,
# which forwards it to cv.resize.
IMG_SIZE = (200, 200)

In [19]:
# Build the path/label table in one chain: read the image paths, name the
# path column, then derive each image's class label from its path.
photo_path_data = (
    read_photo_path_data(path = '/content/drive/My Drive/Images/')
    .rename(columns = {0: 'path'})
    .assign(class_label = extract_label_from_dataframe)
)

In [None]:
#Reading Images may take a long time (>30min)
output_path = '/content/drive/MyDrive/models/data' # np.savez_compressed appends ".npz"
# Create the output directory up front: a missing folder is then detected
# before the slow image-reading loop instead of after it has finished.
os.makedirs(os.path.dirname(output_path), exist_ok = True)

# One resized image per row of photo_path_data, stacked into a single array.
X = np.array([images_to_ndarray(im, image_size = IMG_SIZE) for im in tqdm(photo_path_data['path'])])
# Store the labels as a unicode string array rather than an object array:
# object arrays are pickled by np.savez_compressed and can only be read back
# with np.load(..., allow_pickle=True).
Y = np.array(photo_path_data['class_label'], dtype = str)
np.savez_compressed(output_path, a = X, b = Y) # Save np.array data to drive