In [1]:
import cv2
import os
import numpy as np
from random import shuffle
from tqdm import tqdm

In [2]:
# Data locations, all expressed relative to the project root (the parent
# of the directory the notebook runs from).
PROJ_ROOT = os.pardir
path_training, path_testing = (
    os.path.join(PROJ_ROOT, "data", "raw", split_dir)
    for split_dir in ("train/", "test/")
)

In [3]:
# Labeling the dataset
def label_img(img):
    """Return the one-hot label encoded in an image's file name.

    The class word is the third dot-separated field counted from the end
    of the file name, so ``cat.0.jpg`` yields ``'cat'``.

    Parameters
    ----------
    img : str
        File name of the image. A path is accepted as well: any directory
        component is stripped first, so dots inside directory names
        (e.g. ``../train/cat.0.jpg``) cannot corrupt the field lookup.

    Returns
    -------
    list of int or None
        ``[1, 0]`` for a cat, ``[0, 1]`` for a dog, and ``None`` when the
        name has fewer than three dot-separated fields or the class word
        is neither ``'cat'`` nor ``'dog'``.
    """
    fields = os.path.basename(img).split(".")
    # Names with fewer than three fields carry no class word at index -3.
    if len(fields) < 3:
        return None

    word_label = fields[-3]
    if word_label == 'cat':
        return [1, 0]
    if word_label == 'dog':
        return [0, 1]
    return None

In [20]:
# Create the training data
def create_training_data():
    """Load, label, shuffle and split the images found in ``path_training``.

    Every file listed in ``path_training`` is read as a grayscale image,
    resized to 50x50 and paired with the label returned by ``label_img``.
    The (image, label) pairs are shuffled *together* so each image keeps
    its own label, then the first fifth of the shuffled pairs becomes the
    validation split and the remainder the training split.

    Returns
    -------
    tuple
        ``(training_set, training_labels, validation_labels, validation_set)``
        -- note that the labels come before the images for the validation
        half. Each element is a list; the image lists hold the resized
        arrays and the label lists hold whatever ``label_img`` returned.
    """
    samples = []
    # tqdm only provides the interactive progress bar
    for filename in tqdm(os.listdir(path_training)):
        # the label is derived from the file name
        label = label_img(filename)

        # load the image in grayscale for simplicity and bring it to a fixed size
        path = os.path.join(path_training, filename)
        image = cv2.imread(path, cv2.IMREAD_GRAYSCALE)
        image = cv2.resize(image, (50, 50))

        # keep image and label in one record so they cannot drift apart
        samples.append((image, label))

    # Shuffle images and labels as pairs. Shuffling only the image list
    # would leave the labels in directory order and mislabel every sample.
    shuffle(samples)

    # the first 1/5 of the shuffled samples becomes the validation set
    n_validation = len(samples) // 5
    validation_samples = samples[:n_validation]
    training_samples = samples[n_validation:]

    validation_set = [image for image, _ in validation_samples]
    validation_labels = [label for _, label in validation_samples]
    training_set = [image for image, _ in training_samples]
    training_labels = [label for _, label in training_samples]

    # save the splits for further use if needed
#     np.save(os.path.join(PROJ_ROOT,'data', 'interim','training_data'), training_set)
#     np.save(os.path.join(PROJ_ROOT,'data', 'interim','validation_data'), validation_set)
#     np.save(os.path.join(PROJ_ROOT,'data', 'interim','validation_labels'), validation_labels)
#     np.save(os.path.join(PROJ_ROOT,'data', 'interim','training_labels'), training_labels)

    return training_set, training_labels, validation_labels, validation_set
        

In [17]:
# Build the train/validation split. The result is a 4-tuple in the order
# (training_set, training_labels, validation_labels, validation_set).
training_d = create_training_data()



  0%|          | 0/25000 [00:00<?, ?it/s][A[A

  0%|          | 44/25000 [00:00<00:57, 434.46it/s][A[A

  0%|          | 97/25000 [00:00<00:54, 457.21it/s][A[A

  1%|          | 151/25000 [00:00<00:51, 478.17it/s][A[A

  1%|          | 208/25000 [00:00<00:49, 500.41it/s][A[A

  1%|          | 264/25000 [00:00<00:47, 515.58it/s][A[A

  1%|▏         | 318/25000 [00:00<00:47, 521.19it/s][A[A

  1%|▏         | 374/25000 [00:00<00:46, 531.44it/s][A[A

  2%|▏         | 432/25000 [00:00<00:45, 544.60it/s][A[A

  2%|▏         | 485/25000 [00:00<00:45, 537.88it/s][A[A

  2%|▏         | 543/25000 [00:01<00:44, 549.72it/s][A[A

  2%|▏         | 597/25000 [00:01<00:45, 540.35it/s][A[A

  3%|▎         | 651/25000 [00:01<00:45, 537.76it/s][A[A

  3%|▎         | 705/25000 [00:01<00:45, 531.93it/s][A[A

  3%|▎         | 764/25000 [00:01<00:44, 547.87it/s][A[A

  3%|▎         | 824/25000 [00:01<00:43, 562.05it/s][A[A

  4%|▎         | 884/25000 [00:01<00:42, 570.98it/s

 66%|██████▌   | 16390/25000 [00:27<00:13, 617.33it/s][A[A

 66%|██████▌   | 16453/25000 [00:27<00:13, 619.21it/s][A[A

 66%|██████▌   | 16515/25000 [00:27<00:13, 616.05it/s][A[A

 66%|██████▋   | 16577/25000 [00:27<00:13, 615.65it/s][A[A

 67%|██████▋   | 16643/25000 [00:27<00:13, 626.54it/s][A[A

 67%|██████▋   | 16709/25000 [00:27<00:13, 633.32it/s][A[A

 67%|██████▋   | 16773/25000 [00:27<00:13, 627.53it/s][A[A

 67%|██████▋   | 16837/25000 [00:28<00:12, 628.92it/s][A[A

 68%|██████▊   | 16901/25000 [00:28<00:12, 629.83it/s][A[A

 68%|██████▊   | 16965/25000 [00:28<00:12, 621.24it/s][A[A

 68%|██████▊   | 17029/25000 [00:28<00:12, 626.00it/s][A[A

 68%|██████▊   | 17092/25000 [00:28<00:12, 610.28it/s][A[A

 69%|██████▊   | 17156/25000 [00:28<00:12, 617.13it/s][A[A

 69%|██████▉   | 17222/25000 [00:28<00:12, 626.40it/s][A[A

 69%|██████▉   | 17285/25000 [00:28<00:12, 624.72it/s][A[A

 69%|██████▉   | 17349/25000 [00:28<00:12, 627.82it/s][A[A

 70%|███

In [62]:
# Convert the test data as well
def create_test_data():
    """Load the images in ``path_testing``, save them and return them.

    Every file listed in ``path_testing`` is read as a grayscale image,
    resized to 50x50 and stored together with its id, i.e. the part of
    the file name before the first dot. The records are shuffled and then
    written with ``np.save`` to ``data/interim/test_data`` under
    ``PROJ_ROOT``; the directory is created if it does not exist yet.

    The saved array has shape ``(n_images, 2)`` and dtype ``object``
    (column 0: image array, column 1: id string), so it has to be read
    back with ``np.load(..., allow_pickle=True)``.

    Returns
    -------
    list
        The shuffled list of ``[image, img_num]`` records.
    """
    testing_data = []
    for filename in tqdm(os.listdir(path_testing)):
        # path to the image
        path = os.path.join(path_testing, filename)

        img_num = filename.split(".")[0]
        image = cv2.imread(path, cv2.IMREAD_GRAYSCALE)
        image = cv2.resize(image, (50, 50))
        testing_data.append([image, img_num])
    shuffle(testing_data)

    # Each record mixes a 2-D array with a string. Recent NumPy versions
    # refuse to convert such ragged data implicitly, so pack it into an
    # explicit object array before saving.
    packed = np.empty((len(testing_data), 2), dtype=object)
    for row, (image, img_num) in enumerate(testing_data):
        packed[row, 0] = image
        packed[row, 1] = img_num

    # np.save does not create missing directories
    interim_dir = os.path.join(PROJ_ROOT, "data", "interim")
    os.makedirs(interim_dir, exist_ok=True)
    np.save(os.path.join(interim_dir, "test_data"), packed)
    return testing_data

In [63]:
# Convert the raw test images; besides returning the shuffled list of
# [image, img_num] records, this call also writes them to
# data/interim/test_data via np.save.
test_data = create_test_data()

100%|██████████| 12500/12500 [00:20<00:00, 607.28it/s]
