<a href="https://colab.research.google.com/github/mathun3003/sight_seeking/blob/main/notebooks/preprocessing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from google.colab import drive
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.utils import image_dataset_from_directory
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle
import matplotlib.pyplot as plt
import os, sys
import cv2
import numpy as np
from PIL import Image
import logging
import time
import random
import json

In [None]:
# make the Google Drive contents available under /content/drive/
drive.mount('/content/drive/', force_remount=True)

# location of the raw image folders inside the mounted drive
base_path = "/content/drive/My Drive"
imgs_path = "/Sight_Seeking/data/"
gdrive_path = base_path + imgs_path

# the first os.walk entry describes gdrive_path itself; its second element
# lists the immediate sub-folders, each of which gets a trailing slash here
dirs = [f"{sub_dir}/" for sub_dir in next(os.walk(gdrive_path))[1]]

Mounted at /content/drive/


In [None]:
# create function for image resizing
def resize_images_in_dir(dir: str, output_shape: tuple=(224, 224), quality: int=90) -> None:
  """Resize every image in ``dir`` and store it as an RGB JPEG in the matching ``pp_data`` folder.

  The target folder is derived from ``dir`` by replacing ``"data/"`` with
  ``"pp_data/"`` and is created if it does not exist yet. Each output file is
  named ``<original stem>_resized.jpg``, so two sources that differ only in
  their extension (e.g. ``a.jpg`` and ``a.png``) are written to the same
  output file and the later one overwrites the earlier one.

  Args:
    dir: Folder containing the source images.
    output_shape: Target ``(width, height)`` handed to ``Image.resize``.
    quality: JPEG quality handed to ``Image.save``.

  Raises:
    ValueError: If ``dir`` contains an entry that is not a regular file.
  """
  print("Resizing images in {}".format(dir))
  # mirror the source folder by swapping "data/" for "pp_data/" in its path
  out_dir = dir.replace("data/", "pp_data/")
  # the first save (and the count below) would fail if the folder were missing
  os.makedirs(out_dir, exist_ok=True)
  # for every image in dir
  for item in os.listdir(dir):
    src_path = os.path.join(dir, item)
    # only regular files are accepted; report which entry was rejected
    if not os.path.isfile(src_path):
      raise ValueError("Expected a file but found a non-file entry: {}".format(src_path))
    # file name without its extension
    stem = os.path.splitext(item)[0]
    # context manager releases the file handle as soon as the pixels are processed
    with Image.open(src_path) as im:
      # convert to RGB *before* resizing: JPEG cannot store alpha/palette data,
      # and Pillow falls back to NEAREST resampling for palette ("P") images
      rgb_im = im if im.mode == "RGB" else im.convert("RGB")
      # LANCZOS is the filter formerly exposed as Image.ANTIALIAS (removed in Pillow 10)
      im_resized = rgb_im.resize(output_shape, Image.LANCZOS)
    # save() is synchronous, so no waiting is needed afterwards
    im_resized.save(os.path.join(out_dir, stem + '_resized.jpg'), 'JPEG', quality=quality)
  # print "logs"
  print(f"""
  Number of images before scaling: {len(os.listdir(dir))}
  Number of images after scaling: {len(os.listdir(out_dir))}
  """)

In [None]:
# run the resize step once for every class folder found on the drive
for class_folder in dirs:
  resize_images_in_dir(gdrive_path + class_folder)

Resizing images in /content/drive/My Drive/Sight_Seeking/data/Schloss_Münster/

  Number of images before scaling: 90
  Number of images after scaling: 89
  
Resizing images in /content/drive/My Drive/Sight_Seeking/data/St._Paulus_Dom_Münster/

  Number of images before scaling: 80
  Number of images after scaling: 80
  
Resizing images in /content/drive/My Drive/Sight_Seeking/data/Erbdrostenhof_münster/

  Number of images before scaling: 68
  Number of images after scaling: 68
  
Resizing images in /content/drive/My Drive/Sight_Seeking/data/kiepenkerl_denkmal_münster/

  Number of images before scaling: 69
  Number of images after scaling: 69
  
Resizing images in /content/drive/My Drive/Sight_Seeking/data/LWL-Museum_für_kunst_und_kultur_münster/

  Number of images before scaling: 13
  Number of images after scaling: 13
  
Resizing images in /content/drive/My Drive/Sight_Seeking/data/Sankt_Lamberti_Münster/

  Number of images before scaling: 72
  Number of images after scali

In [None]:
# Build the lookup from class-folder name to integer label
base_dir = '/content/drive/My Drive/Sight_Seeking'
directories = os.listdir(base_dir + "/pp_data")
# NOTE: os.listdir returns entries in arbitrary order, so the index given to
# each class follows whatever order the filesystem reports
labels_dict = dict(zip(directories, range(len(directories))))
labels_dict

{'Schloss_Münster': 0,
 'St._Paulus_Dom_Münster': 1,
 'Erbdrostenhof_münster': 2,
 'kiepenkerl_denkmal_münster': 3,
 'LWL-Museum_für_kunst_und_kultur_münster': 4,
 'Sankt_Lamberti_Münster': 5,
 'Buddenturm_Münster': 6,
 'Aaseekugeln_Münster': 7,
 'Antiquariat_Münster': 8,
 'Provinzial_Münster': 9,
 'Cavete_Münster': 10,
 'Rathaus_Münster': 11}

In [None]:
# persist the label lookup as JSON so predictions can be mapped back to class names after inference
with open(base_dir + "/labels.json", "w") as labels_file:
  labels_file.write(json.dumps(labels_dict, indent=4))

In [None]:
# get number of classes; taken from `directories` so the one-hot length
# always agrees with the index produced by enumerate() below
num_classes = len(directories)

# Load images and convert them to numpy arrays
images, labels = [], []
for i, class_dir in enumerate(directories):
  print("Collecting images from {}".format(class_dir))
  # one-hot vector with a 1 at the position of the current class
  label = np.zeros(num_classes)
  label[i] = 1
  class_path = base_dir + "/pp_data/" + class_dir
  for image_name in os.listdir(class_path):
    # context manager closes the image file once the pixels are copied out
    with Image.open(class_path + "/" + image_name) as img:
      # convert everything that is not plain RGB (checking the mode also
      # catches 3-band non-RGB modes that a band count would let through)
      rgb_img = img if img.mode == "RGB" else img.convert("RGB")
      image_array = np.array(rgb_img)
    # add image
    images.append(image_array)
    # add label
    labels.append(label)
# create trainset
dataset = (np.array(images), np.stack(labels, axis=0))

Collecting images from Schloss_Münster
Collecting images from St._Paulus_Dom_Münster
Collecting images from Erbdrostenhof_münster
Collecting images from kiepenkerl_denkmal_münster
Collecting images from LWL-Museum_für_kunst_und_kultur_münster
Collecting images from Sankt_Lamberti_Münster
Collecting images from Buddenturm_Münster
Collecting images from Aaseekugeln_Münster
Collecting images from Antiquariat_Münster
Collecting images from Provinzial_Münster
Collecting images from Cavete_Münster
Collecting images from Rathaus_Münster


In [None]:
# Shuffle images and labels in unison (fixed seed)
images, labels = shuffle(*dataset, random_state=42)
# Hold out 30% of the shuffled samples as the test set (fixed seed)
images_train, images_test, labels_train, labels_test = train_test_split(
  images,
  labels,
  test_size=0.3,
  random_state=42,
)

In [None]:
# store train and test sets: each target file is paired with the array it receives
split_files = {
  "/content/drive/My Drive/train_test_sets/images_train.npy": images_train,
  "/content/drive/My Drive/train_test_sets/images_test.npy": images_test,
  "/content/drive/My Drive/train_test_sets/labels_train.npy": labels_train,
  "/content/drive/My Drive/train_test_sets/labels_test.npy": labels_test,
}
for target_path, split_array in split_files.items():
  with open(target_path, "wb") as out_file:
    np.save(out_file, split_array)