<a href="https://colab.research.google.com/github/EricHeidbreder/data-centric-ai/blob/eric_h/data_centric_ai_comp_copy.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:

import requests
import tarfile
import io
import numpy as np
from PIL import Image, ImageTk
from matplotlib import pyplot as plt
import pandas as pd
from keras.preprocessing.image import ImageDataGenerator, array_to_img, img_to_array, load_img
from tensorflow.python.keras.preprocessing import dataset_utils
import os
import tensorflow as tf
from tensorflow import keras
import json
import sys
import matplotlib.pyplot as plt
# from google.colab.patches import cv2_imshow

import shutil
import random

In [None]:
# Generator for training images: pixel values are multiplied by 1/255 and
# random rotation, shift, shear and zoom augmentations are applied.
# Horizontal flipping is explicitly disabled.
train_datagen = ImageDataGenerator(
    rescale=1/255,
    rotation_range=20,
    width_shift_range=0.2,
    height_shift_range=0.2,
    shear_range=0.2,
    zoom_range=0.2,
    horizontal_flip=False,
)

# Generator for validation images: rescaling only, no augmentation.
valid_datagen = ImageDataGenerator(
    rescale=1/255,
)

In [1]:
 # Count the entries in the training folder of class "ii".
 # The path is assembled with os.path.join so it resolves on any OS; the
 # previous hard-coded backslashes only worked on Windows.
 batch_size = len(os.listdir(os.path.join('.', 'data_sorted', 'train', 'ii')))
 batch_size

3

In [None]:
# Root folder of the sorted dataset used by the balancing cell.
# os.path.join keeps the path valid on every OS; the previous literal used
# backslashes, which are not path separators outside Windows.
data_sorted_path = os.path.join('.', 'data_sorted_copy')
# data_sorted_path = os.path.join('.', 'data_sorted')

# Names of the top-level entries found directly under the dataset root
class_folders = os.listdir(data_sorted_path)

In [None]:
# Balance the subgroups inside every class by oversampling.
# Expected layout: <data_sorted_path>/<folder>/<class>/<subgroup>/<files>.
# Within each class, every subgroup is padded with randomly chosen copies of
# its own files until it holds as many files as the largest subgroup.

# Top-level entries that must not be balanced.
# TODO: remove val after subgroups are added to that folder
skipped_top_level = ('test', '.DS_Store', 'val')
# Entries inside a top-level folder that are not real classes.
skipped_classes = ('.DS_Store', 'junk_vals')

for folder in class_folders:
  folder_path = os.path.join(data_sorted_path, folder)
  if folder in skipped_top_level or not os.path.isdir(folder_path):
    continue

  # The listing is iterated directly instead of being stored back into
  # class_folders: rebinding the list that drives the outer loop made this
  # cell fail when it was run a second time.
  for class_folder in os.listdir(folder_path):
    class_path = os.path.join(folder_path, class_folder)
    if class_folder in skipped_classes or not os.path.isdir(class_path):
      continue

    # List every subgroup folder (example i_lowercase, i_ruled, etc.) once.
    # Plain files sitting next to the subgroup folders are ignored.
    subgroup_listings = {}
    for class_folder_subgroup in os.listdir(class_path):
      class_subgroup_path = os.path.join(class_path, class_folder_subgroup)
      if os.path.isdir(class_subgroup_path):
        subgroup_listings[class_subgroup_path] = os.listdir(class_subgroup_path)

    # Size of the largest subgroup; every other subgroup is grown to match it
    max_subgroup_len = max(
      (len(listing) for listing in subgroup_listings.values()),
      default=0,
    )

    for class_subgroup_path, subgroup_images in subgroup_listings.items():
      # An empty subgroup has nothing to sample from, so it cannot be padded
      if not subgroup_images:
        continue

      num_copies_to_make = max_subgroup_len - len(subgroup_images)

      # Make the copies using random sampling of the existing images.
      # The copy index in the file name keeps repeated picks of the same
      # source image from overwriting each other.
      for i in range(num_copies_to_make):
        random_image = random.choice(subgroup_images)
        shutil.copyfile(
          os.path.join(class_subgroup_path, random_image),
          os.path.join(class_subgroup_path, f'copy_{i}_{random_image}'),
        )

In [None]:
# Class labels; each one names a sub-folder of the train and val folders
class_names = ["i", "ii", "iii", "iv", "v", "vi", "vii", "viii", "ix", "x"]

# Training data: for each class, draw one batch (up to 500 images) from the
# augmenting generator so the augmented images are written to
# ./data_preprocessed/train/<class>.
for class_name in class_names:
  train_output_dir = os.path.join('./data_preprocessed/train', class_name)
  # The generator does not create save_to_dir, so make sure it exists
  os.makedirs(train_output_dir, exist_ok=True)

  train_generator = train_datagen.flow_from_directory(
    './data_sorted/train',
    target_size=(32, 32),
    batch_size=500,
    classes=[class_name],
    save_to_dir=train_output_dir,
    save_prefix='aug',
    shuffle=True
  )
  # Images are only saved when a batch is actually generated
  batch = next(train_generator)

# Validation data: for each class, draw a single unshuffled batch sized to
# the number of files directly inside the class folder, writing the rescaled
# images to ./data_preprocessed/val/<class>.
for class_name in class_names:
  val_input_dir = os.path.join('./data_sorted/val', class_name)
  val_output_dir = os.path.join('./data_preprocessed/val', class_name)
  # The generator does not create save_to_dir, so make sure it exists
  os.makedirs(val_output_dir, exist_ok=True)

  # Only regular files are counted; sub-folders are not
  batch_size = len([
    f for f in os.listdir(val_input_dir)
    if os.path.isfile(os.path.join(val_input_dir, f))
  ])
  validation_generator = valid_datagen.flow_from_directory(
    './data_sorted/val',
    target_size=(32, 32),
    class_mode='categorical',
    classes=[class_name],
    batch_size=batch_size,
    shuffle=False,
    save_to_dir=val_output_dir
  )
  batch = next(validation_generator)

In [None]:
# Locations of the preprocessed train / validation / test image folders.
directory = "./data_preprocessed"
user_data = directory + "/train"
valid_data = directory + "/val"
test_data = directory + "/test" # this can be the label book, or any other test set you create

### DO NOT MODIFY BELOW THIS LINE, THIS IS THE FIXED MODEL ###
# Batch size shared by all three datasets, and the TensorFlow random seed.
batch_size = 8
tf.random.set_seed(123)


if __name__ == "__main__":
    # Training set: labels are inferred from the ten class sub-folders and
    # one-hot encoded ("categorical"); images are loaded at 32x32 and shuffled.
    train = tf.keras.preprocessing.image_dataset_from_directory(
        user_data,# + '/train',
        labels="inferred",
        label_mode="categorical",
        class_names=["i", "ii", "iii", "iv", "v", "vi", "vii", "viii", "ix", "x"],
        shuffle=True,
        seed=123,
        batch_size=batch_size,
        image_size=(32, 32),
    )

    # Validation set: loaded with the same settings as the training set
    # (including shuffle=True).
    valid = tf.keras.preprocessing.image_dataset_from_directory(
        valid_data,# + '/val',
        labels="inferred",
        label_mode="categorical",
        class_names=["i", "ii", "iii", "iv", "v", "vi", "vii", "viii", "ix", "x"],
        shuffle=True,
        seed=123,
        batch_size=batch_size,
        image_size=(32, 32),
    )

    # Size guard: stop if train + validation exceed 10,000 examples.
    # NOTE(review): cardinality() presumably counts batches here, so the
    # product overestimates when the last batch is partial -- confirm.
    total_length = ((train.cardinality() + valid.cardinality()) * batch_size).numpy()
    if total_length > 10_000:
        print(f"Dataset size larger than 10,000. Got {total_length} examples")
        sys.exit()

    # Test set: same loading settings, but kept in directory order
    # (shuffle=False).
    test = tf.keras.preprocessing.image_dataset_from_directory(
        test_data,
        labels="inferred",
        label_mode="categorical",
        class_names=["i", "ii", "iii", "iv", "v", "vi", "vii", "viii", "ix", "x"],
        shuffle=False,
        seed=123,
        batch_size=batch_size,
        image_size=(32, 32),
    )

    # Backbone: a ResNet50 without its classification head and without
    # pretrained weights (weights=None) ...
    base_model = tf.keras.applications.ResNet50(
        input_shape=(32, 32, 3),
        include_top=False,
        weights=None,
    )
    # ... cut off at the output of the "conv2_block3_out" layer.
    base_model = tf.keras.Model(
        base_model.inputs, outputs=[base_model.get_layer("conv2_block3_out").output]
    )

    # Full model: ResNet input preprocessing -> truncated backbone ->
    # global average pooling -> 10-unit dense layer. The dense layer has no
    # activation, so the model outputs logits.
    inputs = tf.keras.Input(shape=(32, 32, 3))
    x = tf.keras.applications.resnet.preprocess_input(inputs)
    x = base_model(x)
    x = tf.keras.layers.GlobalAveragePooling2D()(x)
    x = tf.keras.layers.Dense(10)(x)
    model = tf.keras.Model(inputs, x)

    # from_logits=True matches the activation-free output layer above.
    model.compile(
        optimizer=tf.keras.optimizers.Adam(lr=0.0001),
        loss=tf.keras.losses.CategoricalCrossentropy(from_logits=True),
        metrics=["accuracy"],
    )
    model.summary()
    # Baseline: validation loss/accuracy of the untrained model.
    loss_0, acc_0 = model.evaluate(valid)
    print(f"loss {loss_0}, acc {acc_0}")

    # Save weights only, and only when validation accuracy improves.
    checkpoint = tf.keras.callbacks.ModelCheckpoint(
        "best_model",
        monitor="val_accuracy",
        mode="max",
        save_best_only=True,
        save_weights_only=True,
    )

    # Train for 100 epochs, validating and checkpointing along the way.
    history = model.fit(
        train,
        validation_data=valid,
        epochs=100,
        callbacks=[checkpoint],
    )

    # Restore the checkpointed weights before the final evaluations.
    model.load_weights("best_model")

    loss, acc = model.evaluate(valid)
    print(f"final loss {loss}, final acc {acc}")

    test_loss, test_acc = model.evaluate(test)
    print(f"test loss {test_loss}, test acc {test_acc}")

   

In [None]:
# View predictions on the test set: take the index of the highest-scoring
# output per image and add 1 so the value is 1-based instead of 0-based.
pred_classes = np.argmax(model.predict(test), axis=-1) + 1

pred_classes