In [1]:
from pathlib import Path
import pickle, tarfile, gzip, math, os, time, shutil, glob, matplotlib as mpl, matplotlib.pyplot as plt

In [2]:
import openai

In [3]:
# Take the API key from the environment instead of hard-coding it in the notebook.
# Raises KeyError if OPENAI_API_KEY is not set.
openai.api_key = os.environ["OPENAI_API_KEY"]

In [4]:
%reload_ext genai

For this exercise we'll be using the *Imagenette* dataset, in the version resized so that each image's shortest side is 160 px.

### Get Imagenette Data:

In [5]:
# URL of the Imagenette archive that is downloaded below.
IMAGENETTE_160_DATASET='https://s3.amazonaws.com/fast-ai-imageclas/imagenette2-160.tgz'
# Local data directory, relative to the current working directory; created if absent.
path_data = Path('imagenette_160_data')
path_data.mkdir(exist_ok=True)
# Location of the downloaded archive inside the data directory.
path_tgz = path_data/'imagenette2-160.tgz'

In [6]:
from urllib.request import urlretrieve

In [7]:
# Download the archive only once. The transfer goes to a temporary '.part'
# file that is renamed on success, so an interrupted download never leaves a
# truncated archive that the exists() check would accept on the next run.
if not path_tgz.exists():
    path_tgz_part = path_tgz.with_name(path_tgz.name + '.part')
    urlretrieve(IMAGENETTE_160_DATASET, path_tgz_part)
    path_tgz_part.replace(path_tgz)

In [8]:
!ls -la imagenette_160_data/

total 96696
drwxr-xr-x 3 pete-00 pete-00     4096 May  7 06:18 .
drwxr-xr-x 6 pete-00 pete-00     4096 May  7 10:22 ..
drwxrwxr-x 5 pete-00 pete-00     4096 Feb  6  2021 imagenette2-160
-rw-r--r-- 1 pete-00 pete-00 99003388 May  6 20:11 imagenette2-160.tgz


### Extract Imagenette Data

Our file in this case is a `.tgz` file, which means that it was archived with the *tar* utility and then compressed with the *gzip* algorithm. To extract it with pure Python, we'll use both the `gzip` and `tarfile` modules.

In [9]:
# Extract next to the archive. path_tgz.parent is used instead of path_data
# because path_data is rebound to the extracted folder later in the notebook,
# which would make a re-run of this cell extract into the wrong place.
with gzip.open(path_tgz, 'rb') as f, tarfile.open(fileobj=f, mode='r') as tar:
    # The archive comes from the network: where the running Python supports
    # extraction filters, use the 'data' filter so that archive members cannot
    # be written outside the destination directory.
    if hasattr(tarfile, 'data_filter'):
        tar.extractall(path_tgz.parent, filter='data')
    else:
        tar.extractall(path_tgz.parent)

In [10]:
!ls -l imagenette_160_data/imagenette2-160/train

total 384
drwxr-xr-x 3 pete-00 pete-00 40960 Nov 29  2019 n01440764
drwxr-xr-x 3 pete-00 pete-00 40960 Nov 29  2019 n02102040
drwxr-xr-x 2 pete-00 pete-00 40960 Nov 29  2019 n02979186
drwxr-xr-x 2 pete-00 pete-00 36864 Nov 29  2019 n03000684
drwxr-xr-x 2 pete-00 pete-00 36864 Nov 29  2019 n03028079
drwxr-xr-x 2 pete-00 pete-00 36864 Nov 29  2019 n03394916
drwxr-xr-x 2 pete-00 pete-00 40960 Nov 29  2019 n03417042
drwxr-xr-x 2 pete-00 pete-00 36864 Nov 29  2019 n03425413
drwxr-xr-x 2 pete-00 pete-00 36864 Nov 29  2019 n03445777
drwxr-xr-x 2 pete-00 pete-00 36864 Nov 29  2019 n03888257


### Import PIL... Wait what?

In [11]:
from PIL import Image
import numpy as np

I know, I know, using PIL is cheating. But in the interest of not stretching my abilities too thin, and staying in the spirit of the exercise, I'm declaring it necessary. We'll be converting it to a `.pkl` object anyway, so go easy on me.

In [12]:
# Point path_data at the extracted dataset root. The value is derived from
# path_tgz (which this cell does not modify) rather than from path_data
# itself, so re-running the cell does not append 'imagenette2-160' again.
path_data = path_tgz.parent/'imagenette2-160'

In [13]:
# Directories of the training and validation splits inside the dataset root.
train_path_data = path_data / 'train'
valid_path_data = path_data / 'val'

In [14]:
!ls {path_data}

noisy_imagenette.csv  train  val


In [None]:
def read_images(folder_path):
    """Load every ``.JPEG`` file under ``folder_path`` as an RGB array with an integer label.

    The label name of an image is the name of the directory that directly
    contains it. Label ids are assigned in the order label names are first
    encountered. Directories and files are visited in sorted order, so the
    name -> id mapping is reproducible and is the same for two folders that
    share the same directory layout.

    Args:
        folder_path: Root directory that is searched recursively.

    Returns:
        A list of ``(img_array, label_id)`` tuples, where ``img_array`` is the
        ``np.array`` of the RGB-converted image. The list is empty when no
        ``.JPEG`` file is found.
    """
    data = []
    labels = {}

    for root, dirs, files in os.walk(folder_path):
        # os.walk lists entries in arbitrary order; sorting `dirs` in place
        # fixes the order in which the top-down walk descends into them.
        dirs.sort()
        for file in sorted(files):
            if not file.endswith('.JPEG'):
                continue

            img_path = os.path.join(root, file)
            label_name = os.path.basename(os.path.dirname(img_path))
            # A new label name receives the next free id (0, 1, 2, ...).
            label_id = labels.setdefault(label_name, len(labels))

            # Close the file handle as soon as the pixels have been copied out.
            with Image.open(img_path) as img:
                img_array = np.array(img.convert('RGB'))

            data.append((img_array, label_id))

    return data

In [18]:
def save_data_to_pkl(data, output_file):
    """Serialize ``data`` to ``output_file`` using pickle's highest protocol.

    The target is opened in binary write mode, so an existing file is overwritten.
    """
    with open(output_file, mode='wb') as sink:
        pickle.Pickler(sink, protocol=pickle.HIGHEST_PROTOCOL).dump(data)

In [19]:
train_path_data

PosixPath('imagenette_160_data/imagenette2-160/train')

In [20]:
train_data = read_images(train_path_data)

In [23]:
valid_data = read_images(valid_path_data)

In [24]:
# Bundle the two splits into one 3-tuple; the third slot is left as None.
# NOTE(review): presumably a (train, valid, test) layout with no test split — confirm.
final_data = (train_data, valid_data, None)

In [28]:
len(final_data)

3

In [29]:
save_data_to_pkl(final_data, 'imagenette.pkl')

In [31]:
# Read the pickle file back in (presumably to check that the data round-trips).
# NOTE: pickle.load can execute arbitrary code, so only load files you trust;
# this one was written by save_data_to_pkl earlier in the notebook.
with open('imagenette.pkl', 'rb') as f: 
    pickle_data = pickle.load(f)

In [33]:
pickle_data

([(array([[[161,  31,  18],
           [154,  44,  27],
           [187, 105,  81],
           ...,
           [253,  68, 187],
           [253,  68, 187],
           [253,  68, 187]],
   
          [[147,  32,  13],
           [185,  83,  61],
           [206, 120,  95],
           ...,
           [252,  67, 186],
           [252,  67, 186],
           [252,  67, 186]],
   
          [[135,  48,  18],
           [194, 105,  75],
           [193, 106,  78],
           ...,
           [251,  66, 185],
           [251,  66, 185],
           [251,  66, 185]],
   
          ...,
   
          [[ 40,  50,  25],
           [ 51,  61,  36],
           [ 37,  47,  22],
           ...,
           [ 39,  44,  21],
           [ 23,  31,   7],
           [ 22,  30,   6]],
   
          [[ 22,  32,   7],
           [ 42,  52,  27],
           [ 41,  51,  26],
           ...,
           [ 42,  47,  24],
           [ 24,  32,   8],
           [ 19,  27,   3]],
   
          [[ 27,  37,  12],
        