# Dataset Saving

## UCI ML hand-written digits dataset

- data object type: gray image
    - image size: 8 * 8
- number of data objects: 1797
- label task: classification
    - classes: 0, ..., 9

In [None]:
import os
import cv2 as cv
import numpy as np
from sklearn.datasets import load_digits

# Load scikit-learn's bundled copy of the UCI hand-written digits set.
digits = load_digits()

# The division by 16 assumes pixel intensities span 0..16 (as documented by
# scikit-learn); rescale to 0..255 so the PNGs use the full 8-bit range.
# The final uint8 cast truncates any fractional part.
X_raw = (digits.images.astype(float) / 16 * 255).astype(np.uint8)
# Class labels. NOTE: they are not encoded in the file names written below;
# images are named by their index only.
y = digits.target

dataset_name = 'UCI_handwritten_digits'
# exist_ok=True replaces the check-then-create pattern and keeps the cell
# safely re-runnable.
os.makedirs(dataset_name, exist_ok=True)

for i, img in enumerate(X_raw):
    path = f'{dataset_name}/{i}.png'
    # cv.imwrite signals failure by returning False instead of raising, so a
    # failed write would otherwise leave a silently incomplete dataset.
    if not cv.imwrite(path, img):
        raise OSError(f'failed to write {path}')

## Cifar-10

- data object type: color image
    - image size: 32 * 32
- number of data objects: 50000 (train) + 10000 (test)
- label task: classification
    - classes:
        - 0 = airplane
        - 1 = automobile
        - 2 = bird
        - 3 = cat
        - 4 = deer
        - 5 = dog
        - 6 = frog
        - 7 = horse
        - 8 = ship
        - 9 = truck

In [None]:
import os
import cv2 as cv
import tensorflow_datasets as tfds

# Download (if needed) and load every split of CIFAR-10, then materialise the
# train and test splits as lists of example dicts (keys used here: 'image', 'id').
cifar10 = tfds.load('cifar10', download=True)
cifar10_np = tfds.as_numpy(cifar10)
train = [*cifar10_np['train']]
test = [*cifar10_np['test']]

dataset_name = 'Cifar-10'

for split_name, examples in (('train', train), ('test', test)):
    split_dir = f'{dataset_name}/{split_name}'
    # makedirs creates the parent dataset directory as well; exist_ok keeps
    # the cell re-runnable.
    os.makedirs(split_dir, exist_ok=True)

    for example in examples:
        # TFDS delivers images in RGB channel order, whereas cv.imwrite
        # interprets the array as BGR. Without this swap the saved PNGs have
        # their red and blue channels exchanged.
        img = cv.cvtColor(example['image'], cv.COLOR_RGB2BGR)
        # 'id' is a bytes value; decode it to use it as the file name.
        title = example['id'].decode()
        path = f'{split_dir}/{title}.png'
        # cv.imwrite returns False on failure rather than raising.
        if not cv.imwrite(path, img):
            raise OSError(f'failed to write {path}')

## imagenette

- data object type: color image
    - image size: >= 160 * 160 (shortest side is 160)
- number of data objects: 12894 (train) + 500 (validation)
- label task: classification
    - classes:
        - n01440764 = 'tench'
        - n02102040 = 'English springer'
        - n02979186 = 'cassette player'
        - n03000684 = 'chain saw'
        - n03028079 = 'church'
        - n03394916 = 'French horn'
        - n03417042 = 'garbage truck'
        - n03425413 = 'gas pump'
        - n03445777 = 'golf ball'
        - n03888257 = 'parachute'

In [1]:
import os
import cv2 as cv
import tensorflow_datasets as tfds

# Download (if needed) and load the 160px variant of imagenette, then
# materialise the train and validation splits as lists of example dicts
# (keys used here: 'image', 'label').
imagenette = tfds.load('imagenette/160px', download=True)
imagenette_np = tfds.as_numpy(imagenette)
train = [*imagenette_np['train']]
validation = [*imagenette_np['validation']]

dataset_name = 'imagenette'

for split_name, examples in (('train', train), ('validation', validation)):
    split_dir = f'{dataset_name}/{split_name}'
    # makedirs creates the parent dataset directory as well; exist_ok keeps
    # the cell re-runnable.
    os.makedirs(split_dir, exist_ok=True)

    for i, example in enumerate(examples):
        # TFDS delivers images in RGB channel order, whereas cv.imwrite
        # interprets the array as BGR, so the channels are swapped before
        # saving. (RGB2BGR and BGR2RGB perform the same swap; RGB2BGR names
        # the direction actually intended here.)
        img = cv.cvtColor(example['image'], cv.COLOR_RGB2BGR)
        # The integer class label is embedded in the file name after the index.
        label = example['label']
        path = f'{split_dir}/{i}-{label}.png'
        # cv.imwrite returns False on failure rather than raising.
        if not cv.imwrite(path, img):
            raise OSError(f'failed to write {path}')

## 20 Newsgroups

- data object type: document
- number of data objects: 18846
- label task: classification
    - classes:
        - 'alt.atheism'
        - 'comp.graphics'
        - 'comp.os.ms-windows.misc'
        - 'comp.sys.ibm.pc.hardware'
        - 'comp.sys.mac.hardware'
        - 'comp.windows.x'
        - 'misc.forsale'
        - 'rec.autos'
        - 'rec.motorcycles'
        - 'rec.sport.baseball'
        - 'rec.sport.hockey'
        - 'sci.crypt'
        - 'sci.electronics'
        - 'sci.med'
        - 'sci.space'
        - 'soc.religion.christian'
        - 'talk.politics.guns'
        - 'talk.politics.mideast'
        - 'talk.politics.misc'
        - 'talk.religion.misc'

In [55]:
import os
import numpy as np
from sklearn.datasets import fetch_20newsgroups


# Fetch the train and test posts together (subset='all') with headers, footers
# and quoted replies stripped. `remove` is given as a tuple, the type
# scikit-learn documents for it; recent releases validate parameter types and
# may reject a list. With return_X_y=True the call returns (data, target).
X, y = fetch_20newsgroups(subset='all', remove=('headers', 'footers', 'quotes'), return_X_y=True)