# Dataset Download

## UCI ML hand-written digits datasets

- data object type: gray image
    - image size: 8 * 8
- number of data objects: 1797
- label task: classification
    - classes: 0, ..., 9

Reference

1. sklearn dataset download api ([link](https://scikit-learn.org/stable/modules/generated/sklearn.datasets.load_digits.html))
2. dataset homepage ([link](https://archive.ics.uci.edu/ml/datasets/Optical+Recognition+of+Handwritten+Digits))

In [None]:
import os
import cv2 as cv
import numpy as np
from sklearn.datasets import load_digits

# Load the hand-written digits dataset bundled with scikit-learn.
digits = load_digits()

# Rescale the pixel intensities (the division assumes a 0..16 range) to
# 0..255 so each image can be written out as an 8-bit grayscale PNG.
X_raw = (digits.images.astype(float) / 16 * 255).astype(np.uint8)
y = digits.target

dataset_name = 'UCI_handwritten_digits'
# exist_ok avoids the check-then-create race and keeps the cell re-runnable.
os.makedirs(dataset_name, exist_ok=True)

# One PNG per image, named after its index in the dataset.
for i, img in enumerate(X_raw):
    cv.imwrite(f'{dataset_name}/{i}.png', img)

## Cifar-10

- data object type: color image
    - image size: 32 * 32
- number of data objects: 50000 (train) + 10000 (test)
- label task: classification
    - classes:
        - 0 = airplane
        - 1 = automobile
        - 2 = bird
        - 3 = cat
        - 4 = deer
        - 5 = dog
        - 6 = frog
        - 7 = horse
        - 8 = ship
        - 9 = truck

Reference

1. tensorflow dataset download api ([link](https://www.tensorflow.org/datasets/catalog/cifar10))
2. dataset homepage ([link](https://www.cs.toronto.edu/~kriz/cifar.html))

In [None]:
import os
import cv2 as cv
import tensorflow_datasets as tfds

# Download CIFAR-10 through TFDS and materialise both splits as lists of
# numpy feature dicts (this cell reads the 'image' and 'id' entries).
cifar10 = tfds.load('cifar10', download=True)
train = [*tfds.as_numpy(cifar10)['train']]
test = [*tfds.as_numpy(cifar10)['test']]

dataset_name = 'Cifar-10'

for split_name, samples in (('train', train), ('test', test)):
    split_dir = f'{dataset_name}/{split_name}'
    # makedirs also creates the parent dataset directory; exist_ok keeps
    # the cell re-runnable without a separate existence check.
    os.makedirs(split_dir, exist_ok=True)

    for sample in samples:
        # TFDS decodes images in RGB channel order while cv.imwrite expects
        # BGR; without this swap every PNG has red and blue exchanged.
        img = cv.cvtColor(sample['image'], cv.COLOR_RGB2BGR)
        title = sample['id'].decode()
        cv.imwrite(f'{split_dir}/{title}.png', img)

## imagenette

- data object type: color image
    - image size: >= 160 * 160 (shortest side is 160)
- number of data objects: 12894 (train) + 500 (validation)
- label task: classification
    - classes:
        - n01440764 = 'tench'
        - n02102040 = 'English springer'
        - n02979186 = 'cassette player'
        - n03000684 = 'chain saw'
        - n03028079 = 'church'
        - n03394916 = 'French horn'
        - n03417042 = 'garbage truck'
        - n03425413 = 'gas pump'
        - n03445777 = 'golf ball'
        - n03888257 = 'parachute'

Reference

1. tensorflow dataset download api ([link](https://www.tensorflow.org/datasets/catalog/imagenette))
2. dataset homepage ([link](https://github.com/fastai/imagenette))

In [None]:
import os
import cv2 as cv
import tensorflow_datasets as tfds

# Download the 160px variant of imagenette through TFDS and materialise
# both splits as lists of numpy feature dicts ('image' and 'label' are used).
imagenette = tfds.load('imagenette/160px', download=True)
train = [*tfds.as_numpy(imagenette)['train']]
validation = [*tfds.as_numpy(imagenette)['validation']]

dataset_name = 'imagenette'

for split_name, samples in (('train', train), ('validation', validation)):
    split_dir = f'{dataset_name}/{split_name}'
    # makedirs also creates the parent dataset directory.
    os.makedirs(split_dir, exist_ok=True)

    for i, sample in enumerate(samples):
        # TFDS yields RGB arrays, but cv.imwrite expects BGR, so the
        # channels are swapped before writing. (COLOR_RGB2BGR performs the
        # same swap as COLOR_BGR2RGB; it just names the direction correctly.)
        img = cv.cvtColor(sample['image'], cv.COLOR_RGB2BGR)
        label = sample['label']
        # File name: <index within split>-<integer class label>.png
        cv.imwrite(f'{split_dir}/{i}-{label}.png', img)

## 20 Newsgroups

- data object type: text
- number of data objects: 18846 (total), 18466 (empty body filtered)
- task: classification
    - classes:
        - 'alt.atheism'
        - 'comp.graphics'
        - 'comp.os.ms-windows.misc'
        - 'comp.sys.ibm.pc.hardware'
        - 'comp.sys.mac.hardware'
        - 'comp.windows.x'
        - 'misc.forsale'
        - 'rec.autos'
        - 'rec.motorcycles'
        - 'rec.sport.baseball'
        - 'rec.sport.hockey'
        - 'sci.crypt'
        - 'sci.electronics'
        - 'sci.med'
        - 'sci.space'
        - 'soc.religion.christian'
        - 'talk.politics.guns'
        - 'talk.politics.mideast'
        - 'talk.politics.misc'
        - 'talk.religion.misc'

Reference

1. sklearn dataset download api ([link](https://scikit-learn.org/stable/modules/generated/sklearn.datasets.fetch_20newsgroups.html))
2. dataset homepage ([link](http://qwone.com/~jason/20Newsgroups/))

In [None]:
import json
from sklearn.datasets import fetch_20newsgroups


# Fetch every post with headers, footers and quoted replies stripped so that
# only the body text remains. (Drop the `remove` argument to keep the raw posts.)
X, y = fetch_20newsgroups(subset='all', remove=['headers', 'footers', 'quotes'], return_X_y=True)

# filter data objects with empty body
# Texts and labels are filtered together in a single pass so they stay
# aligned, instead of testing each index against a list of empty indices.
non_empty = [(text, label) for text, label in zip(X, y) if len(text) > 0]
X = [text for text, _ in non_empty]
y = [label for _, label in non_empty]

# Only the texts are written out; the labels stay in `y`.
with open('20newsgroups.json', 'w', encoding='utf-8') as f:
    json.dump(X, f, ensure_ascii=False, indent=4)

## IMDb Movie Reviews

- data object type: text
- number of data objects: 25000 (train) + 25000 (test) + 50000 (unsupervised, unlabeled; this is the split exported by the code below)
- task: classification
    - classes:
        - 0: negative
        - 1: positive

Reference

1. tensorflow dataset download api ([link](https://www.tensorflow.org/datasets/catalog/imdb_reviews))
2. dataset homepage ([link](https://ai.stanford.edu/~amaas/data/sentiment/))

In [None]:
# json is imported here so the cell runs on its own instead of relying on
# the import made by an earlier cell.
import json

import tensorflow_datasets as tfds

imdb = tfds.load('imdb_reviews', download=True)
# Export the review text of the 'unsupervised' split; the byte strings
# returned by TFDS are decoded to str so they can be serialised as JSON.
X = [d['text'].decode() for d in tfds.as_numpy(imdb)['unsupervised']]

with open('imdb-reviews.json', 'w', encoding='utf-8') as f:
    json.dump(X, f, ensure_ascii=False, indent=4)

## KTH

- data object type: video
- number of data objects: 600
- task: classification
    - classes:
        - 'walking'
        - 'jogging'
        - 'running'
        - 'boxing'
        - 'handwaving'
        - 'handclapping'

Reference

1. dataset homepage ([link](https://www.csc.kth.se/cvap/actions/))

In [None]:
import glob
import moviepy.editor as moviepy
import os
import progressbar
import urllib.request
import zipfile

# Root directory that receives the downloaded archives and converted clips.
dataset_name = 'KTH-Action-dataset'
# exist_ok avoids the check-then-create race and keeps the cell re-runnable.
os.makedirs(dataset_name, exist_ok=True)

class MyProgressBar():
    """Progress callback passed as the reporthook of ``urlretrieve``.

    The bar is created lazily on the first callback, because that is the
    first moment the total size is handed to this object.
    """

    def __init__(self):
        # Created on the first __call__, once total_size is known.
        self.pbar = None

    def __call__(self, block_num, block_size, total_size):
        """Advance the bar for one callback.

        Args:
            block_num: number of blocks transferred so far.
            block_size: size of one block.
            total_size: total size used as the bar's maximum value.
        """
        # Explicit None check: the decision must not depend on whether the
        # progress bar object happens to evaluate as falsy.
        if self.pbar is None:
            self.pbar = progressbar.ProgressBar(maxval=total_size)
            self.pbar.start()

        downloaded = block_num * block_size
        if downloaded < total_size:
            self.pbar.update(downloaded)
        else:
            # The last block may overshoot total_size; treat that as done.
            self.pbar.finish()

# Action classes of the KTH dataset; each one is published as its own archive.
categories = ['boxing', 'handclapping', 'handwaving', 'jogging', 'running', 'walking']

# Per-class download descriptor: class name, archive URL and the local
# file name the archive is saved under.
category_attrs = [
    dict(
        name=action,
        link=f'http://www.nada.kth.se/cvap/actions/{action}.zip',
        zipname=f'{action}.zip',
    )
    for action in categories
]

# For every action category: download its archive, unpack it, convert the
# extracted .avi clips to .mp4 and delete the intermediate files.
for category_attr in category_attrs:
    category = category_attr['name']
    link = category_attr['link']
    zipname = category_attr['zipname']

    zipfile_path = f'./{dataset_name}/{zipname}'
    category_dir = f'./{dataset_name}/{category}'

    # download videos
    urllib.request.urlretrieve(link, zipfile_path, MyProgressBar())

    # unzip videos
    with zipfile.ZipFile(zipfile_path, 'r') as zip_ref:
        zip_ref.extractall(category_dir)

    file_paths = glob.glob(f'{category_dir}/*.avi')
    for file_path in file_paths:
        # convert avi to mp4
        # splitext strips only the trailing extension, so a '.avi' that
        # appears elsewhere in the path cannot truncate the target name.
        stem = os.path.splitext(file_path)[0]
        clip = moviepy.VideoFileClip(file_path)
        try:
            clip.write_videofile(f'{stem}.mp4')
        finally:
            # Release the reader; an open handle leaks resources and
            # prevents the source file from being deleted on Windows.
            clip.close()

        # The source clip is no longer needed once its mp4 exists.
        os.remove(file_path)

    os.remove(zipfile_path)

In [None]:
# glob is imported here so the cell does not depend on an earlier cell's import.
import glob
import os
import shutil

# flatten the dataset: move every file out of its per-category directory
# into the dataset root, then remove the category directory.
# Uses `categories` and `dataset_name` defined by the download cell.
for category in categories:
    category_dir = f'./{dataset_name}/{category}'
    for file_path in glob.glob(f'{category_dir}/*'):
        # basename works with the platform's path separators; splitting on
        # a backslash only produced the file name on Windows.
        filename = os.path.basename(file_path)
        os.rename(file_path, f'./{dataset_name}/{filename}')
    if os.path.exists(category_dir):
        shutil.rmtree(category_dir)