In [7]:
from utils.data_setupper.downloader import *
from utils.data_setupper.dataset_reader import *
from utils.data_setupper.extractors import *
from utils.data_setupper.augmentation import *
import torch
import torchvision
import torch.nn as nn
import os

# Notebook usage
This notebook streamlines, as much as possible, the data setup for the project. We have trained models on data downloaded from the internet 
as well as on datasets already available in PyTorch. For the latter, there are functions already implemented in main.py, since we can work directly with the objects instantiated by the library.

This notebook will help you set up the data in the correct directory structure when downloading from Kaggle or the Internet.
It will be used to download the data, extract it, and set it up in the correct directory structure.
If you want to work using datasets preloaded in Torch or timm, you can skip this notebook.

Everything that needs to be edited is marked with "@edit". The notebook is procedural, so do not skip steps and run one cell at a time. 

## Download Data from Kaggle

In [2]:
# Kaggle download settings: adjust every value tagged "@edit" before running this cell.
# The dataset can be saved anywhere, but the "datasets" folder is recommended.
# Always give an absolute path, never a relative one.
# @edit
root = "/home/disi/ml/datasets"
# @edit
# alternative dataset: "https://www.kaggle.com/datasets/prasanshasatpathy/soil-types"
dataset_url = "https://www.kaggle.com/datasets/wenewone/cub2002011"
download_dataset_kaggle(
    dataset_url,
    root,
)
# @edit extract the archive if needed
# NOTE(review): the first argument below is built from the URL; it presumably has to be
# the local path of the downloaded archive instead — confirm against extract_tgz.
#extract_tgz(f"{dataset_url}/datasets.tar.gz", root)

Dataset URL: https://www.kaggle.com/datasets/wenewone/cub2002011
Kaggle Dataset downloaded successfully.


## Download dataset from a URL

In [None]:
# @edit
dataset_url = "https://www.robots.ox.ac.uk/~vgg/data/flowers/102/102flowers.tgz"
root = "/home/disi/ml/datasets"
i = download_dataset(dataset_url, root)
# If the download does not need unpacking, move it into the datasets folder by hand;
# otherwise the extraction call below will do it for you.
# @edit local path of the downloaded archive. This must be a path on disk, not the
# remote URL (a URL with a file name appended cannot be opened as an archive).
# NOTE(review): the default assumes download_dataset stores the file inside `root`
# under the last component of the URL ("102flowers.tgz") — confirm, and adjust
# archive_path if the helper saves it elsewhere or under another name.
archive_path = os.path.join(root, os.path.basename(dataset_url))
extract_tgz(archive_path, root)

## Augment the dataset (optional)

In [None]:
# @edit
# the folder name appended to root must begin with a "/"
dataset_folder = root + "/flowers"

# Augmentation settings, collected in one mapping and passed on as keyword arguments.
augmentation_options = dict(
    rotation_range=40,
    width_shift_range=0.2,
    height_shift_range=0.2,
    shear_range=0.2,
    zoom_range=0.2,
    horizontal_flip=True,
    fill_mode='nearest',
)
image_generator = setup_data_generator(**augmentation_options)

# Call the train/val/test folder helper with sizes 0.7 / 0.15 / 0.15,
# then run the cleanup helper on the same dataset folder.
create_train_val_test_folders(
    dataset_folder,
    train_size=0.7,
    val_size=0.15,
    test_size=0.15,
)
cleanup(dataset_folder)

In [None]:
# Upper bound on how many batches the augmentation loop will consume.
n_of_batches = 50

# Batches are read from the "train" sub-folder of the dataset folder.
directory_path = dataset_folder + "/train"
train_generator = load_data_from_directory(
    directory_path,
    target_size=(300, 300),
    batch_size=32,
)

# TODO: write how the number of batches influences the actual amount of images generated

In [None]:
# Save the augmented images batch by batch, stopping once n_of_batches batches were handled.
for i, (images, labels) in enumerate(train_generator):
    if i < n_of_batches:
        save_augmented_images(images, labels, directory_path, train_generator.class_indices)
    else:
        break

## Train-Test-Val-Split
If you augmented the images, you are good to go: the dataset is ready to be used in the main function. Otherwise, edit and execute the following block of code.


In [7]:
# @edit
# Paths are built by appending to root, so every appended folder name must start with a "/".
root = "/home/disi/ml/datasets"
# Folder that is handed to the train/val/test split in the next cell.
folder_target = f"{root}/CUB_200_2011/images"
dataset_folder = folder_target # optional suffix can be appended here, e.g. + "/jpg"

In [None]:
# Call the train/val/test folder helper on dataset_folder with sizes 0.7 / 0.15 / 0.15.
create_train_val_test_folders(
    dataset_folder,
    train_size=0.7,
    val_size=0.15,
    test_size=0.15,
)

# Last step
Once you reach the train-test-val configuration, run this final function to create a folder, named as you wish, that will contain train, val, and test. This folder will be our root in main.py. 

In [15]:
# @edit
location = root
newfolder_name = ''
# Build "<location>/<split>/" for each of train, val and test.
train_folder, val_folder, test_folder = (
    f'{location}/{split}/' for split in ('train', 'val', 'test')
)
final_structure(location, newfolder_name, train_folder, val_folder, test_folder)