# Get the subset of ImageNet from Kaggle
#### Note that this requires downloading the entire dataset from [Kaggle](https://www.kaggle.com/c/imagenet-object-localization-challenge/overview/description).

### 0. Import necessary modules

In [None]:
import os
import zipfile
import shutil
import pandas as pd
from PIL import Image
import random
import warnings

# Seed Python's `random` module (the sampling cell at the end calls random.sample)
random.seed(42)
# Suppress every warning category for the rest of the session
warnings.filterwarnings("ignore")


### 1. Install Kaggle

In [None]:
!pip install kaggle


### 2. Obtain your Kaggle API token (it must be downloaded manually)

* Go to the Kaggle website and log in.
* Click on your profile picture at the top right and navigate to **Settings**.
* Scroll down to the **API** section and click on **Create New Token**.
* This action will download a kaggle.json file.

### 3. Move the "kaggle.json" file to "~/.kaggle" directory

Create the .kaggle directory in your home folder

In [None]:
!mkdir -p ~/.kaggle

Move the kaggle.json file. Replace '/Users/your_user/Downloads/kaggle.json' with the actual path to the downloaded file.

In [None]:
!mv /Users/your_user/Downloads/kaggle.json ~/.kaggle/

Set the file permissions

In [None]:
!chmod 600 ~/.kaggle/kaggle.json

### 4. Download the dataset

In [None]:
!kaggle competitions download -c imagenet-object-localization-challenge


### 5. Extract (unzip) the downloaded file

In [None]:
# Zip archive to unpack, and the directory its contents are written into
zip_file_path = 'imagenet-object-localization-challenge.zip'
extract_to_dir = '.'
os.makedirs(extract_to_dir, exist_ok=True)

# Unpack every member of the archive under extract_to_dir
with zipfile.ZipFile(zip_file_path, mode='r') as archive:
    archive.extractall(path=extract_to_dir)

print(f"Files extracted to {extract_to_dir}")


### 6. Copy a subset to the "resources/datasets" directory (and resize the images to 300x300)

Install Pillow, which is used for resizing the images

In [None]:
!pip install Pillow


Load images table

In [None]:
# One row per image to copy; the copy loop below reads the columns
# phase_source, image_folder, image_name_full, phase_destination and animal.
images_list_csv = 'subset_of_imagenet_images_list.csv'
images_df = pd.read_csv(images_list_csv)
images_df.head()


Resize and copy the required images to the destination directory

In [None]:
datasets_main_dir = '../resources/datasets'

# Path to the directory where the unzipped files are stored
dataset_dir = extract_to_dir + '/ILSVRC/Data/CLS-LOC'
# Path to the directory where you want to store the subset
subset_dir = datasets_main_dir + '/subset_of_imagenet'
os.makedirs(subset_dir, exist_ok=True)

# Every row of images_df names one source image and where its resized copy goes
for img_row in images_df.itertuples(index=False):
    # Build the source path: <dataset_dir>/<phase_source>/<image_folder>/<file>
    image_name_full = img_row.image_name_full
    source_path = os.path.join(
        dataset_dir, img_row.phase_source, img_row.image_folder, image_name_full
    )

    # Build the destination path: <subset_dir>/<phase_destination>/<animal>/<file>.png
    destination_dir = os.path.join(subset_dir, img_row.phase_destination, img_row.animal)
    # Swap only the file extension; str.replace would also touch a matching
    # substring elsewhere in the name.
    png_name = os.path.splitext(image_name_full)[0] + '.png'
    destination_path = os.path.join(destination_dir, png_name)
    os.makedirs(destination_dir, exist_ok=True)

    # Resize and save the image. Image.ANTIALIAS was removed in Pillow 10;
    # Image.LANCZOS is the same filter under its current name. The `with`
    # block closes the source file once the resized copy has been produced.
    with Image.open(source_path) as img:
        resized_img = img.resize((300, 300), Image.LANCZOS)
    resized_img.save(destination_path)


## ------------------ Sample Small Dataset ------------------
#### If you wish to work with only a few samples, run the following code as well

In [None]:
# Number of images to sample per animal for each phase
n_samples_train, n_samples_validation, n_samples_test = 33, 3, 7
# Size used for the 'train_full' phase: train and validation counts combined
n_samples_train_full = n_samples_validation + n_samples_train

# Root directory that receives the sampled copy of the subset
subset_sample_dir = f'{datasets_main_dir}/subset_of_imagenet_sample'
os.makedirs(subset_sample_dir, exist_ok=True)

def copy_images(source_dir, destination_dir, image_files):
    """Copy each file named in ``image_files`` from ``source_dir`` to ``destination_dir``.

    ``destination_dir`` must already exist; files keep their original names.
    """
    for name in image_files:
        shutil.copy(
            os.path.join(source_dir, name),
            os.path.join(destination_dir, name),
        )

# Number of images to sample per phase; any other phase falls back to
# n_samples_test.
samples_per_phase = {
    'train_full': n_samples_train_full,
    'train': n_samples_train,
    'val': n_samples_validation,
}

for dir_path, dir_names, files in os.walk(subset_dir):
    # os.walk lists entries in arbitrary order. Sorting the sub-directories in
    # place (which fixes the traversal order) and the file names below makes
    # the seeded sampling independent of the filesystem's listing order.
    dir_names.sort()
    # Ignore macOS Finder metadata instead of skipping the whole directory
    candidates = sorted(name for name in files if name != '.DS_Store')
    if not candidates:
        continue

    # Image directories are laid out as <subset_dir>/<phase>/<animal>;
    # os.path copes with whichever separator the platform uses.
    phase_dir, animal = os.path.split(os.path.normpath(dir_path))
    phase = os.path.basename(phase_dir)

    print(f'phase: {phase}, animal: {animal}')

    # random.sample raises ValueError when asked for more items than exist,
    # so cap the request at the number of available images.
    n = min(samples_per_phase.get(phase, n_samples_test), len(candidates))

    # sample images paths
    image_files = random.sample(candidates, n)
    # copy images to the sample folder
    destination_dir = os.path.join(subset_sample_dir, phase, animal)
    os.makedirs(destination_dir, exist_ok=True)
    copy_images(dir_path, destination_dir, image_files)
        
        
        