## Processing the Data

In [49]:
import pandas as pd
import numpy as np
from tqdm.auto import tqdm
import torch
from PIL import Image
import torchvision.transforms as transforms
from pathlib import Path
import datasets
import shutil

In [3]:
# Turning images into tensors
def convert_img_path_into_tensor(img_path, tensor_path):
    """Read one image file and save its pixel data as a serialized tensor.

    Args:
        img_path: Path of the image file to open with PIL.
        tensor_path: Destination path handed to ``torch.save``.

    The image is converted with ``PILToTensor``, which keeps the original
    integer pixel values (no rescaling). No mode conversion is applied here,
    so the tensor layout follows whatever mode the source image has.
    """
    # The context manager releases the file handle as soon as the pixels are
    # converted; a bare Image.open() would keep it open until garbage
    # collection, which adds up when looping over many files.
    with Image.open(img_path) as img:
        img_tensor = transforms.PILToTensor()(img)
    torch.save(img_tensor, tensor_path)

In [77]:
# Class-name -> integer-label lookup; a name's position in the tuple is its label
_class_names = (
    'conchiglioni raw pasta',
    'farfalle raw pasta',
    'fettuccine raw pasta',
    'fusilli raw pasta',
    'gnocchi raw pasta',
    'lasagne raw pasta',
    'linguine raw pasta',
    'orecchiette raw pasta',
    'penne raw pasta',
    'rigatoni raw pasta',
    'spaghetti raw pasta',
    'tagliatelle raw pasta',
)
classes_dict = {name: label for label, name in enumerate(_class_names)}

In [78]:
# Applying tensor transformation function to all images:
# simple_images/<class>/<name>.<ext>  ->  tensor/<class>/<name>.pt
main_dir = Path('simple_images/')

for child_dir in sorted(main_dir.iterdir()):
    # Only class folders hold images; skip stray files and a Jupyter
    # checkpoint folder sitting directly inside main_dir.
    if not child_dir.is_dir() or child_dir.name == '.ipynb_checkpoints':
        continue
    if (child_dir / '.ipynb_checkpoints').is_dir():
        shutil.rmtree(child_dir / '.ipynb_checkpoints')
    tensor_path = Path('tensor') / child_dir.name
    # exist_ok lets this cell be re-run without raising FileExistsError
    tensor_path.mkdir(parents=True, exist_ok=True)

    # Materialise the listing so tqdm knows the total and shows real progress
    img_paths = sorted(p for p in child_dir.iterdir() if p.is_file())
    for img in tqdm(img_paths, desc=child_dir.name):
        # with_suffix swaps whatever extension the file has (not only a
        # lowercase '.jpg'). Two images sharing a stem would map to the same
        # .pt file, so stems are assumed unique within a class folder.
        convert_img_path_into_tensor(img, tensor_path / img.with_suffix('.pt').name)

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

In [79]:
# Parallel accumulators for the loading step: entry i of one list belongs to entry i of the other
list_of_tensors, list_of_labels = [], []

In [80]:
# Loading the tensors
# Empty the accumulators first so re-running this cell does not append every
# tensor and label a second time.
list_of_tensors.clear()
list_of_labels.clear()

# iterdir() yields entries in arbitrary order; sorting keeps the row order of
# the resulting dataset stable from run to run.
for path in sorted(Path('tensor').iterdir()):
    if not path.is_dir():
        # Stray files next to the class folders carry no tensors
        continue
    k = path.name  # the folder name doubles as the key into classes_dict
    for p in sorted(path.iterdir()):
        tensor = torch.load(p)
        list_of_tensors.append(tensor)
        list_of_labels.append(classes_dict[k])

In [83]:
# Build a two-column datasets.Dataset: "images" holds the tensors, "labels" the class ids
dataset_columns = dict(images=list_of_tensors, labels=list_of_labels)
dataset = datasets.Dataset.from_dict(dataset_columns)

In [None]:
# Write the full (unsplit) dataset to a local directory
processed_dataset_dir = "processed_pasta_dataset"
dataset.save_to_disk(processed_dataset_dir)

In [93]:
# Shuffling dataset's indices
# A fixed seed makes the permutation, and therefore the membership of the
# train/valid/test splits derived from it, identical on every run.
SPLIT_SEED = 42
dataset_size = len(dataset)
idxs = np.random.default_rng(SPLIT_SEED).permutation(dataset_size)

In [94]:
# Cut the shuffled indices into 80% train / 10% validation / remaining test
n_total = len(idxs)
train_end = int(0.8 * n_total)   # first index past the training slice
valid_end = int(0.9 * n_total)   # first index past the validation slice

train_idxs = idxs[:train_end]
valid_idxs = idxs[train_end:valid_end]
test_idxs = idxs[valid_end:]

In [95]:
# Materialise one sub-dataset per index group, in train / valid / test order
train_dataset, valid_dataset, test_dataset = (
    dataset.select(split_idxs)
    for split_idxs in (train_idxs, valid_idxs, test_idxs)
)

In [98]:
# Bundle the three splits under named keys and write them to a local directory
split_datasets = datasets.DatasetDict(
    {
        "train": train_dataset,
        "valid": valid_dataset,
        "test": test_dataset,
    }
)
split_datasets.save_to_disk("split_pasta_dataset")

Saving the dataset (0/1 shards):   0%|          | 0/363 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/45 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/46 [00:00<?, ? examples/s]