# Motivation

This notebook sets up the data pipeline for the Plant Seedlings Classification task: it collects the image file paths and labels, builds the training DataFrame and splits it into training and validation sets, builds the test DataFrame, and exercises the batch generator (imported from the `data_loader` module), which yields resized images and one-hot encoded labels for model training.


### Import the libraries

In [1]:
import sys 
from pathlib import Path

# Find the project's 'src/' folder so its modules can be imported from this
# notebook. The working directory is checked first, then its parent.
cwd = Path.cwd()
for candidate in (cwd / 'src', cwd.parent / 'src'):
    if candidate.exists():
        src_dir = candidate
        break
else:
    # Neither location holds a 'src/' folder: fail before the imports do.
    raise FileNotFoundError("No pude encontrar la carpeta 'src/'")

# Prepend (rather than append) so the project's modules take precedence.
sys.path.insert(0, str(src_dir))

from data_loader import *

### Load the images 

In [6]:
# Dataset root sits next to the notebook's directory; it holds the 'train'
# and 'test' sub-folders.
base_dir = Path('..', 'plant-seedlings-classification')
train_dir, test_dir = base_dir / 'train', base_dir / 'test'

# Stop early with a clear message when the training images are missing.
if train_dir.exists() is False:
    raise FileNotFoundError(f"No se encontró: {train_dir}")


# Collect one record (file path + class label) per training image. Each
# sub-directory of train_dir is named after the class whose images it holds.
# Entries are visited in sorted order because iterdir()/glob() return them in
# an arbitrary, filesystem-dependent order, which would otherwise make the
# seeded shuffle below produce a different split on different machines.
train_rows = [
    {'filepath': str(img_path), 'label': class_dir.name}
    for class_dir in sorted(train_dir.iterdir())
    if class_dir.is_dir()
    for img_path in sorted(class_dir.glob('*.*'))
]

df_train = pd.DataFrame(train_rows)
# Shuffle the rows once with a fixed seed so the split is random but repeatable.
df_train_shuffled = df_train.sample(frac=1, random_state=9).reset_index(drop=True)

# 80/20 train/validation split taken from the SHUFFLED frame.
# Slicing the unshuffled df_train (rows grouped by class directory) would put
# only the last-visited classes into the validation set and leave them
# out of the training set.
n = len(df_train_shuffled)
split_idx = int(0.8 * n)
train_df = df_train_shuffled.iloc[:split_idx].copy()
val_df   = df_train_shuffled.iloc[split_idx:].copy()
print(f"Train: {len(train_df)} filas, Val: {len(val_df)} filas")


# Test DataFrame (no labels): one row per file found directly in test_dir.
test_rows = [{'filepath': str(test_image)} for test_image in test_dir.glob('*.*')]
df_test = pd.DataFrame(test_rows)
print(f"Test: {len(df_test)} filas")

Train: 3800 filas, Val: 950 filas
Test: 794 filas


### Create `label_list` and `label2idx` for the data-loader functions

In [7]:
# Alphabetically ordered class names found in the full training DataFrame.
label_list = sorted(df_train['label'].unique())
# Map each class name to its position in label_list.
label2idx = dict(zip(label_list, range(len(label_list))))

### Test some batches

In [9]:
# Smoke test: draw a single batch of 16 samples without passing target_size
# and print the shapes of the image and label arrays.
batch_size = 16
gen = batch_generator(
    train_df,
    batch_size=batch_size,
    label2idx=label2idx,
    label_list=label_list,
)
X_batch, y_batch = next(gen)
print(X_batch.shape, y_batch.shape)

(16, 64, 64, 3) (16, 12)


In [10]:
# Same check with a larger batch: only the first dimension of both arrays
# is expected to change.
batch_size = 32
gen = batch_generator(
    train_df,
    batch_size=batch_size,
    label2idx=label2idx,
    label_list=label_list,
)
X_batch, y_batch = next(gen)
print(X_batch.shape, y_batch.shape)

(32, 64, 64, 3) (32, 12)


In [11]:
# Check that an explicit target_size is honoured: request 128x128 images
# and print the resulting batch shapes.
batch_size = 32
gen = batch_generator(
    train_df,
    batch_size=batch_size,
    label2idx=label2idx,
    label_list=label_list,
    target_size=(128, 128),
)
X_batch, y_batch = next(gen)
print(X_batch.shape, y_batch.shape)

(32, 128, 128, 3) (32, 12)


## Pablo Reyes 