# EX02: DataFrames

## 1. Make some data

In [1]:
import numpy as np
import pandas as pd

# Synthetic benchmark data: 100,000 rows x 100 columns of random floats,
# split into 99 feature columns and 1 target column.
df = pd.DataFrame(np.random.random(size=(100000, 100)))

X_columns = [f'X_{i}' for i in range(99)]
# Target columns need their own 'y_' prefix. Naming them 'X_{i}' would produce
# a second 'X_0' label, and selecting a duplicated label returns every column
# that carries it, so the target group would silently come back two columns
# wide (the first feature plus the real target).
y_columns = [f'y_{i}' for i in range(1)]
df.columns = X_columns + y_columns

# Guard against label collisions between the feature and target groups.
assert df.columns.is_unique, 'feature and target column names must not overlap'

## 2. Compare EasyLoader with a DataLoader + Dataset

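Define two simple `Dataset` wrappers around a DataFrame: a naive one that indexes the DataFrame on every lookup, and a slightly less naive one that converts each column group to a NumPy array up front.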

In [2]:
import pandas as pd

from torch.utils.data import Dataset, DataLoader


class NaiveDFDataset(Dataset):
    """Dataset that keeps each column group as a pandas selection of ``df``.

    Every item lookup goes through ``.iloc`` on the stored selections and
    converts the result with ``.to_numpy()`` at access time.
    """

    def __init__(self, df, column_groups):
        """Store ``df[g]`` for every entry ``g`` of ``column_groups``."""
        selections = []
        for columns in column_groups:
            selections.append(df[columns])
        self.groups = selections

    def __len__(self):
        """Return the length of the first stored group."""
        first_group = self.groups[0]
        return len(first_group)

    def __getitem__(self, idx):
        """Return a list holding ``group.iloc[idx].to_numpy()`` per group."""
        sample = []
        for group in self.groups:
            row = group.iloc[idx]
            sample.append(row.to_numpy())
        return sample


class SimpleDFDataset(Dataset):
    """Dataset that extracts ``.values`` from each column group once.

    The conversion happens in ``__init__``; item lookups then index the
    stored objects directly instead of going through pandas each time.
    """

    def __init__(self, df, column_groups):
        """Store ``df[g].values`` for every entry ``g`` of ``column_groups``."""
        extracted = []
        for columns in column_groups:
            selection = df[columns]
            extracted.append(selection.values)
        self.groups = extracted

    def __len__(self):
        """Return the length of the first stored group."""
        first_group = self.groups[0]
        return len(first_group)

    def __getitem__(self, idx):
        """Return a list holding ``group[idx]`` for every stored group."""
        sample = []
        for values in self.groups:
            sample.append(values[idx])
        return sample

Wrap and run.

In [3]:
from tqdm.auto import tqdm

# Run both Dataset flavours through an identically configured DataLoader so
# the only difference between the two runs is how items are fetched.
for heading, dataset_cls in (
    ('Naive DF Dataset:', NaiveDFDataset),
    ('Slightly less naive, simple DF Dataset:', SimpleDFDataset),
):
    print(heading)

    dataset = dataset_cls(df, [X_columns, y_columns])
    dl_simple = DataLoader(dataset, batch_size=10, shuffle=True)

    # Pull every batch for 5 epochs and discard it; the tqdm bars are the
    # only output of interest here.
    for epoch in tqdm(range(5)):
        for batch in tqdm(dl_simple):
            pass

Naive DF Dataset:


  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/10000 [00:00<?, ?it/s]

  0%|          | 0/10000 [00:00<?, ?it/s]

  0%|          | 0/10000 [00:00<?, ?it/s]

  0%|          | 0/10000 [00:00<?, ?it/s]

  0%|          | 0/10000 [00:00<?, ?it/s]

Slightly less naive, simple DF Dataset:


  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/10000 [00:00<?, ?it/s]

  0%|          | 0/10000 [00:00<?, ?it/s]

  0%|          | 0/10000 [00:00<?, ?it/s]

  0%|          | 0/10000 [00:00<?, ?it/s]

  0%|          | 0/10000 [00:00<?, ?it/s]

Now try with EasyLoader's `DFDataLoader`!

In [4]:
from easyloader.loader import DFDataLoader

# Same column groups, batch size and shuffling as the DataLoader runs, but
# handing the DataFrame straight to DFDataLoader with no Dataset wrapper.
column_groups = [X_columns, y_columns]
dl_easy = DFDataLoader(
    df,
    columns=column_groups,
    batch_size=10,
    shuffle=True,
)

# Pull every batch for 5 epochs and discard it.
n_epochs = 5
for epoch in tqdm(range(n_epochs)):
    for batch in tqdm(dl_easy):
        pass

  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/10000 [00:00<?, ?it/s]

  0%|          | 0/10000 [00:00<?, ?it/s]

  0%|          | 0/10000 [00:00<?, ?it/s]

  0%|          | 0/10000 [00:00<?, ?it/s]

  0%|          | 0/10000 [00:00<?, ?it/s]

## 3. IDs

We can keep track of IDs by passing the name of an ID column as the `ids` argument. The loader's `ids` attribute is updated any time we shuffle or sample.

In [5]:
# Give every row a readable identifier derived from its index label.
df['ids'] = list(map('item_{}'.format, df.index))

dl_easy = DFDataLoader(
    df,
    columns=[X_columns, y_columns],
    ids='ids',
    batch_size=10,
    shuffle=True,
)

print('First 10 IDs before iterating:')
print(dl_easy.ids[:10])

print('\nFirst 10 IDs after iterating:')
# Only an iterator is created here and it is discarded immediately.
# NOTE(review): presumably DFDataLoader reshuffles inside __iter__, which is
# what changes `ids` - confirm against the easyloader implementation.
iter(dl_easy)
print(dl_easy.ids[:10])

First 10 IDs before iterating:
['item_0', 'item_1', 'item_2', 'item_3', 'item_4', 'item_5', 'item_6', 'item_7', 'item_8', 'item_9']

First 10 IDs after iterating:
['item_60412', 'item_83577', 'item_81750', 'item_73369', 'item_46369', 'item_17321', 'item_21879', 'item_28399', 'item_38891', 'item_41818']
