### How to: work with Data Loaders

This notebook shows how to wrap a CSV file in a custom PyTorch `Dataset` and how to iterate over the entire dataset, batch by batch, using a `DataLoader`.

In [17]:
# using Pandas for Data Import from csv
import pandas as pd

from sklearn.preprocessing import LabelEncoder

import torch
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
from tqdm import tqdm

In [2]:
# Global settings.
# DEBUG: when True, AttritionDataset.identify_categoricals prints the number
# of distinct values of every column it classifies as categorical.
DEBUG = False

In [3]:
class AttritionDataset(Dataset):
    """Oracle Attrition dataset.

    The constructor reads the csv file into a pandas DataFrame, drops the
    columns listed in ``cols_to_drop``, label-encodes the columns identified
    as categorical and stores features and target as tensors in ``X`` and
    ``y``.
    """

    # Default cardinality threshold: a column with fewer distinct values than
    # this is treated as categorical.
    DEFAULT_THRESHOLD = 10

    def identify_categoricals(self, threshold=DEFAULT_THRESHOLD):
        """Identify the categorical columns of ``self.df``.

        A column is considered categorical when its dtype is ``object`` or
        when it has fewer than ``threshold`` distinct values. The names of
        the matching columns are stored, in column order, in
        ``self.categorical_columns``.

        Args:
            threshold (int): Exclusive upper bound on the number of distinct
                values for a non-object column to be treated as categorical.
                Defaults to 10.
        """
        # computed once for the whole frame and reused below
        nunique = self.df.nunique()
        types = self.df.dtypes

        self.categorical_columns = []

        for col in self.df.columns:
            # every column satisfying this condition is flagged as
            # categorical; tune the rule through `threshold`
            if types[col] == "object" or nunique[col] < threshold:
                if DEBUG:
                    print(f"{col} distinct values: {nunique[col]}")

                self.categorical_columns.append(col)

    def codify_categoricals(self):
        """Replace each categorical column of ``self.df`` with integer codes.

        Must be called after ``identify_categoricals``, which fills
        ``self.categorical_columns``. A fresh ``LabelEncoder`` is fitted per
        column; the encoders are not kept.
        """
        for col in self.categorical_columns:
            # encode the categorical values as integers with LabelEncoder
            l_enc = LabelEncoder()
            self.df[col] = l_enc.fit_transform(self.df[col].values)

    # __init__ encapsulates the whole preparation pipeline:
    #   - load the data from csv
    #   - remove unneeded columns
    #   - identify the categorical columns
    #   - encode the categorical columns as integers, using LabelEncoder

    def __init__(self, csv_file, threshold=DEFAULT_THRESHOLD):
        """Initializes instance of class AttritionDataset.

        Args:
            csv_file (str): Path to the csv file with the data.
            threshold (int): Cardinality threshold forwarded to
                ``identify_categoricals``. Defaults to 10.

        """
        # here we read the entire csv
        self.df = pd.read_csv(csv_file)

        # columns not to be used
        self.cols_to_drop = [
            "Directs",
            "name",
            "Over18",
            "WeeklyWorkedHours",
            "EmployeeNumber",
        ]

        self.target = "Attrition"
        self.threshold = threshold

        # dropping the columns not to be used
        self.df = self.df.drop(columns=self.cols_to_drop)

        # label encoding of categoricals
        self.identify_categoricals(threshold)
        self.codify_categoricals()

        # Save features and target as tensors
        self.X = torch.from_numpy(self.df.drop(columns=self.target).values)
        self.y = torch.from_numpy(self.df[self.target].values)

    # A PyTorch Dataset has two fundamental methods:
    #   __len__      must return the number of records in the dataset
    #   __getitem__  must return the item of index idx as a Tensor

    def __len__(self):
        """Return the number of records (rows of ``self.X``)."""
        return self.X.shape[0]

    def __getitem__(self, idx):
        """Return the ``(features, target)`` pair stored at index ``idx``."""
        # X and y are tensors, so idx is used to index them directly; no
        # tensor-to-list conversion of idx is performed.
        return (self.X[idx], self.y[idx])

#### Build the dataset

In [4]:
# location of the csv file holding the attrition data
attrition_path = "/opt/notebooks/ads-examples/oracle_data/orcl_attrition.csv"

# read and preprocess the csv, exposing it as a PyTorch Dataset
attrition_ds = AttritionDataset(csv_file=attrition_path)

In [None]:
#### Build the DataLoader

In [11]:
# number of records in each batch
BATCH_SIZE = 64

# wrap the dataset in a DataLoader; with shuffle=True the records are
# reshuffled at every epoch
train_dataloader = DataLoader(
    dataset=attrition_ds,
    shuffle=True,
    batch_size=BATCH_SIZE,
)

In [12]:
# Pull a single batch out of the DataLoader to inspect it; the batch is
# unpacked into its features and labels.
train_features, train_labels = next(iter(train_dataloader))

In [13]:
# Show the shapes of the sampled batch of features and of labels
print(f"Feature batch shape: {train_features.size()}")
print(f"Labels batch shape: {train_labels.size()}")

Feature batch shape: torch.Size([64, 30])
Labels batch shape: torch.Size([64])


In [23]:
# one epoch: iterate over entire dataset
EPOCHS = 10

for epoch in tqdm(range(EPOCHS)):
    for batch in train_dataloader:
        train_features, train_labels = batch
        
        # do something
        # print(train_features.shape)

100%|██████████| 10/10 [00:00<00:00, 98.08it/s]

torch.Size([64, 30])
torch.Size([64, 30])
torch.Size([64, 30])
torch.Size([64, 30])
torch.Size([64, 30])
torch.Size([64, 30])
torch.Size([64, 30])
torch.Size([64, 30])
torch.Size([64, 30])
torch.Size([64, 30])
torch.Size([64, 30])
torch.Size([64, 30])
torch.Size([64, 30])
torch.Size([64, 30])
torch.Size([64, 30])
torch.Size([64, 30])
torch.Size([64, 30])
torch.Size([64, 30])
torch.Size([64, 30])
torch.Size([64, 30])
torch.Size([64, 30])
torch.Size([64, 30])
torch.Size([62, 30])
torch.Size([64, 30])
torch.Size([64, 30])
torch.Size([64, 30])
torch.Size([64, 30])
torch.Size([64, 30])
torch.Size([64, 30])
torch.Size([64, 30])
torch.Size([64, 30])
torch.Size([64, 30])
torch.Size([64, 30])
torch.Size([64, 30])
torch.Size([64, 30])
torch.Size([64, 30])
torch.Size([64, 30])
torch.Size([64, 30])
torch.Size([64, 30])
torch.Size([64, 30])
torch.Size([64, 30])
torch.Size([64, 30])
torch.Size([64, 30])
torch.Size([64, 30])
torch.Size([64, 30])
torch.Size([62, 30])
torch.Size([64, 30])
torch.Size([6


