### How to: prepare a PyTorch Dataset

This notebook shows how to load data from a CSV file and wrap it in a PyTorch `Dataset`.

In [1]:
# using Pandas for Data Import from csv
import pandas as pd

from sklearn.preprocessing import LabelEncoder

import torch
from torch.utils.data import Dataset

In [14]:
# globals
# DEBUG: when True, AttritionDataset.identify_categoricals prints the number of
# distinct values of each column it classifies as categorical.
DEBUG = False

In [15]:
class AttritionDataset(Dataset):
    """Oracle Attrition dataset wrapped as a PyTorch ``Dataset``.

    The constructor reads the csv file with pandas, drops the columns that
    are not needed, detects the categorical columns, encodes them as
    integers with ``LabelEncoder`` and finally stores predictors and target
    as NumPy arrays.

    Attributes set by ``__init__``:
        df: the loaded DataFrame, after dropping and encoding.
        cols_to_drop: list of the column names removed from ``df``.
        target: name of the target column.
        categorical_columns: names of the columns treated as categorical.
        encoders: dict mapping each categorical column to its fitted
            ``LabelEncoder`` (usable to decode the integer codes).
        X: NumPy array with all the columns except the target.
        y: NumPy array with the target column.
    """

    # Columns removed by default when the caller does not pass cols_to_drop.
    DEFAULT_COLS_TO_DROP = (
        "Directs",
        "name",
        "Over18",
        "WeeklyWorkedHours",
        "EmployeeNumber",
    )

    # A column with fewer distinct values than this is treated as categorical.
    # The threshold can be changed by overriding this attribute.
    CATEGORICAL_THRESHOLD = 10

    def identify_categoricals(self):
        """Detect the categorical columns of ``self.df``.

        A column is considered categorical when it is non-numeric (text) or
        when it has fewer than ``CATEGORICAL_THRESHOLD`` distinct values.
        The result is stored in ``self.categorical_columns``, in column order.
        """
        # DEBUG is an optional notebook-level flag. Looking it up this way
        # avoids a NameError when the class is used where DEBUG is not defined.
        debug = globals().get("DEBUG", False)

        nunique = self.df.nunique()
        dtypes = self.df.dtypes

        self.categorical_columns = []

        for col in self.df.columns:
            # Checking "not numeric" (instead of comparing with "object")
            # also covers the dedicated pandas string dtypes.
            is_text = not pd.api.types.is_numeric_dtype(dtypes[col])

            if is_text or nunique[col] < self.CATEGORICAL_THRESHOLD:
                if debug:
                    print(f"{col} distinct values: {nunique[col]}")

                self.categorical_columns.append(col)

    def codify_categoricals(self):
        """Replace every categorical column of ``self.df`` with integer codes.

        Each column gets its own ``LabelEncoder``; the fitted encoders are
        kept in ``self.encoders`` so the codes can be mapped back to the
        original labels with ``inverse_transform``.
        """
        self.encoders = {}

        for col in self.categorical_columns:
            l_enc = LabelEncoder()
            self.df[col] = l_enc.fit_transform(self.df[col].values)
            self.encoders[col] = l_enc

    def __init__(self, csv_file, cols_to_drop=None, target="Attrition"):
        """Initializes instance of class AttritionDataset.

        Loads the data from csv, removes the unneeded columns, identifies
        the categorical columns and encodes them as integers.

        Args:
            csv_file (str): Path to the csv file with the data.
            cols_to_drop (iterable of str, optional): Columns to remove
                before encoding. Defaults to ``DEFAULT_COLS_TO_DROP``.
            target (str, optional): Name of the target column.
                Defaults to "Attrition".

        Raises:
            KeyError: if a column to drop, or the target, is not in the csv.
        """
        # here we read the entire csv
        self.df = pd.read_csv(csv_file)

        # cols not to be used
        if cols_to_drop is None:
            cols_to_drop = self.DEFAULT_COLS_TO_DROP
        self.cols_to_drop = list(cols_to_drop)

        self.target = target

        # dropping cols not to be used
        self.df = self.df.drop(columns=self.cols_to_drop)

        # label encoding of categoricals
        self.identify_categoricals()
        self.codify_categoricals()

        # Save target and predictors as numpy arrays;
        # a possible next step is to use only tensors.
        self.X = self.df.drop(columns=self.target).values
        self.y = self.df[self.target].values

    # A PyTorch Dataset has two fundamental methods:
    #   __len__ must return the number of records in the dataset
    #   __getitem__ must return the item of index idx

    def __len__(self):
        """Return the number of records in the dataset."""
        return self.X.shape[0]

    def __getitem__(self, idx):
        """Return the pair ``[predictors, target]`` for index ``idx``.

        Args:
            idx: an integer index, or a ``torch.Tensor`` holding the index
                (as produced when using pytorch's random_split).
        """
        # Convert idx from tensor to a plain Python value before indexing.
        if isinstance(idx, torch.Tensor):
            idx = idx.tolist()

        return [self.X[idx], self.y[idx]]

In [16]:
# Location of the attrition csv file.
attrition_path = "/opt/notebooks/ads-examples/oracle_data/orcl_attrition.csv"

# Build the dataset: the csv is loaded and encoded inside the constructor.
attrition_ds = AttritionDataset(csv_file=attrition_path)

In [17]:
# Position of the sample record shown below.
INDEX = 100

# Dataset size, followed by an empty separator line.
print(f"The number of records in the dataset is: {len(attrition_ds)}", end="\n\n")
# One item as returned by __getitem__: [predictors, target].
print(f"One example is: {attrition_ds[INDEX]}")

The number of records in the dataset is: 1470

One example is: [array([   38,     0,  3700,     0,     7,     3,     0,     2,     1,
          63,     2,     0,     1,     0,     0,  2073, 23648,     4,
           1,    22,     1,     3,     0,     7,     3,     2,     3,
           2,     0,     2]), 1]
