# 5. Custom Datasets in PyTorch

This is a short notebook showcasing the basics of building your own custom Dataset in PyTorch.

# 0. Imports, Constants and Support Functions

In [2]:
import torch
from torch.utils.data import Dataset, DataLoader
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
import numpy as np
from tqdm import tqdm

# 1. Load the dataset

In [3]:
# Target categories: only these four newsgroups are loaded
categories = [
    "rec.autos",
    "rec.sport.baseball",
    "rec.sport.hockey",
    "sci.med",
]
# Fetch both splits with identical settings; only the `subset` argument differs.
newsgroups_train, newsgroups_test = (
    fetch_20newsgroups(
        subset=split, categories=categories, shuffle=True, random_state=14
    )
    for split in ("train", "test")
)

In [4]:
# Vectorize the text data: each document becomes a vector of token counts.
# The vocabulary is capped at 10,000 terms and English stop words are dropped.
vectorizer = CountVectorizer(max_features=10000, stop_words="english")
# Learn the vocabulary on the training split only.
X_train_counts = vectorizer.fit_transform(newsgroups_train.data)
# Reuse the training vocabulary for the test split. Calling fit_transform here
# would refit the vocabulary on the test documents, so a given column would
# stand for different tokens in train and test (and test data would leak into
# the preprocessing step).
X_test_counts = vectorizer.transform(newsgroups_test.data)

In [5]:
# Densify the sparse count matrices into float32 feature tensors
X_train, X_test = (
    torch.tensor(counts.toarray(), dtype=torch.float32)
    for counts in (X_train_counts, X_test_counts)
)
# Class labels as int64 (long) tensors
y_train, y_test = (
    torch.tensor(split.target, dtype=torch.long)
    for split in (newsgroups_train, newsgroups_test)
)

# 2. Create the Custom Dataset

In [6]:
# Custom map-style dataset for PyTorch.
# Subclassing Dataset is the natural place to add per-sample preprocessing later on.
class NewsGroupDataset(Dataset):
    """Pairs a collection of features with its labels, one sample per index.

    Args:
        X: Indexable collection of features; ``X[i]`` is the feature of sample ``i``.
        y: Indexable collection of labels; ``y[i]`` is the label of sample ``i``.

    Raises:
        ValueError: If ``X`` and ``y`` do not contain the same number of samples.
    """

    def __init__(self, X, y):
        # Fail fast: __len__ is driven by y alone, so a length mismatch would
        # otherwise only surface later as an IndexError or as ignored rows of X.
        if len(X) != len(y):
            raise ValueError(
                f"X and y must have the same number of samples, "
                f"got {len(X)} and {len(y)}"
            )
        self.X = X
        self.y = y

    def __len__(self):
        """Return the number of samples."""
        return len(self.y)

    def __getitem__(self, idx):
        """Return the ``(features, label)`` pair stored at position ``idx``."""
        return self.X[idx], self.y[idx]


# Training split: wrap the tensors, then batch with shuffling enabled
train_dataset = NewsGroupDataset(X=X_train, y=y_train)
train_loader = DataLoader(dataset=train_dataset, batch_size=32, shuffle=True)

# Test split: same batch size, shuffling disabled
test_dataset = NewsGroupDataset(X=X_test, y=y_test)
test_loader = DataLoader(dataset=test_dataset, batch_size=32, shuffle=False)