# 3.3 Synthetic Regression Data

## Generating the synthetic training data

In [2]:
%matplotlib inline
import random
import torch
from d2l import torch as d2l

In [4]:
class SyntheticRegressionData(d2l.DataModule):
    """Synthetic dataset for linear regression.

    Features are sampled with ``torch.randn`` and labels are computed as
    ``X w + b`` plus ``torch.randn`` noise scaled by ``noise``. The tensors
    hold ``num_train + num_val`` examples: ``X`` has one column per entry
    of ``w`` and ``y`` has a single column.
    """

    def __init__(self, w, b, noise=0.01, num_train=1000, num_val=1000, batch_size=32):
        super().__init__()
        # Keep this call ahead of every other local assignment. The
        # dataloader code in this file reads self.num_train, self.num_val
        # and self.batch_size, none of which are assigned explicitly here;
        # they come from this d2l helper (see its documentation).
        self.save_hyperparameters()
        total = num_train + num_val
        self.X = torch.randn(total, len(w))
        # Additive perturbation: standard-normal samples scaled by `noise`.
        eps = noise * torch.randn(total, 1)
        # Column-vector view of w so the product has shape (total, 1).
        self.y = self.X @ w.reshape(-1, 1) + b + eps


Below, we set the true parameters to $\mathbf{w} = [2, -3.4]^{\intercal}$ and $b = 4.2$. Later, we can check how our estimated parameters compare with these ground-truth values.

In [6]:
# Build the dataset with ground-truth weights [2, -3.4] and bias 4.2;
# all other constructor arguments keep their defaults.
data = SyntheticRegressionData(w=torch.tensor([2, -3.4]), b=4.2)

torch.Size([2000, 2])


In [7]:
# Show the first example: its feature row and the corresponding label.
print('features:', data.X[0],'\nlabel:', data.y[0])

features: tensor([1.3148, 0.5488]) 
label: tensor([4.9695])


In [8]:
# Shapes of the full feature and label tensors (training + validation rows).
print(data.X.shape, data.y.shape)

torch.Size([2000, 2]) torch.Size([2000, 1])


## Reading the Dataset (with a dataloader)

The aim is to design concise, memory-efficient dataloaders that iterate over a large dataset and provide us with batches of data.

For training, we want to get the data in an entirely random order, while for validation we want to always get the data in the same order, which we expect will be helpful for debugging.

In [9]:
@d2l.add_to_class(SyntheticRegressionData)
def get_dataloader(self, train):
    """Yield ``(features, labels)`` minibatches from one split of the data.

    Rows ``[0, num_train)`` form the training split and are visited in a
    freshly shuffled order; rows ``[num_train, num_train + num_val)`` form
    the validation split and are visited in their stored order. The last
    batch may be shorter than ``batch_size``.
    """
    first = 0 if train else self.num_train
    last = self.num_train if train else self.num_train + self.num_val
    order = list(range(first, last))
    if train:
        # Only the training split is randomised.
        random.shuffle(order)
    for start in range(0, len(order), self.batch_size):
        picked = torch.tensor(order[start:start + self.batch_size])
        yield self.X[picked], self.y[picked]

In [11]:
# Take the first minibatch produced by the training dataloader.
X, y = next(iter(data.train_dataloader()))

In [12]:
# Report the shapes of the minibatch features and labels.
print('X shape:', X.shape, '\ny shape:', y.shape)

X shape: torch.Size([32, 2]) 
y shape: torch.Size([32, 1])


## Concise implementation

While the above implementation works, it requires loading the whole dataset into memory, which is inefficient and may cause trouble in real-world problems.

PyTorch has highly optimised and capable dataloaders, so we should rely on this API.

In [15]:
@d2l.add_to_class(d2l.DataModule)
def get_tensorloader(self, tensors, train, indices=slice(0, None)):
    """Wrap a selection of ``tensors`` in a PyTorch ``DataLoader``.

    Each tensor in ``tensors`` is indexed with ``indices`` (by default the
    full range), the results are bundled into a ``TensorDataset``, and a
    ``DataLoader`` with batch size ``self.batch_size`` is returned.
    Shuffling is enabled exactly when ``train`` is true.
    """
    selected = [t[indices] for t in tensors]
    subset = torch.utils.data.TensorDataset(*selected)
    return torch.utils.data.DataLoader(
        subset, batch_size=self.batch_size, shuffle=train)

@d2l.add_to_class(SyntheticRegressionData)
def get_dataloader(self, train):
    """Return a dataloader over the training or validation rows.

    Training uses rows ``[0, num_train)``; validation uses every row from
    ``num_train`` onwards. The batching itself is delegated to
    ``get_tensorloader``.
    """
    if train:
        rows = slice(0, self.num_train)
    else:
        rows = slice(self.num_train, None)
    return self.get_tensorloader((self.X, self.y), train, rows)

# Take the first training minibatch again, now served by the
# DataLoader-based get_dataloader, and report its shapes.
X, y = next(iter(data.train_dataloader()))
print('X shape:', X.shape, '\ny shape:', y.shape)

X shape: torch.Size([32, 2]) 
y shape: torch.Size([32, 1])
