# Understanding components of custom data loader in pytorch
![](https://drive.google.com/uc?id=1e92FXOYdRlmQTbK0WozmBN0ZO9KYCPJx)

## Recap - Creating Linear regression model

In [None]:
## Dataset used
# https://www.kaggle.com/datasets/mirichoi0218/insurance

In [4]:
!pip install kaggle



In [5]:
import kagglehub

# Download latest version
path = kagglehub.dataset_download("mirichoi0218/insurance")

print("Path to dataset files:", path)

Path to dataset files: /root/.cache/kagglehub/datasets/mirichoi0218/insurance/versions/1


In [6]:
import os
import pandas as pd

In [7]:
os.listdir("/root/.cache/kagglehub/datasets/mirichoi0218/insurance/versions/1")

['insurance.csv']

In [8]:
df = pd.read_csv(os.path.join(path, 'insurance.csv'))

In [9]:
df.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


In [27]:
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split

In [28]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1338 entries, 0 to 1337
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       1338 non-null   int64  
 1   sex       1338 non-null   object 
 2   bmi       1338 non-null   float64
 3   children  1338 non-null   int64  
 4   smoker    1338 non-null   object 
 5   region    1338 non-null   object 
 6   charges   1338 non-null   float64
dtypes: float64(2), int64(2), object(3)
memory usage: 73.3+ KB


In [29]:
# Split dataset before encoding
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

In [30]:
# Encode categorical variables
label_encoders = {}
for col in ['sex', 'smoker', 'region']:
    le = LabelEncoder()
    train_df[col] = le.fit_transform(train_df[col])
    test_df[col] = le.transform(test_df[col])
    label_encoders[col] = le  # Store encoders for later use

In [31]:
# Features and target
X_train = train_df.drop(columns=['charges'])
y_train = train_df['charges']
X_test = test_df.drop(columns=['charges'])
y_test = test_df['charges']

In [32]:
print(X_train.shape)
print(y_train.shape)
print(X_test.shape)

(1070, 6)
(1070,)
(268, 6)


In [33]:
# Normalize features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [34]:
# Convert to tensors
X_train_tensor = torch.tensor(X_train, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train.values, dtype=torch.float32).view(-1, 1)
X_test_tensor = torch.tensor(X_test, dtype=torch.float32)
y_test_tensor = torch.tensor(y_test.values, dtype=torch.float32).view(-1, 1)

In [35]:
print(X_train_tensor.shape)

torch.Size([1070, 6])


In [36]:
# Define Neural Network Model
class SimpleNNRegressionModel(nn.Module):
    def __init__(self, input_dim):
        super(SimpleNNRegressionModel, self).__init__()
        self.network = nn.Sequential(
            nn.Linear(input_dim, 64),
            nn.ReLU(),
            nn.Linear(64, 128),
            nn.ReLU(),
            nn.Linear(128, 1)
        )

    def forward(self, x):
        return self.network(x)


In [37]:
# Initialize model
input_dim = X_train_tensor.shape[1]
model = SimpleNNRegressionModel(input_dim)

In [38]:
# Loss and optimizer
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.01)

In [39]:
'''
x_train_tensor = 1000000 --> 10gb OOM - out of memory
1000000 --> weight and bias

we are teaching human : A book of 1000 pages --> student : 10 : student, i am not able to understand.
1000
10 --> 100
100 feedback = 1000 pages --> 1 epoch

1000 pages total
100 epoch
10 pages feedback
100 iteration * 100



100 epoch
1 epoch --> 1070 rows
'''

'\nx_train_tensor = 1000000 --> 10gb OOM - out of memory\n1000000 --> weight and bias\n\nwe are teaching human : A book of 1000 pages --> student : 10 : student, i am not able to understand.\n1000\n10 --> 100\n100 feedback = 1000 pages --> 1 epoch\n\n1000 pages total\n100 epoch\n10 pages feedback\n100 iteration * 100 \n\n\n\n100 epoch\n1 epoch --> 1070 rows\n'

In [40]:
# Training loop
epochs = 1000
clip_value = 25
for epoch in range(epochs):
    model.train()
    optimizer.zero_grad()
    predictions = model(X_train_tensor)
    loss = criterion(predictions, y_train_tensor)
    loss.backward()

    # torch.nn.utils.clip_grad_norm_(model.parameters(), clip_value)

    optimizer.step()

    if (epoch+1) % 100 == 0:
        print(f'Epoch [{epoch+1}/{epochs}], Loss: {loss.item():.4f}')

Epoch [100/1000], Loss: 45953556.0000
Epoch [200/1000], Loss: 32574696.0000
Epoch [300/1000], Loss: 29739176.0000
Epoch [400/1000], Loss: 27349340.0000
Epoch [500/1000], Loss: 25866534.0000
Epoch [600/1000], Loss: 24815058.0000
Epoch [700/1000], Loss: 24016502.0000
Epoch [800/1000], Loss: 23519920.0000
Epoch [900/1000], Loss: 23093526.0000
Epoch [1000/1000], Loss: 22686852.0000


## Understanding Components of a Custom DataLoader in PyTorch

1. Dataset (torch.utils.data.Dataset)
2. DataLoader (torch.utils.data.DataLoader)

In [None]:
# Creating our custom Dataset in pytorch
# init() - initialised the dataset, loads data, applied preprocessing
# len() - return the total numbers of samples in the dataset
# getitem() - Defines how to retrieve a single data sample when an index is provide

In [26]:
import torch
from torch.utils.data import Dataset, DataLoader

In [41]:
class InsuranceDataset(Dataset):
  def __init__(self, X, y):
    self.X = X
    self.y = y

  def __len__(self):
    return len(self.X)

  def __getitem__(self, idx):
     features = torch.tensor(self.X[idx], dtype=torch.float32)
     target = torch.tensor(self.y.values[idx], dtype=torch.float32)
     return features, target


In [43]:
dataset = InsuranceDataset(X_train, y_train)

In [53]:
dataloader = DataLoader(dataset, batch_size=32, shuffle=True, num_workers=4)

In [55]:
for batch_idx, (features, targets) in enumerate(dataloader):
  print(f"Batch {batch_idx+1} :")
  print("Features : ", features.shape)
  print("Targets : ", targets.shape)
  break

Batch 1 :
Features :  torch.Size([32, 6])
Targets :  torch.Size([32])


In [57]:
1070/32

33.4375

In [56]:
epochs = 1000
for epoch in range(epochs):
    model.train()

    for batch_idx, (batch_X, batch_y) in enumerate(dataloader):
      print(f"Current batch : {batch_idx}")
      optimizer.zero_grad()
      predictions = model(batch_X)
      loss = criterion(predictions, batch_y)
      loss.backward()
      optimizer.step()
      print(f'Batch [{batch_idx+1}/{epochs}], Loss: {loss.item():.4f}')

    if (epoch+1) % 100 == 0:
        print(f'Epoch [{epoch+1}/{epochs}], Loss: {loss.item():.4f}')

Current batch : 0
Batch [1/1000], Loss: 334446848.0000
Current batch : 1
Batch [2/1000], Loss: 204137728.0000
Current batch : 2
Batch [3/1000], Loss: 317306848.0000
Current batch : 3
Batch [4/1000], Loss: 259250912.0000
Current batch : 4
Batch [5/1000], Loss: 275054368.0000
Current batch : 5
Batch [6/1000], Loss: 114522608.0000
Current batch : 6
Batch [7/1000], Loss: 219878224.0000
Current batch : 7
Batch [8/1000], Loss: 200335792.0000
Current batch : 8
Batch [9/1000], Loss: 158659248.0000
Current batch : 9
Batch [10/1000], Loss: 208273968.0000
Current batch : 10
Batch [11/1000], Loss: 163429344.0000
Current batch : 11
Batch [12/1000], Loss: 78071992.0000
Current batch : 12
Batch [13/1000], Loss: 126491960.0000
Current batch : 13
Batch [14/1000], Loss: 252276672.0000
Current batch : 14
Batch [15/1000], Loss: 170708896.0000
Current batch : 15
Batch [16/1000], Loss: 145750144.0000
Current batch : 16
Batch [17/1000], Loss: 121007312.0000
Current batch : 17
Batch [18/1000], Loss: 145255760

  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Batch [17/1000], Loss: 63846216.0000
Current batch : 17
Batch [18/1000], Loss: 132763968.0000
Current batch : 18
Batch [19/1000], Loss: 151123280.0000
Current batch : 19
Batch [20/1000], Loss: 93049880.0000
Current batch : 20
Batch [21/1000], Loss: 118938928.0000
Current batch : 21
Batch [22/1000], Loss: 96508016.0000
Current batch : 22
Batch [23/1000], Loss: 118289704.0000
Current batch : 23
Batch [24/1000], Loss: 171222336.0000
Current batch : 24
Batch [25/1000], Loss: 169944784.0000
Current batch : 25
Batch [26/1000], Loss: 109705840.0000
Current batch : 26
Batch [27/1000], Loss: 98576104.0000
Current batch : 27
Batch [28/1000], Loss: 136617056.0000
Current batch : 28
Batch [29/1000], Loss: 145394320.0000
Current batch : 29
Batch [30/1000], Loss: 123056824.0000
Current batch : 30
Batch [31/1000], Loss: 126297968.0000
Current batch : 31
Batch [32/1000], Loss: 245696624.0000
Current batch : 32
Batch [33/1000], Loss: 7512