#### Link To The Video:
#### https://www.youtube.com/watch?v=hvLFD4AZzCw&list=PLyMom0n-MBroupZiLfVSZqK5asX8KfoHL&index=3

In [108]:
import pandas as pd
from pathlib import Path

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, TensorDataset, random_split

In [109]:
dataframe = pd.read_csv(Path("../../data/KaggleMedicalCostPersonalDataset/insureance.csv"))
dataframe.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


Question: How many rows does the dataframe have?

In [110]:
# To determine the row count we can use the shape attribute.
dataframe.shape[0]

1338

Question: What are the column titles of output/target variable?

In [111]:
dataframe.charges

0       16884.92400
1        1725.55230
2        4449.46200
3       21984.47061
4        3866.85520
           ...     
1333    10600.54830
1334     2205.98080
1335     1629.83350
1336     2007.94500
1337    29141.36030
Name: charges, Length: 1338, dtype: float64

Question: Which of the input columns are non-numeric or categorical variables?

In [112]:
non_numeric = []
for column in dataframe.columns:
    converted = pd.to_numeric(dataframe[column], errors="coerce")
    if converted.isnull().any():
        non_numeric.append(column)

print("Following columns contain non numeric values:")
for item in non_numeric:
    print(item)

Following columns contain non numeric values:
sex
smoker
region


Question: What are the minimum, maximum and average values of the charges column? Can you show the distrubution of the values in a graph?

In [113]:
charges_min = dataframe["charges"].min()
charges_max = dataframe["charges"].max()
charges_mean = dataframe["charges"].mean()
print(f"Max: {charges_max:.4f}, Min: {charges_min:.4f}, Average: {charges_mean:.4f}")

Max: 63770.4280, Min: 1121.8739, Average: 13270.4223


#### Prepare the Dataset for Training

In [114]:
def get_input_columns(dataframe):

    input_columns = dataframe.columns.tolist()
    input_columns.remove("charges")
    return input_columns

def get_output_columns(dataframe):

    return ["charges"]

def get_categorical_columns(dataframe):

    categorical = []
    for column in dataframe.columns:
        converted = pd.to_numeric(dataframe[column], errors="coerce")
        if converted.isnull().any():
            categorical.append(column)

    return categorical

def dataframe_to_arrays(dataframe):

    # Make a copy of the original dataframe
    dataframe_ = dataframe.copy(deep=True)

    # Convert non-numeric categorical columns to numbers
    for col in get_categorical_columns(dataframe):
        dataframe_[col] = dataframe_[col].astype("category").cat.codes

    # Extract input & outputs as numpy arrays
    inputs_array = dataframe_[get_input_columns(dataframe)].to_numpy()
    targets_array = dataframe_[get_output_columns(dataframe)].to_numpy()
    return inputs_array, targets_array

Question: Convert the numpy arrays input_array and targets_array to PyTorch tensors. Make sure that the data type is float32.

In [115]:
inputs_array, targets_array = dataframe_to_arrays(dataframe)
inputs_tensors = torch.tensor(inputs_array).to(torch.float32)
targets_tensors = torch.tensor(targets_array).to(torch.float32)

print(inputs_tensors.dtype)
print(targets_tensors.dtype)

torch.float32
torch.float32


In [116]:
dataset = TensorDataset(inputs_tensors, targets_tensors)

Question: Pick a number between 0.1 and 0.2 to determine the fraction of data that will be used for creating the validation set.
Then use random_split to create training and validation datasets.

In [117]:
validation_percent = .15
row_count = dataframe.shape[0]
validation_size = int(row_count * validation_percent)
train_size = row_count - validation_size
train_dataset, validation_dataset =  random_split(dataset, [train_size, validation_size])

Question: Pick a batch size for the data loader.

In [118]:
batch_size = 50

train_loader = DataLoader(train_dataset, batch_size, shuffle=True)
validation_loader = DataLoader(validation_dataset, batch_size)

Create a linear regression model.

In [119]:
class InsuranceModel(nn.Module):

    def __init__(self, input_size, output_size):

        super().__init__()
        hidden_size = 500
        self.layer_0 = nn.Linear(input_size, hidden_size)
        self.layer_1 = nn.LazyLinear(hidden_size, output_size)
        self.loss_fn = F.mse_loss

    def forward(self, xb):

        return self.layer_1(self.layer_0(xb))
    
    def training_step(self, batch):

        inputs, targets = batch

        # Generate predictions
        outputs = self(inputs)

        # Calculate loss 
        return self.loss_fn(outputs, targets)

    def validation_step(self, batch):

        inputs, targets = batch
    
        # Generate predictions
        outputs = self(inputs)

        # Calculate loss
        loss = self.loss_fn(outputs, targets)

        return {"val_loss": loss.detach()}
    
    def validation_epoch_end(self, outputs):

        batch_losses = [x["val_loss"] for x in outputs]
        epoch_loss = torch.stack(batch_losses).mean()
        return {"val_loss": epoch_loss.item()}
    
    def epoch_end(self, epoch, result, num_epochs):

        if (epoch + 1) % 20 == 0 or epoch == num_epochs - 1:
            print(f"Epoch [{epoch + 1}], loss: {result['val_loss']:.4f}")

In [120]:
def evaluate(model, validation_loader):

    outputs = [model.validation_step(batch) for batch in validation_loader]
    return model.validation_epoch_end(outputs)

In [121]:
def fit(epochs, lr, model, train_loader, validation_loader, optimizer_function=torch.optim.Adam):

    history = []
    optimizer = optimizer_function(model.parameters(), lr)
    for epoch in range(epochs):
        for batch in train_loader:  # Training phase
            loss = model.training_step(batch)
            loss.backward()
            optimizer.step()
            optimizer.zero_grad()
        
        result = evaluate(model, validation_loader)  # Validation phase
        model.epoch_end(epoch, result, epochs)
        history.append(result)
    
    return history

In [122]:
input_size = len(get_input_columns(dataframe))
output_size = len(get_output_columns(dataframe))
model = InsuranceModel(input_size, output_size)



Question: Use the evaluate function to calculate the loss on the validation set before training.

In [123]:
evaluate(model, validation_loader)

  loss = self.loss_fn(outputs, targets)


{'val_loss': 285007648.0}

In [124]:
epochs = 1000
lr = 1e-3
history_0 = fit(epochs, lr, model, train_loader, validation_loader)

  return self.loss_fn(outputs, targets)
  return self.loss_fn(outputs, targets)
  loss = self.loss_fn(outputs, targets)


Epoch [20], loss: 111936800.0000
Epoch [40], loss: 111022696.0000
Epoch [60], loss: 110089392.0000
Epoch [80], loss: 109043440.0000
Epoch [100], loss: 107932904.0000
Epoch [120], loss: 106796648.0000
Epoch [140], loss: 105327504.0000
Epoch [160], loss: 103816872.0000
Epoch [180], loss: 101847368.0000
Epoch [200], loss: 99330944.0000
Epoch [220], loss: 96975456.0000
Epoch [240], loss: 93206968.0000
Epoch [260], loss: 89231216.0000
Epoch [280], loss: 84377736.0000
Epoch [300], loss: 78945760.0000
Epoch [320], loss: 72811248.0000
Epoch [340], loss: 66051500.0000
Epoch [360], loss: 59698152.0000
Epoch [380], loss: 53000060.0000
Epoch [400], loss: 47010264.0000
Epoch [420], loss: 41949072.0000
Epoch [440], loss: 37678680.0000
Epoch [460], loss: 34327532.0000
Epoch [480], loss: 32335604.0000
Epoch [500], loss: 30911252.0000
Epoch [520], loss: 30147074.0000
Epoch [540], loss: 30094274.0000
Epoch [560], loss: 30236966.0000
Epoch [580], loss: 30471116.0000
Epoch [600], loss: 30398932.0000
Epoch

In [127]:
epochs = 5000
lr = 1e-2
history_0 = fit(epochs, lr, model, train_loader, validation_loader)

  return self.loss_fn(outputs, targets)
  return self.loss_fn(outputs, targets)
  loss = self.loss_fn(outputs, targets)


Epoch [20], loss: 27395462.0000
Epoch [40], loss: 27600402.0000
Epoch [60], loss: 27385686.0000
Epoch [80], loss: 27460312.0000
Epoch [100], loss: 27263952.0000
Epoch [120], loss: 32340790.0000
Epoch [140], loss: 27747750.0000
Epoch [160], loss: 28055952.0000
Epoch [180], loss: 27660048.0000
Epoch [200], loss: 31551768.0000
Epoch [220], loss: 29725748.0000
Epoch [240], loss: 27325352.0000
Epoch [260], loss: 27402376.0000
Epoch [280], loss: 27344504.0000
Epoch [300], loss: 30993736.0000
Epoch [320], loss: 27610356.0000
Epoch [340], loss: 27846370.0000
Epoch [360], loss: 29028196.0000
Epoch [380], loss: 27299148.0000
Epoch [400], loss: 28831972.0000
Epoch [420], loss: 29933828.0000
Epoch [440], loss: 30219702.0000
Epoch [460], loss: 29288690.0000
Epoch [480], loss: 28107308.0000
Epoch [500], loss: 28962622.0000
Epoch [520], loss: 27816032.0000
Epoch [540], loss: 27407340.0000
Epoch [560], loss: 29953092.0000
Epoch [580], loss: 27797814.0000
Epoch [600], loss: 27424956.0000
Epoch [620], l

In [126]:
item_no = 6
print(model(validation_dataset[item_no][0]))
print(validation_dataset[item_no][1])

tensor([-1115.2299, -1114.8939, -1114.8011, -1114.8809, -1115.4144, -1115.3270,
        -1114.9404, -1114.4325, -1114.8660, -1114.9890, -1115.0845, -1114.5745,
        -1115.0720, -1114.8093, -1114.6744, -1114.7313, -1114.8679, -1115.3007,
        -1115.1625, -1115.2877, -1115.1005, -1114.7206, -1114.9952, -1114.8074,
        -1114.8568, -1114.8214, -1115.1595, -1114.5918, -1115.0573, -1115.3767,
        -1115.3141, -1115.2604, -1114.7008, -1114.8917, -1115.2141, -1114.8132,
        -1114.8922, -1114.8555, -1114.7000, -1114.7167, -1115.2335, -1115.1301,
        -1114.7477, -1114.9037, -1114.8739, -1115.0635, -1115.0669, -1114.9233,
        -1115.1940, -1114.8877, -1114.8434, -1115.1498, -1115.1794, -1115.1062,
        -1115.1238, -1114.8584, -1115.2111, -1114.8147, -1115.1486, -1114.8746,
        -1114.7935, -1115.4674, -1115.3602, -1114.8154, -1115.0858, -1114.6051,
        -1114.5927, -1114.3783, -1114.7483, -1115.0175, -1114.9987, -1115.2153,
        -1114.9324, -1115.1801, -1114.81