In [1]:
# We usually don't use PyTorch for simple linear regression (standalone linear classifier; not in neural networks). Scikit-learn is typically preferred due to its simplicity. PyTorch is mainly used for neural network for complex datasets

In [None]:
# # Unlike scikit-learn, PyTorch doesn't come with built-in high-level models like "LogisticRegression" or "RandomForestClassifier". Instead, PyTorch provides building blocks that you use to construct these models yourself.

In [2]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import torch


df = pd.read_csv('FuelConsumption.csv')
df.head()

x_num = df[['ENGINESIZE', 'CYLINDERS', 'FUELCONSUMPTION_CITY', 
           'FUELCONSUMPTION_HWY', 'FUELCONSUMPTION_COMB', 
           'FUELCONSUMPTION_COMB_MPG']]

x_cat = df[['MODELYEAR', 'VEHICLECLASS', 'TRANSMISSION', 'FUELTYPE']]
x_cat_encoded = pd.get_dummies(x_cat, drop_first=True)
x = pd.concat([x_num, x_cat_encoded], axis=1)
X = x.values
X = StandardScaler().fit_transform(X)

y = df[['CO2EMISSIONS']].values  # [[]] so we don't need to do y = y.reshape(-1, 1) anymore

X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

# Convert data to tensors
X_train = torch.FloatTensor(X_train)  # or we can write one code in class init later (self.X = torch.FloatTensor(X)) to do all Xs and one code to do all ys (but cause we need these later I think this way is better)    
X_val = torch.FloatTensor(X_val)    
X_test = torch.FloatTensor(X_test) 
y_train = torch.FloatTensor(y_train)    
y_val = torch.FloatTensor(y_val)    
y_test = torch.FloatTensor(y_test)    

In [3]:
from torch.utils.data import TensorDataset, DataLoader
import torch.nn as nn


    
# convert initial data to useable data for PyTroch model (to create a custom dataset check the end of the page)
train_dataset = TensorDataset(X_train, y_train)
val_dataset = TensorDataset(X_val, y_val)
test_dataset = TensorDataset(X_test, y_test)

# create data loaders for 3 reasons:
# Batch processing - splits data into smaller batches for efficient training
# Shuffle data - randomizes order of samples
# Memory efficiency - loads data in batches instead of all at once
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)  # so we can get X_batch, y_batch in train_loader later
val_loader = DataLoader(val_dataset, batch_size=32)  # actually no need to these 2 in this model
test_loader = DataLoader(test_dataset, batch_size=32)  


# Define the Linear Regression model
class LinearRegression(nn.Module):
    def __init__(self, input_size):
        super().__init__()  # It's required in PyTorch models to properly set up the neural network inheritance chain and initialize all the necessary components from the parent class.
        self.linear = nn.Linear(input_size, 1)  # output size is 1
        
    def forward(self, x):  
        return self.linear(x)  # takes your input data x and passes it through the linear model (which applies the equation y = wx + b, where w and b are the learned weights and bias) to get the predicted output.
    

# Initialize the model
input_size = X_train.shape[1]
model = LinearRegression(input_size)
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)    


# Training loop (check more advanced code in the cell below)
num_epochs = 100
for epoch in range(num_epochs):
    model.train()
    total_train_loss = 0
    for batch_X, batch_y in train_loader:  # order here (v) is important
        optimizer.zero_grad()
        pred_train = model(batch_X)  # forward padd: Make predictions
        loss = criterion(pred_train, batch_y)
        #* Note that in PyTorch we don't train the model like Keras with X and y! We just use Xs, and the first train_pred is a random thing since we didn't tell anything about y to th model. Then in the next steps, it finds the weights and bias by loss and optimizer
        
        # Backward pass and optimization
        loss.backward()
        optimizer.step()

        # Accumulate total loss
        total_train_loss += loss.item()  # loss.item(): loss value for the current batch

    # Calculate average loss for the epoch
    avg_train_loss = total_train_loss / len(train_loader) 

    
    # Validation
    model.eval()  # note that we don't use batches for eval here (for small datasets we can avoid using it (see logisitic regression for using it); check NeuralNetworkRegression_PyTorch file to see how to use it)
    with torch.no_grad():  # Don't compute gradients during validation
        pred_val = model(X_val)
        val_loss = criterion(pred_val, y_val)

    print()    
    if (epoch + 1) % 10 == 0:  # print losses for each 10 epochs
        print(f'Epoch [{epoch+1}/{num_epochs}], Average Train Loss: {avg_train_loss:.4f}, Val Loss: {val_loss.item():.4f}')


# Test the model
model.eval()  # note that we don't use batches for val here (for small datasets we can avoid using it; check NeuralNetworkRegression_PyTorch file to see how to use it)
with torch.no_grad():  
    pred_test = model(X_test)
    test_loss = criterion(pred_test, y_test)
    print(f'\nTest Loss: {test_loss.item():.4f}')        












Epoch [10/100], Average Train Loss: 67310.5737, Val Loss: 66489.9531










Epoch [20/100], Average Train Loss: 65118.5122, Val Loss: 64763.2109










Epoch [30/100], Average Train Loss: 63883.1413, Val Loss: 63317.9297










Epoch [40/100], Average Train Loss: 62081.8465, Val Loss: 62064.1758










Epoch [50/100], Average Train Loss: 60726.6489, Val Loss: 60933.1875










Epoch [60/100], Average Train Loss: 59000.5210, Val Loss: 59879.6641










Epoch [70/100], Average Train Loss: 57817.0101, Val Loss: 58864.3984










Epoch [80/100], Average Train Loss: 56543.1688, Val Loss: 57893.6758










Epoch [90/100], Average Train Loss: 55270.5811, Val Loss: 56930.8359










Epoch [100/100], Average Train Loss: 54307.6279, Val Loss: 55962.8047

Test Loss: 55743.5469


In [4]:
'''
# create a custom dataset class to create datasets that we can use in DataLoader() so we can get batches
class FuelDataset(Dataset):
    def __init__(self, X, y):
        self.X = X  
        self.y = y

    def __len__(self):
        return len(self.X)
    
    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]
'''

'\n# create a custom dataset class to create datasets that we can use in DataLoader() so we can get batches\nclass FuelDataset(Dataset):\n    def __init__(self, X, y):\n        self.X = X  \n        self.y = y\n\n    def __len__(self):\n        return len(self.X)\n    \n    def __getitem__(self, idx):\n        return self.X[idx], self.y[idx]\n'

In [5]:
'''
more advanced training for Track the best validation loss and implement early stopping, and save the best model based on validation performance

best_val_loss = float('inf')
patience = 10  # for early stopping
counter = 0

for epoch in range(num_epochs):
    model.train()
    total_train_loss = 0
    num_batches = 0
    
    for batch_X, batch_y in train_loader:
        optimizer.zero_grad()
        pred_train = model(batch_X)
        loss = criterion(pred_train, batch_y)
        loss.backward()
        optimizer.step()
        
        total_train_loss += loss.item()
        num_batches += 1
    
    avg_train_loss = total_train_loss / num_batches
    
    # Validation
    model.eval()
    with torch.no_grad():
        pred_val = model(X_val)
        val_loss = criterion(pred_val, y_val)
        
        # Save best model
        if val_loss < best_val_loss:
            best_val_loss = val_loss
            torch.save(model.state_dict(), 'best_model.pth')
            counter = 0
        else:
            counter += 1
        
        # Early stopping
        if counter >= patience:
            print(f'Early stopping triggered at epoch {epoch+1}')
            break
    
    if (epoch + 1) % 10 == 0:
        print(f'Epoch [{epoch+1}/{num_epochs}], '
              f'Average Train Loss: {avg_train_loss:.4f}, '
              f'Val Loss: {val_loss.item():.4f}')

# Load best model for testing
model.load_state_dict(torch.load('best_model.pth'))

'''

"\nmore advanced training for Track the best validation loss and implement early stopping, and save the best model based on validation performance\n\nbest_val_loss = float('inf')\npatience = 10  # for early stopping\ncounter = 0\n\nfor epoch in range(num_epochs):\n    model.train()\n    total_train_loss = 0\n    num_batches = 0\n    \n    for batch_X, batch_y in train_loader:\n        optimizer.zero_grad()\n        pred_train = model(batch_X)\n        loss = criterion(pred_train, batch_y)\n        loss.backward()\n        optimizer.step()\n        \n        total_train_loss += loss.item()\n        num_batches += 1\n    \n    avg_train_loss = total_train_loss / num_batches\n    \n    # Validation\n    model.eval()\n    with torch.no_grad():\n        pred_val = model(X_val)\n        val_loss = criterion(pred_val, y_val)\n        \n        # Save best model\n        if val_loss < best_val_loss:\n            best_val_loss = val_loss\n            torch.save(model.state_dict(), 'best_model.p