## Projects: Predicting Car Fuel Efficiency and MNIST Digits

These projects are from Chapter 13 of Raschka et al. They cover some basic data processing steps and make use of the concepts discussed in pytorch-mechanics.ipynb.

## Project 1: Predicting Car Fuel Efficiency

Here we want to predict the fuel efficiency of a car given features such as year, cylinders, horsepower, weight, etc. This is the famous "Auto MPG" dataset.

Some of the useful features of this project:

* It contains continuous, discrete, and categorical variables, so we learn how to deal with each of those
* Shows how to "bucket" discrete data into a few broader buckets
* One hot encoding for categorical features
* It is regression rather than classification, while we've focused on the latter so far
* Example of loading data from a URL

In [1]:
import numpy as np
import torch
import torch.nn as nn
import pandas as pd

from IPython.display import Image

In [2]:
# Fetch the Auto MPG data from the UCI machine learning repository with pandas.
column_names = ['MPG', 'Cylinders', 'Displacement', 'Horsepower', 'Weight',
                'Acceleration', 'Model Year', 'Origin']
url = 'http://archive.ics.uci.edu/ml/machine-learning-databases/auto-mpg/auto-mpg.data'

# Parsing options:
#   names     -> column labels are supplied explicitly
#   na_values -> "?" entries are read as NaN
#   comment   -> the rest of a line after a tab character is not parsed
#   sep / skipinitialspace -> space-separated fields, extra spaces after a delimiter are skipped
read_options = dict(names=column_names,
                    na_values="?", comment='\t',
                    sep=" ", skipinitialspace=True)
df = pd.read_csv(url, **read_options)

df.tail()

Unnamed: 0,MPG,Cylinders,Displacement,Horsepower,Weight,Acceleration,Model Year,Origin
393,27.0,4,140.0,86.0,2790.0,15.6,82,1
394,44.0,4,97.0,52.0,2130.0,24.6,82,2
395,32.0,4,135.0,84.0,2295.0,11.6,82,1
396,28.0,4,120.0,79.0,2625.0,18.6,82,1
397,31.0,4,119.0,82.0,2720.0,19.4,82,1


In [None]:
# Split the data 80/20 into training and test sets with scikit-learn, then
# summarize every training column with pandas' DataFrame.describe().
import sklearn
import sklearn.model_selection


# random_state fixes the shuffle so the split is reproducible
splits = sklearn.model_selection.train_test_split(df, train_size=0.8, random_state=1)
df_train, df_test = splits
# transposed so that each feature is a row and each statistic
# (count, mean, std, min, quartiles, max) is a column
train_stats = df_train.describe().T
train_stats

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
MPG,318.0,23.583962,7.89075,9.0,17.5,23.0,29.8,46.6
Cylinders,318.0,5.418239,1.682508,3.0,4.0,4.0,6.0,8.0
Displacement,318.0,191.128931,102.212399,70.0,101.75,146.0,258.0,455.0
Horsepower,312.0,103.317308,37.839804,46.0,75.0,92.0,122.75,230.0
Weight,318.0,2952.04717,836.500568,1613.0,2220.75,2801.0,3533.75,5140.0
Acceleration,318.0,15.71761,2.751966,8.5,14.0,15.5,17.4,24.8
Model Year,318.0,75.921384,3.683467,70.0,73.0,76.0,79.0,82.0
Origin,318.0,1.58805,0.80815,1.0,1.0,1.0,2.0,3.0


In [None]:
# Standardize the numeric features: subtract the mean and divide by the std.
# Both statistics come from the training set only (train_stats) and are applied
# to the training and test frames alike.
from packaging import version  # no longer used by this cell; kept in case other cells rely on it

# the list of features we will ultimately use
numeric_column_names = ['Cylinders', 'Displacement', 'Horsepower', 'Weight', 'Acceleration']

# work on copies so the un-normalized frames stay available
df_train_norm, df_test_norm = df_train.copy(), df_test.copy()

# Assigning with df[col] = ... replaces the whole column, so an integer column
# such as 'Cylinders' simply becomes a float column. This form behaves the same
# on every pandas version, so no version-specific branch is needed.
for col_name in numeric_column_names:
    mean = train_stats.loc[col_name, 'mean']
    std = train_stats.loc[col_name, 'std']
    df_train_norm[col_name] = (df_train_norm[col_name] - mean) / std
    df_test_norm[col_name] = (df_test_norm[col_name] - mean) / std

df_train_norm.tail()

Unnamed: 0,MPG,Cylinders,Displacement,Horsepower,Weight,Acceleration,Model Year,Origin
393,27.0,-0.842931,-0.500222,-0.457648,-0.19372,-0.042737,82,1
255,25.1,-0.842931,-0.500222,-0.404794,-0.277402,-0.115412,78,1
72,15.0,1.534472,1.10428,1.233693,1.123673,-1.169204,72,1
235,26.0,-0.842931,-0.920915,-0.748347,-0.821335,0.902042,77,3
37,18.0,0.34577,0.399864,-0.087667,0.401617,-0.079074,71,1


Here is an example of a nice PyTorch feature: `torch.bucketize` lets you group a discrete variable with many values into a few broader "buckets".

In [5]:
# "Bucketize" the discrete variable "Model Year" into four groups.
# With right=True, torch.bucketize returns the index i for which
#   boundaries[i-1] <= v < boundaries[i]
# so each boundary value starts a new bucket:
#   0: year < 73    1: 73 <= year < 76    2: 76 <= year < 79    3: year >= 79
boundaries = torch.tensor([73, 76, 79])  # define boundaries of our buckets
v = torch.tensor(df_train_norm['Model Year'].values)
# .numpy() hands pandas a plain NumPy array of bucket indices rather than a torch tensor
df_train_norm['Model Year Bucketed'] = torch.bucketize(v, boundaries, right=True).numpy()
v = torch.tensor(df_test_norm['Model Year'].values)
df_test_norm['Model Year Bucketed'] = torch.bucketize(v, boundaries, right=True).numpy()

# Add the bucketed column to the list of features to use. The membership check
# keeps the cell idempotent: re-running it must not list the feature twice.
if 'Model Year Bucketed' not in numeric_column_names:
    numeric_column_names.append('Model Year Bucketed')


Now we will show how to one-hot encode a categorical feature. If the feature has many categories which may share some similarity, it is preferable to instead use embeddings, both in order to have a more compressed representation that is computationally tractable to work with and to capture similarity between categories. But here we have only a few categories so one-hot is fine.

In [31]:
# one-hot encode country of origin
from torch.nn.functional import one_hot

# number of distinct origin categories, taken from the training data
total_origin = len(set(df_train_norm['Origin']))

# one_hot needs class indices in [0, total_origin). Origin takes the values
# 1/2/3 here, and the modulo maps them to 1/2/0 (3 % 3 == 0).
# num_classes is passed explicitly so the encoding always has total_origin
# columns; without it one_hot infers the width from the largest index present,
# and a split that lacks a category would get fewer columns.
origin_index = torch.from_numpy(df_train_norm['Origin'].values) % total_origin
origin_encoded = one_hot(origin_index, num_classes=total_origin)
x_train_numeric = torch.tensor(df_train_norm[numeric_column_names].values)  # extract only the columns we want as features
x_train = torch.cat([x_train_numeric, origin_encoded], 1).float()  # column-wise concatenate our one-hot encoded origin feature

# repeat for test data, using the same number of classes as the training data
origin_index = torch.from_numpy(df_test_norm['Origin'].values) % total_origin
origin_encoded = one_hot(origin_index, num_classes=total_origin)
x_test_numeric = torch.tensor(df_test_norm[numeric_column_names].values)
x_test = torch.cat([x_test_numeric, origin_encoded], 1).float()

In [32]:
# MPG is the target variable. It is continuous, so this is a regression problem;
# the targets are stored as float32 tensors.
y_train = torch.tensor(df_train_norm['MPG'].values, dtype=torch.float32)
y_test = torch.tensor(df_test_norm['MPG'].values, dtype=torch.float32)

In [33]:
# Drop examples whose feature vector contains a NaN (the raw file marks missing
# values with "?", which were read in as NaN).
# keep-mask: True for rows in which no feature is NaN
train_keep = ~torch.isnan(x_train).any(dim=1)
x_train, y_train = x_train[train_keep], y_train[train_keep]

# Apply the same filter to the test split: a single NaN row there would turn
# the test MSE/MAE into NaN.
test_keep = ~torch.isnan(x_test).any(dim=1)
x_test, y_test = x_test[test_keep], y_test[test_keep]

In [34]:
# Pair features with targets in a TensorDataset and serve them in shuffled mini-batches.
from torch.utils.data import DataLoader, TensorDataset

batch_size = 8
train_ds = TensorDataset(x_train, y_train)
torch.manual_seed(1)  # seed PyTorch's global RNG for reproducible runs
train_dl = DataLoader(dataset=train_ds, batch_size=batch_size, shuffle=True)

In [35]:
# Build a sequential fully connected network from a list of hidden-layer widths.
# Every hidden layer is Linear followed by ReLU, so deep stacks with identical
# layer types can be described by the width list alone.

hidden_units = [8, 4]
input_size = x_train.shape[1]  # one input per feature column

all_layers = []
for width in hidden_units:
    # Linear is constructed before ReLU; input_size tracks the running width
    all_layers.extend((nn.Linear(input_size, width), nn.ReLU()))
    input_size = width  # the next layer consumes this layer's outputs

# output layer: a single unit producing the MPG prediction
all_layers.append(nn.Linear(hidden_units[-1], 1))
model = nn.Sequential(*all_layers)

model

Sequential(
  (0): Linear(in_features=9, out_features=8, bias=True)
  (1): ReLU()
  (2): Linear(in_features=8, out_features=4, bias=True)
  (3): ReLU()
  (4): Linear(in_features=4, out_features=1, bias=True)
)

In [36]:
# Typical training procedure: mean-squared-error loss minimized with plain SGD.
loss_fn = nn.MSELoss()
optimizer = torch.optim.SGD(model.parameters(), lr=0.001)

torch.manual_seed(1)
num_epochs = 200
log_epochs = 20  # report the loss once every log_epochs epochs
for epoch in range(num_epochs):
    batch_losses = []
    for x_batch, y_batch in train_dl:
        # the model outputs one column per example; take column 0 to match y_batch
        pred = model(x_batch)[:, 0]
        loss = loss_fn(pred, y_batch)
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()  # clear gradients so they do not accumulate into the next batch
        batch_losses.append(loss.item())
    if epoch % log_epochs != 0:
        continue
    # average of the per-batch mean losses over this epoch
    mean_loss = sum(batch_losses) / len(train_dl)
    print(f'Epoch {epoch}  Loss {mean_loss:.4f}')

Epoch 0  Loss 543.6362
Epoch 20  Loss 9.6601
Epoch 40  Loss 9.4157
Epoch 60  Loss 8.5744
Epoch 80  Loss 8.4528
Epoch 100  Loss 7.8052
Epoch 120  Loss 8.1525
Epoch 140  Loss 7.6223
Epoch 160  Loss 7.2230
Epoch 180  Loss 7.1409


In [37]:
# Evaluate on the held-out test data: report mean squared and mean absolute error.
mae_fn = nn.L1Loss()
with torch.no_grad():  # no gradient tracking is needed for pure prediction
    pred = model(x_test.float())[:, 0]
    loss = loss_fn(pred, y_test)   # loss_fn is the MSE loss used for training
    mae = mae_fn(pred, y_test)
    print(f'Test MSE: {loss.item():.4f}')
    print(f'Test MAE: {mae.item():.4f}')

Test MSE: 5.5979
Test MAE: 1.7947


## Project 2: Predicting MNIST Digits

This is similar to what has been done in previous chapters, but here we will use a Sequential network. We will also see the `nn.Flatten` layer, which flattens each image into a 1D vector.

In [39]:
import torchvision
from torchvision import transforms


image_path = '../datasets/'

# Compose chains transformations of the data. ToTensor is the only one used here:
#   it converts each image to a PyTorch tensor and rescales 0-255 pixels to 0-1.
# In the next chapter on CNNs we will see other transforms.
transform = transforms.Compose([transforms.ToTensor()])

# Arguments shared by both splits. download=False because the files are
# expected to be under image_path already (downloaded in ch12).
shared_kwargs = dict(root=image_path, transform=transform, download=False)
mnist_train_dataset = torchvision.datasets.MNIST(train=True, **shared_kwargs)
mnist_test_dataset = torchvision.datasets.MNIST(train=False, **shared_kwargs)

batch_size = 64
torch.manual_seed(1)  # seed PyTorch's global RNG for reproducible runs
train_dl = DataLoader(dataset=mnist_train_dataset, batch_size=batch_size, shuffle=True)

In [40]:
# Build a sequential network again, this time for 10-class digit classification.
hidden_units = [32, 16]
first_image = mnist_train_dataset[0][0]
image_size = first_image.shape
input_size = first_image.numel()  # total number of values in one image

# Flatten turns each image into a 1D vector before the first Linear layer
all_layers = [nn.Flatten()]
for width in hidden_units:
    # Linear is constructed before ReLU; input_size tracks the running width
    all_layers.extend((nn.Linear(input_size, width), nn.ReLU()))
    input_size = width

# output layer: one score per digit class
all_layers.append(nn.Linear(hidden_units[-1], 10))
model = nn.Sequential(*all_layers)

model

Sequential(
  (0): Flatten(start_dim=1, end_dim=-1)
  (1): Linear(in_features=784, out_features=32, bias=True)
  (2): ReLU()
  (3): Linear(in_features=32, out_features=16, bias=True)
  (4): ReLU()
  (5): Linear(in_features=16, out_features=10, bias=True)
)

In [41]:
# Standard training loop: cross-entropy loss on the class scores, optimized with Adam.
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

torch.manual_seed(1)
num_epochs = 20
for epoch in range(num_epochs):
    n_correct = 0  # correct predictions seen so far in this epoch
    for x_batch, y_batch in train_dl:
        pred = model(x_batch)
        loss = loss_fn(pred, y_batch)
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()  # clear gradients so they do not accumulate into the next batch
        # predicted class = index of the largest score in each row
        hits = pred.argmax(dim=1) == y_batch
        n_correct += hits.float().sum()
    # running training accuracy, measured while the weights were being updated
    accuracy = n_correct / len(train_dl.dataset)
    print(f'Epoch {epoch}  Accuracy {accuracy:.4f}')
    

Epoch 0  Accuracy 0.8531
Epoch 1  Accuracy 0.9287
Epoch 2  Accuracy 0.9413
Epoch 3  Accuracy 0.9506
Epoch 4  Accuracy 0.9558
Epoch 5  Accuracy 0.9592
Epoch 6  Accuracy 0.9627
Epoch 7  Accuracy 0.9650
Epoch 8  Accuracy 0.9674
Epoch 9  Accuracy 0.9690
Epoch 10  Accuracy 0.9710
Epoch 11  Accuracy 0.9729
Epoch 12  Accuracy 0.9739
Epoch 13  Accuracy 0.9750
Epoch 14  Accuracy 0.9764
Epoch 15  Accuracy 0.9777
Epoch 16  Accuracy 0.9779
Epoch 17  Accuracy 0.9798
Epoch 18  Accuracy 0.9806
Epoch 19  Accuracy 0.9813


In [42]:
# make model predictions and compute accuracy
# Inference only: no_grad stops autograd from recording a graph for the whole
# test set, matching how the regression model was evaluated.
with torch.no_grad():
    # .data bypasses the ToTensor transform, so divide by 255 to get the 0-1 scale
    pred = model(mnist_test_dataset.data / 255.)
# the predicted digit is the class with the highest score (argmax); compare with truth
is_correct = (torch.argmax(pred, dim=1) == mnist_test_dataset.targets).float()
print(f'Test accuracy: {is_correct.mean():.4f}')

Test accuracy: 0.9647
