In [1]:
import torch
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt

  from .autonotebook import tqdm as notebook_tqdm


In [2]:

url = 'http://archive.ics.uci.edu/ml/machine-learning-databases/auto-mpg/auto-mpg.data'

column_names = ['MPG', 'Cylinders', 'Displacement', 'Horsepower', 'Weight', 'Acceleration', 'Model Year', 'Origin']

df = pd.read_csv(url, names=column_names,na_values = "?", comment='\t',sep=" ", skipinitialspace=True)


In [3]:
## drop the NA rows
df = df.dropna()
df = df.reset_index(drop=True)

In [4]:
df.head()

Unnamed: 0,MPG,Cylinders,Displacement,Horsepower,Weight,Acceleration,Model Year,Origin
0,18.0,8,307.0,130.0,3504.0,12.0,70,1
1,15.0,8,350.0,165.0,3693.0,11.5,70,1
2,18.0,8,318.0,150.0,3436.0,11.0,70,1
3,16.0,8,304.0,150.0,3433.0,12.0,70,1
4,17.0,8,302.0,140.0,3449.0,10.5,70,1


In [7]:
from sklearn.model_selection import train_test_split

df_train, df_test = train_test_split(df, test_size=0.2, random_state=1)

train_state = df_train.describe().transpose()

train_state

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
MPG,313.0,23.404153,7.666909,9.0,17.5,23.0,29.0,46.6
Cylinders,313.0,5.402556,1.701506,3.0,4.0,4.0,8.0,8.0
Displacement,313.0,189.51278,102.675646,68.0,104.0,140.0,260.0,455.0
Horsepower,313.0,102.929712,37.919046,46.0,75.0,92.0,120.0,230.0
Weight,313.0,2961.198083,848.602146,1613.0,2219.0,2755.0,3574.0,5140.0
Acceleration,313.0,15.704473,2.725399,8.5,14.0,15.5,17.3,24.8
Model Year,313.0,75.929712,3.675305,70.0,73.0,76.0,79.0,82.0
Origin,313.0,1.591054,0.807923,1.0,1.0,1.0,2.0,3.0


In [8]:
numeric_column_names = [ 'Cylinders', 'Displacement', 'Horsepower', 'Weight', 'Acceleration'  ]

df_train_norm, df_test_norm = df_train.copy(), df_test.copy()

for col_name in numeric_column_names:
    mean = train_state.loc[col_name, 'mean']
    std = train_state.loc[col_name, 'std']

    df_train_norm.loc[:, col_name] = (df_train_norm.loc[:, col_name] - mean) / std
    df_test_norm.loc[:, col_name] = (df_test_norm.loc[:, col_name] - mean) / std

df_train_norm.tail()


Unnamed: 0,MPG,Cylinders,Displacement,Horsepower,Weight,Acceleration,Model Year,Origin
203,28.0,-0.824303,-0.90102,-0.736562,-0.950031,0.255202,76,3
255,19.4,0.351127,0.4138,-0.340982,0.29319,0.548737,78,1
72,13.0,1.526556,1.144256,0.713897,1.339617,-0.625403,72,1
235,30.5,-0.824303,-0.89128,-1.053025,-1.072585,0.475353,77,1
37,14.0,1.526556,1.563051,1.636916,1.47042,-1.35924,71,1


![Alt text](../images/screenshot/bucket.png)

In [11]:
boudaries = torch.tensor([73, 76, 79])

v = torch.tensor(df_train_norm['Model Year'].values)
df_train_norm['Model Year Bucketed'] = torch.bucketize(v, boudaries, right=True)

v = torch.tensor(df_test_norm['Model Year'].values)
df_test_norm['Model Year Bucketed'] = torch.bucketize(v, boudaries, right=True)

df_train_norm.tail()

Unnamed: 0,MPG,Cylinders,Displacement,Horsepower,Weight,Acceleration,Model Year,Origin,Model Year Bucketed
203,28.0,-0.824303,-0.90102,-0.736562,-0.950031,0.255202,76,3,2
255,19.4,0.351127,0.4138,-0.340982,0.29319,0.548737,78,1,2
72,13.0,1.526556,1.144256,0.713897,1.339617,-0.625403,72,1,0
235,30.5,-0.824303,-0.89128,-1.053025,-1.072585,0.475353,77,1,2
37,14.0,1.526556,1.563051,1.636916,1.47042,-1.35924,71,1,0


In [12]:
numeric_column_names.append('Model Year Bucketed')

one-hot-encoding approach on the categorical feature in order to convert it into the dense format

In [16]:
from torch.nn.functional import one_hot

total_origin = len(set(df_train_norm['Origin']))

origin_encoded = one_hot(torch.from_numpy(df_train_norm['Origin'].values))
x_train_numeric = torch.tensor(df_train_norm[numeric_column_names].values)

x_train = torch.cat([x_train_numeric, origin_encoded],1).float()

total_origin = len(set(df_test_norm['Origin']))

origin_encoded = one_hot(torch.from_numpy(df_test_norm['Origin'].values))
x_test_numeric = torch.tensor(df_test_norm[numeric_column_names].values)

x_test = torch.cat([x_test_numeric, origin_encoded],1).float()

In [17]:
y_train = torch.tensor(df_train_norm['MPG'].values).float()
y_test = torch.tensor(df_test_norm['MPG'].values).float()

In [29]:
from torch.utils.data import DataLoader, TensorDataset

train_ds = TensorDataset(x_train, y_train)
batch_size = 8
torch.manual_seed(1)
train_dl = DataLoader(train_ds, batch_size, shuffle=True)

In [30]:
import torch.nn as nn 

hidden_units = [8, 4]
input_size = x_train.shape[1]
print(input_size)
all_layer = []

for hidden_unit in hidden_units:
    layer = nn.Linear(input_size, hidden_unit)
    all_layer.append(layer)
    all_layer.append(nn.ReLU())

    input_size = hidden_unit

all_layer.append(nn.Linear(hidden_units[-1], 1))

model = nn.Sequential(*all_layer)
model


10


Sequential(
  (0): Linear(in_features=10, out_features=8, bias=True)
  (1): ReLU()
  (2): Linear(in_features=8, out_features=4, bias=True)
  (3): ReLU()
  (4): Linear(in_features=4, out_features=1, bias=True)
)

In [31]:
loss_fn = nn.MSELoss()
optimizer = torch.optim.SGD(model.parameters(), lr=0.001)

In [32]:
torch.manual_seed(1)
num_epochs = 200
log_epochs = 20

for epoch in range(num_epochs):
    loss_hist_train = 0

    for x_batch, y_batch in train_dl:
        pred=model(x_batch)[:, 0]
        loss =loss_fn(pred, y_batch)
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
        loss_hist_train += loss.item()

    if epoch % log_epochs ==0 :
        print(f'Epoch {epoch} Loss ',f'{loss_hist_train/len(train_dl):.4f}')


Epoch 0 Loss  563.4365
Epoch 20 Loss  10.6730
Epoch 40 Loss  8.1136
Epoch 60 Loss  7.5590
Epoch 80 Loss  7.2724
Epoch 100 Loss  6.8976
Epoch 120 Loss  6.5562
Epoch 140 Loss  6.7111
Epoch 160 Loss  6.4681
Epoch 180 Loss  5.9357


In [33]:
with torch.no_grad():
    pred = model(x_test.float())[:, 0]
    loss = loss_fn(pred, y_test)
    
    print(f'Test MSE: {loss.item():.4f}')
    print(f'Test MAE: {nn.L1Loss()(pred, y_test).item():.4f}')

Test MSE: 7.9344
Test MAE: 1.8505
