In [1]:
try:
    %tensorflow_version 2.x
    COLAB = True
    print("Note: using Google CoLab")
except:
    print("Note: not using Google CoLab")
    COLAB = False

Colab only includes TensorFlow 2.x; %tensorflow_version has no effect.
Note: using Google CoLab


In [2]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Activation
import pandas as pd
import numpy as np
from sklearn import metrics
from sklearn.model_selection import train_test_split
from scipy.stats import zscore

In [3]:
df = pd.read_csv(
    "https://data.heatonresearch.com/data/t81-558/auto-mpg.csv", 
    na_values=['NA', '?'])

In [4]:
df.head()

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,year,origin,name
0,18.0,8,307.0,130.0,3504,12.0,70,1,chevrolet chevelle malibu
1,15.0,8,350.0,165.0,3693,11.5,70,1,buick skylark 320
2,18.0,8,318.0,150.0,3436,11.0,70,1,plymouth satellite
3,16.0,8,304.0,150.0,3433,12.0,70,1,amc rebel sst
4,17.0,8,302.0,140.0,3449,10.5,70,1,ford torino


In [5]:
mean_std={}

# Remove target(Because the name of the car will hinder the training)
cars = df['name']

# Handle missing value
df['horsepower'] = df['horsepower'].fillna(df['horsepower'].median())

# Standardization
x_columns = df.columns.drop('name')
for var in x_columns:
  mean_std[var]=(df[var].mean(), df[var].std())
  df[var] = (df[var] - df[var].mean()) / df[var].std()

# Pandas to Numpy
x = df[x_columns].values
y = df['acceleration'].values # regression

In [6]:
df.head()

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,year,origin,name
0,-0.705551,1.496308,1.089233,0.672271,0.630077,-1.29387,-1.625381,-0.714246,chevrolet chevelle malibu
1,-1.089379,1.496308,1.501624,1.587959,0.853259,-1.475181,-1.625381,-0.714246,buick skylark 320
2,-0.705551,1.496308,1.194728,1.195522,0.549778,-1.656492,-1.625381,-0.714246,plymouth satellite
3,-0.961437,1.496308,1.060461,1.195522,0.546236,-1.29387,-1.625381,-0.714246,amc rebel sst
4,-0.833494,1.496308,1.04128,0.933897,0.56513,-1.837804,-1.625381,-0.714246,ford torino


In [7]:
# Split into train/test
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25)

# Build the neural network
model = Sequential()
model.add(Dense(50, input_dim=x.shape[1], activation='relu', kernel_initializer='random_normal')) # Hidden 1
model.add(Dense(25, activation='relu', kernel_initializer='random_normal')) # Hidden 2
model.add(Dense(1, kernel_initializer='random_normal')) # Output
model.compile(loss='mean_squared_error', optimizer='adam')
model.fit(x_train,y_train,validation_data=(x_test,y_test),verbose=2,epochs=100)

Epoch 1/100
10/10 - 2s - loss: 0.9800 - val_loss: 0.9801 - 2s/epoch - 176ms/step
Epoch 2/100
10/10 - 0s - loss: 0.9342 - val_loss: 0.9211 - 67ms/epoch - 7ms/step
Epoch 3/100
10/10 - 0s - loss: 0.8664 - val_loss: 0.8279 - 67ms/epoch - 7ms/step
Epoch 4/100
10/10 - 0s - loss: 0.7537 - val_loss: 0.6883 - 113ms/epoch - 11ms/step
Epoch 5/100
10/10 - 0s - loss: 0.6070 - val_loss: 0.5362 - 73ms/epoch - 7ms/step
Epoch 6/100
10/10 - 0s - loss: 0.4726 - val_loss: 0.4236 - 85ms/epoch - 8ms/step
Epoch 7/100
10/10 - 0s - loss: 0.3750 - val_loss: 0.3214 - 116ms/epoch - 12ms/step
Epoch 8/100
10/10 - 0s - loss: 0.2714 - val_loss: 0.2201 - 92ms/epoch - 9ms/step
Epoch 9/100
10/10 - 0s - loss: 0.1745 - val_loss: 0.1364 - 98ms/epoch - 10ms/step
Epoch 10/100
10/10 - 0s - loss: 0.1051 - val_loss: 0.0713 - 91ms/epoch - 9ms/step
Epoch 11/100
10/10 - 0s - loss: 0.0571 - val_loss: 0.0351 - 89ms/epoch - 9ms/step
Epoch 12/100
10/10 - 0s - loss: 0.0307 - val_loss: 0.0229 - 108ms/epoch - 11ms/step
Epoch 13/100
10/10

<keras.callbacks.History at 0x7fe6f0179ad0>

In [8]:
def reverse_standardization(dict, data, col_name):
  return data * dict[col_name][1] + dict[col_name][0]

In [9]:
pred = model.predict(x)
print(reverse_standardization(mean_std, pred[0:10], 'acceleration')) # Reverse Standardization

[[11.893829 ]
 [11.401018 ]
 [10.911663 ]
 [11.926542 ]
 [10.438042 ]
 [ 9.9341545]
 [ 9.005135 ]
 [ 8.549535 ]
 [ 9.944359 ]
 [ 8.56008  ]]


In [10]:
# Measure MSE error.  MSE is common for regression.
score = metrics.mean_squared_error(pred,y)
print(f"Final score (MSE): {score}")

Final score (MSE): 0.0005262484407749141


In [11]:
# Measure RMSE error.  RMSE is common for regression.
score = np.sqrt(metrics.mean_squared_error(pred,y))
print(f"Final score (RMSE): {score}")

Final score (RMSE): 0.022940105509236745


In [12]:
# Sample predictions
for i in range(10):
    print(f"{i+1}. Car name: {cars[i]}, Acceleration: {reverse_standardization(mean_std, y[i], 'acceleration')}, predicted Acceleration: {reverse_standardization(mean_std, pred[i], 'acceleration')}.")

1. Car name: chevrolet chevelle malibu, Acceleration: 12.0, predicted Acceleration: [11.893829].
2. Car name: buick skylark 320, Acceleration: 11.5, predicted Acceleration: [11.401018].
3. Car name: plymouth satellite, Acceleration: 11.0, predicted Acceleration: [10.911663].
4. Car name: amc rebel sst, Acceleration: 12.0, predicted Acceleration: [11.926542].
5. Car name: ford torino, Acceleration: 10.5, predicted Acceleration: [10.438042].
6. Car name: ford galaxie 500, Acceleration: 10.0, predicted Acceleration: [9.9341545].
7. Car name: chevrolet impala, Acceleration: 9.0, predicted Acceleration: [9.005135].
8. Car name: plymouth fury iii, Acceleration: 8.5, predicted Acceleration: [8.549535].
9. Car name: pontiac catalina, Acceleration: 10.0, predicted Acceleration: [9.944359].
10. Car name: amc ambassador dpl, Acceleration: 8.5, predicted Acceleration: [8.56008].
