In [0]:
# Regression: Predict fuel efficiency
# Regression predicts a continuous output value, such as a price or a probability.
# This notebook uses the Auto MPG dataset to build a model that predicts fuel efficiency.

In [0]:
# Use seaborn for pairplot
!pip install -q seaborn

In [0]:
from __future__ import absolute_import, division, print_function, unicode_literals

import pathlib

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

try:
    %tensorflow_version 2.x
except Exception:
    pass

import tensorflow as tf

from tensorflow import keras
from tensorflow.keras import layers

print(tf.__version__)

In [0]:
# Auto MPG dataset
# The dataset is available from the UCI Machine Learning Repository.

# Fetch the raw data file through keras and keep the local path it returns.
AUTO_MPG_URL = 'http://archive.ics.uci.edu/ml/machine-learning-databases/auto-mpg/auto-mpg.data'
dataset_path = keras.utils.get_file('auto-mpg.data', AUTO_MPG_URL)
dataset_path

In [0]:
# Column names to assign, in file order (passed to read_csv via `names`).
column_names = [
    'MPG', 'Cylinders', 'Displacement', 'Horsepower',
    'Weight', 'Acceleration', 'Model Year', 'Origin',
]

# Space-separated values; '?' marks a missing entry and everything after a
# tab character on a line is treated as a comment and ignored.
raw_dataset = pd.read_csv(
    dataset_path,
    names=column_names,
    na_values='?',
    comment='\t',
    sep=' ',
    skipinitialspace=True,
)

# Work on a copy so the parsed raw frame stays untouched.
dataset = raw_dataset.copy()
dataset.tail()

In [0]:
# The previous cell already defines `column_names`, parses the file into
# `raw_dataset` and creates the working copy `dataset`. Re-reading the same
# file with the same options here was a copy-paste duplicate, so this cell
# only previews the loaded frame.
dataset.tail()

In [0]:
# Clean data
# The dataset contains a few unknown values. Display only the rows that hold
# at least one missing entry rather than the full boolean mask of the frame.
dataset[dataset.isna().any(axis=1)]

In [0]:
# Count the missing values in each column.
dataset.isna().sum()

In [0]:
# Drop the rows that contain missing values to keep this tutorial simple.
dataset = dataset.dropna()

In [0]:
# The 'Origin' column is categorical, not numeric (1 = USA, 2 = Europe, 3 = Japan),
# so convert it to a one-hot encoding.
#
# `pop` removes the column in place, so guard on its presence: re-running this
# cell after the conversion is then a no-op instead of a KeyError.
if 'Origin' in dataset.columns:
    origin = dataset.pop('Origin')

    dataset['USA'] = (origin == 1) * 1.0
    dataset['Europe'] = (origin == 2) * 1.0
    dataset['Japan'] = (origin == 3) * 1.0

In [0]:
# Inspect the row labels of the cleaned frame (the train/test split below uses them).
dataset.index

In [0]:
# Split the data into a training set and a test set.
# Randomly sample 80% of the rows (fixed seed, so the split is repeatable) ...
train_dataset = dataset.sample(random_state=0, frac=0.8)
# ... and use every row that was not sampled as the test set.
sampled_rows = train_dataset.index
test_dataset = dataset.drop(index=sampled_rows)

In [0]:
# Inspect the data
# Plot the joint distribution of a few pairs of columns from the training set,
# with kernel density estimates on the diagonal.
pairplot_columns = ['MPG', 'Cylinders', 'Displacement', 'Weight']
sns.pairplot(train_dataset[pairplot_columns], diag_kind='kde')

In [0]:
# Look at the overall statistics of the training features.
# The label column 'MPG' is excluded. errors='ignore' keeps this cell
# re-runnable after a later cell has already popped 'MPG' out of train_dataset.
# The result is transposed so that each feature is a row with 'mean', 'std', ... columns.
train_stats = (
    train_dataset.describe()
    .drop(columns='MPG', errors='ignore')
    .transpose()
)
train_stats

In [0]:
# Split features from labels
# 'MPG' is the value the model is trained to predict, so move it out of the
# feature frames. `pop` mutates the frames in place; popping only while the
# column is still present keeps this cell safe to re-run (no KeyError).
if 'MPG' in train_dataset.columns:
    train_labels = train_dataset.pop('MPG')
if 'MPG' in test_dataset.columns:
    test_labels = test_dataset.pop('MPG')

In [0]:
# Normalize the data
# The statistics above show how different the ranges of the features are.
# It is good practice to normalize features that use different scales and ranges.
# Although the model might converge without feature normalization, skipping it makes
# training more difficult and makes the resulting model dependent on the units used in the input.

def norm(x, stats=None):
    """Standardize features: subtract the mean and divide by the standard deviation.

    Args:
        x: DataFrame whose columns are the features to normalize.
        stats: optional DataFrame indexed by feature name with 'mean' and
            'std' columns. When omitted, the notebook-level ``train_stats``
            is used, which is the original behavior.

    Returns:
        A DataFrame shaped like ``x`` holding ``(x - mean) / std`` per column.
    """
    if stats is None:
        # Default to the training-set statistics computed earlier in the notebook.
        stats = train_stats
    return (x - stats['mean']) / stats['std']
# Normalize the training and the test features with the same function.
normed_train_data, normed_test_data = (
    norm(split) for split in (train_dataset, test_dataset)
)

In [0]:
# Note:
# The statistics used to normalize the inputs here (mean and standard deviation)
# need to be applied to any other data that is fed to the model, along with the one-hot
# encoding that we did earlier. That includes the test set as well as live data
# when the model is used in production.

In [0]:
# List the columns of train_dataset; build_model sizes its input layer from their count.
train_dataset.keys()

In [0]:
def build_model():
    """Create and compile the regression network.

    The network is two 64-unit ReLU hidden layers followed by a single
    output unit. The input width is the number of columns of the
    notebook-level ``train_dataset``.

    Returns:
        A compiled ``keras.Sequential`` model that uses an RMSprop optimizer,
        'mse' as the loss, and 'mae' and 'mse' as metrics.
    """
    n_features = len(train_dataset.keys())
    stack = [
        layers.Dense(64, activation='relu', input_shape=[n_features]),
        layers.Dense(64, activation='relu'),
        layers.Dense(1),
    ]
    network = keras.Sequential(stack)

    network.compile(
        loss='mse',
        optimizer=tf.keras.optimizers.RMSprop(0.001),
        metrics=['mae', 'mse'],
    )
    return network

In [0]:
# Build a fresh, untrained model.
model = build_model()

In [0]:
# Try out the model: take the first 10 examples from the training data
# and call model.predict on them.

example_batch = normed_train_data.head(10)
example_result = model.predict(example_batch)
example_result

In [0]:
# Train the model
# Train the model for 1000 epochs, and record the training and validation loss and
# error metrics (MAE and MSE) in the history object.

# Display training progress by printing a single dot for each completed epoch
class PrintDot(keras.callbacks.Callback):
    """Keras callback that prints one dot per completed epoch.

    A line break is emitted first whenever the epoch number is a multiple
    of 100, so the dots are wrapped into rows of 100.
    """

    def on_epoch_end(self, epoch, logs=None):
        """Print the progress marker for ``epoch``.

        Args:
            epoch: zero-based index of the epoch that just finished.
            logs: metrics dict supplied by Keras; unused here. It defaults to
                None to match the base-class signature.
        """
        if epoch % 100 == 0:
            print('')
        print('.', end='')
        
# Upper bound on the number of training epochs.
EPOCHS = 1000

# Hold out 20% of the training data for validation. verbose=0 turns off the
# built-in per-epoch logging so only the PrintDot progress output is shown.
progress_callbacks = [PrintDot()]
history = model.fit(
    normed_train_data,
    train_labels,
    epochs=EPOCHS,
    validation_split=0.2,
    verbose=0,
    callbacks=progress_callbacks,
)

In [0]:
# Collect the recorded per-epoch metrics in a DataFrame and add the epoch number as a column.
hist = pd.DataFrame(history.history).assign(epoch=history.epoch)
hist.tail()

In [0]:
def plot_history(history):
    """Plot training and validation error curves from a training history.

    Draws two figures, one for mean absolute error and one for mean squared
    error, each showing the training and the validation series per epoch.

    Args:
        history: object with a ``history`` mapping that contains the keys
            'mae', 'val_mae', 'mse' and 'val_mse', and an ``epoch`` sequence.
    """
    curves = pd.DataFrame(history.history)
    curves['epoch'] = history.epoch

    # One entry per figure:
    # (train key, validation key, y-axis label, validation legend label, y-axis limits)
    panels = (
        ('mae', 'val_mae', 'Mean Abs Error [MPG]', 'Val Error', [0, 5]),
        ('mse', 'val_mse', 'Mean Square Error[$MPG^2$]', 'Val error', [0, 20]),
    )

    for train_key, val_key, y_label, val_label, y_limits in panels:
        plt.figure()
        plt.xlabel('Epoch')
        plt.ylabel(y_label)
        plt.plot(curves['epoch'], curves[train_key], label='Train Error')
        plt.plot(curves['epoch'], curves[val_key], label=val_label)
        plt.ylim(y_limits)
        plt.legend()

    plt.show()
    
# Plot the error curves recorded in `history` by the training run.
plot_history(history)

In [0]:
# The graph shows degradation in the validation error after about
# 100 epochs. Let's update the model.fit call to automatically stop training when the
# validation score doesn't improve.
# Use an EarlyStopping callback that tests a training condition after every epoch:
# if a set number of epochs elapses without showing improvement,
# training is stopped automatically.

model = build_model()

# The patience parameter is the number of epochs to wait for an improvement.
early_stop = keras.callbacks.EarlyStopping(monitor='val_loss', patience=10)

training_callbacks = [early_stop, PrintDot()]
history = model.fit(
    normed_train_data,
    train_labels,
    epochs=EPOCHS,
    validation_split=0.2,
    verbose=0,
    callbacks=training_callbacks,
)
plot_history(history)

In [0]:
# Let's see how well the model generalizes by using the test set.
# The model is compiled with loss='mse' and metrics=['mae', 'mse'], so evaluate
# returns three values: the loss, the MAE and the MSE.
loss, mae, mse = model.evaluate(normed_test_data, test_labels, verbose=0)

# The value printed is the mean *absolute* error, so label it as such.
print('Testing set Mean Abs Error: {:5.2f} MPG'.format(mae))

In [0]:
# Predict MPG values for the testing set and plot them against the true values.

test_predictions = model.predict(normed_test_data).flatten()

plt.scatter(test_labels, test_predictions)
plt.xlabel('True Values [MPG]')
plt.ylabel('Predictions [MPG]')
plt.axis('square')
# Start both axes at zero while keeping their current upper limits.
x_upper = plt.xlim()[1]
plt.xlim([0, x_upper])
y_upper = plt.ylim()[1]
plt.ylim([0, y_upper])
# Reference line y = x: a perfect prediction would fall on it.
diagonal = [-100, 100]
_ = plt.plot(diagonal, diagonal)

In [0]:
# Histogram of the prediction errors (prediction minus true value) on the test set.
error = test_predictions - test_labels
plt.hist(error, bins=25)
plt.xlabel('Prediction Error [MPG]')
_ = plt.ylabel('Count')

In [0]:
# The error distribution is not quite Gaussian, but we might expect that because the number of samples is very small.
# Reducible error and irreducible error
# When numeric input features have values with different ranges,
# each feature should be scaled independently to the same range.