In [1]:
# Python ≥3.5 is required
import sys
import os
assert sys.version_info >= (3, 5)

# Scikit-Learn ≥0.20 is required
import sklearn
assert sklearn.__version__ >= "0.20"

# TensorFlow ≥2.0-preview is required
import tensorflow as tf
from tensorflow import keras
assert tf.__version__ >= "2.0"

# Common imports
import numpy as np
import os

# to make this notebook's output stable across runs
np.random.seed(42)

# To plot pretty figures
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
mpl.rc('axes', labelsize=14)
mpl.rc('xtick', labelsize=12)
mpl.rc('ytick', labelsize=12)

# Constants

In [27]:
# Directory holding the California housing CSV files. The default is the
# original author's local checkout; set the CALIFORNIA_HOUSING_DIR environment
# variable to use another location without editing the notebook.
DATA_DIR = os.environ.get(
    "CALIFORNIA_HOUSING_DIR",
    "/home/oonisim/home/repositories/git/oonisim/handson-ml2/datasets/california_housing",
)
# Glob pattern matching the housing CSV files inside DATA_DIR.
LIST_FILESET = os.path.join(DATA_DIR, "housing*.csv")

# Data

## Shuffled California housing files
See datasets/california_housing/shuffle_split.sh

In [52]:
# List the CSV files in Python instead of shelling out to `ls`: a failed `ls`
# does not raise, so a missing dataset would only surface in a later cell, and
# the unquoted shell interpolation breaks on paths containing spaces.
# Sorting gives a stable order.
file_paths = sorted(tf.io.gfile.glob(LIST_FILESET))
assert file_paths, "No files match " + LIST_FILESET
# Number of matching files.
len(file_paths)

5


In [56]:
# Derive the file count from the existing listing rather than re-running
# `ls | wc -l`, which returned the count as a string. Only entries that exist
# as files are counted. The result is an int, so int(num_files) still works.
num_files = sum(1 for path in file_paths if os.path.isfile(path))
print(num_files)

for path in file_paths:
    print(path)

5
/home/oonisim/home/repositories/git/oonisim/handson-ml2/datasets/california_housing/housing00.csv
/home/oonisim/home/repositories/git/oonisim/handson-ml2/datasets/california_housing/housing01.csv
/home/oonisim/home/repositories/git/oonisim/handson-ml2/datasets/california_housing/housing02.csv
/home/oonisim/home/repositories/git/oonisim/handson-ml2/datasets/california_housing/housing03.csv
/home/oonisim/home/repositories/git/oonisim/handson-ml2/datasets/california_housing/housing04.csv


## Examine California housing data

In [63]:
import pandas as pd

# Load only the first listed file to inspect its rows and columns.
first_file_path = file_paths[0]
data = pd.read_csv(first_file_path)

# Report (rows, columns), then display the first rows of the frame.
n_rows, n_columns = data.shape
print((n_rows, n_columns))
data.head()

(4129, 10)
10


Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-118.25,34.16,52.0,2477.0,385.0,993.0,371.0,4.9135,368100.0,<1H OCEAN
1,-117.94,34.02,27.0,5026.0,955.0,3899.0,930.0,3.871,162900.0,<1H OCEAN
2,-122.01,37.37,25.0,2213.0,360.0,1066.0,390.0,7.2165,360900.0,<1H OCEAN
3,-122.41,37.62,39.0,3119.0,758.0,1807.0,696.0,3.2216,242700.0,NEAR OCEAN
4,-118.16,34.01,37.0,690.0,261.0,952.0,255.0,1.6354,158900.0,<1H OCEAN


In [99]:
# One default per CSV column for tf.io.decode_csv: every column except the last
# gets a float32 default of 0.0 and the last column gets an empty-string default.
# The list is sized from the DataFrame's column count instead of a hand-copied
# run of nine identical entries, so it cannot drift from the file layout.
num_fields = data.shape[1]
record_defaults = [
    tf.constant([0.0], dtype=tf.float32) for _ in range(num_fields - 1)
] + [[""]]

In [113]:
# Plain-Python defaults: 0.0 for every column but the last, "" for the last.
_, num_fields = data.shape
record_defaults = [0.0 for _ in range(num_fields - 1)]
record_defaults.append("")

In [114]:
# Display the per-column defaults that preprocess() passes to tf.io.decode_csv.
record_defaults

[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, '']

# TF Dataset of the files

In [30]:
# Build a dataset whose elements are the paths of the files matching the
# LIST_FILESET glob pattern, shuffled with a fixed seed.
filepaths = tf.data.Dataset.list_files(LIST_FILESET, shuffle=True, seed=42)

## Interleaving dataset

In [103]:
def _csv_lines_without_header(filepath):
    """Return a dataset of the text lines of one file, skipping its first line."""
    return tf.data.TextLineDataset(filepath).skip(1)


# Turn each file path into a dataset of its lines and interleave those datasets.
# cycle_length is the number of input elements processed concurrently; here it
# is the number of CSV files.
lines_dataset = filepaths.interleave(
    map_func=_csv_lines_without_header,
    cycle_length=int(num_files),
    num_parallel_calls=tf.data.experimental.AUTOTUNE,
)

# Shuffle the interleaved lines through a 100-element buffer with a fixed seed.
interleaved_dataset = lines_dataset.shuffle(buffer_size=100, seed=42)

# Print five lines taken from the shuffled, interleaved dataset.
for line in interleaved_dataset.take(5):
    print(line.numpy())

b'-122.08,37.03,36.0,4682.0,899.0,2143.0,832.0,4.5096,203700.0,NEAR OCEAN'
b'-118.23,34.21,50.0,309.0,47.0,121.0,45.0,6.213,285000.0,<1H OCEAN'
b'-122.31,37.93,39.0,2505.0,371.0,872.0,345.0,5.3433,286500.0,NEAR BAY'
b'-117.11,32.75,46.0,695.0,182.0,601.0,195.0,2.4219,90600.0,NEAR OCEAN'
b'-121.8,36.68,18.0,8581.0,1957.0,6071.0,1889.0,3.0,162200.0,<1H OCEAN'


In [144]:
@tf.function
def preprocess(line):
    """Parse CSV text into a list of per-column tensors.

    Args:
        line: the CSV text to decode.

    Returns:
        The result of ``tf.io.decode_csv``, using the notebook-level
        ``record_defaults`` list for the per-column defaults.
    """
    return tf.io.decode_csv(records=line, record_defaults=record_defaults)

In [149]:
# Parse every CSV line, group the parsed records into batches of 5 (a final
# short batch is kept), and prefetch one batch ahead.
parsed_records = interleaved_dataset.map(
    preprocess, num_parallel_calls=tf.data.experimental.AUTOTUNE
)
batched_records = parsed_records.batch(5, drop_remainder=False)
preprocessed_dataset = batched_records.prefetch(1)

# Show the first five batches.
for x in preprocessed_dataset.take(5):
    print(x)


(<tf.Tensor: shape=(5,), dtype=float32, numpy=array([-118.46, -119.04, -122.07, -118.3 , -121.96], dtype=float32)>, <tf.Tensor: shape=(5,), dtype=float32, numpy=array([34.05, 36.07, 37.69, 34.05, 37.53], dtype=float32)>, <tf.Tensor: shape=(5,), dtype=float32, numpy=array([21., 17., 31., 34., 23.], dtype=float32)>, <tf.Tensor: shape=(5,), dtype=float32, numpy=array([3639., 2623., 5914., 1453., 2215.], dtype=float32)>, <tf.Tensor: shape=(5,), dtype=float32, numpy=array([1002.,  659., 1309.,  588.,  475.], dtype=float32)>, <tf.Tensor: shape=(5,), dtype=float32, numpy=array([1489., 1912., 2999., 1987., 1278.], dtype=float32)>, <tf.Tensor: shape=(5,), dtype=float32, numpy=array([ 983.,  618., 1295.,  589.,  492.], dtype=float32)>, <tf.Tensor: shape=(5,), dtype=float32, numpy=array([4.6197, 1.5893, 3.0964, 2.096 , 4.2955], dtype=float32)>, <tf.Tensor: shape=(5,), dtype=float32, numpy=array([387500.,  52000., 190500., 187500., 218800.], dtype=float32)>, <tf.Tensor: shape=(5,), dtype=string, n

In [149]:
# NOTE(review): this cell builds the same pipeline as the preceding cell and
# carries the same execution count; it looks like an accidental duplicate.
# Pipeline: parse each line -> batches of 5 (short last batch kept) -> prefetch 1.
preprocessed_dataset = (
    interleaved_dataset
    .map(map_func=preprocess, num_parallel_calls=tf.data.experimental.AUTOTUNE)
    .batch(batch_size=5, drop_remainder=False)
    .prefetch(buffer_size=1)
)

# Print the first five batches.
for x in preprocessed_dataset.take(count=5):
    print(x)


(<tf.Tensor: shape=(5,), dtype=float32, numpy=array([-118.46, -119.04, -122.07, -118.3 , -121.96], dtype=float32)>, <tf.Tensor: shape=(5,), dtype=float32, numpy=array([34.05, 36.07, 37.69, 34.05, 37.53], dtype=float32)>, <tf.Tensor: shape=(5,), dtype=float32, numpy=array([21., 17., 31., 34., 23.], dtype=float32)>, <tf.Tensor: shape=(5,), dtype=float32, numpy=array([3639., 2623., 5914., 1453., 2215.], dtype=float32)>, <tf.Tensor: shape=(5,), dtype=float32, numpy=array([1002.,  659., 1309.,  588.,  475.], dtype=float32)>, <tf.Tensor: shape=(5,), dtype=float32, numpy=array([1489., 1912., 2999., 1987., 1278.], dtype=float32)>, <tf.Tensor: shape=(5,), dtype=float32, numpy=array([ 983.,  618., 1295.,  589.,  492.], dtype=float32)>, <tf.Tensor: shape=(5,), dtype=float32, numpy=array([4.6197, 1.5893, 3.0964, 2.096 , 4.2955], dtype=float32)>, <tf.Tensor: shape=(5,), dtype=float32, numpy=array([387500.,  52000., 190500., 187500., 218800.], dtype=float32)>, <tf.Tensor: shape=(5,), dtype=string, n