<a href="https://colab.research.google.com/github/osipov/edu/blob/master/tf0/Solution_Tensorflow_Dataloader.ipynb" target="_blank"><img src="https://colab.research.google.com/assets/colab-badge.svg"/></a>

## **TODO:** Set the value of `URL` below to the URL from your learning materials

In [None]:
import os

# TODO: replace None with the URL string from your learning materials.
URL = None

# Stop here if URL is still unset (or not a string); otherwise publish it as an
# environment variable, which is how the %%bash download cell reads it ($URL).
assert URL and (type(URL) is str), "Be sure to initialize URL using the value from your learning materials"
os.environ['URL'] = URL

In [None]:
%%bash
# Abort the cell on the first failing command (or if URL is unset) so that a
# failed download is not silently followed by unzipping a missing/empty archive.
set -euo pipefail

# Quote the URL so whitespace and glob characters ('?', '*') reach wget verbatim.
wget -q "$URL" -O ./data.zip
mkdir -p data

# Extract exactly the archive that was just downloaded. Piping several *.zip
# names through xargs would make unzip treat every name after the first as a
# member to extract from the first archive.
unzip -o ./data.zip -d data/

## Use TensorFlow `Dataset` and `from_tensor_slices` with a structured dataset

In [None]:
import os

import numpy as np
import pandas as pd
import tensorflow as tf
import tensorflow.data as tfd

from tensorflow.data import Dataset

# building blocks of our network
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import Adam

Read the files that match `part-*.csv` from the `data` subdirectory into a Pandas data frame named `df`.

In [None]:
from pathlib import Path

# Sort the matches: glob order depends on the filesystem, and a stable row order
# is needed for the seeded `sample(..., random_state=...)` split to be repeatable.
csv_files = sorted(Path('data/').glob('part-*.csv'))
if not csv_files:
    raise FileNotFoundError(
        "No files matching data/part-*.csv were found; run the download cell first."
    )

# ignore_index=True renumbers the rows 0..N-1. Without it every part file
# contributes its own 0..n-1 labels, and the later
# `working_df.drop(index=test_df.index)` would remove *every* row sharing a label
# with a sampled test row, silently shrinking the training set.
df = pd.concat(
    (pd.read_csv(file) for file in csv_files),
    ignore_index=True,
)


## Explore the `df` data frame, including the column names, the first few rows of the dataset, and the data frame's memory usage.

In [None]:
# Preview the first five rows of df to see the column names and sample values.
df.head(5)

In [None]:
# Print pandas' concise summary of df: columns, dtypes, non-null counts and memory usage.
df.info()

In [None]:
# Descriptive statistics (count, mean, std, min, quartiles, max); with pandas'
# defaults these cover the numeric columns of a mixed-dtype frame.
df.describe()

## Drop the `origindatetime_tr` column from the data frame. 

For now you are going to predict the taxi fare just based on the lat/lon coordinates of the pickup and the drop off locations. Remove the `origindatetime_tr` column from the data frame in your working dataset.

In [None]:
# Build the working frame from every column except `origindatetime_tr`;
# `drop` returns a new frame, so `df` itself is left untouched.
working_df = df.drop(columns=['origindatetime_tr'])
working_df.shape

## Sample 10% of your working dataset into a test dataset data frame

* **hint:** use the Pandas `sample` function with the dataframe. Specify a value for the `random_state` to achieve reproducibility.

In [None]:
# Hold out a random 10% of the rows as the test set; fixing the seed makes the
# same rows come out on every run (given the same input row order).
TEST_FRACTION = 0.10
SEED = 42
test_df = working_df.sample(frac=TEST_FRACTION, random_state=SEED)
test_df.shape

## Drop the rows that exist in your test dataset from the working dataset to produce a training dataset.

* **hint:** DataFrame's `drop` function can use index values from a data frame to drop specific rows.

In [None]:
# Training data = working data minus the held-out rows. Rows are matched by
# index label, so this relies on the labels in working_df being unique.
held_out_labels = test_df.index
train_df = working_df.drop(held_out_labels, axis='index')
train_df.shape

## Define 2 Python lists: 1st for the feature column names; 2nd for the target column name

In [None]:
# Model inputs: lat/lon of the pickup (origin) and drop-off (destination) blocks.
FEATURES = [
    'origin_block_latitude',
    'origin_block_longitude',
    'destination_block_latitude',
    'destination_block_longitude',
]

# Kept as a one-element list so that selecting it from a data frame yields a
# 2-D (rows x 1) result rather than a 1-D series.
TARGET = ['fareamount']

## Create `X` and `y` tensors with the values of your feature and target columns in the training dataset

In [None]:
# Pull the feature and target columns out of the training frame as NumPy arrays
# and wrap each one in an immutable TensorFlow tensor.
train_features = train_df[FEATURES].to_numpy()
train_targets = train_df[TARGET].to_numpy()

X_train = tf.constant(train_features)
y_train = tf.constant(train_targets)

## Create a `TensorSliceDataset` instance with the `X` and `y` tensors (in that order)

In [None]:
# Slice both tensors along their first axis so that each dataset element is one
# (features, target) pair taken from the same row.
train_ds = tf.data.Dataset.from_tensor_slices((X_train, y_train))

## Create a `BatchDataset` instance specifying a custom batch size

A batch size of `2 ** 18 = 262,144` should work well.

In [None]:
BATCH_SIZE = 2 ** 18

# Build the batched dataset straight from the tensors instead of re-assigning
# `train_ds = train_ds.batch(...)`. Re-batching in place is not idempotent:
# running this cell a second time would batch the batches and hand the model
# inputs with an extra leading dimension.
train_ds = Dataset.from_tensor_slices((X_train, y_train)).batch(batch_size=BATCH_SIZE)

# Number of batches, i.e. gradient-descent steps per pass over the data.
len(train_ds)

## Create a model using `keras.layers.Dense`

In [None]:
# A single Dense unit with a linear activation: the prediction is a weighted sum
# of the input features plus a bias, i.e. plain linear regression.
n_features = len(FEATURES)
model = Sequential([
    Dense(units=1, input_shape=[n_features], activation='linear'),
])

In [None]:
# Print Keras' layer-by-layer overview of the model, including parameter counts.
model.summary()

## Create an instance of the `Adam` optimizer for the model

In [None]:
# Learning rate handed to Adam; the optimizer applies the weight updates in the
# training loops.
LEARNING_RATE = 3e-3
optimizer = Adam(learning_rate=LEARNING_RATE)

## Declare your `loss` function using `keras.losses.MeanSquaredError`


In [None]:
# Mean squared error between targets and predictions; the training loops call it
# as loss_fn(y_true, y_pred).
loss_fn = tf.keras.losses.MeanSquaredError()

## Iterate over the batches returned by your `BatchDataset` instance

For every step of gradient descent, print out the MSE, RMSE, and the batch index


In [None]:
# One pass over the batched training data: one gradient-descent update per batch.
for step, (X_train_batch, y_train_batch) in enumerate(train_ds):
    # Record the forward pass on the tape so the loss can be differentiated.
    with tf.GradientTape() as tape:
        y_pred = model(X_train_batch, training=True)
        loss = loss_fn(y_train_batch, y_pred)

    # Backpropagation: pair each gradient with its weight and apply the update.
    weights = model.trainable_weights
    grads = tape.gradient(loss, weights)
    optimizer.apply_gradients(zip(grads, weights))

    # Report the batch MSE and its square root (RMSE) as plain Python floats,
    # numbering batches from 1.
    loss_val = float(loss)
    rmse = float(tf.math.sqrt(loss))
    print(f'batch {step+1:2d}:  loss={loss_val:7.2f}, RMSE={rmse:7.4f}')

## Implement 10 epochs of gradient descent training

For every step of gradient descent, print out the MSE, RMSE, epoch index, and batch index.

* **hint:** you can call `enumerate(BatchDataset)` repeatedly in a `for` loop

In [None]:
EPOCHS = 10


def train_step(X_batch, y_batch):
    """Run one gradient-descent update on a single batch and return its loss.

    Uses the notebook-level `model`, `loss_fn` and `optimizer`. The returned
    loss is the value computed in the forward pass, i.e. before the weights
    were updated.
    """
    with tf.GradientTape() as tape:
        # forward pass
        y_pred = model(X_batch, training=True)
        # compute loss
        loss = loss_fn(y_batch, y_pred)

    # backpropagation
    grads = tape.gradient(loss, model.trainable_weights)
    optimizer.apply_gradients(zip(grads, model.trainable_weights))
    return loss


# `model` and `optimizer` are not re-created in this cell, so training continues
# from whatever state the previous cells left them in.
for epoch in range(EPOCHS):
    for step, (X_train_batch, y_train_batch) in enumerate(train_ds):
        loss = train_step(X_train_batch, y_train_batch)

        # monitor performance: print the epoch header once, before its first batch
        loss_val, rmse = float(loss), float(tf.math.sqrt(loss))
        if step == 0:
            print(f'epoch {epoch+1:2d} ...')
        print(f'  -- batch {step+1:2d}:  loss={loss_val:7.2f}, RMSE={rmse:7.4f}')

Copyright 2021 CounterFactual.AI LLC. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License.