## **TODO:** Set the value of `URL` to the URL from your learning materials

In [None]:
URL = None
import os
assert URL and (type(URL) is str), "Be sure to initialize URL using the value from your learning materials"
os.environ['URL'] = URL

In [None]:
%%bash
wget -q $URL -O ./data.zip
mkdir -p data
find *.zip | xargs unzip -o -d data/

## Use PyTorch `Dataset` and `Dataloader` with a structured dataset

In [1]:
import os

import numpy as np
import pandas as pd
import tensorflow as tf
import tensorflow.data as tfd

from tensorflow.data import Dataset

# building blocks of our network
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import Adam


Read the files that match `part-*.csv` from the `data` subdirectory into a Pandas data frame named `df`.

In [2]:
from pathlib import Path

df = pd.concat(
    pd.read_csv(file) for file in Path('data/').glob('part-*.csv')
)


## Explore the `df` data frame, including the column names, the first few rows of the dataset, and the data frame's memory usage.

In [4]:
df.head(5)

Unnamed: 0,fareamount,origindatetime_tr,origin_block_latitude,origin_block_longitude,destination_block_latitude,destination_block_longitude,id
0,4.87,06/01/2017 07:00,38.898314,-77.028849,38.902521,-77.030791,751d10ef2403c770a3bd4e220db8594b656d6774962b63...
1,12.7,06/01/2017 14:00,38.904683,-77.046645,38.940181,-77.061193,a9ddc1ab38a3cc3f360e4d2408678d707658762c418e6c...
2,5.14,06/01/2017 12:00,38.910635,-77.042514,38.909652,-77.033254,1f804117b3d98193b5ab7fddc15a543a8165cd60b6b20e...
3,5.14,06/02/2017 13:00,38.889184,-77.021907,38.897207,-77.023477,21af1912855db837c7892fb073f4c59678c305aec0b23b...
4,14.32,06/01/2017 13:00,38.901336,-77.037534,38.942216,-77.073508,26dcdd256e6269e4c6f1ccd2119c345c4deed788a35082...


In [3]:
df.info(verbose=True)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6368133 entries, 0 to 3289205
Data columns (total 7 columns):
 #   Column                       Dtype  
---  ------                       -----  
 0   fareamount                   float64
 1   origindatetime_tr            object 
 2   origin_block_latitude        float64
 3   origin_block_longitude       float64
 4   destination_block_latitude   float64
 5   destination_block_longitude  float64
 6   id                           object 
dtypes: float64(5), object(2)
memory usage: 388.7+ MB


In [4]:
df.describe()

Unnamed: 0,fareamount,origin_block_latitude,origin_block_longitude,destination_block_latitude,destination_block_longitude
count,6368133.0,6368133.0,6368133.0,6368133.0,6368133.0
mean,9.700529,38.90408,-77.02986,38.90593,-77.0297
std,4.590246,0.01505742,0.01938031,0.01732536,0.02240406
min,3.25,38.81206,-77.11363,38.81206,-77.11363
25%,6.76,38.89632,-77.04237,38.89667,-77.04337
50%,8.65,38.90148,-77.03195,38.90294,-77.03156
75%,11.62,38.90911,-77.01942,38.91266,-77.01719
max,179.83,38.99422,-76.91001,38.99422,-76.91001


## Drop the `origindatetime_tr` column from the data frame. 

For now you are going to predict the taxi fare just based on the lat/lon coordinates of the pickup and the drop off locations. Remove the `origindatetime_tr` column from the data frame in your working dataset.

In [5]:
working_df = df.drop('origindatetime_tr', axis = 1)
working_df.shape

(6368133, 6)

## Sample 10% of your working dataset into a test dataset data frame

* **hint:** use the Pandas `sample` function with the dataframe. Specify a value for the `random_state` to achieve reproducibility.

In [6]:
test_df = working_df.sample(frac = 0.10, random_state = 42)
test_df.shape

(636813, 6)

## Drop the rows that exist in your test dataset from the working dataset to produce a training dataset.

* **hint** DataFrame's `drop` function can use index values from a data frame to drop specific rows.

In [7]:
train_df = working_df.drop(index = test_df.index)
train_df.shape

(5177451, 6)

## Define 2 Python lists: 1st for the feature column names; 2nd for the target column name

In [8]:
FEATURES = ['origin_block_latitude','origin_block_longitude','destination_block_latitude','destination_block_longitude']
TARGET = ['fareamount']

## Create `X` and `y` tensors with the values of your feature and target columns in the training dataset

In [9]:
X_train = tf.constant(train_df[FEATURES].values)
y_train = tf.constant(train_df[TARGET].values)

## Create a `TensorDataset` instance with the `y` and `X` tensors (in that order)

In [10]:
train_ds = Dataset.from_tensor_slices((X_train, y_train))

## Create a `DataLoader` instance specifying a custom batch size

A batch size of `2 ** 18 = 262,144` should work well.

In [11]:
BATCH_SIZE = 2 ** 18
train_ds = train_ds.batch(batch_size=BATCH_SIZE)
len(train_ds)

20

## Create a model using `nn.Linear`

In [12]:
model = Sequential()

model.add(
    Dense(units=1, input_shape=[len(FEATURES)], activation='linear')
)

In [13]:
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                (None, 1)                 5         
Total params: 5
Trainable params: 5
Non-trainable params: 0
_________________________________________________________________


## Create an instance of the `Adam` optimizer for the model

In [14]:
optimizer = Adam(learning_rate=0.003)

## Declare your `loss` function


In [15]:
loss_fn = tf.keras.losses.MeanSquaredError()

## Iterate over the batches returned by your `DataLoader` instance

For every step of gradient descent, print out the MSE, MSE, and the batch index


In [16]:
for step, (X_train_batch, y_train_batch) in enumerate(train_ds):
    with tf.GradientTape() as tape:
        # forward pass
        y_pred = model(X_train_batch, training=True)
        
        # compute loss
        loss = loss_fn(y_train_batch, y_pred)
        
    # back propogation
    grads = tape.gradient(loss, model.trainable_weights)
    optimizer.apply_gradients(zip(grads, model.trainable_weights))
    
    # monitor performance
    loss_val, rmse = float(loss), float(tf.math.sqrt(loss))
    print(f'batch {step+1:2d}:  loss={loss_val:7.2f}, RMSE={rmse:7.4f}')

batch  1:  loss=6146.35, RMSE=78.3986
batch  2:  loss=6035.61, RMSE=77.6892
batch  3:  loss=5928.75, RMSE=76.9984
batch  4:  loss=5822.18, RMSE=76.3032
batch  5:  loss=5716.76, RMSE=75.6093
batch  6:  loss=5612.28, RMSE=74.9151
batch  7:  loss=5505.92, RMSE=74.2019
batch  8:  loss=5405.74, RMSE=73.5238
batch  9:  loss=5306.27, RMSE=72.8441
batch 10:  loss=5246.80, RMSE=72.4348
batch 11:  loss=5177.05, RMSE=71.9517
batch 12:  loss=5087.61, RMSE=71.3275
batch 13:  loss=4979.20, RMSE=70.5634
batch 14:  loss=4846.83, RMSE=69.6192
batch 15:  loss=4734.94, RMSE=68.8109
batch 16:  loss=4623.94, RMSE=67.9995
batch 17:  loss=4447.85, RMSE=66.6922
batch 18:  loss=4467.89, RMSE=66.8422
batch 19:  loss=4328.80, RMSE=65.7936
batch 20:  loss=4170.72, RMSE=64.5811


## Implement 10 epochs of gradient descent training

For every step of gradient descent, printout the MSE, RMSE, epoch index, and batch index.

* **hint:** you can call `enumerate(DataLoader)` repeatedly in a `for` loop

In [17]:
epochs=10
for epoch in range(epochs):
    for step, (X_train_batch, y_train_batch) in enumerate(train_ds):
        with tf.GradientTape() as tape:
            # forward pass
            y_pred = model(X_train_batch, training=True)

            # compute loss
            loss = loss_fn(y_train_batch, y_pred)

        # back propogation
        grads = tape.gradient(loss, model.trainable_weights)
        optimizer.apply_gradients(zip(grads, model.trainable_weights))

        # monitor performance
        loss_val, rmse = float(loss), float(tf.math.sqrt(loss))
        if (step==0):
            print(f'epoch {epoch+1:2d} ...')
        print(f'  -- batch {step+1:2d}:  loss={loss_val:7.2f}, RMSE={rmse:7.4f}')

epoch  1 ...
  -- batch  1:  loss=4175.20, RMSE=64.6158
  -- batch  2:  loss=4087.41, RMSE=63.9328
  -- batch  3:  loss=4003.14, RMSE=63.2704
  -- batch  4:  loss=3919.38, RMSE=62.6049
  -- batch  5:  loss=3837.14, RMSE=61.9446
  -- batch  6:  loss=3755.84, RMSE=61.2849
  -- batch  7:  loss=3673.46, RMSE=60.6091
  -- batch  8:  loss=3596.27, RMSE=59.9689
  -- batch  9:  loss=3520.07, RMSE=59.3301
  -- batch 10:  loss=3476.69, RMSE=58.9635
  -- batch 11:  loss=3424.98, RMSE=58.5234
  -- batch 12:  loss=3357.76, RMSE=57.9462
  -- batch 13:  loss=3275.19, RMSE=57.2292
  -- batch 14:  loss=3173.76, RMSE=56.3361
  -- batch 15:  loss=3089.35, RMSE=55.5820
  -- batch 16:  loss=3005.82, RMSE=54.8254
  -- batch 17:  loss=2870.87, RMSE=53.5805
  -- batch 18:  loss=2891.85, RMSE=53.7759
  -- batch 19:  loss=2786.87, RMSE=52.7908
  -- batch 20:  loss=2666.90, RMSE=51.6421
epoch  2 ...
  -- batch  1:  loss=2675.80, RMSE=51.7281
  -- batch  2:  loss=2612.00, RMSE=51.1077
  -- batch  3:  loss=2551.14

  -- batch  9:  loss=  24.87, RMSE= 4.9865
  -- batch 10:  loss=  25.89, RMSE= 5.0880
  -- batch 11:  loss=  25.66, RMSE= 5.0660
  -- batch 12:  loss=  26.19, RMSE= 5.1176
  -- batch 13:  loss=  24.92, RMSE= 4.9923
  -- batch 14:  loss=  24.16, RMSE= 4.9150
  -- batch 15:  loss=  24.97, RMSE= 4.9973
  -- batch 16:  loss=  25.31, RMSE= 5.0312
  -- batch 17:  loss=  27.09, RMSE= 5.2052
  -- batch 18:  loss=  24.04, RMSE= 4.9029
  -- batch 19:  loss=  25.37, RMSE= 5.0370
  -- batch 20:  loss=  25.82, RMSE= 5.0815


Copyright 2020 CounterFactual.AI LLC. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License.