In [1]:
import os
import math
import shutil
import pandas as pd
import numpy as np
import tensorflow as tf
print(tf.__version__)

# Load the TensorBoard notebook extension
%load_ext tensorboard
tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.INFO)
pd.options.display.max_rows = 10
pd.options.display.float_format = '{:.1f}'.format

2.0.0


## Load Our Dataset From Server

In [2]:
df = pd.read_csv(filepath_or_buffer="https://storage.googleapis.com/ml_universities/california_housing_train.csv", \
                 sep=",")

## Examine the data

It's a good idea to get to know your data a little bit before you work with it.

We'll print out a quick summary of a few useful statistics on each column.

This will include things like mean, standard deviation, max, min, and various quantiles.

In [3]:
df.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value
0,-114.3,34.2,15.0,5612.0,1283.0,1015.0,472.0,1.5,66900.0
1,-114.5,34.4,19.0,7650.0,1901.0,1129.0,463.0,1.8,80100.0
2,-114.6,33.7,17.0,720.0,174.0,333.0,117.0,1.7,85700.0
3,-114.6,33.6,14.0,1501.0,337.0,515.0,226.0,3.2,73400.0
4,-114.6,33.6,20.0,1454.0,326.0,624.0,262.0,1.9,65500.0


In [4]:
df.describe()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value
count,17000.0,17000.0,17000.0,17000.0,17000.0,17000.0,17000.0,17000.0,17000.0
mean,-119.6,35.6,28.6,2643.7,539.4,1429.6,501.2,3.9,207300.9
std,2.0,2.1,12.6,2179.9,421.5,1147.9,384.5,1.9,115983.8
min,-124.3,32.5,1.0,2.0,1.0,3.0,1.0,0.5,14999.0
25%,-121.8,33.9,18.0,1462.0,297.0,790.0,282.0,2.6,119400.0
50%,-118.5,34.2,29.0,2127.0,434.0,1167.0,409.0,3.5,180400.0
75%,-118.0,37.7,37.0,3151.2,648.2,1721.0,605.2,4.8,265000.0
max,-114.3,42.0,52.0,37937.0,6445.0,35682.0,6082.0,15.0,500001.0


## Split The Data

Now, split the data into two parts -- training and evaluation.

In [5]:
np.random.seed(seed=1) #makes result reproducible
msk = np.random.rand(len(df)) < 0.80
df_train = df[msk]
df_eval = df[~msk]

print("Number Of Training Examples: {}".format(len(df_train)))
print("Number Of Evaluation Examples: {}".format(len(df_eval)))

Number Of Training Examples: 13612
Number Of Evaluation Examples: 3388


#### Add More Features:

In [6]:
def add_more_features(df):
    """
    This function will add more feture to our dataframe
    """
    df["num_rooms"] = df["total_rooms"] / df["households"]
    df["num_bedrooms"] = df["total_bedrooms"] / df["households"]
    return df

In [7]:
add_more_features(df_train)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,num_rooms,num_bedrooms
0,-114.3,34.2,15.0,5612.0,1283.0,1015.0,472.0,1.5,66900.0,11.9,2.7
1,-114.5,34.4,19.0,7650.0,1901.0,1129.0,463.0,1.8,80100.0,16.5,4.1
2,-114.6,33.7,17.0,720.0,174.0,333.0,117.0,1.7,85700.0,6.2,1.5
3,-114.6,33.6,14.0,1501.0,337.0,515.0,226.0,3.2,73400.0,6.6,1.5
4,-114.6,33.6,20.0,1454.0,326.0,624.0,262.0,1.9,65500.0,5.5,1.2
...,...,...,...,...,...,...,...,...,...,...,...
16993,-124.2,40.5,52.0,2694.0,453.0,1152.0,435.0,3.1,106700.0,6.2,1.0
16994,-124.2,40.3,32.0,1430.0,419.0,434.0,187.0,1.9,76100.0,7.6,2.2
16995,-124.3,40.6,52.0,2217.0,394.0,907.0,369.0,2.4,111400.0,6.0,1.1
16997,-124.3,41.8,17.0,2677.0,531.0,1244.0,456.0,3.0,103600.0,5.9,1.2


In [8]:
add_more_features(df_eval)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,num_rooms,num_bedrooms
13,-114.6,34.8,31.0,2478.0,464.0,1346.0,479.0,3.2,70400.0,5.2,1.0
20,-114.7,33.5,20.0,1491.0,360.0,1135.0,303.0,1.6,44400.0,4.9,1.2
21,-114.7,33.4,24.0,796.0,243.0,227.0,139.0,0.9,59200.0,5.7,1.7
24,-115.2,33.5,18.0,1706.0,397.0,3424.0,283.0,1.6,53500.0,6.0,1.4
25,-115.3,32.8,34.0,591.0,139.0,327.0,89.0,3.7,100000.0,6.6,1.6
...,...,...,...,...,...,...,...,...,...,...,...
16972,-124.2,40.8,43.0,2285.0,479.0,1169.0,482.0,2.0,70500.0,4.7,1.0
16976,-124.2,40.8,13.0,2171.0,339.0,951.0,353.0,4.9,116100.0,6.2,1.0
16991,-124.2,41.8,11.0,3159.0,616.0,1343.0,479.0,2.5,73200.0,6.6,1.3
16996,-124.3,40.7,36.0,2349.0,528.0,1194.0,465.0,2.5,79000.0,5.1,1.1


## Save The Data In CSV Format: 

In [9]:
df_train.to_csv(path_or_buf="./data/train.csv",header=None, index=False,sep=',')
df_eval.to_csv(path_or_buf="./data/eval.csv",header=None, index=False,sep=',')

## Setup GCP Project Details

In [10]:
PROJECT = 'ml-practice-260405'
BUCKET = 'bucket_ml-practice-260405'
REGION = 'us-central1'

Model info for python code

In [11]:
MODEL_NAME = 'house_price'
MODEL_VERSION = 'v1'
TRAINING_DIR = 'house_price_trained'

Info for bash code

In [12]:
os.environ['PROJECT'] = PROJECT
os.environ['BUCKET'] = BUCKET
os.environ['REGION'] = REGION
os.environ['MODEL_NAME'] = MODEL_NAME
os.environ['MODEL_VERSION'] = MODEL_VERSION
os.environ['TRAINING_DIR'] = TRAINING_DIR
os.environ['TFVERSION'] = '1.40'

## Set Default Project & Compute Engine

In [13]:
%%bash
gcloud config set project $PROJECT
gcloud config set compute/region $REGION

Updated property [core/project].
Updated property [compute/region].


### Create the bucket to store model and training data for deploying to Google Cloud Machine Learning Engine Component

The bucket needs to exist for the gsutil commands in next cell to work

In [14]:
%%bash
gsutil mb -p ${PROJECT} gs://${BUCKET}

Creating gs://bucket_ml-practice-260405/...


### Enable the Cloud Machine Learning Engine API

The next command works with Cloud AI Platform API.  In order for the command to work, you must enable the API using the Cloud Console UI.   Use this [link.](https://console.cloud.google.com/project/_/apis/library)  Then search the API list for Cloud Machine Learning and enable the API before executing the next cell.

Allow the Cloud AI Platform service account to read/write to the bucket containing training data.

In [15]:
%%bash
PROJECT_ID=$PROJECT
AUTH_TOKEN=$(gcloud auth print-access-token)
SVC_ACCOUNT=$(curl -X GET -H "Content-Type: application/json" \
    -H "Authorization: Bearer $AUTH_TOKEN" \
    https://ml.googleapis.com/v1/projects/${PROJECT_ID}:getConfig \
    | python -c "import json; import sys; response = json.load(sys.stdin); \
    print(response['serviceAccount'])")

echo "Authorizing the Cloud AI Platform account $SVC_ACCOUNT to access files in $BUCKET"
gsutil -m defacl ch -u $SVC_ACCOUNT:R gs://$BUCKET
gsutil -m acl ch -u $SVC_ACCOUNT:R -r gs://$BUCKET  # error message (if bucket is empty) can be ignored
gsutil -m acl ch -u $SVC_ACCOUNT:W gs://$BUCKET

Authorizing the Cloud AI Platform account service-229327834475@cloud-ml.google.com.iam.gserviceaccount.com to access files in bucket_ml-practice-260405


  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0  0     0    0     0    0     0      0      0 --:--:--  0:00:01 --:--:--     0100   235    0   235    0     0    114      0 --:--:--  0:00:02 --:--:--   114100   235    0   235    0     0    114      0 --:--:--  0:00:02 --:--:--   114
Updated default ACL on gs://bucket_ml-practice-260405/
Encountered a problem: CommandException: No URLs matched: gs://bucket_ml-practice-260405/*
Updated ACL on gs://bucket_ml-practice-260405/


## Packaging up the code

Take your code and put into a standard Python package structure.  <a href="house_price/trainer/house_price_model.py">house_price_model.py</a> and <a href="house_price/trainer/house_price_task.py">house_price_task.py</a> containing the Tensorflow code from earlier (explore the <a href="house_price/trainer/">directory structure</a>).

In [16]:
%%bash
find ${MODEL_NAME}

house_price
house_price/trainer
house_price/trainer/house_price_model.py
house_price/trainer/house_price_task.py
house_price/trainer/__init__.py
house_price/trainer/__pycache__
house_price/trainer/__pycache__/house_price_model.cpython-37.pyc
house_price/trainer/__pycache__/house_price_task.cpython-37.pyc
house_price/trainer/__pycache__/__init__.cpython-37.pyc


In [17]:
%%bash
## check whether there are anymore TODOs 
## exit with 0 to avoid notebook process error
grep TODO house_price/trainer/*.py; rc=$?

case $rc in 
    0) ;;
    1) echo "No more TODOs!"; exit 0;;
esac

No more TODOs!


## Find absolute paths to your data

Note the absolute paths below. 

In [18]:
%%bash
echo "Working Directory: " ${PWD}/data/

Working Directory:  /media/mujahid7292/Data/GoogleDriveSandCorp2014/ML_With_TensorFlow_On_GCP/04.Feature_Engineering/01.Improve_Model_Accuracy_With_New_Features/Production_Practice/data/


In [19]:
%%bash
echo "Head Of Training Data"
head -1 ${PWD}/data/train.csv
echo "Head Of Evaluation Data"
head -1 ${PWD}/data/eval.csv

Head Of Training Data
-114.31,34.19,15.0,5612.0,1283.0,1015.0,472.0,1.4936,66900.0,11.889830508474576,2.718220338983051
Head Of Evaluation Data
-114.61,34.83,31.0,2478.0,464.0,1346.0,479.0,3.212,70400.0,5.1732776617954075,0.9686847599164927


## Running the Python module from the command-line

#### Clean model training dir/output dir

In [24]:
%%bash
# This is so that the trained model is started fresh each time. However, 
# this needs to be done before tensorboard is started
rm -rf $PWD/${TRAINING_DIR}

### Run Tensorboard

In [25]:
%tensorboard --logdir ./house_price_trained

### Run Training & Monitor Using Tensorboard

In [26]:
%%bash
# Setup python so it sees the task module which controls the model.py
export PYTHONPATH=${PYTHONPATH}:${PWD}/${MODEL_NAME}
# Run training
# Currently set for python 2.  To run with python 3 
#    1.  Replace 'python' with 'python3' in the following command
#    2.  Edit trainer/task.py to reflect proper module import method 
python3 -m trainer.house_price_task \
    --train_data_paths="${PWD}/data/train*" \
    --eval_data_paths="${PWD}/data/eval.csv" \
    --output_dir=${PWD}/${TRAINING_DIR} \
    --train_steps=3 \
    --job_dir=./tmp

Process is terminated.
