# Paywall Binary Classification

## Training on Cloud ML Engine

This notebook illustrates distributed training and hyperparameter tuning on Cloud ML Engine.

In [2]:
# Notebook-wide settings -- change these to try this notebook out.
BUCKET = "agea_asl_proyect"                  # Cloud Storage bucket (name only, no gs:// prefix)
PROJECT = "qwiklabs-gcp-4a684069c4776675"    # GCP project id
REGION = "us-central1"                       # region used for gcloud and the training jobs

In [3]:
import os
# Export the settings as environment variables so the bash cells can read them.
os.environ.update({
    "BUCKET": BUCKET,
    "PROJECT": PROJECT,
    "REGION": REGION,
    "TFVERSION": "1.12",  # passed to gcloud as --runtime-version by the training cells
})

In [28]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from google.cloud import bigquery
import tensorflow as tf
print(tf.__version__)

1.12.0


In [5]:
%bash
# Point the gcloud CLI at the project and region exported by the notebook.
gcloud config set project ${PROJECT}
gcloud config set compute/region ${REGION}

Updated property [core/project].
Updated property [compute/region].


In [None]:
%%bash
# Look up the service account that Cloud Machine Learning Engine uses for this
# project and give it access to the bucket.
# This command will fail if the Cloud Machine Learning Engine API is not enabled for the project.
echo "Getting the service account email associated with the Cloud Machine Learning Engine API"

# Call the projects.getConfig REST endpoint with the caller's access token and
# extract the "serviceAccount" field from the JSON response.
AUTH_TOKEN=$(gcloud auth print-access-token)
SVC_ACCOUNT=$(curl -X GET -H "Content-Type: application/json" \
    -H "Authorization: Bearer $AUTH_TOKEN" \
    https://ml.googleapis.com/v1/projects/${PROJECT}:getConfig \
    | python -c "import json; import sys; response = json.load(sys.stdin); \
    print (response['serviceAccount'])")  # If this command fails, the Cloud Machine Learning Engine API has not been enabled.

# Change the bucket's default object ACL, the ACLs of the existing objects
# (recursively) and the bucket ACL itself: R entries first, then W on the bucket.
echo "Authorizing the Cloud ML Service account $SVC_ACCOUNT to access files in $BUCKET"
gsutil -m defacl ch -u $SVC_ACCOUNT:R gs://$BUCKET   
gsutil -m acl ch -u $SVC_ACCOUNT:R -r gs://$BUCKET   # error message (if bucket is empty) can be ignored.  
gsutil -m acl ch -u $SVC_ACCOUNT:W gs://$BUCKET      

## Get data

In [29]:
# Load the modelling table from BigQuery: every row of `Paywall` plus the
# positive examples (susc_target = 1) of `Paywall_V2`.
# The project id comes from PROJECT, so changing the configuration cell is
# enough to point the notebook at another project.
client = bigquery.Client(project = PROJECT)

# Column list shared by both halves of the UNION ALL so they cannot drift apart.
columnas_sql = """
       pase_id,dias_navegacion_x,n_visitas_x,n_disp_x,v_pw_x,q_notas_x,q_homes_x,pv_deportes_x,
       pv_espectaculos_x,pv_estilo_x,pv_noticias_x,pv_servicios_x,pv_sudoku_x,pv_sociedad_x,
       v_deportes_x,v_espectaculos_x,v_estilo_x,v_noticias_x,v_servicios_x,v_sudoku_x,
       v_sociedad_x,pvd_n,a_edad,susc_target,antiguedad_pase,q_visitas_10,q_visitas,
       genero_n"""

sql = """
SELECT {columnas}
FROM `{project}.AGEA_ASL.Paywall`
union all
SELECT {columnas}
FROM `{project}.AGEA_ASL.Paywall_V2`
 where susc_target = 1
""".format(columnas = columnas_sql, project = PROJECT)

df = client.query(sql).to_dataframe()
df.head()

Unnamed: 0,pase_id,dias_navegacion_x,n_visitas_x,n_disp_x,v_pw_x,q_notas_x,q_homes_x,pv_deportes_x,pv_espectaculos_x,pv_estilo_x,...,v_servicios_x,v_sudoku_x,v_sociedad_x,pvd_n,a_edad,susc_target,antiguedad_pase,q_visitas_10,q_visitas,genero_n
0,3628102,2,1,1,0,0,0,0,0,0,...,0,0,0,0,42,0,1416,11,12,0
1,6489400,10,1,1,0,0,12,0,0,0,...,0,0,0,0,40,0,424,9,12,0
2,1455866,5,1,1,0,0,0,0,0,0,...,0,0,0,0,63,0,1989,9,12,0
3,4575019,10,1,1,0,0,30,7,0,0,...,0,0,0,0,41,0,1103,9,12,0
4,3909939,6,1,1,0,0,0,0,0,0,...,0,1,0,0,67,0,1315,5,12,1


In [30]:
# Drop the identifier and the columns that are not used as model inputs.
# `axis` is passed by keyword: the positional form is deprecated and was
# removed in pandas 2.0.
df = df.drop(['pase_id', 'pvd_n', 'n_visitas_x'], axis = 1)

In [31]:
# Hold out 30% of the rows for evaluation; the fixed seed keeps the split reproducible.
traindf, evaldf = train_test_split(df, random_state = 123, test_size = 0.3)
columns = traindf.columns

In [32]:
traindf.describe()

Unnamed: 0,dias_navegacion_x,n_disp_x,v_pw_x,q_notas_x,q_homes_x,pv_deportes_x,pv_espectaculos_x,pv_estilo_x,pv_noticias_x,pv_servicios_x,...,v_noticias_x,v_servicios_x,v_sudoku_x,v_sociedad_x,a_edad,susc_target,antiguedad_pase,q_visitas_10,q_visitas,genero_n
count,560718.0,560718.0,560718.0,560718.0,560718.0,560718.0,560718.0,560718.0,560718.0,560718.0,...,560718.0,560718.0,560718.0,560718.0,560718.0,560718.0,560718.0,560718.0,560718.0,560718.0
mean,8.307864,1.136664,3.876359,9.629545,36.12189,8.059399,5.605752,2.12555,7.427407,0.001052,...,0.573814,0.000209,0.040544,0.623026,44.821454,0.002688,1037.497676,6.269758,9.862821,0.250955
std,7.870728,0.355735,9.861777,11.607996,103.409776,19.894098,13.073828,8.40443,21.344673,0.125016,...,0.494522,0.014444,0.197233,0.484629,21.522018,0.051773,590.321549,4.655796,2.962692,0.433563
min,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,-7986.0,0.0,90.0,0.0,1.0,0.0
25%,2.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,33.0,0.0,489.0,1.0,8.0,0.0
50%,5.0,1.0,0.0,5.0,4.0,0.0,0.0,0.0,1.0,0.0,...,1.0,0.0,0.0,1.0,43.0,0.0,1096.0,6.0,12.0,0.0
75%,12.0,1.0,2.0,16.0,31.0,7.0,6.0,1.0,7.0,0.0,...,1.0,0.0,0.0,1.0,56.0,0.0,1440.0,11.0,12.0,1.0
max,31.0,3.0,192.0,415.0,3850.0,1074.0,677.0,338.0,1406.0,46.0,...,1.0,1.0,1.0,1.0,99.0,1.0,2288.0,12.0,12.0,1.0


In [33]:
evaldf.describe()

Unnamed: 0,dias_navegacion_x,n_disp_x,v_pw_x,q_notas_x,q_homes_x,pv_deportes_x,pv_espectaculos_x,pv_estilo_x,pv_noticias_x,pv_servicios_x,...,v_noticias_x,v_servicios_x,v_sudoku_x,v_sociedad_x,a_edad,susc_target,antiguedad_pase,q_visitas_10,q_visitas,genero_n
count,240308.0,240308.0,240308.0,240308.0,240308.0,240308.0,240308.0,240308.0,240308.0,240308.0,...,240308.0,240308.0,240308.0,240308.0,240308.0,240308.0,240308.0,240308.0,240308.0,240308.0
mean,8.350217,1.138285,3.850355,9.696514,36.523324,8.103555,5.60221,2.140869,7.42701,0.000995,...,0.574933,0.0002,0.040402,0.624956,44.816456,0.002563,1037.957247,6.282304,9.869855,0.251436
std,7.881808,0.357854,9.768997,11.738791,104.995376,19.899949,13.067906,8.345705,21.124822,0.114214,...,0.494354,0.014132,0.196901,0.484135,22.388447,0.050565,590.283342,4.650816,2.961253,0.433839
min,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,-7986.0,0.0,90.0,0.0,1.0,0.0
25%,2.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,33.0,0.0,490.0,1.0,8.0,0.0
50%,5.0,1.0,0.0,5.0,4.0,0.0,0.0,0.0,1.0,0.0,...,1.0,0.0,0.0,1.0,43.0,0.0,1098.0,6.0,12.0,0.0
75%,13.0,1.0,2.0,16.0,31.0,7.0,6.0,1.0,7.0,0.0,...,1.0,0.0,0.0,1.0,56.0,0.0,1440.0,11.0,12.0,1.0
max,31.0,3.0,110.0,387.0,2024.0,352.0,643.0,296.0,1848.0,33.0,...,1.0,1.0,1.0,1.0,99.0,1.0,2283.0,12.0,12.0,1.0


In [34]:
def escalar(x,escalador,nom_clase):
    """Scale the columns of `x` with a fitted scaler, leaving the label unscaled.

    Args:
        x: DataFrame holding the feature columns and the label column.
        escalador: Fitted object whose `transform` accepts the whole frame
            (label included) and returns the scaled values in the same column order.
        nom_clase: Name of the label column.

    Returns:
        A new DataFrame with a fresh 0..n-1 index, the scaled feature columns
        in their original order and the original (unscaled) label as the last
        column. `x` itself is not modified.
    """
    # reset_index returns a new frame, so the caller's data is left untouched
    # and the row order lines up with the positional output of transform().
    temp = x.reset_index(drop = True)
    clase = temp[nom_clase]
    columnas = temp.columns
    temp = pd.DataFrame(escalador.transform(temp))
    temp.columns = columnas
    # The label was scaled together with the features; discard that version
    # and append the original values instead. `axis` is given by keyword
    # because pandas 2.0 no longer accepts it positionally.
    temp = temp.drop(nom_clase, axis = 1)
    return pd.concat([temp, clase], axis = 1)

# Fit the scaler on the training split only, then apply the same
# transformation to both splits.
escalador = StandardScaler().fit(traindf)
traindf = escalar(x = traindf, escalador = escalador, nom_clase = 'susc_target')
evaldf = escalar(x = evaldf, escalador = escalador, nom_clase = 'susc_target')

In [35]:
traindf.describe()

Unnamed: 0,dias_navegacion_x,n_disp_x,v_pw_x,q_notas_x,q_homes_x,pv_deportes_x,pv_espectaculos_x,pv_estilo_x,pv_noticias_x,pv_servicios_x,...,v_noticias_x,v_servicios_x,v_sudoku_x,v_sociedad_x,a_edad,antiguedad_pase,q_visitas_10,q_visitas,genero_n,susc_target
count,560718.0,560718.0,560718.0,560718.0,560718.0,560718.0,560718.0,560718.0,560718.0,560718.0,...,560718.0,560718.0,560718.0,560718.0,560718.0,560718.0,560718.0,560718.0,560718.0,560718.0
mean,-5.1296320000000006e-17,-2.470409e-16,1.5890710000000003e-17,-5.83673e-17,5.626375e-18,1.6714390000000003e-17,-7.951690000000001e-18,-2.050332e-17,6.6908240000000005e-18,8.807051e-19,...,-7.833840000000001e-17,1.1126030000000001e-17,7.040572e-17,6.057224000000001e-17,-1.133765e-16,-3.009604e-17,9.035147e-18,1.779658e-16,3.755985e-17,0.002688
std,1.000001,1.000001,1.000001,1.000001,1.000001,1.000001,1.000001,1.000001,1.000001,1.000001,...,1.000001,1.000001,1.000001,1.000001,1.000001,1.000001,1.000001,1.000001,1.000001,0.051773
min,-0.9284873,-0.3841737,-0.3930693,-0.829562,-0.3493086,-0.4051154,-0.428777,-0.2529085,-0.3479751,-0.008416741,...,-1.160343,-0.01444661,-0.2055669,-1.285575,-373.1448,-1.605055,-1.346658,-2.991478,-0.5788206,0.0
25%,-0.8014341,-0.3841737,-0.3930693,-0.7434145,-0.3493086,-0.4051154,-0.428777,-0.2529085,-0.3479751,-0.008416741,...,-1.160343,-0.01444661,-0.2055669,-1.285575,-0.549273,-0.9291515,-1.131871,-0.6287599,-0.5788206,0.0
50%,-0.4202746,-0.3841737,-0.3930693,-0.3988241,-0.3106275,-0.4051154,-0.428777,-0.2529085,-0.3011249,-0.008416741,...,0.8618144,-0.01444661,-0.2055669,0.7778618,-0.08463219,0.09910256,-0.05794024,0.7213646,-0.5788206,0.0
75%,0.4690975,-0.3841737,-0.1902659,0.5487994,-0.04953008,-0.05325196,0.03015555,-0.1339235,-0.02002409,-0.008416741,...,0.8618144,-0.01444661,-0.2055669,0.7778618,0.5194009,0.6818363,1.015991,0.7213646,1.727651,0.0
max,2.883108,5.237988,19.07606,34.92169,36.88125,53.58079,51.35411,39.96401,65.52332,367.9462,...,0.8618144,69.22038,4.864596,0.7778618,2.517357,2.118343,1.230777,0.7213646,1.727651,1.0


In [36]:
evaldf.describe()

Unnamed: 0,dias_navegacion_x,n_disp_x,v_pw_x,q_notas_x,q_homes_x,pv_deportes_x,pv_espectaculos_x,pv_estilo_x,pv_noticias_x,pv_servicios_x,...,v_noticias_x,v_servicios_x,v_sudoku_x,v_sociedad_x,a_edad,antiguedad_pase,q_visitas_10,q_visitas,genero_n,susc_target
count,240308.0,240308.0,240308.0,240308.0,240308.0,240308.0,240308.0,240308.0,240308.0,240308.0,...,240308.0,240308.0,240308.0,240308.0,240308.0,240308.0,240308.0,240308.0,240308.0,240308.0
mean,0.005381,0.004557,-0.002637,0.005769,0.003882,0.00222,-0.000271,0.001823,-1.9e-05,-0.000461,...,0.002262,-0.000617,-0.000721,0.003983,-0.000232,0.000779,0.002695,0.002374,0.001109,0.002563
std,1.001409,1.005957,0.990593,1.011269,1.015334,1.000295,0.999548,0.993013,0.989701,0.913599,...,0.999662,0.978405,0.998322,0.998983,1.040259,0.999936,0.998931,0.999515,1.000638,0.050565
min,-0.928487,-0.384174,-0.393069,-0.829562,-0.349309,-0.405115,-0.428777,-0.252908,-0.347975,-0.008417,...,-1.160343,-0.014447,-0.205567,-1.285575,-373.144772,-1.605055,-1.346658,-2.991478,-0.578821,0.0
25%,-0.801434,-0.384174,-0.393069,-0.743414,-0.349309,-0.405115,-0.428777,-0.252908,-0.347975,-0.008417,...,-1.160343,-0.014447,-0.205567,-1.285575,-0.549273,-0.927458,-1.131871,-0.62876,-0.578821,0.0
50%,-0.420275,-0.384174,-0.393069,-0.398824,-0.310627,-0.405115,-0.428777,-0.252908,-0.301125,-0.008417,...,0.861814,-0.014447,-0.205567,0.777862,-0.084632,0.102491,-0.05794,0.721365,-0.578821,0.0
75%,0.596151,-0.384174,-0.190266,0.548799,-0.04953,-0.053252,0.030156,-0.133924,-0.020024,-0.008417,...,0.861814,-0.014447,-0.205567,0.777862,0.519401,0.681836,1.015991,0.721365,1.727651,0.0
max,2.883108,5.237988,10.761117,32.509555,19.223327,17.28859,48.753496,34.966642,86.23108,263.959039,...,0.861814,69.220384,4.864596,0.777862,2.517357,2.109873,1.230777,0.721365,1.727651,1.0


In [None]:
# Write both splits as header-less, index-less CSV files for the trainer.
for frame, destino in ((traindf, "train.csv"), (evaldf, "eval.csv")):
    frame.to_csv(destino, sep = ',', header = False, index = False)

In [37]:
!head train.csv

-0.9284872684547287,-0.3841736989164848,-0.39306932368834213,-0.6572668666520516,-0.32029776176593816,-0.4051154197875623,-0.4287770134171482,1.055926184018161,-0.34797505969971154,-0.008416740950961833,-0.07771682707713344,-0.5177951256905439,-0.9689542454028264,-0.9863570956736601,1.5006005961749365,-1.1603425981028934,-0.014446611598733014,-0.20556692903439944,-1.2855754298778754,-0.2704885307033339,0.049976745987389236,1.2307770341311648,0.7213645951381626,-0.5788206295481897,0
0.08793804291200027,-0.3841736989164848,-0.39306932368834213,-0.48497169061610923,0.9561784406548663,1.856863947083773,-0.4287770134171482,-0.2529084689431448,-0.34797505969971154,-0.008416740950961833,-0.07771682707713344,-0.5177951256905439,1.0320404753314916,-0.9863570956736601,-0.66639984186933,-1.1603425981028934,-0.014446611598733014,-0.20556692903439944,-1.2855754298778754,0.3800086613829106,-1.1815565586914543,-0.4875126582768798,-0.29122878426113097,1.7276509318276543,0
1.2314165181995704,-0.38417

In [38]:
!head eval.csv

-0.8014341045338876,-0.3841736989164848,-0.39306932368834213,-0.48497169061610923,-0.3493085845482292,-0.4051154197875623,-0.4287770134171482,1.8888209631753554,-0.34797505969971154,-0.008416740950961833,-0.07771682707713344,-0.4608482150365636,-0.9689542454028264,-0.9863570956736601,1.5006005961749365,-1.1603425981028934,-0.014446611598733014,-0.20556692903439944,0.777861786060267,-0.5028089564484213,-1.0900809074498345,-1.3466575044809022,-0.29122878426113097,1.7276509318276543,0
2.883107649170505,-0.3841736989164848,0.7223493176820088,0.9795373056894003,0.994859537697921,-0.002985754565991604,-0.0463332126308965,1.055926184018161,1.5260304598078127,-0.008416740950961833,-0.07771682707713344,-0.29000748307462293,1.0320404753314916,1.0138316076258587,1.5006005961749365,0.8618144344911185,-0.014446611598733014,-0.20556692903439944,0.777861786060267,0.8911135980221028,-0.22445020773746996,1.0159908225801593,0.38383346867173146,-0.5788206295481897,0
-0.9284872684547287,-0.3841736989164

Now that we have the TensorFlow code working on a subset of the data, we can package the TensorFlow code up as a Python module and train it on Cloud ML Engine.
<p>
<h2> Train on Cloud ML Engine </h2>
<p>
Training on Cloud ML Engine requires:
<ol>
<li> Making the code a Python package
<li> Using gcloud to submit the training code to Cloud ML Engine
</ol>

## Lab Task 1

The following code edits paywall_module/trainer/task.py.

In [10]:
%writefile paywall_module/trainer/task.py
# Copyright 2017 Google Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import argparse
import json
import os
import shutil

from . import model

if __name__ == "__main__":
    # Command-line entry point: parse the job flags, turn them into the plain
    # dictionary that model.build_model expects, and start the training job.
    parser = argparse.ArgumentParser()
    # Input and output paths
    parser.add_argument(
        "--train_data_paths",
        help = "GCS path to training data",
        required = True
    )
    parser.add_argument(
        "--eval_data_paths",
        help = "GCS path to validation data",
        required = True
    )
    parser.add_argument(
        "--output_dir",
        help = "GCS path to write checkpoints and export models",
        required = True
    )
    # Tunable hyperparameters
    parser.add_argument(
        "--batch_size",
        help = "The number of examples in each mini-batch",
        type = int,
        default = 512
    )
    parser.add_argument(
        "--learning_rate",
        help = "The learning rate for gradient descent",
        type = float,
        default = 0.1
    )
    parser.add_argument(
        "--hidden_units",
        help = "Hidden layer sizes to use for DNN feature columns -- provide space-separated layers",
        type = str,
        default = "128 32 4"
    )
    parser.add_argument(
        "--weighted_cross_entropy_pos_weight",
        help = "Introduced as a multiplicative coefficient for the positive targets term in the loss expression",
        type = float,
        default = 0.1
    )
    # Training/evaluation loop parameters
    parser.add_argument(
        "--classification_threshold",
        help = "The binary classification threshold",
        type = float,
        default = 0.5
    )
    parser.add_argument(
        "--train_steps",
        help = "The number of steps/batches to train on",
        type = int,
        default = 100
    )
    parser.add_argument(
        "--start_delay_secs",
        help = "The number of seconds to delay before starting evaluation",
        type = int,
        default = 30
    )
    parser.add_argument(
        "--throttle_secs",
        help = "The number of seconds between each evaluation",
        type = int,
        default = 60
    )
    parser.add_argument(
        "--job-dir",
        help = "this model ignores this field, but it is required by gcloud",
        default = "junk"
    )

    args = parser.parse_args()
    arguments = args.__dict__

    # Unused argument provided by the service. argparse stores "--job-dir"
    # under the key "job_dir", so that is the only key that has to be removed.
    arguments.pop("job_dir", None)

    # Create hidden_units list. split() without a separator tolerates repeated
    # or surrounding whitespace; split(" ") would produce empty strings that
    # int() rejects.
    arguments["hidden_units"] = [int(x) for x in arguments["hidden_units"].split()]

    # Append trial_id to path if we are doing hptuning
    # This code can be removed if you are not using hyperparameter tuning
    tf_config = json.loads(os.environ.get("TF_CONFIG", "{}"))
    arguments["output_dir"] = os.path.join(
        arguments["output_dir"],
        tf_config.get("task", {}).get("trial", "")
    )

    # Run the training job
    # Start fresh each time. shutil only works on the local filesystem, so with
    # ignore_errors = True this silently does nothing for gs:// output paths;
    # those have to be cleaned by the caller (e.g. with gsutil).
    shutil.rmtree(arguments["output_dir"], ignore_errors = True)
    model.build_model(arguments)

Overwriting paywall_module/trainer/task.py


## Lab Task 2

The following code edits paywall_module/trainer/model.py.

In [42]:
%writefile paywall_module/trainer/model.py
#!/usr/bin/env python

# Copyright 2017 Google Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

# Import helpful libraries and setup our project, bucket, and region
import tensorflow as tf

# Build, train and evaluate the paywall binary classification neural network model
def build_model(args):
    tf.logging.set_verbosity(tf.logging.INFO)
  
    CSV_COLUMNS = ["dias_navegacion_x", "n_disp_x", "v_pw_x", "q_notas_x", "q_homes_x",
                   "pv_deportes_x", "pv_espectaculos_x", "pv_estilo_x", "pv_noticias_x",
                   "pv_servicios_x", "pv_sudoku_x", "pv_sociedad_x", "v_deportes_x",
                   "v_espectaculos_x", "v_estilo_x", "v_noticias_x", "v_servicios_x",
                   "v_sudoku_x", "v_sociedad_x", "a_edad", "antiguedad_pase",
                   "q_visitas_10", "q_visitas", "genero_n", "susc_target"]
    LABEL_COLUMN = "susc_target"
    CSV_DEFAULTS = [[0.0] for _ in CSV_COLUMNS]

    # Create input function for train and eval
    def read_dataset(filename, mode, batch_size = 512):
        """Build an input function that reads header-less CSV files.

        Args:
            filename: File name or glob pattern of the CSV files to read.
            mode: A `tf.estimator.ModeKeys` value. In TRAIN mode the data is
                shuffled and repeated indefinitely; otherwise it is read once.
            batch_size: Number of examples per batch.

        Returns:
            A zero-argument function that returns a `(features, label)` pair of
            tensors, where `features` is a dict keyed by column name.

        Raises:
            ValueError: Raised by the returned input function when no file
                matches `filename`.
        """
        def _input_fn():
            def decode_csv(value_column):
                # Parse one CSV line into a dict of columns and split off the label
                columns = tf.decode_csv(records = value_column, record_defaults = CSV_DEFAULTS)
                features = dict(zip(CSV_COLUMNS, columns))
                label = features.pop(LABEL_COLUMN)
                return features, label

            # Create list of files that match pattern
            file_list = tf.gfile.Glob(filename = filename)
            if not file_list:
                # Fail early with a clear message instead of handing an empty
                # file list to the dataset (e.g. after a mistyped path).
                raise ValueError("No input files match pattern: {}".format(filename))

            # Create dataset from file list
            dataset = tf.data.TextLineDataset(filenames = file_list).map(map_func = decode_csv)

            if mode == tf.estimator.ModeKeys.TRAIN:
                num_epochs = None # indefinitely
                dataset = dataset.shuffle(buffer_size = 10 * batch_size)
            else:
                num_epochs = 1 # end-of-input after this

            dataset = dataset.repeat(count = num_epochs).batch(batch_size = batch_size)
            return dataset.make_one_shot_iterator().get_next()
        return _input_fn

    # Create feature columns to be used in model
    def make_feature_cols():
        """Return one numeric feature column for every CSV column except the last one (the label)."""
        numeric_columns = []
        for feature_name in CSV_COLUMNS[:-1]:
            numeric_columns.append(tf.feature_column.numeric_column(feature_name))
        return numeric_columns

    # Create custom model function for our custom estimator
    def custom_dnn_binary_classifier(features, labels, mode, params):
        """Model function of the custom estimator: a feed-forward binary classifier.

        Args:
            features: Dict of feature tensors keyed by column name, read through
                the feature columns returned by `make_feature_cols`.
            labels: Label tensor; not used in PREDICT mode.
            mode: A `tf.estimator.ModeKeys` value.
            params: Dict with the keys "hidden_units", "classification_threshold",
                "weighted_cross_entropy_pos_weight" and "learning_rate".

        Returns:
            A `tf.estimator.EstimatorSpec` for the requested mode.
        """
        # Create neural network input layer using our feature columns defined above
        net = tf.feature_column.input_layer(features = features, feature_columns = make_feature_cols())

        # Create hidden layers by looping through hidden unit list
        for units in params["hidden_units"]:
            net = tf.layers.dense(inputs = net, units = units, activation = tf.nn.relu) # shape = (batch_size, units)

        # Compute logits using the output of our last hidden layer
        logits = tf.layers.dense(inputs = net, units = 1, activation = None) # shape = (batch_size, 1)

        # Find the probabilities from the logits
        probabilities = tf.nn.sigmoid(x = logits) # shape = (batch_size, 1)

        # Find the predicted class indices based on the classification threshold
        predicted_class = tf.where(condition = tf.less(x = probabilities, y = params["classification_threshold"]),
                                   x = tf.zeros_like(tensor = probabilities, dtype = tf.int32),
                                   y = tf.ones_like(tensor = probabilities, dtype = tf.int32)) # shape = (batch_size, 1)

        # If the mode is prediction
        if mode == tf.estimator.ModeKeys.PREDICT:
            # Create predictions dict
            predictions_dict = {
                "logits": logits,
                "probabilities": probabilities,
                "predicted_class": predicted_class}

            # Create export outputs
            export_outputs = {"predict_export_outputs": tf.estimator.export.PredictOutput(outputs = predictions_dict)}

            return tf.estimator.EstimatorSpec( # return early since we're done with what we need for prediction mode
                mode = mode,
                predictions = predictions_dict,
                loss = None,
                train_op = None,
                eval_metric_ops = None,
                export_outputs = export_outputs)

        # Continue on with training and evaluation modes

        # Compute the loss with weighted sigmoid cross entropy: pos_weight scales
        # the positive-target term of the loss
        loss_per_example = tf.nn.weighted_cross_entropy_with_logits(targets = labels,
                                                                    logits = tf.squeeze(input = logits, axis = 1),
                                                                    pos_weight = params["weighted_cross_entropy_pos_weight"]) # shape = (batch_size)

        loss = tf.reduce_mean(input_tensor = loss_per_example) # shape = ()

        # Create loss scalar summaries to see in TensorBoard
        tf.summary.scalar(name = "loss", tensor = loss)

        # If the mode is evaluation
        if mode == tf.estimator.ModeKeys.EVAL:
            # Compute evaluation metrics
            accuracy = tf.metrics.accuracy(labels = labels, predictions = predicted_class)
            precision = tf.metrics.precision(labels = labels, predictions = predicted_class)
            recall = tf.metrics.recall(labels = labels, predictions = predicted_class)
            # AUC is a ranking metric, so it is computed from the predicted
            # probabilities. Feeding it the thresholded 0/1 classes would reduce
            # the ROC curve to a single operating point.
            auc = tf.metrics.auc(labels = labels, predictions = probabilities)
            true_positives = tf.metrics.true_positives(labels = labels, predictions = predicted_class)
            false_positives = tf.metrics.false_positives(labels = labels, predictions = predicted_class)
            true_negatives = tf.metrics.true_negatives(labels = labels, predictions = predicted_class)
            false_negatives = tf.metrics.false_negatives(labels = labels, predictions = predicted_class)

            # Put eval metrics into a dictionary
            eval_metric_ops = {
              "accuracy": accuracy,
              "precision": precision,
              "recall": recall,
              "auc": auc,
              "true_positives": true_positives,
              "false_positives": false_positives,
              "true_negatives": true_negatives,
              "false_negatives": false_negatives}

            # Create eval scalar summaries to see in TensorBoard
            tf.summary.scalar(name = "accuracy", tensor = accuracy[1])
            tf.summary.scalar(name = "precision", tensor = precision[1])
            tf.summary.scalar(name = "recall", tensor = recall[1])
            tf.summary.scalar(name = "auc", tensor = auc[1])
            tf.summary.scalar(name = "true_positives", tensor = true_positives[1])
            tf.summary.scalar(name = "false_positives", tensor = false_positives[1])
            tf.summary.scalar(name = "true_negatives", tensor = true_negatives[1])
            tf.summary.scalar(name = "false_negatives", tensor = false_negatives[1])

            return tf.estimator.EstimatorSpec( # return early since we're done with what we need for evaluation mode
                mode = mode,
                predictions = None,
                loss = loss,
                train_op = None,
                eval_metric_ops = eval_metric_ops,
                export_outputs = None)

        # Continue on with training mode

        # If the mode is training
        if mode == tf.estimator.ModeKeys.TRAIN:
            # Create a custom optimizer
            optimizer = tf.train.AdagradOptimizer(learning_rate = params["learning_rate"])

            # Create train op
            train_op = optimizer.minimize(loss = loss, global_step = tf.train.get_global_step())

            return tf.estimator.EstimatorSpec( # final return since we're done with what we need for training mode
              mode = mode,
              predictions = None,
              loss = loss,
              train_op = train_op,
              eval_metric_ops = None,
              export_outputs = None)

    # Create serving input function
    def serving_input_fn():
        """Build the serving receiver: one float32 placeholder of shape [None] per feature.

        Each placeholder is passed to the model with an extra trailing
        dimension added by `tf.expand_dims(..., -1)`.
        """
        receiver_tensors = {}
        model_features = {}
        for feature_name in CSV_COLUMNS[:-1]:
            placeholder = tf.placeholder(dtype = tf.float32, shape = [None])
            receiver_tensors[feature_name] = placeholder
            model_features[feature_name] = tf.expand_dims(placeholder, -1)

        return tf.estimator.export.ServingInputReceiver(model_features, receiver_tensors)

    # Create train and evaluate loop to combine all of the pieces together.
    def train_and_evaluate(args):
        """Assemble the estimator, its train/eval specs and the exporter, then run them.

        Args:
            args: Dict of job settings; this function reads "output_dir",
                "hidden_units", "classification_threshold",
                "weighted_cross_entropy_pos_weight", "learning_rate",
                "train_data_paths", "eval_data_paths", "batch_size",
                "train_steps", "start_delay_secs" and "throttle_secs".
        """
        # Input functions for the two data splits
        train_input_fn = read_dataset(
            filename = args["train_data_paths"],
            mode = tf.estimator.ModeKeys.TRAIN,
            batch_size = args["batch_size"])
        eval_input_fn = read_dataset(
            filename = args["eval_data_paths"],
            mode = tf.estimator.ModeKeys.EVAL,
            batch_size = args["batch_size"])

        # Hyperparameters forwarded to the model function
        model_params = {}
        for param_name in ("hidden_units",
                           "classification_threshold",
                           "weighted_cross_entropy_pos_weight",
                           "learning_rate"):
            model_params[param_name] = args[param_name]

        # Checkpoint, summary and logging cadence (in steps)
        run_config = tf.estimator.RunConfig(
            save_checkpoints_steps = 100,
            keep_checkpoint_max = 20,
            save_summary_steps = 100,
            log_step_count_steps = 100)

        estimator = tf.estimator.Estimator(
            model_fn = custom_dnn_binary_classifier,
            model_dir = args["output_dir"],
            config = run_config,
            params = model_params)

        train_spec = tf.estimator.TrainSpec(input_fn = train_input_fn, max_steps = args["train_steps"])

        exporter = tf.estimator.BestExporter(
            name = "exporter",
            serving_input_receiver_fn = serving_input_fn,
            exports_to_keep = 10)

        eval_spec = tf.estimator.EvalSpec(
            input_fn = eval_input_fn,
            steps = None,
            start_delay_secs = args["start_delay_secs"],
            throttle_secs = args["throttle_secs"],
            exporters = exporter)

        tf.estimator.train_and_evaluate(estimator, train_spec, eval_spec)
        
    # Call train_and_evaluate loop
    train_and_evaluate(args)

Overwriting paywall_module/trainer/model.py


## Lab Task 3

After moving the code to a package, make sure it works standalone. (Note the small --train_steps value so that I am not trying to boil the ocean on my laptop.) Even then, this takes about <b>3 minutes</b> in which you won't see any output ...

In [12]:
%writefile requirements.txt
tensorflow==1.12.0

Overwriting requirements.txt


In [None]:
%bash
# Train locally on the CSV files written earlier in the notebook.
# OUTDIR has to be defined in this cell: it is not exported by the notebook,
# so without this line --output_dir would be passed an empty string.
OUTDIR="${PWD}/paywall_trained"
rm -rf "${OUTDIR}"
export PYTHONPATH=${PYTHONPATH}:${PWD}/paywall_module
python -m trainer.task \
  --train_data_paths=train.csv \
  --eval_data_paths=eval.csv \
  --output_dir="${OUTDIR}" \
  --batch_size=128 \
  --learning_rate=0.1 \
  --hidden_units="64 32 16" \
  --weighted_cross_entropy_pos_weight=40.0 \
  --classification_threshold=0.5 \
  --train_steps=2000 \
  --start_delay_secs=1 \
  --throttle_secs=5

## Lab Task 5

Once the code works in standalone mode, you can run it on Cloud ML Engine.  Because this is on the entire dataset, it will take a while. The training run took about <b> an hour </b> for me. You can monitor the job from the GCP console in the Cloud Machine Learning Engine section.

In [None]:
%bash
OUTDIR=gs://${BUCKET}/ryan_paywall/trained_model
DATADIR=gs://${BUCKET}/ryan_paywall/data
JOBNAME=paywall_$(date -u +%y%m%d_%H%M%S)
echo $OUTDIR $REGION $JOBNAME
# Stage the locally written CSV files in Cloud Storage: the training job
# reads them from ${DATADIR}, and no other cell uploads them.
gsutil -m cp train.csv eval.csv ${DATADIR}/
# Remove the output of any previous run.
gsutil -m rm -rf $OUTDIR
# Everything after the bare "--" is passed through to trainer.task.
gcloud ml-engine jobs submit training $JOBNAME \
  --region=$REGION \
  --module-name=trainer.task \
  --package-path=$(pwd)/paywall_module/trainer \
  --job-dir=$OUTDIR \
  --staging-bucket=gs://$BUCKET \
  --scale-tier=STANDARD_1 \
  --runtime-version=$TFVERSION \
  -- \
  --train_data_paths=${DATADIR}/train.csv \
  --eval_data_paths=${DATADIR}/eval.csv \
  --output_dir=${OUTDIR} \
  --batch_size=128 \
  --learning_rate=0.1 \
  --hidden_units="256 128 64" \
  --weighted_cross_entropy_pos_weight=40.0 \
  --classification_threshold=0.5 \
  --train_steps=2000 \
  --start_delay_secs=1 \
  --throttle_secs=5

In [7]:
from google.datalab.ml import TensorBoard

# Launch TensorBoard on the output directory of the cloud training job.
tensorboard_logdir = 'gs://{}/ryan_paywall/trained_model'.format(BUCKET)
TensorBoard().start(tensorboard_logdir)

23973

In [None]:
# Stop every TensorBoard instance reported by TensorBoard.list().
running_pids = TensorBoard.list()['pid']
for running_pid in running_pids:
    TensorBoard().stop(running_pid)
    print('Stopped TensorBoard with pid {}'.format(running_pid))

<h2> Hyperparameter tuning </h2>
<p>
All of these are command-line parameters to my program.  To do hyperparameter tuning, create a hyperparameter configuration YAML file (the cells below write one per metric to optimize) and pass it to gcloud as --config.
This step will take <b>1 hour</b> -- you can increase maxParallelTrials or reduce maxTrials to get it done faster.  Since maxParallelTrials is the number of initial seeds to start searching from, you don't want it to be too large; otherwise, all you have is a random search.


In [45]:
%writefile hyperparam_maximize_auc.yaml
# Hyperparameter tuning configuration that maximizes the "auc" evaluation metric.
trainingInput:
  scaleTier: STANDARD_1
  hyperparameters:
    # Metric to optimize; "auc" is one of the evaluation metrics reported by trainer/model.py.
    hyperparameterMetricTag: auc
    goal: MAXIMIZE
    maxTrials: 1000
    maxParallelTrials: 10
    enableTrialEarlyStopping: True
    # Each parameterName matches a command-line flag defined in trainer/task.py.
    params:
    - parameterName: batch_size
      type: INTEGER
      minValue: 8
      maxValue: 512
      scaleType: UNIT_LOG_SCALE
    - parameterName: learning_rate
      type: DOUBLE
      minValue: 0.01
      maxValue: 0.1
      scaleType: UNIT_LOG_SCALE
    # Space-separated layer sizes; task.py splits the string into a list of ints.
    - parameterName: hidden_units
      type: CATEGORICAL
      categoricalValues: ["32 16 8", "64 32 16", "128 64 32"]
    - parameterName: weighted_cross_entropy_pos_weight
      type: DOUBLE
      minValue: 2.0
      maxValue: 400.0
      scaleType: UNIT_LOG_SCALE

Overwriting hyperparam_maximize_auc.yaml


In [46]:
%writefile hyperparam_maximize_precision.yaml
# Hyperparameter tuning configuration that maximizes the "precision" evaluation metric.
trainingInput:
  scaleTier: STANDARD_1
  hyperparameters:
    # Metric to optimize; "precision" is one of the evaluation metrics reported by trainer/model.py.
    hyperparameterMetricTag: precision
    goal: MAXIMIZE
    maxTrials: 1000
    maxParallelTrials: 10
    enableTrialEarlyStopping: True
    # Each parameterName matches a command-line flag defined in trainer/task.py.
    params:
    - parameterName: batch_size
      type: INTEGER
      minValue: 8
      maxValue: 512
      scaleType: UNIT_LOG_SCALE
    - parameterName: learning_rate
      type: DOUBLE
      minValue: 0.01
      maxValue: 0.1
      scaleType: UNIT_LOG_SCALE
    # Space-separated layer sizes; task.py splits the string into a list of ints.
    - parameterName: hidden_units
      type: CATEGORICAL
      categoricalValues: ["32 16 8", "64 32 16", "128 64 32"]
    - parameterName: weighted_cross_entropy_pos_weight
      type: DOUBLE
      minValue: 2.0
      maxValue: 400.0
      scaleType: UNIT_LOG_SCALE

Overwriting hyperparam_maximize_precision.yaml


In [None]:
%bash
# Hyperparameter tuning job that optimizes AUC. The tuned flags are listed in
# hyperparam_maximize_auc.yaml, so only the fixed flags are given here.
JOBNAME=paywall_hptuning_auc_$(date -u +%y%m%d_%H%M%S)
DATADIR=gs://${BUCKET}/ryan_paywall/data
OUTDIR=gs://${BUCKET}/ryan_paywall/hyperparam/auc
echo ${OUTDIR} ${REGION} ${JOBNAME}
# Remove the output of any previous run.
gsutil -m rm -rf ${OUTDIR}
# Everything after the bare "--" is passed through to trainer.task.
gcloud ml-engine jobs submit training ${JOBNAME} \
  --region=${REGION} \
  --module-name=trainer.task \
  --package-path=$(pwd)/paywall_module/trainer \
  --job-dir=${OUTDIR} \
  --staging-bucket=gs://${BUCKET} \
  --scale-tier=STANDARD_1 \
  --config=hyperparam_maximize_auc.yaml \
  --runtime-version=${TFVERSION} \
  -- \
  --train_data_paths=${DATADIR}/train.csv \
  --eval_data_paths=${DATADIR}/eval.csv \
  --output_dir=${OUTDIR} \
  --classification_threshold=0.5 \
  --train_steps=100000 \
  --start_delay_secs=1 \
  --throttle_secs=3

In [None]:
%bash
# Hyperparameter tuning job that optimizes precision. The tuned flags are listed
# in hyperparam_maximize_precision.yaml, so only the fixed flags are given here.
JOBNAME=paywall_hptuning_precision_$(date -u +%y%m%d_%H%M%S)
DATADIR=gs://${BUCKET}/ryan_paywall/data
OUTDIR=gs://${BUCKET}/ryan_paywall/hyperparam/precision
echo ${OUTDIR} ${REGION} ${JOBNAME}
# Remove the output of any previous run.
gsutil -m rm -rf ${OUTDIR}
# Everything after the bare "--" is passed through to trainer.task.
gcloud ml-engine jobs submit training ${JOBNAME} \
  --region=${REGION} \
  --module-name=trainer.task \
  --package-path=$(pwd)/paywall_module/trainer \
  --job-dir=${OUTDIR} \
  --staging-bucket=gs://${BUCKET} \
  --scale-tier=STANDARD_1 \
  --config=hyperparam_maximize_precision.yaml \
  --runtime-version=${TFVERSION} \
  -- \
  --train_data_paths=${DATADIR}/train.csv \
  --eval_data_paths=${DATADIR}/eval.csv \
  --output_dir=${OUTDIR} \
  --classification_threshold=0.5 \
  --train_steps=100000 \
  --start_delay_secs=1 \
  --throttle_secs=3

<h2> Repeat training </h2>
<p>
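This time with tuned parameters: replace the hyperparameter values in the command below (batch_size, learning_rate, hidden_units, weighted_cross_entropy_pos_weight) with those of the best tuning trial. As written, they are still the values used in the first training run.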

In [None]:
%bash
# Final training run; its output goes to a separate trained_model_tuned directory.
JOBNAME=paywall_$(date -u +%y%m%d_%H%M%S)
DATADIR=gs://${BUCKET}/ryan_paywall/data
OUTDIR=gs://${BUCKET}/ryan_paywall/trained_model_tuned
echo ${OUTDIR} ${REGION} ${JOBNAME}
# Remove the output of any previous run.
gsutil -m rm -rf ${OUTDIR}
# Everything after the bare "--" is passed through to trainer.task.
gcloud ml-engine jobs submit training ${JOBNAME} \
  --region=${REGION} \
  --module-name=trainer.task \
  --package-path=$(pwd)/paywall_module/trainer \
  --job-dir=${OUTDIR} \
  --staging-bucket=gs://${BUCKET} \
  --scale-tier=STANDARD_1 \
  --runtime-version=${TFVERSION} \
  -- \
  --train_data_paths=${DATADIR}/train.csv \
  --eval_data_paths=${DATADIR}/eval.csv \
  --output_dir=${OUTDIR} \
  --batch_size=128 \
  --learning_rate=0.1 \
  --hidden_units="256 128 64" \
  --weighted_cross_entropy_pos_weight=40.0 \
  --classification_threshold=0.5 \
  --train_steps=2000 \
  --start_delay_secs=1 \
  --throttle_secs=5

Copyright 2017 Google Inc. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License