In [0]:
# Israei Tech Challenge - Part 2 of 4
# Welcome to the ITC Taboola workshop. In this workshop we will address a real world problem:
# have a list of users and items (commercials) and their features. 
# Lets try to predict the probablilty of a click (a user clicking on the commercial). 

# After only using only numerical features, it's time to use our categorical features!
# To Do that we'll need to use embedding in our tensor flow graph. 

# Let's start with basic imports. 
import pandas as pd 
import numpy as np
from collections import Counter, defaultdict
import ast
import os
import operator
import csv
from numbers import Number
from tensorflow.contrib.tensorboard.plugins import projector
import tensorflow as tf
import matplotlib.pyplot as pltb 
%matplotlib inline
low_memory=False

# Read Data
data = pd.read_csv('ITC_20K.csv')

# Basic clean-up
data.replace('', np.nan, inplace=True)
data = data.dropna()
data.head(2)

Unnamed: 0,source_id,content_category,ad_type,quality_level,source_item_type,syndicator_id,target_id,campaign_id,title,campaign_language,...,region,browser_name,user_clicks,user_recs,prev_syndicator_clicks,target_recs,campaign_recs,user_category_clicks,user_category_recs,is_click
0,-4.06151e+18,sports,DIRECT_RESPONSE,HIGH,SEARCH,1028792,-8.47673e+18,232405,This game will keep you up all night!,en,...,2,Firefox,0,13,0,0,1,0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0....,0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0....,1
1,-6.24834e+18,fashion,DIRECT_RESPONSE_TIER2,MEDIUM,SEARCH,1058384,-8.17787e+18,722841,People With 5-49 Vehicles Have Discovered This...,en,...,TX,Microsoft Edge,3,562,0,0,2,0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1....,0.0 0.0 0.0 0.0 0.0 2.9620843 18.657581 5.1518...,0


In [0]:
# These are the features which have a categorical value for their columns. 

features_for_embedding =['source_id','content_category','ad_type','quality_level','source_item_type', 
                         'syndicator_id','target_id','campaign_id','campaign_language','user_id',
                         'browser_platform', 'os_family','country_code','os_name','country','region', 'browser_name']

# (ANALYSIS) LOOKING AT THE DATA
How many unique values there're for each category?
How does the distribution looks like? (how many occurances for each value within each category?)

# (CODING) PREPROCESSING:
For each category column we wish to create *another* column for representing the possible values as integers (like enum)
1. The new column name should be same as original with the suffix "_mapped"
2. Start your indexing from 1 - keeping "0" for OOV (Out Of Volcabulary)
3. Pay attention to deside on an OOV threshold (num occurances).
4. Your output should be both:

    4.1 New columns in the original dataframe

    4.2 A dictionary of dictionaries named "look_up_dict": which maps from category_feature to a look_up dictionary (key: category_value, value: index)
    
data["sourced_id_mapped"] = ...

look_up_dict = {"source_id": {"I am a possible value": 1, "OOV": 0, ...}, "content_category": {"OOV":0, "good": 3,...},...}

# your solution

# The code we give you:

In [0]:
# Split into train, test sets
msk = np.random.rand(len(data)) < 0.8 
train = data[msk]
test = data[~msk]

train = train.dropna()
test = test.dropna()

# Create label
y_train = train["is_click"]
del train["is_click"]
y_test = test["is_click"]
del test["is_click"]

In [0]:
EMBEDDINGS_COLLECTION_NAME = "embeddings"
# a function for adding a categorical placeholder
def add_categorical_placeholder(input_name):
    batch_size = None  # using dynamic batch
    placeholder = tf.placeholder(tf.int32, shape=batch_size, name=input_name)
    tf.add_to_collection("input", placeholder)
    return placeholder

# a function that takes a placeholder and creates embedding for it
def add_categorical_embedding(input_placeholder, input_dim, output_dim):
    input_name = input_placeholder.op.name
    uniform = tf.random_uniform(shape=[input_dim, output_dim],minval=0.05,maxval=0.05)
    emb_weights = tf.Variable(initial_value=uniform, name=('{}_weights'.format(input_name)))
    tf.add_to_collection(EMBEDDINGS_COLLECTION_NAME, emb_weights)
    emb = tf.nn.embedding_lookup(emb_weights, input_placeholder, name=('{}_lookup'.format(input_name)))
    emb.set_shape([None, output_dim])
    return emb

In [0]:
embedding_size = 4

def create_embeddings_for_all_features(features_for_embedding, look_up_dict):
    embeddings = []
    placeholder_list = []

    feed_dict = {}
    num_features = 0
    for feature in features_for_embedding:
        input_placeholder = add_categorical_placeholder(feature+'_mapped')
        placeholder_list.append(input_placeholder)
        emb = add_categorical_embedding(input_placeholder, len(look_up_dict[feature]), embedding_size)
        
        embeddings.append(emb)
        num_features += embedding_size 
    return num_features, placeholder_list, embeddings

In [0]:
def _safe_encode(value_0):
    if isinstance(value_0, Number):
        value_0 = str(value_0)
    try:
        value = value_0.encode("utf8")
    except:
        value = value_0
    return value
  
def create_embedding_metadata(embedding_name, embedding_lookup, model_log_dir):
    sorted_emb = sorted(embedding_lookup.items(), key=operator.itemgetter(1))
    embedding_filename = embedding_name + '.tsv'
    metadata_path = os.path.join(model_log_dir, embedding_filename)
    with open(metadata_path, 'wb') as f:
        metadata_writer = csv.writer(f, delimiter='\t')
        for value in sorted_emb:
            value = _safe_encode(value[0])
            if embedding_name in ['target_id']:
                metadata_writer.writerow(['#'+value])  # add some char so tensorboard will show as string
            else:
                metadata_writer.writerow([value])

    return embedding_filename

def project_embeddings():
    embedding_projector = projector.ProjectorConfig()
    metadata_paths = {}
    
    for i in range(len(features_for_embedding)):
        embedding_name = features_for_embedding[i]
        metadata_paths[embedding_name] = create_embedding_metadata(embedding_name, look_up_dict[embedding_name], model_log_dir)
        
    embeddings_vars = tf.get_collection_ref(EMBEDDINGS_COLLECTION_NAME)
    for embedding_var in embeddings_vars:
        embedding = embedding_projector.embeddings.add()
        embedding_name = embedding_var.name[:-len("_mapped_weights:0")]
        embedding.tensor_name = embedding_var.name
        embedding.metadata_path = metadata_paths[embedding_name]

    projection_summary_writer = tf.summary.FileWriter(model_log_dir, sess.graph)
    projector.visualize_embeddings(projection_summary_writer, embedding_projector)

# Building the graph and training

In [0]:
# ! rm -rf /tmp/taboola_tutorial/logs/*

In [0]:
# This is the same code as before! except we move the check on out test dataframe inside the loop,
# so we can track the improvement of our model 

# Parameters
num_samples = train.shape[0]
batch_size = 50
training_epochs=3
total_batch = int(num_samples/batch_size)
learning_rate = 0.25

# TF model
# This is where we start creating our TF graph
tf.reset_default_graph()
# tf Graph Input
num_features, placeholder_list, embeddings = create_embeddings_for_all_features(features_for_embedding, look_up_dict)
    
y = tf.placeholder(tf.float32, [None, 1])

# Set model weights
x = tf.concat(axis=1, values=embeddings, name='concat')
W = tf.Variable(tf.random_normal([num_features, 1], stddev=0.15), name="weights")
b = tf.Variable(tf.zeros([1]), name="bias")

pred = tf.sigmoid(tf.matmul(x, W) + b)

# Minimize error using MSE
# cost = tf.reduce_sum(tf.square(y-pred))
cost = tf.losses.mean_squared_error(labels = y, predictions = pred)

# Gradient Descent
global_step = tf.Variable(0,name='global_step', trainable=False)
optimizer = tf.train.GradientDescentOptimizer(learning_rate).minimize(cost, global_step=global_step)

# Test model
correct_prediction = tf.equal(tf.round(pred), y)
# Calculate accuracy
accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))

# Define test_feed_dict once:
test_feed_dict = {}
for placeholder in placeholder_list:
    feature = placeholder.name[:-2]
    test_feed_dict[placeholder] = test[feature].values
test_feed_dict[y] = y_test
test_feed_dict[y] = test_feed_dict[y].values.reshape([len(y_test),1])

# Initialize the variables (i.e. assign their default value)
init = tf.global_variables_initializer()
saver = tf.train.Saver()
model_log_dir = "/tmp/taboola_tutorial/logs"
# Start training
sess = tf.Session()
with sess:
#     summary_writer = tf.summary.FileWriter(model_log_dir + '/train', sess.graph)
#     summary_writer_test = tf.summary.FileWriter(model_log_dir + '/test', sess.graph)
    loss_summary = tf.summary.scalar('per_batch_loss', cost)
    merge_summaries_op = tf.summary.merge_all()
    # Run the initializer
    sess.run(init)

    # Training cycle
    for epoch in range(training_epochs):
        avg_cost = 0.

        # Loop over all batches
        for i in range(total_batch):
            feed_dict = {}
         
            for placeholder in placeholder_list:
                feature = placeholder.name[:-2]
                feed_dict[placeholder] = train[feature].iloc[i*batch_size : (i+1)*batch_size].values

            feed_dict[y] = y_train.iloc[i*batch_size : (i+1)*batch_size]
            feed_dict[y] = feed_dict[y].values.reshape([batch_size,1])

            # Run optimization op (backprop) and cost op (to get loss value)
            _, c, merged_summary, current_global_step = sess.run([optimizer, cost, merge_summaries_op, global_step], feed_dict)
#             summary_writer.add_summary(merged_summary,current_global_step)
            # Compute average loss
            avg_cost += c / (1.0*total_batch)
            
        print current_global_step
        train_summary = tf.Summary(value=[tf.Summary.Value(tag="loss", simple_value=avg_cost)])
#         summary_writer.add_summary(train_summary, current_global_step)
        # Calc test error:
        test_loss = sess.run(cost, test_feed_dict)
        test_summary = tf.Summary(value=[tf.Summary.Value(tag="loss", simple_value=test_loss)])
#         summary_writer_test.add_summary(test_summary, current_global_step)
        
        # Display logs per epoch step
        print "Epoch:", '%02d' % (epoch+1)
        print "train error:\t", avg_cost
        print "test error:\t", test_loss
        print("Accuracy:", accuracy.eval(test_feed_dict))
    
#         saver.save(sess, os.path.join(model_log_dir, "model.ckpt"), current_global_step)
#     summary_writer.flush() 
#     summary_writer_test.flush() 
# project_embeddings()

319
Epoch: 01
train error:	0.222467447616
test error:	0.204587
('Accuracy:', 0.70306224)
638
Epoch: 02
train error:	0.206358872097
test error:	0.202264
('Accuracy:', 0.70306224)
957
Epoch: 03
train error:	0.204229066505
test error:	0.201157
('Accuracy:', 0.70205826)


In [0]:
# Run TensorBoard
import subprocess
cmd = 'tensorboard --logdir ' + model_log_dir
p = subprocess.Popen(cmd, shell=True)

# -> navigate to http://127.0.0.1:6006/
# go to the "projector" tab
# on the left: move from "train" directory to "."