In [0]:
# Israel Tech Challenge - Part 1 of 4
# Welcome to the ITC Taboola workshop. In this workshop we will address a real-world problem:
# We have a list of users and items (ads) and their features. 
# Let's try to predict the probability of a click (a user clicking on the commercial). 

# We are going to open the data, get to know it a little bit, then build a basic model in TensorFlow. 

# Let's start with basic imports. 

import pandas as pd 
import numpy as np
from collections import Counter, defaultdict
import tensorflow as tf
import matplotlib.pyplot as plt

%matplotlib inline

In [0]:
# Read the files
# These are real users and real advertisements from a single site.
# Start with 'ITC_20K.csv' (10MB), and later you can move to the ITC_40K.csv (20MB).

# Load the CSV, turn empty strings into missing values, and keep only the rows
# in which every column is populated.
data = (
    pd.read_csv('ITC_20K.csv')
      .replace('', np.nan)
      .dropna()
)

# Look at the data columns. Do you understand what they mean?  
data.head()

In [0]:
# Try to understand the meaning of each column
# Separate columns to user features and source/context/publisher features

# Column groups used by the rest of the notebook.
# The names are spelled out as explicit lists: a backslash continuation inside a
# string literal keeps the next line's leading spaces as part of the string, which
# would yield names such as "    target_id" that match no column in the data.

# Features describing the source / context / publisher side of a recommendation
source_features = [
    "source_id", "content_category", "ad_type", "quality_level", "source_item_type",
    "syndicator_id", "target_id", "campaign_id", "title", "campaign_language",
]

# Features describing the user who received the recommendation
user_features = [
    "user_id", "browser_platform", "os_family", "country_code", "os_name", "country",
    "region", "browser_name", "user_clicks", "user_recs", "prev_syndicator_clicks",
    "target_recs", "campaign_recs", "user_category_clicks", "user_category_recs",
]

# Name of the label column
label = "is_click"

In [0]:
# Our machine learning algorithm will handle each column according to its type. 
# Task: Separate the columns into lists of numeric and categorical features. Notice that even columns
# that contain numbers may be categorical features, like the _id features. 
# Notice we have numerical features stored as objects since they are arrays! Let's leave them out for now. 
# Hint: later cells build only_num_data = data[numeric_features] and then read its "is_click" column,
# so "is_click" has to be part of numeric_features.

numeric_features = ...
categorical_features = ...

In [0]:
# A basic check with data.dtypes shows that read_csv may infer the wrong type for some columns.
# print(...) with a single argument behaves the same on Python 2 and Python 3.
print(data.dtypes)

# To make sure all the numeric data is really numeric, convert the numeric columns explicitly.
# errors='coerce' turns every value that cannot be parsed into NaN instead of raising.
data[numeric_features] = data[numeric_features].apply(pd.to_numeric, errors='coerce')

In [0]:
# Let's answer a few basic questions before we start: 

# Task: How big is our data set? 
data_size = ...
total_num_features = ...

# Single-argument print(...) calls produce the same output on Python 2 and Python 3
print("number of samples: ")
print(data_size)
print("number of features: ")
# Last expression of the cell: shown as the cell's output
total_num_features

In [0]:
# Task: How many people clicked / haven't clicked? 

num_click = ...
num_no_click = ...

print("clicked: " + str(num_click))
print("haven't clicked: " + str(num_no_click))
# float() forces true division: on Python 2, int / int truncates, so the ratio of two
# integer counts would always be printed as 0.
print("clicked_ratio: " + str(float(num_click) / (num_no_click + num_click)))

In [0]:
# Which browsers do the visitors of this publisher use? 
# Task: Plot or count. Does this distribution look usual? 

In [0]:
# Let's try to catch some interesting "signal". 
# It is always helpful to plot the data before starting. 
# Try to find some connection between our numerical features and the "is_click" columns. 
# If you're going to use scatter, it is sometimes easier to add noise to the is_click column,
# to help see the spread of the data. 
# This MIGHT help: 
# noise = np.random.randn(data_size)/10
# is_clicked = data["is_click"].values + noise

In [0]:
# Enough exploration! 
# Let's try to build our first model: A simple neural net, only using our numerical features. 

# Keep only the numerical columns and discard every row that has a missing value
only_num_data = data.loc[:, numeric_features].dropna()
only_num_data.columns.tolist()

In [0]:
# Task: Create msk to split into train, test sets
# msk is used in the next cell as a row selector: only_num_data[msk] becomes the train set
# and only_num_data[~msk] becomes the test set.
msk = ... 

In [0]:
# train, test split
# Rows selected by the mask train the model; the remaining rows evaluate it
train, test = only_num_data[msk], only_num_data[~msk]

# Move the label column out of each frame so that only the features remain
y_train = train.pop("is_click")
y_test = test.pop("is_click")

In [0]:
# Let's start with logistic regression (a single sigmoid unit) with TensorFlow. 
# We are going to use the 5 numeric features we've gotten to know.
# This code is roughly based on: 
# https://github.com/aymericdamien/TensorFlow-Examples/blob/master/notebooks/2_BasicModels/logistic_regression.ipynb

# Important parameters for the model
# You can play with these, as you wish. 

# Size of the training set: rows are samples, columns are features
num_samples, num_features = train.shape

batch_size = 50                             # samples fed to the optimizer per step
training_epochs = 5                         # passes over the training set
total_batch = num_samples // batch_size     # number of full batches in one epoch
learning_rate = 0.08                        # gradient descent step size
print_every = 1                             # log the cost every this many epochs

In [0]:
# Lets build our TF Graph step-by-step

In [0]:
# TF Graph Input

# Dataset of features: one row per sample, num_features columns.
# The first dimension is None because the number of rows differs between feeds
# (training feeds batch_size rows at a time, evaluation feeds the whole test set).
x = tf.placeholder(tf.float32, [None, num_features])

# Labels: one value per sample, shaped as a column
y = tf.placeholder(tf.float32, [None, 1])

In [0]:
# Set model variable (= what the model is going to learn)
# W: one weight per input feature, initialised from a normal distribution with stddev 0.15.
# b: a single bias term, initialised to zero.

W = tf.Variable(tf.random_normal([num_features, 1], stddev=0.15), name="weights")
b = tf.Variable(tf.zeros([1]), name="bias")

In [0]:
# Create the model operations. 
# These are different calculations of the placeholder and variables. 

# Construct model: a weighted sum of the features plus a bias, squashed into (0, 1)
pred = tf.sigmoid(tf.matmul(x, W) + b)

# Minimize error using MSE
cost = tf.losses.mean_squared_error(labels = y, predictions = pred)

# Gradient Descent
optimizer = tf.train.GradientDescentOptimizer(learning_rate).minimize(cost)

# Initialize the variables.
# The initializer only covers variables that exist when it is created, so it is built
# last. Plain gradient descent adds no variables of its own, but an optimizer that
# does (e.g. Adam or Momentum) would otherwise be left uninitialized.
init = tf.global_variables_initializer()

In [0]:
# Start TensorFlow learning session. 

# Start training
def train_session(train, test, y_train, y_test):
    """Train the model of the current graph with mini-batches and print its test accuracy.

    Besides its arguments, the function reads the notebook-level graph objects
    (x, y, pred, cost, optimizer, init) and settings (training_epochs, total_batch,
    batch_size, print_every), so it always trains whatever was defined most recently.

    train, test:      feature DataFrames
    y_train, y_test:  the matching label Series

    Each epoch visits only the first total_batch * batch_size training rows.
    Nothing is returned; progress and the final accuracy are printed.
    """
    with tf.Session() as sess:

        # Give every variable its initial value
        sess.run(init)

        for epoch in range(training_epochs):
            avg_cost = 0.

            for i in range(total_batch):
                # Row window of the current batch
                start = i * batch_size
                stop = start + batch_size

                batch_xs = train.iloc[start:stop].values
                # Labels are fed as a column, matching the shape of the y placeholder
                batch_ys = y_train.iloc[start:stop].values.reshape([batch_size, 1])

                # One optimisation step; also fetch the cost of this batch
                _, batch_cost = sess.run([optimizer, cost],
                                         feed_dict={x: batch_xs, y: batch_ys})

                # Accumulate the mean cost over the epoch
                avg_cost += batch_cost / (1.0*total_batch)

            # Log the mean cost every print_every epochs
            if (epoch+1) % print_every == 0:
                print("Epoch:", '%02d' % (epoch+1), "cost=", avg_cost)

        print("Optimization Finished!")

        # A prediction counts as correct when it rounds to the label
        correct_prediction = tf.equal(tf.round(pred), y)
        # Accuracy is the fraction of correct predictions
        accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))

        # Evaluate on the whole test set in a single feed
        test_xs = test.values
        test_ys = y_test.values.reshape([len(y_test), 1])
        print("Accuracy:", accuracy.eval({x: test_xs, y: test_ys}))

In [0]:
train_session(train, test, y_train, y_test)

In [0]:
# The results should be about 48% 
# This is VERY low. 
# What are we missing ?
# Try again using normalization on the numeric columns. 
# Why should this help ? 

In [0]:
def normalize(df):
    """Return a min-max scaled copy of ``df``; the frame passed in is not modified.

    Every column is mapped to ``(value - min) / (max - min)`` using that column's own
    minimum and maximum. A column whose range is not strictly positive (for example
    a constant column) is copied through unchanged.
    """
    scaled = df.copy()
    for col in df.columns:
        values = df[col]
        lowest, highest = values.min(), values.max()
        spread = highest - lowest
        # Skip columns that cannot be scaled; this also avoids dividing by zero
        if not spread > 0:
            continue
        scaled[col] = (values - lowest) / spread
    return scaled

In [0]:
# Scale the features of both sets. normalize() is called on each frame separately,
# so each one is scaled with its own per-column minimum and maximum.
# NOTE(review): train and test may therefore end up on slightly different scales;
# consider scaling test with the statistics of train - confirm the intended behaviour.
train = normalize(train)
test = normalize(test)

In [0]:
# Re-run the training
# Exactly the same training from before! We are using the exact same graph with a different input.
train_session(train, test, y_train, y_test)

In [0]:
# Almost 55% - definitely a bit better!
# We're going to add 2 columns of numerical features to our data and do the calculation once again:
# These features are user_category_recs and user_category_clicks
# We have to parse these with these two ready-made functions: 

# A function that takes in a single cell that has array value and 
# makes sure all arrays are the same length = max_values
def split_cell_to_list(x, max_values=24, pad_value=[0], value_type='float'):
    """Split a space-separated cell into a list of fixed length.

    x:          cell content; expected to be a string such as "3 0 12"
    max_values: target length; longer cells are truncated, shorter ones are
                padded on the right
    pad_value:  list holding the padding entry; it is only read, never modified
    value_type: currently unused; kept so that existing calls keep working

    Returns the list of pieces. The pieces taken from the cell are still strings,
    only the padding has the type of the entries of ``pad_value``.
    If ``x`` is not a string, a diagnostic is printed and None is returned.
    """
    # Python 2 has a separate ``unicode`` text type; accept it too where it exists
    try:
        text_types = (str, unicode)
    except NameError:
        text_types = (str,)

    if not isinstance(x, text_types):
        # Single-argument print(...) gives the same output on Python 2 and Python 3
        print('cell with bad content')
        print('%s %s' % (type(x), x))
        return None

    parts = x.split(' ')[:max_values]
    # When the cell already holds max_values pieces, the padding is empty
    return parts + pad_value * (max_values - len(parts))

# A function that receives a dataframe and a column_name and explodes 
# that column to num_columns different columns          
def parse_and_add_columns(df, column_name):
    """Replace an array-valued column by one column per array position.

    df:          DataFrame holding ``column_name``
    column_name: column whose cells are space-separated strings

    Returns a new DataFrame in which ``column_name`` is replaced by the columns
    ``column_name + "0"``, ``column_name + "1"``, ... (one per position produced by
    split_cell_to_list). ``df`` itself is not modified. When ``column_name`` is not
    present, a message is printed and ``df`` is returned as is.
    """
    if column_name not in df.columns:
        print('columns already parsed!')
        return df

    parsed = df[column_name].apply(split_cell_to_list)

    # Build the expanded frame on df's own index. With a fresh 0..n-1 index the
    # label-based concat below would pair each parsed row with a different row of df
    # (and drop rows) whenever df's index has gaps, e.g. after dropna().
    expanded = pd.DataFrame(parsed.values.tolist(), index=df.index)
    expanded = expanded.rename(columns=lambda position: column_name + str(position))

    remaining = df.drop(column_name, axis=1)
    return pd.concat([remaining, expanded], axis=1, join='inner')

In [0]:
# two more columns
# The five numeric features used so far, the two array-valued features, and the label
numeric_features = [
    "user_recs", "prev_syndicator_clicks", "target_recs", "campaign_recs", "user_clicks",
    "user_category_clicks", "user_category_recs", "is_click",
]

# .copy() makes only_num_data an independent frame: the parsing step assigns into it,
# and assigning into a plain column selection of `data` triggers pandas'
# SettingWithCopyWarning.
only_num_data = data[numeric_features].copy()

# print the data set before parsing
only_num_data.head()

In [0]:
# parse data with given function "parse_and_add_columns"

# Expand both array-valued columns, one after the other
for array_column in ('user_category_recs', 'user_category_clicks'):
    only_num_data = parse_and_add_columns(only_num_data, array_column)

only_num_data.head()

# How many new features did we get? 

In [0]:
# Split into train, test sets
# Convert every column to numbers; values that cannot be parsed become NaN
only_num_data = only_num_data.apply(pd.to_numeric, errors='coerce')

# Random split: a row goes to train when its random draw is below 0.8.
# Incomplete rows are dropped and each part is then scaled on its own.
msk = np.random.rand(len(only_num_data)) < 0.8 
train = normalize(only_num_data[msk].dropna())
test = normalize(only_num_data[~msk].dropna())

# Create label: move it out of the frames so that only the features remain
y_train = train.pop("is_click")
y_test = test.pop("is_click")

In [0]:
# we need to restart our graph, because now we have a different num_features -> 
# Some of our variables are going to have different sizes

# Start from an empty graph so the placeholders and variables can be re-created
# with the new number of features
tf.reset_default_graph()

# Same model structure as before, rebuilt because num_features is different now.
# Note that the number of epochs, the learning rate and the weight stddev differ from the first run.

# Every column left in train is a feature (the label was removed above)
num_features = len(list(train))
num_samples = train.shape[0]
batch_size = 50
training_epochs=13
# Number of full batches per epoch
total_batch = int(num_samples/batch_size)
learning_rate = 0.8

# tf Graph Input: features and labels, any number of rows
x = tf.placeholder(tf.float32, [None, num_features])
y = tf.placeholder(tf.float32, [None, 1])

# Set model weights
# Alternative kept for experimentation: start from all-zero weights instead of random ones
# W = tf.Variable(tf.zeros([num_features, 1]), name="weight")
W = tf.Variable(tf.random_normal([num_features, 1], stddev=0.05), name="weights")
b = tf.Variable(tf.zeros([1]), name="bias")

# Construct model: sigmoid of the weighted feature sum plus bias
matmul = tf.matmul(x, W)
pred = tf.sigmoid(matmul + b)

# Minimize error using MSE (mean squared error), the same cost as in the first model
cost = tf.losses.mean_squared_error(labels = y, predictions = pred)

# Gradient Descent
optimizer = tf.train.GradientDescentOptimizer(learning_rate).minimize(cost)
# NOTE(review): display_step is not read by any code shown in this notebook
# (train_session logs according to print_every) - confirm whether it is still needed.
display_step = 1

# Initialize the variables (i.e. assign their default value)
init = tf.global_variables_initializer()

# Re-run the training; train_session picks up the graph objects and settings defined above
train_session(train, test, y_train, y_test)

In [0]:
# If you have time: 
# Change the learning rate.
# Change the cost function.
# What's the best accuracy you've got? 