In [1]:
import pandas as pd
import numpy as np
from sklearn import preprocessing
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import roc_auc_score

In [2]:
n_rows = 1000000
df = pd.read_csv("train.gz", nrows=n_rows, compression="infer")

In [3]:
for c in df.columns:
    df[c]=df[c].apply(str)
    le=preprocessing.LabelEncoder().fit(df[c])
    df[c] =le.transform(df[c])
    pd.to_numeric(df[c]).astype(np.float)

In [4]:
X = df.drop(['click', 'id', 'hour', 'device_id', 'device_ip'], axis=1).values
Y = df['click'].values

print(X.shape, Y.shape)



(500000, 19) (500000,)


In [5]:
df.head()

Unnamed: 0,id,click,hour,C1,banner_pos,site_id,site_domain,site_category,app_id,app_domain,...,device_type,device_conn_type,C14,C15,C16,C17,C18,C19,C20,C21
0,5,0,0,2,0,215,1503,2,1518,62,...,1,1,102,4,6,18,0,19,0,27
1,9,0,0,2,0,215,1503,2,1518,62,...,1,0,100,4,6,18,0,19,60,27
2,17,0,0,2,0,215,1503,2,1518,62,...,1,0,100,4,6,18,0,19,60,27
3,28,0,0,2,0,215,1503,2,1518,62,...,1,0,102,4,6,18,0,19,60,27
4,31,0,0,2,1,1687,910,0,1518,62,...,1,0,196,4,6,59,0,19,0,8


In [6]:
n_train = int(n_rows * 0.9)
X_train = X[:n_train]
Y_train = Y[:n_train]
X_test = X[n_train:]
Y_test = Y[n_train:]

In [7]:
enc = OneHotEncoder(handle_unknown='ignore')
X_train_enc = enc.fit_transform(X_train)
X_test_enc = enc.transform(X_test)

In [8]:
!pip install tensorflow
import tensorflow as tf
from tensorflow.contrib.tensor_forest.python import tensor_forest
from tensorflow.python.ops import resources

[33mYou are using pip version 10.0.1, however version 19.1.1 is available.
You should consider upgrading via the 'pip install --upgrade pip' command.[0m


In [9]:
n_iter = 50
n_classes = 2
n_features = int(X_train_enc.toarray().shape[1])
n_trees = 10
max_nodes = 30000

In [10]:
x = tf.placeholder(tf.float32, shape=[None, n_features])
y = tf.placeholder(tf.int64, shape=[None])

In [11]:
hparams = tensor_forest.ForestHParams(num_classes=n_classes, num_features=n_features, num_trees=n_trees,
                                      max_nodes=max_nodes, split_after_samples=50).fill()

In [12]:
forest_graph = tensor_forest.RandomForestGraphs(hparams)

INFO:tensorflow:Constructing forest with params = 
INFO:tensorflow:{'num_trees': 10, 'max_nodes': 30000, 'bagging_fraction': 1.0, 'feature_bagging_fraction': 1.0, 'num_splits_to_consider': 97, 'max_fertile_nodes': 0, 'split_after_samples': 50, 'valid_leaf_threshold': 1, 'dominate_method': 'bootstrap', 'dominate_fraction': 0.99, 'model_name': 'all_dense', 'split_finish_name': 'basic', 'split_pruning_name': 'none', 'collate_examples': False, 'checkpoint_stats': False, 'use_running_stats_method': False, 'initialize_average_splits': False, 'inference_tree_paths': False, 'param_file': None, 'split_name': 'less_or_equal', 'early_finish_check_every_samples': 0, 'prune_every_samples': 0, 'num_classes': 2, 'num_features': 9571, 'bagged_num_features': 9571, 'bagged_features': None, 'regression': False, 'num_outputs': 1, 'num_output_columns': 3, 'base_random_seed': 0, 'leaf_model_type': 0, 'stats_model_type': 0, 'finish_type': 0, 'pruning_type': 0, 'split_type': 0}
Instructions for updating:
Colo

In [13]:
train_op = forest_graph.training_graph(x, y)
loss_op = forest_graph.training_loss(x, y)

Instructions for updating:
Use tf.cast instead.
Instructions for updating:
Use tf.cast instead.


In [14]:
infer_op, _, _ = forest_graph.inference_graph(x)

Instructions for updating:
Deprecated in favor of operator or tf.math.divide.


In [15]:
auc = tf.metrics.auc(tf.cast(y, tf.int64), infer_op[:, 1])[1]

In [16]:
init_vars = tf.group(tf.global_variables_initializer(), tf.local_variables_initializer(), resources.initialize_resources(resources.shared_resources()))

sess = tf.Session()

sess.run(init_vars)

In [17]:
batch_size = 1000

In [18]:
indices = list(range(n_train))

In [19]:
def gen_batch(indices):
    np.random.shuffle(indices)
    for batch_i in range(int(n_train / batch_size)):
        batch_index = indices[batch_i*batch_size: (batch_i+1)*batch_size]
        yield X_train_enc[batch_index], Y_train[batch_index]

In [20]:
for i in range(1, n_iter + 1):
    for X_batch, Y_batch in gen_batch(indices):
        _, l = sess.run([train_op, loss_op], feed_dict={x: X_batch.toarray(), y: Y_batch})
    acc_train = sess.run(auc, feed_dict={x: X_train_enc.toarray(), y: Y_train})
    print('Iteration %i, AUC of ROC on training set: %f' % (i, acc_train))
    acc_test = sess.run(auc, feed_dict={x: X_test_enc.toarray(), y: Y_test})
    print("AUC of ROC on testing set:", acc_test)

Iteration 1, AUC of ROC on training set: 0.751233
AUC of ROC on testing set: 0.74951583
Iteration 2, AUC of ROC on training set: 0.754487
AUC of ROC on testing set: 0.7536498
Iteration 3, AUC of ROC on training set: 0.757522
AUC of ROC on testing set: 0.7570152
Iteration 4, AUC of ROC on training set: 0.760221
AUC of ROC on testing set: 0.7598135
Iteration 5, AUC of ROC on training set: 0.762542
AUC of ROC on testing set: 0.76218456
Iteration 6, AUC of ROC on training set: 0.764548
AUC of ROC on testing set: 0.76422405
Iteration 7, AUC of ROC on training set: 0.766393
AUC of ROC on testing set: 0.7660935
Iteration 8, AUC of ROC on training set: 0.768097
AUC of ROC on testing set: 0.7678178
Iteration 9, AUC of ROC on training set: 0.769666
AUC of ROC on testing set: 0.76940334
Iteration 10, AUC of ROC on training set: 0.771129
AUC of ROC on testing set: 0.7708833
Iteration 11, AUC of ROC on training set: 0.772491
AUC of ROC on testing set: 0.7722548
Iteration 12, AUC of ROC on training 

In [21]:
#With no feature engineering, we achieved a ROCAUC of almost 80