In [19]:
import tensorflow as tf
import numpy as np
import pandas as pd
import keras
from sklearn.neighbors import KNeighborsClassifier as KNNClassifier

In [34]:
tonpyarray = lambda x: np.asarray(x).astype('float32')
data = pd.read_csv('australia.csv').sample(frac=1)

train_test_percentage = 0.8
rows = len(data.index)
# Split train/eval
dftrain = data.head(int(rows * train_test_percentage))
dfeval = data.tail(int(rows * (1 - train_test_percentage)))

# Boosted trees

In [16]:
fc = tf.feature_column



def make_input_fn(X, y, n_epochs=None, shuffle=True):
  def input_fn():
    dataset = tf.data.Dataset.from_tensor_slices((X.to_dict(orient='list'), y))
    if shuffle:
      dataset = dataset.shuffle(2534)
    dataset = (dataset
      .repeat(n_epochs)
      .batch(2534))
    return dataset
  return input_fn

def one_hot_cat_column(feature_name, vocab):
    return fc.indicator_column(
    fc.categorical_column_with_vocabulary_list(feature_name,
                                                vocab))

y_train = dftrain.pop('RainTomorrow')
y_eval = dfeval.pop('RainTomorrow')
y_train = tonpyarray(y_train)
y_eval = tonpyarray(y_eval)

params = {
    'n_trees': 100,
    'max_depth': 6,
    'learning_rate':5e-3,
    'n_batches_per_layer': 1,
    'center_bias': True
}

feature_columns = []
for feature_name in data.columns[:-1]:
    feature_columns.append(fc.numeric_column(feature_name,
                                            dtype=tf.float32))

est = tf.estimator.BoostedTreesClassifier(feature_columns, **params)
# Train model.
train_input_fn = make_input_fn(dftrain, y_train)
est.train(train_input_fn, max_steps=1000)

# # Evaluation.
eval_input_fn = make_input_fn(dfeval, y_eval, shuffle=False, n_epochs=1)
trees_result = est.evaluate(eval_input_fn)
print(trees_result)

INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_model_dir': '/tmp/tmpdmr9rjlm', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': allow_soft_placement: true
graph_options {
  rewrite_options {
    meta_optimizer_iterations: ONE
  }
}
, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_experimental_max_worker_delay_secs': None, '_session_creation_timeout_secs': 7200, '_checkpoint_save_graph_def': True, '_service': None, '_cluster_spec': ClusterSpec({}), '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}
INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:ten

# K neighbors

In [42]:
knn = KNNClassifier(
    n_neighbors=64,
    weights='distance',
    algorithm='kd_tree',
    leaf_size = 40,
    p=1.5,
    n_jobs = -1
)
x_train = tonpyarray(dftrain[data.columns[:-1]])
y_train = tonpyarray(dftrain[data.columns[-1]])
knn.fit(x_train,y_train)

KNeighborsClassifier(algorithm='kd_tree', leaf_size=40, n_jobs=-1,
                     n_neighbors=64, p=1.5, weights='distance')

In [44]:
x_eval = tonpyarray(dfeval[data.columns[:-1]])
y_eval = tonpyarray(dfeval[data.columns[-1]])
y_pred = knn.predict(x_eval)

In [61]:
tp = np.sum((y_eval==1) &  (y_pred==1))
tn = np.sum((y_eval==0) &  (y_pred==0))
fp = np.sum((y_eval==0) &  (y_pred==1))
fn = np.sum((y_eval==1) &  (y_pred==0))

knn_result = {
    'accuracy' : (tp+tn)/(tp+tn+fp+fn),
    'recall' :  (tp)/(tp+fp),
    'precision' : (tp)/(tp+fn)
}


# Small neural net


In [69]:
y_train_o[0]
x_train.shape

(45136, 17)

In [74]:
onehot = lambda arr : np.array([np.array([1,0]) if v == 1 else np.array([0,1]) for v in arr])
neural = keras.Sequential([
    keras.layers.Dense(64,activation='relu'),
    keras.layers.Dropout(0.5),
    keras.layers.Dense(64,activation='relu'),
    keras.layers.Dropout(0.5),
    keras.layers.Dense(2,activation='softmax') 
])
optimizer = keras.optimizers.RMSprop(lr=0.001, rho=0.9, epsilon=1e-08, decay=0.0)
neural.compile(optimizer = optimizer , loss = "binary_crossentropy", metrics=["accuracy","Recall","Precision"])

y_train_o = onehot(y_train)
y_eval_o = onehot(y_eval)

neural_history = neural.fit(x_train,y_train_o,epochs=10,validation_data=(x_eval,y_eval_o))

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [78]:
neural_result = {
    'accuracy' : neural_history.history['accuracy'][-1],
    'recall' :  neural_history.history['recall'][-1],
    'precision' : neural_history.history['precision'][-1]
}

In [81]:
space = '    '
for name, d in zip(['Boosted trees','K neighbors', 'Neural network'],[trees_result,knn_result,neural_result]):
    print(name + ':')
    for k in ['accuracy','recall','precision']:
        print(space + f'{k}: {d[k]:.4f}')


Boosted trees:
    accuracy: 0.8583
    recall: 0.4882
    precision: 0.7903
K neighbors:
    accuracy: 0.8559
    recall: 0.7802
    precision: 0.4878
Neural network:
    accuracy: 0.8333
    recall: 0.8333
    precision: 0.8333


# Comparison

Boosted trees, despite having the highest accuracy, have a small recall rate, probably due to being made out of weak learners, which have a harder time remembering datapoints compared to other models.

K neighbors have a much higher recall than precision, which might be due to recognizing similar datapoints with much ease using plain old distance.

The neural network has around equal accuracy, recall and precision, signifying that FP ~= FN, which means that it is biased - as P != N in the set. 

Boosted trees are probably the best choice for an unbiased, quickly learning model of this calibre, as they scale much better than K neighbors, and are simpler and perform relatively better than the neural network.