In [1]:
import pandas as pd 
import numpy as np 
import sklearn
from sklearn.preprocessing import StandardScaler 
from sklearn.model_selection import train_test_split
import tensorflow as tf

pd.__version__, np.__version__, sklearn.__version__, tf.__version__

('1.3.5', '1.21.6', '1.0.2', '2.8.2')

In [2]:
# Load the match table; `result` is the binary label, every other column is a feature.
dataset = pd.read_csv('https://raw.githubusercontent.com/marrekb/dl-ml-stat/main/02_tennis_winner_prediction/dataset.csv')

y = dataset['result'].to_numpy()
X = dataset.drop('result', axis = 1)

columns_to_scale = ['age_diff', 'rank_diff', 'win_diff', 'loss_diff', 'swin_diff', 'sloss_diff']
hand_columns = ['p1_hand_L', 'p1_hand_R', 'p1_hand_U', 'p2_hand_L', 'p2_hand_R', 'p2_hand_U']

# Choose the train/test row positions BEFORE fitting the scaler, so that the
# mean/variance used for standardisation come from training rows only and no
# test-set statistics leak into the features. The partition depends only on
# the number of rows and random_state, so it is the same one that splitting
# the feature matrix directly would produce.
train_idx, test_idx = train_test_split(np.arange(len(X)), test_size = 0.2, random_state = 42)

scaler = StandardScaler()
scaler.fit(X.iloc[train_idx][columns_to_scale])

# Apply the training-set statistics to every row.
scaled_X = scaler.transform(X[columns_to_scale])
df_scaled_X = pd.DataFrame(scaled_X, columns = columns_to_scale)

# Final feature matrix: the scaled difference columns followed by the hand indicator columns.
con_X = np.concatenate((scaled_X, X[hand_columns].to_numpy()), axis = 1)

x_train, x_test = con_X[train_idx], con_X[test_idx]
y_train, y_test = y[train_idx], y[test_idx]
x_train.shape, y_test.shape

((660099, 12), (165025,))

In [3]:
# Show every device TensorFlow can see (no device-type filter) to confirm whether a GPU is available.
tf.config.list_physical_devices(device_type = None)

[PhysicalDevice(name='/physical_device:CPU:0', device_type='CPU'),
 PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]

In [4]:
def measure_with_bound_core(y_true, y_pred, lower_bound, upper_bound, return_count = False):
  """Accuracy computed only over predictions that fall outside a confidence band.

  A prediction is treated as a confident 0 when `y_pred <= lower_bound` and as
  a confident 1 when `y_pred >= upper_bound`. Predictions strictly between the
  two bounds are ignored.

  Args:
    y_true: labels (0 or 1); must contain as many elements as `y_pred`.
    y_pred: predicted scores, compared directly against the bounds.
    lower_bound: scores at or below this value count as predicting class 0.
    upper_bound: scores at or above this value count as predicting class 1.
    return_count: when True, also return how many predictions were confident.

  Returns:
    A scalar float32 tensor with the fraction of confident predictions that
    match their label (0.0 when no prediction is confident). When
    `return_count` is True, a tuple `(accuracy, count)` where `count` is a
    scalar float32 tensor holding the real number of confident predictions
    (0.0 when there are none).
  """
  # Flatten both inputs so that labels of shape (N,) and predictions of shape
  # (N, 1) are compared element by element instead of depending on mask layout.
  y_pred = tf.reshape(tf.convert_to_tensor(y_pred), [-1])
  y_true = tf.reshape(tf.convert_to_tensor(y_true), [-1])

  zeros_idx = y_pred <= lower_bound
  ones_idx = y_pred >= upper_bound

  # A hit is a confident prediction whose label agrees with the predicted class.
  true_zeros = tf.logical_and(zeros_idx, tf.equal(y_true, tf.cast(0, y_true.dtype)))
  true_ones = tf.logical_and(ones_idx, tf.equal(y_true, tf.cast(1, y_true.dtype)))

  true_zeros_sum = tf.reduce_sum(tf.cast(true_zeros, dtype = tf.float32))
  true_ones_sum = tf.reduce_sum(tf.cast(true_ones, dtype = tf.float32))

  zeros_sum = tf.reduce_sum(tf.cast(zeros_idx, dtype = tf.float32))
  ones_sum = tf.reduce_sum(tf.cast(ones_idx, dtype = tf.float32))

  count = zeros_sum + ones_sum
  # divide_no_nan yields 0 for an empty band without Python control flow on a
  # tensor, so the result is always a scalar and `count` is never falsified.
  accuracy = tf.math.divide_no_nan(true_zeros_sum + true_ones_sum, count)

  if return_count:
    return accuracy, count
  return accuracy

def measure_with_bound2(y_true, y_pred):
  """Metric: accuracy over predictions that are <= 0.3 or >= 0.7."""
  return measure_with_bound_core(y_true, y_pred, lower_bound = 0.3, upper_bound = 0.7)

def measure_with_bound1(y_true, y_pred):
  """Metric: accuracy over predictions that are <= 0.4 or >= 0.6."""
  return measure_with_bound_core(y_true, y_pred, lower_bound = 0.4, upper_bound = 0.6)

def measure_with_bound3(y_true, y_pred):
  """Metric: accuracy over predictions that are <= 0.2 or >= 0.8."""
  return measure_with_bound_core(y_true, y_pred, lower_bound = 0.2, upper_bound = 0.8)


In [5]:
# Seed the Python, NumPy and TensorFlow random generators so that weight
# initialisation and batch shuffling are repeatable from run to run.
# NOTE(review): some GPU kernels may still be non-deterministic.
tf.keras.utils.set_random_seed(42)

# Two hidden ReLU layers of 64 units and a single sigmoid output.
# The input width is taken from the training matrix instead of being hard-coded.
model = tf.keras.Sequential([
  tf.keras.layers.Dense(64, activation = 'relu', input_shape = (x_train.shape[1],)),
  tf.keras.layers.Dense(64, activation = 'relu'),
  tf.keras.layers.Dense(1, activation = 'sigmoid')
])

# Besides plain binary accuracy, track accuracy restricted to the three confidence bands.
model.compile(optimizer = tf.keras.optimizers.Adam(learning_rate=0.001),
              loss = tf.keras.losses.BinaryCrossentropy(),
              metrics = [tf.keras.metrics.BinaryAccuracy(), measure_with_bound1, measure_with_bound2, measure_with_bound3])

model.fit(x_train, y_train, epochs = 20, batch_size=512)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x7f1320118790>

In [6]:
# Held-out evaluation; values come back in compile order:
# loss, binary accuracy, then measure_with_bound1, measure_with_bound2, measure_with_bound3.
model.evaluate(x = x_test, y = y_test, batch_size = 2048)



[0.587731659412384,
 0.6888956427574158,
 0.7470558285713196,
 0.8037779927253723,
 0.8602906465530396]

In [7]:
# Batch the held-out features so inference runs in chunks of 2048 rows.
# NOTE: this rebinds `dataset`, which earlier held the raw pandas DataFrame.
dataset = tf.data.Dataset.from_tensor_slices(x_test).batch(2048)

# Run the model in inference mode on each batch, then concatenate once at the
# end rather than re-copying the growing result on every iteration.
batch_predictions = [model(x, training = False) for x in dataset]
predictions = tf.concat(batch_predictions, axis = 0)


In [8]:
# Held-out labels as a column vector.
# NOTE: this rebinds `y`, which earlier held the full label array.
y = tf.reshape(tf.constant(y_test), shape = [-1, 1])

report_template = 'bound: {b:.2f}, accuracy: {acc: .4f}, count: {count}'

# Sweep the half-width of the ignored band around 0.5 from 0.05 to 0.45.
for percent in range(5, 50, 5):
  b = percent / 100.0
  lower, upper = 0.5 - b, 0.5 + b
  acc, count = measure_with_bound_core(y, predictions, lower, upper, return_count = True)
  print(report_template.format(b = upper, acc = acc, count = int(count)))

bound: 0.55, accuracy:  0.7188, count: 140434
bound: 0.60, accuracy:  0.7471, count: 116508
bound: 0.65, accuracy:  0.7741, count: 92818
bound: 0.70, accuracy:  0.8037, count: 69856
bound: 0.75, accuracy:  0.8312, count: 48309
bound: 0.80, accuracy:  0.8601, count: 29014
bound: 0.85, accuracy:  0.8862, count: 13558
bound: 0.90, accuracy:  0.9099, count: 3653
bound: 0.95, accuracy:  0.9103, count: 290


In [9]:
# Seed the Python, NumPy and TensorFlow random generators so that weight
# initialisation, dropout masks and batch shuffling are repeatable from run to run.
# NOTE(review): some GPU kernels may still be non-deterministic.
tf.keras.utils.set_random_seed(42)

# Wider and deeper variant: three hidden ReLU layers of 96 units, each followed
# by 20% dropout, and a single sigmoid output.
# The input width is taken from the training matrix instead of being hard-coded.
model2 = tf.keras.Sequential([
  tf.keras.layers.Dense(96, activation = 'relu', input_shape = (x_train.shape[1],)),
  tf.keras.layers.Dropout(0.2),
  tf.keras.layers.Dense(96, activation = 'relu'),
  tf.keras.layers.Dropout(0.2),
  tf.keras.layers.Dense(96, activation = 'relu'),
  tf.keras.layers.Dropout(0.2),
  tf.keras.layers.Dense(1, activation = 'sigmoid')
])

# Besides plain binary accuracy, track accuracy restricted to the three confidence bands.
model2.compile(optimizer = tf.keras.optimizers.Adam(learning_rate=0.001),
              loss = tf.keras.losses.BinaryCrossentropy(),
              metrics = [tf.keras.metrics.BinaryAccuracy(), measure_with_bound1, measure_with_bound2, measure_with_bound3])

model2.fit(x_train, y_train, epochs = 30, batch_size=512)

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


<keras.callbacks.History at 0x7f129661e310>

In [10]:
# Held-out evaluation; values come back in compile order:
# loss, binary accuracy, then measure_with_bound1, measure_with_bound2, measure_with_bound3.
model2.evaluate(x = x_test, y = y_test, batch_size = 2048)



[0.5889211297035217,
 0.6882290840148926,
 0.7470332980155945,
 0.8045710921287537,
 0.8642696142196655]

In [11]:
# Build the batched test set inside this cell so it does not depend on a
# variable left behind by an earlier cell.
test_batches = tf.data.Dataset.from_tensor_slices(x_test).batch(2048)

# training = False disables the dropout layers during inference. Collect the
# per-batch outputs and concatenate once rather than growing a tensor in the loop.
batch_predictions = [model2(x, training = False) for x in test_batches]
predictions = tf.concat(batch_predictions, axis = 0)

# Held-out labels as a column vector.
y = tf.reshape(tf.constant(y_test), shape = [-1, 1])

# Sweep the half-width of the ignored band around 0.5 from 0.05 to 0.45.
for i in range(5, 50, 5):
  b = i / 100.0
  acc, count = measure_with_bound_core(y, predictions, 0.5 - b, 0.5 + b, True)
  print('bound: {b:.2f}, accuracy: {acc: .4f}, count: {count}'.format(b = 0.5 + b, acc = acc, count = int(count)))

bound: 0.55, accuracy:  0.7199, count: 138528
bound: 0.60, accuracy:  0.7470, count: 115128
bound: 0.65, accuracy:  0.7746, count: 92725
bound: 0.70, accuracy:  0.8045, count: 68531
bound: 0.75, accuracy:  0.8356, count: 44975
bound: 0.80, accuracy:  0.8643, count: 25045
bound: 0.85, accuracy:  0.8885, count: 10242
bound: 0.90, accuracy:  0.9099, count: 2631
bound: 0.95, accuracy:  0.9395, count: 248
