In [1]:
import pandas as pd 
import numpy as np 
import sklearn
from sklearn.preprocessing import StandardScaler 
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

# Show the installed pandas, NumPy and scikit-learn versions alongside the results.
pd.__version__, np.__version__, sklearn.__version__

('1.3.5', '1.21.6', '1.0.2')

In [2]:
# Pull the match dataset straight from the course repository.
dataset = pd.read_csv(
    'https://raw.githubusercontent.com/marrekb/dl-ml-stat/main/02_tennis_winner_prediction/dataset.csv'
)

# Number of rows that were loaded.
len(dataset)

825124

In [3]:
# Preview the first rows to check the column layout.
dataset.head()

Unnamed: 0,age_diff,rank_diff,win_diff,loss_diff,swin_diff,sloss_diff,result,p1_hand_L,p1_hand_R,p1_hand_U,p2_hand_L,p2_hand_R,p2_hand_U
0,-4.542094,0.0,74,-8,69,7,0,0,1,0,0,1,0
1,10.663929,0.0,27,54,11,20,0,0,1,0,0,0,1
2,-8.602327,0.0,-20,-66,-8,-32,0,0,0,1,0,1,0
3,-5.103354,785.0,-90,-87,-74,-67,0,0,1,0,0,1,0
4,0.358658,-326.0,17,26,2,6,0,0,1,0,0,1,0


In [4]:
# Features: every column except the label.
X = dataset.drop(columns='result')

# Label: the 'result' column as a plain NumPy vector.
y = dataset['result'].to_numpy()


In [5]:
# Summary statistics of the feature columns, to see their scales before standardising.
X.describe()

Unnamed: 0,age_diff,rank_diff,win_diff,loss_diff,swin_diff,sloss_diff,p1_hand_L,p1_hand_R,p1_hand_U,p2_hand_L,p2_hand_R,p2_hand_U
count,825124.0,825124.0,825124.0,825124.0,825124.0,825124.0,825124.0,825124.0,825124.0,825124.0,825124.0,825124.0
mean,9.243224e-15,0.0,0.0,0.0,0.0,0.0,0.097238,0.660852,0.24191,0.097238,0.660852,0.24191
std,4.97985,433.177448,121.622344,73.374515,79.15656,46.272256,0.296281,0.47342,0.428241,0.296281,0.47342,0.428241
min,-38.21766,-2125.0,-1201.0,-515.0,-751.0,-266.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,-3.18412,-157.0,-53.0,-41.0,-30.0,-23.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
75%,3.18412,157.0,53.0,41.0,30.0,23.0,0.0,1.0,0.0,0.0,1.0,0.0
max,38.21766,2125.0,1201.0,515.0,751.0,266.0,1.0,1.0,1.0,1.0,1.0,1.0


In [6]:
# Only these numeric difference columns are standardised.
columns_to_scale = [
    'age_diff', 'rank_diff', 'win_diff',
    'loss_diff', 'swin_diff', 'sloss_diff',
]

# NOTE(review): the scaler is fitted on every row of X, so its mean/std also
# reflect rows that later end up in a held-out set.
scaler = StandardScaler()
scaled_X = scaler.fit_transform(X[columns_to_scale])

# Wrap the scaled array in a frame just to inspect its statistics.
df_scaled_X = pd.DataFrame(scaled_X, columns=columns_to_scale)
df_scaled_X.describe()

Unnamed: 0,age_diff,rank_diff,win_diff,loss_diff,swin_diff,sloss_diff
count,825124.0,825124.0,825124.0,825124.0,825124.0,825124.0
mean,1.619982e-15,1.85439e-15,-5.151909e-16,-3.328215e-15,-1.274453e-14,3.059447e-16
std,1.000001,1.000001,1.000001,1.000001,1.000001,1.000001
min,-7.674464,-4.905614,-9.874836,-7.01879,-9.487533,-5.748589
25%,-0.6394013,-0.3624383,-0.4357755,-0.5587775,-0.378996,-0.4970584
50%,3.707488e-18,0.0,0.0,0.0,0.0,0.0
75%,0.6394013,0.3624383,0.4357755,0.5587775,0.378996,0.4970584
max,7.674464,4.905614,9.874836,7.01879,9.487533,5.748589


In [7]:
# Indicator columns that are appended unscaled after the standardised ones.
hand_columns = ['p1_hand_L', 'p1_hand_R', 'p1_hand_U', 'p2_hand_L', 'p2_hand_R', 'p2_hand_U']
hand_flags = X[hand_columns].to_numpy()

# Final feature matrix: scaled columns first, then the hand indicators.
con_X = np.concatenate([scaled_X, hand_flags], axis=1)
con_X.shape

(825124, 12)

In [8]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    con_X,
    y,
    test_size=0.2,
    random_state=42,
)

# Sanity-check the resulting sizes.
(x_train.shape, y_test.shape)

((660099, 12), (165025,))

In [9]:
# Baseline: logistic regression with the library's default settings.
model = LogisticRegression().fit(x_train, y_train)

# Mean accuracy on the held-out rows.
model.score(x_test, y_test)

0.6620451446750493

In [10]:
# Intercept (bias term) learned by the baseline model.
model.intercept_

array([-0.0002241])

In [11]:
# Label each learned weight with a feature name.
# NOTE(review): this relies on the training matrix columns being in the same
# order as X.columns — confirm whenever the feature assembly changes.
df_coef = pd.DataFrame(model.coef_, columns = X.columns)
df_coef

Unnamed: 0,age_diff,rank_diff,win_diff,loss_diff,swin_diff,sloss_diff,p1_hand_L,p1_hand_R,p1_hand_U,p2_hand_L,p2_hand_R,p2_hand_U
0,0.117189,0.593004,-0.39738,0.182081,-0.230902,0.00308,-0.110345,-0.109134,0.219256,0.112991,0.109003,-0.222218


In [12]:
# The 'sag' solver shuffles the training data, so pin random_state to keep the
# fitted coefficients and the score reproducible across runs.
model2 = LogisticRegression(solver='sag', random_state=42)
model2.fit(x_train, y_train)

# Mean accuracy on the held-out rows.
model2.score(x_test, y_test)

0.6620451446750493

In [13]:
# Same model, optimised with the 'newton-cg' solver for comparison.
model3 = LogisticRegression(solver='newton-cg').fit(x_train, y_train)

# Mean accuracy on the held-out rows.
model3.score(x_test, y_test)

0.6620390849871232

In [14]:
# The 'liblinear' solver shuffles the training data, so pin random_state to keep
# the fitted coefficients and the score reproducible across runs.
model4 = LogisticRegression(solver='liblinear', random_state=42)
model4.fit(x_train, y_train)

# Mean accuracy on the held-out rows.
model4.score(x_test, y_test)

0.6620512043629753

In [15]:
# Side-by-side comparison of the parameters each solver converged to.
solver_models = {
    'lbfgs': model,
    'sag': model2,
    'newton-cg': model3,
    'liblinear': model4,
}

# One row per solver: its name, the intercept, then every coefficient.
solver_rows = [
    [solver_name, fitted.intercept_[0], *fitted.coef_[0]]
    for solver_name, fitted in solver_models.items()
]

df_info = pd.DataFrame(solver_rows, columns=['solver', 'intercept', *X.columns])
df_info

Unnamed: 0,solver,intercept,age_diff,rank_diff,win_diff,loss_diff,swin_diff,sloss_diff,p1_hand_L,p1_hand_R,p1_hand_U,p2_hand_L,p2_hand_R,p2_hand_U
0,lbfgs,-0.000224,0.117189,0.593004,-0.39738,0.182081,-0.230902,0.00308,-0.110345,-0.109134,0.219256,0.112991,0.109003,-0.222218
1,sag,-0.00029,0.11719,0.593017,-0.39734,0.182062,-0.230926,0.003086,-0.110342,-0.109108,0.219287,0.112986,0.10902,-0.222169
2,newton-cg,-0.000239,0.117188,0.592999,-0.397334,0.18207,-0.230927,0.003071,-0.110361,-0.109138,0.21926,0.112956,0.109011,-0.222205
3,liblinear,-0.000232,0.117194,0.593008,-0.39728,0.181965,-0.230927,0.003129,-0.110449,-0.109106,0.219323,0.113023,0.108957,-0.222212


In [16]:
# Per-row class probabilities on the test set (one column per class).
# NOTE(review): presumably column 0 is result == 0 and column 1 is result == 1
# (order of model.classes_) — confirm before interpreting the columns.
predictions = model.predict_proba(x_test)
predictions

array([[0.37449618, 0.62550382],
       [0.45969962, 0.54030038],
       [0.51853665, 0.48146335],
       ...,
       [0.26051735, 0.73948265],
       [0.57946255, 0.42053745],
       [0.36931691, 0.63068309]])

In [17]:
def measure_with_bound(prediction, target, bound):
  """Accuracy restricted to predictions whose confidence clears a threshold.

  A sample is kept when the larger of its two class probabilities is at
  least ``0.5 + bound``; it is then scored against the class with the larger
  probability (ties go to class 0). Every sample is counted at most once.

  Args:
    prediction: array-like of shape (n_samples, 2); column 0 is read as the
      probability of label 0 and column 1 as the probability of label 1.
    target: array-like of true labels (0 or 1), aligned with the rows of
      ``prediction``.
    bound: margin added to 0.5 to form the confidence threshold.

  Returns:
    ``(accuracy, count)``: ``count`` is the number of kept samples and
    ``accuracy`` the fraction of them predicted correctly. When no sample
    clears the threshold, ``accuracy`` is ``nan`` and ``count`` is 0.
  """
  # Accept lists / frames as well as arrays; a no-op for ndarray inputs.
  prediction = np.asarray(prediction)
  target = np.asarray(target)

  upper_bound = 0.5 + bound
  prob_zero = prediction[:, 0]
  prob_one = prediction[:, 1]

  # Predicted class is the more probable one, so a row can never be counted
  # for both classes (which happened when both columns cleared the threshold).
  predicted = (prob_one > prob_zero).astype(int)
  confident = np.maximum(prob_zero, prob_one) >= upper_bound

  count = confident.sum()
  if count == 0:
    # Nothing is confident enough: skip the 0/0 division (and its
    # RuntimeWarning) and report an undefined accuracy explicitly.
    return float('nan'), count

  correct = (predicted[confident] == target[confident]).sum()
  return correct / count, count



In [18]:
# Sweep the confidence margin from 0.05 to 0.45 and report how accuracy and
# coverage change as only more confident predictions are kept.
for percent in range(5, 50, 5):
  margin = percent / 100.0
  accuracy, kept = measure_with_bound(predictions, y_test, margin)
  report = 'bound: {b:.2f}, accuracy: {acc: .4f}, count: {count}'.format(b = 0.5 + margin, acc = accuracy, count = kept)
  print(report)

bound: 0.55, accuracy:  0.7024, count: 125377
bound: 0.60, accuracy:  0.7330, count: 92819
bound: 0.65, accuracy:  0.7583, count: 67499
bound: 0.70, accuracy:  0.7853, count: 48157
bound: 0.75, accuracy:  0.8085, count: 32724
bound: 0.80, accuracy:  0.8353, count: 20359
bound: 0.85, accuracy:  0.8623, count: 10790
bound: 0.90, accuracy:  0.8882, count: 4295
bound: 0.95, accuracy:  0.8938, count: 857
