# Read dataset

In [219]:
import pandas as pd

In [220]:
df = pd.read_csv("Tennis_sample.csv")
df.head()

Unnamed: 0,player_a,player_b,elo_points_a,elo_points_b,h2h_wins_a,h2h_wins_b,last5_winrate_a,last5_winrate_b,surface_winrate_a,surface_winrate_b,winrate_a,winrate_b,result
0,Alexander Zverev,Arthur Fils,2070,1954,3.0,2.0,0.8,0.6,0.79,0.68,0.76,0.64,1.0
1,Lorenzo Musetti,Daniil Medvedev,1999,2002,0.0,2.0,0.8,0.8,0.82,0.69,0.71,0.63,1.0
2,Casper Ruud,Alexander Bublik,1988,1786,6.0,1.0,1.0,0.8,0.81,0.59,0.66,0.44,1.0
3,Francisco Cerundolo,Jakub Mensik,1950,1951,1.0,0.0,0.8,0.8,0.73,0.64,0.63,0.61,1.0
4,Carlos Alcaraz,Karen Khachanov,2166,1892,4.0,0.0,0.8,0.6,0.92,0.58,0.83,0.55,1.0


# Preprocess Dataset

## Explore dataset

In [221]:
df.shape

(172, 13)

In [222]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 172 entries, 0 to 171
Data columns (total 13 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   player_a           172 non-null    object 
 1   player_b           172 non-null    object 
 2   elo_points_a       172 non-null    int64  
 3   elo_points_b       172 non-null    int64  
 4   h2h_wins_a         172 non-null    float64
 5   h2h_wins_b         172 non-null    float64
 6   last5_winrate_a    172 non-null    object 
 7   last5_winrate_b    172 non-null    object 
 8   surface_winrate_a  172 non-null    object 
 9   surface_winrate_b  172 non-null    object 
 10  winrate_a          172 non-null    object 
 11  winrate_b          172 non-null    object 
 12  result             166 non-null    float64
dtypes: float64(3), int64(2), object(8)
memory usage: 17.6+ KB


--> Columns 6 to 11 need to be checked because they are supposed to be float64

In [223]:
df.describe()

Unnamed: 0,elo_points_a,elo_points_b,h2h_wins_a,h2h_wins_b,result
count,172.0,172.0,172.0,172.0,166.0
mean,1800.534884,1760.19186,0.866279,0.709302,0.493976
std,166.175958,175.555104,1.405762,1.314483,0.501476
min,1380.0,1234.0,0.0,0.0,0.0
25%,1684.0,1658.75,0.0,0.0,0.0
50%,1788.5,1751.5,0.0,0.0,0.0
75%,1912.25,1877.0,1.0,1.0,1.0
max,2210.0,2210.0,7.0,7.0,1.0


In [224]:
df.isnull().sum()

player_a             0
player_b             0
elo_points_a         0
elo_points_b         0
h2h_wins_a           0
h2h_wins_b           0
last5_winrate_a      0
last5_winrate_b      0
surface_winrate_a    0
surface_winrate_b    0
winrate_a            0
winrate_b            0
result               6
dtype: int64

--> Some results are missing in the column "Result"

In [225]:
df.duplicated().sum()

np.int64(0)

## Cleaning the dataset

Convert the columns 6 to 11 from object to float

In [226]:
columns_to_convert = [
    'last5_winrate_a', 'last5_winrate_b',
    'surface_winrate_a', 'surface_winrate_b',
    'winrate_a', 'winrate_b'
]
for col in columns_to_convert:
    print(f"Valeurs uniques dans {col} :")
    print(df[col].unique())
    print("\n")


Valeurs uniques dans last5_winrate_a :
['0.8' '1.00' '0.6' '0.2' '0.00' '0.80' '0.20' '0.60' '0.40' '0.4' '0,8'
 '0,6' '0,4' '1' '0,2' '0' '0,55']


Valeurs uniques dans last5_winrate_b :
['0.6' '0.8' '0.2' '1.00' '0.40' '0.60' '0.80' '0.4' '0' '0,8' '0,6' '0,2'
 '1' '0,4']


Valeurs uniques dans surface_winrate_a :
['0.79' '0.82' '0.81' '0.73' '0.92' '0.52' '0.89' '0.75' '0.56' '0.17'
 '0.67' '0.49' '0.39' '0.64' '0.45' '0.59' '0.50' '0.33' '0.66' '0.69'
 '0.83' '0.43' '0.68' '0.60' '0.58' '0.5' '0.31' '0.25' '0.42' '0.62'
 '0.54' '0.53' '0.55' '0,93' '0,61' '0,55' '0,7' '0,31' '0,65' '0,58'
 '0,73' '0,52' '0,64' '0,62' '0,5' '0,56' '0,54' '0,33' '0,46' '0,25'
 '0,47' '0' '0,39' '0,45' '0,67' '0,6' '0,53' '0,69' '0,8' '0,63' '0,51'
 '0,75' '0,77' '0,59' '0,72' '0,74' '0,4' '0,22' '0,36' '0,68' '0,43'
 '0,2' '0,71']


Valeurs uniques dans surface_winrate_b :
['0.68' '0.69' '0.59' '0.64' '0.58' '0.65' '0.73' '0.72' '0.5' '0.81'
 '0.92' '0.60' '0.75' '0.82' '0.52' '0.53' '0.56' '0.67' '0

--> They are considered as object because some values have a "," instead of a "."

In [227]:
def clean_to_float(series):
    return (
        series.astype(str)
        .str.replace(',', '.', regex=False)
        .str.strip()
        .astype(float)
    )

for col in columns_to_convert:
    df[col] = clean_to_float(df[col])

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 172 entries, 0 to 171
Data columns (total 13 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   player_a           172 non-null    object 
 1   player_b           172 non-null    object 
 2   elo_points_a       172 non-null    int64  
 3   elo_points_b       172 non-null    int64  
 4   h2h_wins_a         172 non-null    float64
 5   h2h_wins_b         172 non-null    float64
 6   last5_winrate_a    172 non-null    float64
 7   last5_winrate_b    172 non-null    float64
 8   surface_winrate_a  172 non-null    float64
 9   surface_winrate_b  172 non-null    float64
 10  winrate_a          172 non-null    float64
 11  winrate_b          172 non-null    float64
 12  result             166 non-null    float64
dtypes: float64(9), int64(2), object(2)
memory usage: 17.6+ KB


Handling missing values

In [228]:
df.dropna(subset=["result"], axis=0, inplace=True)
df.isnull().sum()

player_a             0
player_b             0
elo_points_a         0
elo_points_b         0
h2h_wins_a           0
h2h_wins_b           0
last5_winrate_a      0
last5_winrate_b      0
surface_winrate_a    0
surface_winrate_b    0
winrate_a            0
winrate_b            0
result               0
dtype: int64

Balancing the classes

In [229]:
df["result"].value_counts()

result
0.0    84
1.0    82
Name: count, dtype: int64

In [230]:
df_0 = df[df["result"] == 0]
df_1 = df[df["result"] == 1]

min_len = min(len(df_0), len(df_1))

df_0_balanced = df_0.sample(n=min_len, random_state=42)
df_1_balanced = df_1.sample(n=min_len, random_state=42)

df_balanced = pd.concat([df_0_balanced, df_1_balanced]).sample(frac=1, random_state=42).reset_index(drop=True)

df_balanced["result"].value_counts()

result
1.0    82
0.0    82
Name: count, dtype: int64

# Determine features and label

In [231]:
from sklearn.preprocessing import StandardScaler

Creating features from the original variables to predict the result

In [232]:
df_balanced["elo_diff"] = df_balanced["elo_points_a"] - df_balanced["elo_points_b"]
df_balanced["h2h_diff"] = df_balanced["h2h_wins_a"] - df_balanced["h2h_wins_b"]
df_balanced["form5_diff"] = df_balanced["last5_winrate_a"] - df_balanced["last5_winrate_b"]
df_balanced["surface_winrate_diff"] = df_balanced["surface_winrate_a"] - df_balanced["surface_winrate_b"]
df_balanced["winrate_diff"] = df_balanced["winrate_a"] - df_balanced["winrate_b"]

features = ["elo_diff", "h2h_diff", "form5_diff", "surface_winrate_diff","winrate_diff"]

Data Transformation

In [233]:
scaler = StandardScaler()

df_balanced[features] = scaler.fit_transform(df_balanced[features])
df_balanced[features].head()

Unnamed: 0,elo_diff,h2h_diff,form5_diff,surface_winrate_diff,winrate_diff
0,0.112132,-0.065264,0.624919,0.498524,0.942351
1,-2.508337,-0.628591,-2.654656,-1.153728,-1.087036
2,-0.222506,-0.065264,-0.030996,0.726421,0.083765
3,0.328359,0.498064,-0.686911,2.150777,-0.462609
4,-0.109244,-0.065264,0.624919,1.182215,-1.165089


Separate features and label

In [234]:
x = df_balanced[features]
y = df_balanced["result"]

In [235]:
x.head()

Unnamed: 0,elo_diff,h2h_diff,form5_diff,surface_winrate_diff,winrate_diff
0,0.112132,-0.065264,0.624919,0.498524,0.942351
1,-2.508337,-0.628591,-2.654656,-1.153728,-1.087036
2,-0.222506,-0.065264,-0.030996,0.726421,0.083765
3,0.328359,0.498064,-0.686911,2.150777,-0.462609
4,-0.109244,-0.065264,0.624919,1.182215,-1.165089


In [236]:
y.head()

0    1.0
1    1.0
2    1.0
3    0.0
4    1.0
Name: result, dtype: float64

# Train the model

In [237]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV

Split training and testing dataset

In [238]:
x_train, x_test, y_train, y_test = train_test_split(x, y, train_size=0.8)

In [239]:
x_train.shape

(131, 5)

In [240]:
y_train.shape

(131,)

Model training

In [241]:
param_grid = {
    "penalty": ["l1", "l2"],
    "C": [0.01, 0.1, 1, 10],
    "solver": ["liblinear"],
}

grid = GridSearchCV(LogisticRegression(max_iter=1000), param_grid, cv=5, scoring="accuracy")
grid.fit(x_train, y_train)

print("Best params:", grid.best_params_)
print("Best score:", grid.best_score_)


Best params: {'C': 0.1, 'penalty': 'l1', 'solver': 'liblinear'}
Best score: 0.6584045584045585


In [242]:
log = LogisticRegression(C= 0.01, penalty = "l2", solver = "liblinear")

log.fit(x_train, y_train)

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,0.01
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,
,solver,'liblinear'
,max_iter,100


In [243]:
y_pred = log.predict(x_test)
y_pred

array([0., 1., 0., 0., 1., 0., 1., 0., 0., 0., 0., 0., 0., 1., 1., 0., 1.,
       0., 1., 1., 0., 0., 1., 1., 1., 1., 1., 0., 1., 1., 1., 0., 1.])

In [244]:
y_test.to_numpy()

array([0., 1., 1., 0., 1., 0., 1., 1., 1., 1., 0., 1., 1., 1., 0., 0., 1.,
       1., 1., 1., 0., 0., 1., 1., 1., 1., 1., 0., 0., 1., 0., 0., 1.])

# Performance model evaluation

In [245]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

In [246]:
print("\n=== Logistic Regression ===")
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))


=== Logistic Regression ===
Accuracy: 0.696969696969697
Confusion Matrix:
 [[ 9  3]
 [ 7 14]]
Classification Report:
               precision    recall  f1-score   support

         0.0       0.56      0.75      0.64        12
         1.0       0.82      0.67      0.74        21

    accuracy                           0.70        33
   macro avg       0.69      0.71      0.69        33
weighted avg       0.73      0.70      0.70        33

