# Predicting tennis outcomes

### Import Libraries

In [129]:
import pandas as pd
import numpy as np
import requests
import json
import os
from datetime import datetime
import pandas as pd
from sklearn.linear_model import LogisticRegression
import random
from sklearn.model_selection import train_test_split

### Import Datasets

In [130]:
#Players
players_df = pd.read_csv("atp_players.csv")
players_df["full_name"] = players_df["name_first"] + " " + players_df["name_last"]

#Newest Rankings - sorted
rankings_df = pd.read_csv("atp_rankings_current.csv")
newest_version = rankings_df.sort_values("ranking_date", ascending = False).loc[0,"ranking_date"]
rankings_df = rankings_df.loc[rankings_df["ranking_date"] == newest_version]
rankings_df = rankings_df.sort_values("rank", ascending = True, ignore_index=True)
rankings_df['ranking_date'] = pd.to_datetime(rankings_df['ranking_date'], format='%Y%m%d')

#Matches between 2001 and 2023
matches = pd.DataFrame()
for year in range(2001,2024):
  df = pd.read_csv("atp_matches_"+ str(year) +".csv")
  matches = matches.append(df)

matches.index = range(len(matches))
matches['tourney_date'] = pd.to_datetime(matches['tourney_date'], format='%Y%m%d')

#Surfaces
surfaces = {0:"Hard", 1: "Clay", 2: "Grass"}

### Define Classes

In [131]:
class Player:

    def __init__(self, full_name, age = None):
        self.name = full_name
        self.age = age
        self.id = int(players_df["player_id"].loc[players_df["full_name"] == full_name].values[0])
        self.nationality = players_df["ioc"].loc[players_df["full_name"] == full_name].values[0]
        self.hand = players_df["hand"].loc[players_df["full_name"] == full_name].values[0]
        self.height = players_df["height"].loc[players_df["full_name"] == full_name].values[0]
        self.n_wins = len(matches.loc[matches["winner_id"] == self.id].index)
        self.n_losses = len(matches.loc[matches["loser_id"] == self.id].index)
        self.total_matches = self.n_wins + self.n_losses
    
        #Winning Percentage
        if self.total_matches > 0:
            self.winning_percentage = self.n_wins/(self.total_matches)
        else:
            self.winning_percentage = ""

        #Best Surface
        try:
            wp_surfaces = list(map(self.get_winning_perc_per_surface,("Hard","Clay","Grass")))
            index_max = max(range(len(wp_surfaces)), key=wp_surfaces.__getitem__)
            self.best_surface = surfaces[index_max]
        except TypeError:
            self.best_surface = ""

    def get_current_ranking(self, current_date):
        ranking_data = rankings_df.loc[(rankings_df["player"] == self.id) & (rankings_df["ranking_date"] <= current_date)]
        ranking_data = ranking_data.sort_values(by="ranking_date", ascending=False)
        if not ranking_data.empty:
            return ranking_data.iloc[0]["rank"]
        else:
            return None
    
    def get_winning_perc_per_surface(self,surface):
        n_wins = len(matches.loc[(matches["surface"] == surface) & (matches["winner_id"] == self.id)].index)
        n_losses = len(matches.loc[(matches["surface"] == surface) & (matches["loser_id"] == self.id)].index)
        
        if (n_wins+n_losses) > 0:
            return(n_wins/(n_wins+n_losses))
        else:
            return 0


    def H2H(self,opponent):
        opponent_id = Player(opponent).id
        n_wins = len(matches.loc[(matches["winner_id"] == self.id) &
                                 (matches["loser_id"] == opponent_id)].index)
        n_losses = len(matches.loc[(matches["loser_id"] == self.id) &
                                   (matches["winner_id"] == opponent_id)].index)
        print(str(n_wins),"-",str(n_losses))


    def H2H_per_surface(self,opponent,surface):
        opponent_id = Player(opponent).id
        n_wins = len(matches.loc[(matches["winner_id"] == self.id) &
                                 (matches["loser_id"] == opponent_id) &
                                 (matches["surface"] == surface)].index)

        n_losses = len(matches.loc[(matches["loser_id"] == self.id) &
                                   (matches["winner_id"] == opponent_id) &
                                   (matches["surface"] == surface)].index)
        print(str(n_wins),"-",str(n_losses))

    def get_aces_per_match(self):
        if self.total_matches != 0:
            aces_wins = matches["w_ace"].loc[matches["winner_id"] == self.id].sum()
            aces_losses = matches["l_ace"].loc[matches["loser_id"] == self.id].sum()
            return (aces_wins + aces_losses)/self.total_matches

    def get_double_faults_per_match(self):
        if self.total_matches != 0:
            df_wins = matches["w_df"].loc[matches["winner_id"] == self.id].sum()
            df_losses = matches["l_df"].loc[matches["loser_id"] == self.id].sum()
            return (df_wins + df_losses)/self.total_matches
        
        else:
            return 0

    def calculate_win_percentage_last_3_months(self, gameday):
        #gameday = pd.to_datetime(str(gameday), format='%Y%m%d')

        # Calculate the start date of the 3-month period
        three_months_ago = gameday - pd.DateOffset(months=3)

        # Filter matches DataFrame to include only matches played in the last 3 months
        mask = (matches['tourney_date'] >= pd.Timestamp(three_months_ago)) & (matches['tourney_date'] <= pd.Timestamp(gameday))
        player_matches = matches[(matches['winner_id'] == self.id) | (matches['loser_id'] == self.id)][mask]

        # Calculate number of games won and lost by player
        games_won_last_3_months = len(player_matches[player_matches['winner_id'] == self.id])
        games_lost_last_3_months = len(player_matches[player_matches['loser_id'] == self.id])

        # Calculate win percentage
        total_games_last_3_months = games_won_last_3_months + games_lost_last_3_months
        win_percentage = games_won_last_3_months / total_games_last_3_months if total_games_last_3_months > 0 else None

        return win_percentage
    
    def summary(self, suffix=''):
        data = {
                f"Player{suffix}": [self.name],
                f"Age{suffix}": [self.age],
                f"Winning %{suffix}": [self.winning_percentage],
                f"Avg. Aces/Match{suffix}": [self.get_aces_per_match()],
                f"Avg. Double Faults/Match{suffix}": [self.get_double_faults_per_match()],
                f"Best Surface{suffix}": [self.best_surface]
            }

        return pd.DataFrame(data)

In [132]:
class Match:
    
    def __init__(self, match_string):
        self.match_string = match_string
        
        # Separar los nombres de los jugadores
        self.player1_name, self.player2_name = match_string.split(" - ")
        
        # Crear instancias de la clase Player para cada jugador
        self.player1 = Player(self.player1_name)
        self.player2 = Player(self.player2_name)
        
        
    def predict(self,surface):
        self.ranking1 = self.player1.get_current_ranking(datetime.today())
        self.ranking2 = self.player1.get_current_ranking(datetime.today())
        self.surface = surface_mapping[surface]
        self.player1_surface_winning_perc = self.player1.get_winning_perc_per_surface(self.surface)
        self.player2_surface_winning_perc = self.player2.get_winning_perc_per_surface(self.surface)
        self.player1_aces_per_match = self.player1.get_aces_per_match()
        self.player2_aces_per_match = self.player1.get_aces_per_match()
        self.player1_double_faults_per_match = self.player1.get_double_faults_per_match()
        self.player2_double_faults_per_match = self.player2.get_double_faults_per_match()
        
        result = logreg.predict(pd.DataFrame([[self.ranking1, self.ranking2, self.surface, self.player1_surface_winning_perc,
                              self.player2_surface_winning_perc, self.player1_aces_per_match,
                              self.player2_aces_per_match, self.player1_double_faults_per_match,
                              self.player2_double_faults_per_match]], columns = input_features))
        
        if result == 1:
            print(self.match_string,"----- Prediction:",self.player1_name)
        else:
            print(self.match_string,"----- Prediction:",self.player2_name)

### Train ML-Model

In [133]:
# Define function for preparing data
def transform_matches_df(df):
    new_df = pd.DataFrame(columns=["rank_1", "rank_2", "surface", "player_1_surface_winning_perc",
                                   "player_2_surface_winning_perc", "aces_1_per_match", "aces_2_per_match",
                                   "double_faults_1_per_match", "double_faults_2_per_match", "player_1_wins"])

    for index, row in df.iterrows():
        if random.randint(0, 1) == 1:
            rank_1 = row["winner_rank"]
            rank_2 = row["loser_rank"]
            player_1 = row["winner_name"]
            player_2 = row["loser_name"]
            p1_wins = 1

        else:
            rank_1 = row["loser_rank"]
            rank_2 = row["winner_rank"]
            player_1 = row["loser_name"]
            player_2 = row["winner_name"]
            p1_wins = 0

        surface = row["surface"]
        p1 = Player(player_1)
        p2 = Player(player_2)
        player_1_surface_winning_perc = p1.get_winning_perc_per_surface(surface)
        player_2_surface_winning_perc = p2.get_winning_perc_per_surface(surface)
        aces_1_per_match = p1.get_aces_per_match()
        aces_2_per_match = p2.get_aces_per_match()
        double_faults_1_per_match = p1.get_double_faults_per_match()
        double_faults_2_per_match = p2.get_double_faults_per_match()

        new_df = new_df.append({"rank_1": rank_1, "rank_2": rank_2, "surface": surface,
                                "player_1_surface_winning_perc": player_1_surface_winning_perc,
                                "player_2_surface_winning_perc": player_2_surface_winning_perc,
                                "aces_1_per_match": aces_1_per_match, "aces_2_per_match": aces_2_per_match,
                                "double_faults_1_per_match": double_faults_1_per_match,
                                "double_faults_2_per_match": double_faults_2_per_match,
                                "player_1_wins": p1_wins}, ignore_index=True)

    return new_df

In [134]:
# Tomemos 500 partidos
df = transform_matches_df(matches.loc[50000:50500])

In [135]:
#Map categorical variable 'surface' to integer values
surface_mapping = {'Hard': 0, 'Clay': 1, 'Grass': 2, 'Carpet': 3}
df['surface'] = df['surface'].replace(surface_mapping)


# Define the input features and the target attribute
input_features = ["rank_1", "rank_2", "surface", "player_1_surface_winning_perc",
                                   "player_2_surface_winning_perc", "aces_1_per_match", "aces_2_per_match",
                                   "double_faults_1_per_match", "double_faults_2_per_match"]

output_feature = "player_1_wins"

#Filter out NAs
df = df.dropna()
df.head()

Unnamed: 0,rank_1,rank_2,surface,player_1_surface_winning_perc,player_2_surface_winning_perc,aces_1_per_match,aces_2_per_match,double_faults_1_per_match,double_faults_2_per_match,player_1_wins
0,4.0,12.0,0,0.627184,0.676471,6.971655,9.439776,2.468254,2.316527,1
1,9.0,3.0,0,0.773669,0.677835,3.07121,14.488455,1.644717,3.062167,1
2,15.0,11.0,0,0.607981,0.580556,6.688576,5.106306,3.397496,3.246847,1
3,17.0,4.0,0,0.84933,0.627184,7.524931,6.971655,1.682825,2.468254,1
4,9.0,15.0,0,0.773669,0.607981,3.07121,6.688576,1.644717,3.397496,1


In [136]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()


# Split the transformed dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(df.drop("player_1_wins", axis=1),
                                                    pd.Series(df["player_1_wins"]),
                                                    test_size=0.2,
                                                    random_state=42)


y_train = le.fit_transform(y_train)
y_test = le.fit_transform(y_test)

# Train a logistic regression model
logreg = LogisticRegression()
logreg.fit(X_train, y_train)

# Make predictions on the test set
y_pred = logreg.predict(X_test)

# Evaluate the model's accuracy
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.6105263157894737


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


### Get data from ESPN website

In [137]:
# Hacemos una solicitud a la API de tenis de ESPN
response = requests.get('https://site.api.espn.com/apis/site/v2/sports/tennis/atp/scoreboard')

# Verificamos si la solicitud fue exitosa
if response.status_code == 200:
    # Obtenemos la información de los partidos
    data = response.json()['events']
    
    # Guardamos el archivo json
    with open('datos.json', 'w') as archivo:
      json.dump(data, archivo)
      print("El archivo se ha guardado con éxito")

else:
    # Si la solicitud no fue exitosa, mostramos un mensaje de error
    print("Error al obtener los partidos. Código de estado: ", response.status_code)

El archivo se ha guardado con éxito


In [138]:
# Obtenemos la información de los partidos
with open('datos.json') as archivo:
    data = json.load(archivo)

# Recorremos cada partido y mostramos el nombre de los jugadores si el partido es programado para hoy
l_matches = []
for i in range(len(data[0]["competitions"])):
    if data[0]["competitions"][i]["status"]["type"]["completed"] == False:
        try:
            Player_1 = data[0]["competitions"][i]["competitors"][0]['athlete']["fullName"]
            Player_2 = data[0]["competitions"][i]["competitors"][1]['athlete']["fullName"]
            l_matches += [Player_1 + " - " + Player_2]
        except KeyError:
            pass

#Elimino a los partidos que todavia no tienen confirmados a sus jugadores
l_matches = [p for p in l_matches if "TBD" not in p]

#Jaja aguanten Aliassime y Wawrinka
l_matches = [p.replace("Auger-Aliassime","Auger Aliassime") if "Auger-Aliassime"  in p else p for p in l_matches]
l_matches = [p.replace("Stanislas","Stan") if "Stanislas Wawrinka" in p else p for p in l_matches]

In [139]:
#Predicting today's matches
for game in l_matches:
    try:
        m = Match(game)
        m.predict("Hard")
    except IndexError:
        pass
    except ValueError:
        pass

Daniel Evans - Jack Draper ----- Prediction: Jack Draper
Jannik Sinner - Richard Gasquet ----- Prediction: Richard Gasquet
Pedro Martinez - Felix Auger Aliassime ----- Prediction: Felix Auger Aliassime
Marton Fucsovics - Alex De Minaur ----- Prediction: Alex De Minaur
Andy Murray - Radu Albot ----- Prediction: Radu Albot
Miomir Kecmanovic - Stan Wawrinka ----- Prediction: Stan Wawrinka
Borna Coric - Alex Molcan ----- Prediction: Alex Molcan
Francisco Cerundolo - Jack Sock ----- Prediction: Jack Sock
Adrian Mannarino - Lorenzo Musetti ----- Prediction: Lorenzo Musetti
Taylor Fritz - Ben Shelton ----- Prediction: Ben Shelton
Rinky Hijikata - Sebastian Baez ----- Prediction: Sebastian Baez
Hubert Hurkacz - Alexei Popyrin ----- Prediction: Alexei Popyrin
Carlos Alcaraz - Thanasi Kokkinakis ----- Prediction: Thanasi Kokkinakis
Cristian Garin - Casper Ruud ----- Prediction: Casper Ruud
Daniil Medvedev - Ilya Ivashka ----- Prediction: Ilya Ivashka
Taro Daniel - Cameron Norrie ----- Prediction