# Predicting tennis outcomes

### Import Libraries

In [1]:
import pandas as pd
import numpy as np
import requests
import json
import os
from datetime import datetime
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

### Import Datasets

In [2]:
#Players
players_df = pd.read_csv("atp_players.csv")
players_df["full_name"] = players_df["name_first"] + " " + players_df["name_last"]

#Newest Rankings - sorted
rankings_df = pd.read_csv("atp_rankings_current.csv")
newest_version = rankings_df.sort_values("ranking_date", ascending = False).loc[0,"ranking_date"]
rankings_df = rankings_df.loc[rankings_df["ranking_date"] == newest_version]
rankings_df = rankings_df.sort_values("rank", ascending = True, ignore_index=True)

#Matches between 2001 and 2023
matches = pd.DataFrame()
for year in range(2001,2024):
  df = pd.read_csv("atp_matches_"+ str(year) +".csv")
  matches = matches.append(df)

matches.index = range(len(matches))
matches['tourney_date'] = pd.to_datetime(matches['tourney_date'], format='%Y%m%d')

#Surfaces
surfaces = {0:"Hard", 1: "Clay", 2: "Grass"}

### Define Classes

In [3]:
class Player:

    def __init__(self, full_name, age = None):
        self.name = full_name
        self.age = age
        self.id = int(players_df["player_id"].loc[players_df["full_name"] == full_name].values[0])
        self.nationality = players_df["ioc"].loc[players_df["full_name"] == full_name].values[0]
        self.hand = players_df["hand"].loc[players_df["full_name"] == full_name].values[0]
        self.height = players_df["height"].loc[players_df["full_name"] == full_name].values[0]
        self.n_wins = len(matches.loc[matches["winner_id"] == self.id].index)
        self.n_losses = len(matches.loc[matches["loser_id"] == self.id].index)
        self.total_matches = self.n_wins + self.n_losses
    
        #Winning Percentage
        if self.total_matches > 0:
            self.winning_percentage = self.n_wins/(self.total_matches)
        else:
            self.winning_percentage = ""

        #Best Surface
        try:
            wp_surfaces = list(map(self.get_winning_perc_per_surface,("Hard","Clay","Grass")))
            index_max = max(range(len(wp_surfaces)), key=wp_surfaces.__getitem__)
            self.best_surface = surfaces[index_max]
        except TypeError:
            self.best_surface = ""

    def get_winning_perc_per_surface(self,surface):
        n_wins = len(matches.loc[(matches["surface"] == surface) & (matches["winner_id"] == self.id)].index)
        n_losses = len(matches.loc[(matches["surface"] == surface) & (matches["loser_id"] == self.id)].index)
        if (n_wins+n_losses) > 0:
            return(n_wins/(n_wins+n_losses))


    def H2H(self,opponent):
        opponent_id = Player(opponent).id
        n_wins = len(matches.loc[(matches["winner_id"] == self.id) &
                                 (matches["loser_id"] == opponent_id)].index)
        n_losses = len(matches.loc[(matches["loser_id"] == self.id) &
                                   (matches["winner_id"] == opponent_id)].index)
        print(str(n_wins),"-",str(n_losses))


    def H2H_per_surface(self,opponent,surface):
        opponent_id = Player(opponent).id
        n_wins = len(matches.loc[(matches["winner_id"] == self.id) &
                                 (matches["loser_id"] == opponent_id) &
                                 (matches["surface"] == surface)].index)

        n_losses = len(matches.loc[(matches["loser_id"] == self.id) &
                                   (matches["winner_id"] == opponent_id) &
                                   (matches["surface"] == surface)].index)
        print(str(n_wins),"-",str(n_losses))

    def get_aces_per_match(self):
        if self.total_matches != 0:
            aces_wins = matches["w_ace"].loc[matches["winner_id"] == self.id].sum()
            aces_losses = matches["l_ace"].loc[matches["loser_id"] == self.id].sum()
            return (aces_wins + aces_losses)/self.total_matches

    def get_double_faults_per_match(self):
        if self.total_matches != 0:
            df_wins = matches["w_df"].loc[matches["winner_id"] == self.id].sum()
            df_losses = matches["l_df"].loc[matches["loser_id"] == self.id].sum()
            return (df_wins + df_losses)/self.total_matches

    def calculate_win_percentage_last_3_months(self, gameday):
        #gameday = pd.to_datetime(str(gameday), format='%Y%m%d')

        # Calculate the start date of the 3-month period
        three_months_ago = gameday - pd.DateOffset(months=3)

        # Filter matches DataFrame to include only matches played in the last 3 months
        mask = (matches['tourney_date'] >= pd.Timestamp(three_months_ago)) & (matches['tourney_date'] <= pd.Timestamp(gameday))
        player_matches = matches[(matches['winner_id'] == self.id) | (matches['loser_id'] == self.id)][mask]

        # Calculate number of games won and lost by player
        games_won_last_3_months = len(player_matches[player_matches['winner_id'] == self.id])
        games_lost_last_3_months = len(player_matches[player_matches['loser_id'] == self.id])

        # Calculate win percentage
        total_games_last_3_months = games_won_last_3_months + games_lost_last_3_months
        win_percentage = games_won_last_3_months / total_games_last_3_months if total_games_last_3_months > 0 else None

        return win_percentage
    
    def summary(self, suffix=''):
        data = {
                f"Player{suffix}": [self.name],
                f"Age{suffix}": [self.age],
                f"Winning %{suffix}": [self.winning_percentage],
                f"Avg. Aces/Match{suffix}": [self.get_aces_per_match()],
                f"Avg. Double Faults/Match{suffix}": [self.get_double_faults_per_match()],
                f"Best Surface{suffix}": [self.best_surface]
            }

        return pd.DataFrame(data)

In [104]:
class Match:
    
    def __init__(self, match_string, ranking_string = None):
        self.match_string = match_string
        
        # Separar los nombres de los jugadores
        self.player1_name, self.player2_name = match_string.split(" - ")
        
        # Crear instancias de la clase Player para cada jugador
        self.player1 = Player(self.player1_name)
        self.player2 = Player(self.player2_name)
        
        # Separar los rankings de los jugadores
        if ranking_string != None:
            self.player1_ranking, self.player2_ranking = ranking_string.split(" - ")
        
    def get_summary(self, gameday, surface = None, level = None):
        #Players data
        df1 = self.player1.summary(suffix='_1')
        df2 = self.player2.summary(suffix='_2')
        self.df = pd.concat([df1, df2], axis=1)
        
        #Match Data
        self.df["surface"] = surface
        self.df["tourney_level"] = level
        
        #More Player data
        self.df["Ranking_1"] = self.player1_ranking
        self.df["Ranking_2"] = self.player2_ranking
        
        self.df["Best Surface_1"] = (self.player1.best_surface == surface)
        self.df["Best Surface_2"] = (self.player2.best_surface == surface)
        
        self.df["Win %_last 3 months_1"] = self.player1.calculate_win_percentage_last_3_months(gameday)
        self.df["Win %_last 3 months_2"] = self.player2.calculate_win_percentage_last_3_months(gameday)
        
        return self.df

### Train ML-Model

In [48]:
# Define function for preparing data
def execute_match(df):
    results = []
    for _, row in df.iterrows():
        winner = row["winner_name"]
        loser = row["loser_name"]
        winner_rank = row["winner_rank"]
        loser_rank = row["loser_rank"]
        if np.random.randint(0, 2) == 1:
            match = Match(f"{winner} - {loser}", f"{winner_rank} - {loser_rank}")
            result = np.ravel(match.get_summary(row["tourney_date"],row["surface"], row["tourney_level"]))
            result = np.append(result, 1)
            
        else:
            match = Match(f"{loser} - {winner}", f"{loser_rank} - {winner_rank}")
            result = np.ravel(match.get_summary(row["tourney_date"],row["surface"], row["tourney_level"]))
            result = np.append(result, 0)

        results.append(result)
    return pd.DataFrame(results, columns = ['Player_1', 'Age_1', 'Winning %_1', 'Avg. Aces/Match_1', 'Avg. Double Faults/Match_1', 'Best Surface_1', 'Player_2', 'Age_2', 'Winning %_2', 'Avg. Aces/Match_2', 'Avg. Double Faults/Match_2', 'Best Surface_2', 'surface', 'tourney_level', 'Ranking_1', 'Ranking_2', 'Win %_last 3 months_1', 'Win %_last 3 months_2','Player 1 wins'])

In [96]:
#Tomemos los ultimos 40000 partidos
test = matches.loc[50000:50200]
df = execute_match(test)



In [97]:
surface_mapping = {'Hard': 0, 'Clay': 1, 'Grass': 2, 'Carpet': 3}
df['surface'] = df['surface'].replace(surface_mapping)

#tourney_level_map = {'G': 1, 'M': 2, 'A': 3, 'C': 4, 'S': 5, 'F': 6, 'D': 7}
#df['tourney_level'] = df['tourney_level'].map(tourney_level_map)

#surface_logical = {False : 0, True: 1}
#df['Best Surface_1'] = df['Best Surface_1'].replace(surface_logical)
#df["Best Surface_2"] = df['Best Surface_2'].replace(surface_logical)

# Define the input features and the target attribute
input_features = ['Ranking_1', 'Ranking_2', 'Win %_last 3 months_1', 'Win %_last 3 months_2', 'Avg. Aces/Match_1', 'Avg. Aces/Match_2', 'Avg. Double Faults/Match_1', 'Avg. Double Faults/Match_2', 'surface']
output_feature = 'Player 1 wins'


#Filter out NAs
df = df[input_features + ["Player 1 wins"]]
df = df.dropna()
df. head()

Unnamed: 0,Ranking_1,Ranking_2,Win %_last 3 months_1,Win %_last 3 months_2,Avg. Aces/Match_1,Avg. Aces/Match_2,Avg. Double Faults/Match_1,Avg. Double Faults/Match_2,surface,Player 1 wins
0,4.0,12.0,0.625,0.75,6.971655,9.439776,2.468254,2.316527,0,1
1,9.0,3.0,0.8,0.647059,3.07121,14.488455,1.644717,3.062167,0,1
2,11.0,15.0,0.6,0.764706,5.106306,6.688576,3.246847,3.397496,0,0
3,4.0,17.0,0.625,1.0,6.971655,7.524931,2.468254,1.682825,0,0
4,9.0,15.0,0.8,0.764706,3.07121,6.688576,1.644717,3.397496,0,1


In [98]:
X = df[input_features]
y = df[output_feature]

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train logistic regression model
model = LogisticRegression(random_state=42)
model.fit(X_train, y_train)

# Predict labels for test data
y_pred = model.predict(X_test)

# Evaluate model performance
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

ValueError: Input contains NaN, infinity or a value too large for dtype('float64').

### Get data from ESPN website

In [7]:
# Hacemos una solicitud a la API de tenis de ESPN
response = requests.get('https://site.api.espn.com/apis/site/v2/sports/tennis/atp/scoreboard')

# Verificamos si la solicitud fue exitosa
if response.status_code == 200:
    # Obtenemos la información de los partidos
    data = response.json()['events']
    
    # Guardamos el archivo json
    with open('datos.json', 'w') as archivo:
      json.dump(data, archivo)
      print("El archivo se ha guardado con éxito")

else:
    # Si la solicitud no fue exitosa, mostramos un mensaje de error
    print("Error al obtener los partidos. Código de estado: ", response.status_code)

El archivo se ha guardado con éxito


In [9]:
# Obtenemos la información de los partidos
with open('datos.json') as archivo:
    data = json.load(archivo)

# Recorremos cada partido y mostramos el nombre de los jugadores si el partido es programado para hoy
l_matches = []
for i in range(len(data[0]["competitions"])):
    if data[0]["competitions"][i]["status"]["type"]["completed"] == False:
        try:
            Player_1 = data[0]["competitions"][i]["competitors"][0]['athlete']["fullName"]
            Player_2 = data[0]["competitions"][i]["competitors"][1]['athlete']["fullName"]
            l_matches += [Player_1 + " - " + Player_2]
        except KeyError:
          pass

#Elimino a los partidos que todavia no tienen confirmados a sus jugadores
l_matches = [p for p in l_matches if "TBD" not in p]

#Jaja aguanten Aliassime y Wawrinka
l_matches = [p.replace("Auger-Aliassime","Auger Aliassime") if "Auger-Aliassime"  in p else p for p in l_matches]
l_matches = [p.replace("Stanislas","Stan") if "Stanislas Wawrinka" in p else p for p in l_matches]
l_matches

['Daniel Evans - Jack Draper',
 'Jannik Sinner - Richard Gasquet',
 'Pedro Martinez - Felix Auger Aliassime',
 'Marton Fucsovics - Alex De Minaur',
 'Andy Murray - Radu Albot',
 'Jan-Lennard Struff - Tommy Paul',
 'Miomir Kecmanovic - Stan Wawrinka',
 'Borna Coric - Alex Molcan',
 'Francisco Cerundolo - Jack Sock',
 'Adrian Mannarino - Lorenzo Musetti',
 'Guido Pella - Tallon Griekspoor',
 'Taylor Fritz - Ben Shelton',
 'Rinky Hijikata - Sebastian Baez',
 'Hubert Hurkacz - Alexei Popyrin',
 'Mackenzie McDonald - Holger Rune',
 'Carlos Alcaraz - Thanasi Kokkinakis',
 'Cristian Garin - Casper Ruud',
 'Daniil Medvedev - Ilya Ivashka',
 'Taro Daniel - Cameron Norrie',
 'Frances Tiafoe - Jason Kubler',
 'Karen Khachanov - Alejandro Davidovich Fokina',
 'Alejandro Tabilo - Jordan Thompson',
 'Andrey Rublev - Ugo Humbert',
 'Emil Ruusuvuori - Alexander Zverev']

In [105]:
#Getting today's matches
df = pd.DataFrame({})
for game in l_matches:
    try:
        m = Match(game)
        df = df.append(m.get_summary(gameday = pd.datetime.today(), surface = "Hard",level = "M"))
    
    except IndexError:
        pass

df

  


AttributeError: 'Match' object has no attribute 'player1_ranking'