In [4]:
# 1. Importações
import pandas as pd

# Read one CSV per season (1992-2023) into a list and concatenate once at the
# end. Calling pd.concat inside the loop re-copies every previously loaded row
# on each iteration, and seeding the result with an empty DataFrame is
# unnecessary.
yearly_frames = []
for year in range(1992, 2024):
    file = f"D:/projetos/Tenis ML-AI/data/tennis_atp/atp_matches_{year}.csv"
    yearly_frames.append(pd.read_csv(file))

# Stack all seasons row-wise and renumber the index from 0.
all_data = pd.concat(yearly_frames, axis=0, ignore_index=True)

# Preview the first rows
all_data.head()

Unnamed: 0,tourney_id,tourney_name,surface,draw_size,tourney_level,tourney_date,match_num,winner_id,winner_seed,winner_entry,...,l_1stIn,l_1stWon,l_2ndWon,l_SvGms,l_bpSaved,l_bpFaced,winner_rank,winner_rank_points,loser_rank,loser_rank_points
0,1992-339,Adelaide,Hard,32,A,19911230,1,101964,1.0,,...,34.0,23.0,6.0,9.0,0.0,3.0,16.0,,80.0,
1,1992-339,Adelaide,Hard,32,A,19911230,2,101924,,,...,65.0,39.0,9.0,10.0,8.0,12.0,65.0,,63.0,
2,1992-339,Adelaide,Hard,32,A,19911230,3,101195,,,...,68.0,45.0,22.0,16.0,8.0,12.0,62.0,,730.0,
3,1992-339,Adelaide,Hard,32,A,19911230,4,101820,,,...,49.0,34.0,16.0,14.0,1.0,4.0,60.0,,42.0,
4,1992-339,Adelaide,Hard,32,A,19911230,5,100870,,,...,95.0,65.0,15.0,18.0,5.0,7.0,68.0,,32.0,


In [8]:
# 1. Columns that must be non-null for a match to be kept
critical_cols = [
    'tourney_id', 'tourney_name', 'surface', 'tourney_date',
    'winner_name', 'loser_name',
    'winner_age', 'loser_age',
    'winner_rank', 'loser_rank',
]

# 2. Columns retained in the working dataset, in output order
final_cols = [
    # tourney_* / match-level columns
    'tourney_id', 'tourney_name', 'surface', 'draw_size', 'tourney_level',
    'tourney_date', 'match_num',
    # winner_* columns
    'winner_id', 'winner_seed', 'winner_entry', 'winner_name', 'winner_hand',
    'winner_ht', 'winner_ioc', 'winner_age',
    # loser_* columns
    'loser_id', 'loser_seed', 'loser_entry', 'loser_name', 'loser_hand',
    'loser_ht', 'loser_ioc', 'loser_age',
    # score / format columns
    'score', 'best_of', 'round', 'minutes',
    # w_* columns
    'w_ace', 'w_df', 'w_svpt', 'w_1stIn', 'w_1stWon', 'w_2ndWon', 'w_SvGms',
    'w_bpSaved', 'w_bpFaced',
    # l_* columns
    'l_ace', 'l_df', 'l_svpt', 'l_1stIn', 'l_1stWon', 'l_2ndWon', 'l_SvGms',
    'l_bpSaved', 'l_bpFaced',
    # ranking columns
    'winner_rank', 'winner_rank_points', 'loser_rank', 'loser_rank_points',
]

# 3. Drop rows missing any critical value, keep only the selected columns,
#    and renumber the index from 0.
all_data_filtered = (
    all_data
    .dropna(subset=critical_cols)
    .loc[:, final_cols]
    .reset_index(drop=True)
)

valid_matches = len(all_data_filtered)
print(f"Jogos válidos após filtro equilibrado: {valid_matches}")

Jogos válidos após filtro equilibrado: 98538


In [9]:
# Columns to coerce to a numeric dtype.
numeric_cols = [
    'draw_size', 'match_num', 'winner_ht', 'loser_ht', 'winner_age', 'loser_age', 'minutes',
    'w_ace', 'w_df', 'w_svpt', 'w_1stIn', 'w_1stWon', 'w_2ndWon', 'w_SvGms', 'w_bpSaved', 'w_bpFaced',
    'l_ace', 'l_df', 'l_svpt', 'l_1stIn', 'l_1stWon', 'l_2ndWon', 'l_SvGms', 'l_bpSaved', 'l_bpFaced',
    'winner_rank', 'winner_rank_points', 'loser_rank', 'loser_rank_points',
]

# Convert every listed column in one column-wise pass; errors='coerce' turns
# values that cannot be parsed as numbers into NaN instead of raising.
all_data_filtered[numeric_cols] = all_data_filtered[numeric_cols].apply(
    pd.to_numeric, errors='coerce'
)

In [10]:
# Count nulls per column and show, largest first, only the columns that have
# at least one missing value.
# NOTE(review): this inspects `all_data`, not `all_data_filtered` — confirm
# that looking at the unfiltered frame is intended here.
missing_values = all_data.isnull().sum()
has_missing = missing_values > 0
missing_values.loc[has_missing].sort_values(ascending=False)

winner_entry          89152
loser_entry           81078
loser_seed            78905
winner_seed           60352
minutes               12546
l_bpFaced              9774
w_svpt                 9774
w_ace                  9774
w_df                   9774
w_1stWon               9774
w_1stIn                9774
w_2ndWon               9774
l_1stIn                9774
l_df                   9774
l_ace                  9774
w_bpFaced              9774
w_bpSaved              9774
l_svpt                 9774
l_bpSaved              9774
l_1stWon               9774
l_2ndWon               9774
l_SvGms                9773
w_SvGms                9773
loser_ht               4234
loser_rank_points      3206
loser_rank             2394
winner_ht              2114
winner_rank_points     1921
winner_rank            1105
surface                  53
loser_age                 7
winner_age                4
loser_hand                4
dtype: int64

In [12]:
# 1. Load the Elo ratings table (the file is decoded as Latin-1)
elo = pd.read_csv(
    "D:/projetos/Tenis ML-AI/data/elo_ratings.csv",
    encoding="latin1",
)

# 2. Normalise names (function)
def normalize_name(name):
    """Return a canonical lookup key for a player name.

    The value is converted with ``str()``, lower-cased, hyphens are turned
    into spaces, dots are removed, and every run of whitespace is collapsed
    to a single plain space (leading/trailing whitespace is dropped).

    Collapsing whitespace matters because trimming only the ends leaves inner
    non-breaking spaces (U+00A0), doubled spaces, or the gap created when a
    spaced hyphen is replaced; any of those would produce a key that never
    matches the same name written with single plain spaces.

    Parameters
    ----------
    name : object
        The raw name; non-string values are passed through ``str()``.

    Returns
    -------
    str
        The normalised key.
    """
    lowered = str(name).lower().replace('-', ' ').replace('.', '')
    # str.split() with no separator splits on any Unicode whitespace and
    # ignores leading/trailing runs, so re-joining yields single spaces only.
    return ' '.join(lowered.split())

# 3. Add normalised-name columns to the matches frame and to the Elo table
winner_keys = all_data_filtered['winner_name'].apply(normalize_name)
loser_keys = all_data_filtered['loser_name'].apply(normalize_name)
all_data_filtered['winner_name_clean'] = winner_keys
all_data_filtered['loser_name_clean'] = loser_keys

elo_keys = elo['Player'].apply(normalize_name)
elo['player_name_clean'] = elo_keys

# 4. One {normalised name -> rating} dictionary per surface column.
#    If a normalised name occurs more than once, the last row wins.
hard_elo = {key: rating for key, rating in zip(elo['player_name_clean'], elo['hard court elo rating'])}
clay_elo = {key: rating for key, rating in zip(elo['player_name_clean'], elo['clay-court elo rating'])}
grass_elo = {key: rating for key, rating in zip(elo['player_name_clean'], elo['grass-court elo rating'])}

# 5. Look up a player's Elo rating for a given surface
def get_elo(player, surface):
    """Return the Elo rating stored for ``player`` on ``surface``.

    The player name is normalised with ``normalize_name`` and looked up in
    ``hard_elo``, ``clay_elo`` or ``grass_elo`` when ``surface`` is 'Hard',
    'Clay' or 'Grass' respectively. Returns ``None`` when the name is not in
    the chosen dictionary or when ``surface`` is any other value.

    NOTE(review): the dictionaries hold one rating per player, so every match
    of a player on a surface gets the same value regardless of match date —
    confirm that is acceptable for the intended model.
    """
    key = normalize_name(player)
    if surface == 'Hard':
        return hard_elo.get(key)
    if surface == 'Clay':
        return clay_elo.get(key)
    if surface == 'Grass':
        return grass_elo.get(key)
    # Any other surface value has no rating table.
    return None

# 6. Attach each player's Elo rating for the match surface (row-wise lookup)
def _winner_elo(match):
    """Elo of the row's winner on the row's surface."""
    return get_elo(match['winner_name'], match['surface'])


def _loser_elo(match):
    """Elo of the row's loser on the row's surface."""
    return get_elo(match['loser_name'], match['surface'])


all_data_filtered['winner_elo'] = all_data_filtered.apply(_winner_elo, axis=1)
all_data_filtered['loser_elo'] = all_data_filtered.apply(_loser_elo, axis=1)

# 7. Derived features: winner value minus loser value, created in this order
difference_features = [
    ('elo_diff', 'winner_elo', 'loser_elo'),
    ('height_diff', 'winner_ht', 'loser_ht'),
    ('age_diff', 'winner_age', 'loser_age'),
    ('rank_diff', 'winner_rank', 'loser_rank'),
    ('rank_points_diff', 'winner_rank_points', 'loser_rank_points'),
]
for feature, winner_col, loser_col in difference_features:
    all_data_filtered[feature] = all_data_filtered[winner_col] - all_data_filtered[loser_col]

# 8. Inspect the result
preview_cols = ['winner_name', 'loser_name', 'surface', 'winner_elo', 'loser_elo', 'elo_diff']
all_data_filtered[preview_cols].head()


Unnamed: 0,winner_name,loser_name,surface,winner_elo,loser_elo,elo_diff
0,Goran Ivanisevic,Nicklas Kulti,Hard,,,
1,Stefano Pescosolido,Jimmy Arias,Hard,,,
2,Amos Mansdorf,Grant Doyle,Hard,,,
3,Marc Rosset,Cristiano Caratti,Hard,,,
4,Christo Van Rensburg,Javier Sanchez,Hard,,,


In [13]:
# Sanity check: is this player's normalised name among the Elo table's keys?
normalize_name("Goran Ivanisevic") in set(elo['player_name_clean'])

False