In [13]:
import pandas as pd
import os
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# ----------------------------
# 1️⃣ Load & Clean WTA Data
# ----------------------------
years = range(2000, 2025)
base_url = "https://raw.githubusercontent.com/JeffSackmann/tennis_wta/master/wta_matches_{}.csv"

frames = []
for year in years:
    try:
        df = pd.read_csv(base_url.format(year))
        df['year'] = year
        frames.append(df)
    except Exception as e:
        print(f"Error loading {year}: {e}")

wta_all = pd.concat(frames, ignore_index=True)

# Keep relevant columns
wta_clean = wta_all[['tourney_date', 'surface', 'winner_name', 'loser_name', 'year']].copy()

# Convert date
wta_clean['tourney_date'] = pd.to_datetime(wta_clean['tourney_date'], format='%Y%m%d')

# Standardize surface
wta_clean['surface'] = wta_clean['surface'].str.strip().str.capitalize()

# Drop missing player names
wta_clean = wta_clean.dropna(subset=['winner_name', 'loser_name'])

# Sort by date
wta_clean = wta_clean.sort_values('tourney_date').reset_index(drop=True)

# Remove duplicates
wta_clean = wta_clean.drop_duplicates(subset=['tourney_date', 'winner_name', 'loser_name'])

# Save cleaned data
os.makedirs("data/cleaned", exist_ok=True)
wta_clean.to_csv("data/cleaned/wta_cleaned.csv", index=False)
print("✅ Cleaned WTA data saved!")

# ----------------------------
# 2️⃣ Compute Elo Ratings
# ----------------------------
K = 32
players = pd.unique(wta_clean[['winner_name', 'loser_name']].values.ravel())
elo_ratings = {player: 1500 for player in players}

def expected_score(Ra, Rb):
    return 1 / (1 + 10 ** ((Rb - Ra) / 400))

# Keep Elo history for analysis
elo_history = []

for _, match in wta_clean.iterrows():
    winner = match['winner_name']
    loser = match['loser_name']
    
    Ra = elo_ratings[winner]
    Rb = elo_ratings[loser]
    
    Ea = expected_score(Ra, Rb)
    Eb = 1 - Ea
    
    # Update ratings
    elo_ratings[winner] = Ra + K * (1 - Ea)
    elo_ratings[loser] = Rb + K * (0 - Eb)
    
    # Record history
    elo_history.append({
        'date': match['tourney_date'],
        'winner': winner,
        'loser': loser,
        'winner_elo': elo_ratings[winner],
        'loser_elo': elo_ratings[loser],
        'surface': match['surface']
    })

elo_df = pd.DataFrame(elo_history)

# ----------------------------
# 3️⃣ Prepare Dataset for Logistic Regression
# ----------------------------
# Feature: Elo difference
elo_df['elo_diff'] = elo_df['winner_elo'] - elo_df['loser_elo']

# Encode surface as dummy variables
elo_df = pd.get_dummies(elo_df, columns=['surface'], drop_first=True)

# Target: winner=1, loser=0 (we predict if the winner wins = 1)
X = elo_df.drop(columns=['date', 'winner', 'loser'])
y = [1]*len(elo_df)  # all rows are winners

# For logistic regression, create a balanced dataset by flipping some rows
# We will randomly flip winner/loser to simulate prediction dataset
elo_flipped = elo_df.copy()
elo_flipped['elo_diff'] = -elo_flipped['elo_diff']
y_flipped = [0]*len(elo_flipped)

# Combine
X_final = pd.concat([X, elo_flipped.drop(columns=['date','winner','loser'])], ignore_index=True)
y_final = y + y_flipped

# Split dataset
X_train, X_test, y_train, y_test = train_test_split(X_final, y_final, test_size=0.2, random_state=42)

# Train logistic regression
lr = LogisticRegression(max_iter=1000)
lr.fit(X_train, y_train)

# Predict
y_pred = lr.predict(X_test)
acc = accuracy_score(y_test, y_pred)

print(f"✅ Logistic Regression Accuracy: {acc:.2%}")


✅ Cleaned WTA data saved!
✅ Logistic Regression Accuracy: 75.36%
