# Initial Exploration

## Imports

In [1]:
import pandas as pd
from rapidfuzz import process
from typing import Union, Any, Optional

## Read All Data

In [6]:
# All four CSVs live in the same directory; point DATA_DIR somewhere else to
# run the notebook against a different copy of the data.
DATA_DIR = "/home/mark/Documents/interviews/tennis/data"

match_info_df = pd.read_csv(f"{DATA_DIR}/match_info.csv")
match_outcome_stats_df = pd.read_csv(f"{DATA_DIR}/match_outcome_stats.csv")
player_info_df = pd.read_csv(f"{DATA_DIR}/player_info.csv")
player_outcome_stats_df = pd.read_csv(f"{DATA_DIR}/player_outcome_stats.csv")

## Match Info

In [7]:
match_info_df.head()

Unnamed: 0,match_id,tourney_id,tourney_name,tourney_date,tourney_level,surface,match_num,best_of,round
0,0,0,Orlando,2000-05-01,A,Clay,1,3.0,R32
1,1,0,Orlando,2000-05-01,A,Clay,2,3.0,R32
2,2,0,Orlando,2000-05-01,A,Clay,3,3.0,R32
3,3,0,Orlando,2000-05-01,A,Clay,4,3.0,R32
4,4,0,Orlando,2000-05-01,A,Clay,5,3.0,R32


Look at data types and unique values.

In [8]:
match_info_df.dtypes

match_id           int64
tourney_id         int64
tourney_name      object
tourney_date      object
tourney_level     object
surface           object
match_num          int64
best_of          float64
round             object
dtype: object

In [9]:
match_info_df.tourney_level.unique()

array(['A', 'G', 'M', 'F', 'C', 'D'], dtype=object)

In [10]:
match_info_df.surface.unique()

array(['Clay', 'Hard', 'Grass', 'Carpet'], dtype=object)

In [11]:
match_info_df["round"].unique()

array(['R32', 'R16', 'QF', 'SF', 'F', 'R128', 'R64', 'RR', 'BR'],
      dtype=object)

In [12]:
match_info_df.best_of.unique()

array([3., 5.])

Look at combination of (tourney_id, tourney_name). See if there is any tourney_id that has multiple tourney_name.

In [13]:
# Number of distinct tourney_name values seen for each tourney_id.
tourney_id_group = match_info_df.groupby("tourney_id")["tourney_name"].nunique()
# Ids that map to more than one name.
ids_with_multiple_names = tourney_id_group[tourney_id_group > 1]
print(f"Number of tourney_id with multiple tourney_name: {len(ids_with_multiple_names)}")

Number of tourney_id with multiple tourney_name: 0


Use string similarity packages to check for similar tourney_name. (Multiple choices here but using `rapidfuzz` for now)

In [14]:
sorted_lower_tourney_names = (
    match_info_df["tourney_name"].str.lower().sort_values().unique()
)

In [15]:
sorted_lower_tourney_names

array(["'s-hertogenbosch", 'acapulco', 'adelaide', 'amersfoort',
       'amsterdam', 'antwerp', 'atlanta', 'auckland', 'australian open',
       'bangkok', 'barcelona', 'basel', 'bastad', 'beijing',
       'beijing olympics', 'belgrade', 'bogota', 'brighton', 'brisbane',
       'bucharest', 'buenos aires', 'canada masters', 'casablanca',
       'chengdu', 'chennai', 'cincinnati masters', 'copenhagen',
       'costa do sauipe', 'curitiba ch', 'davis cup g1 r1: bar vs ecu',
       'davis cup g1 r1: chi vs dom', 'davis cup g1 r1: dom vs chi',
       'davis cup g1 r1: hun vs isr', 'davis cup g1 r1: isr vs por',
       'davis cup g1 r1: nzl vs ind', 'davis cup g1 r1: nzl vs kor',
       'davis cup g1 r1: pak vs chn', 'davis cup g1 r1: per vs ecu',
       'davis cup g1 r1: pol vs bih', 'davis cup g1 r1: por vs aut',
       'davis cup g1 r1: rou vs blr', 'davis cup g1 r1: rou vs slo',
       'davis cup g1 r1: rus vs swe', 'davis cup g1 r1: uzb vs kor',
       'davis cup g1 r2: bra vs ecu', 'd

We see a lot of repeated "davis cup" tournaments where the tournament name contains round and competitor information.  We will exclude these from our analysis.

In [16]:
# Flag every row whose tournament name mentions the Davis Cup (any casing),
# keep the rest, then rebuild the sorted list of distinct lower-cased names.
is_davis_cup = match_info_df["tourney_name"].str.contains("davis cup", case=False)
match_info_df = match_info_df[~is_davis_cup]
lower_tourney_names = match_info_df["tourney_name"].str.lower()
sorted_lower_tourney_names = lower_tourney_names.sort_values().unique()

Set similarity threshold to 80

In [17]:
CHOSEN_THRESHOLD = 80

In [18]:
# For every tournament name, collect all *other* names whose similarity score
# reaches CHOSEN_THRESHOLD.
similarity_results = []
for name in sorted_lower_tourney_names:
    candidates = [n for n in sorted_lower_tourney_names if n != name]
    # Let rapidfuzz apply the threshold itself: limit=None disables the top-N
    # cap and score_cutoff discards weaker matches before they are returned,
    # so no full candidate list is built just to be filtered afterwards.
    matches = process.extract(
        name, candidates, limit=None, score_cutoff=CHOSEN_THRESHOLD
    )
    # Each result is a (choice, score, index) tuple; the index is not needed.
    similarity_results.extend(
        {"name": name, "match": matched_name, "score": score}
        for matched_name, score, _ in matches
    )

In [19]:
similarity_results

[{'name': "'s-hertogenbosch",
  'match': 's-hertogenbosch',
  'score': 96.7741935483871},
 {'name': 'australian open', 'match': 'us open', 'score': 85.5},
 {'name': 'beijing', 'match': 'beijing olympics', 'score': 90.0},
 {'name': 'beijing olympics', 'match': 'beijing', 'score': 90.0},
 {'name': 'beijing olympics', 'match': 'olympics', 'score': 90.0},
 {'name': 'cincinnati masters', 'match': 'masters cup', 'score': 85.5},
 {'name': 'cincinnati masters', 'match': 'rome masters', 'score': 85.5},
 {'name': 'hamburg', 'match': 'hamburg masters', 'score': 90.0},
 {'name': 'hamburg masters', 'match': 'hamburg', 'score': 90.0},
 {'name': 'indian wells masters', 'match': 'masters cup', 'score': 85.5},
 {'name': 'indian wells masters', 'match': 'miami masters', 'score': 85.5},
 {'name': 'indian wells masters', 'match': 'paris masters', 'score': 85.5},
 {'name': 'indian wells masters', 'match': 'rome masters', 'score': 85.5},
 {'name': 'london', 'match': 'london olympics', 'score': 90.0},
 {'nam

Check for duplicated match IDs.

In [56]:
match_info_df[match_info_df.duplicated("match_id")]

Unnamed: 0,match_id,tourney_id,tourney_name,tourney_date,tourney_level,surface,match_num,best_of,round


No duplicated match IDs found.

### Action

- There are a lot of close "matches" that look genuine, i.e., containing extra spaces or foreign characters. These should be mapped to the correct name & id.
- There are some that require further manual inspection.
- The Davis cup entries require further processing and cleaning.

### Thoughts

- Surface will probably be a useful feature for modelling.
- Tourney level may be useful, but this information will also be contained within player names and rankings etc., since good players will play in higher level tournaments.
- Best_of may be useful for validation, and also for modelling/evaluating, since better players should win more best of 5 matches.

## Match Outcome Stats

In [44]:
match_outcome_stats_df.head()

Unnamed: 0,match_id,score,minutes
0,0,3-6 7-6(6) 7-6(4),162.0
1,1,6-2 7-5,86.0
2,2,6-1 6-3,64.0
3,3,4-6 6-2 7-5,150.0
4,4,6-1 6-4,60.0


It is clear that the score column is the most important column. We will focus on this column.
There is a large amount of cleaning that can be done.

Below are a number of methods that are then used to clean the score column.

In [46]:
def parse_set_score(set_score: str) -> dict[str, Union[int, str, None]]:
    """
    Parse a single set token such as ``"6-4"`` or ``"7-6(5)"``.

    Parameters:
    - set_score (str): One token of the match score. Either a special marker
      (``RET``, ``DEF``, ``W/O``) or ``"<p1 games>-<p2 games>"``, optionally
      followed by a bracketed tiebreak number.

    Returns:
    - Dict[str, Union[int, str, None]]: A dictionary with the keys
      ``p1`` / ``p2`` (games won by each player, None for a marker),
      ``winner`` (``'p1'``, ``'p2'`` or None when the set is undecided),
      ``tiebreaker`` (the bracketed number, or None) and
      ``status`` (``'complete'``, ``'incomplete'`` or the marker itself).

    Raises:
    - ValueError: If the token is not a marker and does not have the shape
      ``"<int>-<int>"`` or ``"<int>-<int>(<int>)"``.
    """
    set_score = set_score.strip()
    if set_score in {'RET', 'DEF', 'W/O'}:
        return {
            'p1': None,
            'p2': None,
            'winner': None,
            'tiebreaker': None,
            'status': set_score
        }

    tiebreaker = None
    if '(' in set_score:
        # "7-6(5)" -> games part "7-6" and tiebreak number 5.
        set_score, tiebreaker = set_score.split('(')
        set_score = set_score.strip()
        tiebreaker = int(tiebreaker.replace(')', '').strip())
    p1, p2 = map(int, set_score.split('-'))

    leader_games = max(p1, p2)
    margin = abs(p1 - p2)
    # A set is decided once the leader has six games with a two-game margin,
    # or has reached seven games (7-5, 7-6, or a long deciding set).
    # A level score (e.g. 7-7 just before a retirement) is never decided, so
    # it must not be reported as complete with an arbitrary winner.
    is_complete = margin > 0 and (
        (leader_games >= 6 and margin >= 2) or leader_games >= 7
    )

    if is_complete:
        status = 'complete'
        winner = 'p1' if p1 > p2 else 'p2'
    else:
        status = 'incomplete'
        winner = None

    return {
        'p1': p1,
        'p2': p2,
        'winner': winner,
        'tiebreaker': tiebreaker,
        'status': status
    }

def handle_special_marker(parsed_sets: dict[str, Any], i: int, marker: str) -> None:
    """
    Record a special marker (RET, DEF, W/O) in the parsed sets, in place.

    If the set before position ``i`` was left incomplete, the marker replaces
    that set's status. Otherwise set ``i`` itself gets the marker as its
    status and all of its score fields are set to None.

    Parameters:
    - parsed_sets (Dict[str, Any]): The parsed sets dictionary (mutated).
    - i (int): The 1-based position of the marker within the score tokens.
    - marker (str): The special marker (RET, DEF, W/O).
    """
    previous_status_key = f'set_{i-1}_status'
    if i > 1 and parsed_sets[previous_status_key] == 'incomplete':
        # The marker explains why the previous set stopped short.
        parsed_sets[previous_status_key] = marker
        return

    # No unfinished set to attach to: the marker occupies slot ``i`` itself.
    parsed_sets[f'set_{i}_status'] = marker
    for field in ('p1', 'p2', 'tiebreaker', 'winner'):
        parsed_sets[f'set_{i}_{field}'] = None

def ensure_all_sets(parsed_sets: dict[str, Any], match_ended: bool) -> None:
    """
    Pad the parsed sets, in place, so that sets 1 to 5 all have every field.

    Existing values are never overwritten. Missing score fields become None.
    A missing status becomes 'NA' when the match ended early and 'incomplete'
    otherwise.

    Parameters:
    - parsed_sets (Dict[str, Any]): The parsed sets dictionary (mutated).
    - match_ended (bool): Flag indicating if the match ended early.
    """
    # The fallback status depends only on how the match finished.
    fallback_status = 'NA' if match_ended else 'incomplete'
    for set_number in range(1, 6):
        prefix = f'set_{set_number}_'
        for field in ('p1', 'p2', 'winner', 'tiebreaker'):
            parsed_sets.setdefault(prefix + field, None)
        parsed_sets.setdefault(prefix + 'status', fallback_status)

def parse_scores(score: str) -> Optional[dict[str, Any]]:
    """
    Parse the match score string into a structured format.

    The score is split on whitespace into set tokens. Each token is parsed
    with ``parse_set_score`` until a special marker (RET, DEF, W/O) is met,
    which ends the match early. Sets 1 to 5 are then padded so every result
    has the same keys.

    Parameters:
    - score (str): The score string to parse, e.g. ``"6-4 3-6 7-6(5)"``.

    Returns:
    - Optional[Dict[str, Any]]: A dictionary with ``set_<n>_p1``,
      ``set_<n>_p2``, ``set_<n>_tiebreaker``, ``set_<n>_status`` and
      ``set_<n>_winner`` for each set, plus ``match_completed`` (False when a
      special marker was found) and ``match_winner`` (``'p1'``, ``'p2'`` or
      None). Returns None, after printing a message, if the score is not a
      string or a token cannot be parsed.
    """
    # Missing scores arrive from pandas as NaN (a float) or None. Report them
    # like any other unparseable score instead of raising AttributeError.
    if not isinstance(score, str):
        print(f"Error parsing score '{score}': expected a string, got {type(score).__name__}")
        return None

    # str.split() with no argument splits on any whitespace run and never
    # yields empty tokens.
    sets = score.split()
    parsed_sets: dict[str, Any] = {}
    match_ended = False
    set_wins = {'p1': 0, 'p2': 0}

    try:
        for i, set_score in enumerate(sets, 1):
            if set_score in {'RET', 'DEF', 'W/O'}:
                handle_special_marker(parsed_sets, i, set_score)
                match_ended = True
                break

            set_result = parse_set_score(set_score)
            # Only a completed set can have a winner.
            winner = set_result['winner'] if set_result['status'] == 'complete' else None

            parsed_sets[f'set_{i}_p1'] = set_result['p1']
            parsed_sets[f'set_{i}_p2'] = set_result['p2']
            parsed_sets[f'set_{i}_tiebreaker'] = set_result['tiebreaker']
            parsed_sets[f'set_{i}_status'] = set_result['status']
            parsed_sets[f'set_{i}_winner'] = winner

            if winner in set_wins:
                set_wins[winner] += 1

        # Ensure all five sets are represented and handle cases where match ends early
        ensure_all_sets(parsed_sets, match_ended)

        # A match counts as completed unless a special marker cut it short.
        parsed_sets['match_completed'] = not match_ended

        # No winner is derived for matches that ended early or for level set counts.
        if match_ended or set_wins['p1'] == set_wins['p2']:
            parsed_sets['match_winner'] = None
        else:
            parsed_sets['match_winner'] = 'p1' if set_wins['p1'] > set_wins['p2'] else 'p2'

    except (ValueError, TypeError, KeyError) as e:
        # Malformed tokens (e.g. a score mangled into a date such as '3-Jun')
        # are reported and skipped rather than aborting the whole run.
        print(f"Error parsing score '{score}': {e}")
        return None

    return parsed_sets

def process_match_scores(df: pd.DataFrame) -> pd.DataFrame:
    """
    Process the match scores in the DataFrame and add structured match details.

    Rows whose score cannot be parsed (``parse_scores`` returns None) are
    dropped. The input frame is not modified.

    Parameters:
    - df (pd.DataFrame): The DataFrame containing match scores in a ``score`` column.

    Returns:
    - pd.DataFrame: The remaining rows with the ``score`` column replaced by
      the structured columns produced by ``parse_scores``.
    """
    parsed_scores = df['score'].apply(parse_scores).dropna()
    # Keep the original row labels on the parsed frame. join() aligns on the
    # index, so a fresh 0..n-1 index would attach the wrong match's parsed
    # sets to every row after the first dropped score.
    parsed_scores_df = pd.DataFrame(parsed_scores.tolist(), index=parsed_scores.index)
    return (
        df.loc[parsed_scores.index]
        .drop(columns=['score'])
        .join(parsed_scores_df)
    )

In [47]:
# Process the match scores
df = process_match_scores(match_outcome_stats_df)

Error parsing score '3-Jun': invalid literal for int() with base 10: 'Jun'


In [52]:
df.head()

Unnamed: 0,match_id,minutes,set_1_p1,set_1_p2,set_1_tiebreaker,set_1_status,set_1_winner,set_2_p1,set_2_p2,set_2_tiebreaker,...,set_4_winner,set_4_tiebreaker,set_4_status,set_5_p1,set_5_p2,set_5_winner,set_5_tiebreaker,set_5_status,match_completed,match_winner
0,0,162.0,3.0,6.0,,complete,p2,7.0,6.0,6.0,...,,,incomplete,,,,,incomplete,True,p1
1,1,86.0,6.0,2.0,,complete,p1,7.0,5.0,,...,,,incomplete,,,,,incomplete,True,p1
2,2,64.0,6.0,1.0,,complete,p1,6.0,3.0,,...,,,incomplete,,,,,incomplete,True,p1
3,3,150.0,4.0,6.0,,complete,p2,6.0,2.0,,...,,,incomplete,,,,,incomplete,True,p1
4,4,60.0,6.0,1.0,,complete,p1,6.0,4.0,,...,,,incomplete,,,,,incomplete,True,p1


### Action

- Lack of rules information makes it difficult to completely validate data.
- We can merge with other data sets to get more information.
- Validation could be done to check required number of sets are present.
- We don't know the rules so for tiebreaks, we don't know how many points the winning player obtained, i.e. 7, 10 etc.
- We don't know which player Retires or Defaults.
- We don't know rules on final set so cannot do any validation, i.e. is final set a tiebreak or continue until 2 game lead.

### Thoughts

- We probably want to exclude matches that are not completed from our evaluation.
    - Although depending on rules of bookmaker, we may want to separately predict "RET", "DEF", "W/O" outcomes.
- Possibly limited value (at least initially) from score information above, since we don't know the rules, who served first etc.

## Player Info

We are told in the README that player names are unique.

In [50]:
player_info_df.head()

Unnamed: 0,match_id,winner_name,loser_name,winner_age,loser_age,winner_rank,loser_rank,winner_rank_points,loser_rank_points,winner_seed,loser_seed,winner_ioc,loser_ioc,winner_hand,loser_hand
0,0,Antony Dupuis,Andrew Ilie,27.181383,24.035592,113.0,50.0,351.0,762.0,,1.0,FRA,AUS,R,R
1,1,Fernando Gonzalez,Cecil Mamiit,19.756331,23.843943,352.0,139.0,76.0,280.0,,,CHI,PHI,R,R
2,2,Paradorn Srichaphan,Sebastien Lareau,20.881588,27.011636,103.0,133.0,380.0,293.0,,,THA,CAN,R,R
3,3,Jan Siemerink,Justin Gimelstob,30.047912,23.26078,107.0,95.0,371.0,408.0,,8.0,NED,USA,L,R
4,4,Jason Stoltenberg,Alex Lopez Moron,30.075291,29.423682,74.0,111.0,543.0,357.0,4.0,,AUS,ESP,R,R


In [51]:
player_info_df.dtypes

match_id                int64
winner_name            object
loser_name             object
winner_age            float64
loser_age             float64
winner_rank           float64
loser_rank            float64
winner_rank_points    float64
loser_rank_points     float64
winner_seed           float64
loser_seed            float64
winner_ioc             object
loser_ioc              object
winner_hand            object
loser_hand             object
dtype: object

Thoughts:
- Can determine player D.O.B from age and fixture date.
- Age possibly useful but may just be a proxy for "experience" or "fitness".
- Rank/Seed possibly useful but hopefully player names will contain this information.
- Handedness possibly useful for "match up" purposes, but again this will be contained in player names. (since players are unlikely to change handedness)

Look at missing data.

In [53]:
player_info_df.isnull().sum()

match_id                  0
winner_name               0
loser_name                0
winner_age                6
loser_age                 9
winner_rank             629
loser_rank              745
winner_rank_points      629
loser_rank_points       745
winner_seed           25745
loser_seed            36019
winner_ioc                0
loser_ioc                 0
winner_hand               4
loser_hand               14
dtype: int64

Check summary statistics.

In [57]:
player_info_df.describe()

Unnamed: 0,match_id,winner_age,loser_age,winner_rank,loser_rank,winner_rank_points,loser_rank_points,winner_seed,loser_seed
count,47740.0,47734.0,47731.0,47111.0,46995.0,47111.0,46995.0,21995.0,11721.0
mean,23869.5,26.174268,26.292111,60.279616,94.876114,1602.614825,943.653431,7.277336,8.882604
std,13781.495262,3.617728,3.716342,80.106384,127.979022,2032.654654,1075.985612,6.684672,7.259397
min,0.0,15.824778,15.430527,1.0,1.0,1.0,1.0,1.0,1.0
25%,11934.75,23.540041,23.603012,16.0,35.0,585.0,439.0,3.0,4.0
50%,23869.5,26.078029,26.198494,41.0,65.0,935.0,684.0,5.0,7.0
75%,35804.25,28.66256,28.895277,77.0,105.0,1690.0,1065.0,9.0,12.0
max,47739.0,38.313484,46.036961,1890.0,2159.0,16950.0,16950.0,35.0,35.0


Nothing immediately stands out. (i.e. age, rank, points, seeds etc are all positive numbers, nothing absurd as min/max values)

## Player Outcome Stats

In [54]:
player_outcome_stats_df.head()

Unnamed: 0,match_id,player_name,stat,stat_value
0,0,Andrew Ilie,ace,13
1,0,Andrew Ilie,df,4
2,0,Andrew Ilie,svpt,110
3,0,Andrew Ilie,firstin,59
4,0,Andrew Ilie,firstwon,49


In [55]:
player_outcome_stats_df.dtypes

match_id        int64
player_name    object
stat           object
stat_value      int64
dtype: object

Look at unique "stats" columns. Readme provides below: <br>
ace = absolute number of aces <br>
df = number of double faults <br>
svpt = total serve points <br>
firstin = 1st serve in <br>
firstwon = points won on 1st serve <br>
secondwon = points won on 2nd serve <br>
svgms = serve games <br>
bpsaved = break point saved <br>
bpfaced = break point faced <br>


In [63]:
sorted(player_outcome_stats_df['stat'].unique())

['ace',
 'bpfaced',
 'bpsaved',
 'df',
 'firstin',
 'firstwon',
 'secondwon',
 'svgms',
 'svpt']

Transform from long to wide format. Index is combination of match_id and player_name

In [66]:
# Reshape from one row per (match_id, player_name, stat) to one row per
# (match_id, player_name) with one column per stat.
# NOTE: this rebinds player_outcome_stats_df to the wide frame, which no longer
# has the "stat" / "stat_value" columns. Re-running this cell therefore fails
# unless the read_csv cell is re-run first.
player_outcome_stats_df = player_outcome_stats_df.pivot(
    index=["match_id", "player_name"], columns="stat", values="stat_value"
).reset_index()

player_outcome_stats_df.head()

stat,match_id,player_name,ace,bpfaced,bpsaved,df,firstin,firstwon,secondwon,svgms,svpt
0,0,Andrew Ilie,13,4,4,4,59,49,31,17,110
1,0,Antony Dupuis,8,15,14,1,76,56,29,16,126
2,1,Cecil Mamiit,0,9,4,0,24,13,17,10,57
3,1,Fernando Gonzalez,4,6,4,2,35,25,16,10,67
4,2,Paradorn Srichaphan,4,0,0,1,29,23,11,8,46


In [67]:
player_outcome_stats_df.dtypes

stat
match_id        int64
player_name    object
ace             int64
bpfaced         int64
bpsaved         int64
df              int64
firstin         int64
firstwon        int64
secondwon       int64
svgms           int64
svpt            int64
dtype: object

In [68]:
player_outcome_stats_df.describe()

stat,match_id,ace,bpfaced,bpsaved,df,firstin,firstwon,secondwon,svgms,svpt
count,95480.0,95480.0,95480.0,95480.0,95480.0,95480.0,95480.0,95480.0,95480.0,95480.0
mean,23869.5,5.912317,6.84337,4.114977,3.009175,47.565145,33.795957,15.865982,12.317375,78.954807
std,13781.423092,5.215375,4.462851,3.236043,2.435003,18.97337,14.018965,7.223921,4.218137,29.208128
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,11934.75,2.0,4.0,2.0,1.0,34.0,24.0,11.0,9.0,58.0
50%,23869.5,5.0,6.0,4.0,3.0,44.0,32.0,15.0,11.0,74.0
75%,35804.25,8.0,9.0,6.0,4.0,58.0,41.0,20.0,15.0,95.0
max,47739.0,113.0,34.0,25.0,23.0,361.0,292.0,101.0,91.0,491.0


The max values look a bit suspect. We will need to check these.

In [69]:
player_outcome_stats_df[player_outcome_stats_df["ace"] > 100]

stat,match_id,player_name,ace,bpfaced,bpsaved,df,firstin,firstwon,secondwon,svgms,svpt
57562,28781,John Isner,113,3,2,10,361,292,82,90,491
57563,28781,Nicolas Mahut,103,14,12,21,328,284,101,91,489


Isner-Mahut infamous match. We can see that the stats are correct. Exclude this match and check again.


In [72]:
# Drop both player rows of match 28781 (the Isner-Mahut outlier shown above)
# so the summary statistics can be re-checked without it.
is_isner_mahut = player_outcome_stats_df["match_id"].eq(28781)
subset_player_outcome_stats_df = player_outcome_stats_df.loc[~is_isner_mahut]

In [73]:
subset_player_outcome_stats_df.describe()

stat,match_id,ace,bpfaced,bpsaved,df,firstin,firstwon,secondwon,svgms,svpt
count,95478.0,95478.0,95478.0,95478.0,95478.0,95478.0,95478.0,95478.0,95478.0,95478.0
mean,23869.397118,5.910178,6.843336,4.114917,3.008913,47.558925,33.790632,15.864398,12.315738,78.946197
std,13781.549101,5.194407,4.46282,3.235969,2.434227,18.924683,13.970738,7.215567,4.202976,29.147783
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,11934.25,2.0,4.0,2.0,1.0,34.0,24.0,11.0,9.0,58.0
50%,23869.0,5.0,6.0,4.0,3.0,44.0,32.0,15.0,11.0,74.0
75%,35804.75,8.0,9.0,6.0,4.0,58.0,41.0,20.0,15.0,95.0
max,47739.0,75.0,34.0,25.0,23.0,196.0,148.0,65.0,42.0,273.0


In [74]:
player_outcome_stats_df[player_outcome_stats_df["ace"] > 70]

stat,match_id,player_name,ace,bpfaced,bpsaved,df,firstin,firstwon,secondwon,svgms,svpt
57562,28781,John Isner,113,3,2,10,361,292,82,90,491
57563,28781,Nicolas Mahut,103,14,12,21,328,284,101,91,489
95105,47552,Ivo Karlovic,75,4,3,12,140,127,44,42,213


In [77]:
player_info_df[player_info_df['match_id'] == 47552]

Unnamed: 0,match_id,winner_name,loser_name,winner_age,loser_age,winner_rank,loser_rank,winner_rank_points,loser_rank_points,winner_seed,loser_seed,winner_ioc,loser_ioc,winner_hand,loser_hand
47552,47552,Ivo Karlovic,Horacio Zeballos,37.883641,31.723477,21.0,68.0,1795.0,725.0,20.0,,CRO,ARG,R,L


Quick check also seems valid.

### Action

- We could do further validation, i.e. check that the service games for both players agree with the total games in the match outcome.
- Assume data is valid for now.

### Feature Engineering

We aim to transform some of the data with the aim of making it more useful for modelling.

In [82]:
import pandas as pd
import numpy as np



# Safely divide to avoid division by zero
def safe_divide(numerator, denominator):
    """
    Element-wise ``numerator / denominator`` that yields 0 where the denominator is 0.

    Parameters:
    - numerator: Scalar or array-like of numbers (e.g. a pandas Series).
    - denominator: Scalar or array-like of numbers, broadcastable against ``numerator``.

    Returns:
    - np.ndarray: Float array of quotients (0-dimensional for scalar inputs),
      with 0.0 wherever the denominator equals 0. Values are combined by
      position, not by pandas index label.
    """
    numerator = np.asarray(numerator, dtype=float)
    denominator = np.asarray(denominator, dtype=float)
    # Start from zeros and divide only where the denominator is non-zero.
    # The division is never evaluated for zero denominators, so scalar input
    # cannot raise ZeroDivisionError and arrays emit no divide warnings.
    result = np.zeros(np.broadcast(numerator, denominator).shape, dtype=float)
    np.divide(numerator, denominator, out=result, where=denominator != 0)
    return result


def add_transformed_variables(df:pd.DataFrame) -> pd.DataFrame:
    """
    Add derived serve statistics to a wide per-player, per-match stats frame.

    The frame is modified in place and also returned. It must already contain
    the columns ace, df, svpt, firstin, firstwon, secondwon, svgms, bpsaved
    and bpfaced. Every ratio goes through ``safe_divide``, so a zero
    denominator produces 0 rather than inf/NaN.

    Parameters:
    - df (pd.DataFrame): The stats frame to extend (mutated).

    Returns:
    - pd.DataFrame: The same frame with the derived columns added.
    """
    # Counts derived from the raw stats; several ratios below depend on them.
    df['second_serves'] = df['svpt'] - df['firstin']
    df['total_serve_points_won'] = df['firstwon'] + df['secondwon']
    df['total_serve_points_lost'] = df['svpt'] - df['total_serve_points_won']

    # (new column, numerator column, denominator column), in output order.
    ratio_columns = (
        ('first_serve_pct', 'firstin', 'svpt'),
        ('first_serve_win_pct', 'firstwon', 'firstin'),
        ('second_serve_win_pct', 'secondwon', 'second_serves'),
        ('total_serve_win_pct', 'total_serve_points_won', 'svpt'),
        ('df_pct', 'df', 'svpt'),
        ('bpsaved_pct', 'bpsaved', 'bpfaced'),
        ('aces_per_game', 'ace', 'svgms'),
        ('ace_pct', 'ace', 'svpt'),
        ('df_per_game', 'df', 'svgms'),
        ('serve_points_won_per_game', 'total_serve_points_won', 'svgms'),
        ('serve_points_lost_per_game', 'total_serve_points_lost', 'svgms'),
    )
    for target, numerator, denominator in ratio_columns:
        df[target] = safe_divide(df[numerator], df[denominator])
    return df

player_outcome_stats_df = add_transformed_variables(player_outcome_stats_df)
player_outcome_stats_df.head()

stat,match_id,player_name,ace,bpfaced,bpsaved,df,firstin,firstwon,secondwon,svgms,...,aces_per_game,df_per_game,net_serve_performance,bp_performance,serve_points_won_per_game,serve_points_lost_per_game,ace_pct,second_serves,total_serve_points_won,total_serve_points_lost
0,0,Andrew Ilie,13,4,4,4,59,49,31,17,...,0.764706,0.235294,0.719176,4,4.705882,1.764706,0.118182,51,80,30
1,0,Antony Dupuis,8,15,14,1,76,56,29,16,...,0.5,0.0625,0.658421,13,5.3125,2.5625,0.063492,50,85,41
2,1,Cecil Mamiit,0,9,4,0,24,13,17,10,...,0.0,0.0,0.528409,-1,3.0,2.7,0.0,33,30,27
3,1,Fernando Gonzalez,4,6,4,2,35,25,16,10,...,0.4,0.2,0.607143,2,4.1,2.6,0.059701,32,41,26
4,2,Paradorn Srichaphan,4,0,0,1,29,23,11,8,...,0.5,0.125,0.720081,0,4.25,1.5,0.086957,17,34,12


In [83]:
player_outcome_stats_df.columns

Index(['match_id', 'player_name', 'ace', 'bpfaced', 'bpsaved', 'df', 'firstin',
       'firstwon', 'secondwon', 'svgms', 'svpt', 'first_serve_pct',
       'first_serve_win_pct', 'second_serve_win_pct', 'total_serve_win_pct',
       'df_pct', 'bpsaved_pct', 'aces_per_game', 'df_per_game',
       'net_serve_performance', 'bp_performance', 'serve_points_won_per_game',
       'serve_points_lost_per_game', 'ace_pct', 'second_serves',
       'total_serve_points_won', 'total_serve_points_lost'],
      dtype='object', name='stat')

"Inverse" data may be useful, i.e. when player 2 serving how many points does player 1 allow, i.e. measure of player 1's returning ability.