``` Before running this notebook, review every commented-out function call and decide whether you actually need to run it. This notebook alters the contents of the dataset (it drops and adds rows and columns). The add_binary_col function uses randomization, so re-running it may change the previously obtained accuracy of the model.```

In [1]:
import pandas as pd 
import numpy as np
import os

In [2]:
# Locate the stats CSV relative to the directory above the current working directory.
curr_dir = os.getcwd()
parent_dir = os.path.split(curr_dir)[0]  # gets the parent directory of the working directory

csv_path_parts = ('stats', 'singles_net_stats', 'singles_net_stats2.csv')
singles_net_stats_path = os.path.join(parent_dir, *csv_path_parts)

# Load the singles match statistics that the rest of the notebook transforms.
df = pd.read_csv(singles_net_stats_path)

In [3]:
def drop_unnamed(df):
    """Return ``df`` without the columns whose label starts with ``'Unnamed'``.

    The input frame is not modified; a column-filtered selection is returned.
    """
    unnamed_mask = df.columns.str.startswith('Unnamed')
    named_only = df.loc[:, ~unnamed_mask]
    return named_only

In [4]:
df = drop_unnamed(df)

# Raw match statistics that must be present for a row to be kept.
required_raw_cols = [
    'winner_ht', 'loser_ht',
    'winner_age', 'loser_age',
    'score', 'minutes',
    'w_df', 'w_svpt', 'w_1stIn', 'w_2ndWon', 'w_bpSaved', 'w_bpFaced',
    'l_df', 'l_svpt', 'l_1stIn', 'l_2ndWon', 'l_bpSaved', 'l_bpFaced',
    'rank_points_diff',
    'winner_rank', 'loser_rank',
]

# dropna defaults (axis=0, how='any'): drop every row with a NaN in any listed column.
df = df.dropna(subset=required_raw_cols)
df

Unnamed: 0,tourney_name,surface,draw_size,tourney_level,winner_seed,winner_name,winner_hand,winner_ht,winner_ioc,winner_age,...,diff_age,diff_ace,diff_df,diff_svpt,diff_1stIn,diff_1stWon,diff_2ndWon,diff_SvGms,diff_bpFaced,diff_bpSaved
0,Auckland,Hard,32.0,A,1.0,Emilio Sanchez,R,180.0,ESP,25.6,...,-2.7,0.0,-1.0,0.0,2.0,1.0,1.0,1.0,2.0,3.0
1,Auckland,Hard,32.0,A,,Malivai Washington,R,180.0,USA,21.5,...,-10.0,0.0,-1.0,-3.0,-10.0,-1.0,9.0,1.0,-6.0,-3.0
2,Auckland,Hard,32.0,A,,Jean Philippe Fleurian,R,185.0,FRA,25.3,...,10.5,1.0,-1.0,0.0,9.0,8.0,-4.0,1.0,1.0,2.0
3,Auckland,Hard,32.0,A,,Eric Jelen,R,180.0,GER,25.8,...,2.7,2.0,2.0,11.0,8.0,0.0,-4.0,0.0,8.0,4.0
4,Auckland,Hard,32.0,A,,Chuck Adams,R,185.0,USA,19.7,...,1.0,-11.0,3.0,14.0,10.0,-5.0,3.0,0.0,3.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4500,Wimbledon,Grass,128.0,G,12.0,Andre Agassi,R,180.0,USA,22.1,...,-3.2,-1.0,1.0,2.0,-3.0,6.0,6.0,1.0,-6.0,-3.0
4501,Wimbledon,Grass,128.0,G,,Derrick Rostagno,R,185.0,USA,26.6,...,2.2,1.0,0.0,7.0,5.0,13.0,2.0,1.0,-1.0,2.0
4502,Wimbledon,Grass,128.0,G,,Christian Saceanu,R,190.0,GER,23.9,...,-2.1,-1.0,3.0,18.0,2.0,-7.0,5.0,0.0,7.0,3.0
4503,Wimbledon,Grass,128.0,G,,Jakob Hlasek,R,188.0,SUI,27.6,...,10.3,1.0,0.0,-32.0,-36.0,-14.0,8.0,0.0,-7.0,-3.0


In [5]:
import random

In [6]:
def add_binary_col(df, seed=None):
    """Randomly label one player of each match as p1 and record whether p1 won.

    For every row a coin is flipped: on heads (draw > 0.5) the winner becomes
    ``p1`` and ``winner_p1`` is 1; otherwise the loser becomes ``p1`` and
    ``winner_p1`` is 0. ``p2`` is always the other player.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the ``winner_name_n`` and ``loser_name_n`` columns.
        The frame is modified in place.
    seed : int or None, optional
        When given, a private ``random.Random(seed)`` generator is used so the
        assignment is reproducible. When ``None`` (the default) the global
        ``random`` module is used, one draw per row in row order, so an
        earlier ``random.seed(...)`` call keeps its effect.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object with ``winner_p1``, ``p1`` and ``p2`` added
        (or overwritten).

    Raises
    ------
    KeyError
        If ``winner_name_n`` or ``loser_name_n`` is missing.
    """
    rng = random if seed is None else random.Random(seed)

    # One draw per row, in row order.
    p1_is_winner = np.array(
        [rng.random() > 0.5 for _ in range(len(df))], dtype=bool
    )

    winner_names = df['winner_name_n'].to_numpy()
    loser_names = df['loser_name_n'].to_numpy()

    # Plain arrays are assigned positionally, so the result does not depend on
    # the frame's index labels and also works for an empty frame.
    df['winner_p1'] = p1_is_winner.astype('int64')
    df['p1'] = np.where(p1_is_winner, winner_names, loser_names)
    df['p2'] = np.where(p1_is_winner, loser_names, winner_names)
    return df

In [7]:
# Left commented out: add_binary_col assigns p1/p2 at random, so running it
# again reshuffles which player is p1. Presumably the loaded CSV already holds
# the winner_p1 / p1 / p2 columns — uncomment only if it does not.
# df = add_binary_col(df)
df

Unnamed: 0,tourney_name,surface,draw_size,tourney_level,winner_seed,winner_name,winner_hand,winner_ht,winner_ioc,winner_age,...,diff_age,diff_ace,diff_df,diff_svpt,diff_1stIn,diff_1stWon,diff_2ndWon,diff_SvGms,diff_bpFaced,diff_bpSaved
0,Auckland,Hard,32.0,A,1.0,Emilio Sanchez,R,180.0,ESP,25.6,...,-2.7,0.0,-1.0,0.0,2.0,1.0,1.0,1.0,2.0,3.0
1,Auckland,Hard,32.0,A,,Malivai Washington,R,180.0,USA,21.5,...,-10.0,0.0,-1.0,-3.0,-10.0,-1.0,9.0,1.0,-6.0,-3.0
2,Auckland,Hard,32.0,A,,Jean Philippe Fleurian,R,185.0,FRA,25.3,...,10.5,1.0,-1.0,0.0,9.0,8.0,-4.0,1.0,1.0,2.0
3,Auckland,Hard,32.0,A,,Eric Jelen,R,180.0,GER,25.8,...,2.7,2.0,2.0,11.0,8.0,0.0,-4.0,0.0,8.0,4.0
4,Auckland,Hard,32.0,A,,Chuck Adams,R,185.0,USA,19.7,...,1.0,-11.0,3.0,14.0,10.0,-5.0,3.0,0.0,3.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4500,Wimbledon,Grass,128.0,G,12.0,Andre Agassi,R,180.0,USA,22.1,...,-3.2,-1.0,1.0,2.0,-3.0,6.0,6.0,1.0,-6.0,-3.0
4501,Wimbledon,Grass,128.0,G,,Derrick Rostagno,R,185.0,USA,26.6,...,2.2,1.0,0.0,7.0,5.0,13.0,2.0,1.0,-1.0,2.0
4502,Wimbledon,Grass,128.0,G,,Christian Saceanu,R,190.0,GER,23.9,...,-2.1,-1.0,3.0,18.0,2.0,-7.0,5.0,0.0,7.0,3.0
4503,Wimbledon,Grass,128.0,G,,Jakob Hlasek,R,188.0,SUI,27.6,...,10.3,1.0,0.0,-32.0,-36.0,-14.0,8.0,0.0,-7.0,-3.0


In [8]:
def compute_diffence_stats(df):
    """Add ``diff_*`` columns holding player-1 minus player-2 match statistics.

    A row's p1 is the winner when ``winner_name_n`` equals ``p1``; in every
    other case (including names that do not compare equal because they are
    missing) p1 is treated as the loser. Each ``diff_<stat>`` value is p1's
    statistic minus p2's statistic.

    The differences are computed and assigned positionally. Building fresh
    ``pd.Series`` objects with a 0..n-1 index and assigning them would align
    on index labels, which puts values on the wrong rows (or NaN) whenever
    ``df`` has a non-default index, e.g. after ``dropna``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``winner_name_n``, ``p1`` and every winner/loser column
        listed in ``stat_columns`` below. The frame is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object with the ``diff_*`` columns added or
        overwritten.

    Raises
    ------
    KeyError
        If any required column is missing.
    """
    # diff column -> (winner-side column, loser-side column)
    stat_columns = {
        'diff_ht': ('winner_ht', 'loser_ht'),
        'diff_age': ('winner_age', 'loser_age'),
        'diff_ace': ('w_ace', 'l_ace'),
        'diff_df': ('w_df', 'l_df'),
        'diff_svpt': ('w_svpt', 'l_svpt'),
        'diff_1stIn': ('w_1stIn', 'l_1stIn'),
        'diff_1stWon': ('w_1stWon', 'l_1stWon'),
        'diff_2ndWon': ('w_2ndWon', 'l_2ndWon'),
        'diff_SvGms': ('w_SvGms', 'l_SvGms'),
        'diff_bpFaced': ('w_bpFaced', 'l_bpFaced'),
        'diff_bpSaved': ('w_bpSaved', 'l_bpSaved'),
    }

    # True where p1 is the match winner; NaN names compare unequal -> False.
    p1_is_winner = (df['winner_name_n'] == df['p1']).to_numpy()

    for diff_col, (winner_col, loser_col) in stat_columns.items():
        winner_vals = df[winner_col].to_numpy()
        loser_vals = df[loser_col].to_numpy()
        # p1 - p2: winner - loser when p1 won, loser - winner otherwise.
        df[diff_col] = np.where(
            p1_is_winner, winner_vals - loser_vals, loser_vals - winner_vals
        )

    return df


In [9]:
# Recompute the diff_* columns from the p1 column currently in df; any
# existing diff_* values are overwritten.
df = compute_diffence_stats(df)
df

Unnamed: 0,tourney_name,surface,draw_size,tourney_level,winner_seed,winner_name,winner_hand,winner_ht,winner_ioc,winner_age,...,diff_age,diff_ace,diff_df,diff_svpt,diff_1stIn,diff_1stWon,diff_2ndWon,diff_SvGms,diff_bpFaced,diff_bpSaved
0,Auckland,Hard,32.0,A,1.0,Emilio Sanchez,R,180.0,ESP,25.6,...,-5.0,2.0,0.0,-7.0,-7.0,-13.0,0.0,-1.0,0.0,-3.0
1,Auckland,Hard,32.0,A,,Malivai Washington,R,180.0,USA,21.5,...,10.3,-1.0,6.0,0.0,5.0,5.0,-14.0,-1.0,9.0,6.0
2,Auckland,Hard,32.0,A,,Jean Philippe Fleurian,R,185.0,FRA,25.3,...,-3.7,-1.0,-1.0,-12.0,-12.0,-11.0,-2.0,-1.0,4.0,2.0
3,Auckland,Hard,32.0,A,,Eric Jelen,R,180.0,GER,25.8,...,-2.0,3.0,1.0,14.0,6.0,3.0,1.0,0.0,2.0,2.0
4,Auckland,Hard,32.0,A,,Chuck Adams,R,185.0,USA,19.7,...,0.9,-3.0,-1.0,-16.0,-21.0,-13.0,0.0,-1.0,4.0,2.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4500,Wimbledon,Grass,128.0,G,12.0,Andre Agassi,R,180.0,USA,22.1,...,6.3,4.0,-1.0,-4.0,-2.0,-5.0,-6.0,-1.0,9.0,5.0
4501,Wimbledon,Grass,128.0,G,,Derrick Rostagno,R,185.0,USA,26.6,...,-2.0,-5.0,3.0,7.0,3.0,-12.0,-5.0,-1.0,5.0,0.0
4502,Wimbledon,Grass,128.0,G,,Christian Saceanu,R,190.0,GER,23.9,...,-0.9,-3.0,2.0,-7.0,-24.0,-19.0,10.0,0.0,-6.0,-5.0
4503,Wimbledon,Grass,128.0,G,,Jakob Hlasek,R,188.0,SUI,27.6,...,-3.2,0.0,9.0,21.0,-7.0,-11.0,14.0,0.0,4.0,5.0


In [10]:
# Drop every row that has a missing value in any required raw statistic or in
# any of the derived diff_* columns.
df = df.dropna(axis=0, how='any', subset=[
    'winner_ht',
    'loser_ht',
    'winner_age',
    'loser_age',
    'score',
    'minutes',
    'w_df',
    'w_svpt',
    'w_1stIn',
    'w_2ndWon',
    'w_bpSaved',
    'w_bpFaced',
    'l_df',
    'l_svpt',
    'l_1stIn',
    'l_2ndWon',
    'l_bpSaved',
    'l_bpFaced',
    'rank_points_diff',
    'winner_rank',
    'loser_rank',
    'diff_ht',
    'diff_age',
    'diff_ace',  # previously a second 'diff_age', which left diff_ace unchecked
    'diff_df',
    'diff_svpt',
    'diff_1stIn',
    'diff_1stWon',
    'diff_2ndWon',
    'diff_SvGms',
    'diff_bpFaced',
    'diff_bpSaved'
])
df

Unnamed: 0,tourney_name,surface,draw_size,tourney_level,winner_seed,winner_name,winner_hand,winner_ht,winner_ioc,winner_age,...,diff_age,diff_ace,diff_df,diff_svpt,diff_1stIn,diff_1stWon,diff_2ndWon,diff_SvGms,diff_bpFaced,diff_bpSaved
0,Auckland,Hard,32.0,A,1.0,Emilio Sanchez,R,180.0,ESP,25.6,...,-5.0,2.0,0.0,-7.0,-7.0,-13.0,0.0,-1.0,0.0,-3.0
1,Auckland,Hard,32.0,A,,Malivai Washington,R,180.0,USA,21.5,...,10.3,-1.0,6.0,0.0,5.0,5.0,-14.0,-1.0,9.0,6.0
2,Auckland,Hard,32.0,A,,Jean Philippe Fleurian,R,185.0,FRA,25.3,...,-3.7,-1.0,-1.0,-12.0,-12.0,-11.0,-2.0,-1.0,4.0,2.0
3,Auckland,Hard,32.0,A,,Eric Jelen,R,180.0,GER,25.8,...,-2.0,3.0,1.0,14.0,6.0,3.0,1.0,0.0,2.0,2.0
4,Auckland,Hard,32.0,A,,Chuck Adams,R,185.0,USA,19.7,...,0.9,-3.0,-1.0,-16.0,-21.0,-13.0,0.0,-1.0,4.0,2.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4500,Wimbledon,Grass,128.0,G,12.0,Andre Agassi,R,180.0,USA,22.1,...,6.3,4.0,-1.0,-4.0,-2.0,-5.0,-6.0,-1.0,9.0,5.0
4501,Wimbledon,Grass,128.0,G,,Derrick Rostagno,R,185.0,USA,26.6,...,-2.0,-5.0,3.0,7.0,3.0,-12.0,-5.0,-1.0,5.0,0.0
4502,Wimbledon,Grass,128.0,G,,Christian Saceanu,R,190.0,GER,23.9,...,-0.9,-3.0,2.0,-7.0,-24.0,-19.0,10.0,0.0,-6.0,-5.0
4503,Wimbledon,Grass,128.0,G,,Jakob Hlasek,R,188.0,SUI,27.6,...,-3.2,0.0,9.0,21.0,-7.0,-11.0,14.0,0.0,4.0,5.0


In [11]:
# df = df.drop(['winner_entry', 'loser_entry'], axis='columns')
# df

KeyError: "['winner_entry', 'loser_entry'] not found in axis"

In [None]:
# Writes the transformed frame back to the same path it was loaded from,
# overwriting the input CSV.
# NOTE(review): to_csv writes the index by default, which adds an unnamed
# leading column on the next read (presumably why drop_unnamed is called after
# loading). Consider index=False — confirm no other reader relies on that column.
df.to_csv(singles_net_stats_path)