In [2]:
import os
import pandas as pd

# Load the Data

In [10]:
# load the clean database tables
players_df = pd.read_csv('tennis_players_table.csv')
matches_df = pd.read_csv('tennis_matches_table.csv')
tournaments_df = pd.read_csv('tennis_tournaments_table.csv')

# load the messy odds table that must be linked to the database
odds_df = pd.concat([pd.read_csv('odds_atp.csv'), pd.read_csv('odds_wta.csv')]).drop(columns=['Unnamed: 0'])

  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)


# Parse Odds Player Names

In [11]:
# this function pulls the first initial and last name from a name formatted like: Capp N.
# it applies a few extra steps to attempt to match better the data that is in the database
def parse_name(x):
    
    # db does not have the ' character
    x = x.lower().replace("'", "")
    
    # split the name on whitespace
    parts = x.split(' ')
    
    # simply grab the first initial from the end
    first_initial = parts[-1]
    
    # add back in the white space to the last name
    last_name = ' '.join(parts[:-1])
 
    # FIX: remove the first part of the hypenated last name
    if '-' in last_name:
        last_name = last_name.split('-')[-1]
    
    return first_initial, last_name

def parse_last_name(x):
    return parse_name(x)[1]
def parse_first_initial(x):
    initial = parse_name(x)[0]
    
    # FIX: for multiple initials just take the first one
    initial = initial.split('.')[0]

    return initial

# get all the unique names in the odds dataset
odds_names = list(set(odds_df['Winner'].unique())) + list(set(odds_df['Loser'].unique()))
names_df = pd.DataFrame({'name': odds_names}).drop_duplicates()

# parse out last name and first initial from the names in the odds dataset
names_df['last_name'] = names_df['name'].apply(parse_last_name)
names_df['first_initial'] = names_df['name'].apply(parse_first_initial)
names_df

Unnamed: 0,name,last_name,first_initial
0,Solovieva V.,solovieva,v
1,Troicki V.,troicki,v
2,Elias G.,elias,g
3,Voracova R.,voracova,r
4,Kralert P.,kralert,p
...,...,...,...
4135,Lopes H.,lopes,h
4138,Gigounon G.,gigounon,g
4141,Wachiramanowong K.,wachiramanowong,k
4143,Tan H.,tan,h


# Parse DB Player Names

In [12]:
# this function parses out the first initial from a name formatted like: Noah Capp
def parse_first_initial(x):
    x = x.lower()
    parts = x.split(' ')
    first_name = parts[0]


    if len(first_name) == 0:
        return ''
    else:
        return first_name[0]

# this function parses out the first initial from a name formatted like: Noah Capp
# NOTE: it treats names like Noah El Capp as {Noah},{Van Capp} etc.
# TODO: make more special cases!
def parse_last_name(x):
    x = x.lower()
    parts = x.split(' ')
    extras = ['el', 'van', 'di', 'herazo']
    if len(parts) > 1:
        last_name = parts[-1]
        for extra in extras:
            if extra in parts:
                last_name = extra+' '+last_name
        return last_name
    else:
        print(x)

# parse the last name and first initial from the names as well
players_df['first_initial'] = players_df['name'].apply(parse_first_initial)
players_df['last_name'] = players_df['name'].apply(parse_last_name)
players_df

Unnamed: 0,id,name,hand,height,ioc,dob,league,first_initial,last_name
0,0,Daniil Medvedev,R,198.0,RUS,1996-02-11,atp,d,medvedev
1,1,Jo-Wilfried Tsonga,R,188.0,FRA,1985-04-17,atp,j,tsonga
2,2,Jeremy Chardy,R,188.0,FRA,1987-02-12,atp,j,chardy
3,3,Alex De Minaur,R,183.0,AUS,1999-02-17,atp,a,minaur
4,4,Milos Raonic,R,196.0,CAN,1990-12-27,atp,m,raonic
...,...,...,...,...,...,...,...,...,...
41110,41110,Raminta Mackeviciute,U,,LTU,1971-05-08,wta,r,mackeviciute
41111,41111,Marlene Zuleta,U,,CHI,1964-08-13,wta,m,zuleta
41112,41112,Siri Mittet,U,,NOR,1973-06-30,wta,s,mittet
41113,41113,Tina Vukasovic,U,,SLO,1975-09-06,wta,t,vukasovic


# Replace Odds Names with Player ID

In [13]:
# merge the odds and players datasets using the parsed last names and first initials
# IMPORTANT NOTE: we have multiple players from the database per name in the odds dataset, this is because the only 
#                 identifier for the player in the odds dataset is their name, we must later figure out who it is
#                 based on the match information
name_merge = names_df.merge(
    players_df, 
    on=['last_name', 'first_initial'], 
    how='left', 
    suffixes=['_odds', '_players']
)
print('TODO: num missing players: %d/%d' % (len(name_merge[name_merge['id'].isna()]), len(name_merge)))
print('num players:', len(name_merge), 'num odds names:', len(names_df))

# replace the winner and loser names with the player id
tmp = odds_df.merge(name_merge[['name_odds', 'id']], left_on='Winner', right_on='name_odds', how='left')
tmp = tmp.merge(name_merge[['name_odds', 'id']], left_on='Loser', right_on='name_odds', suffixes=['_winner', '_loser'], how='left')
tmp = tmp.rename(columns={'id_winner': 'winner_id', 'id_loser': 'loser_id'})
tmp = tmp.drop_duplicates()

# NOTE: here is where we see that we now have more matches in the odds dataset after the join
#.       this again is because Noel Capp and Noah Capp will have the same name in the odds dataset for now
print('odds_matches:', len(odds_df), 'odds_matches with ambiguous player id:', len(tmp))

TODO: num missing players: 323/3275
num players: 3275 num odds names: 2521
odds_matches: 86961 odds_matches with ambiguous player id: 132514


In [14]:
odds_df = tmp[~(tmp['winner_id'].isna() | tmp['loser_id'].isna())]
odds_df

Unnamed: 0,ATP,AvgL,AvgW,B&WL,B&WW,B365L,B365W,Best of,CBL,CBW,...,WPts,WRank,Winner,Wsets,Tier,WTA,name_odds_winner,winner_id,name_odds_loser,loser_id
0,1.0,,,,,,,3,,,...,,63,Dosedel S.,2.0,,,Dosedel S.,693.0,Ljubicic I.,815.0
1,1.0,,,,,,,3,,,...,,5,Enqvist T.,2.0,,,Enqvist T.,12694.0,Clement A.,665.0
2,1.0,,,,,,,3,,,...,,40,Escude N.,2.0,,,Escude N.,875.0,Baccanello P.,1198.0
3,1.0,,,,,,,3,,,...,,65,Federer R.,2.0,,,Federer R.,107.0,Knippschild J.,643.0
4,1.0,,,,,,,3,,,...,,81,Fromberg R.,2.0,,,Fromberg R.,765.0,Woodbridge T.,1083.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
132507,,1.94,1.86,,,1.9,1.80,3,,,...,4120.0,7.0,Bencic B.,1.0,Tour Championships,58.0,Bencic B.,36975.0,Bertens K.,36882.0
132508,,2.34,1.60,,,2.37,1.53,3,,,...,3995.0,8.0,Svitolina E.,2.0,Tour Championships,58.0,Svitolina E.,36657.0,Kenin S.,37910.0
132510,,2.49,1.54,,,2.37,1.57,3,,,...,3995.0,8.0,Svitolina E.,1.0,Tour Championships,58.0,Svitolina E.,36657.0,Bencic B.,2999.0
132511,,2.49,1.54,,,2.37,1.57,3,,,...,3995.0,8.0,Svitolina E.,1.0,Tour Championships,58.0,Svitolina E.,36657.0,Bencic B.,36975.0



# Replace Odds Match Info with DB Match ID

In [16]:
# add an ID column based on the table index to be able to quickly find the original rows later on
matches_df['match_id'] = matches_df.index
odds_df['odds_id'] = odds_df.index

# clean up/rename some columns to make joins easier
matches_df['winner_rank'] = matches_df['winner_rank'].apply(lambda x: float(x) if not x is str else pd.NA)
matches_df['loser_rank'] = matches_df['loser_rank'].apply(lambda x: float(x) if not x is str else pd.NA)
odds_df['winner_rank'] = pd.to_numeric(odds_df['WRank'], errors='coerce')
odds_df['loser_rank'] = pd.to_numeric(odds_df['LRank'], errors='coerce')
matches_df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  odds_df['odds_id'] = odds_df.index
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  odds_df['winner_rank'] = pd.to_numeric(odds_df['WRank'], errors='coerce')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  odds_df['loser_rank'] = pd.to_numeric(odds_df['LRank'], errors='coerce')


Unnamed: 0,tourney_id,match_num,winner_id,winner_seed,winner_entry,winner_name,winner_hand,winner_ht,winner_ioc,winner_age,...,l_1stWon,l_2ndWon,l_SvGms,l_bpSaved,l_bpFaced,winner_rank,winner_rank_points,loser_rank,loser_rank_points,match_id
0,2019-M020-World,300,111,2.0,,Kei Nishikori,R,178.0,JPN,29.0,...,34.0,20.0,14.0,10.0,15.0,9.0,3590.0,16.0,1977.0,0
1,2019-M020-World,299,0,4.0,,Daniil Medvedev,R,198.0,RUS,22.8,...,36.0,7.0,10.0,10.0,13.0,16.0,1977.0,239.0,200.0,1
2,2019-M020-World,298,111,2.0,,Kei Nishikori,R,178.0,JPN,29.0,...,15.0,6.0,8.0,1.0,5.0,9.0,3590.0,40.0,1050.0,2
3,2019-M020-World,297,1,,PR,Jo-Wilfried Tsonga,R,188.0,FRA,33.7,...,38.0,9.0,11.0,4.0,6.0,239.0,200.0,31.0,1298.0,3
4,2019-M020-World,296,0,4.0,,Daniil Medvedev,R,198.0,RUS,22.8,...,46.0,19.0,15.0,2.0,4.0,16.0,1977.0,18.0,1855.0,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1054205,1978-W001,1,33657,,,Chris Evert,R,168.0,USA,23.8,...,,,,,,,,,,1054205
1054206,1978-W001,2,33648,,,Michelle Tyler,U,,GBR,20.3,...,,,,,,,,,,1054206
1054207,1978-W001,3,33656,,,Virginia Wade,R,170.0,GBR,33.3,...,,,,,,,,,,1054207
1054208,1978-W001,4,33657,,,Chris Evert,R,168.0,USA,23.8,...,,,,,,,,,,1054208


# Find matches between player id, player rank
This gets ~77k matches

In [17]:
# merge the tables
merge_cols = ['winner_id', 'loser_id', 'winner_rank', 'loser_rank']
merged_df = matches_df.merge(odds_df, on=merge_cols)

# get only the successful merges
temp_df = merged_df[~merged_df['odds_id'].isna()]

# get the unique groups
match_groups = temp_df.groupby(merge_cols)

# get only the groups with 1 match in it, the rest are ambiguous
n = match_groups['odds_id'].nunique().reset_index()
valid = n[n['odds_id'] == 1].drop(columns=['odds_id'])
successful_df = valid.merge(temp_df, how='left')
successful_df

Unnamed: 0,winner_id,loser_id,winner_rank,loser_rank,tourney_id,match_num,winner_seed,winner_entry,winner_name,winner_hand,...,Winner,Wsets,Tier,WTA,name_odds_winner,name_odds_loser,odds_id,winner_rank_points_y,loser_rank_points_y,best_of_y
0,0,1,16.0,140.0,2019-0407-World,294,5.0,,Daniil Medvedev,R,...,Medvedev D.,2.0,,,Medvedev D.,Tsonga J.W.,86299,2160.0,405.0,3
1,0,1,16.0,239.0,2019-M020-World,299,4.0,,Daniil Medvedev,R,...,Medvedev D.,2.0,,,Medvedev D.,Tsonga J.W.,85759,1977.0,200.0,3
2,0,2,16.0,35.0,2019-0407-World,273,5.0,,Daniil Medvedev,R,...,Medvedev D.,2.0,,,Medvedev D.,Chardy J.,86287,2160.0,1180.0,3
3,0,4,16.0,18.0,2019-M020-World,296,4.0,,Daniil Medvedev,R,...,Medvedev D.,2.0,,,Medvedev D.,Raonic M.,85756,1977.0,1855.0,3
4,0,4,32.0,20.0,2018-0329-World,296,,Q,Daniil Medvedev,R,...,Medvedev D.,2.0,,,Medvedev D.,Raonic M.,85356,1287.0,1755.0,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
75606,38463,36947,279.0,65.0,2009-W-INT-MAR-01A-2009,11,,LL,Eva Fernandez Brugues,R,...,Fernandez-Brugues E.,2.0,International,19.0,Fernandez-Brugues E.,Cetkovska P.,96874,176.0,907.0,3
75607,38469,37188,271.0,357.0,2008-W-T2-IND-01A-2008,11,,Q,Agnes Szatmari,R,...,Szatmari A.,2.0,Tier 2,15.0,Szatmari A.,Lakhani I.,93194,94.0,60.0,3
75608,38602,36821,306.0,454.0,2008-W-T3-TUR-01A-2008,9,,Q,Veronika Chvojkova,R,...,Chvojkova V.,2.0,Tier 3,25.0,Chvojkova V.,Ozgen P.,93915,76.0,41.0,3
75609,38702,36866,121.0,51.0,2019-M014,283,,Q,Varvara Gracheva,R,...,Gracheva V.,2.0,Premier,56.0,Gracheva V.,Tomljanovic A.,132456,541.0,1115.0,3


# Clean up the table to write out

In [36]:
out_df = successful_df.drop(columns=[
    'winner_rank', 'loser_rank', 
    'winner_seed', 'loser_seed', 
    'winner_entry', 'loser_entry',
    'winner_name', 'loser_name',
    'winner_hand', 'loser_hand',
    'winner_ht', 'loser_ht',
    'winner_ioc', 'loser_ioc',
    'winner_age', 'loser_age',
    'score', 'best_of_x', 'round', 'minutes', 
    'w_ace', 'l_ace',
    'w_df', 'l_df',
    'w_svpt', 'l_svpt',
    'w_1stIn', 'l_1stIn',
    'w_1stWon', 'l_1stWon',
    'w_2ndWon', 'l_2ndWon',
    'w_SvGms', 'l_SvGms',
    'w_bpSaved', 'l_bpSaved',
    'w_bpFaced', 'l_bpFaced',
    'winner_rank_points_x', 'loser_rank_points_x',
    'match_id', 'odds_id', 
    'ATP', 'Best of', 'Comment',
    'Court', 'Date',
    'Location', 'Loser', 'Winner',
    'Wsets', 'Lsets',
    'L1', 'L2', 'L3', 'L4', 'L5',
    'W1', 'W2', 'W3', 'W4', 'W5',
    'WPts', 'LPts',
    'WRank', 'LRank',
    'Round', 'Series',
    'Surface', 'Tournament',
    'Tier', 'WTA',
    'name_odds_winner', 'name_odds_loser',
    'winner_rank_points_y', 'loser_rank_points_y', 'best_of_y',
    'winner_id', 'loser_id',
])
out_df.to_csv('preprocessed_odds.csv')
out_df

Unnamed: 0,tourney_id,match_num,AvgL,AvgW,B&WL,B&WW,B365L,B365W,CBL,CBW,...,MaxL,MaxW,PSL,PSW,SBL,SBW,SJL,SJW,UBL,UBW
0,2019-0407-World,294,2.33,1.61,,,2.37,1.57,,,...,2.42,1.74,2.31,1.700,,,,,,
1,2019-M020-World,299,2.23,1.67,,,2.3,1.61,,,...,2.46,1.79,2.30,1.710,,,,,,
2,2019-0407-World,273,3.96,1.25,,,4.0,1.22,,,...,4.40,1.30,4.06,1.280,,,,,,
3,2019-M020-World,296,1.79,2.04,,,1.8,2.00,,,...,1.91,2.16,1.80,2.160,,,,,,
4,2018-0329-World,296,1.46,2.73,,,1.44,2.75,,,...,1.53,2.87,1.49,2.870,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
75606,2009-W-INT-MAR-01A-2009,11,,,,,1.11,6.00,,,...,,,,,,,,,1.12,6.0
75607,2008-W-T2-IND-01A-2008,11,,,,,2.75,1.40,,,...,,,3.00,1.455,,,,,,
75608,2008-W-T3-TUR-01A-2008,9,,,,,2.5,1.50,,,...,,,,,,,,,,
75609,2019-M014,283,1.81,1.99,,,1.72,2.00,,,...,1.89,2.10,1.81,2.090,,,,,,
