In [1]:
import pandas as pd
import os

# Locate the stats CSV relative to the directory above the working directory.
curr_dir = os.getcwd()
parent_dir = os.path.dirname(curr_dir)  # path of the directory that contains curr_dir

csv_parts = ('stats', 'singles_net_stats', 'singles_net_stats2.csv')
singles_net_stats_path = os.path.join(parent_dir, *csv_parts)

# Load the match-level stats into the module-level working frame.
df = pd.read_csv(singles_net_stats_path)

In [2]:
from sklearn.preprocessing import LabelEncoder
from collections import defaultdict

In [3]:
# Columns holding string values that need to be label-encoded.
str_vals = [
    'tourney_name', 'surface', 'tourney_level',
    'winner_hand', 'winner_ioc', 'loser_hand', 'loser_ioc',
    'round', 'winner_name', 'loser_name',
    'winner_entry', 'loser_entry',
]
# One LabelEncoder per column, kept in the same order as str_vals. Each encoder
# is also published as a module-level global named "nlabel_<column>".
label_encoder_variables = []
for variable_name in str_vals:
    var_name = "nlabel_" + variable_name
    label_encoder_variables.append(LabelEncoder())
    globals()[var_name] = label_encoder_variables[-1]

In [4]:
# Display the encoder list: one LabelEncoder was appended per entry in str_vals.
label_encoder_variables

[LabelEncoder(),
 LabelEncoder(),
 LabelEncoder(),
 LabelEncoder(),
 LabelEncoder(),
 LabelEncoder(),
 LabelEncoder(),
 LabelEncoder(),
 LabelEncoder(),
 LabelEncoder(),
 LabelEncoder(),
 LabelEncoder()]

In [5]:
# 1
def encode():
    """Label-encode the string columns of ``df`` and write the frame to disk.

    For every name in ``str_vals`` that is a column of the module-level
    ``df``, the encoder at the same position in ``label_encoder_variables``
    is fitted on that column and the resulting codes are stored in a new
    column called ``<column>_n``. Names in ``str_vals`` that are not columns
    of ``df`` are skipped. The frame is then written to
    ``singles_net_stats_path``.

    Returns:
        None. ``df`` gains the ``*_n`` columns in place and the CSV at
        ``singles_net_stats_path`` is overwritten.
    """
    for column, encoder in zip(str_vals, label_encoder_variables):
        if column not in df.columns:
            continue
        df[column + "_n"] = encoder.fit_transform(df[column])

    # NOTE(review): every column has its own encoder, fitted only on that
    # column, so the same player can receive different codes in
    # winner_name_n and loser_name_n -- confirm this is intended.

    # index=False keeps the row index out of the file. Without it, the index is
    # written as an extra unnamed column that comes back as "Unnamed: 0" on the
    # next read_csv, and one more such column is added on every run.
    df.to_csv(singles_net_stats_path, index=False)


In [6]:
# 2
def build_player_name_map():
    """Build a lookup from player name to encoded value.

    Pairs ``winner_name`` with ``winner_name_n`` and ``loser_name`` with
    ``loser_name_n`` from the module-level ``df``. Winner pairs are inserted
    first and loser pairs second, so when a name occurs more than once the
    last pair seen (losers after winners) determines the stored value.

    Returns:
        defaultdict(int): name -> encoded value. Because the default factory
        is ``int``, looking up a name that was never inserted yields 0.
    """
    lookup = defaultdict(int)
    column_pairs = (
        ('winner_name', 'winner_name_n'),
        ('loser_name', 'loser_name_n'),
    )
    for name_col, code_col in column_pairs:
        lookup.update(zip(df[name_col], df[code_col]))
    return lookup

In [7]:
# Add the "<column>_n" encoded columns to df and write the frame to singles_net_stats_path.
encode()
# Last expression in the cell, so the returned name -> code map is displayed.
build_player_name_map()

defaultdict(int,
            {'Emilio Sanchez': 101,
             'Malivai Washington': 235,
             'Jean Philippe Fleurian': 171,
             'Eric Jelen': 102,
             'Chuck Adams': 68,
             'Christian Bergstrom': 64,
             'Jaime Yzaga': 156,
             'Luiz Mattar': 232,
             'Omar Camporese': 293,
             'Lars Jonsson': 222,
             'Grant Connell': 137,
             'Karel Novacek': 209,
             'Alex Antonitsch': 4,
             'Patrik Kuhnen': 303,
             'Marian Vajda': 243,
             'Andrei Chesnokov': 18,
             'Wally Masur': 384,
             'Derrick Rostagno': 92,
             'Martin Jaite': 256,
             'Andrei Cherkasov': 17,
             'Guy Forget': 144,
             'Todd Woodbridge': 372,
             'Fabrice Santoro': 106,
             'Aaron Krickstein': 0,
             'Johan Anderson': 184,
             'Jimmy Arias': 178,
             'Michael Stich': 273,
             'Jakob Hlase

In [8]:
# 3 
def drop_cols():
    """Drop the raw string columns and leftover bookkeeping columns from ``df``.

    Rebinds the module-level ``df`` in two steps:

    1. Every column listed in ``str_vals`` that is present is removed.
    2. If step 1 removed anything and at least one of ``score``,
       ``Unnamed: 0.1`` or ``Unnamed: 0`` is present, those columns and
       ``winner_rank`` / ``loser_rank`` are removed as well.

    Only columns that actually exist are passed to ``DataFrame.drop``, so a
    frame that contains just some of the listed columns no longer raises
    ``KeyError``. When none of the ``str_vals`` columns are present the frame
    is left untouched.

    Returns:
        None. The result is stored in the global ``df``.
    """
    global df

    present_str_cols = [col for col in str_vals if col in df.columns]
    if not present_str_cols:
        return
    df = df.drop(columns=present_str_cols)

    # The rank columns are only removed together with these marker columns.
    marker_cols = ['score', 'Unnamed: 0.1', 'Unnamed: 0']
    if any(col in df.columns for col in marker_cols):
        extra_cols = ['winner_rank', 'loser_rank', *marker_cols]
        df = df.drop(columns=[col for col in extra_cols if col in df.columns])

In [9]:
# Rank-points gap for each match, computed as winner minus loser:
#   positive -> the winner had more rank points than the loser
#   negative -> the loser had more rank points than the winner
df['rank_points_diff'] = df['winner_rank_points'].sub(df['loser_rank_points'])
df

Unnamed: 0.1,Unnamed: 0,tourney_name,surface,draw_size,tourney_level,winner_seed,winner_name,winner_hand,winner_ht,winner_ioc,...,diff_age,diff_ace,diff_df,diff_svpt,diff_1stIn,diff_1stWon,diff_2ndWon,diff_SvGms,diff_bpFaced,diff_bpSaved
0,0,Auckland,Hard,32.0,A,1.0,Emilio Sanchez,R,180.0,ESP,...,-5.0,2.0,0.0,-7.0,-7.0,-13.0,0.0,-1.0,0.0,-3.0
1,1,Auckland,Hard,32.0,A,,Malivai Washington,R,180.0,USA,...,-10.3,1.0,-6.0,0.0,-5.0,-5.0,14.0,1.0,-9.0,-6.0
2,2,Auckland,Hard,32.0,A,,Jean Philippe Fleurian,R,185.0,FRA,...,3.7,1.0,1.0,12.0,12.0,11.0,2.0,1.0,-4.0,-2.0
3,3,Auckland,Hard,32.0,A,,Eric Jelen,R,180.0,GER,...,2.0,-3.0,-1.0,-14.0,-6.0,-3.0,-1.0,0.0,-2.0,-2.0
4,4,Auckland,Hard,32.0,A,,Chuck Adams,R,185.0,USA,...,-0.9,3.0,1.0,16.0,21.0,13.0,0.0,1.0,-4.0,-2.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4500,4500,Wimbledon,Grass,128.0,G,12.0,Andre Agassi,R,180.0,USA,...,-6.3,-4.0,1.0,4.0,2.0,5.0,6.0,1.0,-9.0,-5.0
4501,4501,Wimbledon,Grass,128.0,G,,Derrick Rostagno,R,185.0,USA,...,-2.0,-5.0,3.0,7.0,3.0,-12.0,-5.0,-1.0,5.0,0.0
4502,4502,Wimbledon,Grass,128.0,G,,Christian Saceanu,R,190.0,GER,...,-0.9,-3.0,2.0,-7.0,-24.0,-19.0,10.0,0.0,-6.0,-5.0
4503,4503,Wimbledon,Grass,128.0,G,,Jakob Hlasek,R,188.0,SUI,...,3.2,0.0,-9.0,-21.0,7.0,11.0,-14.0,0.0,-4.0,-5.0


In [10]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
import numpy as np
import warnings
import joblib
# Suppress all Python warnings from this point on.
warnings.filterwarnings('ignore')

In [11]:
# Feature matrix: everything except the raw string columns, the encoded player
# names, the score and the leftover index columns. errors='ignore' skips any of
# these names that are not columns of df.
non_feature_cols = list(str_vals) + [
    'winner_name_n', 'score', 'Unnamed: 0', 'Unnamed: 0.1', 'loser_name_n',
]
X = df.drop(columns=non_feature_cols, errors='ignore')

# Target: the encoded winner name.
y = df['winner_name_n']

In [12]:
# 4
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)


def train_model():
    """Fit a random forest on the training split and save it to disk.

    Reads the module-level ``X_train`` and ``y_train``, fits a
    ``RandomForestClassifier`` with out-of-bag scoring enabled, and dumps the
    fitted estimator to ``'rf_bin_model.pkl'`` (a relative path) with joblib.

    Returns:
        The fitted ``RandomForestClassifier``.
    """
    forest_params = dict(
        n_estimators=110,
        criterion='entropy',
        max_depth=15,
        n_jobs=1,
        random_state=42,
        oob_score=True,
    )
    forest = RandomForestClassifier(**forest_params)
    forest.fit(X_train, y_train)
    joblib.dump(forest, 'rf_bin_model.pkl')
    return forest

In [13]:
# Bind `model` up front so the name exists even if the guarded block below does not run.
model = None
if __name__ == "__main__":
    # Encode the string columns again; encode() also rewrites the CSV at singles_net_stats_path.
    encode()
    # The returned name map is not stored here.
    build_player_name_map()
    # Fit the random forest and keep the fitted estimator for later cells.
    model = train_model()    

In [14]:
# Accuracy on the held-out split.
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)

# Out-of-bag score recorded by the forest while fitting (oob_score=True).
oob_score = model.oob_score_

print(f'oob score {oob_score}')
print(f'accuracy score {accuracy}')

oob score 0.8512763596004439
accuracy score 0.9100998890122086


In [15]:
# Display the working DataFrame.
df

Unnamed: 0.1,Unnamed: 0,tourney_name,surface,draw_size,tourney_level,winner_seed,winner_name,winner_hand,winner_ht,winner_ioc,...,diff_age,diff_ace,diff_df,diff_svpt,diff_1stIn,diff_1stWon,diff_2ndWon,diff_SvGms,diff_bpFaced,diff_bpSaved
0,0,Auckland,Hard,32.0,A,1.0,Emilio Sanchez,R,180.0,ESP,...,-5.0,2.0,0.0,-7.0,-7.0,-13.0,0.0,-1.0,0.0,-3.0
1,1,Auckland,Hard,32.0,A,,Malivai Washington,R,180.0,USA,...,-10.3,1.0,-6.0,0.0,-5.0,-5.0,14.0,1.0,-9.0,-6.0
2,2,Auckland,Hard,32.0,A,,Jean Philippe Fleurian,R,185.0,FRA,...,3.7,1.0,1.0,12.0,12.0,11.0,2.0,1.0,-4.0,-2.0
3,3,Auckland,Hard,32.0,A,,Eric Jelen,R,180.0,GER,...,2.0,-3.0,-1.0,-14.0,-6.0,-3.0,-1.0,0.0,-2.0,-2.0
4,4,Auckland,Hard,32.0,A,,Chuck Adams,R,185.0,USA,...,-0.9,3.0,1.0,16.0,21.0,13.0,0.0,1.0,-4.0,-2.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4500,4500,Wimbledon,Grass,128.0,G,12.0,Andre Agassi,R,180.0,USA,...,-6.3,-4.0,1.0,4.0,2.0,5.0,6.0,1.0,-9.0,-5.0
4501,4501,Wimbledon,Grass,128.0,G,,Derrick Rostagno,R,185.0,USA,...,-2.0,-5.0,3.0,7.0,3.0,-12.0,-5.0,-1.0,5.0,0.0
4502,4502,Wimbledon,Grass,128.0,G,,Christian Saceanu,R,190.0,GER,...,-0.9,-3.0,2.0,-7.0,-24.0,-19.0,10.0,0.0,-6.0,-5.0
4503,4503,Wimbledon,Grass,128.0,G,,Jakob Hlasek,R,188.0,SUI,...,3.2,0.0,-9.0,-21.0,7.0,11.0,-14.0,0.0,-4.0,-5.0
