In [32]:
import pandas as pd
import os

curr_dir = os.getcwd()

parent_dir = os.path.dirname(curr_dir) # gtes the name of the parent directory

singles_net_stats_path = os.path.join(parent_dir, 'stats', 'singles_net_stats', 'singles_net_stats2.csv')
df = pd.read_csv(singles_net_stats_path)

In [33]:
from sklearn.preprocessing import LabelEncoder
from collections import defaultdict

In [34]:
# columns that need to be encoded
str_vals = ['tourney_name', 'surface','tourney_level','winner_hand','winner_ioc','loser_hand','loser_ioc','round','winner_name','loser_name','winner_entry','loser_entry']
# array of encoded label encoder objs
label_encoder_variables = []
for variable_name in str_vals:
    var_name = "nlabel_" + variable_name
    globals()[var_name] = LabelEncoder()
    label_encoder_variables.append(globals()[var_name])

In [35]:
label_encoder_variables

[LabelEncoder(),
 LabelEncoder(),
 LabelEncoder(),
 LabelEncoder(),
 LabelEncoder(),
 LabelEncoder(),
 LabelEncoder(),
 LabelEncoder(),
 LabelEncoder(),
 LabelEncoder(),
 LabelEncoder(),
 LabelEncoder()]

In [36]:
# 1
def encode():
    for i in range(len(str_vals)):
        if str_vals[i] in df.columns:
            encoded_number = label_encoder_variables[i].fit_transform(df[str_vals[i]]) # encodes each needed column --> type df
            df[str_vals[i] + "_n"] = encoded_number
    df.to_csv(singles_net_stats_path)


In [37]:
encode()

In [38]:
# 2
def build_player_name_map():
    player_name_map = defaultdict(int)
    
    winners = zip(df['winner_name'], df['winner_name_n'])
    losers = zip(df['loser_name'], df['loser_name_n'])

    for name, encoded in list(winners) + list(losers):
        player_name_map[name] = encoded

    return player_name_map

In [39]:
# 3 
def drop_cols():
    # drop str value columns plus winner name and loser name
    global df
    for col in str_vals: 
        if col in df.columns:
            df = df.drop(str_vals,axis='columns')
            temp_cols = ['score','Unnamed: 0.1','Unnamed: 0']
            for temp_cols1 in temp_cols:
                if temp_cols1 in df.columns:
                    df = df.drop(['winner_rank','loser_rank','score','Unnamed: 0.1','Unnamed: 0'],axis='columns')

In [40]:
# rank diff --> negative = loser rank points > winner rank points --> positive = winner rank points > loser rank points
df['rank_points_diff'] = df['winner_rank_points'] - df['loser_rank_points']
df

Unnamed: 0,Unnamed: 0.16,Unnamed: 0.15,Unnamed: 0.14,Unnamed: 0.13,Unnamed: 0.12,Unnamed: 0.11,Unnamed: 0.10,Unnamed: 0.9,Unnamed: 0.8,Unnamed: 0.7,...,winner_hand_n,winner_ioc_n,loser_hand_n,loser_ioc_n,round_n,winner_name_n,loser_name_n,winner_entry_n,loser_entry_n,rank_points_diff
0,0,0,0,0,0,0,0,0,0,0,...,3,5,3,61,6,1034,5565,10,11,
1,1,1,1,1,1,1,1,1,1,1,...,2,98,3,61,6,2520,4733,10,11,
2,2,2,2,2,2,2,2,2,2,2,...,3,54,3,61,6,649,3587,10,11,
3,3,3,3,3,3,3,3,3,3,3,...,2,81,3,145,6,4090,7071,10,11,
4,4,4,4,4,4,4,4,4,4,4,...,2,124,3,61,6,335,2742,10,11,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
194991,194991,194991,194991,194991,194991,194991,194991,194991,194991,194991,...,3,123,2,82,8,2023,2963,10,11,-26.0
194992,194992,194992,194992,194992,194992,194992,194992,194992,194992,194992,...,2,127,2,109,8,3021,5634,10,11,35.0
194993,194993,194993,194993,194993,194993,194993,194993,194993,194993,194993,...,2,98,2,141,8,2410,4242,10,11,
194994,194994,194994,194994,194994,194994,194994,194994,194994,194994,194994,...,2,127,2,109,8,3021,4085,10,11,-42.0


In [41]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
import numpy as np
import warnings
import joblib
warnings.filterwarnings('ignore')

In [42]:
X = df.drop(columns=[*str_vals, 'winner_name_n', 'score', 'Unnamed: 0', 'Unnamed: 0.1', 'loser_name_n'], errors='ignore')
y = df['winner_name_n']

In [43]:
# 4
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
def train_model():
    global X
    global y
    random_forest_classifier = RandomForestClassifier(n_estimators=100,criterion='entropy',max_depth=10,n_jobs=1,random_state=42, oob_score=True)
    random_forest_classifier.fit(X_train,y_train)
    joblib.dump(random_forest_classifier, 'rf1_model.pkl')
    
    return random_forest_classifier

In [44]:
model = None
if __name__ == "__main__":
    encode()
    build_player_name_map()
    model = train_model()    

In [46]:
oob_score = model.oob_score_
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)

print(f'oob score {oob_score}')
print(f'accuracy score {accuracy}')

oob score 0.6905305264237545
accuracy score 0.7144102564102565
