In [121]:
from sklearn.model_selection import GridSearchCV, cross_val_score, train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler, LabelEncoder

# Import scores
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score
from sklearn.metrics import f1_score


# Import classifiers3
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression, RidgeClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC

import pandas as pd
import numpy as np
from datetime import datetime,date
import seaborn as sns
import matplotlib.pyplot as plt

from data_prep_db import data_import_db_simple,select_by_name

import warnings
warnings.filterwarnings("ignore")

# Importing Data

In [74]:
### LOADING DATA
db_loc = r'tennis_atp.db'

sql = """SELECT * FROM MATCHES WHERE tourney_level NOT IN ('C','S','F','D','P','PM','I','E','J','T')
    AND tourney_name NOT LIKE '%Olympics%'
    AND tourney_name NOT LIKE '%Cup%'
    AND tourney_name NOT LIKE '%Finals%';"""

round_map = {'RR':7,'R128':6,'R64':5,'R32':4,'R16':3,'QF':2,'SF':1,'F':0}
points_map = {'A':250,'M':1000,'G':2000}
# transform matches and return view
df_matches = data_import_db_simple(db_loc,sql)
df_matches['tourney_date'] = pd.to_datetime(df_matches['tourney_date'],format='%Y%m%d')
df_matches['tourney_points'] = df_matches['tourney_level'].map(points_map).fillna(0)
df_matches['round_level'] = df_matches['round'].map(round_map).fillna(7)
    
# read players and create view
df_players = data_import_db_simple(db_loc,"SELECT * FROM PLAYERS;")
df_players['name'] = df_players['name_first'] + " " + df_players['name_last']
df_players['dob'] = pd.to_datetime(df_players['dob'],format='%Y%m%d',errors='coerce')

# read rankings and create view
df_rankings = data_import_db_simple(db_loc,"SELECT * FROM RANKINGS;")
df_rankings['ranking_date'] = pd.to_datetime(df_rankings['ranking_date'],format='%Y%m%d')


In [3]:
tourns_dict = {}
for i, row in df_matches.groupby(['tourney_name','surface','tourney_points','best_of','draw_size']):
    key = i[0]
    dates = row['tourney_date'].to_list()
    values = list(i[1:])
    keys = ['surface','tourney_points','best_of','draw_size','tourney_dates']
    values.append(dates)
    zip_val = dict(zip(keys,values))
    tourns_dict[key] = zip_val
    tourns_dict[key]

In [4]:
players_dict = {}
for i, row in df_players.groupby(['player_id']):
    name = row['name'].iloc[0]
    players_dict[name] = row.to_dict('records')[0]

# Preparing Data

In [6]:
# select variables to keep
matches = df_matches.copy()

# get data for all matches available
matches['order'] = np.where(matches['winner_rank']<matches['loser_rank'],'p1','p2')
matches['winner_id_final'] = matches['winner_id']

matches = matches[['id','order','winner_id_final',
    'tourney_name','tourney_date','round_level','surface','tourney_points','best_of','draw_size',
    'winner_id', 'winner_hand', 'winner_ht', 'winner_ioc', 'winner_age','winner_rank',
    'loser_id', 'loser_hand','loser_ht', 'loser_ioc', 'loser_age','loser_rank', 
    # 'best_of', 'minutes', 
    # 'w_ace', 'w_df', 'w_svpt', 'w_1stIn', 'w_1stWon', 'w_2ndWon','w_SvGms', 'w_bpSaved', 'w_bpFaced',
    # 'l_ace', 'l_df', 'l_svpt','l_1stIn', 'l_1stWon', 'l_2ndWon', 'l_SvGms', 'l_bpSaved', 'l_bpFaced',
    # 'winner_rank_points', 'loser_rank_points',
    ]]

In [7]:
hand_map = {'R':0,'L':1,'U':-1}
surface_map = {'Hard':0,'Clay':1,'Grass':2,'Carpet':4}

losers_ioc = matches['loser_ioc']
winners_ioc = matches['winner_ioc']
all_countries = pd.concat([losers_ioc,winners_ioc])
all_countries = all_countries.unique()

le = LabelEncoder()
a = le.fit_transform(all_countries)
countries_map = {}
countries_map = dict(zip(all_countries,a))


matches['winner_hand'] = matches['winner_hand'].map(hand_map).fillna(-1)
matches['loser_hand'] = matches['loser_hand'].map(hand_map).fillna(-1)
matches['surface'] = matches['surface'].map(surface_map).fillna(-1)

# matches['winner_entry'] = np.where(matches['winner_entry'].isna(),1,0)
# matches['loser_entry'] = np.where(matches['loser_entry'].isna(),1,0)

matches['loser_ioc'] = matches['loser_ioc'].map(countries_map).fillna(-1)
matches['winner_ioc'] = matches['winner_ioc'].map(countries_map).fillna(-1)

In [8]:
vars_feats = ['id','order','winner_id_final','tourney_name','tourney_date','round_level','surface','tourney_points','best_of','draw_size']

players_data = matches.melt(id_vars=vars_feats)


players_data[['winner_loser','col_name']] = players_data['variable'].str.split("_",expand=True,n=1)

players_data['col_name'] = np.where(players_data['col_name'].isna(),players_data['winner_loser'],players_data['col_name'])

cond_1 = (players_data['winner_loser']=='winner') & (players_data['order']=='p1')
cond_2 = (players_data['winner_loser']=='loser') & (players_data['order']=='p2')

players_data['final_name'] = np.where(cond_1|cond_2,'p1_' + players_data['col_name'],'p2_' + players_data['col_name'])

vars_feats = [v for v in vars_feats if v!='order']
model_basis = players_data.pivot_table(index=vars_feats,columns='final_name',values='value').reset_index()

model_basis['result'] = np.where(model_basis['winner_id_final']==model_basis['p1_id'],1,0)

In [218]:
all_players_id = players_data[players_data['variable'].isin(['winner_id','loser_id'])]['value'].unique()

# calculate kpis
kpis = pd.DataFrame()
for p_id in all_players_id:
    cond = (model_basis['p1_id']==p_id) | (model_basis['p2_id']==p_id)
    games_for_p = model_basis[cond].copy()

    games_for_p['win'] = np.where(games_for_p['winner_id_final']==p_id,1,0)
    games_for_p['loss'] = np.where(games_for_p['winner_id_final']!=p_id,1,0)
    games_for_p = games_for_p.sort_values(by=['tourney_date','round_level'])
    
    gp = games_for_p.groupby(['id','tourney_date']).agg({'win':'sum','loss':'sum'}).reset_index()

    gp['win_loss_ratio_start'] = np.where(gp['loss'].cumsum()==0,np.nan,gp['win'].cumsum()/gp['loss'].cumsum())

    gp['win_last_10'] = gp['win'].rolling(10).sum()
    gp['loss_last_10'] = gp['loss'].rolling(10).sum()

    gp['win_loss_ratio_last_10'] = np.where(gp['loss_last_10']==0,np.nan,gp['win_last_10']/gp['loss_last_10'])

    gp['win_perc'] = gp['win'].cumsum()/(gp['loss'].cumsum()+gp['win'].cumsum())
    gp['win_perc_last_10'] = gp['win_last_10']/(gp['win_last_10']+gp['loss_last_10'])


    gp['player_id'] = p_id
    gp = gp.rename(columns={'id':'match_id','tourney_date':'match_date'})

    kpis = pd.concat([gp,kpis])


print(f'ini {kpis.shape[0]}')
kpis = kpis.dropna()
print(f'fim {kpis.shape[0]}')

# join kpis with model_basis
model = model_basis.drop(columns=['winner_id_final']).copy()

kpis_d = kpis.drop(columns=['match_date'])

kpis_p1 = kpis_d.copy()
kpis_p1.columns = ["p1_" + k for k in kpis_p1.columns]

kpis_p2 = kpis_d.copy()
kpis_p2.columns = ["p2_" + k for k in kpis_p2.columns]

model = model.merge(kpis_p1,left_on=['p1_id','id'], right_on=['p1_player_id','p1_match_id'],how='left')
model = model.merge(kpis_p2,left_on=['p2_id','id'],right_on=['p2_player_id','p2_match_id'],how='left')

model = model.dropna()

model = model.drop(columns=['p1_player_id','p2_player_id','p1_match_id','p2_match_id','p1_win','p2_win',
                            'p1_loss','p2_loss','p1_win_last_10','p2_win_last_10','p1_loss_last_10','p2_loss_last_10'])

ini 128018
fim 117990


# Loading Needed Functions

In [389]:
class TennisPredModel():

    def __init__(self,dataset):
        self.dataset = dataset
    
    def split_model(self,train_size_val=0.8,random_state_val=10):
        match_to_predict = pd.DataFrame(self.dataset.iloc[-1]).T
        matches_ready = self.dataset.iloc[0:-1]

        def create_feature_target_var(df):
            # Create feature variable
            X = df.drop('result', axis=1)
            # Create target variable
            y = df['result']
            return X,y
        
        X_to_predict,y_to_predict = create_feature_target_var(match_to_predict)
        X,y = create_feature_target_var(matches_ready)

        # Create training and testing sets
        X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=train_size_val,random_state=random_state_val)

        return X, y, X_train, X_test, y_train, y_test, X_to_predict, y_to_predict


    def hyperparameter_tuning(self,name,X_train,y_train):
        if name=='LogisticRegression':
            # Logistic Regression Classifier
            param_grid = {'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000] }
            model = LogisticRegression()
            log_cv = GridSearchCV(model, param_grid, cv=5)
            log_cv.fit(X_train, y_train)
            model = LogisticRegression(log_cv.best_params_)
            print(f'model {model}: {log_cv.best_params_}')

        elif name =='KNN':
            # KNN - Neighrest Neighbor 
            param_grid = {'n_neighbors': np.arange(1,50),
                          'algorithm':('auto','ball_tree','kd_tree','brute')}
            knn = KNeighborsClassifier()
            knn_cv = GridSearchCV(knn, param_grid, cv=5)
            knn_cv.fit(X_train, y_train)
            model = KNeighborsClassifier(knn_cv.best_params_)
            print(f'model {model}: {knn_cv.best_params_}')
        
        elif name == 'RandomForestClassifier':
            param_grid = {
                'n_estimators': [25,50],
                'max_features': ['sqrt','log2'],
                'max_depth': [3,6,9],
                'max_leaf_nodes': [3,6,9],
            }

            rfc = RandomForestClassifier()
            cv = GridSearchCV(rfc, param_grid, cv = 5)
            cv.fit(X_train, y_train)
            model = RandomForestClassifier(**cv.best_params_)
            print(f'model {model}: {cv.best_params_}')

        elif name == 'DecisionTreeClassifier':
            param_grid = {
                'criterion': ['gini','entropy'],
                'splitter': ['best','random'],
                'max_depth': [50,60,70,80],
            }

            dtc = DecisionTreeClassifier()
            cv = GridSearchCV(dtc, param_grid, cv = 5)
            cv.fit(X_train, y_train)
            model = DecisionTreeClassifier(**cv.best_params_)
            print(f'model {model}: {cv.best_params_}')
    
    def predictive_model(self,model,X_train,y_train,X_test,y_test):
        # Fit to the training data
        model.fit(X_train,y_train)

        # Compute accuracy
        accuracy = model.score(X_test,y_test)
        # print(f'Accuracy: {accuracy:.0%}')

        # Predict the labels of the test set
        y_pred = model.predict(X_test)

        precision = precision_score(y_test,y_pred)
        recall = recall_score(y_test,y_pred)
        # print(f'Precision: {precision:.0%}')
        # print(f'Recall: {recall:.0%}')

        # Generate the probabilities
        y_pred_prob = model.predict_proba(X_test)[:, 1]

        auc = roc_auc_score(y_test, y_pred_prob)
        f1_score_val = f1_score(y_test,y_pred)
        # print(f'AUC: {auc:.0%}')
        # print(f'F1 Score: {f1_score_val:.0%}')

        # # Calculate the roc metrics
        # fpr, tpr, thresholds = roc_curve(y_test, y_pred_prob)

        # # Plot the ROC curve
        # plt.plot(fpr,tpr)

        # # Add labels and diagonal line
        # plt.xlabel("False Positive Rate")
        # plt.ylabel("True Positive Rate")
        # plt.plot([0, 1], [0, 1], "k--")
        # plt.show()

        return [accuracy,precision,recall,auc,f1_score_val], y_pred

    def pick_best_model(self,X_train,y_train,X_test,y_test):
        #LogisticRegression, KNN não fazem fit
        classifiers = [RandomForestClassifier(),DecisionTreeClassifier()]
        classifiers_names = ['RandomForestClassifier','DecisionTreeClassifier']

        # classifiers = classifiers[2:3]
        # classifiers_names = classifiers_names[2:3]
        dict_classifiers = dict(zip(classifiers_names, classifiers))
        
        results = {}
        predictions = {}
        best_score = 0
        best_model = ""
        for name, clf in dict_classifiers.items():
            print(f'Predicting using {name}')
            self.hyperparameter_tuning(name, X_train, y_train)
            scores, y_pred = self.predictive_model(clf, X_train, y_train, X_test, y_test)
            
            results[name] = scores
            predictions[name] = y_pred
            preci = scores[1]
            recall = scores[2]
            score = preci * recall
            print(f'{name} score is {score:.4f} | Precision {preci:.2f} | Recall {recall:.2f}')
            if score >= best_score:
                best_score = score
                best_model = name
            print(f'Current Best model {best_model}\n')

        model_selected = dict_classifiers[name]
        print(f'Model Selected: {best_model} with score {best_score:.4f}\n')
        
        return model_selected,best_model,preci,recall

    def predictive_model_apply(self,model,X,y,X_to_predict):
        # Fit to the training data

        model.fit(X,y)
        self.model = model

        # Predict the labels of the test set
        y_pred = model.predict(X_to_predict)

        prob = model.predict_proba(X_to_predict)[:][0]
        feat_imp =  model.feature_importances_

        # Sort the feature importances from greatest to least using the sorted indices
        sorted_indices = feat_imp.argsort()[::-1]
        sorted_feature_names = list(self.dataset.columns[sorted_indices])
        sorted_importances = list(feat_imp[sorted_indices])
        feat_imp_dict = dict(zip(sorted_feature_names,sorted_importances))

        return y_pred,prob,feat_imp_dict

In [401]:
def prep_model(model,p1,p2,tournament):

    model_date = model[model['tourney_date']<=tournament['tourney_date']].copy()

    # add data for predicting match        
    all_feats = ['id',
                    'tourney_name','tourney_date','round_level','surface','tourney_points','best_of','draw_size',
                    'p1_id', 'p1_ioc', 'p1_rank', 'p1_age', 'p1_hand', 'p1_ht', 'p1_win_loss_ratio_last_10',
                    'p1_win_loss_ratio_start','p1_win_perc','p1_win_perc_last_10',
                    'p2_id','p2_ioc', 'p2_rank', 'p2_age', 'p2_hand', 'p2_ht','p2_win_loss_ratio_last_10',
                    'p2_win_loss_ratio_start','p2_win_perc','p2_win_perc_last_10']

    match_id = model_date['id'].max()+1
    list_input = [match_id,
                    tournament['tourney_name'], tournament['tourney_date'], tournament['round_level'],
                    tournament['surface'],tournament['tourney_points'], tournament['best_of'],tournament['draw_size'],
                    p1['player_id'],p1['ioc'],p1['last_rank'],p1['age'],p1['hand'],p1['height'],
                    p1['win_loss_ratio_last_10'],p1['win_loss_ratio_start'],p1['win_perc'],p1['win_perc_last_10'],
                    p2['player_id'],p2['ioc'],p2['last_rank'],p2['age'],p2['hand'],p2['height'],
                    p2['win_loss_ratio_last_10'],p2['win_loss_ratio_start'],p2['win_perc'],p2['win_perc_last_10']
                    ]


    input_match = pd.DataFrame([list_input],columns=all_feats)

    input_match['p1_hand'] = input_match['p1_hand'].map(hand_map).fillna(-1)
    input_match['p2_hand'] = input_match['p2_hand'].map(hand_map).fillna(-1)
    input_match['surface'] = input_match['surface'].map(surface_map).fillna(-1)
    input_match['p1_ioc'] = input_match['p1_ioc'].map(countries_map).fillna(-1)
    input_match['p2_ioc'] = input_match['p2_ioc'].map(countries_map).fillna(-1)
    input_match['result'] = 0 
    
    model_to_predict = pd.concat([model_date,input_match])

    model_to_predict = model_to_predict
    model_to_predict = model_to_predict.set_index('id')

    model_to_predict['time_since'] = (datetime.now() - pd.to_datetime(model_to_predict['tourney_date'],format='%Y-%m-%d')).dt.days

    cond_1 = model_to_predict['p1_id']==p1['player_id']
    cond_2 = model_to_predict['p2_id']==p2['player_id']

    cond_3 = model_to_predict['p1_id']==p2['player_id']
    cond_4 = model_to_predict['p2_id']==p1['player_id']

    model_to_predict['h2h'] = np.where((cond_1)&(cond_2),1,
                                np.where((cond_3)&(cond_4),1,0))
    
    return model_to_predict

def scale_model(model):
    new_dataset = model.copy()
    new_dataset = new_dataset.drop(columns=['tourney_date'])

    # encoding using one hot encoder
    cols_ohe = ['surface']
    one_hot_encoded_data = pd.get_dummies(new_dataset[cols_ohe], columns = cols_ohe) 
    one_hot_encoded_data = one_hot_encoded_data.astype(int)

    cols_le = ['tourney_name']
    le = LabelEncoder()
    tourn_encoded = le.fit_transform(new_dataset['tourney_name'])
    new_dataset['tourney_name'] = tourn_encoded

    ini = new_dataset.shape[0]
    new_dataset = new_dataset.dropna()
    fim = new_dataset.shape[0]
    # print(f'Dropped: {fim-ini}|{(fim-ini)/ini:.2%}')

    result = new_dataset['result'].copy()
    new_dataset = new_dataset.drop(columns=['result'])
    new_dataset = new_dataset.drop(columns=cols_ohe)
    # new_dataset = new_dataset.drop(columns=cols_le)

    cols_to_scale = new_dataset.columns

    scl = StandardScaler()
    matches_scaled = scl.fit_transform(new_dataset)


    df_scaled = pd.DataFrame(matches_scaled,columns=cols_to_scale,index=new_dataset.index)

    model_scaled = pd.concat([result,one_hot_encoded_data,df_scaled],axis=1)

    ini = model_scaled.shape[0]
    model_scaled = model_scaled.dropna()
    fim = model_scaled.shape[0]
    
    return model_scaled


def main_model(model,name_a,name_b,tournament_name,tournament_date,round_level,df_rankings,kpis,tourns_dict):
    pa = players_dict[name_a]
    pb = players_dict[name_b]

    tournament = tourns_dict[tournament_name]
    tournament['tourney_date'] = tournament_date
    tournament['round_level'] = round_level
    tournament['tourney_name'] = tournament_name

    rankings_date = df_rankings[df_rankings['ranking_date']<=tournament_date].copy()

    last_rank_date = rankings_date.groupby(['player'])['ranking_date'].max().reset_index()
    last_rank = rankings_date.merge(last_rank_date,on=['player','ranking_date'],how='inner')

    pa['last_rank'] = last_rank[last_rank['player']==pa['player_id']]['rank'].iloc[0]
    pb['last_rank'] = last_rank[last_rank['player']==pb['player_id']]['rank'].iloc[0]

    if pa['last_rank']<pb['last_rank']:
        p1 = pa
        p2 = pb
    else:
        p2 = pa
        p1 = pb

    p1['age'] = int((tournament_date-p1['dob']).days)//365
    p2['age'] = int((tournament_date-p2['dob']).days)//365

    p1_kpis = kpis[kpis['player_id']==p1['player_id']].sort_values(by='match_id')
    p1_kpis = p1_kpis.iloc[-1][['win_loss_ratio_start','win_loss_ratio_last_10','win_perc','win_perc_last_10']]
    p1.update(p1_kpis.to_dict())

    p2_kpis = kpis[kpis['player_id']==p2['player_id']].sort_values(by='match_id')
    p2_kpis = p2_kpis.iloc[-1][['win_loss_ratio_start','win_loss_ratio_last_10','win_perc','win_perc_last_10']]
    p2.update(p2_kpis.to_dict())

    model_to_predict = prep_model(model,p1,p2,tournament)
    model_scaled = scale_model(model_to_predict)

    return p1,p2,model_to_predict, model_scaled

In [572]:
## PLOTTING INFO

import plotly.express as px

def rank_evol(df_rankings,p1,p2,n_years=None):
    ranks = df_rankings[df_rankings['player'].isin([p1['player_id'],p2['player_id']])]

    if n_years == None:
        first_date = ranks.groupby(['player'])['ranking_date'].min().max()
    else:
        start_date = ranks.groupby(['player'])['ranking_date'].max().min()
        first_date = pd.to_datetime(start_date) - pd.DateOffset(years=n_years)
        first_date = str(first_date)

    ranks = ranks[ranks['ranking_date']>=first_date]
    ranks = ranks.sort_values(by='ranking_date')

    names = {p1['player_id']:p1['name'],p2['player_id']:p2['name']}

    ranks['player_name'] = ranks['player'].map(names)

    color_player_map = {p1['name']:'seagreen',p2['name']:'tomato'}

    plot = px.line(data_frame=ranks,x='ranking_date',y='rank',color='player_name',title='Rank Evolution',color_discrete_map=color_player_map)
    plot.update_traces(mode="markers+lines", hovertemplate=None)
    plot.update_layout(hovermode="x")
    plot.update_yaxes(autorange="reversed")
    
    return plot


def win_loss_ratio(kpis,p1,p2,period='wl_start',n_years=None):
    kpis_players = kpis[kpis['player_id'].isin([p1['player_id'],p2['player_id']])]

    if n_years == None:
        first_date = kpis_players.groupby(['player_id'])['match_date'].min().max()
    else:
        start_date = kpis_players.groupby(['player_id'])['match_date'].max().min()
        first_date = pd.to_datetime(start_date) - pd.DateOffset(years=n_years)
        first_date = str(first_date)

    kpis_players = kpis_players[kpis_players['match_date']>=first_date]
    kpis_players = kpis_players.sort_values(by='match_date')

    names = {p1['player_id']:p1['name'],p2['player_id']:p2['name']}

    if period =='wl_start':
        kpi_to_use = 'win_loss_ratio_start'
        title_name = 'W/L Ratio Start'
    elif period == 'wl_last_10':
        kpi_to_use = 'win_loss_ratio_last_10'
        title_name = 'W/L Ratio Last 10 Matches'
    elif period == 'perc_start':
        kpi_to_use = 'win_perc'
        title_name = 'Win % Start'
    elif period == 'perc_last_10':
        kpi_to_use = 'win_perc_last_10'
        title_name = 'Win % Last 10 Matches'

    kpis_players['player_name'] = kpis_players['player_id'].map(names)

    color_player_map = {p1['name']:'seagreen',p2['name']:'tomato'}

    plot = px.line(data_frame=kpis_players,x='match_date',y=kpi_to_use,color='player_name',title=title_name,color_discrete_map=color_player_map)
    plot.update_traces(mode="markers+lines", hovertemplate=None)
    plot.update_layout(hovermode="x")
    
    return plot


def tournament_performance(player,matches,n_years=None):
    matches_p = matches[(matches['winner_id']==player['player_id'])|(matches['loser_id']==player['player_id'])]
    matches_p['win'] = np.where(matches_p['winner_id']==player['player_id'],1,0)

    if n_years != None:
        first_date = pd.to_datetime(matches['tourney_date'].max()) - pd.DateOffset(years=n_years)
        first_date = str(first_date)
        matches_p = matches_p[matches_p['tourney_date']>=first_date]

    matches_p['tourney_year'] = matches_p['tourney_date'].dt.year
    idx = matches_p.groupby(['tourney_year','tourney_name','tourney_points','surface'])['round_level'].idxmin()
    last_rounds = matches_p.loc[idx].sort_values(by='tourney_date')

    zz = {v: k for k, v in round_map.items()}
    last_rounds['round'] = last_rounds['round_level'].map(zz)

    last_rounds['last_round'] = np.where((last_rounds['round_level']==0)&(last_rounds['win']==1),'🏆Winner',last_rounds['round'])
    last_rounds['last_round'] = np.where((last_rounds['round_level']==0)&(last_rounds['win']!=1),'🥈Runner-Up',last_rounds['last_round'])


    last_rounds['round_level'] = 7 - last_rounds['round_level']
    last_rounds['tourney_date'] = last_rounds['tourney_date'].dt.date

    z = {v: k for k, v in surface_map.items()}
    last_rounds['surface'] = last_rounds['surface'].map(z)

    color_mapping = {'Clay':'#FFA15A','Grass':'#00CC96','Hard':'#636EFA','Carpet':'#AB63FA'}

    plot = px.bar(data_frame=last_rounds,x='tourney_date',y='round_level',color='surface',
                    title=f"Tournament Performance for {player['name']}",text='last_round',\
                    hover_data={'tourney_name':True,'tourney_points':True},color_discrete_map=color_mapping)

    plot.update_xaxes(type='category')
    plot.update_xaxes(categoryorder='category ascending') 
    plot.add_hline(y=7, line_width=3, line_dash="dash", line_color="gold")
    
    return plot,matches_p


def get_final_result(p1,p2,result,probability,feat_importance,plot_show=True):
    p1_rank = p1['last_rank']
    p2_rank = p2['last_rank']

    if result[0] == 1:
        # ganhou quem tem ranking mais alto
        if p1_rank < p2_rank:
            winner = p1
        else:
            winner = p2
    else:
        # ganhou quem tem ranking mais baixo
        if p1_rank < p2_rank:
            winner = p2
        else:
            winner = p1

    winner_name = winner['name']
    if winner == p1:
        loser_name = p2['name']
    else:
        loser_name = p1['name']


    probability_result = probability[int(result[0])]

    text = f'WINNER: {winner_name} | LOSER: {loser_name} | Probability: {probability_result:.2%}!'
    print(text)

    if plot_show==True:

        # Create a bar plot of the feature importances
        vals = list(feat_importance.values())[0:10]
        features = list(feat_importance.keys())[0:10]

        z = px.bar(x=vals,y=features,text=vals,title="Feature Importances",text_auto=".1%")
    

        a,matches_p1 = tournament_performance(p1,matches,2)
        b,matches_p2 = tournament_performance(p2,matches,2)
        c = win_loss_ratio(kpis,p1,p2,'wl_start',2)
        d = win_loss_ratio(kpis,p1,p2,'wl_last_10',2)
        e = win_loss_ratio(kpis,p1,p2,'perc_start',2)
        f = win_loss_ratio(kpis,p1,p2,'perc_last_10',2)
        g = rank_evol(df_rankings,p1,p2,2)

        graphs = []
        graphs = [z,a,b,c,d,e,f,g]

        for p in graphs:
            p.show()

    return winner_name,loser_name,probability_result

# Predicting Model

In [574]:
get_last_ranking = df_rankings[df_rankings['ranking_date']==df_rankings['ranking_date'].max()]
get_top_50 = get_last_ranking[get_last_ranking['rank']<=50]
get_top_50 = get_top_50.merge(df_players,left_on='player',right_on='player_id')
get_top_50

Unnamed: 0,ranking_date,rank,player,points,id_x,player_id,name_first,name_last,hand,dob,ioc,height,wikidata_id,id_y,name
0,2023-12-25,1,104925,11245.0,1246507,104925,Novak,Djokovic,R,1987-05-22,SRB,188.0,Q5812,4919,Novak Djokovic
1,2023-12-25,2,207989,8855.0,1246508,207989,Carlos,Alcaraz,R,2003-05-05,ESP,185.0,Q85518537,60092,Carlos Alcaraz
2,2023-12-25,3,106421,7600.0,1246509,106421,Daniil,Medvedev,R,1996-02-11,RUS,198.0,Q21622022,6406,Daniil Medvedev
3,2023-12-25,4,206173,6490.0,1246510,206173,Jannik,Sinner,R,2001-08-16,ITA,188.0,Q54812588,58276,Jannik Sinner
4,2023-12-25,5,126094,4805.0,1246511,126094,Andrey,Rublev,R,1997-10-20,RUS,188.0,Q17373391,26078,Andrey Rublev
5,2023-12-25,6,126774,4235.0,1246512,126774,Stefanos,Tsitsipas,R,1998-08-12,GRE,193.0,Q24450982,26758,Stefanos Tsitsipas
6,2023-12-25,7,100644,3985.0,1246513,100644,Alexander,Zverev,R,1997-04-20,GER,198.0,Q13990552,643,Alexander Zverev
7,2023-12-25,8,208029,3660.0,1246514,208029,Holger,Rune,R,2003-04-29,DEN,185.0,,60132,Holger Rune
8,2023-12-25,9,128034,3245.0,1246515,128034,Hubert,Hurkacz,R,1997-02-11,POL,196.0,Q18927958,28018,Hubert Hurkacz
9,2023-12-25,10,126203,3100.0,1246516,126203,Taylor,Fritz,R,1997-10-28,USA,193.0,Q17660516,26187,Taylor Fritz


### Define Inputs Here!!!

In [575]:
# Define inputs
name_a = 'Carlos Alcaraz'
name_b = 'Felix Auger Aliassime'
tournament_name = 'Roland Garros'
tournament_date = pd.to_datetime('2024-04-05',format='%Y-%m-%d')
round_level = 0

In [576]:
p1,p2,model_to_predict,model_scaled = main_model(model,name_a,name_b,tournament_name,tournament_date,round_level,df_rankings,kpis,tourns_dict)

tc = TennisPredModel(model_scaled)

X, y, X_train, X_test, y_train, y_test, X_to_predict, y_to_predict = tc.split_model()

print(f'Data model and features ready')
# model_selected,model_name,preci,recall = tc.pick_best_model(X_train, y_train, X_test, y_test)
params = {'max_depth': 6, 'max_features': 'sqrt', 'max_leaf_nodes': 9, 'n_estimators': 25}
# params = {'max_depth': 9, 'max_features': 'sqrt', 'max_leaf_nodes': 9, 'n_estimators': 25}
model_selected = RandomForestClassifier(**params)
result, probability, feat_importance = tc.predictive_model_apply(model_selected,X,y,X_to_predict)
get_final_result(p1,p2,result,probability,feat_importance,True)

Data model and features ready
WINNER: Felix Auger Aliassime | LOSER: Carlos Alcaraz | Probability: 54.54%!


('Felix Auger Aliassime', 'Carlos Alcaraz', 0.5453643509620061)

In [632]:
# Define inputs
name_a = 'Casper Ruud'
tournament_name = 'Estoril'
tournament_date = pd.to_datetime('2024-04-05',format='%Y-%m-%d')
round_level = 0

top_50_names = get_top_50[['name','rank']].drop_duplicates()

rank_a = top_50_names[top_50_names['name']==name_a]['rank'].iloc[0]

results = []
for i,row in top_50_names.iterrows():
    name_b = row['name']
    rank_b = row['rank']

    if name_b != name_a:
        p1,p2,model_to_predict,model_scaled = main_model(model,name_a,name_b,tournament_name,tournament_date,round_level,df_rankings,kpis,tourns_dict)
        
        tc = TennisPredModel(model_scaled)

        X, y, X_train, X_test, y_train, y_test, X_to_predict, y_to_predict = tc.split_model()

        print(f'Data model and features ready')
        # model_selected,model_name,preci,recall = tc.pick_best_model(X_train, y_train, X_test, y_test)
        params = {'max_depth': 6, 'max_features': 'sqrt', 'max_leaf_nodes': 9, 'n_estimators': 25}
        # params = {'max_depth': 9, 'max_features': 'sqrt', 'max_leaf_nodes': 9, 'n_estimators': 25}
        model_selected = RandomForestClassifier(**params)
        result, probability, feat_importance = tc.predictive_model_apply(model_selected,X,y,X_to_predict)
        winner_name,loser_name,probability_result = get_final_result(p1,p2,result,probability,feat_importance,False)

        if name_a == winner_name:
            prob = probability_result
            win_a = 1
        else:
            prob = 1-probability_result
            win_a = 0

        results.append([name_a,rank_a,name_b,rank_b,win_a,probability_result])

Data model and features ready
WINNER: Novak Djokovic | LOSER: Casper Ruud | Probability: 74.93%!
Data model and features ready
WINNER: Carlos Alcaraz | LOSER: Casper Ruud | Probability: 61.52%!
Data model and features ready
WINNER: Daniil Medvedev | LOSER: Casper Ruud | Probability: 74.23%!
Data model and features ready
WINNER: Jannik Sinner | LOSER: Casper Ruud | Probability: 75.55%!
Data model and features ready
WINNER: Andrey Rublev | LOSER: Casper Ruud | Probability: 77.74%!
Data model and features ready
WINNER: Stefanos Tsitsipas | LOSER: Casper Ruud | Probability: 76.15%!
Data model and features ready
WINNER: Alexander Zverev | LOSER: Casper Ruud | Probability: 61.42%!
Data model and features ready
WINNER: Holger Rune | LOSER: Casper Ruud | Probability: 51.93%!
Data model and features ready
WINNER: Hubert Hurkacz | LOSER: Casper Ruud | Probability: 67.04%!
Data model and features ready
WINNER: Casper Ruud | LOSER: Taylor Fritz | Probability: 58.13%!
Data model and features ready


In [633]:
df_res = pd.DataFrame(results,columns=['Name_P','Rank_P','Name_O','Rank_O','Win_for_P','Probability'])
df_res['color'] = np.where(df_res['Win_for_P']==0,-1,1)*df_res['Probability']


df_res['Type'] = np.where((df_res['Win_for_P']==1)&(df_res['Probability']>0.7),"Easy_Win",
                          np.where((df_res['Win_for_P']==1)&(df_res['Probability']<=0.7),"Hard_Win","-"))
df_res['Type'] = np.where((df_res['Win_for_P']==0)&(df_res['Probability']>0.7),"Easy_Loss",
                          np.where((df_res['Win_for_P']==0)&(df_res['Probability']<=0.7),"Hard_Loss",df_res['Type']))


px.bar(data_frame=df_res,x='Name_O',y='Probability',text = 'Probability',title="Feature Importances",text_auto=".1%",color='Type')

In [634]:
px.scatter(data_frame=df_res,x='Probability',y='Rank_O',color = 'Type',size='Rank_P',hover_name='Name_O',text='Name_O')
