# Tennis pre-match prediction

## Test with a first dataset generate by a postegresql database and 3 tennis statistiques sources

In [10]:
# To support both python 2 and python 3
from __future__ import division, print_function, unicode_literals

# Common imports
import numpy as np
import pandas as pd
import os

# to make this notebook's output stable across runs
np.random.seed(42)

# To plot pretty figures
#%matplotlib inline
#import matplotlib
#import matplotlib.pyplot as plt
#plt.rcParams['axes.labelsize'] = 14
#plt.rcParams['xtick.labelsize'] = 12
#plt.rcParams['ytick.labelsize'] = 12

# Where to save the figures
PROJECT_ROOT_DIR = "."

def save_fig(fig_id, tight_layout=True):
    path = os.path.join(PROJECT_ROOT_DIR, "images", fig_id + ".png")
    print("Saving figure", fig_id)
    if tight_layout:
        plt.tight_layout()
    plt.savefig(path, format='png', dpi=300)

### get data

##### CSV version

In [11]:
PATH = "dataset"
NAME = "dataset_train_final.csv"
PRED = "dataset_pred_final.csv"

def load_dataset_csv(path=PATH, name=NAME):
    csv_path = os.path.join(path,  name)
    return pd.read_csv(csv_path)

def load_predictset_csv(path=PATH, name=PRED):
    csv_path = os.path.join(path, name)
    return pd.read_csv(csv_path)

##### SQL version

In [12]:
from sqlalchemy import create_engine
import psycopg2 as pg
import requests as req

URL = 'postgresql://dataset:dataset1747@35.204.62.91:5432/dataset-dump1'
engine = create_engine(URL)

def load_dataset_sql():
    URL = "https://raw.githubusercontent.com/panaC/tennis-dataset/master/app/dataset/extract_dataset.sql"
    with engine.connect() as conn, conn.begin():
        return pd.read_sql(req.get(URL).text, conn)

def load_predictset_sql():
    URL = "https://raw.githubusercontent.com/panaC/tennis-dataset/master/app/dataset/extract_predset.sql"
    with engine.connect() as conn, conn.begin():
        return pd.read_sql(req.get(URL).text, conn)


In [13]:
dataset = load_dataset_sql()
dataset.head()

Unnamed: 0,id,url_match_flash,url_match_atp,url_tour,name_tour_flash,name_tour_atp,name_tour_csv,round_flash,round_csv,qualif,...,return_rating_20years,first_serve_return_point_won_20years,bp_converted_20years,return_games_played_20years,service_point_won_20years,return_point_won_20years,total_point_won_20years,winner_loser_match_win,h2h,minutes_loser_winner
0,31497,https://www.flashscore.com/match/A5WZiVlj,/en/scores/archive/madrid/1536/2002/results,https://www.flashscore.com/tennis/atp-singles/...,Madrid,ATP Masters Series Madrid,Madrid,Semi-finals\n,SF,False,...,-14.370963,-3.861322,-0.921584,-0.439661,-3.069863,-3.63398,-2.877679,2.0,0.0,-0.276389
1,8812,https://www.flashscore.com/match/IcCrEkbs,/en/scores/archive/bucharest/773/2012/results,https://www.flashscore.com/tennis/atp-singles/...,Bucharest,Bucharest,Bucharest,1/16-finals\n,R32,False,...,23.070261,1.648693,4.137255,-0.348039,-6.426471,5.169935,-0.624183,0.0,1.0,-0.074712
2,53036,https://www.flashscore.com/match/GlOoStQG,/en/scores/archive/valencia/573/2003/results,https://www.flashscore.com/tennis/atp-singles/...,Valencia,Valencia,Valencia,1/16-finals\n,R32,False,...,-2.654034,-3.187368,3.777288,2.050648,6.983664,-1.720767,2.407857,1.0,0.0,-0.833793
3,12508,https://www.flashscore.com/match/xtx8SEe6,/en/scores/archive/cincinnati/422/2003/results,https://www.flashscore.com/tennis/atp-singles/...,Cincinnati,ATP Masters Series Cincinnati,Cincinnati,1/32-finals\n,R64,False,...,-17.435662,-1.833359,-5.655868,0.719789,-1.955712,-2.947333,-2.154163,1.0,0.0,0.86602
4,47243,https://www.flashscore.com/match/Gt8mGhDk,/en/scores/archive/stuttgart/321/2005/results,https://www.flashscore.com/tennis/atp-singles/...,Stuttgart,Stuttgart,Stuttgart,1/8-finals\n,R16,False,...,-59.924116,-12.098007,-12.726353,1.500216,-0.895951,-12.855204,-6.047258,0.0,1.0,0.826643


In [14]:
predset = load_predictset_sql()
predset.head()

Unnamed: 0,id,url_match_flash,url_match_atp,url_tour,name_tour_flash,name_tour_atp,name_tour_csv,round_flash,round_csv,qualif,...,return_rating_20years,first_serve_return_point_won_20years,bp_converted_20years,return_games_played_20years,service_point_won_20years,return_point_won_20years,total_point_won_20years,winner_loser_match_win,h2h,minutes_loser_winner
0,21740,https://www.flashscore.com/match/b5g7mYRl,/en/scores/archive/halle/500/2017/results,https://www.flashscore.com/tennis/atp-singles/...,Halle,Halle,Halle,1/8-finals\n,R16,False,...,24.842458,4.115788,3.518021,-0.672582,0.603285,5.072365,2.703841,1.0,0.0,0.106283
1,20836,https://www.flashscore.com/match/zVGlPvs3,/en/scores/archive/geneva/322/2018/results,https://www.flashscore.com/tennis/atp-singles/...,Geneva,Banque Eric Sturdza Geneva Open,Geneva,Final\n,F,False,...,0.49359,-2.628205,5.057692,0.378205,4.775641,-1.115385,1.679487,2.0,-0.333333,0.179946
2,20837,https://www.flashscore.com/match/j3IgqvIj,/en/scores/archive/geneva/322/2018/results,https://www.flashscore.com/tennis/atp-singles/...,Geneva,Banque Eric Sturdza Geneva Open,Geneva,Semi-finals\n,SF,False,...,-8.546491,-1.388158,1.012281,1.748246,4.722807,-2.367982,1.038158,0.0,1.0,-0.047688
3,20838,https://www.flashscore.com/match/SSTreMbC,/en/scores/archive/geneva/322/2018/results,https://www.flashscore.com/tennis/atp-singles/...,Geneva,Banque Eric Sturdza Geneva Open,Geneva,Semi-finals\n,SF,False,...,9.428644,1.402381,1.028337,-0.094823,-4.616919,2.111454,-1.201479,1.0,0.0,-0.111542
4,20839,https://www.flashscore.com/match/hAsqo4ES,/en/scores/archive/geneva/322/2018/results,https://www.flashscore.com/tennis/atp-singles/...,Geneva,Banque Eric Sturdza Geneva Open,Geneva,Quarter-finals\n,QF,False,...,1.731312,-1.68935,3.800157,0.816122,7.571281,-0.217313,3.489839,2.0,-0.333333,


In [15]:
predset.shape

(3761, 91)

In [16]:
dataset.shape

(34148, 91)

In [17]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 34148 entries, 0 to 34147
Data columns (total 91 columns):
id                                      34148 non-null int64
url_match_flash                         34148 non-null object
url_match_atp                           34148 non-null object
url_tour                                34148 non-null object
name_tour_flash                         34148 non-null object
name_tour_atp                           34148 non-null object
name_tour_csv                           34148 non-null object
round_flash                             34148 non-null object
round_csv                               34148 non-null object
qualif                                  34148 non-null bool
door                                    34148 non-null object
surface                                 34148 non-null object
prize                                   34148 non-null int64
year_atp                                34148 non-null object
year_flash                 

In [18]:
# to make this notebook's output identical at every run
np.random.seed(42)

### split the training set and the test set into 2 batchs with the hash id

In [19]:
import hashlib

def test_set_check(identifier, test_ratio, hash=hashlib.md5):
    return bytearray(hash(np.int64(identifier)).digest())[-1] < 256 * test_ratio

def split_train_test_by_id(data, test_ratio, id_column):
    ids = data[id_column]
    in_test_set = ids.apply(lambda id_: test_set_check(id_, test_ratio))
    return data.loc[~in_test_set], data.loc[in_test_set]

In [20]:
train_set, test_set = split_train_test_by_id(dataset, 0.2, "id")

In [21]:
print(train_set.shape)
train_set.head()

(27148, 91)


Unnamed: 0,id,url_match_flash,url_match_atp,url_tour,name_tour_flash,name_tour_atp,name_tour_csv,round_flash,round_csv,qualif,...,return_rating_20years,first_serve_return_point_won_20years,bp_converted_20years,return_games_played_20years,service_point_won_20years,return_point_won_20years,total_point_won_20years,winner_loser_match_win,h2h,minutes_loser_winner
0,31497,https://www.flashscore.com/match/A5WZiVlj,/en/scores/archive/madrid/1536/2002/results,https://www.flashscore.com/tennis/atp-singles/...,Madrid,ATP Masters Series Madrid,Madrid,Semi-finals\n,SF,False,...,-14.370963,-3.861322,-0.921584,-0.439661,-3.069863,-3.63398,-2.877679,2.0,0.0,-0.276389
1,8812,https://www.flashscore.com/match/IcCrEkbs,/en/scores/archive/bucharest/773/2012/results,https://www.flashscore.com/tennis/atp-singles/...,Bucharest,Bucharest,Bucharest,1/16-finals\n,R32,False,...,23.070261,1.648693,4.137255,-0.348039,-6.426471,5.169935,-0.624183,0.0,1.0,-0.074712
2,53036,https://www.flashscore.com/match/GlOoStQG,/en/scores/archive/valencia/573/2003/results,https://www.flashscore.com/tennis/atp-singles/...,Valencia,Valencia,Valencia,1/16-finals\n,R32,False,...,-2.654034,-3.187368,3.777288,2.050648,6.983664,-1.720767,2.407857,1.0,0.0,-0.833793
3,12508,https://www.flashscore.com/match/xtx8SEe6,/en/scores/archive/cincinnati/422/2003/results,https://www.flashscore.com/tennis/atp-singles/...,Cincinnati,ATP Masters Series Cincinnati,Cincinnati,1/32-finals\n,R64,False,...,-17.435662,-1.833359,-5.655868,0.719789,-1.955712,-2.947333,-2.154163,1.0,0.0,0.86602
4,47243,https://www.flashscore.com/match/Gt8mGhDk,/en/scores/archive/stuttgart/321/2005/results,https://www.flashscore.com/tennis/atp-singles/...,Stuttgart,Stuttgart,Stuttgart,1/8-finals\n,R16,False,...,-59.924116,-12.098007,-12.726353,1.500216,-0.895951,-12.855204,-6.047258,0.0,1.0,0.826643


In [22]:
print(test_set.shape)
test_set.head()

(7000, 91)


Unnamed: 0,id,url_match_flash,url_match_atp,url_tour,name_tour_flash,name_tour_atp,name_tour_csv,round_flash,round_csv,qualif,...,return_rating_20years,first_serve_return_point_won_20years,bp_converted_20years,return_games_played_20years,service_point_won_20years,return_point_won_20years,total_point_won_20years,winner_loser_match_win,h2h,minutes_loser_winner
8,16168,https://www.flashscore.com/match/SCeQru63,/en/scores/archive/doha/451/2008/results,https://www.flashscore.com/tennis/atp-singles/...,Doha,Doha,Doha,Semi-finals\n,SF,False,...,-17.518324,-3.023477,-3.507222,-0.570255,-2.045622,-3.413699,-2.469572,3.0,0.25,0.316686
14,58303,https://www.flashscore.com/match/jB1JtM64,/en/scores/archive/zagreb/2276/2011/results,https://www.flashscore.com/tennis/atp-singles/...,Zagreb,Zagreb,Zagreb,1/16-finals\n,R32,False,...,-14.760084,-1.646162,-4.271457,0.837199,9.323894,-2.789496,2.928291,2.0,-0.333333,-0.503471
16,6713,https://www.flashscore.com/match/SfK5pURK,/en/scores/archive/bastad/316/2013/results,https://www.flashscore.com/tennis/atp-singles/...,Bastad,Bastad,Bastad,1/16-finals\n,R32,False,...,40.42517,7.717347,6.785374,-1.143197,-2.169388,8.471769,2.895238,0.0,1.0,
20,4251,https://www.flashscore.com/match/AeRkQsug,/en/scores/archive/australian-open/580/2001/re...,https://www.flashscore.com/tennis/atp-singles/...,Australian Open,Australian Open,Australian Open,1/32-finals\n,R64,False,...,-17.3,-1.475,-7.091667,0.483333,-1.505556,-2.863889,-2.025,0.0,1.0,0.149377
23,15056,https://www.flashscore.com/match/vg2Z57xB,/en/scores/archive/delray-beach/499/2015/results,https://www.flashscore.com/tennis/atp-singles/...,Delray Beach,Delray Beach,Delray Beach,Quarter-finals\n,QF,False,...,8.973,2.178111,-1.823889,-1.868222,-6.641889,3.478333,-2.047778,0.0,1.0,0.338504


In [23]:
dataset = train_set.copy()

In [24]:
dataset["draw_size"] = dataset["draw_size"].apply(lambda x: str(x))
dataset["draw_size"] = dataset["draw_size"].astype("object")
index_draw_size = list(dataset["draw_size"].value_counts().index)
dataset["draw_size"].value_counts()

32     9754
128    4758
28     3155
56     3120
48     2549
96     1990
64     1737
8        74
16       11
Name: draw_size, dtype: int64

In [25]:
index_level_tour = list(dataset["level_tour"].value_counts().index)
dataset["level_tour"].value_counts()

A    16240
M     6233
G     4590
F       85
Name: level_tour, dtype: int64

In [26]:
index_surface = list(dataset["surface"].value_counts().index)
dataset["surface"].value_counts()

Hard      15239
Clay       8119
Grass      3231
Carpet      559
Name: surface, dtype: int64

In [27]:
index_door = list(dataset["door"].value_counts().index)
dataset["door"].value_counts()

Outdoor    22498
Indoor      4650
Name: door, dtype: int64

In [28]:
index_round_csv = list(dataset["round_csv"].value_counts().index)
dataset["round_csv"].value_counts()

R32     9529
R16     5100
R64     4937
R128    3039
QF      2552
SF      1293
F        627
RR        71
Name: round_csv, dtype: int64

In [29]:
dataset["year_atp"].value_counts()

2016    1906
2009    1826
2014    1798
2010    1792
2006    1785
2011    1774
2005    1764
2013    1745
2012    1727
2004    1727
2008    1719
2007    1705
2003    1658
2001    1590
2002    1572
2015    1060
Name: year_atp, dtype: int64

### add column result that will be the y axis on the dataset for the prediction

In [30]:
dataset.shape

(27148, 91)

In [31]:
dataset.insert(dataset.shape[1], "result", np.random.randint(2, size=dataset.shape[0]))

In [32]:
dataset["result"].value_counts()

1    13585
0    13563
Name: result, dtype: int64

In [33]:
test_set.insert(test_set.shape[1], "result", np.random.randint(2, size=test_set.shape[0]))

In [34]:
test_set["result"].value_counts()

1    3535
0    3465
Name: result, dtype: int64

### transformer

In [35]:
from sklearn.base import BaseEstimator, TransformerMixin

# A class to select numerical or categorical columns 
# since Scikit-Learn doesn't handle DataFrames yet
class DataFrameSelector(BaseEstimator, TransformerMixin):
    def __init__(self, attribute_names):
        self.attribute_names = attribute_names
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        return X[self.attribute_names]

In [36]:
class reverseStats(BaseEstimator, TransformerMixin):
       
    def fit(self, X, y=None):
        return self
    def transform(self, X, y=None):
        ## while on each row : if result = 1 nothing else opponent each value in each collumn
        def wh(a):
            if (a["result"] == 0):
                a = a.apply(lambda x: x * -1)
            return a
        
        X = X.fillna(0)
        X = X.apply(wh, axis=1)
        X = X.drop("result", axis=1)
        return X

In [37]:
stats_col = ["serve_rating_1year", "aces_1year", "double_faults_1year", "first_serve_1year", "first_serve_point_won_1year",
       "second_serve_point_won_1year", "bp_saved_1year", "service_game_played_1year", "return_rating_1year",
       "first_serve_return_point_won_1year", "bp_converted_1year", "return_games_played_1year", "service_point_won_1year",
       "return_point_won_1year", "total_point_won_1year", "serve_rating_20years",
       "aces_20years", "double_faults_20years", "first_serve_20years", "first_serve_point_won_20years", "second_serve_point_won_20years",
       "bp_saved_20years", "service_game_played_20years", "return_rating_20years", "first_serve_return_point_won_20years",
       "bp_converted_20years", "return_games_played_20years", "service_point_won_20years", "return_point_won_20years",
       "total_point_won_20years", "h2h", "minutes_loser_winner", "result"]
len(stats_col)

33

In [38]:
from sklearn.pipeline import Pipeline

stats_pipeline = Pipeline([
        ("select_stats", DataFrameSelector(stats_col)),
        ("reverse", reverseStats()),
    ])

stats_pipeline.fit_transform(dataset).shape

(27148, 32)

In [39]:
stats_pipeline.fit_transform(dataset)

Unnamed: 0,serve_rating_1year,aces_1year,double_faults_1year,first_serve_1year,first_serve_point_won_1year,second_serve_point_won_1year,bp_saved_1year,service_game_played_1year,return_rating_1year,first_serve_return_point_won_1year,...,service_game_played_20years,return_rating_20years,first_serve_return_point_won_20years,bp_converted_20years,return_games_played_20years,service_point_won_20years,return_point_won_20years,total_point_won_20years,h2h,minutes_loser_winner
0,12.619211,0.216533,0.910997,5.292173,1.937366,2.542619,3.370908,-0.091845,15.568988,4.435193,...,0.268349,14.370963,3.861322,0.921584,0.439661,3.069863,3.633980,2.877679,-0.000000,0.276389
1,-29.350694,-8.222222,-0.055556,0.576389,-9.423611,3.170139,-3.253472,-0.489583,21.857639,1.722222,...,-0.411765,23.070261,1.648693,4.137255,-0.348039,-6.426471,5.169935,-0.624183,1.000000,-0.074712
2,-28.925223,-4.390253,2.524330,0.473437,-8.280506,-3.186012,-7.716369,-1.606324,5.387798,6.242336,...,-2.017434,2.654034,3.187368,-3.777288,-2.050648,-6.983664,1.720767,-2.407857,-0.000000,0.833793
3,2.760000,-1.307415,0.039048,1.938231,1.163401,2.767347,-1.696871,-0.855238,24.072517,2.518912,...,-0.695082,17.435662,1.833359,5.655868,-0.719789,1.955712,2.947333,2.154163,-0.000000,-0.866020
4,3.183059,-6.966392,-2.358852,8.609417,-3.659661,8.769414,-7.880327,-1.177961,60.143971,10.854823,...,-1.181322,59.924116,12.098007,12.726353,-1.500216,0.895951,12.855204,6.047258,-1.000000,-0.826643
5,15.372953,-0.937427,0.627339,4.621053,3.463158,3.182164,5.368567,-1.063596,30.667690,6.365789,...,-1.308291,25.285201,6.110167,6.432567,-1.131968,1.232124,4.869222,2.768531,-0.500000,-0.524411
6,70.226190,7.142857,-1.130952,5.142857,23.380952,13.190476,13.833333,1.797619,68.202381,11.892857,...,0.658095,42.002381,7.269762,14.875952,0.642381,11.011429,6.719048,8.235476,-1.000000,-0.988188
7,0.458225,-0.296050,0.237635,-3.945671,2.145996,2.669318,-1.399729,-0.385119,30.328084,5.997321,...,-0.599471,27.540083,5.663394,5.387453,-0.629917,1.070907,5.971693,2.989335,-0.333333,-0.340478
9,48.040292,0.343161,-2.199782,8.098888,8.354838,8.379346,1.881370,-0.666527,43.985568,7.936815,...,-0.357638,42.849211,7.384862,10.514026,-0.264745,10.867629,7.906587,8.417840,-1.000000,-0.440061
10,-8.989354,-1.820860,-0.403192,-0.158523,-2.315164,-0.375871,-0.249014,-0.573862,-1.164510,0.871784,...,-0.053983,-5.769948,0.070507,-2.054217,0.250268,-1.848756,-0.579861,-1.283191,0.142857,-0.108866


In [40]:
from sklearn.preprocessing import OneHotEncoder, LabelBinarizer
from sklearn.preprocessing import label_binarize

In [41]:
class CustomLabelBinarizer(BaseEstimator, TransformerMixin):
    def __init__(self, classes):
        self.classes = classes
    def fit(self, X, y=None):
        return self
    def transform(self, X, y=None):
        enc = label_binarize(X, classes=self.classes)
        return enc

In [42]:
round_pipeline = Pipeline([
        ("select_cat", DataFrameSelector(["round_csv"])),
        ("cat_encoder", CustomLabelBinarizer(index_round_csv)),
    ])

round_pipeline.fit_transform(dataset)

array([[0, 0, 0, ..., 1, 0, 0],
       [1, 0, 0, ..., 0, 0, 0],
       [1, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [1, 0, 0, ..., 0, 0, 0]])

In [43]:
round_pipeline.fit_transform(dataset).shape

(27148, 8)

In [44]:
surface_pipeline = Pipeline([
        ("select_cat", DataFrameSelector(["surface"])),
        ("cat_encoder", CustomLabelBinarizer(index_surface)),
    ])

surface_pipeline.fit_transform(dataset)

array([[1, 0, 0, 0],
       [0, 1, 0, 0],
       [0, 1, 0, 0],
       ...,
       [1, 0, 0, 0],
       [0, 0, 1, 0],
       [1, 0, 0, 0]])

In [45]:
surface_pipeline.fit_transform(dataset).shape

(27148, 4)

In [46]:
drawSize_pipeline = Pipeline([
        ("select_cat", DataFrameSelector(["draw_size"])),
        ("cat_encoder", CustomLabelBinarizer(index_draw_size)),
    ])

drawSize_pipeline.fit_transform(dataset)

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 1, ..., 0, 0, 0],
       [1, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 1, 0, ..., 0, 0, 0],
       [0, 1, 0, ..., 0, 0, 0],
       [0, 0, 1, ..., 0, 0, 0]])

In [47]:
drawSize_pipeline.fit_transform(dataset).shape

(27148, 9)

In [48]:
levelTour_pipeline = Pipeline([
        ("select_cat", DataFrameSelector(["level_tour"])),
        ("cat_encoder", CustomLabelBinarizer(index_level_tour)),
    ])

levelTour_pipeline.fit_transform(dataset)

array([[0, 1, 0, 0],
       [1, 0, 0, 0],
       [1, 0, 0, 0],
       ...,
       [0, 0, 1, 0],
       [0, 0, 1, 0],
       [1, 0, 0, 0]])

In [49]:
levelTour_pipeline.fit_transform(dataset).shape

(27148, 4)

In [50]:
from sklearn.preprocessing import Imputer

weather_pipeline = Pipeline([
        ("select_cat", DataFrameSelector(["temp_weather", "press_weather", "humid_weather"])),
        ('imputer', Imputer(strategy="mean")),
    ])

weather_pipeline.fit_transform(dataset)



array([[ 45.        ,  27.49      ,  93.        ],
       [ 59.        ,  29.46      , 100.        ],
       [ 64.        ,  29.9       ,  77.        ],
       ...,
       [ 61.        ,  29.77      ,  67.        ],
       [ 71.        ,  29.60130104,  62.        ],
       [ 54.        ,  22.26      ,  82.        ]])

In [51]:
weather_pipeline.fit_transform(dataset).shape

(27148, 3)

In [52]:
from sklearn.preprocessing import Imputer

year_pipeline = Pipeline([
        ("select_cat", DataFrameSelector(["year_atp"])),
        ('imputer', Imputer(strategy="most_frequent")),
    ])

year_pipeline.fit_transform(dataset)



array([[2002.],
       [2012.],
       [2003.],
       ...,
       [2013.],
       [2014.],
       [2014.]])

In [53]:

year_pipeline.fit_transform(dataset).shape

(27148, 1)

In [54]:
class CustomLabelBinarizer(BaseEstimator, TransformerMixin):
    def __init__(self, sparse_output=False):
        self.sparse_output = sparse_output
    def fit(self, X, y=None):
        return self
    def transform(self, X, y=None):
        enc = LabelBinarizer(sparse_output=self.sparse_output)
        return enc.fit_transform(X)

In [55]:
door_pipeline = Pipeline([
        ("select_cat", DataFrameSelector(["door"])),
        ("cat_encoder", CustomLabelBinarizer()),
    ])

door_pipeline.fit_transform(dataset)

array([[0],
       [1],
       [1],
       ...,
       [1],
       [1],
       [1]])

In [56]:
door_pipeline.fit_transform(dataset).shape

(27148, 1)

In [57]:
class fillna(BaseEstimator, TransformerMixin):
    def __init__(self, val):
        self.val = val
        
    def fit(self, X, y=None):
        return self
    def transform(self, X, y=None):        
        return X.fillna(self.val)

In [58]:
uncertainty_pipeline = Pipeline([
        ("select_cat", DataFrameSelector(["uncertainty_1year", "uncertainty_20years"])),
        ("cat_encoder", fillna(10)),
    ])

uncertainty_pipeline.fit_transform(dataset).shape

(27148, 2)

In [59]:
count_pipeline = Pipeline([
        ("select_cat", DataFrameSelector(["count_match_1year", "count_match_20years"])),
        ("cat_encoder", fillna(0)),
    ])

count_pipeline.fit_transform(dataset).shape

(27148, 2)

In [60]:
from sklearn.pipeline import FeatureUnion
preprocess_pipeline = FeatureUnion(transformer_list=[
        ("stats_pipeline", stats_pipeline),
        ("round_pipeline", round_pipeline),
        ("surface_pipeline", surface_pipeline),
        ("drawSize_pipeline", drawSize_pipeline),
        ("levelTour_pipeline", levelTour_pipeline),
        ("weather_pipeline", levelTour_pipeline),
        ("year_pipeline", levelTour_pipeline),
        ("door_pipeline", levelTour_pipeline),
        ("uncertainty_pipeline", levelTour_pipeline),
        ("count_pipeline", count_pipeline),
    ])

In [61]:
X_train = preprocess_pipeline.fit_transform(dataset)

In [62]:
X_train.shape

(27148, 75)

In [63]:
pd.DataFrame(X_train).head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,65,66,67,68,69,70,71,72,73,74
0,12.619211,0.216533,0.910997,5.292173,1.937366,2.542619,3.370908,-0.091845,15.568988,4.435193,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,187.0,232.0
1,-29.350694,-8.222222,-0.055556,0.576389,-9.423611,3.170139,-3.253472,-0.489583,21.857639,1.722222,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,72.0,85.0
2,-28.925223,-4.390253,2.52433,0.473437,-8.280506,-3.186012,-7.716369,-1.606324,5.387798,6.242336,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,75.0,152.0
3,2.76,-1.307415,0.039048,1.938231,1.163401,2.767347,-1.696871,-0.855238,24.072517,2.518912,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,45.0,88.0
4,3.183059,-6.966392,-2.358852,8.609417,-3.659661,8.769414,-7.880327,-1.177961,60.143971,10.854823,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,33.0,55.0


In [64]:
pd.DataFrame(X_train).isnull().any().value_counts()

False    75
dtype: int64

In [65]:
y_train = dataset["result"]
y_train.shape

(27148,)

#### predset preprocess

In [66]:
testset = test_set.copy()

In [67]:
X_test = preprocess_pipeline.fit_transform(testset)
y_test = testset["result"]

### KNN

In [68]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV

param_grid = [{'weights': ["uniform", "distance"], 'n_neighbors': [3, 4, 5]}]

knn_clf = KNeighborsClassifier()
grid_search = GridSearchCV(knn_clf, param_grid, cv=5, verbose=3, n_jobs=-1)
grid_search.fit(X_train, y_train)

Fitting 5 folds for each of 6 candidates, totalling 30 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  30 out of  30 | elapsed:   55.0s finished


GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=None, n_neighbors=5, p=2,
           weights='uniform'),
       fit_params=None, iid='warn', n_jobs=-1,
       param_grid=[{'weights': ['uniform', 'distance'], 'n_neighbors': [3, 4, 5]}],
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=3)

In [69]:
grid_search.best_params_

{'n_neighbors': 5, 'weights': 'distance'}

In [70]:
grid_search.best_score_

0.668078679829085

In [71]:
from sklearn.neighbors import KNeighborsClassifier
knn_clf = KNeighborsClassifier(n_jobs=-1, weights='uniform', n_neighbors=3)
knn_clf.fit(X_train, y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=-1, n_neighbors=3, p=2,
           weights='uniform')

In [72]:
y_knn_pred = knn_clf.predict(X_train)

from sklearn.metrics import accuracy_score
accuracy_score(y_train, y_knn_pred)

0.8200604096066009

### RF

In [73]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score

forest_clf = RandomForestClassifier(random_state=42)
forest_clf.fit(X_train, y_train)
forest_scores = cross_val_score(forest_clf, X_train, y_train, cv=10, n_jobs=-1)
forest_scores.mean()



0.8389192665727163

In [74]:
forest_clf_pred = forest_clf.predict(X_test)

from sklearn.metrics import accuracy_score
accuracy_score(y_test, forest_clf_pred)

0.8381428571428572

### saved the model on the disk

In [75]:
import pickle

PATH = 'models/'

def save_model(obj, name, path=PATH):
    file = open(path + name, 'wb')
    pickle.dump(obj, file)
    file.close()

save_model(forest_clf, "forest_clf")

### GridSearch

In [76]:
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint

param_distribs = {
        'n_estimators': randint(low=100, high=500),
        'max_features': randint(low=5, high=15),
    }

forest_reg = RandomForestClassifier(random_state=42)
rnd_search = RandomizedSearchCV(forest_reg, param_distributions=param_distribs,
                                n_iter=10, cv=5, scoring='neg_mean_squared_error', random_state=42, n_jobs=-1)
rnd_search.fit(X_train, y_train) 

RandomizedSearchCV(cv=5, error_score='raise-deprecating',
          estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators='warn', n_jobs=None,
            oob_score=False, random_state=42, verbose=0, warm_start=False),
          fit_params=None, iid='warn', n_iter=10, n_jobs=-1,
          param_distributions={'n_estimators': <scipy.stats._distn_infrastructure.rv_frozen object at 0x122c7d5c0>, 'max_features': <scipy.stats._distn_infrastructure.rv_frozen object at 0x12260ff28>},
          pre_dispatch='2*n_jobs', random_state=42, refit=True,
          return_train_score='warn', scoring='neg_mean_squared_error',
          verbose=0)

In [77]:
rnd_search.best_params_

{'max_features': 14, 'n_estimators': 314}

In [78]:
feature_importances = rnd_search.best_estimator_.feature_importances_
#sorted(zip(feature_importan)ces, list(dataset.columns.values)), reverse=True))
feature_importances[32:] ## useless

array([1.77953842e-03, 1.59713779e-03, 1.29400854e-03, 9.07995123e-04,
       1.32166321e-03, 1.23334463e-03, 9.89843435e-04, 7.76490713e-05,
       1.62099032e-03, 1.69373621e-03, 1.22516174e-03, 5.13609628e-04,
       1.50651218e-03, 7.07389588e-04, 1.17476465e-03, 1.24828791e-03,
       1.32694234e-03, 9.01097649e-04, 1.00598837e-03, 5.05809474e-05,
       6.22847161e-05, 8.38284674e-04, 8.28803383e-04, 7.12704204e-04,
       7.86824777e-05, 8.18380024e-04, 8.11385293e-04, 6.67458501e-04,
       8.19554511e-05, 8.06513016e-04, 7.66442569e-04, 6.84228257e-04,
       7.41008719e-05, 8.91379656e-04, 8.04493730e-04, 6.97550156e-04,
       6.31106854e-05, 8.48612316e-04, 8.01479117e-04, 6.79138612e-04,
       7.69122368e-05, 3.14189312e-02, 3.75834138e-02])

In [79]:
from sklearn.metrics import accuracy_score

rnd_pred = rnd_search.predict(X_train)

accuracy_score(y_train, rnd_pred)

1.0

In [80]:
save_model(rnd_search, "rnd_search")

### predset

match beetween years 2017 and 2018

In [81]:
valset = predset.copy()

valset.insert(valset.shape[1], "result", np.random.randint(2, size=valset.shape[0]))
valset["result"].value_counts()

X_val = preprocess_pipeline.fit_transform(valset)
y_val = valset["result"]

In [82]:
forest_clf_pred = forest_clf.predict(X_val)

from sklearn.metrics import accuracy_score
accuracy_score(y_val, forest_clf_pred)

0.8449880350970487

 - 85% of prediction accuracy , the almosts bookmakers are about 60% with odds modulation

In [83]:
from sklearn.metrics import accuracy_score

rnd_pred = rnd_search.predict(X_val)

accuracy_score(y_val, rnd_pred)

0.8699813879287424

### Test with only Common Opponent

In [84]:
X_train = stats_pipeline.fit_transform(dataset)
y_train = dataset["result"]

In [85]:
X_test = stats_pipeline.fit_transform(test_set)
y_test = test_set["result"]

In [86]:
X_val = stats_pipeline.fit_transform(valset)
y_val = valset["result"]

In [87]:
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint

param_distribs = {
        'n_estimators': randint(low=100, high=500),
        'max_features': randint(low=5, high=15),
    }

forest_reg = RandomForestClassifier(random_state=42)
rnd_search_co = RandomizedSearchCV(forest_reg, param_distributions=param_distribs,
                                n_iter=10, cv=5, scoring='neg_mean_squared_error', random_state=42, n_jobs=-1)
rnd_search_co.fit(X_train, y_train) 

RandomizedSearchCV(cv=5, error_score='raise-deprecating',
          estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators='warn', n_jobs=None,
            oob_score=False, random_state=42, verbose=0, warm_start=False),
          fit_params=None, iid='warn', n_iter=10, n_jobs=-1,
          param_distributions={'n_estimators': <scipy.stats._distn_infrastructure.rv_frozen object at 0x11f51fd68>, 'max_features': <scipy.stats._distn_infrastructure.rv_frozen object at 0x11f51ff28>},
          pre_dispatch='2*n_jobs', random_state=42, refit=True,
          return_train_score='warn', scoring='neg_mean_squared_error',
          verbose=0)

In [88]:
rnd_search_co.best_params_

{'max_features': 6, 'n_estimators': 443}

In [89]:
feature_importances = rnd_search_co.best_estimator_.feature_importances_
#sorted(zip(feature_importan)ces, list(dataset.columns.values)), reverse=True))
feature_importances[32:] ## useless

array([], dtype=float64)

In [90]:
from sklearn.metrics import accuracy_score

rnd_pred_co = rnd_search_co.predict(X_test)
accuracy_score(y_test, rnd_pred_co)

0.804

In [91]:
rnd_pred_co = rnd_search_co.predict(X_val)
accuracy_score(y_val, rnd_pred_co)

0.8529646370646105

In [92]:
save_model(rnd_search_co, "rnd_search_co")

## Prediction pre-match 11/10/18

In [93]:
import pickle

PATH = 'models/'

def load_model(name, path=PATH):
    file = open(path + name, 'rb')
    obj = pickle.load(file)
    file.close()
    return obj

model = load_model("rnd_search_co")

In [94]:
from sqlalchemy import create_engine
import psycopg2 as pg
import requests as req

URL = 'postgresql://dataset:dataset1747@35.204.62.91:5432/dataset-dump1'
engine = create_engine(URL)

def get_data_prediction(date, winner, looser):
    URL = "https://raw.githubusercontent.com/panaC/tennis-dataset/master/app/dataset/prediction.sql"
    with engine.connect() as conn, conn.begin():
        return pd.read_sql(req.get(URL).text.format(date=date, winner=winner, looser=looser), conn)

In [95]:
prediction_data = get_data_prediction('11/10/2018', 'Stefanos Tsitsipas', 'Alex de Minaur')
prediction_data

Unnamed: 0,uncertainty_1year,count_match_1year,serve_rating_1year,aces_1year,double_faults_1year,first_serve_1year,first_serve_point_won_1year,second_serve_point_won_1year,bp_saved_1year,service_game_played_1year,...,return_rating_20years,first_serve_return_point_won_20years,bp_converted_20years,return_games_played_20years,service_point_won_20years,return_point_won_20years,total_point_won_20years,winner_loser_match_win,h2h,minutes_loser_winner
0,0.104167,12,-36.375,-4.125,0.0625,2.3125,-12.5,-0.5,-16.1875,-2.3125,...,-17.083333,0.565476,-11.922619,0.75,-5.077381,-0.5,-3.154762,0.0,1.0,


In [96]:
prediction_data.insert(prediction_data.shape[1], "result", np.random.randint(2, size=prediction_data.shape[0]))

X_pred = stats_pipeline.fit_transform(prediction_data)
pred = model.predict(X_pred)

In [97]:
model.predict_proba(X_pred)

array([[0.0744921, 0.9255079]])

In [98]:
prediction_data["result"][0]

1

In [99]:
pred[0]

1

There is 97% of probability that Tsitsipas win because the result is 1 and 1 = first player in fct

In [100]:
prediction_data = get_data_prediction('11/10/2018', 'Kevin Anderson', 'Dominic Thiem')
prediction_data

Unnamed: 0,uncertainty_1year,count_match_1year,serve_rating_1year,aces_1year,double_faults_1year,first_serve_1year,first_serve_point_won_1year,second_serve_point_won_1year,bp_saved_1year,service_game_played_1year,...,return_rating_20years,first_serve_return_point_won_20years,bp_converted_20years,return_games_played_20years,service_point_won_20years,return_point_won_20years,total_point_won_20years,winner_loser_match_win,h2h,minutes_loser_winner
0,0.067935,13,-15.825,-5.65,1.6,-6.716667,2.075,0.516667,5.975,-0.333333,...,9.790835,3.092286,0.669302,-0.72378,-2.896813,2.132079,-0.335131,0.0,1.0,


In [101]:
prediction_data.insert(prediction_data.shape[1], "result", np.random.randint(2, size=prediction_data.shape[0]))

X_pred = stats_pipeline.fit_transform(prediction_data)
pred = model.predict(X_pred)

In [102]:
model.predict_proba(X_pred)

array([[0.09029345, 0.90970655]])

In [103]:
prediction_data["result"][0]

1

In [104]:
pred[0]

1

- warning combine option. I think that 1 year is too short for head2head , add 3 new colonne with may be 10 years combine

In [105]:
prediction_data = get_data_prediction('11/10/2018', 'Alexander Zverev', 'Marin Cilic')
prediction_data

Unnamed: 0,uncertainty_1year,count_match_1year,serve_rating_1year,aces_1year,double_faults_1year,first_serve_1year,first_serve_point_won_1year,second_serve_point_won_1year,bp_saved_1year,service_game_played_1year,...,return_rating_20years,first_serve_return_point_won_20years,bp_converted_20years,return_games_played_20years,service_point_won_20years,return_point_won_20years,total_point_won_20years,winner_loser_match_win,h2h,minutes_loser_winner
0,0.091912,15,14.346154,4.230769,-1.076923,-2.961538,3.153846,1.115385,3.538462,1.538462,...,3.321875,0.985911,-0.114494,0.206544,3.67691,0.968995,2.11016,1.0,0.666667,


In [106]:
prediction_data.insert(prediction_data.shape[1], "result", np.random.randint(2, size=prediction_data.shape[0]))

X_pred = stats_pipeline.fit_transform(prediction_data)
pred = model.predict(X_pred)

In [107]:
model.predict_proba(X_pred)

array([[0.81264108, 0.18735892]])

In [108]:
prediction_data["result"][0]

0

In [109]:
pred[0]

0

In [110]:
prediction_data = get_data_prediction('11/10/2018', 'Novak Djokovic', 'John Isner')
prediction_data

Unnamed: 0,uncertainty_1year,count_match_1year,serve_rating_1year,aces_1year,double_faults_1year,first_serve_1year,first_serve_point_won_1year,second_serve_point_won_1year,bp_saved_1year,service_game_played_1year,...,return_rating_20years,first_serve_return_point_won_20years,bp_converted_20years,return_games_played_20years,service_point_won_20years,return_point_won_20years,total_point_won_20years,winner_loser_match_win,h2h,minutes_loser_winner
0,0.15625,8,39.625,20.625,1.0,3.125,9.375,-11.6875,21.5625,7.5625,...,-68.709553,-13.003473,-14.844916,2.604565,3.238263,-14.168259,-5.131055,2.0,0.333333,


In [111]:
prediction_data.insert(prediction_data.shape[1], "result", np.random.randint(2, size=prediction_data.shape[0]))

X_pred = stats_pipeline.fit_transform(prediction_data)
pred = model.predict(X_pred)

In [112]:
model.predict_proba(X_pred)

array([[0.80586907, 0.19413093]])

In [113]:
prediction_data["result"][0]

0

In [114]:
pred[0]

0

if prediction_data["result"][0] == 0 then 
    
    pred = 1 = los or pred = 0 = winner 

else

    pred = 1 = win or pred = 0 = los