# Tennis pre-match prediction

## Test with a first dataset generate by a postegresql database and 3 tennis statistiques sources

In [59]:
# To support both python 2 and python 3
from __future__ import division, print_function, unicode_literals

# Common imports
import numpy as np
import pandas as pd
import os

# to make this notebook's output stable across runs
np.random.seed(42)

# To plot pretty figures
%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt
plt.rcParams['axes.labelsize'] = 14
plt.rcParams['xtick.labelsize'] = 12
plt.rcParams['ytick.labelsize'] = 12

# Where to save the figures
PROJECT_ROOT_DIR = "."

def save_fig(fig_id, tight_layout=True):
    path = os.path.join(PROJECT_ROOT_DIR, "images", fig_id + ".png")
    print("Saving figure", fig_id)
    if tight_layout:
        plt.tight_layout()
    plt.savefig(path, format='png', dpi=300)

### get data

##### CSV version

In [60]:
PATH = "dataset"
NAME = "dataset_train_final.csv"
PRED = "dataset_pred_final.csv"

def load_dataset_csv(path=PATH, name=NAME):
    csv_path = os.path.join(path, name)
    return pd.read_csv(csv_path)

def load_predictset_csv(path=PATH, name=PRED):
    csv_path = os.path.join(path, name)
    return pd.read_csv(csv_path)

##### SQL version

In [61]:
from sqlalchemy import create_engine
import psycopg2 as pg
import requests as req

URL = 'postgresql://dataset:dataset1747@35.204.62.91:5432/dataset-dump1'
engine = create_engine(URL)

def load_dataset_sql():
    URL = "https://raw.githubusercontent.com/panaC/tennis-dataset/master/app/dataset/extract_dataset.sql"
    with engine.connect() as conn, conn.begin():
        return pd.read_sql(req.get(URL).text, conn)

def load_predictset_sql():
    URL = "https://raw.githubusercontent.com/panaC/tennis-dataset/master/app/dataset/extract_predset.sql"
    with engine.connect() as conn, conn.begin():
        return pd.read_sql(req.get(URL).text, conn)


In [62]:
dataset = load_dataset_sql()
dataset.head()

Unnamed: 0,id,url_match_flash,url_match_atp,url_tour,name_tour_flash,name_tour_atp,name_tour_csv,round_flash,round_csv,qualif,...,return_rating_20years,first_serve_return_point_won_20years,bp_converted_20years,return_games_played_20years,service_point_won_20years,return_point_won_20years,total_point_won_20years,winner_loser_match_win,h2h,minutes_loser_winner
0,46453,https://www.flashscore.com/match/zLZZvnj4,/en/scores/archive/st-petersburg/568/2008/results,https://www.flashscore.com/tennis/atp-singles/...,St. Petersburg,St. Petersburg,St. Petersburg,1/16-finals\n,R32,False,...,,,,,,,,0.0,1.0,-0.773512
1,41294,https://www.flashscore.com/match/0rVGyXCB,/en/scores/archive/paris/352/2008/results,https://www.flashscore.com/tennis/atp-singles/...,Paris,ATP Masters Series Paris,Paris,1/32-finals\n,R64,False,...,6.712358,0.995578,2.247846,-0.18867,-0.839189,0.825073,-0.246736,1.0,0.0,-0.229482
2,11149,https://www.flashscore.com/match/AN9VUlDe,/en/scores/archive/chennai/891/2009/results,https://www.flashscore.com/tennis/atp-singles/...,Chennai,Chennai,Chennai,1/8-finals\n,R16,False,...,4.420939,3.429365,-1.070886,0.136627,-3.283386,2.138439,-0.866561,0.0,1.0,0.187478
3,47872,https://www.flashscore.com/match/zan0S1I0,/en/scores/archive/sydney/338/2009/results,https://www.flashscore.com/tennis/atp-singles/...,Sydney,Sydney,Sydney,Quarter-finals\n,QF,False,...,-36.133964,-5.092497,-10.070512,0.361746,-1.131777,-6.615623,-3.679177,1.0,0.5,0.913709
4,24571,https://www.flashscore.com/match/UT5TAoP1,/en/scores/archive/houston/717/2009/results,https://www.flashscore.com/tennis/atp-singles/...,Houston,Houston,Houston,\n,R32,False,...,-27.388889,-4.018519,-8.462963,0.111111,3.833333,-4.759259,-0.592593,0.0,1.0,0.797218


In [63]:
predset = load_predictset_sql()
predset.head()

Unnamed: 0,id,url_match_flash,url_match_atp,url_tour,name_tour_flash,name_tour_atp,name_tour_csv,round_flash,round_csv,qualif,...,return_rating_20years,first_serve_return_point_won_20years,bp_converted_20years,return_games_played_20years,service_point_won_20years,return_point_won_20years,total_point_won_20years,winner_loser_match_win,h2h,minutes_loser_winner
0,1998,https://www.flashscore.com/match/bZn9YCAH,/en/scores/archive/australian-open/580/2017/re...,https://www.flashscore.com/tennis/atp-singles/...,Australian Open,Australian Open,Australian Open,1/8-finals\n,R16,False,...,-1.548307,-2.001322,-0.280143,-0.16533,-8.061915,-0.686558,-3.696575,2.0,0.428571,0.041535
1,1999,https://www.flashscore.com/match/txaPHTT8,/en/scores/archive/australian-open/580/2017/re...,https://www.flashscore.com/tennis/atp-singles/...,Australian Open,Australian Open,Australian Open,1/8-finals\n,R16,False,...,-5.05276,-4.458171,-1.550054,0.940476,-8.395076,-1.776245,-4.574946,0.0,1.0,
2,674,https://www.flashscore.com/match/xMRaz1is,/en/scores/archive/antalya/7650/2018/results,https://www.flashscore.com/tennis/atp-singles/...,Antalya,Turkish Airlines Open Antalya,Antalya,1/16-finals\n,R32,False,...,-15.450347,-2.556808,-6.339931,-0.626426,-2.170461,-2.422321,-2.293787,0.0,1.0,
3,2000,https://www.flashscore.com/match/zVGlo1Ij,/en/scores/archive/australian-open/580/2017/re...,https://www.flashscore.com/tennis/atp-singles/...,Australian Open,Australian Open,Australian Open,1/8-finals\n,R16,False,...,-1.427585,-0.952493,-1.052282,-0.220919,-4.116451,-0.244308,-2.145387,3.0,0.333333,
4,2001,https://www.flashscore.com/match/dvIpns3p,/en/scores/archive/australian-open/580/2017/re...,https://www.flashscore.com/tennis/atp-singles/...,Australian Open,Australian Open,Australian Open,1/8-finals\n,R16,False,...,48.138823,6.502585,12.775799,0.319166,5.858005,8.511413,6.928529,1.0,0.0,0.365752


In [64]:
predset.shape

(3761, 91)

In [65]:
dataset.shape

(34148, 91)

In [66]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 34148 entries, 0 to 34147
Data columns (total 91 columns):
id                                      34148 non-null int64
url_match_flash                         34148 non-null object
url_match_atp                           34148 non-null object
url_tour                                34148 non-null object
name_tour_flash                         34148 non-null object
name_tour_atp                           34148 non-null object
name_tour_csv                           34148 non-null object
round_flash                             34148 non-null object
round_csv                               34148 non-null object
qualif                                  34148 non-null bool
door                                    34148 non-null object
surface                                 34148 non-null object
prize                                   34148 non-null int64
year_atp                                34148 non-null object
year_flash                 

In [67]:
# to make this notebook's output identical at every run
np.random.seed(42)

### split the training set and the test set into 2 batchs with the hash id

In [68]:
import hashlib

def test_set_check(identifier, test_ratio, hash=hashlib.md5):
    return bytearray(hash(np.int64(identifier)).digest())[-1] < 256 * test_ratio

def split_train_test_by_id(data, test_ratio, id_column):
    ids = data[id_column]
    in_test_set = ids.apply(lambda id_: test_set_check(id_, test_ratio))
    return data.loc[~in_test_set], data.loc[in_test_set]

In [69]:
train_set, test_set = split_train_test_by_id(dataset, 0.2, "id")

In [70]:
print(train_set.shape)
train_set.head()

(27148, 91)


Unnamed: 0,id,url_match_flash,url_match_atp,url_tour,name_tour_flash,name_tour_atp,name_tour_csv,round_flash,round_csv,qualif,...,return_rating_20years,first_serve_return_point_won_20years,bp_converted_20years,return_games_played_20years,service_point_won_20years,return_point_won_20years,total_point_won_20years,winner_loser_match_win,h2h,minutes_loser_winner
0,46453,https://www.flashscore.com/match/zLZZvnj4,/en/scores/archive/st-petersburg/568/2008/results,https://www.flashscore.com/tennis/atp-singles/...,St. Petersburg,St. Petersburg,St. Petersburg,1/16-finals\n,R32,False,...,,,,,,,,0.0,1.0,-0.773512
1,41294,https://www.flashscore.com/match/0rVGyXCB,/en/scores/archive/paris/352/2008/results,https://www.flashscore.com/tennis/atp-singles/...,Paris,ATP Masters Series Paris,Paris,1/32-finals\n,R64,False,...,6.712358,0.995578,2.247846,-0.18867,-0.839189,0.825073,-0.246736,1.0,0.0,-0.229482
4,24571,https://www.flashscore.com/match/UT5TAoP1,/en/scores/archive/houston/717/2009/results,https://www.flashscore.com/tennis/atp-singles/...,Houston,Houston,Houston,\n,R32,False,...,-27.388889,-4.018519,-8.462963,0.111111,3.833333,-4.759259,-0.592593,0.0,1.0,0.797218
6,56369,https://www.flashscore.com/match/MNrp7fsc,/en/scores/archive/wimbledon/540/2009/results,https://www.flashscore.com/tennis/atp-singles/...,Wimbledon,Wimbledon,Wimbledon,1/8-finals\n,R16,False,...,2.101025,1.133722,1.94051,-0.31117,-3.193286,0.208214,-1.121869,0.0,1.0,-0.616008
7,56378,https://www.flashscore.com/match/C4udt0i0,/en/scores/archive/wimbledon/540/2009/results,https://www.flashscore.com/tennis/atp-singles/...,Wimbledon,Wimbledon,Wimbledon,1/16-finals\n,R32,False,...,9.622499,2.504779,2.37381,0.261636,0.627115,1.83285,1.05479,2.0,0.0,-0.424937


In [71]:
print(test_set.shape)
test_set.head()

(7000, 91)


Unnamed: 0,id,url_match_flash,url_match_atp,url_tour,name_tour_flash,name_tour_atp,name_tour_csv,round_flash,round_csv,qualif,...,return_rating_20years,first_serve_return_point_won_20years,bp_converted_20years,return_games_played_20years,service_point_won_20years,return_point_won_20years,total_point_won_20years,winner_loser_match_win,h2h,minutes_loser_winner
2,11149,https://www.flashscore.com/match/AN9VUlDe,/en/scores/archive/chennai/891/2009/results,https://www.flashscore.com/tennis/atp-singles/...,Chennai,Chennai,Chennai,1/8-finals\n,R16,False,...,4.420939,3.429365,-1.070886,0.136627,-3.283386,2.138439,-0.866561,0.0,1.0,0.187478
3,47872,https://www.flashscore.com/match/zan0S1I0,/en/scores/archive/sydney/338/2009/results,https://www.flashscore.com/tennis/atp-singles/...,Sydney,Sydney,Sydney,Quarter-finals\n,QF,False,...,-36.133964,-5.092497,-10.070512,0.361746,-1.131777,-6.615623,-3.679177,1.0,0.5,0.913709
5,24581,https://www.flashscore.com/match/KlRgOoHL,/en/scores/archive/houston/717/2009/results,https://www.flashscore.com/tennis/atp-singles/...,Houston,Houston,Houston,1/16-finals\n,R32,False,...,14.286806,2.537153,3.804167,-0.647222,8.675694,2.500347,5.327431,0.0,1.0,0.036313
9,22833,https://www.flashscore.com/match/CAd3U9fI,/en/scores/archive/hamburg/414/2009/results,https://www.flashscore.com/tennis/atp-singles/...,Hamburg,Hamburg,Hamburg,1/8-finals\n,R16,False,...,-13.699561,-0.77193,-1.572368,0.307018,2.817982,-3.140351,0.028509,0.0,1.0,0.396022
19,58342,https://www.flashscore.com/match/2qMiK30n,/en/scores/archive/zagreb/2276/2010/results,https://www.flashscore.com/tennis/atp-singles/...,Zagreb,Zagreb,Zagreb,Semi-finals\n,SF,False,...,-0.81558,0.035688,-0.655978,0.496014,-0.785507,0.617572,-0.188043,0.0,1.0,0.07789


In [72]:
dataset = train_set.copy()

In [73]:
dataset["draw_size"] = dataset["draw_size"].apply(lambda x: str(x))
dataset["draw_size"] = dataset["draw_size"].astype("object")
index_draw_size = list(dataset["draw_size"].value_counts().index)
dataset["draw_size"].value_counts()

32     9754
128    4758
28     3155
56     3120
48     2549
96     1990
64     1737
8        74
16       11
Name: draw_size, dtype: int64

In [74]:
index_level_tour = list(dataset["level_tour"].value_counts().index)
dataset["level_tour"].value_counts()

A    16240
M     6233
G     4590
F       85
Name: level_tour, dtype: int64

In [75]:
index_surface = list(dataset["surface"].value_counts().index)
dataset["surface"].value_counts()

Hard      15239
Clay       8119
Grass      3231
Carpet      559
Name: surface, dtype: int64

In [76]:
index_door = list(dataset["door"].value_counts().index)
dataset["door"].value_counts()

Outdoor    22498
Indoor      4650
Name: door, dtype: int64

In [77]:
index_round_csv = list(dataset["round_csv"].value_counts().index)
dataset["round_csv"].value_counts()

R32     9529
R16     5100
R64     4937
R128    3039
QF      2552
SF      1293
F        627
RR        71
Name: round_csv, dtype: int64

In [78]:
dataset["year_atp"].value_counts()

2016    1906
2009    1826
2014    1798
2010    1792
2006    1785
2011    1774
2005    1764
2013    1745
2012    1727
2004    1727
2008    1719
2007    1705
2003    1658
2001    1590
2002    1572
2015    1060
Name: year_atp, dtype: int64

### add column result that will be the y axis on the dataset for the prediction

In [79]:
dataset.shape

(27148, 91)

In [80]:
dataset.insert(dataset.shape[1], "result", np.random.randint(2, size=dataset.shape[0]))

In [81]:
dataset["result"].value_counts()

1    13585
0    13563
Name: result, dtype: int64

In [82]:
test_set.insert(test_set.shape[1], "result", np.random.randint(2, size=test_set.shape[0]))

In [83]:
test_set["result"].value_counts()

1    3535
0    3465
Name: result, dtype: int64

### transformer

In [84]:
from sklearn.base import BaseEstimator, TransformerMixin

# A class to select numerical or categorical columns 
# since Scikit-Learn doesn't handle DataFrames yet
class DataFrameSelector(BaseEstimator, TransformerMixin):
    def __init__(self, attribute_names):
        self.attribute_names = attribute_names
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        return X[self.attribute_names]

In [85]:
class reverseStats(BaseEstimator, TransformerMixin):
       
    def fit(self, X, y=None):
        return self
    def transform(self, X, y=None):
        ## while on each row : if result = 1 nothing else opponent each value in each collumn
        def wh(a):
            if (a["result"] == 0):
                a = a.apply(lambda x: x * -1)
            return a
        
        #tmp = np.c_[self.result, X]
        X = X.apply(wh, axis=1)
        X = X.fillna(0)
        X = X.drop("result", axis=1)
        return X

In [86]:
stats_col = ["serve_rating_1year", "aces_1year", "double_faults_1year", "first_serve_1year", "first_serve_point_won_1year",
       "second_serve_point_won_1year", "bp_saved_1year", "service_game_played_1year", "return_rating_1year",
       "first_serve_return_point_won_1year", "bp_converted_1year", "return_games_played_1year", "service_point_won_1year",
       "return_point_won_1year", "total_point_won_1year", "serve_rating_20years",
       "aces_20years", "double_faults_20years", "first_serve_20years", "first_serve_point_won_20years", "second_serve_point_won_20years",
       "bp_saved_20years", "service_game_played_20years", "return_rating_20years", "first_serve_return_point_won_20years",
       "bp_converted_20years", "return_games_played_20years", "service_point_won_20years", "return_point_won_20years",
       "total_point_won_20years", "h2h", "minutes_loser_winner", "result"]
len(stats_col)

33

In [87]:
from sklearn.pipeline import Pipeline

stats_pipeline = Pipeline([
        ("select_stats", DataFrameSelector(stats_col)),
        ("reverse", reverseStats()),
    ])

stats_pipeline.fit_transform(dataset).shape

(27148, 32)

In [88]:
stats_pipeline.fit_transform(dataset)

Unnamed: 0,serve_rating_1year,aces_1year,double_faults_1year,first_serve_1year,first_serve_point_won_1year,second_serve_point_won_1year,bp_saved_1year,service_game_played_1year,return_rating_1year,first_serve_return_point_won_1year,...,service_game_played_20years,return_rating_20years,first_serve_return_point_won_20years,bp_converted_20years,return_games_played_20years,service_point_won_20years,return_point_won_20years,total_point_won_20years,h2h,minutes_loser_winner
0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,-1.000000,0.773512
1,-5.248770,-1.906463,-0.885008,1.441235,-2.708006,0.725850,-6.664129,-0.123888,7.866588,1.396886,...,-0.490857,6.712358,0.995578,2.247846,-0.188670,-0.839189,0.825073,-0.246736,0.000000,-0.229482
4,-3.863636,-3.318182,1.181818,2.363636,-0.500000,0.090909,-2.500000,2.045455,39.545455,10.590909,...,-0.000000,27.388889,4.018519,8.462963,-0.111111,-3.833333,4.759259,0.592593,-1.000000,-0.797218
6,11.248999,-1.732430,-0.962985,12.052418,-3.664035,0.980250,-3.957985,-0.618901,-4.497375,-2.746618,...,0.065118,-2.101025,-1.133722,-1.940510,0.311170,3.193286,-0.208214,1.121869,-1.000000,0.616008
7,-3.759786,-2.105343,0.837297,4.390615,-2.509024,-2.010051,2.528765,-0.242479,-11.505544,-2.816558,...,-0.266620,-9.622499,-2.504779,-2.373810,-0.261636,-0.627115,-1.832850,-1.054790,-0.000000,0.424937
8,-11.593195,0.322388,-0.713415,-2.607978,-3.026439,-1.054353,-4.515689,0.847459,-25.537029,-3.615618,...,0.588474,-18.066338,-2.490506,-6.515938,0.859608,-1.523675,-2.895551,-2.146697,0.250000,-0.494897
10,23.353163,1.277901,-0.018208,3.119388,4.989705,3.535234,4.844297,0.537483,21.205593,4.784921,...,0.222254,19.672078,3.733646,3.361961,-0.065724,3.573698,4.185283,3.649588,-0.333333,-0.469921
11,12.111111,3.295918,-0.191043,1.054422,1.134354,0.915533,-4.054422,-0.498866,11.549320,0.623016,...,-0.665675,9.699405,0.868056,4.159722,-0.653770,2.584325,0.442460,1.659722,-1.000000,-0.923528
12,10.916667,7.821429,1.869048,2.785714,-1.833333,-0.083333,7.559524,4.190476,-16.476190,3.333333,...,3.061111,-17.966667,1.883333,-7.700000,2.611111,-0.850000,-1.855556,-0.938889,-1.000000,0.351841
13,-31.293262,-0.731738,-1.475709,-2.830851,-7.979965,-6.042730,-3.310461,-1.410638,-43.171809,-9.003723,...,-0.057970,-34.598614,-6.830044,-9.236474,0.210949,-4.718919,-7.001903,-5.407898,1.000000,0.666206


In [89]:
from sklearn.preprocessing import OneHotEncoder, LabelBinarizer
from sklearn.preprocessing import label_binarize

In [90]:
class CustomLabelBinarizer(BaseEstimator, TransformerMixin):
    def __init__(self, classes):
        self.classes = classes
    def fit(self, X, y=None):
        return self
    def transform(self, X, y=None):
        enc = label_binarize(X, classes=self.classes)
        return enc

In [91]:
round_pipeline = Pipeline([
        ("select_cat", DataFrameSelector(["round_csv"])),
        ("cat_encoder", CustomLabelBinarizer(index_round_csv)),
    ])

round_pipeline.fit_transform(dataset)

array([[1, 0, 0, ..., 0, 0, 0],
       [0, 0, 1, ..., 0, 0, 0],
       [1, 0, 0, ..., 0, 0, 0],
       ...,
       [1, 0, 0, ..., 0, 0, 0],
       [0, 0, 1, ..., 0, 0, 0],
       [1, 0, 0, ..., 0, 0, 0]])

In [92]:
round_pipeline.fit_transform(dataset).shape

(27148, 8)

In [93]:
surface_pipeline = Pipeline([
        ("select_cat", DataFrameSelector(["surface"])),
        ("cat_encoder", CustomLabelBinarizer(index_surface)),
    ])

surface_pipeline.fit_transform(dataset)

array([[1, 0, 0, 0],
       [1, 0, 0, 0],
       [0, 1, 0, 0],
       ...,
       [0, 1, 0, 0],
       [1, 0, 0, 0],
       [1, 0, 0, 0]])

In [94]:
surface_pipeline.fit_transform(dataset).shape

(27148, 4)

In [95]:
drawSize_pipeline = Pipeline([
        ("select_cat", DataFrameSelector(["draw_size"])),
        ("cat_encoder", CustomLabelBinarizer(index_draw_size)),
    ])

drawSize_pipeline.fit_transform(dataset)

array([[1, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [1, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [96]:
drawSize_pipeline.fit_transform(dataset).shape

(27148, 9)

In [97]:
levelTour_pipeline = Pipeline([
        ("select_cat", DataFrameSelector(["level_tour"])),
        ("cat_encoder", CustomLabelBinarizer(index_level_tour)),
    ])

levelTour_pipeline.fit_transform(dataset)

array([[1, 0, 0, 0],
       [0, 1, 0, 0],
       [1, 0, 0, 0],
       ...,
       [1, 0, 0, 0],
       [0, 1, 0, 0],
       [0, 1, 0, 0]])

In [98]:
levelTour_pipeline.fit_transform(dataset).shape

(27148, 4)

In [99]:
from sklearn.preprocessing import Imputer

weather_pipeline = Pipeline([
        ("select_cat", DataFrameSelector(["temp_weather", "press_weather", "humid_weather"])),
        ('imputer', Imputer(strategy="mean")),
    ])

weather_pipeline.fit_transform(dataset)

array([[30.        , 28.81      , 93.        ],
       [61.55926031, 29.60130104, 78.46290689],
       [77.        , 29.83      , 88.        ],
       ...,
       [70.        , 29.83      , 56.        ],
       [61.55926031, 29.60130104, 78.46290689],
       [73.        , 30.07      , 57.        ]])

In [100]:
weather_pipeline.fit_transform(dataset).shape

(27148, 3)

In [101]:
from sklearn.preprocessing import Imputer

year_pipeline = Pipeline([
        ("select_cat", DataFrameSelector(["year_atp"])),
        ('imputer', Imputer(strategy="most_frequent")),
    ])

year_pipeline.fit_transform(dataset)

array([[2008.],
       [2008.],
       [2009.],
       ...,
       [2007.],
       [2008.],
       [2008.]])

In [102]:

year_pipeline.fit_transform(dataset).shape

(27148, 1)

In [103]:
class CustomLabelBinarizer(BaseEstimator, TransformerMixin):
    def __init__(self, sparse_output=False):
        self.sparse_output = sparse_output
    def fit(self, X, y=None):
        return self
    def transform(self, X, y=None):
        enc = LabelBinarizer(sparse_output=self.sparse_output)
        return enc.fit_transform(X)

In [104]:
door_pipeline = Pipeline([
        ("select_cat", DataFrameSelector(["door"])),
        ("cat_encoder", CustomLabelBinarizer()),
    ])

door_pipeline.fit_transform(dataset)

array([[0],
       [0],
       [1],
       ...,
       [1],
       [1],
       [1]])

In [105]:
door_pipeline.fit_transform(dataset).shape

(27148, 1)

In [106]:
class fillna(BaseEstimator, TransformerMixin):
    def __init__(self, val):
        self.val = val
        
    def fit(self, X, y=None):
        return self
    def transform(self, X, y=None):        
        return X.fillna(self.val)

In [107]:
uncertainty_pipeline = Pipeline([
        ("select_cat", DataFrameSelector(["uncertainty_1year", "uncertainty_20years"])),
        ("cat_encoder", fillna(10)),
    ])

uncertainty_pipeline.fit_transform(dataset).shape

(27148, 2)

In [108]:
count_pipeline = Pipeline([
        ("select_cat", DataFrameSelector(["count_match_1year", "count_match_20years"])),
        ("cat_encoder", fillna(0)),
    ])

count_pipeline.fit_transform(dataset).shape

(27148, 2)

In [109]:
from sklearn.pipeline import FeatureUnion
preprocess_pipeline = FeatureUnion(transformer_list=[
        ("stats_pipeline", stats_pipeline),
        ("round_pipeline", round_pipeline),
        ("surface_pipeline", surface_pipeline),
        ("drawSize_pipeline", drawSize_pipeline),
        ("levelTour_pipeline", levelTour_pipeline),
        ("weather_pipeline", levelTour_pipeline),
        ("year_pipeline", levelTour_pipeline),
        ("door_pipeline", levelTour_pipeline),
        ("uncertainty_pipeline", levelTour_pipeline),
        ("count_pipeline", count_pipeline),
    ])

In [110]:
X_train = preprocess_pipeline.fit_transform(dataset)

In [111]:
X_train.shape

(27148, 75)

In [112]:
pd.DataFrame(X_train).head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,65,66,67,68,69,70,71,72,73,74
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
1,-5.24877,-1.906463,-0.885008,1.441235,-2.708006,0.72585,-6.664129,-0.123888,7.866588,1.396886,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,194.0,209.0
2,-3.863636,-3.318182,1.181818,2.363636,-0.5,0.090909,-2.5,2.045455,39.545455,10.590909,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,12.0,32.0
3,11.248999,-1.73243,-0.962985,12.052418,-3.664035,0.98025,-3.957985,-0.618901,-4.497375,-2.746618,...,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,245.0,428.0
4,-3.759786,-2.105343,0.837297,4.390615,-2.509024,-2.010051,2.528765,-0.242479,-11.505544,-2.816558,...,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,387.0,469.0


In [113]:
pd.DataFrame(X_train).isnull().any().value_counts()

False    75
dtype: int64

In [114]:
y_train = dataset["result"]
y_train.shape

(27148,)

#### predset preprocess

In [115]:
testset = test_set.copy()

In [116]:
X_test = preprocess_pipeline.fit_transform(testset)
y_test = testset["result"]

### KNN

In [117]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV

param_grid = [{'weights': ["uniform", "distance"], 'n_neighbors': [3, 4, 5]}]

knn_clf = KNeighborsClassifier()
grid_search = GridSearchCV(knn_clf, param_grid, cv=5, verbose=3, n_jobs=-1)
grid_search.fit(X_train, y_train)

Fitting 5 folds for each of 6 candidates, totalling 30 fits
[CV] n_neighbors=3, weights=uniform ..................................
[CV] n_neighbors=3, weights=uniform ..................................
[CV] n_neighbors=3, weights=uniform ..................................
[CV] n_neighbors=3, weights=uniform ..................................
[CV]  n_neighbors=3, weights=uniform, score=0.6515653775322283, total=   2.1s
[CV] n_neighbors=3, weights=uniform ..................................
[CV]  n_neighbors=3, weights=uniform, score=0.6469613259668509, total=   2.0s
[CV] n_neighbors=3, weights=distance .................................
[CV]  n_neighbors=3, weights=uniform, score=0.6537753222836096, total=   2.1s
[CV] n_neighbors=3, weights=distance .................................
[CV]  n_neighbors=3, weights=uniform, score=0.6564744888561429, total=   2.0s
[CV] n_neighbors=3, weights=distance .................................
[CV]  n_neighbors=3, weights=distance, score=0.6626151012891

[Parallel(n_jobs=-1)]: Done  30 out of  30 | elapsed:  1.2min finished


GridSearchCV(cv=5, error_score='raise',
       estimator=KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform'),
       fit_params=None, iid=True, n_jobs=-1,
       param_grid=[{'weights': ['uniform', 'distance'], 'n_neighbors': [3, 4, 5]}],
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=3)

In [118]:
grid_search.best_params_

{'n_neighbors': 5, 'weights': 'distance'}

In [119]:
grid_search.best_score_

0.6707676440253426

In [120]:
from sklearn.neighbors import KNeighborsClassifier
knn_clf = KNeighborsClassifier(n_jobs=-1, weights='uniform', n_neighbors=3)
knn_clf.fit(X_train, y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=-1, n_neighbors=3, p=2,
           weights='uniform')

In [121]:
y_knn_pred = knn_clf.predict(X_train)

from sklearn.metrics import accuracy_score
accuracy_score(y_train, y_knn_pred)

0.8213128038897893

### RF

In [122]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score

forest_clf = RandomForestClassifier(random_state=42)
forest_clf.fit(X_train, y_train)
forest_scores = cross_val_score(forest_clf, X_train, y_train, cv=10, n_jobs=-1)
forest_scores.mean()

0.8360468401329738

In [123]:
forest_clf_pred = forest_clf.predict(X_test)

from sklearn.metrics import accuracy_score
accuracy_score(y_test, forest_clf_pred)

0.8272857142857143

### saved the model on the disk

In [124]:
import pickle

PATH = 'models/'

def save_model(obj, name, path=PATH):
    file = open(path + name, 'wb')
    pickle.dump(obj, file)
    file.close()

save_model(forest_clf, "forest_clf")

### GridSearch

In [125]:
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint

param_distribs = {
        'n_estimators': randint(low=100, high=500),
        'max_features': randint(low=5, high=15),
    }

forest_reg = RandomForestClassifier(random_state=42)
rnd_search = RandomizedSearchCV(forest_reg, param_distributions=param_distribs,
                                n_iter=10, cv=5, scoring='neg_mean_squared_error', random_state=42, n_jobs=-1)
rnd_search.fit(X_train, y_train) 

RandomizedSearchCV(cv=5, error_score='raise',
          estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=42, verbose=0, warm_start=False),
          fit_params=None, iid=True, n_iter=10, n_jobs=-1,
          param_distributions={'n_estimators': <scipy.stats._distn_infrastructure.rv_frozen object at 0x7f0e37969be0>, 'max_features': <scipy.stats._distn_infrastructure.rv_frozen object at 0x7f0e37969828>},
          pre_dispatch='2*n_jobs', random_state=42, refit=True,
          return_train_score='warn', scoring='neg_mean_squared_error',
          verbose=0)

In [126]:
rnd_search.best_params_

{'max_features': 14, 'n_estimators': 314}

In [127]:
feature_importances = rnd_search.best_estimator_.feature_importances_
#sorted(zip(feature_importan)ces, list(dataset.columns.values)), reverse=True))
feature_importances[32:] ## useless

array([1.75653409e-03, 1.60280006e-03, 1.26503253e-03, 9.03136652e-04,
       1.37155518e-03, 1.18315312e-03, 9.53292146e-04, 9.41134806e-05,
       1.68051839e-03, 1.71384291e-03, 1.22258163e-03, 5.36134909e-04,
       1.42053506e-03, 7.66004463e-04, 1.25514610e-03, 1.28137508e-03,
       1.22559403e-03, 9.65788110e-04, 9.71561214e-04, 4.62818615e-05,
       4.16494691e-05, 8.06895302e-04, 7.87776190e-04, 6.50133314e-04,
       7.54337609e-05, 8.26826420e-04, 8.43725556e-04, 6.60101834e-04,
       9.00343827e-05, 8.80777440e-04, 7.88677264e-04, 7.24387306e-04,
       8.08913618e-05, 8.02896400e-04, 8.17603782e-04, 6.12404554e-04,
       6.17773828e-05, 8.64846185e-04, 8.06043502e-04, 6.61991858e-04,
       7.93062041e-05, 3.08099300e-02, 3.76803195e-02])

In [128]:
from sklearn.metrics import accuracy_score

rnd_pred = rnd_search.predict(X_train)

accuracy_score(y_train, rnd_pred)

0.9999631648740239

In [129]:
save_model(rnd_pred, "rnd_pred")

### predset

match beetween years 2017 and 2018

In [130]:
valset = predset.copy()

valset.insert(valset.shape[1], "result", np.random.randint(2, size=valset.shape[0]))
valset["result"].value_counts()

X_val = preprocess_pipeline.fit_transform(valset)
y_val = valset["result"]

In [131]:
forest_clf_pred = forest_clf.predict(X_val)

from sklearn.metrics import accuracy_score
accuracy_score(y_val, forest_clf_pred)

0.8447221483647966

 - 85% of prediction accuracy , the almosts bookmakers are about 60% with odds modulation

In [132]:
from sklearn.metrics import accuracy_score

rnd_pred = rnd_search.predict(X_val)

accuracy_score(y_val, rnd_pred)

0.8715767083222548

### Test with only Common Opponent

In [133]:
X_train = stats_pipeline.fit_transform(dataset)
y_train = dataset["result"]

In [134]:
X_test = stats_pipeline.fit_transform(test_set)
y_test = test_set["result"]

In [136]:
X_val = stats_pipeline.fit_transform(valset)
y_val = valset["result"]

In [137]:
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint

param_distribs = {
        'n_estimators': randint(low=100, high=500),
        'max_features': randint(low=5, high=15),
    }

forest_reg = RandomForestClassifier(random_state=42)
rnd_search = RandomizedSearchCV(forest_reg, param_distributions=param_distribs,
                                n_iter=10, cv=5, scoring='neg_mean_squared_error', random_state=42, n_jobs=-1)
rnd_search.fit(X_train, y_train) 

RandomizedSearchCV(cv=5, error_score='raise',
          estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=42, verbose=0, warm_start=False),
          fit_params=None, iid=True, n_iter=10, n_jobs=-1,
          param_distributions={'n_estimators': <scipy.stats._distn_infrastructure.rv_frozen object at 0x7f0e3b8b0ef0>, 'max_features': <scipy.stats._distn_infrastructure.rv_frozen object at 0x7f0e3b8b00b8>},
          pre_dispatch='2*n_jobs', random_state=42, refit=True,
          return_train_score='warn', scoring='neg_mean_squared_error',
          verbose=0)

In [138]:
rnd_search.best_params_

{'max_features': 6, 'n_estimators': 443}

In [139]:
feature_importances = rnd_search.best_estimator_.feature_importances_
#sorted(zip(feature_importan)ces, list(dataset.columns.values)), reverse=True))
feature_importances[32:] ## useless

array([], dtype=float64)

In [140]:
from sklearn.metrics import accuracy_score

rnd_pred = rnd_search.predict(X_test)
accuracy_score(y_test, rnd_pred)

0.8047142857142857

In [141]:
rnd_pred = rnd_search.predict(X_val)
accuracy_score(y_val, rnd_pred)

0.8617388992289284