# Predicting Match Outcomes

In [30]:
#SQL Imports
import mysql.connector
#Pandas imports
import pandas as pd
import numpy as np
from datetime import datetime as dt
import matplotlib.pyplot as plt


In [2]:
mydb = mysql.connector.connect(
  host="localhost",
  user="root",
  passwd="NYg1@nts",
  database="NRL_data"
)
mycursor = mydb.cursor()

## Create match_stat_df of aggregated player stats for each match

In [3]:
query = pd.read_sql_query('''
    SELECT * 
    FROM PlayerMatchStats
''', mydb)
player_stat_df = pd.DataFrame(query)
match_stat_df = player_stat_df.groupby(['match_id','team_id']).sum().reset_index()
match_stat_df = match_stat_df.drop(columns=['id', 'player_id', 'position_id', 'minutes_played'])
#match_stat_df.columns
#print(match_stat_df.dtypes)

In [4]:
def get_averages(df):
    df['metres_per_kick'] = df['kicking_metres'] / df['kicks']
    df['metres_per_run'] = df['total_run_metres'] / df['total_runs']
    df['metres_ran_per_try'] = df['total_run_metres'] / df['tries']
    df['tackle_percentage'] = df['tackles_made'] / (df['tackles_made'] + df['tackles_missed'] + df['ineffective_tackles'])
    return df
match_stat_df = get_averages(match_stat_df)
match_stat_df = match_stat_df.drop(columns=['average_play_ball_seconds']).fillna(0)
match_stat_df.head()

Unnamed: 0,match_id,team_id,points,tries,conversions,conversion_attempts,penalty_goals,field_goals,total_runs,total_run_metres,...,handling_errors,one_on_ones_lost,penalties,on_report,sin_bins,send_offs,metres_per_kick,metres_per_run,metres_ran_per_try,tackle_percentage
0,1,1,12,2,1,2,1,0,130,1076,...,11,0,9,1,0,0,27.285714,8.276923,538.0,0.871465
1,1,14,34,5,4,5,3,0,168,1618,...,9,2,7,0,0,0,28.117647,9.630952,323.6,0.88806
2,2,6,18,3,2,3,1,0,178,1648,...,10,0,11,0,0,0,21.846154,9.258427,549.333333,0.88835
3,2,8,19,3,3,3,0,1,192,1425,...,6,0,9,0,0,0,25.619048,7.421875,475.0,0.846535
4,3,4,14,2,2,2,1,0,134,1144,...,12,0,10,0,0,0,20.928571,8.537313,572.0,0.865952


## Create match_df of info not related to player stats

In [5]:
match_query = pd.read_sql_query('''
    SELECT id, date, round, home_team_id, home_score, away_team_id, away_score, winner, home_odds, away_odds, draw_odds
    FROM Matches
''', mydb)
match_df = pd.DataFrame(match_query).rename(columns={'id': 'match_id'})
match_df['date'] = pd.to_datetime(match_df['date'])
match_df.head()

Unnamed: 0,match_id,date,round,home_team_id,home_score,away_team_id,away_score,winner,home_odds,away_odds,draw_odds
0,1,2018-03-08,1.0,14,34,1,12,14,0.5882,0.4386,0.0488
1,2,2018-03-09,1.0,8,19,6,18,8,0.5181,0.5128,0.0501
2,3,2018-03-09,1.0,10,20,4,14,10,0.5988,0.4329,0.0482
3,4,2018-03-10,1.0,16,10,15,8,15,0.25,0.7874,0.0369
4,5,2018-03-10,1.0,13,20,9,32,9,0.6803,0.3509,0.0476


In [6]:
#Add date to match_stat_df
match_stat_df = pd.merge(match_stat_df, match_df[['match_id', 'date', 'winner']], on='match_id', how='inner')
match_stat_df['is_winner'] = match_stat_df.apply(lambda x: 1 if x['team_id'] == x['winner'] else 0, axis=1)
match_stat_df = match_stat_df.drop(columns='winner')
match_stat_df.head()

Unnamed: 0,match_id,team_id,points,tries,conversions,conversion_attempts,penalty_goals,field_goals,total_runs,total_run_metres,...,penalties,on_report,sin_bins,send_offs,metres_per_kick,metres_per_run,metres_ran_per_try,tackle_percentage,date,is_winner
0,1,1,12,2,1,2,1,0,130,1076,...,9,1,0,0,27.285714,8.276923,538.0,0.871465,2018-03-08,0
1,1,14,34,5,4,5,3,0,168,1618,...,7,0,0,0,28.117647,9.630952,323.6,0.88806,2018-03-08,1
2,2,6,18,3,2,3,1,0,178,1648,...,11,0,0,0,21.846154,9.258427,549.333333,0.88835,2018-03-09,0
3,2,8,19,3,3,3,0,1,192,1425,...,9,0,0,0,25.619048,7.421875,475.0,0.846535,2018-03-09,1
4,3,4,14,2,2,2,1,0,134,1144,...,10,0,0,0,20.928571,8.537313,572.0,0.865952,2018-03-09,0


# Create class for each team

In [7]:
class Teams:
    all_teams = {}
    
    def __init__(self, name, id):
        self.name = name
        self.id = id
        self.match_stats = match_stat_df[match_stat_df['team_id'] == self.id]
        self.opposition_stats = self.get_opposition_stats()
        Teams.all_teams[self.name] = self
    
    def get_opposition_stats(self):
        match_ids = list(self.match_stats['match_id'])
        opposition_df = match_stat_df[(match_stat_df['match_id'].isin(match_ids)) & (match_stat_df['team_id'] != self.id)]
        return opposition_df
    
    def get_stats_most_recent_matches(self, date, x=1):
        date_df = self.match_stats[self.match_stats['date'] < date].sort_values('date', ascending=False)
        date_df = date_df.drop(columns=['date'])
        averages_df = date_df.head(x).sum() / x
        return averages_df
    
    def get_opposition_stats_most_recent_matches(self, date, x=1):
        date_df = self.opposition_stats[self.opposition_stats['date'] < date].sort_values('date', ascending=False)
        date_df = date_df.drop(columns=['date'])
        averages_df = date_df.head(x).sum() / x
        return averages_df
    
    def get_season_averages(self, year, self_or_opposition = 'self'):
        if self_or_opposition == 'opposition':
            stats = self.opposition_stats
        else:
            stats = self.match_stats
        date_df = stats[stats['date'].dt.year == year]
        date_df.drop(columns=['date'])
        averages_df = date_df.sum() / date_df.shape[0]
        return averages_df

team_names_and_ids = pd.DataFrame(pd.read_sql_query('SELECT id, nickname FROM Teams', mydb)).set_index('nickname').to_dict()['id']
for name in team_names_and_ids.keys():
     Teams(name, team_names_and_ids[name])
#print(Teams.all_teams['Roosters'].opposition_stats.head())
#print(Teams.all_teams['Roosters'].get_season_averages(2014))

match_id                 1382.185185
team_id                    15.000000
points                     26.296296
tries                       4.518519
conversions                 3.703704
conversion_attempts         4.518519
penalty_goals               0.407407
field_goals                 0.000000
total_runs                161.037037
total_run_metres         1500.074074
kick_return_metres        196.444444
post_contact_metres       479.518519
line_breaks                 4.925926
line_break_assists          3.333333
try_assists                 3.555556
line_engaged_runs           0.000000
tackle_breaks              28.888889
hit_ups                    78.740741
play_the_ball               0.000000
dummy_half_runs             9.666667
dummy_half_run_metres      85.222222
steals                      0.111111
offloads                    9.037037
dummy_passes               10.333333
passes                    190.666667
receipts                  367.111111
tackles_made              321.518519
t

# Create class for each match

In [8]:
class Matches:
    all_matches = {}
    
    def __init__(self, match_id, round, date, home_team, away_team, winner=None, home_odds=None, away_odds=None):
        self.id = match_id
        self.round = round
        self.date = date
        self.season = self.date.year
        self.home = home_team
        self.home_odds = home_odds
        self.away = away_team
        self.away_odds = away_odds
        if home_team.id == winner:
            self.winner = home_team
            self.home_winner = True
        else:
            self.winner = away_team
            self.home_winner = False
        self.stats = self.get_match_stats()
        self.identifier = str(self.date.date()) + '_' + str(self.id) + '_' + self.home.name + '-v-' + self.away.name
        Matches.all_matches[self.identifier] = self
        #Matches.all_matches.append(self)
    
    def get_match_stats(self):
        match_stats = pd.DataFrame()
        home_stats = self.home.match_stats[self.home.match_stats['match_id'] == self.id]
        away_stats = self.away.match_stats[self.away.match_stats['match_id'] == self.id]
        for column in list(home_stats.columns)[2:-2]:
            match_stats['h_' + column] = [home_stats[column].values[0]]
            match_stats['a_' + column] = [away_stats[column].values[0]]
            #match_stats['diff_' + column] = match_stats['h_' + column] - match_stats['a_' + column]
        return match_stats
    
    def predict_winner():
        pass

In [9]:
def create_match_objects(match):
    for key, value in Teams.all_teams.items():
        if match['home_team_id'] == Teams.all_teams[key].id:
            home_team = Teams.all_teams[key]
        if match['away_team_id'] == Teams.all_teams[key].id:
            away_team = Teams.all_teams[key]
    Matches(match['match_id'], match['round'], match['date'], home_team, away_team, match['winner'], match['home_odds'], match['away_odds'])
    
match_df.apply(lambda x: create_match_objects(x), axis = 1)

#Verification that code worked
Matches.all_matches['2018-03-09_3_Cowboys-v-Sharks'].stats


True

## Find features significant in predicting home win

In [10]:
features = pd.DataFrame()
for key in Matches.all_matches:
    match = Matches.all_matches[key]
    match_row = match.stats
    match_row['id'] = match.id
    match_row['h_win'] = match.home_winner
    features = features.append(match_row)
features = features.set_index('id')
features['h_win'] = features['h_win'].replace({True: 1, False: 0})
features.head()

Unnamed: 0_level_0,h_points,a_points,h_tries,a_tries,h_conversions,a_conversions,h_conversion_attempts,a_conversion_attempts,h_penalty_goals,a_penalty_goals,...,a_send_offs,h_metres_per_kick,a_metres_per_kick,h_metres_per_run,a_metres_per_run,h_metres_ran_per_try,a_metres_ran_per_try,h_tackle_percentage,a_tackle_percentage,h_win
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,34,12,5,2,4,1,5,2,3,1,...,0,28.117647,27.285714,9.630952,8.276923,323.6,538.0,0.88806,0.871465,1
2,19,18,3,3,3,2,3,3,0,1,...,0,25.619048,21.846154,7.421875,9.258427,475.0,549.333333,0.846535,0.88835,1
3,20,14,4,2,1,2,4,2,1,1,...,0,19.384615,20.928571,8.169697,8.537313,337.0,572.0,0.877419,0.865952,1
4,10,8,1,1,1,1,1,1,1,1,...,0,24.521739,28.684211,8.839779,8.296512,1600.0,1427.0,0.898667,0.860406,0
5,20,32,4,6,2,3,4,6,0,1,...,0,25.777778,25.4375,9.754717,9.754601,387.75,265.0,0.864,0.855385,0


## Feature Selection

In [11]:
from sklearn.feature_selection import f_classif
from sklearn.feature_selection import SelectKBest

In [12]:
##Troubleshoot issue with infinity values...occurred in metres_ran_per_try when no tries were scored
features = features.replace([np.inf, -np.inf], 0)
filter = [col for col in features if (col.startswith('h_') | col.startswith('a_'))]
features = features[filter]


In [14]:
X = features.iloc[:, :-1]
Y = features['h_win']

#Create a SelectKBest object to select features with the best ANOVOA F-Values
fvalue_selector = SelectKBest(score_func=f_classif, k=15)
fvalue_selector.fit_transform(X, Y)

f_scores_df = pd.DataFrame(dict(feature_names = X.columns, f_scores = fvalue_selector.scores_)).sort_values('f_scores', ascending=False)
f_scores_df[f_scores_df['f_scores'] > 60]

  f = msb / msw


Unnamed: 0,feature_names,f_scores
0,h_points,685.664954
1,a_points,676.611124
2,h_tries,564.665038
6,h_conversion_attempts,557.746359
3,a_tries,535.805577
7,a_conversion_attempts,527.80131
5,a_conversions,478.179379
4,h_conversions,429.497824
24,h_try_assists,380.86027
25,a_try_assists,341.249165


In [73]:
#columns = h_points, a_points, h_line_breaks, a_line_breaks, h_total_run_metres, a_total_run_metres
#h_tackles_made, a_tackles_made, 
#columns_to_add = ['h_win_percentage', 'a_win_percentage']


## Generate df of historical data for making predictions

In [78]:
predictions_data = {}
pd.DataFrame()

for i in range(1, 11):
    predictions_data[str(i)] = pd.DataFrame()
    for key in Matches.all_matches:
        prediction_row = {} 
        match = Matches.all_matches[key]
        prediction_row['match_id'] = match.id
        prediction_row['date'] = match.date
        year = match.season
        prediction_row['round'] = match.round
        h = match.home
        a = match.away
        if match.home_winner is True:
            prediction_row['home_winner'] = 1
        else:
            prediction_row['home_winner'] = 0
    
        team_stats = ['points', 'line_breaks', 'total_run_metres', 'tackles_made', 'is_winner']
        opposition_stats = ['points', 'line_breaks', 'total_run_metres', 'tackles_made']
    
        renaming_dict = {
            'points': 'points_allowed',
            'line_breaks': 'line_breaks_allowed',
            'total_run_metres': 'total_run_metres_allowed',
            'tackles_made': 'tackles_by_opposition'
        }
    
        if (np.isnan(match.round)) | int(match.round > i):
            home_stats = h.get_stats_most_recent_matches(match.date, i)[team_stats]
            home_opposition_stats = h.get_opposition_stats_most_recent_matches(match.date, i)[opposition_stats]
        
            away_stats = a.get_stats_most_recent_matches(match.date, i)[team_stats]
            away_opposition_stats = a.get_opposition_stats_most_recent_matches(match.date, i)[opposition_stats]
    
        elif int(match.round) == 1:
            home_stats = h.get_season_averages(year - 1)[team_stats]
            home_opposition_stats = h.get_season_averages(year - 1)[opposition_stats]
        
            away_stats = a.get_season_averages(year - 1)[team_stats]
            away_opposition_stats = a.get_season_averages(year - 1)[opposition_stats]
    
        else:
            home_stats = h.get_stats_most_recent_matches(match.date, int(match.round) - 1)[team_stats]
            home_stats = ((home_stats / i) * (i - int(match.round))) + ((h.get_season_averages(year - 1)[team_stats] / i) * int(match.round))
            
            away_stats = a.get_stats_most_recent_matches(match.date, int(match.round) - 1)[team_stats]
            away_stats = ((away_stats / i) * (i - int(match.round))) + ((a.get_season_averages(year - 1)[team_stats] / i) * int(match.round))
        
            home_opposition_stats = h.get_opposition_stats_most_recent_matches(match.date, int(match.round) - 1)[opposition_stats]
            home_opposition_stats = ((home_opposition_stats / i) * (i - int(match.round))) + ((h.get_season_averages(year - 1)[opposition_stats] / i) * int(match.round))
            
            away_opposition_stats = a.get_opposition_stats_most_recent_matches(match.date, int(match.round) - 1)[opposition_stats]
            away_opposition_stats = ((away_opposition_stats / i) * (i - int(match.round))) + ((a.get_season_averages(year - 1)[opposition_stats] / i) * int(match.round))
        
        
#             home_stats['is_winner'] = ((h.get_season_averages(year - 1)['is_winner'] / i) * int(match.round))\
#                 + ((home_stats['is_winner'] / i) * (i - int(match.round)))
#             away_stats['is_winner'] = ((a.get_season_averages(year - 1)['is_winner'] / i) * int(match.round))\
#                 + ((away_stats['is_winner'] / i) * (i - int(match.round)))

        home_opposition_stats = home_opposition_stats.rename(renaming_dict)
        away_opposition_stats = away_opposition_stats.rename(renaming_dict)
        for stat in team_stats:
            prediction_row['h_' + stat] = home_stats[stat]
            prediction_row['a_' + stat] = away_stats[stat]
        for key, value in home_opposition_stats.items():
            prediction_row['h_' + key] = home_opposition_stats[key]
            prediction_row['a_' + key] = away_opposition_stats[key]
    
        predictions_data[str(i)] = predictions_data[str(i)].append(prediction_row, ignore_index=True)
    predictions_data[str(i)]['match_id'] = predictions_data[str(i)]['match_id'].astype(int)
    predictions_data[str(i)] = predictions_data[str(i)].sort_values(by='date').drop(columns='date')


In [79]:
for key in predictions_data:
    predictions_data[key]['round'] = predictions_data[key]['round'].fillna(0)
    predictions_data[key] = predictions_data[key].dropna(axis=0)
predictions_data['5']

Unnamed: 0,a_is_winner,a_line_breaks,a_line_breaks_allowed,a_points,a_points_allowed,a_tackles_by_opposition,a_tackles_made,a_total_run_metres,a_total_run_metres_allowed,h_is_winner,...,h_line_breaks_allowed,h_points,h_points_allowed,h_tackles_by_opposition,h_tackles_made,h_total_run_metres,h_total_run_metres_allowed,home_winner,match_id,round
1247,0.2,1.8,2.4,13.6,19.8,333.8,278.4,1454.2,1520.2,0.6,...,1.6,21.2,13.2,297.4,349.6,1366.2,1393.6,1.0,1316,6.0
1246,0.4,3.6,3.0,17.6,19.6,308.8,316.8,1464.2,1447.2,0.4,...,3.0,18.8,16.0,355.6,332.0,1469.4,1447.6,1.0,1315,6.0
1248,0.2,4.0,5.4,18.4,28.0,277.4,305.2,1403.6,1458.2,0.6,...,3.0,24.0,15.8,353.2,310.6,1567.6,1310.2,1.0,1317,6.0
1249,1.0,4.6,3.0,28.4,15.2,296.6,322.8,1495.0,1381.2,1.0,...,4.4,25.4,17.6,316.2,301.8,1565.4,1436.4,0.0,1318,6.0
1250,0.2,2.8,5.0,14.0,25.2,299.8,315.0,1276.2,1495.8,0.4,...,5.6,15.2,27.0,304.8,296.0,1325.4,1405.4,1.0,1319,6.0
1251,0.4,1.8,1.8,14.0,13.2,346.4,326.6,1550.4,1555.2,0.8,...,1.4,22.8,7.2,318.2,328.2,1546.0,1339.6,1.0,1320,6.0
1252,0.4,2.8,3.6,15.4,22.0,324.4,325.6,1469.6,1538.0,0.4,...,5.0,16.2,26.0,301.2,313.0,1276.2,1508.2,0.0,1321,6.0
1253,0.4,3.0,3.4,17.4,23.4,296.4,313.8,1368.8,1421.6,0.6,...,3.0,20.4,13.6,327.2,322.0,1567.0,1506.8,1.0,1322,6.0
1254,0.6,3.2,3.4,16.0,18.4,336.0,333.8,1556.0,1516.8,0.8,...,0.8,26.8,7.6,300.6,344.4,1426.8,1344.2,1.0,1323,7.0
1255,0.2,2.8,4.4,15.2,21.2,304.2,328.4,1302.4,1546.4,1.0,...,3.0,25.8,15.2,309.6,329.2,1506.2,1423.8,1.0,1324,7.0


## Model Selection

In [80]:
#Kernel SVM, Random Forest, Gradient Boosting Tree, Neural Network

#Evaluation...use Area Under Curve

In [81]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
import seaborn as sns
sns.set()

from warnings import simplefilter
simplefilter(action='ignore', category=FutureWarning)

In [83]:
for key in predictions_data:
    data = predictions_data[key]
    data = data.replace([np.inf, -np.inf], 0)
    #print(data)
    X = data.drop(columns=['home_winner', 'match_id', 'round'])
    y = data['home_winner']
    X_train, X_test, y_train, y_test = train_test_split(StandardScaler().fit_transform(X), y, test_size=0.2, random_state=42)
    log_reg = LogisticRegression()
    log_reg.fit(X_train, y_train)
    y_pred = log_reg.predict(X_test)
    print('Prior ' + key + ' matches: ' + str(accuracy_score(y_test, y_pred)))

Prior 1 matches: 0.5714285714285714
Prior 2 matches: 0.5806451612903226
Prior 3 matches: 0.5487364620938628
Prior 4 matches: 0.6036363636363636
Prior 5 matches: 0.5766423357664233
Prior 6 matches: 0.5845588235294118
Prior 7 matches: 0.5830258302583026
Prior 8 matches: 0.5947955390334573
Prior 9 matches: 0.5767790262172284
Prior 10 matches: 0.5639097744360902


In [20]:
# test_data = predictions_data.iloc[y_test.index.to_list()]
# results = test_data[['match_id', 'home_winner']]
# results['predictions'] = y_pred
# results['correct_prediction'] = results.apply(lambda x: 
#                                               ((x['predictions'] == 0) & (x['home_winner'] == 0)) |
#                                               ((x['predictions'] == 1) & (x['home_winner'] == 1)), axis=1)
# accuracy = len(results[results['correct_prediction'] == True]) / results.shape[0]
# print('calculated accuracy: ' + str(accuracy))
print(accuracy_score(y_test, y_pred))


0.5709219858156028


## Random Forest

In [84]:
from sklearn.ensemble import RandomForestClassifier
random_forest_results = pd.DataFrame({'estimators': list(range(1, 300, 25))})
random_state = 0
for key in predictions_data:
    data = predictions_data[key]
    data = data.replace([np.inf, -np.inf], 0)
    #print(data)
    X = data.drop(columns=['home_winner', 'match_id', 'round'])
    y = data['home_winner']
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    key_results = []
    random_state = 0
    for i in range(1,300, 25):
        random_state += 1
        forest = RandomForestClassifier(n_estimators = i, random_state=random_state)
        forest.fit(X_train, y_train)
        forest_pred = forest.predict(X_test)
        score = np.round(accuracy_score(y_test, forest_pred), 4)
        key_results.append(score)
        #print('Previous ' + key + ' matches - ' + str(i) + ' estimators accuracy: ' + str(np.round(score, 4)))
    random_forest_results[key] = key_results

In [86]:
## MAKE VISUALIZATION
random_forest_results = random_forest_results.set_index('estimators')
random_forest_results
random_forest_results['estimator_average'] = random_forest_results.mean(axis=1)
prior_match_average = random_forest_results.mean(axis=0)
print(prior_match_average)
random_forest_results
# plt.figure(figsize=(30,20))
# for column in random_forest_results:
#     plt.plot(random_forest_results.index, random_forest_results[column], label = column)
# plt.legend()
# plt.title("Visualization..CHANGE NAME")
# plt.xlabel("n_estimators")
# plt.ylabel("Prediction Accuracy")

1                    0.550608
2                    0.548683
3                    0.549633
4                    0.553325
5                    0.591225
6                    0.565567
7                    0.558733
8                    0.562883
9                    0.585525
10                   0.594925
estimator_average    1.706060
dtype: float64


Unnamed: 0_level_0,1,2,3,4,5,6,7,8,9,10,estimator_average
estimators,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,0.5607,0.5125,0.5054,0.5127,0.5401,0.4926,0.5129,0.4684,0.5543,0.5338,0.523312
26,0.5286,0.5412,0.5451,0.5382,0.5839,0.5588,0.5498,0.5613,0.5655,0.594,0.766916
51,0.5786,0.5269,0.556,0.5527,0.5876,0.5993,0.5461,0.5576,0.5993,0.6241,0.989574
76,0.525,0.5269,0.5668,0.5345,0.5657,0.5772,0.5572,0.5948,0.5993,0.5865,1.186833
101,0.5536,0.5663,0.5451,0.5455,0.6022,0.5588,0.5904,0.5651,0.5805,0.6203,1.402757
126,0.5286,0.5699,0.5451,0.5382,0.5876,0.5809,0.5609,0.5576,0.5993,0.5865,1.602109
151,0.5643,0.5376,0.5523,0.5745,0.6131,0.5772,0.5609,0.5651,0.603,0.6278,1.82074
176,0.5786,0.5663,0.5596,0.5709,0.6022,0.5515,0.5498,0.5948,0.5955,0.5714,2.023861
201,0.5321,0.5341,0.5343,0.5418,0.6131,0.5625,0.5683,0.5576,0.5843,0.6015,2.219464
226,0.5536,0.5448,0.5776,0.5745,0.5985,0.5699,0.5498,0.5799,0.573,0.5902,2.434228


In [None]:
#Piror 8 or 9 matches
#226 - 276 n_estimators

## Gradient Boosting Tree

In [96]:
from sklearn.ensemble import GradientBoostingClassifier
gradient_results = pd.DataFrame({'estimators': list(range(100, 110))})
for key in predictions_data:
    print(key)
    data = predictions_data[key]
    data = data.replace([np.inf, -np.inf, np.nan], 0)
    print(data.tail())
    X = data.drop(columns=['home_winner', 'match_id', 'round'])
    y = data['home_winner']
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    key_results = []
#     for i in range(100,110):
#         gradient = GradientBoostingClassifier(n_estimators = i, random_state=42)
#         gradient.fit(X_train, y_train)
#         gradient_pred = gradient.predict(X_test)
#         score = np.round(accuracy_score(y_test, gradient_pred), 4)
#         key_results.append(score)
#         #print('Previous ' + key + ' matches - ' + str(i) + ' estimators accuracy: ' + str(np.round(score, 4)))
#     gradient_results[key] = key_results
gradient_results = gradient_results.set_index('estimators')
gradient_results

1
      a_is_winner  a_line_breaks  a_line_breaks_allowed   a_points  \
1402     0.576923       2.653846               2.653846  19.461538   
1403     0.518519       3.592593               3.592593  20.962963   
1404     0.607143       4.785714               4.785714  23.571429   
1405     0.518519       3.592593               3.592593  20.962963   
1406     0.607143       4.785714               4.785714  23.571429   

      a_points_allowed  a_tackles_by_opposition  a_tackles_made  \
1402         19.461538               326.153846      326.153846   
1403         20.962963               325.370370      325.370370   
1404         23.571429               309.500000      309.500000   
1405         20.962963               325.370370      325.370370   
1406         23.571429               309.500000      309.500000   

      a_total_run_metres  a_total_run_metres_allowed  h_is_winner  ...  \
1402         1524.807692                 1524.807692     0.607143  ...   
1403         1521.333333  

      a_is_winner  a_line_breaks  a_line_breaks_allowed   a_points  \
1402     0.576923       2.653846               2.653846  19.461538   
1403     0.518519       3.592593               3.592593  20.962963   
1404     0.607143       4.785714               4.785714  23.571429   
1405     0.518519       3.592593               3.592593  20.962963   
1406     0.607143       4.785714               4.785714  23.571429   

      a_points_allowed  a_tackles_by_opposition  a_tackles_made  \
1402         19.461538               326.153846      326.153846   
1403         20.962963               325.370370      325.370370   
1404         23.571429               309.500000      309.500000   
1405         20.962963               325.370370      325.370370   
1406         23.571429               309.500000      309.500000   

      a_total_run_metres  a_total_run_metres_allowed  h_is_winner  ...  \
1402         1524.807692                 1524.807692     0.607143  ...   
1403         1521.333333    

100
101
102
103
104
105
106
107
108
109


## Updating Historical Data...how many games back do we go?

In [86]:
#Piror 8 or 9 matches
#226 - 276 n_estimators

from sklearn.ensemble import RandomForestClassifier
random_forest_results = pd.DataFrame({'estimators': list(range(226, 275))})
random_state = 0
for key in ['8', '9']:
    data = predictions_data[key]
    data = data.replace([np.inf, -np.inf, np.nan], 0)
    X = data.drop(columns=['home_winner', 'match_id', 'round'])
    y = data['home_winner']
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    key_results = []
    random_state = 0
    for i in range(226, 275):
        random_state += 1
        forest = RandomForestClassifier(n_estimators = i, random_state=random_state)
        forest.fit(X_train, y_train)
        forest_pred = forest.predict(X_test)
        score = np.round(accuracy_score(y_test, forest_pred), 4)
        key_results.append(score)
        #print('Previous ' + key + ' matches - ' + str(i) + ' estimators accuracy: ' + str(np.round(score, 4)))
    random_forest_results[key] = key_results

## Adding in betting data

## Adding lineup changes as a feature + dropping other features

In [44]:
query = pd.read_sql_query('''
    SELECT * 
    FROM PlayerMatchStats
''', mydb)
player_stat_df = pd.DataFrame(query)
player_stat_df.head()
#match_stat_df = player_stat_df.groupby(['match_id','team_id']).sum().reset_index()
#match_stat_df = match_stat_df.drop(columns=['id', 'player_id', 'position_id', 'minutes_played'])

Unnamed: 0,id,match_id,player_id,team_id,position_id,minutes_played,points,tries,conversions,conversion_attempts,...,kicked_dead,errors,handling_errors,one_on_ones_lost,penalties,on_report,sin_bins,send_offs,stint_one,stint_two
0,1,1,385,14,1,80.0,0,0,0,0,...,0,0,0,0,1,0,0,0,80.0,
1,2,1,481,14,2,80.0,0,0,0,0,...,0,1,1,0,0,0,0,0,80.0,
2,3,1,378,14,3,80.0,4,1,0,0,...,0,0,0,0,0,0,0,0,80.0,
3,4,1,388,14,3,80.0,0,0,0,0,...,0,3,2,0,0,0,0,0,80.0,
4,5,1,482,14,2,80.0,4,1,0,0,...,0,1,1,0,0,0,0,0,80.0,


In [65]:
for match in Matches.all_matches:
    match_id = Matches.all_matches[match].id
    home_id = Matches.all_matches[match].home.id
    home_df = player_stat_df[(player_stat_df['match_id'] == match_id) & (player_stat_df['team_id'] == home_id)]
    home_roster = list(home_df['player_id'])
    
    away_id = Matches.all_matches[match].away.id
    away_df = player_stat_df[(player_stat_df['match_id'] == match_id) & (player_stat_df['team_id'] == away_id)]
    away_roster = list(away_df['player_id'])
    

[385, 481, 378, 388, 482, 379, 376, 396, 391, 404, 407, 406, 483, 402, 484, 1626, 397]
[209, 212, 200, 213, 211, 208, 210, 192, 201, 503, 196, 188, 197, 504, 505, 194, 190]
[244, 254, 252, 250, 517, 258, 519, 272, 265, 268, 264, 520, 266, 260, 276, 267, 261]
[521, 458, 461, 522, 523, 457, 464, 453, 524, 448, 441, 452, 443, 440, 439, 449, 525]
[346, 527, 351, 352, 528, 349, 345, 369, 361, 366, 364, 529, 367, 360, 363, 368, 530]
[490, 491, 492, 84, 82, 493, 80, 494, 68, 495, 67, 70, 64, 61, 62, 538, 66]
[319, 323, 326, 543, 316, 320, 324, 330, 544, 332, 329, 545, 546, 335, 342, 547, 336]
[117, 112, 115, 554, 118, 555, 113, 127, 556, 132, 131, 557, 126, 134, 133, 558, 1639]
[91, 97, 87, 471, 472, 94, 89, 103, 473, 474, 475, 476, 477, 478, 479, 104, 480]
[432, 431, 486, 433, 487, 435, 430, 414, 413, 421, 409, 488, 412, 489, 424, 423, 417]
[22, 20, 27, 497, 26, 19, 498, 11, 3, 500, 2, 10, 501, 499, 9, 502, 566]
[531, 238, 532, 533, 241, 236, 534, 535, 225, 220, 224, 234, 231, 536, 230, 537,

[432, 431, 486, 433, 487, 435, 430, 414, 413, 421, 409, 412, 423, 488, 424, 417, 416]
[84, 491, 492, 79, 83, 81, 493, 62, 68, 64, 67, 72, 61, 77, 538, 69, 63]
[209, 591, 200, 622, 212, 1638, 210, 195, 191, 190, 196, 197, 188, 192, 194, 505, 628]
[346, 348, 530, 351, 344, 349, 345, 369, 361, 363, 364, 529, 367, 360, 588, 368, 347]
[385, 383, 378, 388, 482, 379, 376, 396, 391, 404, 407, 406, 483, 397, 484, 1626, 402]
[280, 279, 285, 548, 281, 550, 286, 614, 304, 306, 299, 305, 300, 283, 296, 302, 293]
[432, 431, 486, 433, 487, 488, 430, 414, 413, 421, 409, 412, 423, 489, 424, 417, 627]
[472, 91, 576, 471, 96, 94, 89, 103, 473, 474, 582, 476, 477, 605, 479, 102, 475]
[543, 323, 326, 318, 316, 320, 324, 332, 338, 330, 342, 545, 329, 325, 546, 547, 335]
[258, 254, 248, 620, 252, 249, 519, 272, 265, 268, 264, 260, 266, 244, 267, 261, 276]
[531, 238, 239, 533, 241, 236, 534, 535, 225, 221, 224, 220, 590, 227, 230, 222, 532]
[346, 347, 530, 351, 527, 349, 345, 369, 361, 363, 364, 529, 367, 360

[208, 218, 219, 215, 212, 216, 205, 191, 193, 196, 200, 194, 192, 195, 198, 188, 217]
[94, 97, 88, 92, 91, 95, 89, 102, 473, 474, 108, 478, 477, 93, 475, 616, 107]
[319, 323, 318, 1647, 314, 321, 320, 330, 1644, 329, 342, 545, 580, 327, 332, 335, 322]
[208, 218, 219, 215, 212, 216, 210, 205, 191, 195, 194, 200, 192, 213, 206, 190, 203]
[344, 351, 1646, 347, 348, 349, 345, 368, 361, 373, 364, 360, 362, 365, 1656, 374, 370]
[19, 28, 1652, 1653, 22, 6, 11, 3, 14, 2, 500, 18, 7, 9, 4, 12, 26]
[178, 185, 187, 179, 175, 174, 164, 159, 169, 162, 168, 161, 158, 160, 165, 182, 180]
[433, 431, 486, 428, 437, 435, 430, 414, 413, 421, 418, 417, 423, 412, 408, 422, 419]
[458, 468, 466, 461, 459, 457, 464, 569, 451, 439, 454, 441, 448, 445, 456, 443, 449]
[385, 382, 378, 388, 386, 1630, 397, 391, 390, 395, 406, 402, 400, 405, 376, 404, 407]
[19, 20, 28, 26, 22, 6, 11, 3, 14, 2, 500, 18, 7, 9, 4, 12, 1654]
[319, 323, 318, 1647, 314, 320, 324, 330, 1644, 329, 342, 327, 545, 580, 321, 332, 335]
[531, 5

[568, 185, 187, 589, 179, 539, 1806, 541, 159, 169, 162, 168, 161, 165, 2131, 160, 1994]
[491, 79, 492, 2058, 82, 1757, 490, 62, 68, 1970, 67, 61, 538, 1969, 64, 70, 493]
[2108, 591, 1858, 2101, 211, 504, 2102, 190, 191, 2103, 200, 2126, 197, 2104, 199, 603, 193]
[22, 20, 27, 2054, 497, 19, 498, 2056, 3, 2055, 2, 10, 501, 502, 499, 500, 2364]
[151, 145, 143, 507, 508, 2092, 142, 1713, 153, 509, 515, 155, 512, 513, 570, 2140, 571]
[40, 45, 41, 43, 42, 560, 33, 564, 55, 2078, 56, 563, 54, 2079, 58, 50, 565]
[482, 481, 378, 388, 484, 379, 2067, 2068, 391, 404, 407, 2069, 483, 406, 1642, 578, 2361]
[1966, 326, 318, 543, 316, 2071, 324, 330, 544, 335, 545, 329, 546, 2073, 332, 2074, 2070]
[280, 2030, 285, 549, 284, 293, 550, 553, 297, 306, 298, 305, 300, 552, 302, 294, 614]
[40, 45, 41, 43, 42, 560, 33, 564, 55, 2078, 56, 563, 54, 2079, 58, 565, 50]
[346, 530, 528, 2059, 1901, 364, 349, 369, 361, 363, 365, 529, 367, 621, 588, 2061, 360]
[280, 2030, 285, 549, 284, 293, 550, 553, 297, 306, 29

[151, 506, 143, 507, 508, 2092, 142, 2140, 153, 510, 515, 155, 512, 513, 571, 1713, 509]
[568, 185, 187, 177, 179, 175, 1806, 164, 159, 1994, 162, 1808, 168, 169, 1813, 2393, 2381]
[211, 591, 1858, 2101, 212, 2391, 2102, 193, 191, 195, 196, 194, 197, 2126, 2394, 199, 2383]
[385, 481, 1990, 388, 482, 379, 484, 406, 391, 404, 407, 2069, 483, 1642, 578, 390, 2067]
[1920, 459, 461, 522, 523, 521, 464, 1886, 2065, 610, 2115, 569, 448, 439, 599, 2392, 2395]
[2089, 486, 488, 417, 487, 435, 1677, 414, 413, 489, 409, 1679, 412, 2090, 421, 2091, 431]
[568, 185, 187, 177, 179, 175, 1806, 164, 159, 1994, 162, 1808, 161, 168, 169, 541, 1813]
[151, 506, 143, 507, 508, 2092, 142, 2140, 153, 510, 515, 155, 512, 513, 571, 1713, 509]
[472, 97, 2051, 471, 2050, 2052, 89, 103, 473, 474, 582, 476, 477, 475, 1836, 2114, 479]
[498, 20, 27, 2054, 497, 19, 2117, 499, 1704, 2055, 2, 10, 501, 2364, 2057, 9, 2056]
[287, 2030, 285, 293, 549, 550, 286, 294, 551, 306, 298, 575, 300, 552, 553, 297, 302]
[568, 185, 18

KeyboardInterrupt: 

In [137]:
results_df.head()
prediction_percent = len(results_df[results_df['correct_prediction'] == True]) / results_df.shape[0]
betting_percent = len(results_df[results_df['correct_odds'] == True]) / results_df.shape[0]                                   
print('prediction_percentage = ' + str(prediction_percent))
print('betting_percentage = ' + str(betting_percent))

prediction_percentage = 0.58
betting_percentage = 0.62


In [73]:
mycursor.close()

True