# Predicting Match Outcomes

In [2]:
#SQL Imports
import mysql.connector
#Pandas imports
import pandas as pd
from datetime import datetime as dt

In [3]:
mydb = mysql.connector.connect(
  host="localhost",
  user="root",
  passwd="NYg1@nts",
  database="NRL_data"
)
mycursor = mydb.cursor()

## Create match_stat_df of aggregated player stats for each match

In [4]:
query = pd.read_sql_query('''
    SELECT * 
    FROM PlayerMatchStats
''', mydb)
player_stat_df = pd.DataFrame(query)
match_stat_df = player_stat_df.groupby(['match_id','team_id']).sum().reset_index()
match_stat_df = match_stat_df.drop(columns=['id', 'player_id', 'position_id', 'minutes_played'])
match_stat_df.columns

Index(['match_id', 'team_id', 'points', 'tries', 'conversions',
       'conversion_attempts', 'penalty_goals', 'field_goals', 'total_runs',
       'total_run_metres', 'kick_return_metres', 'post_contact_metres',
       'line_breaks', 'line_break_assists', 'try_assists', 'line_engaged_runs',
       'tackle_breaks', 'hit_ups', 'play_the_ball',
       'average_play_ball_seconds', 'dummy_half_runs', 'dummy_half_run_metres',
       'steals', 'offloads', 'dummy_passes', 'passes', 'receipts',
       'tackles_made', 'tackles_missed', 'ineffective_tackles', 'intercepts',
       'kicks_defused', 'kicks', 'kicking_metres', 'forced_drop_outs',
       'bomb_kicks', 'grubbers', 'fourty_twenty', 'cross_field_kicks',
       'kicked_dead', 'errors', 'handling_errors', 'one_on_ones_lost',
       'penalties', 'on_report', 'sin_bins', 'send_offs'],
      dtype='object')

In [5]:
match_stat_df = match_stat_df.drop(columns=['average_play_ball_seconds'])
def get_averages(df):
    df['metres_per_kick'] = df['kicking_metres'] / df['kicks']
    df['metres_per_run'] = df['total_run_metres'] / df['total_runs']
    df['metres_ran_per_try'] = df['total_run_metres'] / df['tries']
    df['tackle_percentage'] = df['tackles_made'] / (df['tackles_made'] + df['tackles_missed'] + df['ineffective_tackles'])
    return df
match_stat_df = get_averages(match_stat_df)
match_stat_df.head()

Unnamed: 0,match_id,team_id,points,tries,conversions,conversion_attempts,penalty_goals,field_goals,total_runs,total_run_metres,...,handling_errors,one_on_ones_lost,penalties,on_report,sin_bins,send_offs,metres_per_kick,metres_per_run,metres_ran_per_try,tackle_percentage
0,1,1,12,2,1,2,1,0,130,1076,...,11,0,9,1,0,0,27.285714,8.276923,538.0,0.871465
1,1,14,34,5,4,5,3,0,168,1618,...,9,2,7,0,0,0,28.117647,9.630952,323.6,0.88806
2,2,6,18,3,2,3,1,0,178,1648,...,10,0,11,0,0,0,21.846154,9.258427,549.333333,0.88835
3,2,8,19,3,3,3,0,1,192,1425,...,6,0,9,0,0,0,25.619048,7.421875,475.0,0.846535
4,3,4,14,2,2,2,1,0,134,1144,...,12,0,10,0,0,0,20.928571,8.537313,572.0,0.865952


## Create match_df of info not related to player stats

In [6]:
match_query = pd.read_sql_query('''
    SELECT id, date, home_team_id, home_score, away_team_id, away_score, winner, home_odds, away_odds, draw_odds
    FROM Matches
''', mydb)
match_df = pd.DataFrame(match_query).rename(columns={'id': 'match_id'})
match_df.head()

Unnamed: 0,match_id,date,home_team_id,home_score,away_team_id,away_score,winner,home_odds,away_odds,draw_odds
0,1,2018-03-08,14,34,1,12,14,0.5882,0.4386,0.0488
1,2,2018-03-09,8,19,6,18,8,0.5181,0.5128,0.0501
2,3,2018-03-09,10,20,4,14,10,0.5988,0.4329,0.0482
3,4,2018-03-10,16,10,15,8,15,0.25,0.7874,0.0369
4,5,2018-03-10,13,20,9,32,9,0.6803,0.3509,0.0476


# Create class for each team

In [7]:
class Teams:
    all_teams = {}
    
    def __init__(self, name, id):
        self.name = name
        self.id = id
        self.match_stats = match_stat_df[match_stat_df['team_id'] == self.id]
        Teams.all_teams[self.name] = self
    
    def get_stats_most_recent_matches(self, date, x=1):
        date_df = self.match_stats[self.match_stats['date'] < date].sort_values('date', ascending=False)
        averages_df = date_df.head(x).sum() / x
        return averages_df
        
    
    def get_season_averages():
        pass

team_names_and_ids = pd.DataFrame(pd.read_sql_query('SELECT id, nickname FROM Teams', mydb)).set_index('nickname').to_dict()['id']
for name in team_names_and_ids.keys():
     Teams(name, team_names_and_ids[name])
Teams.all_teams['Roosters'].name

'Roosters'

# Create class for each match

In [19]:
class Matches:
    all_matches = {}
    
    def __init__(self, match_id, date, home_team, away_team, winner):
        self.id = match_id
        self.date = date
        self.home = home_team
        self.away = away_team
        if home_team.id == winner:
            self.winner = home_team
            self.home_winner = True
        else:
            self.winner = away_team
            self.home_winner = False
        self.stats = self.get_match_stats()
        self.identifier = str(self.date) + '_' + str(self.id) + '_' + self.home.name + '-v-' + self.away.name
        Matches.all_matches[self.identifier] = self
        #Matches.all_matches.append(self)
    
    def get_match_stats(self):
        match_stats = pd.DataFrame()
        home_stats = self.home.match_stats[self.home.match_stats['match_id'] == self.id]
        away_stats = self.away.match_stats[self.away.match_stats['match_id'] == self.id]
        for column in list(home_stats.columns)[2:]:
            match_stats['h_' + column] = [home_stats[column].values[0]]
            match_stats['a_' + column] = [away_stats[column].values[0]]
            match_stats['diff_' + column] = match_stats['h_' + column] - match_stats['a_' + column]
        return match_stats
    
    def predict_winner():
        pass

In [20]:
def create_match_objects(match):
    for key, value in Teams.all_teams.items():
        if match['home_team_id'] == Teams.all_teams[key].id:
            home_team = Teams.all_teams[key]
        if match['away_team_id'] == Teams.all_teams[key].id:
            away_team = Teams.all_teams[key]
    Matches(match['match_id'], match['date'], home_team, away_team, match['winner'])
    
match_df.apply(lambda x: create_match_objects(x), axis = 1)
Matches.all_matches['2018-03-09_2_Knights-v-Sea Eagles'].home_winner

True

In [21]:
match = Matches.all_matches['2018-03-09_2_Knights-v-Sea Eagles']
print(match.home.name)
print(match.away.name)


match.stats

Knights
Sea Eagles


Unnamed: 0,h_points,a_points,diff_points,h_tries,a_tries,diff_tries,h_conversions,a_conversions,diff_conversions,h_conversion_attempts,...,diff_metres_per_kick,h_metres_per_run,a_metres_per_run,diff_metres_per_run,h_metres_ran_per_try,a_metres_ran_per_try,diff_metres_ran_per_try,h_tackle_percentage,a_tackle_percentage,diff_tackle_percentage
0,19,18,1,3,3,0,3,2,1,3,...,3.772894,7.421875,9.258427,-1.836552,475.0,549.333333,-74.333333,0.846535,0.88835,-0.041815


## Find stats highly correlated with a home win

In [28]:
correlation_df = pd.DataFrame()
for key in Matches.all_matches:
    match = Matches.all_matches[key]
    match_row = match.stats
    match_row['id'] = match.id
    match_row['h_win'] = match.home_winner
    correlation_df = correlation_df.append(match_row)
correlation_df = correlation_df.set_index('id')
correlation_df['h_win'] = correlation_df['h_win'].replace({True: 1, False: 0})
correlation_df.head()

Unnamed: 0_level_0,h_points,a_points,diff_points,h_tries,a_tries,diff_tries,h_conversions,a_conversions,diff_conversions,h_conversion_attempts,...,h_metres_per_run,a_metres_per_run,diff_metres_per_run,h_metres_ran_per_try,a_metres_ran_per_try,diff_metres_ran_per_try,h_tackle_percentage,a_tackle_percentage,diff_tackle_percentage,h_win
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,34,12,22,5,2,3,4,1,3,5,...,9.630952,8.276923,1.354029,323.6,538.0,-214.4,0.88806,0.871465,0.016594,1
2,19,18,1,3,3,0,3,2,1,3,...,7.421875,9.258427,-1.836552,475.0,549.333333,-74.333333,0.846535,0.88835,-0.041815,1
3,20,14,6,4,2,2,1,2,-1,4,...,8.169697,8.537313,-0.367616,337.0,572.0,-235.0,0.877419,0.865952,0.011468,1
4,10,8,2,1,1,0,1,1,0,1,...,8.839779,8.296512,0.543267,1600.0,1427.0,173.0,0.898667,0.860406,0.038261,0
5,20,32,-12,4,6,-2,2,3,-1,4,...,9.754717,9.754601,0.000116,387.75,265.0,122.75,0.864,0.855385,0.008615,0


## Feature Selection

In [31]:
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import f_classif
from sklearn.feature_selection import SelectKBest

In [45]:
for column in list(correlation_df.columns):
    if correlation_df[column].isnull().any():
        print(column)
    else:
        print(column + ' is good!')
#correlation_df.isnull().any()

h_points is good!
a_points is good!
diff_points is good!
h_tries is good!
a_tries is good!
diff_tries is good!
h_conversions is good!
a_conversions is good!
diff_conversions is good!
h_conversion_attempts is good!
a_conversion_attempts is good!
diff_conversion_attempts is good!
h_penalty_goals is good!
a_penalty_goals is good!
diff_penalty_goals is good!
h_field_goals is good!
a_field_goals is good!
diff_field_goals is good!
h_total_runs is good!
a_total_runs is good!
diff_total_runs is good!
h_total_run_metres is good!
a_total_run_metres is good!
diff_total_run_metres is good!
h_kick_return_metres is good!
a_kick_return_metres is good!
diff_kick_return_metres is good!
h_post_contact_metres is good!
a_post_contact_metres is good!
diff_post_contact_metres is good!
h_line_breaks is good!
a_line_breaks is good!
diff_line_breaks is good!
h_line_break_assists is good!
a_line_break_assists is good!
diff_line_break_assists is good!
h_try_assists is good!
a_try_assists is good!
diff_try_assist

In [40]:
X = correlation_df.iloc[:, :-1]
Y = correlation_df['h_win']

In [41]:
test = SelectKBest(score_func=f_classif, k=15)
fit = test.fit(X, Y)

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


ValueError: Input contains NaN, infinity or a value too large for dtype('float64').

In [29]:
for column in list(correlation_df.columns):
    lr = LogisticRegression()
    print(column)

h_points
a_points
diff_points
h_tries
a_tries
diff_tries
h_conversions
a_conversions
diff_conversions
h_conversion_attempts
a_conversion_attempts
diff_conversion_attempts
h_penalty_goals
a_penalty_goals
diff_penalty_goals
h_field_goals
a_field_goals
diff_field_goals
h_total_runs
a_total_runs
diff_total_runs
h_total_run_metres
a_total_run_metres
diff_total_run_metres
h_kick_return_metres
a_kick_return_metres
diff_kick_return_metres
h_post_contact_metres
a_post_contact_metres
diff_post_contact_metres
h_line_breaks
a_line_breaks
diff_line_breaks
h_line_break_assists
a_line_break_assists
diff_line_break_assists
h_try_assists
a_try_assists
diff_try_assists
h_line_engaged_runs
a_line_engaged_runs
diff_line_engaged_runs
h_tackle_breaks
a_tackle_breaks
diff_tackle_breaks
h_hit_ups
a_hit_ups
diff_hit_ups
h_play_the_ball
a_play_the_ball
diff_play_the_ball
h_dummy_half_runs
a_dummy_half_runs
diff_dummy_half_runs
h_dummy_half_run_metres
a_dummy_half_run_metres
diff_dummy_half_run_metres
h_steals
a

In [None]:
import seaborn as sns
corr = correlation_df.corr()
corr['is_winner']
stats = corr[(corr['is_winner'] > .2) | (corr['is_winner'] < -.2)]
stats['is_winner']

In [70]:
def find_home_team(match_df, x):
    match = match_df[match_df['match_id'] == x['match_id']]
    #print(match['home_team_id'].values)
    if match['home_team_id'].values[0] == x['team_id']:
        return 1
    else:
        return 0
#match_stat_df['is_home'] = match_stat_df.apply(lambda x: find_home_team(match_df, x), axis=1)
#match_stat_df = pd.merge(match_stat_df, match_df[['match_id', 'date']], on='match_id')
#match_stat_df.head()

In [199]:
def add_home_away_stats_to_matches(match):
    match_df = match_stat_df[match_stat_df['match_id'] == match['id']]
    h = match_df.loc[match_df['is_home'] == 1]
    a = match_df.loc[match_df['is_home'] == 0]
    dict_h_a = {'h': h, 'a': a}
    for column in list(match_df.columns)[2:]:
        for key in dict_h_a:
            col_name = key + '_' + column
            match[col_name] = dict_h_a[key][column].values[0]
    return match

match_df = match_df.apply(lambda x: add_home_away_stats_to_matches(x), axis=1)
match_df.head()
    

Unnamed: 0,id,date,home_team_id,home_score,away_team_id,away_score,winner,home_odds,away_odds,draw_odds,...,h_tackle_percentage,a_tackle_percentage,h_metres_per_kick,a_metres_per_kick,h_metres_per_run,a_metres_per_run,h_metres_ran_per_try,a_metres_ran_per_try,h_is_home,a_is_home
0,1,2018-03-08,14,34,1,12,14,0.5882,0.4386,0.0488,...,0.88806,0.871465,28.117647,27.285714,9.630952,8.276923,323.6,538.0,1,0
1,2,2018-03-09,8,19,6,18,8,0.5181,0.5128,0.0501,...,0.846535,0.88835,25.619048,21.846154,7.421875,9.258427,475.0,549.333333,1,0
2,3,2018-03-09,10,20,4,14,10,0.5988,0.4329,0.0482,...,0.877419,0.865952,19.384615,20.928571,8.169697,8.537313,337.0,572.0,1,0
3,4,2018-03-10,16,10,15,8,15,0.25,0.7874,0.0369,...,0.898667,0.860406,24.521739,28.684211,8.839779,8.296512,1600.0,1427.0,1,0
4,5,2018-03-10,13,20,9,32,9,0.6803,0.3509,0.0476,...,0.864,0.855385,25.777778,25.4375,9.754717,9.754601,387.75,265.0,1,0


In [201]:
#match_df.dtypes
match_df = match_df.sort_values('date').reset_index(drop=True)
match_df['home_is_winner'] = match_df.apply(lambda x: 1 if x['home_team_id'] == x['winner'] else 0, axis=1)
match_df = match_df.rename(columns=({'home_odds': 'h_odds', 'away_odds': 'a_odds'}))
match_df = match_df.astype({'h_odds': 'float64', 'a_odds': 'float64', 'draw_odds': 'float64'})
match_df.head()

Unnamed: 0,id,date,home_team_id,home_score,away_team_id,away_score,winner,h_odds,a_odds,draw_odds,...,a_tackle_percentage,h_metres_per_kick,a_metres_per_kick,h_metres_per_run,a_metres_per_run,h_metres_ran_per_try,a_metres_ran_per_try,h_is_home,a_is_home,home_is_winner
0,1275,2013-03-07,15,10,13,28,13,0.4292,0.6211,0.055,...,0.921233,34.3125,26.347826,8.952756,8.94152,568.5,305.8,1,0,0
1,1276,2013-03-08,1,14,6,22,6,0.6329,0.4098,0.0577,...,0.941341,43.125,23.888889,8.496894,9.658537,684.0,396.0,1,0,0
2,1278,2013-03-09,3,12,10,24,10,0.4785,0.5618,0.0566,...,0.937695,21.95,24.736842,8.73913,9.723404,603.0,342.75,1,0,0
3,1277,2013-03-09,11,40,9,10,11,0.5618,0.4831,0.0561,...,0.889222,39.214286,29.142857,10.209459,8.698413,215.857143,548.0,1,0,1
4,1280,2013-03-10,7,30,14,10,7,0.813,0.2381,0.0458,...,0.924012,28.941176,29.368421,10.596154,9.085714,330.6,636.0,1,0,1


In [None]:

def opponent_columns(row, data):
    match_df = data[data['match_id'] == row['match_id']]
    opposition = match_df[match_df['team_id'] != row['team_id']]
    return opposition['team_id']
match_stat_df['opponent_id'] = match_stat_df.apply(lambda x: opponent_columns(x, match_stat_df))

# Prediction Algorithm

Things to try:
1. Want to create scoring function (sklearn.metrics import f1_score, make_scorer, classification_report)

2. Random Forest
3. Neural Network (10-3-2)

Features to try:
- home field advantage
- form in last X matches
- Points differential last X matches
- Points scored last X matches
- Points allowed last X matches
- Running metre differential last X matches
- Tackle Percentages
- Factor in the strength of opposition


In [6]:
correlation_df = match_stat_df
col = list(match_stat_df.columns)[35:52]
correlation_df = match_stat_df.drop(col, axis=1)
correlation_df.head()

Unnamed: 0,id,match_id,team_id,opponent_id,is_winner,points,tries,conversions,penalty_goals,conversion_percentage,...,ineffective_tackles,tackle_percentage,kicks,kicking_metres,metres_per_kick,errors,handling_errors,penalties,sin_bins,send_offs
0,1,1,14,1,1,34,5,4,3,0.8,...,14,0.8881,17,478,28.1176,10,9,7,0,0
1,2,1,1,14,0,12,2,1,1,0.5,...,15,0.8715,14,382,27.2857,13,11,9,0,0
2,3,2,8,6,1,19,3,3,0,1.0,...,27,0.8465,21,538,25.619,6,6,9,0,0
3,4,2,6,8,0,18,3,2,1,0.6667,...,5,0.8883,13,284,21.8462,16,10,11,0,0
4,5,3,10,4,1,20,4,1,1,0.25,...,19,0.8774,13,252,19.3846,15,10,6,0,0


In [7]:
correlation_df = correlation_df.drop(columns=['line_engaged_runs'])

In [8]:
import seaborn as sns
corr = correlation_df.corr()
corr['is_winner']
stats = corr[(corr['is_winner'] > .2) | (corr['is_winner'] < -.2)]
stats['is_winner']
#corr.style.background_gradient(cmap='coolwarm')

is_winner              1.000000
points                 0.616395
tries                  0.563085
conversions            0.495853
penalty_goals          0.331492
field_goals            0.213446
runs                   0.291058
run_metres             0.387814
kick_return_metres     0.257286
post_contact_metres    0.253308
line_breaks            0.372330
tackle_breaks          0.291125
hit_ups                0.295609
tackles               -0.330251
missed_tackles        -0.290167
kicks                  0.236040
kicking_metres         0.247338
errors                -0.236594
handling_errors       -0.203090
Name: is_winner, dtype: float64

Columns to take into consideration
From highly correlated stats:

- points (allowed)
- tries (allowed)
- penalty_goals
- run_metres (allowed)
- runs
- runs_per_metre (allowed)
- line_breaks (allowed)
- tackle_breaks
- tackles (allowed)
- missed_tackles
- kicking_metres
- errors

In [9]:
stat_columns = ['id', 'match_id', 'team_id', 'opponent_id', 'is_winner', 'points', 'tries', 'penalty_goals',
                     'run_metres', 'runs', 'line_breaks', 'tackle_breaks', 'tackles', 'missed_tackles',
                     'kicking_metres', 'errors']
results_df = match_stat_df[stat_columns]
results_df['metres_per_run'] = results_df['run_metres'] / results_df['runs']
results_df.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """


Unnamed: 0,id,match_id,team_id,opponent_id,is_winner,points,tries,penalty_goals,run_metres,runs,line_breaks,tackle_breaks,tackles,missed_tackles,kicking_metres,errors,metres_per_run
0,1,1,14,1,1,34,5,3,1618,168,5,35,238,16,478,10,9.630952
1,2,1,1,14,0,12,2,1,1076,130,2,16,339,35,382,13,8.276923
2,3,2,8,6,1,19,3,0,1425,192,3,41,342,35,538,6,7.421875
3,4,2,6,8,0,18,3,1,1648,178,4,35,366,41,284,16,9.258427
4,5,3,10,4,1,20,4,1,1348,165,3,37,272,19,252,15,8.169697


columns to create

- points (allowed)
- tries (allowed)
- penalty_goals
- run_metres (allowed)
- runs
- runs_per_metre (allowed)
- line_breaks (allowed)
- tackle_breaks
- tackles (allowed)
- missed_tackles
- kicking_metres
- errors

- tackle_percentage
- metres_per_run

- point_differential
- run_metre differential
- 

- home field advantage
- form in last X matches
- Points differential last X matches
- Points scored last X matches
- Points allowed last X matches
- Running metre differential last X matches
- line_breaks last X matches
- errors last X matches
- Tackle Percentages
- Strength of opposition metric

In [10]:
columns = [*results_df.columns.tolist(),'opp_points', 'opp_tries', 'opp_run_metres',
           'opp_metres_per_run', 'opp_line_breaks', 'opp_tackles', 'point_diff', 'run_metre_diff']
results_df = results_df.reindex(columns=columns, fill_value=0)
#print(predictions_df)

for match in list(results_df['match_id'].unique()):
    match_df = results_df[results_df['match_id'] == match]
    #print(match_df)
    for team in list(match_df['team_id']):
        team_data = match_df[match_df['team_id'] == team]
        opp_data = match_df[match_df['team_id'] != team]
        
        opp_points = opp_data['points'].iloc[0]
        opp_tries = opp_data['tries'].iloc[0]
        opp_run_metres = opp_data['run_metres'].iloc[0]
        opp_metres_per_run = opp_data['metres_per_run'].iloc[0]
        opp_line_breaks = opp_data['line_breaks'].iloc[0]
        opp_tackles = opp_data['tackles'].iloc[0]
        
        point_diff = team_data['points'].iloc[0] - opp_points
        run_metre_diff = team_data['run_metres'].iloc[0] - opp_run_metres
        
        results_df.loc[team_data.index, 'opp_points'] = opp_points
        results_df.loc[team_data.index, 'opp_tries'] = opp_tries
        results_df.loc[team_data.index, 'opp_run_metres'] = opp_run_metres
        results_df.loc[team_data.index, 'opp_metres_per_run'] = opp_metres_per_run
        results_df.loc[team_data.index, 'opp_line_breaks'] = opp_line_breaks
        results_df.loc[team_data.index, 'opp_tackles'] = opp_tackles
        results_df.loc[team_data.index, 'point_diff'] = point_diff
        results_df.loc[team_data.index, 'run_metre_diff'] = run_metre_diff
          
results_df.head()

Unnamed: 0,id,match_id,team_id,opponent_id,is_winner,points,tries,penalty_goals,run_metres,runs,...,errors,metres_per_run,opp_points,opp_tries,opp_run_metres,opp_metres_per_run,opp_line_breaks,opp_tackles,point_diff,run_metre_diff
0,1,1,14,1,1,34,5,3,1618,168,...,10,9.630952,12,2,1076,8.276923,2,339,22,542
1,2,1,1,14,0,12,2,1,1076,130,...,13,8.276923,34,5,1618,9.630952,5,238,-22,-542
2,3,2,8,6,1,19,3,0,1425,192,...,6,7.421875,18,3,1648,9.258427,4,366,1,-223
3,4,2,6,8,0,18,3,1,1648,178,...,16,9.258427,19,3,1425,7.421875,3,342,-1,223
4,5,3,10,4,1,20,4,1,1348,165,...,15,8.169697,14,2,1144,8.537313,1,323,6,204


In [11]:
def compute_past_x_match_avg(x, stat, data):
    x_match_avg = []
    for index in range(len(data)):
        if index <= (x - 1):
            start = 0
        else:
            start = (index - x)
        past_x_matches = data.iloc[start : index]
        avg = past_x_matches[stat].mean()
        x_match_avg.append(avg)
    return pd.Series(x_match_avg).values

In [12]:
columns = results_df.columns.tolist()
stats = ['is_winner', 'points', 'tries', 'penalty_goals', 'run_metres', 'runs', 'line_breaks',
         'tackle_breaks', 'tackles', 'missed_tackles', 'kicking_metres', 'errors']

for stat in stats:
    for x in [4, 8]:
        columns.append('avg_' + stat + '_past_' + str(x) + '_matches')
results_df = results_df.reindex(columns=columns, fill_value=0)

for team in list(results_df['team_id'].unique()):
    team_df = results_df[results_df['team_id'] == team]
    team_df = team_df.reset_index()
    for stat in stats:
        for x in [4, 8]:
            new_column = 'avg_' + stat + '_past_' + str(x) + '_matches'
            team_df[new_column] = compute_past_x_match_avg(x, stat, team_df)
    team_df = team_df.set_index('index')
    results_df.loc[results_df['team_id'] == team] = team_df
    #print(predictions_df[predictions_df['team_id'] == team])
print(results_df[results_df['team_id'] == 12])

      id  match_id  team_id  opponent_id  is_winner  points  tries  \
12    13         7       12           11          1      24      3   
24    25        13       12           13          1      18      3   
35    36        18       12            3          0      18      3   
49    50        25       12           10          1      33      4   
79    80        40       12           11          1      12      1   
92    93        47       12            5          1      35      6   
111  112        56       12            4          0      22      4   
120  121        61       12            3          1      22      3   
132  133        67       12           10          0      20      4   
147  148        74       12            8          1      29      4   
160  161        81       12           16          1      16      2   
186  187        94       12           14          1      28      4   
201  202       101       12            2          1      23      3   
221  222       111  

In [13]:
results_df.tail()

Unnamed: 0,id,match_id,team_id,opponent_id,is_winner,points,tries,penalty_goals,run_metres,runs,...,avg_tackle_breaks_past_4_matches,avg_tackle_breaks_past_8_matches,avg_tackles_past_4_matches,avg_tackles_past_8_matches,avg_missed_tackles_past_4_matches,avg_missed_tackles_past_8_matches,avg_kicking_metres_past_4_matches,avg_kicking_metres_past_8_matches,avg_errors_past_4_matches,avg_errors_past_8_matches
507,508,254,16,15,0,12,2,0,1479,161,...,32.25,29.875,367.75,361.5,25.0,26.625,537.5,542.375,10.0,9.75
508,509,255,9,8,0,18,3,0,1375,162,...,36.75,32.75,368.5,351.25,32.75,29.0,467.0,481.375,9.0,9.5
509,510,255,8,9,1,36,5,4,1679,167,...,34.75,29.25,356.0,339.25,33.5,36.125,474.25,481.0,10.5,11.0
510,511,256,11,14,1,32,5,2,1670,178,...,29.0,27.625,341.0,334.75,38.25,34.25,447.0,473.75,12.25,12.125
511,512,256,14,11,0,18,3,1,1419,159,...,30.25,33.375,356.5,354.375,22.5,24.25,683.25,625.75,10.25,10.5


In [14]:
query = pd.read_sql_query('''
    SELECT * 
    FROM Matches
    WHERE id < 257;
''', mydb)
predictions_df = pd.DataFrame(query)
predictions_df.tail()

Unnamed: 0,id,date,round,home_team_id,home_score,away_team_id,away_score,winner,is_draw,stadium_id,weather,url,home_odds,draw_odds,away_odds,is_playoff
251,252,2019-05-04,8,2,30,12,12,2,0,36,,http://www.nrl.com/draw/nrl-premiership/2019/r...,0.5714,0.0504,0.463,0
252,253,2019-05-04,8,6,18,3,10,6,0,13,,http://www.nrl.com/draw/nrl-premiership/2019/r...,0.5618,0.0534,0.4739,0
253,254,2019-05-04,8,15,42,16,12,15,0,29,,http://www.nrl.com/draw/nrl-premiership/2019/r...,0.8264,0.0418,0.2174,0
254,255,2019-05-05,8,9,18,8,36,8,0,30,,http://www.nrl.com/draw/nrl-premiership/2019/r...,0.6135,0.05,0.4237,0
255,256,2019-05-05,8,11,32,14,18,11,0,35,,http://www.nrl.com/draw/nrl-premiership/2019/r...,0.5525,0.052,0.4808,0


In [15]:
predictions_df = predictions_df[['id', 'round', 'home_team_id', 'home_score', 'away_team_id', 'away_score', 'winner', 'home_odds', 'away_odds']]

In [16]:
columns = list(results_df.drop(columns=['id', 'match_id', 'team_id', 'opponent_id', 'is_winner', 'points',
                                    'tries', 'penalty_goals', 'run_metres', 'runs', 'line_breaks',
                                    'tackle_breaks', 'tackles', 'missed_tackles', 'kicking_metres',
                                    'errors', 'metres_per_run', 'opp_points', 'opp_tries', 'opp_run_metres',
                                    'opp_metres_per_run', 'opp_line_breaks', 'opp_tackles', 'point_diff',
                                    'run_metre_diff']))

prediction_columns = predictions_df.columns.tolist()
for team in ['h', 'a']:
    for column in columns:
        prediction_columns.append(team + '_' + column)
print(prediction_columns)
predictions_df = predictions_df.reindex(columns=prediction_columns, fill_value=0)
predictions_df.head()

['id', 'round', 'home_team_id', 'home_score', 'away_team_id', 'away_score', 'winner', 'home_odds', 'away_odds', 'h_avg_is_winner_past_4_matches', 'h_avg_is_winner_past_8_matches', 'h_avg_points_past_4_matches', 'h_avg_points_past_8_matches', 'h_avg_tries_past_4_matches', 'h_avg_tries_past_8_matches', 'h_avg_penalty_goals_past_4_matches', 'h_avg_penalty_goals_past_8_matches', 'h_avg_run_metres_past_4_matches', 'h_avg_run_metres_past_8_matches', 'h_avg_runs_past_4_matches', 'h_avg_runs_past_8_matches', 'h_avg_line_breaks_past_4_matches', 'h_avg_line_breaks_past_8_matches', 'h_avg_tackle_breaks_past_4_matches', 'h_avg_tackle_breaks_past_8_matches', 'h_avg_tackles_past_4_matches', 'h_avg_tackles_past_8_matches', 'h_avg_missed_tackles_past_4_matches', 'h_avg_missed_tackles_past_8_matches', 'h_avg_kicking_metres_past_4_matches', 'h_avg_kicking_metres_past_8_matches', 'h_avg_errors_past_4_matches', 'h_avg_errors_past_8_matches', 'a_avg_is_winner_past_4_matches', 'a_avg_is_winner_past_8_matche

Unnamed: 0,id,round,home_team_id,home_score,away_team_id,away_score,winner,home_odds,away_odds,h_avg_is_winner_past_4_matches,...,a_avg_tackle_breaks_past_4_matches,a_avg_tackle_breaks_past_8_matches,a_avg_tackles_past_4_matches,a_avg_tackles_past_8_matches,a_avg_missed_tackles_past_4_matches,a_avg_missed_tackles_past_8_matches,a_avg_kicking_metres_past_4_matches,a_avg_kicking_metres_past_8_matches,a_avg_errors_past_4_matches,a_avg_errors_past_8_matches
0,1,1,14,34,1,12,14,0.5882,0.4386,0,...,0,0,0,0,0,0,0,0,0,0
1,2,1,8,19,6,18,8,0.5181,0.5128,0,...,0,0,0,0,0,0,0,0,0,0
2,3,1,10,20,4,14,10,0.5988,0.4329,0,...,0,0,0,0,0,0,0,0,0,0
3,4,1,16,10,15,8,16,0.25,0.7874,0,...,0,0,0,0,0,0,0,0,0,0
4,5,1,13,20,9,32,9,0.6803,0.3509,0,...,0,0,0,0,0,0,0,0,0,0


In [17]:
for match in predictions_df.iterrows():
    match = match[1]
    home_team = match['home_team_id']
    away_team = match['away_team_id']
    match_result_data = results_df.loc[results_df['match_id'] == match['id']]
    home_data = match_result_data[match_result_data['team_id'] == home_team]
    away_data = match_result_data[match_result_data['team_id'] == away_team]
    for team in ['h', 'a']:
        if team == 'h':
            df = home_data
        else:
            df = away_data
        #print(df)
        df = df.drop(columns=['id', 'match_id', 'team_id', 'opponent_id', 'is_winner', 'points',
                              'tries', 'penalty_goals', 'run_metres', 'runs', 'line_breaks',
                              'tackle_breaks', 'tackles', 'missed_tackles', 'kicking_metres',
                              'errors', 'metres_per_run', 'opp_points', 'opp_tries', 'opp_run_metres',
                              'opp_metres_per_run', 'opp_line_breaks', 'opp_tackles', 'point_diff',
                              'run_metre_diff'])
        for column in list(df.columns):
            col = team + '_' + column
            #print(df.iloc[0][column])
            match[col] = df.iloc[0][column]
            #print('\n')
            #print(df.iloc[0][column])
    #print(match.values)
    #print('\n')
    predictions_df.loc[predictions_df['id'] == match['id']] = match.values
print(predictions_df.head(10))

   id  round  home_team_id  home_score  away_team_id  away_score  winner  \
0   1      1            14          34             1          12      14   
1   2      1             8          19             6          18       8   
2   3      1            10          20             4          14      10   
3   4      1            16          10            15           8      16   
4   5      1            13          20             9          32       9   
5   6      1             3          18             7          36       7   
6   7      1            12          24            11          14      12   
7   8      1             5          30             2          28       5   
8   9      2             4          16            14          20      14   
9  10      2            15          30             3          12      15   

  home_odds away_odds  h_avg_is_winner_past_4_matches  ...  \
0    0.5882    0.4386                             NaN  ...   
1    0.5181    0.5128                  

## want to predict if home team wins
1 = home_team wins, 0 = away_team wins

In [18]:
predictions_df.head()

Unnamed: 0,id,round,home_team_id,home_score,away_team_id,away_score,winner,home_odds,away_odds,h_avg_is_winner_past_4_matches,...,a_avg_tackle_breaks_past_4_matches,a_avg_tackle_breaks_past_8_matches,a_avg_tackles_past_4_matches,a_avg_tackles_past_8_matches,a_avg_missed_tackles_past_4_matches,a_avg_missed_tackles_past_8_matches,a_avg_kicking_metres_past_4_matches,a_avg_kicking_metres_past_8_matches,a_avg_errors_past_4_matches,a_avg_errors_past_8_matches
0,1,1,14,34,1,12,14,0.5882,0.4386,,...,,,,,,,,,,
1,2,1,8,19,6,18,8,0.5181,0.5128,,...,,,,,,,,,,
2,3,1,10,20,4,14,10,0.5988,0.4329,,...,,,,,,,,,,
3,4,1,16,10,15,8,16,0.25,0.7874,,...,,,,,,,,,,
4,5,1,13,20,9,32,9,0.6803,0.3509,,...,,,,,,,,,,


In [19]:
predictions_df['home_winner'] = predictions_df.apply(lambda x: x['home_team_id'] == x['winner'], axis=1)

In [20]:
predictions_df = predictions_df.rename(columns={'home_odds': 'h_odds', 'away_odds': 'a_odds'})

In [21]:
comparison_cols = list(predictions_df.drop(columns=['id', 'round', 'home_team_id', 'home_score', 'away_team_id',
                                                                  'away_score', 'winner', 'home_winner']).columns)
stat_list = []
for column in comparison_cols:
    stat = column.split('_', 1)[1]
    if stat not in stat_list:
        stat_list.append(stat)
#stat_list

In [22]:
def compute_home_differential(row, column):
    h_col = 'h_' + column
    a_col = 'a_' + column
    return float(row[h_col]) - float(row[a_col])

diff_cols = []
for stat in stat_list:
    diff_col = 'diff_' + stat
    diff_cols.append(diff_col)
    predictions_df[diff_col] = predictions_df.apply(lambda x: compute_home_differential(x, stat), axis=1)
#predictions_df.head(10)
print(diff_cols)

['diff_odds', 'diff_avg_is_winner_past_4_matches', 'diff_avg_is_winner_past_8_matches', 'diff_avg_points_past_4_matches', 'diff_avg_points_past_8_matches', 'diff_avg_tries_past_4_matches', 'diff_avg_tries_past_8_matches', 'diff_avg_penalty_goals_past_4_matches', 'diff_avg_penalty_goals_past_8_matches', 'diff_avg_run_metres_past_4_matches', 'diff_avg_run_metres_past_8_matches', 'diff_avg_runs_past_4_matches', 'diff_avg_runs_past_8_matches', 'diff_avg_line_breaks_past_4_matches', 'diff_avg_line_breaks_past_8_matches', 'diff_avg_tackle_breaks_past_4_matches', 'diff_avg_tackle_breaks_past_8_matches', 'diff_avg_tackles_past_4_matches', 'diff_avg_tackles_past_8_matches', 'diff_avg_missed_tackles_past_4_matches', 'diff_avg_missed_tackles_past_8_matches', 'diff_avg_kicking_metres_past_4_matches', 'diff_avg_kicking_metres_past_8_matches', 'diff_avg_errors_past_4_matches', 'diff_avg_errors_past_8_matches']


In [120]:
cols = ['id', 'round', 'home_team_id', 'away_team_id', 'winner', 'home_winner', 'home_score', 'away_score', 'h_odds']
cols.extend(diff_cols)
print(cols)
final_predictions = predictions_df[cols] 
final_predictions.head()

['id', 'round', 'home_team_id', 'away_team_id', 'winner', 'home_winner', 'home_score', 'away_score', 'h_odds', 'diff_odds', 'diff_avg_is_winner_past_4_matches', 'diff_avg_is_winner_past_8_matches', 'diff_avg_points_past_4_matches', 'diff_avg_points_past_8_matches', 'diff_avg_tries_past_4_matches', 'diff_avg_tries_past_8_matches', 'diff_avg_penalty_goals_past_4_matches', 'diff_avg_penalty_goals_past_8_matches', 'diff_avg_run_metres_past_4_matches', 'diff_avg_run_metres_past_8_matches', 'diff_avg_runs_past_4_matches', 'diff_avg_runs_past_8_matches', 'diff_avg_line_breaks_past_4_matches', 'diff_avg_line_breaks_past_8_matches', 'diff_avg_tackle_breaks_past_4_matches', 'diff_avg_tackle_breaks_past_8_matches', 'diff_avg_tackles_past_4_matches', 'diff_avg_tackles_past_8_matches', 'diff_avg_missed_tackles_past_4_matches', 'diff_avg_missed_tackles_past_8_matches', 'diff_avg_kicking_metres_past_4_matches', 'diff_avg_kicking_metres_past_8_matches', 'diff_avg_errors_past_4_matches', 'diff_avg_erro

Unnamed: 0,id,round,home_team_id,away_team_id,winner,home_winner,home_score,away_score,h_odds,diff_odds,...,diff_avg_tackle_breaks_past_4_matches,diff_avg_tackle_breaks_past_8_matches,diff_avg_tackles_past_4_matches,diff_avg_tackles_past_8_matches,diff_avg_missed_tackles_past_4_matches,diff_avg_missed_tackles_past_8_matches,diff_avg_kicking_metres_past_4_matches,diff_avg_kicking_metres_past_8_matches,diff_avg_errors_past_4_matches,diff_avg_errors_past_8_matches
0,1,1,14,1,14,True,34,12,0.5882,0.1496,...,,,,,,,,,,
1,2,1,8,6,8,True,19,18,0.5181,0.0053,...,,,,,,,,,,
2,3,1,10,4,10,True,20,14,0.5988,0.1659,...,,,,,,,,,,
3,4,1,16,15,16,True,10,8,0.25,-0.5374,...,,,,,,,,,,
4,5,1,13,9,9,False,20,32,0.6803,0.3294,...,,,,,,,,,,


In [121]:
#final_predictions.corr()
final_predictions = final_predictions.dropna()
print(final_predictions.shape)
final_predictions.head()

(248, 34)


Unnamed: 0,id,round,home_team_id,away_team_id,winner,home_winner,home_score,away_score,h_odds,diff_odds,...,diff_avg_tackle_breaks_past_4_matches,diff_avg_tackle_breaks_past_8_matches,diff_avg_tackles_past_4_matches,diff_avg_tackles_past_8_matches,diff_avg_missed_tackles_past_4_matches,diff_avg_missed_tackles_past_8_matches,diff_avg_kicking_metres_past_4_matches,diff_avg_kicking_metres_past_8_matches,diff_avg_errors_past_4_matches,diff_avg_errors_past_8_matches
8,9,2,4,14,14,False,16,20,0.5102,-0.0079,...,-16.0,-16.0,85.0,85.0,21.0,21.0,-185.0,-185.0,4.0,4.0
9,10,2,15,3,15,True,30,12,0.7519,0.462,...,-6.0,-6.0,48.0,48.0,-4.0,-4.0,234.0,234.0,0.0,0.0
10,11,2,1,10,1,True,24,20,0.4484,-0.133,...,-21.0,-21.0,67.0,67.0,16.0,16.0,130.0,130.0,-2.0,-2.0
11,12,2,9,5,9,True,20,8,0.7634,0.4848,...,-2.0,-2.0,35.0,35.0,-2.0,-2.0,27.0,27.0,-4.0,-4.0
12,13,2,12,13,12,True,18,14,0.7194,0.3989,...,-2.0,-2.0,15.0,15.0,1.0,1.0,184.0,184.0,-4.0,-4.0


In [122]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
import seaborn as sns
sns.set()

In [123]:
final_predictions['home_winner'] = final_predictions['home_winner'].replace({True: 1, False: 0})

In [124]:
final_predictions = final_predictions.reset_index()
final_predictions.head()

Unnamed: 0,index,id,round,home_team_id,away_team_id,winner,home_winner,home_score,away_score,h_odds,...,diff_avg_tackle_breaks_past_4_matches,diff_avg_tackle_breaks_past_8_matches,diff_avg_tackles_past_4_matches,diff_avg_tackles_past_8_matches,diff_avg_missed_tackles_past_4_matches,diff_avg_missed_tackles_past_8_matches,diff_avg_kicking_metres_past_4_matches,diff_avg_kicking_metres_past_8_matches,diff_avg_errors_past_4_matches,diff_avg_errors_past_8_matches
0,8,9,2,4,14,14,0,16,20,0.5102,...,-16.0,-16.0,85.0,85.0,21.0,21.0,-185.0,-185.0,4.0,4.0
1,9,10,2,15,3,15,1,30,12,0.7519,...,-6.0,-6.0,48.0,48.0,-4.0,-4.0,234.0,234.0,0.0,0.0
2,10,11,2,1,10,1,1,24,20,0.4484,...,-21.0,-21.0,67.0,67.0,16.0,16.0,130.0,130.0,-2.0,-2.0
3,11,12,2,9,5,9,1,20,8,0.7634,...,-2.0,-2.0,35.0,35.0,-2.0,-2.0,27.0,27.0,-4.0,-4.0
4,12,13,2,12,13,12,1,18,14,0.7194,...,-2.0,-2.0,15.0,15.0,1.0,1.0,184.0,184.0,-4.0,-4.0


# First Iteration of Predictions

In [125]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif

In [127]:
features_df = final_predictions.drop(columns=['id', 'round', 'home_team_id', 'away_team_id', 'winner', 'home_winner', 'home_score', 'away_score', 'h_odds'])
target = final_predictions['home_winner']

In [128]:
X_train, X_test, y_train, y_test = train_test_split(features_df, target, test_size=0.2, random_state=42)
#print(y_test.shape, y_train.shape, X_test.shape, X_train.shape)

In [129]:
selector = SelectKBest(score_func=f_classif, k=6)
selector.fit(X_train, y_train)

SelectKBest(k=6, score_func=<function f_classif at 0x123dd6ae8>)

In [130]:
f_score_df = pd.DataFrame({'feature': X_train.columns.to_list(), 'score': selector.scores_})
f_score_df.sort_values('score', ascending=False)

Unnamed: 0,feature,score
1,diff_odds,11.508907
3,diff_avg_is_winner_past_8_matches,11.452377
9,diff_avg_penalty_goals_past_8_matches,5.563852
11,diff_avg_run_metres_past_8_matches,5.407459
2,diff_avg_is_winner_past_4_matches,4.900473
8,diff_avg_penalty_goals_past_4_matches,4.71053
22,diff_avg_kicking_metres_past_4_matches,4.540151
23,diff_avg_kicking_metres_past_8_matches,3.852152
10,diff_avg_run_metres_past_4_matches,3.704269
5,diff_avg_points_past_8_matches,2.290703


In [131]:
cols = selector.get_support(indices=True)
X_train_new = X_train.iloc[:, cols]
X_test_new = X_test.iloc[:, cols]

X_train_new = X_train_new.drop(columns=['diff_avg_is_winner_past_4_matches', 'diff_avg_penalty_goals_past_4_matches'])
X_test_new = X_test_new.drop(columns=['diff_avg_is_winner_past_4_matches', 'diff_avg_penalty_goals_past_4_matches'])

In [132]:
log = LogisticRegression()
log.fit(X_train_new, y_train)
y_pred = log.predict(X_test_new)



In [133]:
print('Accuracy of logistic regression classifier on test set: ' + str(log.score(X_test_new, y_pred)))

Accuracy of logistic regression classifier on test set: 1.0


In [134]:
print(y_pred.shape)
print(y_test.shape, y_train.shape, X_test_new.shape, X_train_new.shape)
from sklearn.metrics import confusion_matrix
confusion_matrix = confusion_matrix(y_test, y_pred)
print(confusion_matrix)

(50,)
(50,) (198,) (50, 4) (198, 4)
[[ 8 13]
 [ 8 21]]


In [135]:
df = pd.DataFrame({'winner': y_test, 'prediction': y_pred})
#print(df.index.to_list())
#print(final_predictions.head())
#print(X_test_new.head())
results_df = final_predictions.iloc[df.index.to_list()]
#print(results_df.head())
results_df = results_df[['home_team_id', 'away_team_id', 'home_score', 'away_score', 'h_odds', 'home_winner']]
results_df['predictions'] = y_pred
results_df.head()

Unnamed: 0,home_team_id,away_team_id,home_score,away_score,h_odds,home_winner,predictions
33,7,8,40,14,0.7752,1,1
6,6,11,54,0,0.5405,1,1
182,3,4,18,30,0.3831,0,1
86,8,4,10,48,0.3906,0,0
156,6,3,18,6,0.6667,1,1


In [136]:
results_df['correct_odds'] = results_df.apply(lambda x: 
                                              ((float(x['h_odds']) < 0.5) & (x['home_winner'] == False)) |
                                              ((float(x['h_odds']) > 0.5) & (x['home_winner'] == True)), axis=1)
results_df['correct_prediction'] = results_df.apply(lambda x: 
                                              ((x['predictions'] == 0) & (x['home_winner'] == False)) |
                                              ((x['predictions'] == 1) & (x['home_winner'] == True)), axis=1)

In [137]:
results_df.head()
prediction_percent = len(results_df[results_df['correct_prediction'] == True]) / results_df.shape[0]
betting_percent = len(results_df[results_df['correct_odds'] == True]) / results_df.shape[0]                                   
print('prediction_percentage = ' + str(prediction_percent))
print('betting_percentage = ' + str(betting_percent))

prediction_percentage = 0.58
betting_percentage = 0.62


In [141]:
analysis_df = predictions_df[['id', 'round', 'home_team_id', 'away_team_id', 'home_score', 'away_score', 'winner', 'home_winner', 'h_odds']].sort_values('id')
analysis_df = analysis_df.iloc

In [None]:
mycursor.close()