# Predicting Match Outcomes

In [1]:
#SQL Imports
import mysql.connector
#Pandas imports
import pandas as pd
import datetime

In [2]:
mydb = mysql.connector.connect(
  host="localhost",
  user="root",
  passwd="NYg1@nts",
  database="NRL_data"
)
mycursor = mydb.cursor()

In [3]:
query = pd.read_sql_query('''
    SELECT * 
    FROM TeamMatchStats
''', mydb)

In [4]:
match_stat_df = pd.DataFrame(query)

In [5]:
match_stat_df.head()

Unnamed: 0,id,match_id,team_id,opponent_id,is_winner,points,tries,conversions,penalty_goals,conversion_percentage,...,prop_1,prop_2,hooker,sr_1,sr_2,lock_1,int_1,int_2,int_3,int_4
0,1,1,14,1,1,34,5,4,3,0.8,...,404,396,391,407,406,483,1626,402,397,219
1,2,1,1,14,0,12,2,1,1,0.5,...,11,500,499,2,10,269,3,400,9,1624
2,3,2,8,6,1,19,3,3,0,1.0,...,192,503,201,196,188,197,190,429,194,505
3,4,2,6,8,0,18,3,2,1,0.6667,...,510,509,153,511,155,512,516,309,513,327
4,5,3,10,4,1,20,4,1,1,0.25,...,268,272,265,264,375,266,260,276,267,261


# Prediction Algorithm

Things to try:
1. Want to create scoring function (sklearn.metrics import f1_score, make_scorer, classification_report)

2. Random Forest
3. Neural Network (10-3-2)

Features to try:
- home field advantage
- form in last X matches
- Points differential last X matches
- Points scored last X matches
- Points allowed last X matches
- Running metre differential last X matches
- Tackle Percentages
- Factor in the strength of opposition


In [6]:
correlation_df = match_stat_df
col = list(match_stat_df.columns)[35:52]
correlation_df = match_stat_df.drop(col, axis=1)
correlation_df.head()

Unnamed: 0,id,match_id,team_id,opponent_id,is_winner,points,tries,conversions,penalty_goals,conversion_percentage,...,ineffective_tackles,tackle_percentage,kicks,kicking_metres,metres_per_kick,errors,handling_errors,penalties,sin_bins,send_offs
0,1,1,14,1,1,34,5,4,3,0.8,...,14,0.8881,17,478,28.1176,10,9,7,0,0
1,2,1,1,14,0,12,2,1,1,0.5,...,15,0.8715,14,382,27.2857,13,11,9,0,0
2,3,2,8,6,1,19,3,3,0,1.0,...,27,0.8465,21,538,25.619,6,6,9,0,0
3,4,2,6,8,0,18,3,2,1,0.6667,...,5,0.8883,13,284,21.8462,16,10,11,0,0
4,5,3,10,4,1,20,4,1,1,0.25,...,19,0.8774,13,252,19.3846,15,10,6,0,0


In [7]:
correlation_df = correlation_df.drop(columns=['line_engaged_runs'])

In [8]:
import seaborn as sns
corr = correlation_df.corr()
corr['is_winner']
stats = corr[(corr['is_winner'] > .2) | (corr['is_winner'] < -.2)]
stats['is_winner']
#corr.style.background_gradient(cmap='coolwarm')

is_winner              1.000000
points                 0.616395
tries                  0.563085
conversions            0.495853
penalty_goals          0.331492
field_goals            0.213446
runs                   0.291058
run_metres             0.387814
kick_return_metres     0.257286
post_contact_metres    0.253308
line_breaks            0.372330
tackle_breaks          0.291125
hit_ups                0.295609
tackles               -0.330251
missed_tackles        -0.290167
kicks                  0.236040
kicking_metres         0.247338
errors                -0.236594
handling_errors       -0.203090
Name: is_winner, dtype: float64

Columns to take into consideration
From highly correlated stats:

- points
- tries
- penalty_goals
- run_metres
- runs
- line_breaks
- tackle_breaks
- tackles
- missed_tackles
- kicking_metres
- errors

In [19]:
prediction_columns = ['id', 'match_id', 'team_id', 'opponent_id', 'is_winner', 'points', 'tries', 'penalty_goals',
                     'run_metres', 'runs', 'line_breaks', 'tackle_breaks', 'tackles', 'missed_tackles',
                     'kicking_metres', 'errors']
predictions_df = match_stat_df[prediction_columns]
predictions_df.head()

Unnamed: 0,id,match_id,team_id,opponent_id,is_winner,points,tries,penalty_goals,run_metres,runs,line_breaks,tackle_breaks,tackles,missed_tackles,kicking_metres,errors
0,1,1,14,1,1,34,5,3,1618,168,5,35,238,16,478,10
1,2,1,1,14,0,12,2,1,1076,130,2,16,339,35,382,13
2,3,2,8,6,1,19,3,0,1425,192,3,41,342,35,538,6
3,4,2,6,8,0,18,3,1,1648,178,4,35,366,41,284,16
4,5,3,10,4,1,20,4,1,1348,165,3,37,272,19,252,15


columns to create

- tackle_percentage
- metres_per_run

- point_differential
- run_metre differential
- 

- home field advantage
- form in last X matches
- Points differential last X matches
- Points scored last X matches
- Points allowed last X matches
- Running metre differential last X matches
- line_breaks last X matches
- errors last X matches
- Tackle Percentages
- Strength of opposition metric

In [20]:
columns = [*predictions_df.columns.tolist(),'point_diff', 'run_metre_diff', 'points_allowed', 'run_metres_allowed', 'line_breaks_allowed']
predictions_df = predictions_df.reindex(columns=columns, fill_value=0)
#print(predictions_df)

for match in list(predictions_df['match_id'].unique()):
    match_df = predictions_df[predictions_df['match_id'] == match]
    #print(match_df)
    for team in list(match_df['team_id']):
        team_data = match_df[match_df['team_id'] == team]
        opp_data = match_df[match_df['team_id'] != team]
        point_diff = team_data['points'].iloc[0] - opp_data['points'].iloc[0]
        run_metre_diff = team_data['run_metres'].iloc[0] - opp_data['run_metres'].iloc[0]
        points_allowed = opp_data['points'].iloc[0]
        run_metres_allowed = opp_data['run_metres'].iloc[0]
        line_breaks_allowed = opp_data['line_breaks'].iloc[0]
        
        predictions_df.loc[team_data.index, 'point_diff'] = point_diff
        predictions_df.loc[team_data.index, 'run_metre_diff'] = run_metre_diff
        predictions_df.loc[team_data.index, 'points_allowed'] = points_allowed
        predictions_df.loc[team_data.index, 'run_metres_allowed'] = run_metres_allowed
        predictions_df.loc[team_data.index, 'line_breaks_allowed'] = line_breaks_allowed
        
predictions_df.head()

Unnamed: 0,id,match_id,team_id,opponent_id,is_winner,points,tries,penalty_goals,run_metres,runs,...,tackle_breaks,tackles,missed_tackles,kicking_metres,errors,point_diff,run_metre_diff,points_allowed,run_metres_allowed,line_breaks_allowed
0,1,1,14,1,1,34,5,3,1618,168,...,35,238,16,478,10,22,542,12,1076,2
1,2,1,1,14,0,12,2,1,1076,130,...,16,339,35,382,13,-22,-542,34,1618,5
2,3,2,8,6,1,19,3,0,1425,192,...,41,342,35,538,6,1,-223,18,1648,4
3,4,2,6,8,0,18,3,1,1648,178,...,35,366,41,284,16,-1,223,19,1425,3
4,5,3,10,4,1,20,4,1,1348,165,...,37,272,19,252,15,6,204,14,1144,1


In [21]:
def compute_past_x_match_avg(x, stat, data):
    x_match_avg = []
    for index in range(len(data)):
        if index <= (x - 1):
            start = 0
        else:
            start = (index - x)
        past_x_days = data.iloc[(index - x) : index]
        avg = past_x_days[stat].mean()
        x_match_avg.append(avg)
    new_column = 'avg_' + stat + '_' + str(x) + '_matches'
    return pd.Series(x_match_avg).values

new_columns = []
for stat in ['is_winner', 'points', 'points_allowed', 'point_diff', 'run_metre_diff', 'line_breaks', 'errors']:
    new_columns.append('avg_' + stat + '_' + str(5) + '_matches')
columns = [*predictions_df.columns.tolist(), new_columns]
predictions_df = predictions_df.reindex(columns=columns, fill_value=0)
print(predictions_df.columns)

for team in list(predictions_df['team_id'].unique()):
    team_df = predictions_df[predictions_df['team_id'] == team]
    team_df = team_df.reset_index()
    for stat in ['is_winner', 'points', 'points_allowed', 'point_diff', 'run_metre_diff', 'line_breaks', 'errors']:
        x = 5
        new_column = 'avg_' + stat + '_' + str(x) + '_matches'
        team_df[new_column] = compute_past_x_match_avg(x, stat, team_df)
    #print(team_df.head(10))
    predictions_df[predictions_df['team_id'] == team] = team_df
    print(predictions_df[predictions_df['team_id'] == team])

TypeError: unhashable type: 'list'

In [None]:
mycursor.close()