# Predicting Match Outcomes

In [1]:
#Standard library imports
import datetime
import getpass
import os
#SQL Imports
import mysql.connector
#Pandas imports
import pandas as pd

In [2]:
# Open the connection to the local MySQL database that holds the NRL data.
# The password is deliberately NOT stored in the notebook: it is read from the
# NRL_DB_PASSWORD environment variable, and if that is unset (or empty) the
# user is prompted for it interactively.
# NOTE(review): the password that used to be hardcoded here has been committed
# to version control and should be rotated.
mydb = mysql.connector.connect(
  host="localhost",
  user="root",
  passwd=os.environ.get("NRL_DB_PASSWORD") or getpass.getpass("MySQL password for root@localhost: "),
  database="NRL_data"
)
# Cursor on the connection above; closed again in the last cell of the notebook.
mycursor = mydb.cursor()

In [3]:
# Load every row and column of the TeamMatchStats table through the open
# connection. Despite its name, `query` holds the result of the query.
query = pd.read_sql_query(
    sql='''
    SELECT * 
    FROM TeamMatchStats
''',
    con=mydb,
)

In [4]:
# Wrap the query result in a DataFrame under a descriptive name; this is the
# base frame every later cell derives its data from.
match_stat_df = pd.DataFrame(query)

In [5]:
# Preview the first rows to sanity-check the load.
match_stat_df.head()

Unnamed: 0,id,match_id,team_id,opponent_id,is_winner,points,tries,conversions,penalty_goals,conversion_percentage,...,prop_1,prop_2,hooker,sr_1,sr_2,lock_1,int_1,int_2,int_3,int_4
0,1,1,14,1,1,34,5,4,3,0.8,...,404,396,391,407,406,483,1626,402,397,219
1,2,1,1,14,0,12,2,1,1,0.5,...,11,500,499,2,10,269,3,400,9,1624
2,3,2,8,6,1,19,3,3,0,1.0,...,192,503,201,196,188,197,190,429,194,505
3,4,2,6,8,0,18,3,2,1,0.6667,...,510,509,153,511,155,512,516,309,513,327
4,5,3,10,4,1,20,4,1,1,0.25,...,268,272,265,264,375,266,260,276,267,261


# Prediction Algorithm

Things to try:
1. Create a scoring function (`from sklearn.metrics import f1_score, make_scorer, classification_report`)

2. Random Forest
3. Neural Network (10-3-2)

Features to try:
- home field advantage
- form in last X matches
- Points differential last X matches
- Points scored last X matches
- Points allowed last X matches
- Running metre differential last X matches
- Tackle Percentages
- Factor in the strength of opposition


In [6]:
# Build the frame used for the correlation analysis by dropping the columns at
# positions 35..51 of match_stat_df.
# NOTE(review): the slice is positional, so it silently selects different
# columns if the table schema changes. Judging by the preview of match_stat_df
# these are presumably the player-position id columns (prop_1 ... int_4) -
# confirm, and consider listing them by name instead.
col = list(match_stat_df.columns)[35:52]
correlation_df = match_stat_df.drop(columns=col)
correlation_df.head()

Unnamed: 0,id,match_id,team_id,opponent_id,is_winner,points,tries,conversions,penalty_goals,conversion_percentage,...,ineffective_tackles,tackle_percentage,kicks,kicking_metres,metres_per_kick,errors,handling_errors,penalties,sin_bins,send_offs
0,1,1,14,1,1,34,5,4,3,0.8,...,14,0.8881,17,478,28.1176,10,9,7,0,0
1,2,1,1,14,0,12,2,1,1,0.5,...,15,0.8715,14,382,27.2857,13,11,9,0,0
2,3,2,8,6,1,19,3,3,0,1.0,...,27,0.8465,21,538,25.619,6,6,9,0,0
3,4,2,6,8,0,18,3,2,1,0.6667,...,5,0.8883,13,284,21.8462,16,10,11,0,0
4,5,3,10,4,1,20,4,1,1,0.25,...,19,0.8774,13,252,19.3846,15,10,6,0,0


In [7]:
# Remove the 'line_engaged_runs' column before computing correlations.
# NOTE(review): the reason for excluding this column is not stated - confirm.
# This cell is not idempotent: it rebinds correlation_df, so running it a
# second time fails because the column is already gone.
correlation_df = correlation_df.drop(columns=['line_engaged_runs'])

In [8]:
import seaborn as sns
# Pairwise correlation matrix of the remaining columns.
corr = correlation_df.corr()
# Keep only the rows whose correlation with the match result is stronger than
# 0.2 in either direction, then show that single column.
stats = corr.loc[corr['is_winner'].abs() > .2]
stats['is_winner']
#corr.style.background_gradient(cmap='coolwarm')

is_winner              1.000000
points                 0.616395
tries                  0.563085
conversions            0.495853
penalty_goals          0.331492
field_goals            0.213446
runs                   0.291058
run_metres             0.387814
kick_return_metres     0.257286
post_contact_metres    0.253308
line_breaks            0.372330
tackle_breaks          0.291125
hit_ups                0.295609
tackles               -0.330251
missed_tackles        -0.290167
kicks                  0.236040
kicking_metres         0.247338
errors                -0.236594
handling_errors       -0.203090
Name: is_winner, dtype: float64

Columns to take into consideration, chosen from the stats whose correlation with `is_winner` is stronger than 0.2 in either direction:

- points
- tries
- penalty_goals
- run_metres
- runs
- line_breaks
- tackle_breaks
- tackles
- missed_tackles
- kicking_metres
- errors

In [58]:
# Columns carried into the prediction frame: the row/match/team identifiers,
# the target (is_winner), and the stats picked from the correlation analysis.
prediction_columns = [
    'id', 'match_id', 'team_id', 'opponent_id',
    'is_winner',
    'points', 'tries', 'penalty_goals',
    'run_metres', 'runs', 'line_breaks', 'tackle_breaks',
    'tackles', 'missed_tackles',
    'kicking_metres', 'errors',
]
predictions_df = match_stat_df.loc[:, prediction_columns]
predictions_df.head()

Unnamed: 0,id,match_id,team_id,opponent_id,is_winner,points,tries,penalty_goals,run_metres,runs,line_breaks,tackle_breaks,tackles,missed_tackles,kicking_metres,errors
0,1,1,14,1,1,34,5,3,1618,168,5,35,238,16,478,10
1,2,1,1,14,0,12,2,1,1076,130,2,16,339,35,382,13
2,3,2,8,6,1,19,3,0,1425,192,3,41,342,35,538,6
3,4,2,6,8,0,18,3,1,1648,178,4,35,366,41,284,16
4,5,3,10,4,1,20,4,1,1348,165,3,37,272,19,252,15


Columns to create:

- tackle_percentage
- metres_per_run

- point_differential
- run_metre_differential

- home field advantage
- form in last X matches
- Points differential last X matches
- Points scored last X matches
- Points allowed last X matches
- Running metre differential last X matches
- line_breaks last X matches
- errors last X matches
- Tackle Percentages
- Strength of opposition metric

In [59]:
# Add opponent-relative features to every team-match row:
#   point_diff          - own points minus the opponent's points
#   run_metre_diff      - own run metres minus the opponent's run metres
#   points_allowed      - the opponent's points
#   run_metres_allowed  - the opponent's run metres
#   line_breaks_allowed - the opponent's line breaks
new_columns = ['point_diff', 'run_metre_diff', 'points_allowed', 'run_metres_allowed', 'line_breaks_allowed']

# Append the new columns (initialised to 0). Columns that already exist are
# filtered out first so that re-running the cell does not duplicate them.
columns = [*[c for c in predictions_df.columns if c not in new_columns], *new_columns]
predictions_df = predictions_df.reindex(columns=columns, fill_value=0)

for match in predictions_df['match_id'].unique():
    match_df = predictions_df[predictions_df['match_id'] == match]
    for team in match_df['team_id']:
        # Within one match, the opponent is whichever row is not this team's.
        # Assumes the match has a row for the other team; .iloc[0] raises
        # IndexError if that row is missing.
        team_data = match_df[match_df['team_id'] == team]
        opp_data = match_df[match_df['team_id'] != team]
        point_diff = team_data['points'].iloc[0] - opp_data['points'].iloc[0]
        run_metre_diff = team_data['run_metres'].iloc[0] - opp_data['run_metres'].iloc[0]
        points_allowed = opp_data['points'].iloc[0]
        run_metres_allowed = opp_data['run_metres'].iloc[0]
        line_breaks_allowed = opp_data['line_breaks'].iloc[0]

        # Values are listed in the same order as new_columns.
        predictions_df.loc[team_data.index, new_columns] = [
            point_diff, run_metre_diff, points_allowed, run_metres_allowed, line_breaks_allowed
        ]

predictions_df.head()

Unnamed: 0,id,match_id,team_id,opponent_id,is_winner,points,tries,penalty_goals,run_metres,runs,...,tackle_breaks,tackles,missed_tackles,kicking_metres,errors,point_diff,run_metre_diff,points_allowed,run_metres_allowed,line_breaks_allowed
0,1,1,14,1,1,34,5,3,1618,168,...,35,238,16,478,10,22,542,12,1076,2
1,2,1,1,14,0,12,2,1,1076,130,...,16,339,35,382,13,-22,-542,34,1618,5
2,3,2,8,6,1,19,3,0,1425,192,...,41,342,35,538,6,1,-223,18,1648,4
3,4,2,6,8,0,18,3,1,1648,178,...,35,366,41,284,16,-1,223,19,1425,3
4,5,3,10,4,1,20,4,1,1348,165,...,37,272,19,252,15,6,204,14,1144,1


In [62]:
def compute_past_x_match_avg(x, stat, data):
    """Return, for every row of ``data``, the mean of ``stat`` over the
    preceding ``x`` rows.

    The window for the row at position ``i`` covers positions
    ``max(0, i - x)`` to ``i - 1``. The row itself is excluded, so the value
    only reflects earlier rows. Rows with fewer than ``x`` predecessors
    average whatever history exists; the first row has none and gets NaN.

    Parameters
    ----------
    x : int
        Maximum number of preceding rows in each window.
    stat : str
        Name of the column in ``data`` to average.
    data : pandas.DataFrame
        Frame containing ``stat``. Only positional row order is used, so the
        rows should already be in the order the averages are meant to follow.

    Returns
    -------
    numpy.ndarray
        One value per row of ``data``, in row order.
    """
    history = data[stat]
    x_match_avg = []
    for index in range(len(data)):
        # Clamp at 0: a negative start would be read by iloc as an offset
        # from the end of the frame and select the wrong (usually empty) window.
        start = max(0, index - x)
        x_match_avg.append(history.iloc[start:index].mean())
    return pd.Series(x_match_avg).values

# For every team (in order of first appearance) compute the trailing
# 5-match average of each listed stat and print the first ten rows.
# NOTE: team_df is rebuilt on every iteration and only printed; the new
# columns are not written back to predictions_df.
x = 5
for team, team_df in predictions_df.groupby('team_id', sort=False):
    # reset_index() keeps the original row labels in an 'index' column.
    team_df = team_df.reset_index()
    for stat in ('is_winner', 'points', 'points_allowed', 'point_diff', 'run_metre_diff', 'line_breaks', 'errors'):
        new_column = f'avg_{stat}_{x}_matches'
        team_df[new_column] = compute_past_x_match_avg(x, stat, team_df)
    print(team_df.head(10))

   index   id  match_id  team_id  opponent_id  is_winner  points  tries  \
0      0    1         1       14            1          1      34      5   
1     17   18         9       14            4          1      20      3   
2     45   46        23       14            5          1      54      9   
3     58   59        30       14            8          1      30      5   
4     68   69        35       14           13          1      16      2   
5     84   85        43       14            4          1      40      6   
6     99  100        50       14            9          0      12      2   
7    112  113        57       14           15          1      24      4   
8    140  141        71       14            7          1      34      4   
9    157  158        79       14           13          0      10      2   

   penalty_goals  run_metres  ...  points_allowed  run_metres_allowed  \
0              3        1618  ...              12                1076   
1              2        1353

   index   id  match_id  team_id  opponent_id  is_winner  points  tries  \
0      2    3         2        8            6          1      19      3   
1     31   32        16        8            2          1      30      5   
2     47   48        24        8           15          0       8      1   
3     59   60        30        8           14          0      12      2   
4     74   75        38        8            1          1      15      2   
5     83   84        42        8            7          0      14      3   
6    105  106        53        8           16          1      22      5   
7    119  120        60        8            6          1      18      3   
8    130  131        66        8           13          0      18      4   
9    146  147        74        8           12          0      18      3   

   penalty_goals  run_metres  ...  points_allowed  run_metres_allowed  \
0              0        1425  ...              18                1648   
1              1        1362

   index   id  match_id  team_id  opponent_id  is_winner  points  tries  \
0      4    5         3       10            4          1      20      4   
1     21   22        11       10            1          0      20      3   
2     33   34        17       10            7          0      14      2   
3     48   49        25       10           12          0      14      3   
4     73   74        37       10            9          0      12      2   
5     88   89        45       10            3          0      10      2   
6    106  107        54       10            5          1      26      4   
7    124  125        63       10            2          0       8      1   
8    133  134        67       10           12          1      26      4   
9    145  146        73       10           16          0      12      2   

   penalty_goals  run_metres  ...  points_allowed  run_metres_allowed  \
0              1        1348  ...              14                1144   
1              1        1364

   index   id  match_id  team_id  opponent_id  is_winner  points  tries  \
0      6    7         4       16           15          1      10      1   
1     27   28        14       16            7          1      10      1   
2     36   37        19       16            1          0       7      1   
3     62   63        32       16           11          1      30      5   
4     70   71        36       16            7          1      11      2   
5     95   96        48       16            6          1      38      6   
6    104  105        53       16            8          0      20      3   
7    127  128        64       16           11          0      22      3   
8    137  138        69       16            9          0       4      1   
9    144  145        73       16           10          1      20      4   

   penalty_goals  run_metres  ...  points_allowed  run_metres_allowed  \
0              1        1600  ...               8                1427   
1              3        1376

   index   id  match_id  team_id  opponent_id  is_winner  points  tries  \
0      8    9         5       13            9          0      20      4   
1     25   26        13       13           12          0      14      2   
2     40   41        21       13            6          1      34      7   
3     50   51        26       13            3          1      20      4   
4     69   70        35       13           14          0      12      2   
5     81   82        41       13           15          1      26      4   
6    102  103        52       13            2          1      42      7   
7    116  117        59       13            1          0      20      3   
8    131  132        66       13            8          1      36      6   
9    156  157        79       13           14          1      24      4   

   penalty_goals  run_metres  ...  points_allowed  run_metres_allowed  \
0              0        1551  ...              32                1590   
1              1        1650

   index   id  match_id  team_id  opponent_id  is_winner  points  tries  \
0     10   11         6        3            7          0      18      3   
1     19   20        10        3           15          0      12      2   
2     34   35        18        3           12          1      20      4   
3     51   52        26        3           13          0      16      2   
4     65   66        33        3            2          0      10      2   
5     89   90        45        3           10          1      27      3   
6     96   97        49        3           15          0       0      0   
7    121  122        61        3           12          0      14      2   
8    129  130        65        3            1          0      20      3   
9    148  149        75        3           11          1      20      2   

   penalty_goals  run_metres  ...  points_allowed  run_metres_allowed  \
0              1        1427  ...              36                1616   
1              0        1470

   index   id  match_id  team_id  opponent_id  is_winner  points  tries  \
0     12   13         7       12           11          1      24      3   
1     24   25        13       12           13          1      18      3   
2     35   36        18       12            3          0      18      3   
3     49   50        25       12           10          1      33      4   
4     79   80        40       12           11          1      12      1   
5     92   93        47       12            5          1      35      6   
6    111  112        56       12            4          0      22      4   
7    120  121        61       12            3          1      22      3   
8    132  133        67       12           10          0      20      4   
9    147  148        74       12            8          1      29      4   

   penalty_goals  run_metres  ...  points_allowed  run_metres_allowed  \
0              3        1583  ...              14                1400   
1             -1        1675

   index   id  match_id  team_id  opponent_id  is_winner  points  tries  \
0     14   15         8        5            2          1      30      5   
1     23   24        12        5            9          0       8      1   
2     44   45        23        5           14          0       8      1   
3     61   62        31        5            1          1      26      5   
4     76   77        39        5            6          1      32      5   
5     93   94        47        5           12          0      12      2   
6    107  108        54        5           10          0      14      2   
7    122  123        62        5            4          0       9      1   
8    135  136        68        5            2          0      18      3   
9    153  154        77        5            7          0      22      4   

   penalty_goals  run_metres  ...  points_allowed  run_metres_allowed  \
0              1        1563  ...              28                1221   
1              1        1498

In [None]:
# Release the database resources: close the cursor first, then the connection
# it was created from (previously the connection was left open).
mycursor.close()
mydb.close()