# NRL match analysis

Going to determine which stats have the biggest impact on the outcome of NRL matches. We will see if we can use this information to make predictions about future matches.

In [2]:
#Import Pandas
import pandas as pd

In [3]:
#Connect to database
import mysql.connector as sql

mydb = sql.connect(
  host="localhost",
  user="root",
  passwd="NYg1@nts",
  database="NRL_data"
)


In [4]:
#Function to easily create dataframes from SQL query results
def create_df(query):
    return pd.read_sql_query(query, con=mydb)

In [5]:
#Only want to analyze matches from the 2018 season
matches2018 = "SELECT * FROM Matches WHERE year(date) = '2018';"
matches2018 = create_df(matches2018)
matches2018.head()

Unnamed: 0,id,date,round,home_team_id,home_score,away_team_id,away_score,winner,is_draw,stadium_id,weather,url
0,1,2018-03-08,1,14,34,1,12,14,0,1,,http://www.nrl.com/draw/nrl-premiership/2018/r...
1,2,2018-03-09,1,8,19,6,18,8,0,2,,http://www.nrl.com/draw/nrl-premiership/2018/r...
2,3,2018-03-09,1,10,20,4,14,10,0,3,,http://www.nrl.com/draw/nrl-premiership/2018/r...
3,4,2018-03-10,1,16,10,15,8,16,0,4,,http://www.nrl.com/draw/nrl-premiership/2018/r...
4,5,2018-03-10,1,13,20,9,32,9,0,5,,http://www.nrl.com/draw/nrl-premiership/2018/r...


# Make Standings to have better idea of teams that performed well

In [6]:
#Better way to view the data in a table...easier to work with team names than ids
matches2018 = '''SELECT m.id, m.round, m.date,
CASE
	WHEN m.home_team_id = m.winner THEN home.nickname
    ELSE away.nickname
END winner,
CASE
	WHEN m.home_team_id = m.winner THEN m.home_score
    ELSE m.away_score
END winning_score,
CASE
	WHEN m.home_team_id = m.winner THEN away.nickname
    ELSE home.nickname
END loser,
CASE
	WHEN m.home_team_id = m.winner THEN m.away_score
    ELSE m.home_score
END losing_score, home.nickname home, away.nickname away
FROM Matches m
JOIN Teams home
ON m.home_team_id = home.id
JOIN Teams away
ON m.away_team_id = away.id
WHERE year(m.date) = 2018
ORDER BY date;'''
matches2018 = create_df(matches2018)
matches2018.head()

Unnamed: 0,id,round,date,winner,winning_score,loser,losing_score,home,away
0,1,1,2018-03-08,Dragons,34,Broncos,12,Dragons,Broncos
1,2,1,2018-03-09,Knights,19,Sea Eagles,18,Knights,Sea Eagles
2,3,1,2018-03-09,Cowboys,20,Sharks,14,Cowboys,Sharks
3,6,1,2018-03-10,Storm,36,Bulldogs,18,Bulldogs,Storm
4,5,1,2018-03-10,Warriors,32,Rabbitohs,20,Rabbitohs,Warriors


In [7]:
winners = matches2018.groupby('winner')
w_scored = winners['winning_score'].sum()
w_allowed = winners['losing_score'].sum()

losers = matches2018.groupby('loser')
l_scored = losers['losing_score'].sum()
l_allowed = losers['winning_score'].sum()

total_scores = pd.concat([w_scored, l_scored, w_allowed, l_allowed], axis=1, join='inner')
total_scores.columns = ['scored_in_win', 'scored_in_loss', 'allowed_in_win', 'allowed_in_loss']
total_scores.head(2)

total_scores['scored'] = total_scores['scored_in_win'] + total_scores['scored_in_loss']
total_scores['allowed'] = total_scores['allowed_in_win'] + total_scores['allowed_in_loss']
total_scores['point_differential'] = total_scores['scored'] - total_scores['allowed']
total_points_data = total_scores[['point_differential', 'scored', 'allowed']]
total_points_data.head()

Unnamed: 0,point_differential,scored,allowed
Broncos,56,556,500
Bulldogs,-46,428,474
Cowboys,-72,449,521
Dragons,47,519,472
Eels,-176,374,550


In [8]:
#Pull win / loss records from database
standings2018 = '''SELECT t.nickname team,
    count(m.winner) as wins,
    24 - count(m.winner) as loses
FROM Matches m
JOIN Teams t
ON m.winner = t.id
WHERE year(m.date) = 2018
GROUP BY m.winner
ORDER BY wins DESC;'''
standings2018 = create_df(standings2018).set_index('team')
standings2018.head()

Unnamed: 0_level_0,wins,loses
team,Unnamed: 1_level_1,Unnamed: 2_level_1
Roosters,16,8
Rabbitohs,16,8
Storm,16,8
Sharks,16,8
Dragons,15,9


In [9]:
final_standings = pd.concat([standings2018, total_points_data], axis=1, join='inner').sort_values(['wins', 'point_differential'], ascending=[0,0])
final_standings

Unnamed: 0,wins,loses,point_differential,scored,allowed
Roosters,16,8,181,542,361
Storm,16,8,173,536,363
Rabbitohs,16,8,145,582,437
Sharks,16,8,96,519,423
Broncos,15,9,56,556,500
Panthers,15,9,56,517,461
Dragons,15,9,47,519,472
Warriors,15,9,25,472,447
Tigers,12,12,-83,377,460
Raiders,10,14,23,563,540


In [10]:
#From the standings above, we can see that points and 

final_standings.plot.scatter(x='wins', y='point_differential', color='Red', label='point_differential')
final_standings.plot.scatter(x='wins', y='scored', color='Blue', label='scored')
final_standings.plot.scatter(x='wins', y='allowed', color='Green', label='allowed')

<matplotlib.axes._subplots.AxesSubplot at 0x1194c7400>

We can see correlation between point_differential and wins, but lets look at alternative metrics and player stats to find a better correlation to win probability.

# Want to see if there are any statistics that are a good predictor of wins

In [11]:
#PlayerMatchStats
stats2018 = '''SELECT t.nickname team, 
	count(m.winner) * 2 as points,
    count(m.winner) as wins
FROM PlayerMatchStats m
JOIN Teams t
ON m.winner = t.id
WHERE year(m.date) = 2018
GROUP BY m.winner
ORDER BY points DESC;'''

In [45]:
query = '''SELECT CONCAT(p.first_name, ' ' ,p.last_name) name, t.nickname team, p_stats.*
        FROM PlayerMatchStats p_stats
        JOIN Players p
        ON p_stats.player_id = p.id
        JOIN Teams t
        ON p_stats.team_id = t.id;'''

player_df = create_df(query)
player_df

Unnamed: 0,name,team,id,match_id,player_id,team_id,position_id,minutes_played,points,tries,...,kicked_dead,errors,handling_errors,one_on_ones_lost,penalties,on_report,sin_bins,send_offs,stint_one,stint_two
0,Darius Boyd,Broncos,18,1,22,1,1,80.0,0,0,...,0,1,1,0,0,0,0,0,80.0,
1,Corey Oates,Broncos,19,1,20,1,2,80.0,0,0,...,0,1,1,0,0,0,0,0,80.0,
2,James Roberts,Broncos,20,1,27,1,3,80.0,8,2,...,0,0,0,0,1,0,0,0,80.0,
3,Jordan Kahu,Broncos,21,1,251,1,3,80.0,4,0,...,0,1,1,0,0,0,0,0,80.0,
4,Jamayne Isaako,Broncos,22,1,26,1,2,80.0,0,0,...,0,2,0,0,0,0,0,0,80.0,
5,Anthony Milford,Broncos,23,1,19,1,4,80.0,0,0,...,1,1,1,0,1,0,0,0,80.0,
6,Kodi Nikorima,Broncos,24,1,498,1,5,80.0,0,0,...,1,0,0,0,0,0,0,0,80.0,
7,Matthew Lodge,Broncos,25,1,11,1,6,58.0,0,0,...,0,2,2,0,1,0,0,0,26.45,30.37
8,Sam Thaiday,Broncos,26,1,499,1,7,39.0,0,0,...,0,1,1,0,1,0,0,0,26.34,12.09
9,Tevita Pangai Junior,Broncos,27,1,500,1,6,46.0,0,0,...,0,2,2,0,0,0,0,0,22.44,22.55


In [46]:
grouped = player_df.groupby(['match_id', 'team']) 
grouped.first()

Unnamed: 0_level_0,Unnamed: 1_level_0,name,id,player_id,team_id,position_id,minutes_played,points,tries,conversions,conversion_attempts,...,kicked_dead,errors,handling_errors,one_on_ones_lost,penalties,on_report,sin_bins,send_offs,stint_one,stint_two
match_id,team,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
1,Broncos,Darius Boyd,18,22,1,1,80.0,0,0,0,0,...,0,1,1,0,0,0,0,0,80.0,
1,Dragons,Matthew Dufty,1,385,14,1,80.0,0,0,0,0,...,0,0,0,0,1,0,0,0,80.0,
2,Knights,Kalyn Ponga,35,209,8,1,89.0,10,1,3,3,...,0,0,0,0,0,0,0,0,89.0,
2,Sea Eagles,Tom Trbojevic,52,151,6,1,89.0,0,0,0,0,...,0,3,2,0,0,0,0,0,89.0,
3,Cowboys,Ben Hampton,69,244,10,1,80.0,0,0,0,0,...,0,4,2,0,0,0,0,0,80.0,
3,Sharks,Valentine Holmes,86,472,4,1,80.0,0,0,0,0,...,0,1,1,0,1,0,0,0,80.0,
4,Roosters,James Tedesco,120,432,15,1,80.0,0,0,0,0,...,0,3,2,0,0,0,0,0,80.0,
4,Tigers,Tuimoala Lolohea,103,521,16,1,80.0,6,0,1,1,...,0,0,0,0,0,0,0,0,80.0,
5,Rabbitohs,Alex Johnston,137,346,13,1,80.0,0,0,0,0,...,0,3,2,0,0,0,0,0,80.0,
5,Warriors,Roger Tuivasa-sheck,154,531,9,1,80.0,0,0,0,0,...,0,0,0,0,0,0,0,0,80.0,
