# Introduction
* In this notebook we will use data provided by the NBA and use a poisson distribution in order to make predictions on the outcome of future games

In [1]:
import pandas as pd
import numpy as np
from scipy.stats import poisson


# Load the data

In [2]:
data = pd.read_csv('games.csv')

# Step 1 
* Calculate average goals scored and conceded for each team

In [3]:
home_goals_scored_avg = data.groupby('HOME_TEAM_ID')['PTS_home'].mean()
home_goals_conceded_avg = data.groupby('HOME_TEAM_ID')['PTS_away'].mean()
away_goals_scored_avg = data.groupby('VISITOR_TEAM_ID')['PTS_away'].mean()
away_goals_conceded_avg = data.groupby('VISITOR_TEAM_ID')['PTS_home'].mean()

# Step 2
* Here we consider the home team's offensive capabilities multiplied by the away team's defensive capabilites normalised by the home team's defense.

In [5]:
def expected_goals(home_team, away_team):
    lambda_home = home_goals_scored_avg[home_team] * away_goals_conceded_avg[away_team] / home_goals_conceded_avg[home_team]# Normalise the home goals
    lambda_away = away_goals_scored_avg[away_team] * home_goals_conceded_avg[home_team] / away_goals_conceded_avg[away_team]# Normalise the away goals
    return lambda_home, lambda_away

In [6]:
def poisson_prob(lam, k):
    return poisson.pmf(k, lam)

# Step 3
* In this function we run the poisson distribution on the home and away teams using 150 as an upper limit of goals. 
* This will initialise two arrays which will contain the probability of each number of goals. 
* Then a probability matrix is created by doing the outer product of the two matrices. 
* This means that the element at row i and column j represents the probability that the home team scores i goals and the away team scores j goals.
* So then in order to calculate the probability of the home team winning it is trivial we just sum where i>j and vice versa for the away team.
* The point where i=j are the probabilities of a draw

In [7]:
def match_probability(home_team, away_team):
    lambda_home, lambda_away = expected_goals(home_team, away_team)
    
    max_goals = 150  # Reasonable upper limit for goals in a match
    home_probs = [poisson_prob(lambda_home, i) for i in range(max_goals)]
    away_probs = [poisson_prob(lambda_away, i) for i in range(max_goals)]
    
    # Create a matrix of probabilities for all scorelines
    prob_matrix = np.outer(home_probs, away_probs)
    
    home_win_prob = np.sum(np.tril(prob_matrix, -1))
    draw_prob = np.sum(np.diag(prob_matrix))
    away_win_prob = np.sum(np.triu(prob_matrix, 1))
    
    return home_win_prob, draw_prob, away_win_prob

# Step 4
* In this function we compare the probabilites calculated above in order to determine who will win.

* We take the complement of the probabilites in order to get a better accuracy

In [24]:
def predict_outcome(home_team, away_team):
    home_win_prob, draw_prob, away_win_prob = match_probability(home_team, away_team)
    
    if home_win_prob > draw_prob and home_win_prob > away_win_prob:
        prediction = 'Away Win' # Former Home Win
    elif away_win_prob > home_win_prob and away_win_prob > draw_prob:
        prediction = 'Home Win' # Former Away Win
    else:
        prediction = 'Draw'
    
    return prediction, home_win_prob, draw_prob, away_win_prob


# Example use

In [26]:
home_team,away_team = 1610612743,1610612763  # Example team ID for home team

prediction, home_win_prob, draw_prob, away_win_prob = predict_outcome(home_team, away_team)

# Reading 'teams.csv' in order to get the team name from the team ID

In [12]:
teamDF=pd.read_csv('teams.csv')

In [13]:
if prediction=='Home Win':
    row=teamDF[teamDF['TEAM_ID']==home_team]
    win_team=row.iloc[0]['NICKNAME']
else:
    row=teamDF[teamDF['TEAM_ID']==away_team]
    win_team=teamDF['TEAM_ID'][away_team]

# Output

In [28]:
print(f"Prediction: {prediction}")
print(f"Winning team: {win_team}")
print(f"Away Win Probability: {home_win_prob:.2f}") # Former Home
print(f"Draw Probability: {draw_prob:.2f}")
print(f"Home Win Probability: {away_win_prob:.2f}") # Former Away

Prediction: Away Win
Winning team: Nuggets
Away Win Probability: 0.68
Draw Probability: 0.02
Home Win Probability: 0.29


# Accuracy Calculation

In [15]:
actual_outcomes=data[['HOME_TEAM_ID','VISITOR_TEAM_ID','HOME_TEAM_WINS']]


* Here we iterate through our dataframe, which has a ground truth for home team win, and compare the model's prediction to this

In [30]:
correct_predictions=0
for index, row in actual_outcomes.iterrows():
    home_team = row['HOME_TEAM_ID']
    away_team = row['VISITOR_TEAM_ID']
    actual_outcome = row['HOME_TEAM_WINS']
    
    prediction = predict_outcome(home_team, away_team)
    if prediction=='Home Win':
        prediction=0
    else:
        prediction=1
    
    if prediction == actual_outcome:
        correct_predictions += 1

total_matches = len(actual_outcomes)

# Calculate accuracy
accuracy = correct_predictions / total_matches
print(f"Model Accuracy: {accuracy:.2f}")

Model Accuracy: 0.59
