<h3> Predicting Sports Winners with Decision Trees</h3>
<p> Predicting the winner of sports matches using a
different type of classification algorithm: decision trees.</p>


<h4> Loading the dataset</h4>

In [264]:
import os
import numpy as np
import pandas as pd
home_folder = os.path.expanduser("~")
data_folder = os.path.join(home_folder, "Desktop", "IMaR","Data_Source")
data_filename = os.path.join(data_folder, "leagues_NBA_2014_games_games.csv")

In [265]:
results = pd.read_csv(data_filename)
results.head()

Unnamed: 0,Date,Unnamed: 1,Visitor/Neutral,PTS,Home/Neutral,PTS.1,Unnamed: 6,Notes
0,Tue Oct 29 2013,Box Score,Orlando Magic,87,Indiana Pacers,97,,
1,Tue Oct 29 2013,Box Score,Los Angeles Clippers,103,Los Angeles Lakers,116,,
2,Tue Oct 29 2013,Box Score,Chicago Bulls,95,Miami Heat,107,,
3,Wed Oct 30 2013,Box Score,Brooklyn Nets,94,Cleveland Cavaliers,98,,
4,Wed Oct 30 2013,Box Score,Atlanta Hawks,109,Dallas Mavericks,118,,


<h4> Cleaning up the dataset</h4>

In [266]:
results = pd.read_csv(data_filename)
# Fix the name of the columns
results.columns = ["Date", "Score Type", "Visitor Team", "VisitorPts", "Home Team","HomePts", "OT", "Notes"]

results.OT = results.OT.fillna("")
results.Notes = results.Notes.fillna("")

results.head()

Unnamed: 0,Date,Score Type,Visitor Team,VisitorPts,Home Team,HomePts,OT,Notes
0,Tue Oct 29 2013,Box Score,Orlando Magic,87,Indiana Pacers,97,,
1,Tue Oct 29 2013,Box Score,Los Angeles Clippers,103,Los Angeles Lakers,116,,
2,Tue Oct 29 2013,Box Score,Chicago Bulls,95,Miami Heat,107,,
3,Wed Oct 30 2013,Box Score,Brooklyn Nets,94,Cleveland Cavaliers,98,,
4,Wed Oct 30 2013,Box Score,Atlanta Hawks,109,Dallas Mavericks,118,,


In [267]:
#Extracting new features
results["HomeWin"] = results["VisitorPts"] < results["HomePts"]
# Our "class values" for NumPy
y_true = results["HomeWin"].values
results.head()

Unnamed: 0,Date,Score Type,Visitor Team,VisitorPts,Home Team,HomePts,OT,Notes,HomeWin
0,Tue Oct 29 2013,Box Score,Orlando Magic,87,Indiana Pacers,97,,,True
1,Tue Oct 29 2013,Box Score,Los Angeles Clippers,103,Los Angeles Lakers,116,,,True
2,Tue Oct 29 2013,Box Score,Chicago Bulls,95,Miami Heat,107,,,True
3,Wed Oct 30 2013,Box Score,Brooklyn Nets,94,Cleveland Cavaliers,98,,,True
4,Wed Oct 30 2013,Box Score,Atlanta Hawks,109,Dallas Mavericks,118,,,True


In [268]:
print("Home Win percentage: {0:.1f}%".format(100 * results["HomeWin"].sum() / results["HomeWin"].count()))

Home Win percentage: 58.0%


In [269]:
results["HomeLastWin"] = False
results["VisitorLastWin"] = False
# This creates two new columns, all set to False
results.loc[:5]

Unnamed: 0,Date,Score Type,Visitor Team,VisitorPts,Home Team,HomePts,OT,Notes,HomeWin,HomeLastWin,VisitorLastWin
0,Tue Oct 29 2013,Box Score,Orlando Magic,87,Indiana Pacers,97,,,True,False,False
1,Tue Oct 29 2013,Box Score,Los Angeles Clippers,103,Los Angeles Lakers,116,,,True,False,False
2,Tue Oct 29 2013,Box Score,Chicago Bulls,95,Miami Heat,107,,,True,False,False
3,Wed Oct 30 2013,Box Score,Brooklyn Nets,94,Cleveland Cavaliers,98,,,True,False,False
4,Wed Oct 30 2013,Box Score,Atlanta Hawks,109,Dallas Mavericks,118,,,True,False,False
5,Wed Oct 30 2013,Box Score,Washington Wizards,102,Detroit Pistons,113,,,True,False,False


In [270]:
# Now compute the actual values for these
# Did the home and visitor teams win their last game?
from collections import defaultdict
won_last = defaultdict(int)

for index, row in results.iterrows():  # Note that this is not efficient
    home_team = row["Home Team"]
    visitor_team = row["Visitor Team"]
    row["HomeLastWin"] = won_last[home_team] 
    row["VisitorLastWin"] = won_last[visitor_team]
    results.loc[index] = row    
    # Set current win
    won_last[home_team] = row["HomeWin"]
    won_last[visitor_team] = not row["HomeWin"]
results.loc[20:25]

Unnamed: 0,Date,Score Type,Visitor Team,VisitorPts,Home Team,HomePts,OT,Notes,HomeWin,HomeLastWin,VisitorLastWin
20,Fri Nov 1 2013,Box Score,Milwaukee Bucks,105,Boston Celtics,98,,,False,False,False
21,Fri Nov 1 2013,Box Score,Miami Heat,100,Brooklyn Nets,101,,,True,False,False
22,Fri Nov 1 2013,Box Score,Cleveland Cavaliers,84,Charlotte Bobcats,90,,,True,False,True
23,Fri Nov 1 2013,Box Score,Portland Trail Blazers,113,Denver Nuggets,98,,,False,False,False
24,Fri Nov 1 2013,Box Score,Dallas Mavericks,105,Houston Rockets,113,,,True,True,True
25,Fri Nov 1 2013,Box Score,San Antonio Spurs,91,Los Angeles Lakers,85,,,False,False,True


In [271]:
from sklearn.tree import DecisionTreeClassifier
clf = DecisionTreeClassifier(random_state=14)

In [272]:
from sklearn.cross_validation import cross_val_score

# Create a dataset with just the neccessary information
X_previouswins = results[["HomeLastWin", "VisitorLastWin"]].values

clf = DecisionTreeClassifier(random_state=14)
scores = cross_val_score(clf, X_previouswins, y_true, scoring='accuracy')
print("Using just the last result from the home and visitor teams")
print("Accuracy: {0:.1f}%".format(np.mean(scores) * 100))

Using just the last result from the home and visitor teams
Accuracy: 57.8%


In [273]:
# Let's try see which team is better on the ladder. Using the previous year's ladder
standings_filename = os.path.join(data_folder, "Expanded Standings.csv")
standings = pd.read_csv(standings_filename, skiprows=[0])
standings.head()

Unnamed: 0,Rk,Team,Overall,Home,Road,E,W,A,C,SE,...,Post,?3,?10,Oct,Nov,Dec,Jan,Feb,Mar,Apr
0,1,Miami Heat,66-16,37-4,29-Dec,41-11,25-May,14-Apr,12-Jun,15-Jan,...,30-2,09-Mar,39-8,1-0,10-Mar,10-May,08-May,12-Jan,17-Jan,08-Jan
1,2,Oklahoma City Thunder,60-22,34-7,26-15,21-Sep,39-13,07-Mar,08-Feb,06-Apr,...,21-Aug,03-Jun,44-6,,13-Apr,11-Feb,11-May,07-Apr,12-May,06-Feb
2,3,San Antonio Spurs,58-24,35-6,23-18,25-May,33-19,08-Feb,09-Jan,08-Feb,...,16-Dec,09-May,31-Oct,1-0,12-Apr,12-Apr,12-Mar,08-Mar,10-Apr,03-Jun
3,4,Denver Nuggets,57-25,38-3,19-22,19-Nov,38-14,05-May,10-0,04-Jun,...,24-Apr,11-Jul,28-Aug,0-1,08-Aug,09-Jun,12-Mar,08-Apr,13-Feb,07-Jan
4,5,Los Angeles Clippers,56-26,32-9,24-17,21-Sep,35-17,07-Mar,08-Feb,06-Apr,...,17-Sep,03-May,38-12,1-0,08-Jun,16-0,09-Jul,08-May,07-Jul,07-Jan


In [274]:
# We can create a new feature -- HomeTeamRanksHigher\
results["HomeTeamRanksHigher"] = 0
for index, row in results.iterrows():
    home_team = row["Home Team"]
    visitor_team = row["Visitor Team"]
    if home_team == "New Orleans Pelicans":
        home_team = "New Orleans Hornets"
    elif visitor_team == "New Orleans Pelicans":
        visitor_team = "New Orleans Hornets"
        
    home_rank = standings[standings["Team"] == home_team]["Rk"].values[0]
    visitor_rank = standings[standings["Team"] == visitor_team]["Rk"].values[0]
    row["HomeTeamRanksHigher"] = int(home_rank > visitor_rank)
    results.loc[index] = row
results.head()

Unnamed: 0,Date,Score Type,Visitor Team,VisitorPts,Home Team,HomePts,OT,Notes,HomeWin,HomeLastWin,VisitorLastWin,HomeTeamRanksHigher
0,Tue Oct 29 2013,Box Score,Orlando Magic,87,Indiana Pacers,97,,,True,0,0,0
1,Tue Oct 29 2013,Box Score,Los Angeles Clippers,103,Los Angeles Lakers,116,,,True,0,0,1
2,Tue Oct 29 2013,Box Score,Chicago Bulls,95,Miami Heat,107,,,True,0,0,0
3,Wed Oct 30 2013,Box Score,Brooklyn Nets,94,Cleveland Cavaliers,98,,,True,0,0,1
4,Wed Oct 30 2013,Box Score,Atlanta Hawks,109,Dallas Mavericks,118,,,True,0,0,1


In [275]:
X_homehigher =  results[["HomeLastWin", "VisitorLastWin", "HomeTeamRanksHigher"]].values
clf = DecisionTreeClassifier(random_state=14)
scores = cross_val_score(clf, X_homehigher, y_true, scoring='accuracy')
print("Using whether the home team is ranked higher")
print("Accuracy: {0:.1f}%".format(np.mean(scores) * 100))

Using whether the home team is ranked higher
Accuracy: 60.3%


In [276]:
from sklearn.grid_search import GridSearchCV

parameter_space = {"max_depth": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20],}
clf = DecisionTreeClassifier(random_state=14)
grid = GridSearchCV(clf, parameter_space)
grid.fit(X_homehigher, y_true)
print("Accuracy: {0:.1f}%".format(grid.best_score_ * 100))

Accuracy: 60.6%


In [277]:
# Who won the last match? We ignore home/visitor for this bit
last_match_winner = defaultdict(int)
results["HomeTeamWonLast"] = 0

for index, row in results.iterrows():
    home_team = row["Home Team"]
    visitor_team = row["Visitor Team"]
    
    teams = tuple(sorted([home_team, visitor_team]))  # Sort for a consistent ordering
    
    # Set in the row, who won the last encounter
    row["HomeTeamWonLast"] = 1 if last_match_winner[teams] == row["Home Team"] else 0
    results.loc[index] = row
    # Who won this one?
    winner = row["Home Team"] if row["HomeWin"] else row["Visitor Team"]
    last_match_winner[teams] = winner
results.head()

Unnamed: 0,Date,Score Type,Visitor Team,VisitorPts,Home Team,HomePts,OT,Notes,HomeWin,HomeLastWin,VisitorLastWin,HomeTeamRanksHigher,HomeTeamWonLast
0,Tue Oct 29 2013,Box Score,Orlando Magic,87,Indiana Pacers,97,,,True,0,0,0,0
1,Tue Oct 29 2013,Box Score,Los Angeles Clippers,103,Los Angeles Lakers,116,,,True,0,0,1,0
2,Tue Oct 29 2013,Box Score,Chicago Bulls,95,Miami Heat,107,,,True,0,0,0,0
3,Wed Oct 30 2013,Box Score,Brooklyn Nets,94,Cleveland Cavaliers,98,,,True,0,0,1,0
4,Wed Oct 30 2013,Box Score,Atlanta Hawks,109,Dallas Mavericks,118,,,True,0,0,1,0


In [278]:
X_home_higher =  results[["HomeTeamRanksHigher", "HomeTeamWonLast"]].values
clf = DecisionTreeClassifier(random_state=14)
scores = cross_val_score(clf, X_home_higher, y_true, scoring='accuracy')
print("Using whether the home team is ranked higher")
print("Accuracy: {0:.1f}%".format(np.mean(scores) * 100))


Using whether the home team is ranked higher
Accuracy: 60.6%


In [279]:
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
encoding = LabelEncoder()
encoding.fit(results["Home Team"].values)
home_teams = encoding.transform(results["Home Team"].values)
visitor_teams = encoding.transform(results["Visitor Team"].values)
X_teams = np.vstack([home_teams, visitor_teams]).T

onehot = OneHotEncoder()
X_teams = onehot.fit_transform(X_teams).todense()

clf = DecisionTreeClassifier(random_state=14)
scores = cross_val_score(clf, X_teams, y_true, scoring='accuracy')
print("Accuracy: {0:.1f}%".format(np.mean(scores) * 100))

Accuracy: 60.3%


In [280]:
from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier(random_state=14)
scores = cross_val_score(clf, X_teams, y_true, scoring='accuracy')
print("Using full team labels is ranked higher")
print("Accuracy: {0:.1f}%".format(np.mean(scores) * 100))

Using full team labels is ranked higher
Accuracy: 60.9%


In [281]:
X_all = np.hstack([X_home_higher, X_teams])
print(X_all.shape)

(1230, 62)


In [282]:
clf = RandomForestClassifier(random_state=14)
scores = cross_val_score(clf, X_all, y_true, scoring='accuracy')
print("Using whether the home team is ranked higher")
print("Accuracy: {0:.1f}%".format(np.mean(scores) * 100))

Using whether the home team is ranked higher
Accuracy: 61.1%


In [283]:
#n_estimators=10, criterion='gini', max_depth=None, 
#min_samples_split=2, min_samples_leaf=1,
#max_features='auto',
#max_leaf_nodes=None, bootstrap=True,
#oob_score=False, n_jobs=1,
#random_state=None, verbose=0, min_density=None, compute_importances=None
parameter_space = {"max_features": [2, 10, 'auto'],"n_estimators": [100,],"criterion": ["gini", "entropy"],"min_samples_leaf": [2, 4, 6],}
clf = RandomForestClassifier(random_state=14)
grid = GridSearchCV(clf, parameter_space)
grid.fit(X_all, y_true)
print("Accuracy: {0:.1f}%".format(grid.best_score_ * 100))
print(grid.best_estimator_)

Accuracy: 64.2%
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='entropy',
            max_depth=None, max_features=2, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=6, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=1,
            oob_score=False, random_state=14, verbose=0, warm_start=False)
