**Team Visualization**

Goal: For all 30 teams, compare the *expected* number of runs scored per game calculated using the Markov Chain to the *actual* number of runs scored per game in 2016.

In [1]:
import sys
sys.path.insert(0, '../')
from markov_functions import (find_team_atbats, import_raw_batting_data, run_matrix,
                            find_states, make_empty_transition_matrix, make_team_transition_matrix,
                            make_transition_matrix, team_markov, team_markov_from_raw)

batting = import_raw_batting_data(verbose = False)

In [2]:
team_markov_from_raw(batting, "MIN")
# Should be closer to 4.46, not 5.81307. Looks better now.

4.6844342545587079

In [3]:
# Now find the expected runs for 9 innings of all 30 teams
# (This takes some time to run...)
team_codes = batting.visteam.unique()
team_runs_per_9 = {team:team_markov_from_raw(batting, team) for team in team_codes}
print(team_runs_per_9)

{'MIN': 4.6844342545587079, 'MIL': 4.3461280169796321, 'ARI': 4.7041030210006261, 'HOU': 4.7745029852071728, 'ATL': 3.9470153709911746, 'KCA': 3.8286590512036978, 'NYA': 4.2308523605258275, 'TEX': 5.0078809712598868, 'CLE': 4.8554977751575192, 'SFN': 4.4012193470178138, 'CIN': 4.4600595531244842, 'SDN': 3.9992322207942084, 'PIT': 4.162041724401039, 'SLN': 5.0718486251709258, 'LAN': 4.6185163186453613, 'CHN': 5.3362637683535228, 'ANA': 4.26727297561896, 'DET': 4.9974908578012327, 'WAS': 4.6985060301927106, 'CHA': 4.3232651162359277, 'SEA': 4.9222541640349551, 'MIA': 4.0525026147268592, 'PHI': 3.970970076363074, 'BAL': 5.3088519482244161, 'OAK': 3.9907234778001568, 'COL': 5.3723549286181722, 'TOR': 4.6501165729856, 'TBA': 4.5004926537691254, 'NYN': 4.7717745267503844, 'BOS': 5.5764046436862351}


How do these predicted runs-per-9-innings compare to the actual 2016 runs per game? We'll compare the two using data from http://www.baseball-reference.com/leagues/MLB/2016.shtml. Unfortunately, our data set uses a set of team codes that is slightly different from this data, so we'll re-code the index of our imported data frame accordingly.

In [5]:
import pandas as pd
team_2016_data = pd.read_csv('../Baseball_Data/team_2016_stats.csv', index_col = 'Tm')
# Data from http://www.baseball-reference.com/leagues/MLB/2016.shtml

code_exchange = {'ARI':'ARI', 'ATL':'ATL', 'BAL':'BAL', 'BOS':'BOS', 'CHC':'CHN', 'CHW':'CHA', 
                 'CIN':'CIN', 'CLE':'CLE', 'COL':'COL', 'DET':'DET', 'HOU':'HOU', 'KCR':'KCA', 
                 'LAA':'ANA', 'LAD':'LAN', 'MIA':'MIA', 'MIL':'MIL', 'MIN':'MIN', 'NYM':'NYN', 
                 'NYY':'NYA', 'OAK':'OAK', 'PHI':'PHI', 'PIT':'PIT', 'SDP':'SDN', 'SEA':'SEA', 
                 'SFG':'SFN', 'STL':'SLN', 'TBR':'TBA', 'TEX':'TEX', 'TOR':'TOR', 'WSN':'WAS'}


team_2016_data.index = [code_exchange[i] for i in team_2016_data.index]
print(team_2016_data['R/G'])

ARI    4.64
ATL    4.03
BAL    4.59
BOS    5.42
CHN    4.99
CHA    4.23
CIN    4.42
CLE    4.83
COL    5.22
DET    4.66
HOU    4.47
KCA    4.17
ANA    4.43
LAN    4.48
MIA    4.07
MIL    4.14
MIN    4.46
NYN    4.14
NYA    4.20
OAK    4.03
PHI    3.77
PIT    4.50
SDN    4.23
SEA    4.74
SFN    4.41
SLN    4.81
TBA    4.15
TEX    4.72
TOR    4.69
WAS    4.71
Name: R/G, dtype: float64


In [6]:
from bokeh.io import output_notebook, show, vform, curdoc
from bokeh.plotting import figure
from bokeh.models import HoverTool
from bokeh.models import Range1d
from bokeh.models import ColumnDataSource, CustomJS
from bokeh.models.widgets import Select
from bokeh.models.annotations import Label
from bokeh.layouts import column, row
output_notebook()

In [7]:
# create a new plot with default tools, using figure

team_league = ['AL', 'AL', 'AL', 'NL', 'AL', 'AL', 'NL', 'AL', 'NL', 'AL', 'AL', 'AL',
 'AL', 'NL', 'AL', 'AL', 'NL', 'AL', 'NL', 'NL', 'NL', 'AL', 'NL', 'NL',
 'NL', 'NL', 'NL', 'NL', 'NL', 'NL']

expected_runs = [team_runs_per_9[i] for i in team_codes]
#print(exp_runs)

actual_runs = [team_2016_data.at[i,'R/G'] for i in team_codes]
#print(actual_runs)

source = ColumnDataSource(
        data={
            "expected_runs":expected_runs,
            "actual_runs":actual_runs,
            "team":team_codes,
            "league":team_league
        }
    )

hover = HoverTool(
        tooltips=[
            ("Team", "@team"),
            ("Expected Runs", "@expected_runs"),
            ("Actual Runs", "@actual_runs"),
            ("League", "@league")
        ]
    )


p = figure(plot_width=600, plot_height=600, tools=[hover], title="This Markov Chain Prediction is Quite Awful!")
p.x_range = Range1d(3, 6)
p.y_range = Range1d(3, 6)

p.xaxis.axis_label = "Expected Runs Per Game"
p.yaxis.axis_label = "Actual Runs Per Game"


p.circle(x="expected_runs", y="actual_runs", size=15, line_color="black", fill_color="firebrick", 
         fill_alpha=0.5, source=source)
p.line(x=[3,9],y=[3,9], line_width=4, line_color="navy", alpha=0.6)
p.line(x=[3,9],y=[3,9], line_width=150, line_color="navy", alpha=0.1)

show(p)

In [30]:
df = pd.DataFrame(index=team_codes)
df['expected_runs'] = expected_runs
df['actual_runs'] = [team_2016_data.at[i,'R/G'] for i in team_codes]
df['league'] = [ 'AL', 'AL', 'AL', 'NL', 'AL', 'AL', 'NL', 'AL', 'NL', 'AL', 'AL', 'AL',
                 'AL', 'NL', 'AL', 'AL', 'NL', 'AL', 'NL', 'NL', 'NL', 'AL', 'NL', 'NL',
                 'NL', 'NL', 'NL', 'NL', 'NL', 'NL']
print(df)
df.to_csv("team_2016_data_with_pred.csv")
import os
os.getcwd()

     expected_runs  actual_runs league
BAL       5.308852         4.59     AL
TOR       4.650117         4.69     AL
TBA       4.500493         4.15     AL
ATL       3.947015         4.03     NL
NYA       4.230852         4.20     AL
OAK       3.990723         4.03     AL
HOU       4.774503         4.47     NL
CLE       4.855498         4.83     AL
COL       5.372355         5.22     NL
SEA       4.922254         4.74     AL
CHA       4.323265         4.23     AL
ANA       4.267273         4.43     AL
TEX       5.007881         4.72     AL
SFN       4.401219         4.41     NL
MIN       4.684434         4.46     AL
DET       4.997491         4.66     AL
ARI       4.704103         4.64     NL
KCA       3.828659         4.17     AL
SDN       3.999232         4.23     NL
SLN       5.071849         4.81     NL
PIT       4.162042         4.50     NL
BOS       5.576405         5.42     AL
MIL       4.346128         4.14     NL
WAS       4.698506         4.71     NL
PHI       3.970970       

'/home/david/Documents/GitRepos/MarkovChainBaseball/Baseball_Data'

In [19]:
# create a new plot with default tools, using figure

team_league = ['AL', 'AL', 'AL', 'NL', 'AL', 'AL', 'NL', 'AL', 'NL', 'AL', 'AL', 'AL',
 'AL', 'NL', 'AL', 'AL', 'NL', 'AL', 'NL', 'NL', 'NL', 'AL', 'NL', 'NL',
 'NL', 'NL', 'NL', 'NL', 'NL', 'NL']

expected_runs = [team_runs_per_9[i] for i in team_codes]
#print(exp_runs)

actual_runs = [team_2016_data.at[i,'runs_per_game'] for i in team_codes]
#print(actual_runs)

#source = ColumnDataSource(
#        data=dict(
#            expected_runs=expected_runs,
#            actual_runs=actual_runs,
#            team=team_codes,
#            league=team_league
#        )
#    )

hover = HoverTool(
        tooltips=[
            ("Team", "@team"),
            ("Expected Runs", "@expected_runs"),
            ("Actual Runs", "@actual_runs"),
            ("League", "@league")
        ]
    )


def get_dataset(league_option):
    df = dict(
            expected_runs=expected_runs,
            actual_runs=actual_runs,
            team=team_codes,
            league=team_league
        )
    return ColumnDataSource(data=df)



def make_plot(source, title_text):
    p = figure(plot_width=600, plot_height=600, tools=[hover], title = title_text)
    p.x_range = Range1d(3, 7)
    p.y_range = Range1d(3, 7)

    p.xaxis.axis_label = "Expected Runs Per Game"
    p.yaxis.axis_label = "Actual Runs Per Game"


    p.circle(x=expected_runs, y=actual_runs, size=15, line_color="black", fill_color="firebrick", 
             fill_alpha=0.5, source=source)
    p.line(x=[3,9],y=[3,9], line_width=4, line_color="navy", alpha=0.6)
    p.line(x=[3,9],y=[3,9], line_width=150, line_color="navy", alpha=0.1)

    p.add_layout(Label(x=6.638, y=5.42, x_offset=-17, y_offset=10, text="BOS", text_color = "black"))
    p.add_layout(Label(x=5.121, y=3.77, x_offset=-15, y_offset=-30, text="PHI", text_color = "black"))
    p.add_layout(Label(x=4.705, y=4.030, x_offset=-17, y_offset=-30, text="OAK", text_color = "black"))
    p.add_layout(Label(x=5.917, y=4.14, x_offset=-12, y_offset=-30, text="MIL", text_color = "black"))

    return p # show the results

def update_plot(attrname, old, new):
    print("hello world")
    city = city_select.value
    plot.title.text = "Weather data for " + cities[city]['title']

    src = get_dataset(df, cities[city]['airport'], distribution_select.value)
    source.data.update(src.data)

    

output_notebook()
league = 'All Teams'
league_options = ['All Teams', 'AL', 'NL']

league_select = Select(value=league, title='League', options=league_options)
league_select.on_change('value', update_plot) 

source = get_dataset("AL")
plot = make_plot(source, "This is a Markov Chain Prediction")

controls = column(league_select)

curdoc().add_root(row(plot, controls))
curdoc().title = "Markov Chain Plot"

KeyError: 'runs_per_game'

In [None]:
from os.path import join, dirname
import datetime

import pandas as pd
from scipy.signal import savgol_filter

from bokeh.io import curdoc
from bokeh.layouts import row, column
from bokeh.models import ColumnDataSource, DataRange1d, Select
from bokeh.palettes import Blues4
from bokeh.plotting import figure

STATISTICS = ['record_min_temp', 'actual_min_temp', 'average_min_temp', 'average_max_temp', 'actual_max_temp', 'record_max_temp']

def get_dataset(src, name, distribution):
    df = src[src.airport == name].copy()
    del df['airport']
    df['date'] = pd.to_datetime(df.date)
    # timedelta here instead of pd.DateOffset to avoid pandas bug < 0.18 (Pandas issue #11925)
    df['left'] = df.date - datetime.timedelta(days=0.5)
    df['right'] = df.date + datetime.timedelta(days=0.5)
    df = df.set_index(['date'])
    df.sort_index(inplace=True)
    if distribution == 'Smoothed':
        window, order = 51, 3
        for key in STATISTICS:
            df[key] = savgol_filter(df[key], window, order)

    return ColumnDataSource(data=df)

def make_plot(source, title):
    plot = figure(x_axis_type="datetime", plot_width=800, tools="", toolbar_location=None)
    plot.title.text = title

    plot.quad(top='record_max_temp', bottom='record_min_temp', left='left', right='right',
              color=Blues4[2], source=source, legend="Record")
    plot.quad(top='average_max_temp', bottom='average_min_temp', left='left', right='right',
              color=Blues4[1], source=source, legend="Average")
    plot.quad(top='actual_max_temp', bottom='actual_min_temp', left='left', right='right',
              color=Blues4[0], alpha=0.5, line_color="black", source=source, legend="Actual")

    # fixed attributes
    plot.xaxis.axis_label = None
    plot.yaxis.axis_label = "Temperature (F)"
    plot.axis.axis_label_text_font_style = "bold"
    plot.x_range = DataRange1d(range_padding=0.0)
    plot.grid.grid_line_alpha = 0.3

    return plot

def update_plot(attrname, old, new):
    city = city_select.value
    plot.title.text = "Weather data for " + cities[city]['title']

    src = get_dataset(df, cities[city]['airport'], distribution_select.value)
    source.data.update(src.data)

city = 'Austin'
distribution = 'Discrete'

cities = {
    'Austin': {
        'airport': 'AUS',
        'title': 'Austin, TX',
    },
    'Boston': {
        'airport': 'BOS',
        'title': 'Boston, MA',
    },
    'Seattle': {
        'airport': 'SEA',
        'title': 'Seattle, WA',
    }
}

city_select = Select(value=city, title='City', options=sorted(cities.keys()))
distribution_select = Select(value=distribution, title='Distribution', options=['Discrete', 'Smoothed'])

df = pd.read_csv('2015_weather.csv')
source = get_dataset(df, cities[city]['airport'], distribution)
plot = make_plot(source, "Weather data for " + cities[city]['title'])

city_select.on_change('value', update_plot)
distribution_select.on_change('value', update_plot)

controls = column(city_select, distribution_select)

curdoc().add_root(row(plot, controls))
curdoc().title = "Weather"

In [None]:
df = pd.DataFrame(index=team_codes)
df['expected_runs'] = expected_runs
df['actual_runs'] = [team_2016_data.at[i,'runs_per_game'] for i in team_codes]
df['league'] = [ 'AL', 'AL', 'AL', 'NL', 'AL', 'AL', 'NL', 'AL', 'NL', 'AL', 'AL', 'AL',
                 'AL', 'NL', 'AL', 'AL', 'NL', 'AL', 'NL', 'NL', 'NL', 'AL', 'NL', 'NL',
                 'NL', 'NL', 'NL', 'NL', 'NL', 'NL']
#df.index = df['team_code']
print(df)
df.to_csv("team_visualization/team_data.csv")

It's clear from the graph that our model consistently **overestimates** the number of runs per game. Likely this is due to an error in the transition matrices that ignores non-transitions like pitching changes, etc. On the other hand, the shape looks pretty linear, which is good.