## Predicting Runs Scored: Data Wrangling
- #### In the last lesson we decided to switch our focus to predicting runs scored
- #### This could be useful in several capacities:
  - #### For predicting the over/under
  - #### To create features for predicting the game winner
- #### We will focus on predicting the distribution of runs scored - that is, putting a probability on each possible value of runs scored for a team (up to some maximum value)
- #### Predicting the distribution of a numeric target variable is known as "probabilistic regression"
- #### In this notebook, we will "wrangle" our data such that each row represents a hitting team against a pitching team
- #### The goal will be to predict the (distribution of the) number of runs scored by the hitting team

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

# Raise both pandas display limits to 1000 so wide/long frames are shown
# without truncation when a cell displays them.
for _display_option in ('display.max_columns', 'display.max_rows'):
    pd.set_option(_display_option, 1000)

In [2]:
# Load the game-level data that the rest of the notebook reshapes.
# NOTE(review): the leftover cell output below looks like the tail of a pandas
# DtypeWarning (mixed-type columns) -- confirm. low_memory=False makes pandas
# infer each column's dtype from the whole file instead of chunk by chunk,
# which is the remedy that warning suggests.
df = pd.read_csv('df_bp9.csv', low_memory=False)

  exec(code_obj, self.user_global_ns, self.user_ns)


## Create df for runs of a single team
- #### We want to "split" each game into two rows:
    - #### One where the home team bats against the visiting pitchers (and we try to predict the runs of the home team)
    - #### The other where the roles are switched: the visiting team bats against the home pitchers (and we try to predict the runs of the visiting team)
    - #### Then we can model how many runs we expect based on the hitters for that team and the opposing pitching

In [3]:
# Reference columns copied unchanged into both the home-hitting and the
# visitor-hitting frames built later in the notebook.
cols_ref = [
    'season', 'date', 'dblheader_code', 'date_dblhead',
    'team_h', 'team_v',
    'league_h', 'league_v',
    'over_under_line', 'over_under_result',
    'ballpark_id', 'day_night',
]

# Runs columns for the two sides of a game ('_h' = home, '_v' = visitor).
target_cols = ['runs_' + side for side in ('h', 'v')]

In [4]:
# Statistic-name "stems": each one is combined with a window size (and a
# '_h' / '_v' side suffix) to form an actual column name.
team_hit_stems = 'BATAVG OBP SLG OBS ERR SB CS'.split()
lineup_hit_stems = 'BATAVG OBP SLG OBS SLGmod SObat_perc'.split()
strt_pitch_stems = 'ERA WHIP SO_perc H_BB_perc TB_BB_perc FIP FIP_perc'.split()
bpen_pitch_stems = 'WHIP SO_perc H_BB_perc TB_BB_perc'.split()

# Window sizes available for each feature family
# (presumably the length of the trailing window -- confirm the unit upstream).
team_hit_winsizes = [30, 162]
lineup_hit_winsizes = [30, 75, 162, 350]
strt_pitch_winsizes = [10, 35, 75]
bpen_pitch_winsizes = list(strt_pitch_winsizes)  # same sizes, independent list

In [5]:
# Two parallel sets of column names are built:
#   version 'a': home hitting ('_h') against visitor pitching ('_v')
#   version 'b': visitor hitting ('_v') against home pitching ('_h')
# Both versions list their columns in the same order; only the side suffix differs.


def _team_hit_cols(side):
    """Team hitting columns '<stem>_<winsize>_<side>', window size outermost."""
    return [f'{stem}_{win}_{side}'
            for win in team_hit_winsizes
            for stem in team_hit_stems]


def _lineup_hit_cols(side):
    """Lineup hitting columns 'lineup<8|9>_<stem>_<winsize>[_w]_<side>'.

    Nesting order (outer to inner): window size, stem, plain/'_w' variant,
    then the '8' / '9' lineup variant.
    """
    names = []
    for win in lineup_hit_winsizes:
        for stem in lineup_hit_stems:
            for variant in ('', '_w'):
                for lineup_n in ('8', '9'):
                    names.append(f'lineup{lineup_n}_{stem}_{win}{variant}_{side}')
    return names


def _pitch_cols(prefix, stems, winsizes, side):
    """Pitching columns '<prefix>_<stem>_<winsize>_<side>', window size outermost."""
    return [f'{prefix}_{stem}_{win}_{side}' for win in winsizes for stem in stems]


team_hit_features_a = _team_hit_cols('h')
lineup_hit_features_a = _lineup_hit_cols('h')
start_pitch_features_a = _pitch_cols('Strt', strt_pitch_stems, strt_pitch_winsizes, 'v')
bpen_pitch_features_a = _pitch_cols('Bpen', bpen_pitch_stems, bpen_pitch_winsizes, 'v')

team_hit_features_b = _team_hit_cols('v')
lineup_hit_features_b = _lineup_hit_cols('v')
start_pitch_features_b = _pitch_cols('Strt', strt_pitch_stems, strt_pitch_winsizes, 'h')
bpen_pitch_features_b = _pitch_cols('Bpen', bpen_pitch_stems, bpen_pitch_winsizes, 'h')



In [6]:
def _one_side_frame(games, cols, home_hitting):
    """Return an independent copy of ``games`` restricted to ``cols``.

    Parameters
    ----------
    games : DataFrame with one row per game.
    cols : list of column labels to keep, in the order they should appear.
    home_hitting : value written to the added 'home_hitting' column
        (1 when the home team is the hitting team, 0 otherwise).

    Returns
    -------
    DataFrame with the columns in ``cols`` followed by 'home_hitting'.
    """
    # Explicit copy: the new column must never be written into a slice of `games`.
    side = games.loc[:, cols].copy()
    side['home_hitting'] = home_hitting
    return side


# Feature columns of each version, in matching order.
feats_a = (team_hit_features_a + lineup_hit_features_a
           + start_pitch_features_a + bpen_pitch_features_a)
feats_b = (team_hit_features_b + lineup_hit_features_b
           + start_pitch_features_b + bpen_pitch_features_b)

cols_a = cols_ref + ['runs_h'] + feats_a
cols_b = cols_ref + ['runs_v'] + feats_b

# Drop the trailing '_h' / '_v' so both versions share one set of column names.
stripped_feats = [name[:-2] for name in feats_a]

# The rename below is positional, so a mismatch between the two versions would
# silently mislabel df_b's columns. Fail loudly instead.
assert stripped_feats == [name[:-2] for name in feats_b], \
    "version 'a' and version 'b' feature lists are not aligned"

final_col_list = cols_ref + ['runs'] + stripped_feats + ['home_hitting']

df_a = _one_side_frame(df, cols_a, 1)   # home team hitting, visitor pitching
df_b = _one_side_frame(df, cols_b, 0)   # visiting team hitting, home pitching

df_a.columns = final_col_list
df_b.columns = final_col_list

# Stack the two halves: two rows per game. Each half keeps its original row
# labels, so every label of `df` appears twice in df_runs' index.
df_runs = pd.concat((df_a, df_b))

In [7]:
# Eyeball five randomly chosen rows of the stacked frame (no seed is set, so
# the rows shown differ from run to run).
df_runs.sample(n=5)

Unnamed: 0,season,date,dblheader_code,date_dblhead,team_h,team_v,league_h,league_v,over_under_line,over_under_result,ballpark_id,day_night,runs,BATAVG_30,OBP_30,SLG_30,OBS_30,ERR_30,SB_30,CS_30,BATAVG_162,OBP_162,SLG_162,OBS_162,ERR_162,SB_162,CS_162,lineup8_BATAVG_30,lineup9_BATAVG_30,lineup8_BATAVG_30_w,lineup9_BATAVG_30_w,lineup8_OBP_30,lineup9_OBP_30,lineup8_OBP_30_w,lineup9_OBP_30_w,lineup8_SLG_30,lineup9_SLG_30,lineup8_SLG_30_w,lineup9_SLG_30_w,lineup8_OBS_30,lineup9_OBS_30,lineup8_OBS_30_w,lineup9_OBS_30_w,lineup8_SLGmod_30,lineup9_SLGmod_30,lineup8_SLGmod_30_w,lineup9_SLGmod_30_w,lineup8_SObat_perc_30,lineup9_SObat_perc_30,lineup8_SObat_perc_30_w,lineup9_SObat_perc_30_w,lineup8_BATAVG_75,lineup9_BATAVG_75,lineup8_BATAVG_75_w,lineup9_BATAVG_75_w,lineup8_OBP_75,lineup9_OBP_75,lineup8_OBP_75_w,lineup9_OBP_75_w,lineup8_SLG_75,lineup9_SLG_75,lineup8_SLG_75_w,lineup9_SLG_75_w,lineup8_OBS_75,lineup9_OBS_75,lineup8_OBS_75_w,lineup9_OBS_75_w,lineup8_SLGmod_75,lineup9_SLGmod_75,lineup8_SLGmod_75_w,lineup9_SLGmod_75_w,lineup8_SObat_perc_75,lineup9_SObat_perc_75,lineup8_SObat_perc_75_w,lineup9_SObat_perc_75_w,lineup8_BATAVG_162,lineup9_BATAVG_162,lineup8_BATAVG_162_w,lineup9_BATAVG_162_w,lineup8_OBP_162,lineup9_OBP_162,lineup8_OBP_162_w,lineup9_OBP_162_w,lineup8_SLG_162,lineup9_SLG_162,lineup8_SLG_162_w,lineup9_SLG_162_w,lineup8_OBS_162,lineup9_OBS_162,lineup8_OBS_162_w,lineup9_OBS_162_w,lineup8_SLGmod_162,lineup9_SLGmod_162,lineup8_SLGmod_162_w,lineup9_SLGmod_162_w,lineup8_SObat_perc_162,lineup9_SObat_perc_162,lineup8_SObat_perc_162_w,lineup9_SObat_perc_162_w,lineup8_BATAVG_350,lineup9_BATAVG_350,lineup8_BATAVG_350_w,lineup9_BATAVG_350_w,lineup8_OBP_350,lineup9_OBP_350,lineup8_OBP_350_w,lineup9_OBP_350_w,lineup8_SLG_350,lineup9_SLG_350,lineup8_SLG_350_w,lineup9_SLG_350_w,lineup8_OBS_350,lineup9_OBS_350,lineup8_OBS_350_w,lineup9_OBS_350_w,lineup8_SLGmod_350,lineup9_SLGmod_350,lineup8_SLGmod_350_w,lineup9_SLGmod_350_w,lineup8_SObat_perc_350,lineup9_SObat_perc_350,lineup8
_SObat_perc_350_w,lineup9_SObat_perc_350_w,Strt_ERA_10,Strt_WHIP_10,Strt_SO_perc_10,Strt_H_BB_perc_10,Strt_TB_BB_perc_10,Strt_FIP_10,Strt_FIP_perc_10,Strt_ERA_35,Strt_WHIP_35,Strt_SO_perc_35,Strt_H_BB_perc_35,Strt_TB_BB_perc_35,Strt_FIP_35,Strt_FIP_perc_35,Strt_ERA_75,Strt_WHIP_75,Strt_SO_perc_75,Strt_H_BB_perc_75,Strt_TB_BB_perc_75,Strt_FIP_75,Strt_FIP_perc_75,Bpen_WHIP_10,Bpen_SO_perc_10,Bpen_H_BB_perc_10,Bpen_TB_BB_perc_10,Bpen_WHIP_35,Bpen_SO_perc_35,Bpen_H_BB_perc_35,Bpen_TB_BB_perc_35,Bpen_WHIP_75,Bpen_SO_perc_75,Bpen_H_BB_perc_75,Bpen_TB_BB_perc_75,home_hitting
4262,1982,19820614,0,198206140,SDN,LAN,NL,NL,0.0,,SAN01,N,4,0.253227,0.308188,0.374379,0.682567,28.0,20.0,6.0,0.26271,0.325889,0.374344,0.700233,137.0,113.0,58.0,0.260019,0.247001,0.261158,0.249684,0.321535,0.304093,0.322518,0.307196,0.38602,0.361888,0.387089,0.365919,0.707555,0.665981,0.709606,0.673115,0.308654,0.291237,0.308923,0.293693,0.11026,0.109261,0.110748,0.109828,0.272026,0.256616,0.273142,0.259581,0.337085,0.315928,0.337649,0.319124,0.38349,0.357177,0.384864,0.36176,0.720575,0.673104,0.722513,0.680884,0.28999,0.271102,0.290068,0.273572,0.098926,0.098305,0.099331,0.098749,0.276346,0.2525,0.27744,0.256517,0.33746,0.307509,0.338068,0.311864,0.390713,0.354845,0.392254,0.360794,0.728173,0.662354,0.730323,0.672657,0.289584,0.263581,0.289878,0.26715,0.100491,0.094127,0.100866,0.095274,0.257296,0.231882,0.256783,0.234647,0.317076,0.285338,0.315892,0.288301,0.379835,0.341124,0.379466,0.345708,0.696912,0.626461,0.695358,0.634009,0.285888,0.256979,0.285082,0.259925,0.096255,0.087782,0.096022,0.088648,4.334862,1.197248,0.105263,0.286184,0.453947,4.857798,1.161184,3.863208,1.415094,0.12296,0.326442,0.452666,4.221698,0.973885,3.741279,1.398256,0.122366,0.326988,0.439157,4.168605,0.974847,1.1,0.226667,0.294333,0.385,1.464286,0.146617,0.368421,0.451128,1.12628,0.168707,0.29932,0.361905,0
78685,2015,20150501,0,201505010,HOU,SEA,AL,AL,0.0,,HOU03,N,3,0.2364,0.289134,0.387735,0.676869,22.0,17.0,5.0,0.245273,0.297145,0.379108,0.676253,85.0,104.0,40.0,0.249616,0.243592,0.249854,0.244573,0.302146,0.290285,0.302986,0.292551,0.405047,0.384307,0.404106,0.386093,0.707192,0.674592,0.707092,0.678644,0.387556,0.36876,0.387046,0.370687,0.172584,0.17512,0.172481,0.174704,0.262022,0.246241,0.262012,0.248237,0.321543,0.300631,0.322148,0.303834,0.421685,0.389646,0.419953,0.392152,0.743228,0.690276,0.742101,0.695986,0.394813,0.367241,0.394165,0.370159,0.167262,0.162011,0.167517,0.162908,0.263006,0.239956,0.264053,0.24383,0.327707,0.298154,0.329363,0.303404,0.425578,0.38515,0.426599,0.391209,0.753285,0.683304,0.755963,0.694613,0.41287,0.37454,0.413298,0.379796,0.176752,0.163286,0.176424,0.1647,0.261536,0.235334,0.262443,0.239481,0.329858,0.296382,0.331124,0.301778,0.42616,0.381984,0.427612,0.388907,0.756018,0.678366,0.758736,0.690685,0.424294,0.380642,0.425631,0.387395,0.182427,0.165014,0.18277,0.167536,3.8,1.266667,0.211667,0.311583,0.37625,2.738133,0.684533,4.555556,1.385714,0.179724,0.315668,0.43318,3.786362,0.916055,4.17,1.43,0.166922,0.328484,0.437213,3.963333,0.910413,0.95122,0.242718,0.252427,0.31068,0.940299,0.234597,0.248815,0.369668,1.120879,0.233939,0.288485,0.401212,0
86392,2018,20180526,0,201805260,LAN,SDN,NL,NL,0.0,,LOS03,N,5,0.227273,0.305808,0.39697,0.702777,23.0,9.0,1.0,0.24233,0.322491,0.427154,0.749645,88.0,72.0,27.0,0.247186,0.223425,0.247733,0.226938,0.338949,0.308695,0.340452,0.313896,0.433779,0.389285,0.435465,0.39646,0.772728,0.69798,0.775916,0.710356,0.475424,0.487414,0.479021,0.489139,0.190979,0.23087,0.192527,0.2272,0.232818,0.216579,0.233503,0.21926,0.324461,0.303225,0.324633,0.306078,0.404048,0.370265,0.405653,0.376006,0.728509,0.673489,0.730287,0.682085,0.474452,0.483957,0.475054,0.483293,0.204576,0.237401,0.204847,0.233475,0.250378,0.229074,0.251031,0.23237,0.339393,0.311285,0.33948,0.314935,0.43941,0.397788,0.439944,0.403559,0.778803,0.709073,0.779424,0.718493,0.505782,0.496909,0.50471,0.497069,0.221255,0.240224,0.220871,0.237468,0.23425,0.211238,0.233466,0.213454,0.315588,0.284967,0.313756,0.287204,0.418627,0.375446,0.416166,0.37871,0.734215,0.660413,0.729922,0.665913,0.486103,0.453996,0.481781,0.454172,0.215263,0.211504,0.213641,0.210516,3.577778,1.083333,0.228333,0.271583,0.409583,3.250667,0.812667,5.787302,1.480952,0.198095,0.337071,0.510595,5.07421,1.268552,5.61037,1.495556,0.186222,0.348011,0.493167,4.922987,1.230747,1.409639,0.194915,0.330508,0.483051,1.35,0.239089,0.324478,0.449715,1.317343,0.247986,0.319606,0.464637,1
50210,2003,20030615,0,200306150,NYA,SLN,AL,NL,0.0,,NYC16,D,5,0.254528,0.332195,0.443279,0.775474,25.0,22.0,5.0,0.272327,0.349449,0.453056,0.802505,127.0,109.0,35.0,0.270686,0.267319,0.270225,0.267331,0.358456,0.352059,0.35847,0.352885,0.486491,0.470898,0.487368,0.473671,0.844947,0.822957,0.845838,0.826556,0.492738,0.472404,0.495539,0.477517,0.186189,0.180251,0.187386,0.182086,0.266712,0.263826,0.266964,0.264421,0.348944,0.342739,0.349649,0.344164,0.461243,0.44703,0.462281,0.449773,0.810186,0.78977,0.811929,0.793937,0.461673,0.442305,0.463427,0.446349,0.178479,0.173335,0.179033,0.174489,0.268101,0.251686,0.269061,0.254638,0.354792,0.332861,0.355954,0.336696,0.459442,0.426912,0.461465,0.432871,0.814235,0.759772,0.817419,0.769567,0.457247,0.423589,0.459606,0.429995,0.17252,0.161238,0.17354,0.163593,0.256011,0.233756,0.258042,0.238418,0.334344,0.30529,0.3366,0.311019,0.44618,0.405176,0.449349,0.413247,0.780525,0.710466,0.785949,0.724265,0.441148,0.400068,0.44371,0.407601,0.166448,0.151605,0.167528,0.154465,2.958904,1.123288,0.165563,0.271523,0.390728,2.890411,0.698675,2.182979,0.982979,0.184466,0.249191,0.350593,2.434043,0.617044,3.305389,1.171657,0.167392,0.283986,0.423319,3.638723,0.881955,1.991597,0.103627,0.409326,0.626943,1.608997,0.135392,0.368171,0.543943,1.478261,0.1542,0.352129,0.528193,1
56305,2005,20050920,0,200509200,CHA,CLE,AL,AL,0.0,,CHI12,N,6,0.281773,0.339075,0.510345,0.84942,21.0,14.0,7.0,0.270449,0.330156,0.44872,0.778876,109.0,67.0,40.0,0.284126,0.27896,0.285135,0.280526,0.345745,0.341362,0.347325,0.343346,0.491955,0.493399,0.495148,0.496099,0.837699,0.834761,0.842473,0.839445,0.453546,0.462211,0.455832,0.463174,0.179157,0.181273,0.178742,0.180629,0.295324,0.293011,0.295942,0.293863,0.356893,0.355198,0.357989,0.356403,0.485754,0.486247,0.488414,0.488586,0.842647,0.841445,0.846403,0.84499,0.439171,0.444941,0.441598,0.4464,0.179073,0.181714,0.179054,0.181362,0.281321,0.277075,0.282265,0.278466,0.348732,0.345235,0.349842,0.346682,0.469432,0.46574,0.470964,0.467593,0.818163,0.810975,0.820807,0.814275,0.443667,0.447075,0.444359,0.447267,0.180013,0.182475,0.179906,0.182067,0.275335,0.272986,0.276138,0.27401,0.34305,0.341333,0.3438,0.342229,0.452415,0.452105,0.453601,0.453214,0.795465,0.793437,0.797401,0.795443,0.437953,0.442132,0.437888,0.441542,0.18458,0.1864,0.184169,0.185798,4.432836,1.402985,0.136054,0.319728,0.47619,4.761194,1.085034,3.139535,1.151163,0.151141,0.282319,0.402091,3.430233,0.841255,3.561558,1.234296,0.151322,0.298541,0.423428,3.823492,0.924795,1.352941,0.233577,0.335766,0.459854,1.195652,0.217631,0.30303,0.443526,1.243781,0.200739,0.307882,0.435961,0


In [8]:
# Write the stacked frame to disk; index=False leaves the row labels out of the file.
df_runs.to_csv(path_or_buf='df_runs_bp11.csv', index=False)