### The purpose of this note book is to combine 2019 and 2020 MLB stats for both pitchers and batters.

In [1]:
#imports
import pandas as pd

In [2]:
# read in 2019 and 2020 batter data
bat_19 = pd.read_csv('CapStone_Data/fanduel_2019 - batters.csv')
bat_20 = pd.read_csv('CapStone_Data/fanduel_2020 - batters.csv')

In [3]:
# take a look at both should be very similar
bat_19.head()

Unnamed: 0,Rank,Name,Team,POS,GMS,AB,R,H,2B,3B,...,RBI,BB,SO,SB,CS,AVG,OBP,SLG,OPS,PTS
0,1,Cody Bellinger,LAD,CF,156,558,121,170,34,3,...,115,95,108,15,5,0.305,0.406,0.629,1.035,2226.7
1,2,Ronald Acuna,ATL,CF,156,626,127,175,22,2,...,101,76,188,37,9,0.28,0.365,0.518,0.883,2208.9
2,3,Alex Bregman,HOU,3B,157,554,122,164,37,2,...,112,119,83,5,1,0.296,0.423,0.592,1.015,2180.4
3,4,Anthony Rendon,WSH,3B,146,545,117,174,44,3,...,126,80,86,5,1,0.319,0.412,0.598,1.01,2099.4
4,5,Rafael Devers,BOS,3B,156,647,129,201,54,4,...,115,48,119,8,8,0.311,0.361,0.555,0.916,2096.3


In [4]:
bat_20.head()

Unnamed: 0,Rank,Name,Team,POS,GMS,AB,R,H,2B,3B,...,RBI,BB,SO,SB,CS,AVG,OBP,SLG,OPS,PTS
0,1,Freddie Freeman,ATL,1B,60,214,51,73,23,1,...,53,45,37,2,0,0.341,0.462,0.64,1.102,915.7
1,2,Marcell Ozuna,ATL,LF,60,228,38,77,14,0,...,56,38,60,0,0,0.338,0.431,0.636,1.067,866.6
2,3,José Ramírez,CLE,3B,58,219,45,64,16,1,...,46,31,43,10,3,0.292,0.386,0.607,0.993,866.0
3,4,Fernando Tatis Jr.,SD,SS,59,224,50,62,11,2,...,45,27,61,11,3,0.277,0.366,0.571,0.937,863.5
4,5,José Abreu,CHW,1B,60,240,43,76,15,0,...,60,18,59,0,0,0.317,0.37,0.617,0.987,854.6


In [5]:
# check shape for each
print(f'2019: {bat_19.shape}')
print(f'2020: {bat_20.shape}')

2019: (636, 21)
2020: (583, 21)


Same number of columns, which is great.  2019 shows more rows, which will be used as base in merge.

In [24]:
# read in test data
batters = pd.read_csv('CapStone_Data/fanduel_2021 - batters.csv')

In [25]:
    # Create function for batters
def batter_combine(batters):
        # step 1 merge
        batters = bat_19.merge(bat_20, how='left', on='Name')

        # step 2 handle nulls
        batters.fillna(0, inplace=True)

        # step 3 create list for integer stats
        # create a list of integer stats
        col_list = ['GMS_', 'AB_', 'R_', 'H_', '2B_', '3B_', 'HR_', 'RBI_', 'BB_', 'SO_', 'SB_', 'CS_', 'PTS_']

        # for each column in the list of int stats
        for col in col_list:
            # create name for col
            col_name = str(col).replace('_','')
            # create list for data
            col_data = []
            # set counter for y column
            counter = 0

            # for each row in the specific column
            for row in batters[f'{col}x']:
                # sum similar cols somehow; possibly use x,y f string
                col_data.append(row + batters[f'{col}y'][counter])
                # increase counter
                counter += 1

            # add col to df
            batters[col_name] = col_data
            # drop x, y cols for col
            batters.drop(columns=[f'{col}x', f'{col}y'], inplace=True)

        # step 4 drop and rename columns
        # create drop list
        drop_list = ['Rank_', 'AVG_', 'OBP_', 'SLG_', 'OPS_']
        # loop through list and drop 
        for col in drop_list:
            batters.drop(columns=[f'{col}x', f'{col}y'], inplace=True)
        # drop duplicate cols
        batters.drop(columns=['Team_y', 'POS_y'], inplace=True)
        # rename remaining team, pos columns
        batters.rename(columns={'Team_x':'Team', 'POS_x': 'POS'}, inplace=True)

        # step 5 total bases column
        # total bases
        total_bases = []
        counter2 = 0
        for row in batters['H']:
            # define singles
            single = row - (batters['2B'][counter2] + batters['3B'][counter2] + batters['HR'][counter2])
            # create total bases for each player and add to total bases list 
            total_bases.append(single + ((batters['2B'][counter2])*2) + ((batters['3B'][counter2])*3) + ((batters['HR'][counter2])*4))
            # increase counter
            counter2 += 1

        # add list as column to dataframe
        batters['TB'] = total_bases

        # step 6 create percentage stat categories
        # create batting avg list
        bat_avg = []
        # create slugging percentage list 
        slg_list = []
        # create on base percentage list
        obp = []
        # create ops list
        ops = []
        # fantasy points per game
        fppg = []
        # for each row calculate stats
        for i in batters.index:
            #calculate batting average and add to list 
            bat_avg.append(round((batters['H'][i]/batters['AB'][i]), 4))
            #calculate slugging percentage and add to list
            slg_list.append(round((batters['TB'][i]/batters['AB'][i]), 4))
            #calculate on base percentage and add to list 
            obp.append(round(((batters['H'][i]+batters['BB'][i])/(batters['AB'][i]+batters['BB'][i])),4))
            #calculate fppg and add to list
            fppg.append(batters['PTS'][i]/batters['GMS'][i])

        # create seperate loop for ops
        for x,y in zip(obp, slg_list):
            ops.append(round((x+y), 4))

        # add stats to dataframe
        batters['AVG'] = bat_avg
        batters['SLG'] = slg_list
        batters['OBP'] = obp
        batters['OPS'] = ops
        batters['FPPG'] = fppg

        # step 7 drop nulls if any 
        batters.dropna(inplace=True)

        # RETURN
        return batters

In [26]:
# seperate function for test data 
def testing_data_clean(batters):
        # total bases
        total_bases = []
        counter2 = 0
        for row in batters['H']:
            # define singles
            single = row - (batters['2B'][counter2] + batters['3B'][counter2] + batters['HR'][counter2])
            # create total bases for each player and add to total bases list 
            total_bases.append(single + ((batters['2B'][counter2])*2) + ((batters['3B'][counter2])*3) + ((batters['HR'][counter2])*4))
            # increase counter
            counter2 += 1

        # add list as column to dataframe
        batters['TB'] = total_bases
        
        # fantasy points per game
        fppg = []
        
        for i in batters.index:
            #calculate fppg and add to list
            fppg.append(batters['PTS'][i]/batters['GMS'][i])
        # add to df
        batters['FPPG'] = fppg
        
        # drop rank
        batters.drop(columns='Rank', inplace=True)
        
        # match same column order as train data
        batters = batters[['Name', 'TEAM', 'POS', 'GMS', 'AB', 'R', 'H', '2B', '3B', 'HR', 'RBI',
                           'BB', 'SO', 'SB', 'CS', 'TB', 'AVG', 'OBP', 'SLG', 'OPS', 'PTS',
                           'FPPG']]
        return batters

In [27]:
batters = testing_data_clean(batters)

In [28]:
batters.columns

Index(['Name', 'TEAM', 'POS', 'GMS', 'AB', 'R', 'H', '2B', '3B', 'HR', 'RBI',
       'BB', 'SO', 'SB', 'CS', 'TB', 'AVG', 'OBP', 'SLG', 'OPS', 'PTS',
       'FPPG'],
      dtype='object')

In [29]:
# export to folder 
batters.to_csv('Modeling_Data/batter_test_data.csv', index=False)

In [6]:
# merge attempt
batters = bat_19.merge(bat_20, how='left', on='Name')

In [7]:
# investigating merged dataframe
batters.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 636 entries, 0 to 635
Data columns (total 41 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Rank_x  636 non-null    int64  
 1   Name    636 non-null    object 
 2   Team_x  636 non-null    object 
 3   POS_x   635 non-null    object 
 4   GMS_x   636 non-null    int64  
 5   AB_x    636 non-null    int64  
 6   R_x     636 non-null    int64  
 7   H_x     636 non-null    int64  
 8   2B_x    636 non-null    int64  
 9   3B_x    636 non-null    int64  
 10  HR_x    636 non-null    int64  
 11  RBI_x   636 non-null    int64  
 12  BB_x    636 non-null    int64  
 13  SO_x    636 non-null    int64  
 14  SB_x    636 non-null    int64  
 15  CS_x    636 non-null    int64  
 16  AVG_x   636 non-null    float64
 17  OBP_x   636 non-null    float64
 18  SLG_x   636 non-null    float64
 19  OPS_x   636 non-null    float64
 20  PTS_x   636 non-null    float64
 21  Rank_y  420 non-null    float64
 22  Te

Successfully merged on name, now need to combine columns.
Need a function that adds or sums the integer stats and drops individual columns.  Then once sum of ints is prepared that data will be used to calculate average stats.
- Need to fill nulls first

### Handle nulls
We can not merge data with nan values.  The only operation we will be doing is addition so 0 will work for the purpose of the merge.

In [8]:
batters.fillna(0, inplace=True)

In [9]:
cols = 'AB_'
type(batters[f'{cols}y'][0])

numpy.float64

In [10]:
# create a list of integer stats
col_list = ['GMS_', 'AB_', 'R_', 'H_', '2B_', '3B_', 'HR_', 'RBI_', 'BB_', 'SO_', 'SB_', 'CS_', 'PTS_']

# for each column in the list of int stats
for col in col_list:
    # create name for col
    col_name = str(col).replace('_','')
    # create list for data
    col_data = []
    # set counter for y column
    counter = 0
    
    # for each row in the specific column
    for row in batters[f'{col}x']:
        # sum similar cols somehow; possibly use x,y f string
        col_data.append(row + batters[f'{col}y'][counter])
        # increase counter
        counter += 1
        
    # add col to df
    batters[col_name] = col_data
    # drop x, y cols for col
    batters.drop(columns=[f'{col}x', f'{col}y'], inplace=True)
    
# need to create a list for each stat in col list

In [11]:
batters.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 636 entries, 0 to 635
Data columns (total 28 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Rank_x  636 non-null    int64  
 1   Name    636 non-null    object 
 2   Team_x  636 non-null    object 
 3   POS_x   636 non-null    object 
 4   AVG_x   636 non-null    float64
 5   OBP_x   636 non-null    float64
 6   SLG_x   636 non-null    float64
 7   OPS_x   636 non-null    float64
 8   Rank_y  636 non-null    float64
 9   Team_y  636 non-null    object 
 10  POS_y   636 non-null    object 
 11  AVG_y   636 non-null    float64
 12  OBP_y   636 non-null    float64
 13  SLG_y   636 non-null    float64
 14  OPS_y   636 non-null    float64
 15  GMS     636 non-null    float64
 16  AB      636 non-null    float64
 17  R       636 non-null    float64
 18  H       636 non-null    float64
 19  2B      636 non-null    float64
 20  3B      636 non-null    float64
 21  HR      636 non-null    float64
 22  RB

### Need to calculate percentage stats based on totals provided above.
- drop columns that will not be needed: rank, team, pos
- create total bases column
- create avg, obp, slg, ops, fppg

In [12]:
# create drop list
drop_list = ['Rank_', 'AVG_', 'OBP_', 'SLG_', 'OPS_']
# loop through list and drop 
for col in drop_list:
    batters.drop(columns=[f'{col}x', f'{col}y'], inplace=True)

In [13]:
# drop duplicate cols
batters.drop(columns=['Team_y', 'POS_y'], inplace=True)

In [14]:
# rename remaining team, pos columns
batters.rename(columns={'Team_x':'Team', 'POS_x': 'POS'}, inplace=True)

In [15]:
batters.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 636 entries, 0 to 635
Data columns (total 16 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Name    636 non-null    object 
 1   Team    636 non-null    object 
 2   POS     636 non-null    object 
 3   GMS     636 non-null    float64
 4   AB      636 non-null    float64
 5   R       636 non-null    float64
 6   H       636 non-null    float64
 7   2B      636 non-null    float64
 8   3B      636 non-null    float64
 9   HR      636 non-null    float64
 10  RBI     636 non-null    float64
 11  BB      636 non-null    float64
 12  SO      636 non-null    float64
 13  SB      636 non-null    float64
 14  CS      636 non-null    float64
 15  PTS     636 non-null    float64
dtypes: float64(13), object(3)
memory usage: 104.5+ KB


In [16]:
# create avg stats
# total bases
total_bases = []
counter2 = 0
for row in batters['H']:
    # define singles
    single = row - (batters['2B'][counter2] + batters['3B'][counter2] + batters['HR'][counter2])
    # create total bases for each player and add to total bases list 
    total_bases.append(single + ((batters['2B'][counter2])*2) + ((batters['3B'][counter2])*3) + ((batters['HR'][counter2])*4))
    # increase counter
    counter2 += 1
    
# add list as column to dataframe
batters['TB'] = total_bases

In [17]:
batters.head(1)

Unnamed: 0,Name,Team,POS,GMS,AB,R,H,2B,3B,HR,RBI,BB,SO,SB,CS,PTS,TB
0,Cody Bellinger,LAD,CF,212.0,771.0,154.0,221.0,44.0,3.0,59.0,145.0,125.0,150.0,21.0,6.0,2854.3,448.0


The following cell with create the remaining percent stats now that we have total bases.

In [18]:
# create batting avg list
bat_avg = []
# create slugging percentage list 
slg_list = []
# create on base percentage list
obp = []
# create ops list
ops = []
# fantasy points per game
fppg = []
# for each row calculate stats
for i in batters.index:
    #calculate batting average and add to list 
    bat_avg.append(round((batters['H'][i]/batters['AB'][i]), 4))
    #calculate slugging percentage and add to list
    slg_list.append(round((batters['TB'][i]/batters['AB'][i]), 4))
    #calculate on base percentage and add to list 
    obp.append(round(((batters['H'][i]+batters['BB'][i])/(batters['AB'][i]+batters['BB'][i])),4))
    #calculate fppg and add to list
    fppg.append(batters['PTS'][i]/batters['GMS'][i])
    
# create seperate loop for ops
for x,y in zip(obp, slg_list):
    ops.append(round((x+y), 4))
    
# add stats to dataframe
batters['AVG'] = bat_avg
batters['SLG'] = slg_list
batters['OBP'] = obp
batters['OPS'] = ops
batters['FPPG'] = fppg

  bat_avg.append(round((batters['H'][i]/batters['AB'][i]), 4))
  slg_list.append(round((batters['TB'][i]/batters['AB'][i]), 4))
  obp.append(round(((batters['H'][i]+batters['BB'][i])/(batters['AB'][i]+batters['BB'][i])),4))


pd.set_option('display.max_columns', 25)
batters.head()

In [20]:
# test for nulls
batters.isna().sum()

Name    0
Team    0
POS     0
GMS     0
AB      0
R       0
H       0
2B      0
3B      0
HR      0
RBI     0
BB      0
SO      0
SB      0
CS      0
PTS     0
TB      0
AVG     1
SLG     1
OBP     1
OPS     1
FPPG    0
dtype: int64

In [21]:
# print tail to look at null values
batters.tail(1)

Unnamed: 0,Name,Team,POS,GMS,AB,R,H,2B,3B,HR,RBI,BB,SO,SB,CS,PTS,TB,AVG,SLG,OBP,OPS,FPPG
635,Locke St John,TEX,0,7.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,,0.0


In [22]:
# fill with zeros
batters.fillna(0, inplace=True)

In [23]:
# save file as modeling batter data
batters.to_csv('Modeling_Data/batter_data_modeling.csv', index=False)

---
# Pitcher Data

In [24]:
# read in data from previous two years 
pitch_19 = pd.read_csv('CapStone_Data/fanduel_2019 - pitchers.csv')
pitch_20 = pd.read_csv('CapStone_Data/fanduel_2020 - pitchers.csv')

In [25]:
# check shape for each
print(f'2019: {pitch_19.shape}')
print(f'2020: {pitch_20.shape}')

2019: (774, 20)
2020: (708, 20)


In [26]:
pitch_19.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 774 entries, 0 to 773
Data columns (total 20 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Rank    774 non-null    int64  
 1   Name    774 non-null    object 
 2   Team    774 non-null    object 
 3   POS     774 non-null    object 
 4   W       774 non-null    int64  
 5   L       774 non-null    int64  
 6   ERA     774 non-null    float64
 7   GMS     774 non-null    int64  
 8   GS      774 non-null    int64  
 9   SV      774 non-null    int64  
 10  IP      774 non-null    float64
 11  H       774 non-null    int64  
 12  R       774 non-null    int64  
 13  ER      774 non-null    int64  
 14  HR      774 non-null    int64  
 15  BB      774 non-null    int64  
 16  SO      774 non-null    int64  
 17  AVG     774 non-null    float64
 18  WHIP    774 non-null    float64
 19  PTS     774 non-null    int64  
dtypes: float64(4), int64(13), object(3)
memory usage: 121.1+ KB


In [27]:
pitch_19.head()

Unnamed: 0,Rank,Name,Team,POS,W,L,ERA,GMS,GS,SV,IP,H,R,ER,HR,BB,SO,AVG,WHIP,PTS
0,1,Gerrit Cole,HOU,SP,20,5,2.5,33,33,0,212.1,142,66,59,29,48,326,0.186,0.9,1662
1,2,Justin Verlander,HOU,SP,21,6,2.58,34,34,0,223.0,137,66,64,36,42,300,0.172,0.8,1607
2,3,Shane Bieber,CLE,SP,15,8,3.28,34,33,0,214.1,186,86,78,31,40,259,0.23,1.05,1372
3,4,Jacob deGrom,NYM,SP,11,8,2.43,34,32,0,204.0,154,59,55,19,44,255,0.207,0.97,1370
4,5,Stephen Strasburg,WSH,SP,18,6,3.32,34,33,0,209.0,161,79,77,24,56,251,0.21,1.04,1345


In [28]:
pitch_20.head()

Unnamed: 0,Rank,Name,Team,POS,W,L,ERA,GMS,GS,SV,IP,H,R,ER,HR,BB,SO,AVG,WHIP,PTS
0,1,Shane Bieber,CLE,SP,8,1,1.63,12,12,0,77.1,46,15,14,7,21,122,0.167,0.87,644
1,2,Yu Darvish,CHC,SP,8,3,2.01,12,12,0,76.0,59,18,17,5,14,93,0.211,0.96,544
2,3,Trevor Bauer,CIN,SP,5,4,1.73,11,11,0,73.0,41,17,14,9,17,100,0.159,0.8,543
3,4,Jacob deGrom,NYM,SP,4,2,2.38,12,12,0,68.0,47,21,18,7,18,104,0.19,0.96,518
4,5,Gerrit Cole,NYY,SP,7,3,2.84,12,11,0,73.0,53,27,23,14,17,94,0.197,0.96,502


In [29]:
# merge the two data frames
pitchers = pitch_19.merge(pitch_20, how='left', on='Name')

In [30]:
pitchers.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 775 entries, 0 to 774
Data columns (total 39 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Rank_x  775 non-null    int64  
 1   Name    775 non-null    object 
 2   Team_x  775 non-null    object 
 3   POS_x   775 non-null    object 
 4   W_x     775 non-null    int64  
 5   L_x     775 non-null    int64  
 6   ERA_x   775 non-null    float64
 7   GMS_x   775 non-null    int64  
 8   GS_x    775 non-null    int64  
 9   SV_x    775 non-null    int64  
 10  IP_x    775 non-null    float64
 11  H_x     775 non-null    int64  
 12  R_x     775 non-null    int64  
 13  ER_x    775 non-null    int64  
 14  HR_x    775 non-null    int64  
 15  BB_x    775 non-null    int64  
 16  SO_x    775 non-null    int64  
 17  AVG_x   775 non-null    float64
 18  WHIP_x  775 non-null    float64
 19  PTS_x   775 non-null    int64  
 20  Rank_y  483 non-null    float64
 21  Team_y  483 non-null    object 
 22  PO

Merged with all data from 2019, which had more observations.
Next step is to fill nulls then sum int columns.

In [31]:
# fill nulls with zero
pitchers.fillna(0, inplace=True)

In [32]:
# now create column list of int stats to sum up
# create a list of integer stats
col_list = ['W_', 'L_', 'GMS_', 'GS_', 'SV_', 'IP_', 'H_', 'R_', 'ER_', 'HR_', 'BB_', 'SO_', 'PTS_']

# for each column in the list of int stats
for col in col_list:
    # create name for col
    col_name = str(col).replace('_','')
    # create list for data
    col_data = []
    # set counter for y column
    counter = 0
    
    # for each row in the specific column
    for row in pitchers[f'{col}x']:
        # sum similar cols somehow; possibly use x,y f string
        col_data.append(row + pitchers[f'{col}y'][counter])
        # increase counter
        counter += 1
        
    # add col to df
    pitchers[col_name] = col_data
    # drop x, y cols for col
    pitchers.drop(columns=[f'{col}x', f'{col}y'], inplace=True)
    
# need to create a list for each stat in col list

In [33]:
pitchers.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 775 entries, 0 to 774
Data columns (total 26 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Rank_x  775 non-null    int64  
 1   Name    775 non-null    object 
 2   Team_x  775 non-null    object 
 3   POS_x   775 non-null    object 
 4   ERA_x   775 non-null    float64
 5   AVG_x   775 non-null    float64
 6   WHIP_x  775 non-null    float64
 7   Rank_y  775 non-null    float64
 8   Team_y  775 non-null    object 
 9   POS_y   775 non-null    object 
 10  ERA_y   775 non-null    float64
 11  AVG_y   775 non-null    float64
 12  WHIP_y  775 non-null    float64
 13  W       775 non-null    float64
 14  L       775 non-null    float64
 15  GMS     775 non-null    float64
 16  GS      775 non-null    float64
 17  SV      775 non-null    float64
 18  IP      775 non-null    float64
 19  H       775 non-null    float64
 20  R       775 non-null    float64
 21  ER      775 non-null    float64
 22  HR

Now drop duplicate columns. Where only one value is needed, Team and Pos.

In [34]:
# drop duplicate cols
pitchers.drop(columns=['Team_y', 'POS_y'], inplace=True)

In [35]:
# rename remaining team, pos columns
pitchers.rename(columns={'Team_x':'Team', 'POS_x': 'POS'}, inplace=True)

Now to drop unneeded columns and ones that will be calculated with the totals.

In [36]:
# create drop list
drop_list = ['Rank_', 'ERA_', 'AVG_', 'WHIP_']
# loop through list and drop 
for col in drop_list:
    pitchers.drop(columns=[f'{col}x', f'{col}y'], inplace=True)

In [37]:
pitchers.head()

Unnamed: 0,Name,Team,POS,W,L,GMS,GS,SV,IP,H,R,ER,HR,BB,SO,PTS
0,Gerrit Cole,HOU,SP,27.0,8.0,45.0,44.0,0.0,285.1,195.0,93.0,82.0,43.0,65.0,420.0,2164.0
1,Justin Verlander,HOU,SP,22.0,6.0,35.0,35.0,0.0,229.0,140.0,68.0,66.0,38.0,43.0,307.0,1650.0
2,Shane Bieber,CLE,SP,23.0,9.0,46.0,45.0,0.0,291.2,232.0,101.0,92.0,38.0,61.0,381.0,2016.0
3,Jacob deGrom,NYM,SP,15.0,10.0,46.0,44.0,0.0,272.0,201.0,80.0,73.0,26.0,62.0,359.0,1888.0
4,Stephen Strasburg,WSH,SP,18.0,7.0,36.0,35.0,0.0,214.0,169.0,85.0,83.0,25.0,57.0,253.0,1348.0


Add percentage stats to pitchers. ERA and WHIP.

In [62]:
# create era list
era_list = []
# create whip list 
whip_list = []
# fantasy points per game
fppg = []
# for each row calculate stats
for i in pitchers.index:
    #calculate batting average and add to list 
    era_list.append(round(((9*pitchers['ER'][i])/pitchers['IP'][i]), 4))
    #calculate slugging percentage and add to list
    whip_list.append(round((pitchers['BB'][i] + pitchers['H'][i]/pitchers['IP'][i]), 4))
    #calculate fppg and add to list
    fppg.append(pitchers['PTS'][i]/pitchers['GMS'][i])
    
# create seperate loop for ops
for x,y in zip(obp, slg_list):
    ops.append(round((x+y), 4))
    
# add stats to dataframe
pitchers['ERA'] = era_list
pitchers['WHIP'] = whip_list
pitchers['FPPG'] = fppg

  era_list.append(round(((9*pitchers['ER'][i])/pitchers['IP'][i]), 4))
  whip_list.append(round((pitchers['BB'][i] + pitchers['H'][i]/pitchers['IP'][i]), 4))
  era_list.append(round(((9*pitchers['ER'][i])/pitchers['IP'][i]), 4))


In [63]:
pitchers.head()

Unnamed: 0,Name,Team,POS,W,L,GMS,GS,SV,IP,H,R,ER,HR,BB,SO,PTS,ERA,WHIP,FPPG
0,Gerrit Cole,HOU,SP,27.0,8.0,45.0,44.0,0.0,285.1,195.0,93.0,82.0,43.0,65.0,420.0,2164.0,2.5886,65.684,48.088889
1,Justin Verlander,HOU,SP,22.0,6.0,35.0,35.0,0.0,229.0,140.0,68.0,66.0,38.0,43.0,307.0,1650.0,2.5939,43.6114,47.142857
2,Shane Bieber,CLE,SP,23.0,9.0,46.0,45.0,0.0,291.2,232.0,101.0,92.0,38.0,61.0,381.0,2016.0,2.8434,61.7967,43.826087
3,Jacob deGrom,NYM,SP,15.0,10.0,46.0,44.0,0.0,272.0,201.0,80.0,73.0,26.0,62.0,359.0,1888.0,2.4154,62.739,41.043478
4,Stephen Strasburg,WSH,SP,18.0,7.0,36.0,35.0,0.0,214.0,169.0,85.0,83.0,25.0,57.0,253.0,1348.0,3.4907,57.7897,37.444444


Now that we have clean data for previous 2 years on pitchers and batters we will export clean data and attempt to model.

In [82]:
pitchers.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 773 entries, 0 to 774
Data columns (total 19 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Name    773 non-null    object 
 1   Team    773 non-null    object 
 2   POS     773 non-null    object 
 3   W       773 non-null    float64
 4   L       773 non-null    float64
 5   GMS     773 non-null    float64
 6   GS      773 non-null    float64
 7   SV      773 non-null    float64
 8   IP      773 non-null    float64
 9   H       773 non-null    float64
 10  R       773 non-null    float64
 11  ER      773 non-null    float64
 12  HR      773 non-null    float64
 13  BB      773 non-null    float64
 14  SO      773 non-null    float64
 15  PTS     773 non-null    float64
 16  ERA     773 non-null    float64
 17  WHIP    773 non-null    float64
 18  FPPG    773 non-null    float64
dtypes: float64(16), object(3)
memory usage: 120.8+ KB


In [79]:
pitchers.dropna(inplace=True)

In [91]:
pitchers = pitchers.loc[pitchers['PTS']>0]

In [95]:
# export to csv
pitchers.to_csv('Modeling_Data/pitcher_data_modeling.csv', index=False)