### The purpose of this note book is to combine 2019 and 2020 MLB stats for both pitchers and batters.  Also clean testing data to match training data for predictions.

In [1]:
#imports
import pandas as pd

# Batter Functions

In [4]:
    # Create function for batters
def batter_combine(bat_19, bat_20):
        # step 1 merge
        batters = bat_19.merge(bat_20, how='left', on='Name')

        # step 2 handle nulls
        batters.fillna(0, inplace=True)

        # step 3 create list for integer stats
        # create a list of integer stats
        col_list = ['GMS_', 'AB_', 'R_', 'H_', '2B_', '3B_', 'HR_', 'RBI_', 'BB_', 'SO_', 'SB_', 'CS_', 'PTS_']

        # for each column in the list of int stats
        for col in col_list:
            # create name for col
            col_name = str(col).replace('_','')
            # create list for data
            col_data = []
            # set counter for y column
            counter = 0

            # for each row in the specific column
            for row in batters[f'{col}x']:
                # sum similar cols somehow; possibly use x,y f string
                col_data.append(row + batters[f'{col}y'][counter])
                # increase counter
                counter += 1

            # add col to df
            batters[col_name] = col_data
            # drop x, y cols for col
            batters.drop(columns=[f'{col}x', f'{col}y'], inplace=True)

        # step 4 drop and rename columns
        # create drop list
        drop_list = ['Rank_', 'AVG_', 'OBP_', 'SLG_', 'OPS_']
        # loop through list and drop 
        for col in drop_list:
            batters.drop(columns=[f'{col}x', f'{col}y'], inplace=True)
        # drop duplicate cols
        batters.drop(columns=['Team_y', 'POS_y'], inplace=True)
        # rename remaining team, pos columns
        batters.rename(columns={'Team_x':'Team', 'POS_x': 'POS'}, inplace=True)

        # step 5 total bases column
        # total bases
        total_bases = []
        counter2 = 0
        for row in batters['H']:
            # define singles
            single = row - (batters['2B'][counter2] + batters['3B'][counter2] + batters['HR'][counter2])
            # create total bases for each player and add to total bases list 
            total_bases.append(single + ((batters['2B'][counter2])*2) + ((batters['3B'][counter2])*3) + ((batters['HR'][counter2])*4))
            # increase counter
            counter2 += 1

        # add list as column to dataframe
        batters['TB'] = total_bases

        # step 6 create percentage stat categories
        # create batting avg list
        bat_avg = []
        # create slugging percentage list 
        slg_list = []
        # create on base percentage list
        obp = []
        # create ops list
        ops = []
        # fantasy points per game
        fppg = []
        # for each row calculate stats
        for i in batters.index:
            #calculate batting average and add to list 
            bat_avg.append(round((batters['H'][i]/batters['AB'][i]), 4))
            #calculate slugging percentage and add to list
            slg_list.append(round((batters['TB'][i]/batters['AB'][i]), 4))
            #calculate on base percentage and add to list 
            obp.append(round(((batters['H'][i]+batters['BB'][i])/(batters['AB'][i]+batters['BB'][i])),4))
            #calculate fppg and add to list
            fppg.append(batters['PTS'][i]/batters['GMS'][i])

        # create seperate loop for ops
        for x,y in zip(obp, slg_list):
            ops.append(round((x+y), 4))

        # add stats to dataframe
        batters['AVG'] = bat_avg
        batters['SLG'] = slg_list
        batters['OBP'] = obp
        batters['OPS'] = ops
        batters['FPPG'] = fppg

        # step 7 drop nulls if any 
        batters.dropna(inplace=True)

        # RETURN
        return batters

In [8]:
# seperate function for test data 
def testing_data_clean(batters):
        # total bases
        total_bases = []
        counter2 = 0
        for row in batters['H']:
            # define singles
            single = row - (batters['2B'][counter2] + batters['3B'][counter2] + batters['HR'][counter2])
            # create total bases for each player and add to total bases list 
            total_bases.append(single + ((batters['2B'][counter2])*2) + ((batters['3B'][counter2])*3) + ((batters['HR'][counter2])*4))
            # increase counter
            counter2 += 1

        # add list as column to dataframe
        batters['TB'] = total_bases
        
        # fantasy points per game
        fppg = []
        
        for i in batters.index:
            #calculate fppg and add to list
            fppg.append(batters['PTS'][i]/batters['GMS'][i])
        # add to df
        batters['FPPG'] = fppg
        
        # drop rank
        batters.drop(columns='Rank', inplace=True)
        
        # match same column order as train data
        batters = batters[['Name', 'TEAM', 'POS', 'GMS', 'AB', 'R', 'H', '2B', '3B', 'HR', 'RBI',
                           'BB', 'SO', 'SB', 'CS', 'TB', 'AVG', 'OBP', 'SLG', 'OPS', 'PTS',
                           'FPPG']]
        return batters

---

### Importing, cleaning and saving batter testing data.

In [10]:
# read in test data
batters = pd.read_csv('../CapStone_Data/fanduel_2021 - batters.csv')

In [11]:
batters = testing_data_clean(batters)

In [13]:
# export to folder 
batters.to_csv('../Modeling_Data/batter_test_data.csv', index=False)

### Importing, cleaning and saving batter training data.

In [3]:
# read in 2019 and 2020 batter data
bat_19 = pd.read_csv('../CapStone_Data/fanduel_2019 - batters.csv')
bat_20 = pd.read_csv('../CapStone_Data/fanduel_2020 - batters.csv')

In [6]:
# test function on past data
batter = batter_combine(bat_19, bat_20)

  bat_avg.append(round((batters['H'][i]/batters['AB'][i]), 4))
  slg_list.append(round((batters['TB'][i]/batters['AB'][i]), 4))
  obp.append(round(((batters['H'][i]+batters['BB'][i])/(batters['AB'][i]+batters['BB'][i])),4))


In [None]:
# save file as modeling batter data
batter.to_csv('../Modeling_Data/batter_data_modeling.csv', index=False)

---

# Pitcher Functions

In [38]:
def combine_pitchers(df1, df2):
    # Step 1 merge dataframes
    pitchers = df1.merge(df2, how='left', on='Name')

    # Step 2 fill nulls with zero
    pitchers.fillna(0, inplace=True)

    # step 3 create column list for integer stats to be summed and add to dataframe, also delete old columns
    col_list = ['W_', 'L_', 'GMS_', 'GS_', 'SV_', 'IP_', 'H_', 'R_', 'ER_', 'HR_', 'BB_', 'SO_', 'PTS_']

    # for each column in the list of int stats
    for col in col_list:
        # create name for col
        col_name = str(col).replace('_','')
        # create list for data
        col_data = []
        # set counter for y column
        counter = 0

        # for each row in the specific column
        for row in pitchers[f'{col}x']:
            # sum similar cols somehow; possibly use x,y f string
            col_data.append(row + pitchers[f'{col}y'][counter])
            # increase counter
            counter += 1

        # add col to df
        pitchers[col_name] = col_data
        # drop x, y cols for col
        pitchers.drop(columns=[f'{col}x', f'{col}y'], inplace=True)

    # step 4 drop and rename columns
    pitchers.drop(columns=['Team_y', 'POS_y'], inplace=True)
    # rename remaining team, pos columns
    pitchers.rename(columns={'Team_x':'Team', 'POS_x': 'POS'}, inplace=True)
    # create drop list
    drop_list = ['Rank_', 'ERA_', 'AVG_', 'WHIP_']
    # loop through list and drop 
    for col in drop_list:
        pitchers.drop(columns=[f'{col}x', f'{col}y'], inplace=True)

    # step 5 Add percentage stats to pitchers. ERA and WHIP.
    # create era list
    era_list = []
    # create whip list 
    whip_list = []
    # fantasy points per game
    fppg = []
    # for each row calculate stats
    for i in pitchers.index:
        #calculate batting average and add to list 
        era_list.append(round(((9*pitchers['ER'][i])/pitchers['IP'][i]), 4))
        #calculate slugging percentage and add to list
        whip_list.append(round((pitchers['BB'][i] + pitchers['H'][i]/pitchers['IP'][i]), 4))
        #calculate fppg and add to list
        fppg.append(pitchers['PTS'][i]/pitchers['GMS'][i])

    # add stats to dataframe
    pitchers['ERA'] = era_list
    pitchers['WHIP'] = whip_list
    pitchers['FPPG'] = fppg

    # step 6 drop nulls and any pitcher with less than 0 fantasy point per game
    pitchers.dropna(inplace=True)
    pitchers = pitchers.loc[pitchers['PTS']>0]

    # return
    return pitchers

In [46]:
# testing data function
def testing_pitcher_data_clean(pitchers):
    
    # step 1 drop columns
    pitchers.drop(columns='Rank')
    
    # step 2 add FPPG
    fppg = []
    # for each row calculate stats
    for i in pitchers.index:
        #calculate fppg and add to list
        fppg.append(pitchers['PTS'][i]/pitchers['GMS'][i])
    # add stats to dataframe
    pitchers['FPPG'] = fppg
    
    # step 3 reorder columns to match train
    pitchers = pitchers[['Name', 'Team', 'Pos', 'W', 'L', 'GMS', 'GS', 'SV', 'IP', 'H', 'R',
       'ER', 'HR', 'BB', 'SO', 'PTS', 'ERA', 'WHIP', 'FPPG']]
    
    # step 4 drop any pitcher with less than 0 fantasy points per game
    pitchers = pitchers.loc[pitchers['PTS']>0]
    
    # return
    return pitchers

### Importing, cleaning and saving pitcher training data.

In [21]:
# test pitcher function
pitch_19 = pd.read_csv('../CapStone_Data/fanduel_2019 - pitchers.csv')
pitch_20 = pd.read_csv('../CapStone_Data/fanduel_2020 - pitchers.csv')

In [23]:
# call function
pitchers = combine_pitchers(pitch_19, pitch_20)

  era_list.append(round(((9*pitchers['ER'][i])/pitchers['IP'][i]), 4))
  whip_list.append(round((pitchers['BB'][i] + pitchers['H'][i]/pitchers['IP'][i]), 4))
  era_list.append(round(((9*pitchers['ER'][i])/pitchers['IP'][i]), 4))


In [25]:
# export to csv
pitchers.to_csv('../Modeling_Data/pitcher_data_modeling.csv', index=False)

### Importing, cleaning and saving pitcher training data.

In [47]:
# read in test data for pitchers
pitchers_test = pd.read_csv('../CapStone_Data/fanduel_2021 - pitchers.csv')

In [48]:
# clean pitcher test data with function
pitchers_test = testing_pitcher_data_clean(pitchers_test)

In [49]:
# export to csv
pitchers_test.to_csv('../Modeling_Data/pitcher_test_data.csv', index=False)