# **Batting HOF Dataset**

In [1]:
import os
from getpass import getpass

import mysql.connector as mysql
import numpy as np
import pandas as pd

# Connecting to the MySQL Lahman db.
# Credentials are NOT stored in the notebook: connection settings are read from
# environment variables, falling back to the local defaults for everything except
# the password, which is prompted for when LAHMAN_DB_PASSWORD is not set.
#   LAHMAN_DB_HOST      (default "localhost")
#   LAHMAN_DB_USER      (default "root")
#   LAHMAN_DB_PASSWORD  (no default; prompted interactively if missing or empty)
#   LAHMAN_DB_NAME      (default "lahmansbaseballdb")
db = mysql.connect(
    host=os.environ.get("LAHMAN_DB_HOST", "localhost"),
    user=os.environ.get("LAHMAN_DB_USER", "root"),
    password=os.environ.get("LAHMAN_DB_PASSWORD") or getpass("MySQL password: "),
    database=os.environ.get("LAHMAN_DB_NAME", "lahmansbaseballdb"),
)

# Printing the connection object as a quick check that the connection was created
print(db)

<mysql.connector.connection_cext.CMySQLConnection object at 0x104acc400>


In [2]:
# Join Lahman tables together to build our dataset

# Populating 'sql' with our query: the distinct inducted ('Y') rows of the halloffame table,
# joined to the people table on playerID to pick up each player's name and debut date
sql = """SELECT DISTINCT 
           halloffame.playerID, nameFirst, nameLast, debut, inducted
            from halloffame
            JOIN people ON halloffame.playerID = people.playerID
            where inducted = 'Y'"""

# Calling db.cursor() to get a cursor that can execute our query statement
mycursor = db.cursor()
try:
    # Use .execute() to run the query against the database
    mycursor.execute(sql)

    # Use .fetchall() to retrieve the whole result set of the query
    myresult = mycursor.fetchall()
finally:
    # Always release the cursor, whether or not the query succeeded
    mycursor.close()

In [3]:
# Turning the raw query rows into a DataFrame and labelling the columns
# in the same order as the SELECT list
df = pd.DataFrame(myresult)
df.columns = ['PlayerID', 'FirstName', 'LastName', 'Debut', 'Inducted']

# Deriving the two columns we actually want:
#   Name      - first and last name joined with a space
#   DebutYear - calendar year of the debut date (parsed as a datetime first)
df = df.assign(
    Name=df['FirstName'] + " " + df['LastName'],
    DebutYear=pd.to_datetime(df['Debut']).dt.year,
)

# Keeping only the needed columns, in display order
# (this drops FirstName, LastName and Debut)
df = df[['PlayerID', 'Name', 'DebutYear', 'Inducted']]

df.head()

Unnamed: 0,PlayerID,Name,DebutYear,Inducted
0,cobbty01,Ty Cobb,1905.0,Y
1,ruthba01,Babe Ruth,1914.0,Y
2,wagneho01,Honus Wagner,1897.0,Y
3,mathech01,Christy Mathewson,1900.0,Y
4,johnswa01,Walter Johnson,1907.0,Y


In [4]:
# Loading the batting data from the Excel workbook, using its first column as the index
df1 = pd.read_excel(io='batting.xlsx', index_col=0)

# Previewing the first rows
df1.head()

Unnamed: 0,PlayerID,Name,Year,TeamID,Team,Position,DebutYear,Bats,BirthCountry,BirthState,...,TB,HBP,SH,SF,GIDP,BB%,K%,ISO,BABIP,RC
36886,aaronha01,Hank Aaron,1954,ML1,Milwaukee Braves,LF,1954,R,USA,AL,...,209,3,6,4,13,0.055,0.077,0.167,0.281,67.0
36888,abramca01,Cal Abrams,1954,BAL,Baltimore Orioles,RF,1949,L,USA,PA,...,178,4,6,1,5,0.142,0.132,0.128,0.336,70.48
36889,abramca01,Cal Abrams,1954,PIT,Pittsburgh Pirates,RF,1949,L,USA,PA,...,9,0,0,0,2,0.192,0.173,0.071,0.182,2.77
36890,adamsbo03,Bobby Adams,1954,CIN,Cincinnati Redlegs,3B,1946,R,USA,CA,...,151,3,4,2,4,0.121,0.101,0.118,0.297,54.29
36891,adcocjo01,Joe Adcock,1954,ML1,Milwaukee Braves,1B,1950,R,USA,LA,...,260,3,11,4,7,0.078,0.103,0.212,0.31,94.63


In [5]:
# Numbering each player's rows 1, 2, 3, ... in the order they appear in the batting data
# NOTE(review): cumcount numbers rows rather than calendar seasons, so a season split
# across two teams (e.g. Cal Abrams' two 1954 rows in the preview above) is counted
# twice - confirm this is the intended definition of a "playing year"
df1['PlayingYear'] = df1.groupby('PlayerID').cumcount() + 1

# Flagging the rows numbered 1 through 6 for each player
df1['SixYear'] = df1['PlayingYear'].le(6)

# Keeping only the flagged rows
df1 = df1.loc[df1['SixYear']]

# Spot check on David Wright: rows should be numbered 1-6 and nothing later should remain
df1[df1['Name'].eq('David Wright')]

Unnamed: 0,PlayerID,Name,Year,TeamID,Team,Position,DebutYear,Bats,BirthCountry,BirthState,...,SH,SF,GIDP,BB%,K%,ISO,BABIP,RC,PlayingYear,SixYear
85977,wrighda03,David Wright,2004,NYN,New York Mets,3B,2004,R,USA,VA,...,0,3,7,0.049,0.141,0.232,0.297,45.34,1,True
87310,wrighda03,David Wright,2005,NYN,New York Mets,3B,2004,R,USA,VA,...,0,3,16,0.11,0.172,0.217,0.34,115.38,2,True
88681,wrighda03,David Wright,2006,NYN,New York Mets,3B,2004,R,USA,VA,...,0,8,15,0.1,0.171,0.22,0.344,117.78,3,True
90067,wrighda03,David Wright,2007,NYN,New York Mets,3B,2004,R,USA,VA,...,0,7,14,0.132,0.162,0.222,0.356,137.11,4,True
91453,wrighda03,David Wright,2008,NYN,New York Mets,3B,2004,R,USA,VA,...,0,11,15,0.128,0.161,0.232,0.321,131.28,5,True
92840,wrighda03,David Wright,2009,NYN,New York Mets,3B,2004,R,USA,VA,...,0,6,16,0.12,0.227,0.14,0.394,93.4,6,True


In [6]:
# Removing the columns that are not needed for the per-player totals:
# descriptive fields, the helper flag, the season year, and the rate/derived stats
df1 = df1.drop(columns=[
    'TeamID', 'Team', 'Position', 'Bats', 'BirthCountry', 'BirthState',
    'SixYear', 'Year',
    'BA', 'OBP', 'SLG', 'OPS', 'TB', 'BB%', 'K%', 'ISO', 'BABIP', 'RC',
])

df1.head()

Unnamed: 0,PlayerID,Name,DebutYear,G,AB,PA,R,H,1B,2B,...,RBI,SB,CS,BB,SO,HBP,SH,SF,GIDP,PlayingYear
36886,aaronha01,Hank Aaron,1954,122,468,509,58,131,85,27,...,69,2,2,28,39,3,6,4,13,1
36888,abramca01,Cal Abrams,1949,115,423,506,67,124,89,22,...,25,1,4,72,67,4,6,1,5,1
36889,abramca01,Cal Abrams,1949,17,42,52,6,6,4,1,...,2,0,0,10,9,0,0,0,2,2
36890,adamsbo03,Bobby Adams,1946,110,390,454,69,105,71,25,...,23,2,5,55,46,3,4,2,4,1
36891,adcocjo01,Joe Adcock,1950,133,500,562,73,154,99,27,...,87,1,4,44,58,3,11,4,7,1


In [7]:
# Columns to total per player: the counting stats from the batting table, plus
# 'PlayingYear' (its total is compared against 21 in a later cell)
sum_cols = ['G', 'AB', 'PA', 'R', 'H', '1B', '2B', '3B', 'HR', 'RBI', 'SB', 'CS',
            'BB', 'SO', 'HBP', 'SH', 'SF', 'GIDP', 'PlayingYear']

# Grouping the data by player and summing each of those columns.
# The aggregation is named by the string 'sum' rather than the Python builtin:
# passing the builtin callable to .agg() is deprecated in recent pandas versions.
df2 = df1.groupby(['PlayerID', 'Name', 'DebutYear']).agg(dict.fromkeys(sum_cols, 'sum'))

# Resetting the index so the group by columns convert back to df columns
df2 = df2.reset_index()

df2.head()

Unnamed: 0,PlayerID,Name,DebutYear,G,AB,PA,R,H,1B,2B,...,RBI,SB,CS,BB,SO,HBP,SH,SF,GIDP,PlayingYear
0,aaronha01,Hank Aaron,1954,886,3524,3866,612,1137,707,205,...,617,20,9,281,315,13,18,30,107,21
1,aaronto01,Tommie Aaron,1962,412,891,989,98,204,145,40,...,91,9,8,83,140,0,9,6,33,21
2,abadan01,Andy Abad,2001,10,18,20,1,2,2,0,...,0,0,1,2,5,0,0,0,1,3
3,abbotje01,Jeff Abbott,1997,233,596,649,82,157,104,33,...,83,6,5,38,91,3,5,7,12,15
4,abbotku01,Kurt Abbott,1993,486,1469,1604,193,376,233,78,...,180,15,7,93,414,15,18,9,28,21


In [8]:
# A player whose rows are numbered 1 through 6 has a 'PlayingYear' total of 1+2+...+6 = 21,
# so that total identifies the players with a full six rows of data
SIX_YEAR_SUM = sum(range(1, 7))

# Keeping only the players who (a) debuted in 1954 or later and (b) have six rows of data.
# The conditions are applied as one boolean mask instead of temporary helper columns, so
# nothing is ever assigned onto a filtered slice (which raises SettingWithCopyWarning).
keep = (df2['DebutYear'] >= 1954) & (df2['PlayingYear'] == SIX_YEAR_SUM)

# Dropping the two columns that were only needed for filtering; the explicit .copy()
# makes df2 an independent frame so later column assignments are safe
df2 = df2[keep].drop(columns=['DebutYear', 'PlayingYear']).copy()

df2.head()

Unnamed: 0,PlayerID,Name,G,AB,PA,R,H,1B,2B,3B,HR,RBI,SB,CS,BB,SO,HBP,SH,SF,GIDP
0,aaronha01,Hank Aaron,886,3524,3866,612,1137,707,205,46,179,617,20,9,281,315,13,18,30,107
1,aaronto01,Tommie Aaron,412,891,989,98,204,145,40,6,13,91,9,8,83,140,0,9,6,33
4,abbotku01,Kurt Abbott,486,1469,1604,193,376,233,78,19,46,180,15,7,93,414,15,18,9,28
7,abnersh01,Shawn Abner,295,632,674,68,133,91,29,3,10,55,5,6,31,118,4,3,4,11
9,abreubo01,Bobby Abreu,693,2417,2869,430,742,448,165,33,96,383,117,43,422,550,6,4,20,45


In [10]:
# Calculating some popular statistics from the six-year totals.
# Each entry is evaluated in order against the frame built so far, so later
# formulas can use earlier results (e.g. SLG uses TB, BB% uses the recomputed PA).
derived_stats = {
    # Batting Average
    'BA': lambda d: d['H'] / d['AB'],
    # On Base Percentage
    'OBP': lambda d: (d['H'] + d['BB'] + d['HBP']) / (d['AB'] + d['BB'] + d['HBP'] + d['SF']),
    # Total Bases
    'TB': lambda d: d['1B'] + (2 * d['2B']) + (3 * d['3B']) + (4 * d['HR']),
    # Slugging Percentage
    'SLG': lambda d: d['TB'] / d['AB'],
    # On Base plus Slugging
    'OPS': lambda d: d['OBP'] + d['SLG'],
    # Plate Appearances (replaces the summed PA column)
    'PA': lambda d: d['AB'] + d['BB'] + d['HBP'] + d['SH'] + d['SF'],
    # Walk Percentage
    'BB%': lambda d: d['BB'] / d['PA'],
    # Strikeout Percentage
    'K%': lambda d: d['SO'] / d['PA'],
    # Isolated Power
    'ISO': lambda d: d['SLG'] - d['BA'],
    # Batting Average on Balls in Play
    'BABIP': lambda d: (d['H'] - d['HR']) / (d['AB'] - d['SO'] - d['HR'] + d['SF']),
    # Runs Created
    'RC': lambda d: ((d['H'] + d['BB']) * d['TB']) / (d['AB'] + d['BB']),
}

df2 = (
    df2
    .assign(**derived_stats)
    # Rounding the new calculations to desired decimal places
    .round({'BA': 3, 'OBP': 3, 'SLG': 3, 'OPS': 3, 'ISO': 3, 'BABIP': 3, 'RC': 2, 'BB%': 3, 'K%': 3})
    # Putting the columns into their final order
    [['PlayerID', 'Name', 'G', 'AB', 'PA', 'R', 'H', '1B', '2B', '3B', 'HR',
      'RBI', 'SB', 'CS', 'BB', 'SO', 'BA', 'OBP', 'SLG', 'OPS', 'TB', 'HBP',
      'SH', 'SF', 'GIDP', 'BB%', 'K%', 'ISO', 'BABIP', 'RC']]
)

df2.head()

Unnamed: 0,PlayerID,Name,G,AB,PA,R,H,1B,2B,3B,...,TB,HBP,SH,SF,GIDP,BB%,K%,ISO,BABIP,RC
0,aaronha01,Hank Aaron,886,3524,3866,612,1137,707,205,46,...,1971,13,18,30,107,0.073,0.081,0.237,0.313,734.53
1,aaronto01,Tommie Aaron,412,891,989,98,204,145,40,6,...,295,0,9,6,33,0.084,0.142,0.102,0.257,86.93
4,abbotku01,Kurt Abbott,486,1469,1604,193,376,233,78,19,...,630,15,18,9,28,0.058,0.258,0.173,0.324,189.16
7,abnersh01,Shawn Abner,295,632,674,68,133,91,29,3,...,198,4,3,4,11,0.046,0.175,0.103,0.242,48.98
9,abreubo01,Bobby Abreu,693,2417,2869,430,742,448,165,33,...,1261,6,4,20,45,0.147,0.192,0.215,0.361,517.01


In [11]:
# Keeping only the Hall of Famers whose debut year is 1954 or later
# (rows with a missing DebutYear fail the comparison and are removed as well)
df = df.loc[df['DebutYear'].ge(1954)]

df.head()

Unnamed: 0,PlayerID,Name,DebutYear,Inducted
126,koufasa01,Sandy Koufax,1955.0,Y
135,clemero01,Roberto Clemente,1955.0,Y
173,gibsobo01,Bob Gibson,1959.0,Y
176,aaronha01,Hank Aaron,1954.0,Y
177,robinfr02,Frank Robinson,1956.0,Y


In [12]:
# Perform a left merge of the batting totals with the Hall of Fame df on PlayerID.
# This is essentially a vlookup in pandas: every batting row is kept, and 'Inducted'
# is filled in only for players found in the Hall of Fame df.
# 'Name' and 'DebutYear' are dropped from the Hall of Fame side first so they do not
# collide with the batting columns.
df = pd.merge(df2, df.drop(columns=['Name', 'DebutYear']), on='PlayerID', how='left')

# Dropping the rows with a missing Name (per the original author's note, these are pitchers).
# The explicit .copy() makes the result an independent frame, so the assignment below
# does not raise SettingWithCopyWarning.
df = df[df['Name'].notna()].copy()

# Converting Inducted to a binary integer label: 1 where the value is 'Y', 0 otherwise
# (including players with no Hall of Fame match). This gives a true integer column
# instead of relying on fillna/replace to downcast a mixed object column.
df['Inducted'] = df['Inducted'].eq('Y').astype(int)

# Dropping PlayerID from the final dataset; it was only needed as the merge key
df = df.drop(columns='PlayerID')

df.head()

Unnamed: 0,Name,G,AB,PA,R,H,1B,2B,3B,HR,...,HBP,SH,SF,GIDP,BB%,K%,ISO,BABIP,RC,Inducted
0,Hank Aaron,886,3524,3866,612,1137,707,205,46,179,...,13,18,30,107,0.073,0.081,0.237,0.313,734.53,1
1,Tommie Aaron,412,891,989,98,204,145,40,6,13,...,0,9,6,33,0.084,0.142,0.102,0.257,86.93,0
2,Kurt Abbott,486,1469,1604,193,376,233,78,19,46,...,15,18,9,28,0.058,0.258,0.173,0.324,189.16,0
3,Shawn Abner,295,632,674,68,133,91,29,3,10,...,4,3,4,11,0.046,0.175,0.103,0.242,48.98,0
4,Bobby Abreu,693,2417,2869,430,742,448,165,33,96,...,6,4,20,45,0.147,0.192,0.215,0.361,517.01,0


In [13]:
# Writing the final labelled dataset (index included) to an Excel workbook
df.to_excel(excel_writer="Inducted.xlsx")