In [10]:
# Import needed dependencies
import requests
import pandas as pd
import numpy as np
import scipy.stats as stats
from datetime import date
from bs4 import BeautifulSoup, Comment

In [11]:
# Anchor every season calculation below to the date the notebook is run
today = date.today()

# Four-digit year kept as a string, e.g. "2022"
current_year = f"{today:%Y}"

# The calendar year immediately before the current one, as an int
last_year = int(current_year) - 1

In [12]:
# The five calendar years preceding the current one, most recent first
last_five_years = [int(current_year) - years_back for years_back in range(1, 6)]

last_five_years

[2021, 2020, 2019, 2018, 2017]

In [13]:
# Accumulates one list of cell values per batter row, across every season scraped below
batter_stats = []

# Scrape the standard batting page on Baseball Reference for each season
for year in last_five_years:

    # Download the season page. The timeout keeps a stalled connection from hanging
    # the notebook, and raise_for_status() surfaces HTTP errors (for example a
    # rate-limit response) here instead of as a confusing parse failure further down.
    url = f'https://www.baseball-reference.com/leagues/majors/{year}-standard-batting.shtml'
    response = requests.get(url, timeout=30)
    response.raise_for_status()
    soup = BeautifulSoup(response.content, 'html.parser')

    # The batting table is read from the first HTML comment that follows the
    # #all_players_standard_batting element, so that comment is located and then
    # parsed as its own document.
    container = soup.select_one('#all_players_standard_batting')
    if container is None:
        raise ValueError(f'No #all_players_standard_batting element found at {url}')

    table_comment = container.find_next(string=lambda x: isinstance(x, Comment))
    if table_comment is None:
        raise ValueError(f'No commented-out batting table found at {url}')

    # NOTE: `table` is deliberately left in scope after the loop; the header
    # extraction in the next cell reads it (from the last season scraped).
    table = BeautifulSoup(table_comment, 'html.parser')

    # Collect the text of every <td> in each data row, tagging the row with its season
    for tr in table.select('tr:has(td)'):
        tds = [td.get_text(strip=True) for td in tr.select('td')]
        tds.append(year)
        batter_stats.append(tds)

    print(f'We just finished with {year}')

        


We just finished with 2021
We just finished with 2020
We just finished with 2019
We just finished with 2018
We just finished with 2017


In [19]:
# Collect the text of every <th> cell, one inner list per table row that contains a <th>
header_list = [
    [th.get_text(strip=True) for th in tr.select('th')]
    for tr in table.select('tr:has(th)')
]

# Only the first collected row is used for the column names
df_headers = header_list[0]

# 'Rk' has no counterpart in the scraped rows (only <td> cells were collected),
# while "Year" matches the extra value appended to every row during scraping
df_headers.remove('Rk')
df_headers.append("Year")

# Build the frame from the scraped rows and label its columns
batter_stats_df = pd.DataFrame(batter_stats)
batter_stats_df.columns = df_headers

# Order the rows by the Age column and preview the first 50
batter_stats_df = batter_stats_df.sort_values('Age')
batter_stats_df.head(50)

Unnamed: 0,Name,Age,Tm,Lg,G,PA,AB,R,H,2B,...,OPS,OPS+,TB,GDP,HBP,SH,SF,IBB,Pos Summary,Year
6967,LgAvg per 600 PA,,,,200,600,536,73,136,27,...,0.749,,228,12,6,3,4,3,,2017
1723,LgAvg per 600 PA,,,,205,600,535,73,130,26,...,0.726,,219,11,7,2,4,2,,2021
5513,LgAvg per 600 PA,,,,200,600,536,70,133,27,...,0.727,,219,11,6,3,4,3,,2018
2420,LgAvg per 600 PA,,,,171,600,533,75,130,25,...,0.736,,221,11,7,1,4,2,,2020
3975,LgAvg per 600 PA,,,,200,600,536,75,134,27,...,0.752,,232,11,6,2,4,2,,2019
3238,Elvis Luciano,19.0,TOR,AL,1,1,1,0,0,0,...,0.0,-100.0,0,0,0,0,0,0,1,2019
5286,Juan Soto*,19.0,WSN,NL,116,494,414,77,121,25,...,0.923,142.0,214,9,0,1,0,10,*7/H,2018
3732,Juan Soto*,20.0,WSN,NL,150,659,542,110,153,32,...,0.949,142.0,297,11,3,0,6,3,*7,2019
3371,Adrián Morejón*,20.0,SDP,NL,4,1,1,0,0,0,...,0.0,-100.0,0,0,0,0,0,0,/1,2019
6625,Victor Robles,20.0,WSN,NL,13,27,24,2,6,1,...,0.766,94.0,11,2,2,1,0,0,/9H87,2017


In [None]:



# Convert the stat columns used below from scraped text to numeric dtypes
numeric_columns = ['Age', 'R', 'HR', 'RBI', 'SB', 'BA', 'PA', 'OPS', 'OPS+']
batter_stats_df[numeric_columns] = batter_stats_df[numeric_columns].apply(pd.to_numeric)

# Drop rows that have no plate-appearance value at all
batter_stats_df = batter_stats_df.dropna(subset=['PA'])

# Keep batters with at least 100 plate appearances.
# Each season's table also carries a "LgAvg per 600 PA" summary row; its PA of 600
# passes the threshold, so it is excluded by name to keep it out of the player analysis.
is_league_average = batter_stats_df['Name'].str.startswith('LgAvg', na=False)
filtered_batter_stats_df = batter_stats_df[(batter_stats_df['PA'] >= 100) & ~is_league_average]

# Select the columns we want for our batter analysis
final_batter_stats_df = filtered_batter_stats_df[['Year','Name','Age','R','HR','RBI','SB','BA','PA','OPS','OPS+','Pos\xa0Summary']]

# Keep one row per player. Sorting by Age first and then by PA (largest first) makes
# the kept row deterministic: for a player listed several times in one season, the
# aggregate line has the most plate appearances and therefore comes first.
# NOTE(review): duplicates are dropped on Name alone while the frame holds several
# seasons, so each player ends up with a single season (their lowest Age). Confirm
# that this is the season the analysis should use.
final_batter_stats_df = (
    final_batter_stats_df
    .sort_values(['Age', 'PA'], ascending=[True, False])
    .drop_duplicates('Name')
)

# Index by player name; later cells align their results on this index
final_batter_stats_df = final_batter_stats_df.set_index('Name')

# final_batter_stats_df

# # use a for loop to call the function for the past five years' worth of batting stats data
# batting_stats_dataframe()


In [None]:
# Standardize each scoring category across the filtered batters
R_zscores = stats.zscore(final_batter_stats_df['R'])
HR_zscores = stats.zscore(final_batter_stats_df['HR'])
RBI_zscores = stats.zscore(final_batter_stats_df['RBI'])
# The stolen-base z-score is divided by 5, shrinking its weight in the averages below
# NOTE(review): presumably a deliberate down-weighting of SB -- confirm the factor
SB_zscores = stats.zscore(final_batter_stats_df['SB'])/5
BA_zscores = stats.zscore(final_batter_stats_df['BA'])

# Depending on the SciPy version, zscore may hand back a Series or a bare ndarray.
# Converting to arrays and passing the index explicitly keeps this frame keyed by
# player name either way, which the index-aligned assignments later on rely on.
compare_players_df = pd.DataFrame({
    'Z_R': np.asarray(R_zscores),
    'Z_HR': np.asarray(HR_zscores),
    'Z-RBI': np.asarray(RBI_zscores),
    'Z_SB': np.asarray(SB_zscores),
    'Z_BA': np.asarray(BA_zscores),
}, index=final_batter_stats_df.index)

# Take the mean and the spread from the five z-score columns only. Computing the
# standard deviation after 'average_z' has been added would fold the mean in as a
# sixth value and distort the result.
z_values = compare_players_df[['Z_R', 'Z_HR', 'Z-RBI', 'Z_SB', 'Z_BA']]
compare_players_df['average_z'] = z_values.mean(axis=1)
compare_players_df['std_z'] = z_values.std(axis=1)

# Penalize uneven profiles: the average z-score minus the spread across categories
compare_players_df['avg_confidence'] = (compare_players_df['average_z'] - compare_players_df['std_z'])

# Rows are in the same order as final_batter_stats_df, so copy positions positionally
compare_players_df['position'] = final_batter_stats_df['Pos\xa0Summary'].to_numpy()

final_compare_players_df = compare_players_df.sort_values(by=['avg_confidence'], ascending=False)
# final_compare_players_df.head(50)

In [None]:
# Copy the two summary scores onto the stats frame; pandas aligns each assignment
# on the index shared by the two frames
for stats_column, score_column in (('Average Z', 'average_z'), ('Z Confidence', 'avg_confidence')):
    final_batter_stats_df[stats_column] = final_compare_players_df[score_column]

# Rank the batters from the highest confidence score to the lowest and preview the top 50
sorted_final_batter_stats_df = final_batter_stats_df.sort_values('Z Confidence', ascending=False)
sorted_final_batter_stats_df.head(50)

In [None]:
# batting_stats_dataframe()

# Show the 50 rows with the most home runs, highest first
sorted_final_batter_stats_df.sort_values('HR', ascending=False).iloc[:50]

In [None]:
# for year in last_five_years:
#     batting_stats_dataframe(year)
#     final_batter_stats_df.sort_values(by=['HR'])
#     final_batter_stats_df

In [None]:
# Plan for the 5-year trend: take the difference between each year and the previous year, collect those differences
# in a list, then average them. This will only tell you whether a player has trended up, trended down, or plateaued.