In [1]:
# Import needed dependencies
import requests
import pandas as pd
import numpy as np
import scipy.stats as stats
from datetime import date
from bs4 import BeautifulSoup, Comment

In [2]:
today = date.today()

# dd/mm/YY
current_year = today.strftime("%Y")

last_year = int(current_year) - 1

In [3]:
last_five_years = []
for i in range(1,6):
    last_five_years.append(int(current_year) - i)

In [4]:
# Create a list to help create a dataframe from batter statistics data
batter_stats = []

# Create a function to create a dataframe from Baseball Reference tables
for year in last_five_years:
    
    # input URL and use BeautifulSoup to parse through the page
    url = f'https://www.baseball-reference.com/leagues/majors/{year}-standard-batting.shtml'
    soup = BeautifulSoup(requests.get(url).content, 'html.parser')

    # Grab the table element that has batter statistics
    table = BeautifulSoup(soup.select_one('#all_players_standard_batting').find_next(text=lambda x: isinstance(x, Comment)), 'html.parser')


    # Grab data from table and put it into the list created above
    for tr in table.select('tr:has(td)'):
        tds = [td.get_text(strip=True) for td in tr.select('td')]
        tds.append(year)
        batter_stats.append(tds)

In [5]:
# Create dataframe for batter statistics
batter_stats_df = pd.DataFrame(batter_stats)

# Create an empty list to store dataframe header information
header_list = []

# Grab the table header information to use as column headers in our dataframe
for tr in table.select('tr:has(th)'):
    ths = [th.get_text(strip=True) for th in tr.select('th')]
    header_list.append(ths)

# For loop returns a list of lists, and we only need the first list 
df_headers = header_list[0]

# Remove the first item from our headers list, it is the index header that we do not need
df_headers.remove('Rk')
df_headers.append("Year")

# Set column headers equal to our list
batter_stats_df.columns = df_headers


In [6]:
# Change types of columns to numeric for columns with number values
batter_stats_df[['Age', 'R','HR','RBI','SB','BA','PA','OPS','OPS+']] = batter_stats_df[['Age', 'R','HR','RBI','SB','BA','PA','OPS','OPS+']].apply(pd.to_numeric)

# Drop any players with 0 plate appearances to remove null values and change PA type to integer
batter_stats_df.dropna(subset=['PA'], axis = 0 , inplace= True)

# Remove any players with fewer than 100 plate appearances
filtered_batter_stats_df = batter_stats_df[batter_stats_df['PA'] >= 100]

# Select the columns we want for our batter analysis
final_batter_stats_df = filtered_batter_stats_df[['Year','Name','Tm','Age','R','HR','RBI','SB','BA','PA','OPS','OPS+','Pos\xa0Summary']]

In [7]:
# Create variables for Z scores for all stats counted in ESPN Fantasy Baseball category leagues
R_zscores = stats.zscore(final_batter_stats_df['R'])
HR_zscores = stats.zscore(final_batter_stats_df['HR'])
RBI_zscores = stats.zscore(final_batter_stats_df['RBI'])
SB_zscores = stats.zscore(final_batter_stats_df['SB'])/5 # Steal Z scores get out of hand, so dividing by 5
BA_zscores = stats.zscore(final_batter_stats_df['BA'])

# Add Z scores to a new DataFrame
compare_players_df = pd.DataFrame({
    'Z_R': R_zscores,
    'Z_HR': HR_zscores,
    'Z-RBI': RBI_zscores,
    'Z_SB': SB_zscores,
    'Z_BA': BA_zscores,})

# Calculate average of 5 Z scores, stadard deviation, and a confidence level in the average Z score
# Confidence level is to try to avoid players with just one, large Z-score
compare_players_df['average_z'] = compare_players_df.mean(axis=1)
compare_players_df['std_z'] = compare_players_df.std(axis=1)
compare_players_df['avg_confidence'] = (compare_players_df['average_z'] - compare_players_df['std_z'])
compare_players_df['position'] = final_batter_stats_df['Pos\xa0Summary']
final_compare_players_df = compare_players_df.sort_values(by=['avg_confidence'], ascending=False)

In [8]:
# Add avg Z scores, confidence columns to batter stats DataFrame
final_batter_stats_df['Average Z'] = final_compare_players_df['average_z']
final_batter_stats_df['Z Confidence'] = final_compare_players_df['avg_confidence']
sorted_final_batter_stats_df = final_batter_stats_df.sort_values(by=['Z Confidence'], ascending=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final_batter_stats_df['Average Z'] = final_compare_players_df['average_z']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final_batter_stats_df['Z Confidence'] = final_compare_players_df['avg_confidence']


In [9]:
# Eliminate Baseball Reference's name badges for accolades
sorted_final_batter_stats_df['Name'] = sorted_final_batter_stats_df['Name'].str.extract('([^\*|#]*)')

In [10]:
# Sort by index to prepare to drop duplicates
sorted_final_batter_stats_df = sorted_final_batter_stats_df.sort_index()

In [11]:
# Drop duplicate entries of Player Name and Year
# This is to eliminate partial season data for players who played for 2+ teams in one season
sorted_final_batter_stats_df = sorted_final_batter_stats_df.drop_duplicates(subset=['Year', 'Name'])

In [12]:
# Sort data by name alphabetically, then by year in descending order
sorted_final_batter_stats_df = sorted_final_batter_stats_df.sort_values(['Name','Year'], ascending=[True, False])
sorted_final_batter_stats_df

Unnamed: 0,Year,Name,Tm,Age,R,HR,RBI,SB,BA,PA,OPS,OPS+,Pos Summary,Average Z,Z Confidence
4365,2018,A.J. Ellis,SDP,37.0,19,1,15,0,0.272,183,0.722,104.0,2H/D7,-0.486196,-1.143432
5883,2017,A.J. Ellis,MIA,36.0,17,6,14,0,0.210,163,0.669,82.0,2H,-0.733150,-1.067189
1193,2021,AJ Pollock,LAD,33.0,53,21,69,9,0.297,422,0.892,134.0,7H/8,0.772826,0.338356
2227,2020,AJ Pollock,LAD,32.0,30,16,34,2,0.276,210,0.881,132.0,*78D/H,0.081352,-0.366279
3515,2019,AJ Pollock,LAD,31.0,49,15,47,5,0.266,342,0.795,107.0,87/HD,0.266095,0.112896
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2728,2019,Zack Cozart,LAA,33.0,4,0,7,0,0.124,107,0.322,-12.0,5/H64,-1.461241,-2.472274
4265,2018,Zack Cozart,LAA,32.0,29,5,18,0,0.219,253,0.658,81.0,546,-0.586229,-0.838183
5794,2017,Zack Cozart,CIN,31.0,80,24,63,3,0.297,507,0.933,140.0,*6/HD,0.955765,0.418179
6042,2017,Zack Granite,MIN,24.0,14,1,13,2,0.237,107,0.611,66.0,8H/7D9,-0.711440,-1.152379
