In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from bs4 import BeautifulSoup
from selenium import webdriver
import requests

## Populate the list of players with every player and their stats

In [5]:
# Page holding the 2025 standard batting table that the rest of this cell parses.
url = 'https://www.baseball-reference.com/leagues/majors/2025-standard-batting.shtml'

# Load the page in a real Chrome session and keep only its HTML. The browser is
# closed as soon as the markup is captured (even if loading fails) so that
# re-running this cell does not leave orphaned Chrome processes behind.
driver = webdriver.Chrome()
try:
    driver.get(url)
    html = driver.page_source
finally:
    driver.quit()

# Name the parser explicitly: without it BeautifulSoup picks whichever parser is
# installed, emits a warning, and may build a different tree on another machine.
soup = BeautifulSoup(html, 'html.parser')

table = soup.find('table', id='players_standard_batting')
if table is None:
    raise RuntimeError(
        "Could not find table 'players_standard_batting' in the page loaded from " + url
    )

tbody = table.find('tbody')
if tbody is None:
    raise RuntimeError("Table 'players_standard_batting' has no <tbody> element")

class Player:
    """Plain record of one player's row from the standard batting table.

    Every constructor argument is stored unchanged on the instance under the
    attribute of the same name; no conversion or validation is performed.
    """

    def __init__(self, name, age, team, league, war, games_played, pa, ab, r, h, doubles, triples, hr, rbi, sb, cs, bb, so, ba, obp, slg, ops, ops_plus, roba, rbat_plus, tb, gidp, hbp, sh, sf, ibb, pos):
        # Identity and context.
        self.name, self.age, self.team, self.league = name, age, team, league
        self.war, self.games_played = war, games_played

        # Counting stats, in the same order as the constructor arguments.
        self.pa, self.ab, self.r, self.h = pa, ab, r, h
        self.doubles, self.triples, self.hr, self.rbi = doubles, triples, hr, rbi
        self.sb, self.cs, self.bb, self.so = sb, cs, bb, so

        # Rate stats.
        self.ba, self.obp, self.slg, self.ops = ba, obp, slg, ops
        self.ops_plus, self.roba, self.rbat_plus = ops_plus, roba, rbat_plus

        # Remaining columns.
        self.tb, self.gidp, self.hbp = tb, gidp, hbp
        self.sh, self.sf, self.ibb, self.pos = sh, sf, ibb, pos

# Maps each Player constructor argument (other than `name`) to the `data-stat`
# attribute of the table cell that holds its value. The name is handled
# separately because it is read from the link inside its cell.
PLAYER_STAT_COLUMNS = {
    'age': 'age',
    'team': 'team_name_abbr',
    'league': 'comp_name_abbr',
    'war': 'b_war',
    'games_played': 'b_games',
    'pa': 'b_pa',
    'ab': 'b_ab',
    'r': 'b_r',
    'h': 'b_h',
    'doubles': 'b_doubles',
    'triples': 'b_triples',
    'hr': 'b_hr',
    'rbi': 'b_rbi',
    'sb': 'b_sb',
    'cs': 'b_cs',
    'bb': 'b_bb',
    'so': 'b_so',
    'ba': 'b_batting_avg',
    'obp': 'b_onbase_perc',
    'slg': 'b_slugging_perc',
    'ops': 'b_onbase_plus_slugging',
    'ops_plus': 'b_onbase_plus_slugging_plus',
    'roba': 'b_roba',
    'rbat_plus': 'b_rbat_plus',
    'tb': 'b_tb',
    'gidp': 'b_gidp',
    'hbp': 'b_hbp',
    'sh': 'b_sh',
    'sf': 'b_sf',
    'ibb': 'b_ibb',
    'pos': 'pos',
}


def _cell_text(row, stat):
    """Return the stripped text of the row's <td> whose data-stat is `stat`.

    Returns "" when the row has no such cell, so a missing column yields an
    empty value instead of an AttributeError.
    """
    cell = row.find('td', attrs={'data-stat': stat})
    return cell.get_text(strip=True) if cell is not None else ""


# Created empty here and not filled by this cell.
players_df = pd.DataFrame(data=None, index=None, columns=[])

# Start from an empty list on every run: the cell then works on a fresh kernel
# and re-running it does not append a second copy of every player.
players = []
index = 0          # number of players collected so far
last_rank = None   # rank of the most recently collected player

for row in tbody.find_all('tr'):
    # Rows carrying the 'thead' class are skipped (presumably repeated header rows).
    if 'thead' in (row.get('class') or []):
        continue

    # A row without a linked name in its name cell holds no player information.
    name_td = row.find('td', attrs={'data-stat': 'name_display'})
    if name_td is None or name_td.a is None:
        continue

    # The same player may appear in several consecutive rows sharing one rank;
    # only the first of those rows is kept. Comparing against the last accepted
    # rank (rather than a running count) stays correct even if a ranked row was
    # skipped above. int() raises ValueError if the rank cell is not numeric.
    rank = int(row.find('th').get_text(strip=True))
    if rank == last_rank:
        continue
    last_rank = rank

    stats = {field: _cell_text(row, stat) for field, stat in PLAYER_STAT_COLUMNS.items()}
    players.append(Player(name=name_td.a.get_text(strip=True), **stats))
    index += 1

print("Number of players: ", len(players))


Number of players:  1208


In [None]:
# Most home runs
