In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from bs4 import BeautifulSoup
from selenium import webdriver
import requests

## Populate the list of players with all players with their stats

In [3]:
url = 'https://www.baseball-reference.com/leagues/majors/2025-standard-batting.shtml'
driver = webdriver.Chrome()
driver.get(url)
html = driver.page_source
soup = BeautifulSoup(html)
table = soup.find('table', id='players_standard_batting')
tbody = table.find('tbody')

class Player:
    def __init__(self, name, age, team, league, war, games_played, pa, ab, r, h, doubles, triples, hr, rbi, sb, cs, bb, so, ba, obp, slg, ops, ops_plus, roba, rbat_plus, tb, gidp, hbp, sh, sf, ibb, pos):
        self.name = name
        self.age = age
        self.team = team
        self.league = league
        self.war = war
        self.games_played = games_played
        self.pa = pa
        self.ab = ab
        self.r = r
        self.h = h
        self.doubles = doubles
        self.triples = triples
        self.hr = hr
        self.rbi = rbi
        self.sb = sb
        self.cs = cs
        self.bb = bb
        self.so = so
        self.ba = ba
        self.obp = obp
        self.slg = slg
        self.ops = ops
        self.ops_plus = ops_plus
        self.roba = roba
        self.rbat_plus = rbat_plus
        self.tb = tb
        self.gidp = gidp
        self.hbp = hbp
        self.sh = sh
        self.sf = sf
        self.ibb = ibb
        self.pos = pos

def try_castint(string):
    try: 
        return int(string)
    except:
        return 0

def try_castfloat(string):
    try: 
        return float(string)
    except:
        return 0.0

players_df = pd.DataFrame(data=None, index=None, columns=['name','age','team','league','war','games_played','pa','ab','r','h','doubles','triples','hr','rbi','sb','cs','bb','so','ba','obp','slg','ops','ops_plus','roba','rbat_plus','tb','gidp','hbp','sh','sf','ibb','pos'])

index = 0
for p in tbody.find_all('tr'):
    if p.get('class') == None or 'thead' not in p.get('class'):
        player_th = p.find('th')
        name_td = p.find('td', attrs={'data-stat':'name_display'})
        age_td = p.find('td', attrs={'data-stat':'age'})        
        team_td = p.find('td', attrs={'data-stat':'team_name_abbr'})
        league_td = p.find('td', attrs={'data-stat': 'comp_name_abbr'})
        war_td = p.find('td', attrs={'data-stat': 'b_war'})
        games_played_td = p.find('td', attrs={'data-stat': 'b_games'})
        pa_td = p.find('td', attrs={'data-stat': 'b_pa'})
        ab_td = p.find('td', attrs={'data-stat': 'b_ab'})
        r_td = p.find('td', attrs={'data-stat': 'b_r'})
        h_td = p.find('td', attrs={'data-stat': 'b_h'})
        doubles_td = p.find('td', attrs={'data-stat': 'b_doubles'})
        triples_td = p.find('td', attrs={'data-stat': 'b_triples'})
        hr_td = p.find('td', attrs={'data-stat': 'b_hr'})
        rbi_td = p.find('td', attrs={'data-stat': 'b_rbi'})
        sb_td = p.find('td', attrs={'data-stat': 'b_sb'})
        cs_td = p.find('td', attrs={'data-stat': 'b_cs'})
        bb_td = p.find('td', attrs={'data-stat': 'b_bb'})
        so_td = p.find('td', attrs={'data-stat': 'b_so'})
        ba_td = p.find('td', attrs={'data-stat': 'b_batting_avg'})
        obp_td = p.find('td', attrs={'data-stat': 'b_onbase_perc'})
        slg_td = p.find('td', attrs={'data-stat': 'b_slugging_perc'})
        ops_td = p.find('td', attrs={'data-stat': 'b_onbase_plus_slugging'})
        ops_plus_td = p.find('td', attrs={'data-stat': 'b_onbase_plus_slugging_plus'})
        roba_td = p.find('td', attrs={'data-stat': 'b_roba'})
        rbat_plus_td = p.find('td', attrs={'data-stat': 'b_rbat_plus'})
        tb_td = p.find('td', attrs={'data-stat': 'b_tb'})
        gidp_td = p.find('td', attrs={'data-stat': 'b_gidp'})
        hbp_td = p.find('td', attrs={'data-stat': 'b_hbp'})
        sh_td = p.find('td', attrs={'data-stat': 'b_sh'})
        sf_td = p.find('td', attrs={'data-stat': 'b_sf'})
        ibb_td = p.find('td', attrs={'data-stat': 'b_ibb'})
        pos_td = p.find('td', attrs={'data-stat': 'pos'})

        # If there's no content for name, means the row doesn't contain player information so we will skip it
        if name_td.a is None:
            continue
        # rank information, same player may appear in multiple rows with same rank so we will avoid duplicates
        if int(player_th.get_text(strip=True)) == index:
            continue
            
        name = name_td.a.get_text(strip=True)
        age = try_castint(age_td.get_text(strip=True))
        team = team_td.get_text(strip=True)
        league = league_td.get_text(strip=True)
        war = try_castfloat(war_td.get_text(strip=True))
        games_played = try_castint(games_played_td.get_text(strip=True))
        pa = try_castint(pa_td.get_text(strip=True))
        ab = try_castint(ab_td.get_text(strip=True))
        r = try_castint(r_td.get_text(strip=True))
        h = try_castint(h_td.get_text(strip=True))
        doubles = try_castint(doubles_td.get_text(strip=True))
        triples = try_castint(triples_td.get_text(strip=True))
        hr = try_castint(hr_td.get_text(strip=True))
        rbi = try_castint(rbi_td.get_text(strip=True))
        sb = try_castint(sb_td.get_text(strip=True))
        cs = try_castint(cs_td.get_text(strip=True))
        bb = try_castint(bb_td.get_text(strip=True))
        so = try_castint(so_td.get_text(strip=True))
        ba = try_castfloat(ba_td.get_text(strip=True))
        obp = try_castfloat(obp_td.get_text(strip=True))
        slg = try_castfloat(slg_td.get_text(strip=True))
        ops = try_castfloat(ops_td.get_text(strip=True))
        ops_plus = try_castint(ops_plus_td.get_text(strip=True))
        roba = try_castfloat(roba_td.get_text(strip=True))
        rbat_plus = try_castint(rbat_plus_td.get_text(strip=True))
        tb = try_castint(tb_td.get_text(strip=True))
        gidp = try_castint(gidp_td.get_text(strip=True))
        hbp = try_castint(hbp_td.get_text(strip=True))
        sh = try_castint(sh_td.get_text(strip=True))
        sf = try_castint(sf_td.get_text(strip=True))
        ibb = try_castint(ibb_td.get_text(strip=True))
        pos = pos_td.get_text(strip=True)
        
        players_df.loc[len(players_df)] = [name, age, team, league, war, games_played, pa, ab, r, h, doubles, triples, hr, rbi, sb, cs, bb, so, ba, obp, slg, ops, ops_plus, roba, rbat_plus, tb, gidp, hbp, sh, sf, ibb, pos]
        index+=1
    else:
        continue
driver.close()

In [4]:
players_df

Unnamed: 0,name,age,team,league,war,games_played,pa,ab,r,h,...,ops_plus,roba,rbat_plus,tb,gidp,hbp,sh,sf,ibb,pos
0,Jarren Duran,28,BOS,AL,1.5,71,332,304,38,81,...,107,0.336,111,128,3,5,0,2,0,*7/89H
1,Rafael Devers,28,BOS,AL,2.2,72,330,269,46,73,...,149,0.388,152,133,5,4,0,2,7,*D
2,Shohei Ohtani,30,LAD,NL,3.4,69,322,276,71,80,...,185,0.428,181,176,3,1,0,2,9,*D
3,Bo Bichette,27,TOR,AL,0.9,70,317,295,32,79,...,102,0.322,104,122,6,2,0,2,1,*6/DH
4,Jackson Chourio,21,MIL,NL,0.8,71,317,305,43,79,...,106,0.329,107,139,9,1,0,1,0,*879/D
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
605,Tanner Scott,30,LAD,NL,0.0,1,0,0,0,0,...,0,0.000,0,0,0,0,0,0,0,1
606,Ryan Thompson,33,ARI,NL,0.0,1,0,0,0,0,...,0,0.000,0,0,0,0,0,0,0,1
607,Luis Vázquez,25,BAL,AL,0.0,1,0,0,0,0,...,0,0.000,0,0,0,0,0,0,0,/DH
608,Ryan Walker,29,SFG,NL,0.0,1,0,0,0,0,...,0,0.000,0,0,0,0,0,0,0,1


## Team DataFrame

In [11]:
team_table = soup.find('table', id='team_standard_batting')

teams_df = pd.DataFrame(data=None, index=None, columns=['team', 'num_batters_used', 'batters_avg_age', 'runs_per_game', 'games', 'pa', 'ab', 'r', 'h', '2b', '3b', 'hr', 'rbi', 'sb', 'cs', 'bb', 'so', 'ba', 'obp', 'ops', 'ops_plus', 'tb', 'gidp', 'hbp', 'sh', 'sf', 'ibb', 'lob'])

## Most Home Runs AL

In [5]:
# Most home runs
pd.options.display.max_columns = 100
pd.options.display.max_rows = 10

top_hr_al = players_df[players_df['league'] == 'AL']
top_hr_al = top_hr_al.sort_values('hr', ascending=False)

top_hr_al.head(10)

Unnamed: 0,name,age,team,league,war,games_played,pa,ab,r,h,doubles,triples,hr,rbi,sb,cs,bb,so,ba,obp,slg,ops,ops_plus,roba,rbat_plus,tb,gidp,hbp,sh,sf,ibb,pos
30,Cal Raleigh,28,SEA,AL,3.4,68,298,251,45,66,12,0,26,54,7,2,42,76,0.263,0.376,0.622,0.997,187,0.42,186,156,3,4,0,1,5,*2D/H
14,Aaron Judge,33,NYY,AL,5.6,69,307,258,65,99,17,2,26,60,6,3,45,75,0.384,0.479,0.767,1.246,244,0.512,250,198,7,3,0,1,14,*9D/H
53,Taylor Ward,31,LAA,AL,0.7,67,286,259,38,55,13,1,18,47,1,1,21,79,0.212,0.269,0.479,0.748,106,0.324,106,124,7,1,0,5,1,*7/D
52,Spencer Torkelson,25,DET,AL,1.4,68,286,241,40,57,15,0,16,47,1,0,36,70,0.237,0.346,0.498,0.844,136,0.359,132,120,7,6,0,3,5,*3/DH
78,Junior Caminero,21,TBR,AL,1.0,66,276,257,38,63,15,0,16,42,5,1,15,50,0.245,0.286,0.49,0.777,117,0.348,124,126,17,1,0,3,0,*5/DH3
42,Isaac Paredes,26,HOU,AL,2.2,67,292,251,37,64,9,1,15,42,0,1,36,50,0.255,0.353,0.478,0.831,131,0.362,134,120,7,3,0,2,0,*5/D
6,Brent Rooker,30,ATH,AL,1.5,72,316,285,44,78,15,1,15,41,2,1,29,66,0.274,0.345,0.491,0.836,132,0.367,136,140,7,2,0,0,0,*D/97H
161,Logan O'Hoppe,25,LAA,AL,0.0,58,218,208,21,48,2,0,14,31,0,0,8,73,0.231,0.261,0.442,0.704,94,0.31,96,92,4,1,0,1,0,2/DH
1,Rafael Devers,28,BOS,AL,2.2,72,330,269,46,73,18,0,14,57,1,1,55,76,0.271,0.4,0.494,0.894,149,0.388,152,133,5,4,0,2,7,*D
21,Tyler Soderstrom,23,ATH,AL,1.4,72,305,268,38,68,11,1,14,45,4,0,32,73,0.254,0.341,0.459,0.8,123,0.349,121,123,11,4,0,1,1,37/DH


## Most Home Runs NL

In [6]:
top_hr_nl = players_df[players_df['league'] == 'NL']
top_hr_nl = top_hr_nl.sort_values('hr', ascending=False)

top_hr_nl.head(10)

Unnamed: 0,name,age,team,league,war,games_played,pa,ab,r,h,doubles,triples,hr,rbi,sb,cs,bb,so,ba,obp,slg,ops,ops_plus,roba,rbat_plus,tb,gidp,hbp,sh,sf,ibb,pos
2,Shohei Ohtani,30,LAD,NL,3.4,69,322,276,71,80,11,5,25,41,11,4,43,79,0.29,0.385,0.638,1.023,185,0.428,181,176,3,1,0,2,9,*D
12,Kyle Schwarber,32,PHI,NL,2.0,70,309,256,49,62,9,1,22,50,5,1,49,82,0.242,0.372,0.543,0.915,151,0.396,150,139,1,4,0,0,4,*D/7
50,Eugenio Suárez,33,ARI,NL,1.7,70,287,255,42,59,13,0,20,55,1,0,21,74,0.231,0.307,0.518,0.824,126,0.362,128,132,5,8,0,3,1,*5/H
10,Corbin Carroll,24,ARI,NL,2.7,69,310,277,55,71,12,8,20,44,10,4,27,75,0.256,0.335,0.574,0.909,149,0.389,148,159,2,6,0,0,1,*9/H
35,Pete Crow-Armstrong,23,CHC,NL,4.1,70,295,276,51,75,16,3,18,57,23,3,13,73,0.272,0.305,0.547,0.852,140,0.382,142,151,1,1,3,2,0,*8
31,Seiya Suzuki,30,CHC,NL,1.6,67,298,268,37,71,18,2,17,57,2,2,25,79,0.265,0.322,0.537,0.859,143,0.372,145,144,6,0,0,5,0,D79
15,James Wood,22,WSN,NL,2.9,70,307,266,42,75,17,0,17,49,9,3,38,82,0.282,0.371,0.538,0.909,157,0.392,155,143,5,1,0,2,2,*7D
9,Pete Alonso,30,NYM,NL,2.8,71,314,266,42,79,22,1,17,63,0,2,35,65,0.297,0.395,0.579,0.974,177,0.412,175,154,11,10,0,3,2,*3
19,Elly De La Cruz,23,CIN,NL,2.3,71,305,270,54,72,14,0,15,47,20,6,32,81,0.267,0.348,0.485,0.833,123,0.369,123,131,3,2,0,1,5,*6/DH
25,Matt Olson,31,ATL,NL,2.9,69,301,255,41,63,16,0,14,43,0,0,41,69,0.247,0.352,0.475,0.827,129,0.359,127,121,9,2,0,3,1,*3


## Qualified Batters

To be a qualified batter, must have 3.1 PA times the number of team games played. Source: MLB

In [8]:
qualified = players_df[players_df['pa'].astype(float) >= players_df['games_played'] * 3.1]

In [9]:
qualified

Unnamed: 0,name,age,team,league,war,games_played,pa,ab,r,h,doubles,triples,hr,rbi,sb,cs,bb,so,ba,obp,slg,ops,ops_plus,roba,rbat_plus,tb,gidp,hbp,sh,sf,ibb,pos
0,Jarren Duran,28,BOS,AL,1.5,71,332,304,38,81,18,7,5,38,14,5,21,78,0.266,0.322,0.421,0.743,107,0.336,111,128,3,5,0,2,0,*7/89H
1,Rafael Devers,28,BOS,AL,2.2,72,330,269,46,73,18,0,14,57,1,1,55,76,0.271,0.400,0.494,0.894,149,0.388,152,133,5,4,0,2,7,*D
2,Shohei Ohtani,30,LAD,NL,3.4,69,322,276,71,80,11,5,25,41,11,4,43,79,0.290,0.385,0.638,1.023,185,0.428,181,176,3,1,0,2,9,*D
3,Bo Bichette,27,TOR,AL,0.9,70,317,295,32,79,17,1,8,40,4,3,18,52,0.268,0.312,0.414,0.726,102,0.322,104,122,6,2,0,2,1,*6/DH
4,Jackson Chourio,21,MIL,NL,0.8,71,317,305,43,79,18,3,12,42,13,5,10,66,0.259,0.284,0.456,0.740,106,0.329,107,139,9,1,0,1,0,*879/D
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
539,Christian Moore,22,LAA,AL,0.0,2,7,6,0,0,0,0,0,0,0,0,1,3,0.000,0.143,0.000,0.143,-55,0.098,-60,0,0,0,0,0,0,/4
540,Dom Nuñez,30,CLE,AL,0.0,2,7,7,2,2,0,0,0,0,0,0,0,4,0.286,0.286,0.286,0.571,63,0.228,35,2,0,0,0,0,0,/2
541,Brett Sullivan,31,PIT,NL,0.1,2,7,4,1,1,0,0,0,2,0,0,1,2,0.250,0.429,0.250,0.679,96,0.333,108,1,0,1,0,1,0,/2
551,Brewer Hicklen,29,DET,AL,0.1,1,4,3,2,2,0,0,0,0,1,0,1,1,0.667,0.750,0.667,1.417,308,0.658,307,2,0,0,0,0,0,/8
