In [1]:
# Turn off the Jedi backend of IPython's tab completer for this session.
%config Completer.use_jedi = False

In [37]:
from bs4 import BeautifulSoup
import pandas as pd
import requests
import datetime
import time

In [6]:
# Root URL of the per-season league pages; get_yearly_statistics appends
# "/<year>.shtml" to it to build the address of one season.
BASE_URL = "https://www.baseball-reference.com/leagues/majors"

In [7]:
def get_date_time():
    """Return the current local time as ``YYYY-MM-DD HH:MM:SS.ffffff``."""
    timestamp_format = "%Y-%m-%d %H:%M:%S.%f"
    current_moment = datetime.datetime.now()
    return current_moment.strftime(timestamp_format)

In [10]:
def log(message):
    """Print ``message`` to stdout, prefixed with ``[*]`` and the current timestamp."""
    timestamp = get_date_time()
    print(f"[*] {timestamp} - {message}")

In [40]:
def get_yearly_statistics(year, timeout=30):
    """Scrape one season's team batting table into a DataFrame.

    Downloads ``{BASE_URL}/{year}.shtml``, finds every ``<table>`` whose id is
    ``teams_standard_batting`` and converts each team row into one record.

    Parameters
    ----------
    year : int or str
        Season to fetch. It is interpolated into the URL and stored unchanged
        in the ``YEAR`` column.
    timeout : float, optional
        Seconds to wait for the server before giving up (default 30), so a
        stalled connection cannot hang the notebook indefinitely.

    Returns
    -------
    pandas.DataFrame
        One row per team with the columns listed in ``columns`` below. Cell
        values are the page's text as strings (newlines removed); no numeric
        conversion is performed. The frame is empty, but still has all the
        columns, when no matching rows are found.

    Raises
    ------
    requests.HTTPError
        If the server responds with a 4xx/5xx status (e.g. rate limiting),
        instead of silently parsing the error page into an empty frame.
    requests.RequestException
        On connection errors or when ``timeout`` is exceeded.
    """
    columns = ['YEAR', 'TM', '#BAT', 'BATAGE', 'RPG', 'G',
               'PA', 'AB', 'R', 'H', '2B',
               '3B', 'HR', 'RBI', 'SB', 'CS',
               'BB', 'SO', 'BA', 'OBP', 'SLG',
               'OPS', 'OPS+', 'TB', 'GDP', 'HBP',
               'SH', 'SF', 'IBB', 'LOB']

    log(f"getting statistics for year: {year}")
    url = f"{BASE_URL}/{year}.shtml"
    log(f"url={url}")
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    bs = BeautifulSoup(response.text, "html.parser")

    rows = []
    for tbl in bs.find_all('table', {'id': 'teams_standard_batting'}):
        for tr in tbl.find_all('tr'):
            header = tr.find('th')
            if header is None:
                # A row without a <th> has no team label to record.
                continue
            team = header.text
            # Skip the aggregate "League Average" row and rows whose label is
            # blank. The original guard compared row[0] (the year) with "",
            # which is always true, so blank-label rows were never filtered.
            if team.strip().lower() in ("", "league average"):
                continue
            row = [year, team]
            row.extend(td.text.replace('\n', '') for td in tr.find_all('td'))
            # Header rows carry no <td> cells and come out short; keep only
            # rows that fill every column.
            if len(row) == len(columns):
                rows.append(row)

    return pd.DataFrame(rows, columns=columns)

In [41]:
# Seasons to scrape (inclusive) and the pause between consecutive requests.
# NOTE(review): the pause is presumably there to stay polite to the remote
# server / avoid rate limiting - confirm before lowering it.
FIRST_YEAR, LAST_YEAR = 2000, 2021
REQUEST_DELAY_SECONDS = 10

statistics = []
for year in range(FIRST_YEAR, LAST_YEAR + 1):
    statistics.append(get_yearly_statistics(year))
    # No need to wait once the final season has been fetched.
    if year < LAST_YEAR:
        log("sleeping")
        time.sleep(REQUEST_DELAY_SECONDS)

# ignore_index=True gives the combined frame a fresh 0..n-1 index; without it
# every season restarts at 0 and the index labels are duplicated.
df = pd.concat(statistics, ignore_index=True)

[*] 2022-05-12 14:02:25.438880 - getting statistics for year: 2000
[*] 2022-05-12 14:02:25.439188 - url=https://www.baseball-reference.com/leagues/majors/2000.shtml
[*] 2022-05-12 14:02:25.815133 - sleeping
[*] 2022-05-12 14:02:35.819364 - getting statistics for year: 2001
[*] 2022-05-12 14:02:35.819632 - url=https://www.baseball-reference.com/leagues/majors/2001.shtml
[*] 2022-05-12 14:02:36.077657 - sleeping
[*] 2022-05-12 14:02:46.081590 - getting statistics for year: 2002
[*] 2022-05-12 14:02:46.081709 - url=https://www.baseball-reference.com/leagues/majors/2002.shtml
[*] 2022-05-12 14:02:46.324805 - sleeping
[*] 2022-05-12 14:02:56.329358 - getting statistics for year: 2003
[*] 2022-05-12 14:02:56.329624 - url=https://www.baseball-reference.com/leagues/majors/2003.shtml
[*] 2022-05-12 14:02:56.588216 - sleeping
[*] 2022-05-12 14:03:06.593702 - getting statistics for year: 2004
[*] 2022-05-12 14:03:06.593973 - url=https://www.baseball-reference.com/leagues/majors/2004.shtml
[*] 202

In [42]:
# Preview the first rows of the combined frame as a quick sanity check.
df.head()

Unnamed: 0,YEAR,TM,#BAT,BATAGE,RPG,G,PA,AB,R,H,...,SLG,OPS,OPS+,TB,GDP,HBP,SH,SF,IBB,LOB
0,2000,Anaheim Angels,45,27.6,5.33,162,6373,5628,864,1574,...,0.472,0.825,105,2659,126,47,47,43,43,1173
1,2000,Arizona Diamondbacks,41,30.8,4.89,162,6241,5527,792,1466,...,0.429,0.763,88,2373,114,59,61,58,37,1128
2,2000,Atlanta Braves,47,30.8,5.0,162,6275,5489,810,1490,...,0.429,0.775,95,2353,127,59,87,45,38,1192
3,2000,Baltimore Orioles,50,32.1,4.9,162,6238,5549,794,1508,...,0.435,0.776,100,2414,148,49,27,54,34,1129
4,2000,Boston Red Sox,52,29.3,4.89,162,6371,5630,792,1503,...,0.423,0.764,90,2384,115,42,40,48,40,1226


In [43]:
# Write the combined frame to an Excel workbook (path is relative to the
# working directory); index=False leaves the row index out of the sheet.
# The file name encodes the 2000-2021 season range scraped above - update it
# if that range changes.
df.to_excel("yearly_batting_2000_2021.xlsx", index=False)