In [1]:
import pandas as pd
import altair as alt

In [2]:
import requests
from bs4 import BeautifulSoup

In [91]:
def parse_hr_log(player, year):
    url = f"https://www.baseball-reference.com/players/gl.fcgi?id={player}&t=b&year={year}"
    r = requests.get(url)
    html = r.text
    soup = BeautifulSoup(html, 'html.parser')
    table = soup.find(class_="stats_table")
    row_list = table.find_all("tr")
    cumulative = 0
    result_list = []
    for row in row_list:
        cell_list = row.find_all("td")
        if not len(cell_list):
            continue
        team_game = row.find('td', attrs={"data-stat": "team_game_num"}).text
        if not team_game:
            continue
        team_game = int(team_game.split()[0])
        homeruns = int(row.find('td', attrs={"data-stat": "HR"}).text)
        cumulative += homeruns
        result = dict(
            player=player,
            year=year,
            season=f"{player}-{year}",
            game=team_game,
            homeruns=homeruns,
            cumulative=cumulative
        )
        result_list.append(result)
    return result_list

In [92]:
season_list = [
    ('bondsba01', 2001),
    ('mcgwima01', 1998),
    ('sosasa01', 1998),
    ('mcgwima01', 1999),
    ('sosasa01', 2001),
    ('sosasa01', 1999),
    ('marisro01', 1961),
    ('ruthba01', 1927),
    ('ruthba01', 1921),
    ('stantmi03', 2017),
    ('judgeaa01', 2022),
]

In [93]:
result_list = []
for player, year in season_list:
    result_list += parse_hr_log(player, year)

In [94]:
df = pd.DataFrame(result_list)

In [95]:
df.head()

Unnamed: 0,player,year,season,game,homeruns,cumulative
0,bondsba01,2001,bondsba01-2001,1,1,1
1,bondsba01,2001,bondsba01-2001,2,0,1
2,bondsba01,2001,bondsba01-2001,3,0,1
3,bondsba01,2001,bondsba01-2001,4,0,1
4,bondsba01,2001,bondsba01-2001,5,0,1


In [96]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1694 entries, 0 to 1693
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   player      1694 non-null   object
 1   year        1694 non-null   int64 
 2   season      1694 non-null   object
 3   game        1694 non-null   int64 
 4   homeruns    1694 non-null   int64 
 5   cumulative  1694 non-null   int64 
dtypes: int64(4), object(2)
memory usage: 79.5+ KB


In [None]:
df.to_csv("hr-leader-logs.csv", index=False)

In [101]:
alt.Chart(df).mark_line(interpolate='step-after').encode(
    x=alt.X("game:O"),
    y=alt.Y("cumulative:Q"),
    color=alt.Color("season:N")
).properties(width=500)

In [104]:
filtered_df = df[df.season.isin(['bondsba01-2001', 'marisro01-1961', 'judgeaa01-2022'])]

In [106]:
names = {
    "bondsba01-2001": "Bonds '01",
    "marisro01-1961": "Maris '61",
    "judgeaa01-2022": "Judge '22"
}

In [107]:
filtered_df['label'] = filtered_df.season.map(names)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_df['label'] = filtered_df.season.map(names)


In [108]:
alt.Chart(filtered_df).mark_line(interpolate='step-after').encode(
    x=alt.X("game:O"),
    y=alt.Y("cumulative:Q"),
    color=alt.Color("label:N")
).properties(width=500)

In [122]:
pivot = filtered_df[['label', 'game', 'cumulative']].pivot_table(columns="label", index="game", values="cumulative")

In [123]:
pivot

label,Bonds '01,Judge '22,Maris '61
game,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,1.0,0.0,0.0
2,1.0,0.0,0.0
3,1.0,0.0,0.0
4,1.0,0.0,0.0
5,1.0,0.0,0.0
...,...,...,...
159,70.0,,60.0
160,72.0,,
161,72.0,,60.0
162,73.0,,60.0


In [124]:
pivot["Maris '61"].ffill(inplace=True)
pivot["Bonds '01"].ffill(inplace=True)

In [125]:
pivot.to_csv("hr-chart-logs.csv", index=True)