In [1]:
import pandas as pd

In [2]:
import requests
from bs4 import BeautifulSoup as bs
import pandas as pd
from datetime import datetime
# gather urls for each year
def yearLinks(soup):
    """
    Each year data for a player shown in a different page. 
    This function creates a list of urls from a given soup object of the player's page.
    Parameters
    --------
    soup: bs4.BeautifulSoup object. Created from player's page html.
    Returns
    ---------
    links: list. List of strings. Each string links to the player's page for different year's data.
    """
    links = []
    base_url = 'https://www.cbssports.com'
    drop_down = soup.find_all('ul', 'Dropdown-list')[0]
    for url in drop_down.find_all('a'):
        links.append(base_url + url['href'] )
    return links

# get name and id from url
def player_info_from_url(url):
    player_name = url.split('/')[-2]
    player_id = url.split('/')[-3]
    return player_name, player_id

# given year, player_name and id create the url.
def create_url(year, player_name, player_id):
    base = 'https://www.cbssports.com/'
    extension = f'fantasy/baseball/players/game-log/{year}/{player_id}/{player_name}/'
    return base+extension

# from a url create soup object
def make_soup(url):
    response = requests.get(url)
    soup = bs(response.content, features="html.parser")
    return soup

def cell_reader(cells):
    row_data = []
    for c in cells:
        if c.text:
            row_data.append(c.text)
        else:
            row_data.append(c.find('a')['href'])
    return  row_data

header = ['DATE','TEAM','home/away', 'OPPONENT','RESULT',
        'AB','R' ,'H','2B','3B','HR','RBI','BB','K',
        'SB','CS','SH','SF','HBP','AVG','SLG','YTDOBP',
        'FPTS']
new_header = header + ['date']

def scraper(url):
    # to keep the list of years a player played in MLB
    years = []
    player_name, player_id = player_info_from_url(url)
    # create an url for one of the player's pages.
    player_url = create_url('2021', player_name, player_id)
        # create soup
    soup = make_soup(player_url)
        # create links for each year
    links = yearLinks(soup)
    body = []
    for url in links:
        # get the year
        year = url.split('/')[-4]
        # soup for a particular year data
        soup = make_soup(url)
        try:
            years.append(year)
            table = soup.find('table', 'data compact')
            for month in table.findAll('tbody'):
                for row in month.findAll('tr'):
                    cells = row.findAll('td')
                    row_data = cell_reader(cells)
                    day = row_data[0]
                    date = datetime.strptime(day+ '/' + year, '%m/%d/%Y' )
                    row_data.append(date)
                    body.append(row_data)
        except:
            print('No content for:', url)
    df = pd.DataFrame(body, columns= new_header)
    df.loc[:, 'AB': 'HBP'] = df.loc[:,'AB': 'HBP'].astype(int)
    df.loc[:, 'AVG': 'FPTS'] = df.loc[:, 'AVG': 'FPTS'].astype(float)
    df['dayofweek'] = df.date.dt.dayofweek
    df['Year'] = df.date.dt.year
    return df, years

# Hits divided by At Bats
def daily_avg(df):
    avg = df.H/df.AB
    return avg.fillna(0)

# Hits + Walks + Hit By Pitch) divided by (At-bats + Walks + Hit By Pitch + Sacrifices)
def daily_obp(df):
    numerator = df.H + df.BB + df.HBP
    denom = df.AB + df.BB + df.HBP + df.SF
    return (numerator/denom).fillna(0)

# Total number of bases divided by At-bats
def daily_slg(df):
    first_base = df.H - (df['2B'] + df['3B'] + df['HR'])
    num = first_base + 2 * df['2B'] + 3*df['3B'] + df['HR']
    return (num/df.AB).fillna(0)

# returns the final version of the data
def cbs_data(url):
    df, years = scraper(url)
    df['daily_avg'] = daily_avg(df)
    df['daily_slg'] = daily_slg(df)
    df['daily_obp'] = daily_obp(df)
    return df, years

    

In [3]:
url = 'https://www.cbssports.com/fantasy/baseball/players/game-log/2021/2211777/ronald-acuna/'

In [4]:
df, years = cbs_data(url)

In [10]:
df.set_index('date', inplace = True)

In [11]:
dataframe = df.copy()


In [17]:
df.head()

Unnamed: 0_level_0,DATE,TEAM,home/away,OPPONENT,RESULT,AB,R,H,2B,3B,...,HBP,AVG,SLG,YTDOBP,FPTS,dayofweek,Year,daily_avg,daily_slg,daily_obp
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2021-04-01,04/01,/mlb/teams/ATL/atlanta-braves/,@,/mlb/teams/PHI/philadelphia-phillies/,L 3-2,4,0,2,1,0,...,0,0.5,0.75,0.5,3.5,3,2021,0.5,0.75,0.6
2021-04-03,04/03,/mlb/teams/ATL/atlanta-braves/,@,/mlb/teams/PHI/philadelphia-phillies/,L 4-0,4,0,0,0,0,...,0,0.25,0.375,0.3,-1.0,5,2021,0.0,0.0,0.0
2021-04-04,04/04,/mlb/teams/ATL/atlanta-braves/,@,/mlb/teams/PHI/philadelphia-phillies/,L 2-1,4,0,0,0,0,...,0,0.167,0.25,0.214,-1.0,6,2021,0.0,0.0,0.0
2021-04-06,04/06,/mlb/teams/ATL/atlanta-braves/,@,/mlb/teams/WAS/washington-nationals/,L 6-5,5,2,2,0,0,...,0,0.235,0.647,0.263,13.0,1,2021,0.4,0.4,0.4
2021-04-07,04/07,/mlb/teams/ATL/atlanta-braves/,@,/mlb/teams/WAS/washington-nationals/,W 7-6,6,2,3,1,0,...,0,0.304,0.652,0.333,11.5,2,2021,0.5,0.666667,0.571429


In [26]:
dataframe.loc['2018'].daily_avg.rolling(4).mean()

date
2018-04-25         NaN
2018-04-26         NaN
2018-04-27         NaN
2018-04-28    0.362500
2018-04-29    0.479167
                ...   
2018-09-26    0.208333
2018-09-27    0.250000
2018-09-28    0.375000
2018-09-29    0.375000
2018-09-30    0.395833
Name: daily_avg, Length: 109, dtype: float64

In [31]:
df.head()

Unnamed: 0_level_0,DATE,TEAM,home/away,OPPONENT,RESULT,AB,R,H,2B,3B,...,HBP,AVG,SLG,YTDOBP,FPTS,dayofweek,Year,daily_avg,daily_slg,daily_obp
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2021-04-01,04/01,/mlb/teams/ATL/atlanta-braves/,@,/mlb/teams/PHI/philadelphia-phillies/,L 3-2,4,0,2,1,0,...,0,0.5,0.75,0.5,3.5,3,2021,0.5,0.75,0.6
2021-04-03,04/03,/mlb/teams/ATL/atlanta-braves/,@,/mlb/teams/PHI/philadelphia-phillies/,L 4-0,4,0,0,0,0,...,0,0.25,0.375,0.3,-1.0,5,2021,0.0,0.0,0.0
2021-04-04,04/04,/mlb/teams/ATL/atlanta-braves/,@,/mlb/teams/PHI/philadelphia-phillies/,L 2-1,4,0,0,0,0,...,0,0.167,0.25,0.214,-1.0,6,2021,0.0,0.0,0.0
2021-04-06,04/06,/mlb/teams/ATL/atlanta-braves/,@,/mlb/teams/WAS/washington-nationals/,L 6-5,5,2,2,0,0,...,0,0.235,0.647,0.263,13.0,1,2021,0.4,0.4,0.4
2021-04-07,04/07,/mlb/teams/ATL/atlanta-braves/,@,/mlb/teams/WAS/washington-nationals/,W 7-6,6,2,3,1,0,...,0,0.304,0.652,0.333,11.5,2,2021,0.5,0.666667,0.571429


In [30]:
df.H.cumsum()/ df.AB.cumsum()

date
2021-04-01    0.500000
2021-04-03    0.250000
2021-04-04    0.166667
2021-04-06    0.235294
2021-04-07    0.304348
                ...   
2018-09-26    0.279520
2018-09-27    0.280106
2018-09-28    0.280981
2018-09-29    0.280899
2018-09-30    0.281003
Length: 385, dtype: float64

In [27]:
import plotly.graph_objects as go
import streamlit as st
import numpy as np
# Create visualization
fig = go.Figure()
for year in ['2019', '2020']:
    year_data_length = dataframe.loc[year].shape[0]
    x = np.arange(year_data_length)
    year_data = dataframe.loc[year, 'daily_avg'].rolling(4, ).mean()
    fig.add_trace(go.Scatter(x=x, y=year_data,
                    mode='lines',
                    name= year + " " + 'daily_avg'))



In [28]:
fig.show()

In [None]:
avg = df.H.astype(int)/df.AB.astype(int)
t = avg.fillna(0)