This notebook is where we test code for any step of the data pipeline (acquire, clean, explore, test/model, etc.).

If we later want a cleaner notebook of our findings, visualizations, conclusions, etc., we can make another notebook called something like `report.ipynb`.

Imports go below. Add a short comment explaining what an import is for if it isn't a common library/module.

In [1]:
import pandas as pd
from bs4 import BeautifulSoup
import requests
# let pandas display up to 100 columns of a DataFrame before truncating the output
pd.set_option('display.max_columns', 100)

In [2]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.keys import Keys

from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

In [3]:
import acquire

## Acquire data

Data from [Baseball Reference](https://www.baseball-reference.com)


Use this area to test code that will help you write data acquisition functions.

## Clean/Wrangle data

Test code to clean data. If you want to clean the data a certain way later on for a certain ML model or stats test, you can come back to this section.

## Explore data (EDA)

Test code to explore data. Analyze and make visualizations.

In [None]:
# Lower-triangle correlation heatmap of `df`.
# NOTE: this cell needs `np` (numpy), `sns` (seaborn), `plt` (matplotlib.pyplot)
# and a DataFrame `df` in scope; none of them are defined in the cells above.

# compute the correlation matrix once and reuse it for both the mask and the plot
corr = df.corr()

# True on and above the diagonal, so each pair of columns is annotated only once
mask = np.zeros_like(corr, dtype=bool)
mask[np.triu_indices_from(mask)] = True

with sns.axes_style("white"):
    f, ax = plt.subplots(figsize=(14, 8))
    ax = sns.heatmap(corr, mask=mask, cmap='coolwarm',
                     linewidths=.5, annot=True)

## Tests

Test code here if you want to run stats tests.

## Models

Test code here if you want to run ML models.

## Conclusions

<div style="border: 5px solid black;"></div>

Published Functions

Use this section to refer to published functions. Feel free to copy them to either improve them or come up with new ideas and functions. **Don't edit**

#### `Acquire.py`

Function to read the tables on a web page into pandas DataFrames

In [100]:
def read_website_tables(url, webdriver_path = '~/chromedriver'):
    """
        reads tables from a url and returns them as DataFrames

        The page is loaded in headless Chrome through Selenium, its page
        source is parsed with BeautifulSoup, and every <table> element found
        is passed to pd.read_html().

        parameters:
        url: address of the page to load
        webdriver_path: path to the chromedriver executable; a leading '~'
            is expanded to the current user's home directory

        returns:
        list of DataFrames as produced by pd.read_html()

        raises:
        ValueError if the page source contains no <table> elements

        necessary imports:
        import pandas as pd
        from bs4 import BeautifulSoup
        from selenium import webdriver
        from selenium.webdriver.chrome.service import Service
        from selenium.webdriver.common.by import By
        from selenium.webdriver.chrome.options import Options
        from selenium.webdriver.common.action_chains import ActionChains
        from selenium.webdriver.common.keys import Keys
    """
    # imported locally so the function does not depend on extra notebook-level imports
    import os
    from io import StringIO

    # '~' is only expanded by a shell, so resolve it before handing the path to Selenium
    driver_path = os.path.expanduser(webdriver_path)

    # Set up the Selenium driver with options
    options = Options()
    options.add_argument('--headless')  # Run in headless mode
    driver = webdriver.Chrome(service=Service(driver_path), options=options)

    try:
        # Load the webpage
        driver.get(url)

        # No explicit wait is performed: the source is read as soon as get() returns.
        # Use driver.implicitly_wait() or WebDriverWait here if content loads late.
        source = driver.page_source
    finally:
        # Always shut the browser down, even when loading the page fails
        driver.quit()

    # Parse the page source with BeautifulSoup
    soup = BeautifulSoup(source, 'lxml')

    # Find all the table elements
    table_elements = soup.find_all('table')
    if not table_elements:
        raise ValueError(f'No tables found at {url}')

    # Join the HTML of every table into one document for pd.read_html()
    html_tables = '\n'.join(str(table) for table in table_elements)

    # Wrap in StringIO: passing literal HTML as a plain string is deprecated in recent pandas
    dfs = pd.read_html(StringIO(html_tables))

    return dfs


Save data. Pass in df and specify file name (without extension).

In [None]:
def save_data(df, filename):
    """
        write df to '<filename>.csv' without the index column

        df: DataFrame to write
        filename: target file name (or path) without the '.csv' extension
    """
    csv_path = '{}.csv'.format(filename)
    df.to_csv(csv_path, index=False)

Gets team acronyms and puts them into a list. The list can be useful for things like navigating team URLs.

In [None]:
def get_mlb_acronyms():
    """
        scrapes the team acronyms from the Baseball Reference home page

        All team acronyms are in the first `div` tag with class = "formfield".
        They are then in `option` tags (except the first one).

        returns:
        list of strings; there should be 30 acronyms

        raises:
        requests.HTTPError if the home page responds with an error status
        ValueError if no `div` with class "formfield" is found in the page
    """
    url = 'https://www.baseball-reference.com'

    # fail loudly on a blocked or failed request instead of parsing an error page
    response = requests.get(url, timeout=30)
    response.raise_for_status()

    soup = BeautifulSoup(response.text, 'lxml')

    div_formfield = soup.find('div', class_='formfield')
    if div_formfield is None:
        raise ValueError(f'No div with class "formfield" found at {url}')

    # the first option tag is skipped; the rest are expected to hold the teams
    option_tags = div_formfield.find_all('option')[1:]

    # Read the acronym from the `value` attribute rather than slicing the
    # serialized tag text. The first 3 characters of the value are what the
    # original str(tag)[15:18] slice returned when `value` is the tag's first
    # attribute.
    team_acronyms = [tag['value'][:3] for tag in option_tags if tag.has_attr('value')]

    return team_acronyms

Gets team stats

In [None]:
def get_team_stats(start_year=2015, end_year=2022, 
                   bat_or_pitch='batting', measures=('standard',), full_seasons_only = True,
                   webdriver_path = '~/chromedriver'):
    """
        Reads and stores team batting or pitching stats from start_year to end_year.
        Can't merge pre-2015 data to 2015-present data because there are 2 new stats for post-2014 years
        Batting measures: standard, advanced, sabermetric
        Pitching measures: standard, advanced, batting (against), ratio

        parameters:
        start_year, end_year: first and last season to read (both inclusive)
        bat_or_pitch: 'batting' or 'pitching'; used to build the page url
        measures: one measure name or an iterable of measure names; the tables
            for all measures of a season are joined side by side
        full_seasons_only: when True, the 2020 and 2023 seasons are skipped
        webdriver_path: chromedriver path forwarded to read_website_tables()

        Returns pandas DataFrame with one block of rows per season and a 'year' column.

        raises:
        ValueError if measures is empty or if no season is left to read
    """
    # accept a single measure name as well as any iterable of names
    if isinstance(measures, str):
        measures = [measures]
    else:
        measures = list(measures)
    if not measures:
        raise ValueError('measures must contain at least one measure name')

    # seasons treated as non-full by this notebook
    partial_seasons = {2020, 2023}

    # NOTE(review): the original fetched the W / L / W-L% columns from the
    # standard pitching page for batting requests but never merged them into
    # the result. That unused fetch (one extra browser launch per season) was
    # dropped; join those columns onto year_stats here if they are wanted.

    yearly_stats = []
    for year in range(start_year, end_year + 1):
        # skip non-full-seasons only when asked to
        if full_seasons_only and year in partial_seasons:
            continue

        # get stats by measure
        measure_dfs = []
        for measure in measures:
            # read and store team stats table for given year. 2 tables are on this url with the first being team stats
            url = f'https://www.baseball-reference.com/leagues/majors/{year}-{measure}-{bat_or_pitch}.shtml'
            # keep the first 30 rows of the first table
            measure_df = read_website_tables(url, webdriver_path=webdriver_path)[0][:30]

            # the advanced table is read with two-level column labels; keep the second level
            if measure == 'advanced':
                measure_df.columns = [col[1] for col in measure_df.columns]

            measure_dfs.append(measure_df)

        # join tables to get stats for the year
        year_stats = pd.concat(measure_dfs, axis=1)

        # add year column
        year_stats['year'] = year

        yearly_stats.append(year_stats)

    if not yearly_stats:
        raise ValueError(
            f'No seasons to read between {start_year} and {end_year} '
            f'(full_seasons_only={full_seasons_only})'
        )

    # stack all seasons once instead of re-concatenating inside the loop
    return pd.concat(yearly_stats)

Reads and stores player IDs

In [None]:
def read_player_ids(url, webdriver_path = '~/chromedriver'):
    """
        reads player ID's from a url and returns these ID's in a list of strings

        The page is loaded in headless Chrome through Selenium and parsed with
        BeautifulSoup. The IDs are taken from the second <table> on the page.

        parameters:
        url: address of the page to load
        webdriver_path: path to the chromedriver executable; a leading '~'
            is expanded to the current user's home directory

        returns:
        list with the 'data-append-csv' value of every player cell except the
        last one (an entry is None for a cell without that attribute)

        raises:
        IndexError if the page has fewer than two <table> elements
    """
    # imported locally so the function does not depend on extra notebook-level imports
    import os

    # '~' is only expanded by a shell, so resolve it before handing the path to Selenium
    driver_path = os.path.expanduser(webdriver_path)

    # Set up the Selenium driver with options
    options = Options()
    options.add_argument('--headless')  # Run in headless mode
    driver = webdriver.Chrome(service=Service(driver_path), options=options)

    try:
        # Load the webpage
        driver.get(url)

        # No explicit wait is performed: the source is read as soon as get() returns.
        # Use driver.implicitly_wait() or WebDriverWait here if content loads late.
        source = driver.page_source
    finally:
        # Always shut the browser down, even when loading the page fails
        driver.quit()

    # Parse the page source with BeautifulSoup
    soup = BeautifulSoup(source, 'lxml')

    # Find all the table elements
    table_elements = soup.find_all('table')
    if len(table_elements) < 2:
        raise IndexError(
            f'Expected at least 2 tables at {url}, found {len(table_elements)}'
        )

    # Second table in url has player_stats
    player_table = table_elements[1]

    # player IDs are in 'data-append-csv' attribute of 'td' tags that have 'data-stat = player'
    player_cells = player_table.find_all('td', attrs={'data-stat': 'player'})

    # omit last ID, which is blank
    player_ids = [tag.get('data-append-csv') for tag in player_cells][:-1]

    return player_ids

<div style="border: 5px solid black;"></div>

#### Web Scrape

- If you want to try to grab tables with advanced stats, you can use BeautifulSoup, requests, and/or pandas to read the tables.

Basic pitching stats

In [None]:
def get_team_pitching_stats(start_year = 2010, end_year = 2023, full_seasons_only=True):
    """
        Reads the standard team pitching table for every season from
        start_year to end_year (both inclusive) and stacks them.

        parameters:
        start_year, end_year: first and last season to read
        full_seasons_only: when True, the 2020 and 2023 seasons are skipped

        Returns pandas DataFrame holding the first 30 rows of each season's
        table plus a 'year' column.

        raises:
        ValueError if no season is left to read
    """
    # seasons treated as non-full by this notebook
    partial_seasons = {2020, 2023}

    yearly_stats = []
    for year in range(start_year, end_year + 1):

        # skip non-full seasons
        if full_seasons_only and year in partial_seasons:
            continue

        # read url with pandas
        url = f'https://www.baseball-reference.com/leagues/majors/{year}-standard-pitching.shtml'
        # copy the slice so adding a column below does not write into a view
        temp_df = pd.read_html(url)[0].iloc[:30, :].copy()

        # create year column
        temp_df['year'] = year

        # Collect every season in a list. The original initialised the result
        # only when year == start_year, which broke when start_year was skipped.
        yearly_stats.append(temp_df)

    if not yearly_stats:
        raise ValueError(
            f'No seasons to read between {start_year} and {end_year} '
            f'(full_seasons_only={full_seasons_only})'
        )

    team_pitching_stats = pd.concat(yearly_stats)

    return team_pitching_stats


In [None]:
# scrape the standard team pitching stats with the defaults of the function defined above
# (the original cell called get_pitching_stats, which is not defined in this notebook)
pitching_df = get_team_pitching_stats()

In [None]:
# bare expression: the notebook shows pitching_df as this cell's output
pitching_df

Potential urls to scrape

In [None]:
# Candidate URLs kept as bare string literals for reference only; nothing in
# this cell assigns them to a name or requests them.

# Baseball Reference team page template. The f-string sits inside the outer
# ''' literal, so {team}, {year} and {bat_pit_field} are plain text here.
'''
    batting_url = f'https://www.baseball-reference.com/teams/{team}/{year}-{bat_pit_field}.shtml'
'''

# Baseball Savant custom leaderboard query (type=batter) with its year list and
# stat selections; the line breaks and indentation are part of the literal.
'''https://baseballsavant.mlb.com/leaderboard/custom?
            year=2022,2021,2020,2019,2018,2017,2016,2015,2014
            &type=batter&filter=&sort=0&sortDir=asc&min=q
            &selections=player_age,b_ab,b_total_pa,b_total_hits,b_single,b_double,b_triple,
            b_home_run,b_strikeout,b_walk,b_k_percent,b_bb_percent,batting_avg,slg_percent,
            on_base_percent,on_base_plus_slg,isolated_power,b_rbi,b_lob,b_total_bases,r_total_caught_stealing,
            r_total_stolen_base,b_ab_scoring,b_ball,b_called_strike,b_catcher_interf,b_foul,b_foul_tip,b_game,
            b_gnd_into_dp,b_gnd_into_tp,b_gnd_rule_double,b_hit_by_pitch,b_hit_ground,b_hit_fly,b_hit_into_play,
            b_hit_line_drive,b_hit_popup,b_out_fly,b_out_ground,b_out_line_drive,b_out_popup,b_intent_ball,
            b_intent_walk,b_interference,b_pinch_hit,b_pinch_run,b_pitchout,b_played_dh,b_sac_bunt,b_sac_fly,
            b_swinging_strike,r_caught_stealing_2b,r_caught_stealing_3b,r_caught_stealing_home,r_defensive_indiff,
            r_interference,r_pickoff_1b,r_pickoff_2b,r_pickoff_3b,r_run,r_stolen_base_2b,r_stolen_base_3b,
            r_stolen_base_home,b_total_ball,b_total_sacrifices,b_total_strike,b_total_swinging_strike,b_total_pitches,
            r_stolen_base_pct,r_total_pickoff,b_reached_on_error,b_walkoff,b_reached_on_int,xba,xslg,woba,xwoba,xobp,
            xiso,wobacon,xwobacon,bacon,xbacon,xbadiff,xslgdiff,wobadiff,exit_velocity_avg,launch_angle_avg,
            sweet_spot_percent,barrel,barrel_batted_rate,solidcontact_percent,flareburner_percent,poorlyunder_percent,
            poorlytopped_percent,poorlyweak_percent,hard_hit_percent,avg_best_speed,avg_hyper_speed,
            &chart=false&x=xba&y=xba&r=no&chartType=beeswarm'''

# prints an empty line; presumably here so the cell does not echo the string
# literal above as its output -- TODO confirm
print()