# Notebook housing the code used to scrape player data from Baseball Reference

### See Project 2 Savant Code notebook for further detail, had issues merging.

Importing the needed packages:

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
from pylab import rcParams
%matplotlib inline
rcParams['figure.figsize'] = 20,10
import numpy as np
import glob
from scipy import stats
from bs4 import BeautifulSoup
import requests
import re
from IPython.core.display import display, HTML    # make sure Jupyter knows to display it as HTML

Making a function to generate the player page URLs:

In [2]:
def Player_URL_Generator(player_list):
    '''
    Build the default Baseball Reference page URL for each player name.

    Each name is split on whitespace; periods and apostrophes are removed.
    The first token is treated as the first name and every remaining token is
    joined (no separator) to form the surname, so "Tommy La Stella" and
    "Jorge De La Rosa" are handled the same way as two-part names.

    The URL slug is: first five letters of the surname + first two letters of
    the first name + "01", all lower-case, under a directory named after the
    surname's first letter.

    Parameters
    ----------
    player_list : iterable of str
        Player names of the form "First Last" (the surname may span several words).

    Returns
    -------
    list of str
        One URL per input name, in input order.

    Raises
    ------
    ValueError
        If a name does not contain at least a first name and a surname.
    '''
    url_template = 'https://www.baseball-reference.com/players/{}/{}{}01.shtml'
    url_list = []
    for player in player_list:
        # split() with no argument ignores leading/trailing/repeated whitespace,
        # which split(" ") would turn into empty name tokens.
        name_parts = [part.replace(".", "").replace("'", "") for part in player.split()]
        # Drop tokens that were punctuation only.
        name_parts = [part for part in name_parts if part]
        if len(name_parts) < 2:
            raise ValueError(
                "Expected a first name and a last name, got: {!r}".format(player)
            )
        first_name = name_parts[0].lower()
        # Multi-word surnames are concatenated before truncating to five letters.
        surname = "".join(name_parts[1:]).lower()
        url_list.append(url_template.format(surname[0], surname[:5], first_name[:2]))
    return url_list

Testing URL Generator w/ edge cases (apostrophes, periods in name, spaces in name, etc.)

In [3]:
# Names with apostrophes, periods, and multi-word surnames.
edge_case_names = [
    "Tyler O'Neill",
    "Chase d'Arnaud",
    "C.J. Cron",
    'Tommy La Stella',
    'Alejandro De Aza',
]
Player_URL_Generator(edge_case_names)

['https://www.baseball-reference.com/players/o/oneilty01.shtml',
 'https://www.baseball-reference.com/players/d/darnach01.shtml',
 'https://www.baseball-reference.com/players/c/croncj01.shtml',
 'https://www.baseball-reference.com/players/l/lasteto01.shtml',
 'https://www.baseball-reference.com/players/d/deazaal01.shtml']

From the Player URL Function, generating a BS4 object:

In [5]:
def Player_Soup_Generator(player_list):
    '''
    Fetch each player's Baseball Reference page and parse it.

    URLs come from Player_URL_Generator; each page is requested with
    requests.get and its text is parsed with the 'lxml' parser.

    Parameters
    ----------
    player_list : iterable of str
        Player names accepted by Player_URL_Generator.

    Returns
    -------
    list of BeautifulSoup
        One parsed page per input name, in input order.
    '''
    soup_list = []
    for url in Player_URL_Generator(player_list):
        response = requests.get(url)
        # TODO: compare the name in the page <title> against the requested
        # name and update the URL when they differ (not implemented yet).
        # Each page is parsed exactly once and the same object is returned.
        soup_list.append(BeautifulSoup(response.text, 'lxml'))
    return soup_list

Pull the years the player played from baseball reference:

In [6]:
def Pull_Player_Seasons(player_soup):
    '''
    Collect the header-cell text of every season row on a player page.

    A season row is any <tr> whose id matches the pattern 'batting_standard.'
    (the trailing dot is a regex wildcard, so it matches any one character).

    Returns a list with one entry per matching row; each entry is the list of
    that row's <th> texts as strings.
    '''
    season_row_id = re.compile('batting_standard.')
    season_rows = player_soup.find_all('tr', attrs={'id': season_row_id})
    return [
        [str(header_cell.get_text()) for header_cell in row.find_all('th')]
        for row in season_rows
    ]

Pull the year by year stats from BBREF:

In [7]:
def Pull_Player_Stats(player_soup):
    '''
    Build one stat line per season row on a player page.

    A season row is any <tr> whose id matches the pattern 'batting_standard.'.
    Each stat line is the text of the row's first <th> cell (the season label)
    followed by the text of each of its <td> cells, in document order.

    The season label is read from the same row as the stats, so the page is
    scanned for season rows only once instead of once per row.

    Parameters
    ----------
    player_soup : BeautifulSoup
        Parsed player page.

    Returns
    -------
    list of list of str
        One stat line per season row.

    Raises
    ------
    IndexError
        If a matching row has no <th> cell.
    '''
    season_rows = player_soup.find_all('tr', attrs={'id': re.compile('batting_standard.')})
    career_stats = []
    for season in season_rows:
        # First header cell of the row holds the season label.
        season_label = str(season.find_all('th')[0].get_text())
        season_stat_line = [item.get_text() for item in season.find_all('td')]
        career_stats.append([season_label] + season_stat_line)
    return career_stats

From the years and stats, build a pandas dataframe:

In [8]:
def Player_Dataframe_Builder(player_soup_list):
    '''
    Combine the season stat lines of several players into one DataFrame.

    For each parsed page, the column names are the texts of the <th> cells with
    class 'poptip', the rows come from Pull_Player_Stats, and a 'Name' column is
    filled with the page title text that precedes "Stats". All players' frames
    are concatenated once at the end, and the known numeric stat columns are
    converted with pd.to_numeric (unparseable values become NaN).

    Parameters
    ----------
    player_soup_list : iterable of BeautifulSoup
        Parsed player pages.

    Returns
    -------
    pandas.DataFrame
        One row per player-season. For empty input, an empty frame that has
        only a 'Name' column.

    Raises
    ------
    KeyError
        If any of the expected numeric stat columns is missing from the pages.
    '''
    player_frames = []
    for player in player_soup_list:
        columns = [col.get_text() for col in player.find_all('th', attrs={'class': 'poptip'})]
        current_player_df = pd.DataFrame(Pull_Player_Stats(player), columns=columns)
        # Adding Player Name as a column:
        current_player_df['Name'] = player.title.text.split("Stats")[0]
        player_frames.append(current_player_df)

    # Nothing to concatenate or convert for empty input.
    if not player_frames:
        return pd.DataFrame(columns=['Name'])

    # Single concat instead of growing the frame inside the loop.
    compiled_player_df = pd.concat(player_frames, ignore_index=True)
    num_cols = ['Age', 'G', 'PA', 'AB', 'R','H','2B','3B','HR','RBI','SB','CS','BB','SO','BA','OBP','SLG','OPS','OPS+','TB','GDP','HBP','SH','SF','IBB']
    # Convert column by column so each stat gets its own numeric dtype
    # (whole-number columns without missing values stay integer).
    compiled_player_df[num_cols] = compiled_player_df[num_cols].apply(pd.to_numeric, errors='coerce')
    return compiled_player_df

Test case, using the final_player_list from the Baseball Savant data:

In [9]:
# NOTE: final_player_list is not defined anywhere in this notebook, so this cell
# raises NameError on a fresh kernel. It must be created first (the surrounding
# text says it comes from the Baseball Savant data).
my_soup_list = Player_Soup_Generator(final_player_list)

NameError: name 'final_player_list' is not defined

Testing for players whose URL and names don't match using the normal method:

In [None]:
# Report every page whose title name differs from the requested player name.
for i, player in enumerate(my_soup_list):
    name = player.title.text.split("Stats")[0]
    requested_name = final_player_list[i]
    if requested_name.strip() == name.strip():
        continue
    print('WRONG!')
    print(name, requested_name)

In [None]:
# Build the combined stats frame from the parsed pages.
multiple_player_df = Player_Dataframe_Builder(player_soup_list=my_soup_list)

In [None]:
# Number of rows per value of the 'Name' column.
multiple_player_df['Name'].value_counts()

In [None]:
# Preview the first five rows.
multiple_player_df.head(5)

In [None]:
# Small hand-picked player list for checking the URL pattern.
my_list = [
    'Eloy Jimenez',
    'Jose Abreu',
    'Mike Trout',
    'Adam Eaton',
    'Frank Thomas',
]

In [None]:
# Fetch and parse each page for my_list inline, keeping the intermediate
# URL and response lists available for inspection.
url_list = Player_URL_Generator(my_list)
response_list = list(map(requests.get, url_list))
soup_list = [BeautifulSoup(response.text, 'lxml') for response in response_list]

In [None]:
# Print the name found in each page title next to the requested name and say
# whether they agree.
for i, player in enumerate(soup_list):
    name = player.title.text.split("Stats")[0]
    print(name)
    print(my_list[i])
    # Compare with surrounding whitespace removed (as the final_player_list
    # check does): the text before "Stats" may end with a space, which would
    # otherwise make every comparison fail.
    if name.strip() != my_list[i].strip():
        print('WRONG!')
    else:
        print('MATCH!')

## Baseball Reference: pulling data for the Savant player list proved too unreliable, because many players' URLs differ from the pattern the URL function generates (too many edge/special cases).  
**Decision: use wRC+ data from FanGraphs.  FanGraphs has easier access to wRC+, the target stat I want to use with the Savant Data features.**