# Initial Testing of webscraping from Baseball Reference:

First, importing various packages:

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
from pylab import rcParams
%matplotlib inline
rcParams['figure.figsize'] = 20,10
import numpy as np
import glob
from scipy import stats
from bs4 import BeautifulSoup
import requests
import re
from IPython.core.display import display, HTML    # make sure Jupyter knows to display it as HTML

Link for Jose Abreu's baseball reference page:

In [2]:
abreu_bbref_url = 'https://www.baseball-reference.com/players/a/abreujo02.shtml'
response = requests.get(abreu_bbref_url)

In [3]:
response.status_code

200

In [4]:
response.text[:1000]

'\n<!DOCTYPE html>\n<html data-version="klecko-" data-root="/home/br/build" itemscope itemtype="https://schema.org/WebSite" lang="en" class="no-js" >\n<head>\n<!-- Quantcast Choice. Consent Manager Tag v2.0 (for TCF 2.0) -->\n<script type="text/javascript" async=true>\n    (function() {\n\tvar host = window.location.hostname;\n\tvar element = document.createElement(\'script\');\n\tvar firstScript = document.getElementsByTagName(\'script\')[0];\n\tvar url = \'https://quantcast.mgr.consensu.org\'\n\t    .concat(\'/choice/\', \'XwNYEpNeFfhfr\', \'/\', host, \'/choice.js\')\n\tvar uspTries = 0;\n\tvar uspTriesLimit = 3;\n\telement.async = true;\n\telement.type = \'text/javascript\';\n\telement.src = url;\n\t\n\tfirstScript.parentNode.insertBefore(element, firstScript);\n\t\n\tfunction makeStub() {\n\t    var TCF_LOCATOR_NAME = \'__tcfapiLocator\';\n\t    var queue = [];\n\t    var win = window;\n\t    var cmpFrame;\n\t    \n\t    function addFrame() {\n\t\tvar doc = win.document;\n\t\tvar 

In [5]:
page = response.text

Using Beautiful Soup:

In [6]:
abreu_soup = BeautifulSoup(page,'lxml')

In [8]:
def Player_list_creator(player_soup):
    '''
    Takes in a BeautifulSoup Object.
    Returns a list of the player's stats, with each list object containing a string.
    '''
    player_list = []
    for element in player_soup.find(class_='table_container').find_all(class_='full'):
        player_list.append(str(element.text))
    return(player_list)

In [9]:
abreu_list = Player_list_creator(abreu_soup)

In [10]:
abreu_soup.title.text.split(" ")[0:2]

['José', 'Abreu']

## Obsolete Function
from collections import defaultdict
def Pull_Player_Stats(player_soup) :
    '''
    Takes in a BeautifulSoup object for a player's baseball reference page.
    Returns a dictionary with the year-by-year stats mentioned below.
    '''
    my_dict = defaultdict(list)
    name = player_soup.title.text.split(" ")[0] + player_soup.title.text.split(" ")[1]
    player_list = Player_list_creator(player_soup)
    for i, year in enumerate(player_list):
        season = player_list[i][0:4]
        #Name:
        my_dict[name+season].append(name)
        #Year
        my_dict[name+season].append(player_list[i][0:4])
        #Age
        my_dict[name+season].append(player_list[i][4:6])
        #Team
        my_dict[name+season].append(player_list[i][6:9])
        #League
        my_dict[name+season].append(player_list[i][9:11])
        #Gmaes
        my_dict[name+season].append(player_list[i][11:14])
        #PA
        my_dict[name+season].append(player_list[i][14:17])
        #AB
        my_dict[name+season].append(player_list[i][17:20])
        #H
        my_dict[name+season].append(player_list[i][22:25])
        #HR
        my_dict[name+season].append(player_list[i][28:30])
        #OPS
        my_dict[name+season].append(player_list[i][52:56])
        #OPS+
        my_dict[name+season].append(player_list[i][56:59])
    return(my_dict)

Turning Dictionary to Dataframe:

In [11]:
#player_df = pd.DataFrame.from_dict(Pull_Player_Stats(abreu_soup), orient = 'index',columns = ['Name', 'Year','Age','Team','League','Games Played','PA','AB','H','HR','OPS','OPS+'])

Adding Babe Ruth Stats to dataframe player_df:

In [13]:
babe_ruth_url = 'https://www.baseball-reference.com/players/r/ruthba01.shtml'
ruth_response = requests.get(babe_ruth_url)

In [14]:
ruth_page = ruth_response.text

In [15]:
ruth_soup = BeautifulSoup(ruth_page,'lxml')

In [16]:
Pull_Player_Stats(ruth_soup)

NameError: name 'Pull_Player_Stats' is not defined

Building a function to build the dataframe:

def Player_Dataframe_Builder(player_soup_list):
    '''
    Takes in a list of BeautifulSoup Objects, returns a data frame of their Baseball-Reference stats.
    '''
    players_df = pd.DataFrame(columns = ['Name', 'Year','Age','Team','League','Games Played','PA','AB','H','HR','OPS','OPS+'])
    for player in player_soup_list:
        player_df = pd.DataFrame.from_dict(Pull_Player_Stats(player), orient = 'index',columns = ['Name', 'Year','Age','Team','League','Games Played','PA','AB','H','HR','OPS','OPS+'])
        players_df = pd.concat([players_df, player_df], axis = 0)
    return players_df

In [17]:
player_soup_list = [abreu_soup, ruth_soup]

In [18]:
Player_Dataframe_Builder(player_soup_list)

NameError: name 'Player_Dataframe_Builder' is not defined

Baseball Reference URL Generator Function:

In [20]:
def Player_URL_Generator(player_list):
    '''
    Takes a list of player names, with each name being an entry of first name and last name.
    Returns a URL on baseball reference to each player's original page.
    '''
    separated_names = [player.split(" ") for player in player_list]
    url_list = []
    for i, player in enumerate(separated_names):
        if len(separated_names[i][1]) <= 5:
            url = 'https://www.baseball-reference.com/players/{}/{}{}01.shtml'.format(separated_names[i][1][0].lower(), separated_names[i][1].lower(), separated_names[i][0][0:2].lower())
        else:
            url = 'https://www.baseball-reference.com/players/{}/{}{}01.shtml'.format(separated_names[i][1][0].lower(), separated_names[i][1][0:5].lower(), separated_names[i][0][0:2].lower())
        url_list.append(url)
    return url_list 

In [21]:
my_list = ['Paul Konerko','Barry Bonds','Babe Ruth', 'Frank Thomas', 'Jim Thome', 'Ted Williams', 'Mike Trout', 'Nellie Fox', 'Sammy Sosa', 'Willie Mays']
Player_URL_Generator(my_list)

['https://www.baseball-reference.com/players/k/konerpa01.shtml',
 'https://www.baseball-reference.com/players/b/bondsba01.shtml',
 'https://www.baseball-reference.com/players/r/ruthba01.shtml',
 'https://www.baseball-reference.com/players/t/thomafr01.shtml',
 'https://www.baseball-reference.com/players/t/thomeji01.shtml',
 'https://www.baseball-reference.com/players/w/willite01.shtml',
 'https://www.baseball-reference.com/players/t/troutmi01.shtml',
 'https://www.baseball-reference.com/players/f/foxne01.shtml',
 'https://www.baseball-reference.com/players/s/sosasa01.shtml',
 'https://www.baseball-reference.com/players/m/mayswi01.shtml']

Soup Generator from Names:

In [22]:
def Player_Soup_Generator(player_list):
    '''
    Takes in a list of players and returns Beautiful Soup objects.
    '''
    url_list = Player_URL_Generator(player_list)
    response_list = [requests.get(url) for url in url_list]
    soup_list = []
    for response in response_list:
        page = response.text
        soup_list.append(BeautifulSoup(page,'lxml'))
    return soup_list

In [24]:
Player_Soup_Generator(my_list)

[<!DOCTYPE html>
 <html class="no-js" data-root="/home/br/build" data-version="klecko-" itemscope="" itemtype="https://schema.org/WebSite" lang="en">
 <head>
 <!-- Quantcast Choice. Consent Manager Tag v2.0 (for TCF 2.0) -->
 <script async="true" type="text/javascript">
     (function() {
 	var host = window.location.hostname;
 	var element = document.createElement('script');
 	var firstScript = document.getElementsByTagName('script')[0];
 	var url = 'https://quantcast.mgr.consensu.org'
 	    .concat('/choice/', 'XwNYEpNeFfhfr', '/', host, '/choice.js')
 	var uspTries = 0;
 	var uspTriesLimit = 3;
 	element.async = true;
 	element.type = 'text/javascript';
 	element.src = url;
 	
 	firstScript.parentNode.insertBefore(element, firstScript);
 	
 	function makeStub() {
 	    var TCF_LOCATOR_NAME = '__tcfapiLocator';
 	    var queue = [];
 	    var win = window;
 	    var cmpFrame;
 	    
 	    function addFrame() {
 		var doc = win.document;
 		var otherCMP = !!(win.frames[TCF_LOCATOR_NAM

Building a dataframe from the URL's above:

In [24]:
my_soup_list = Player_Soup_Generator(my_list)
multiple_player_df = Player_Dataframe_Builder(my_soup_list)
multiple_player_df.head()

NameError: name 'Pull_Player_Stats' is not defined

In [None]:
multiple_player_df['Name'].value_counts()

In [None]:
multiple_player_df.HR.max()

Next steps: 
- better scraping of table data, as right now it's based off of string slicing.  For example many values in above df don't make sense.
- Pull in other data (advanced stats, injuries, statcast, etc.)
- Other challenge: player's URL's varying by similarly named players (above pulls Frosty Thomas, not Frank Thomas)

## Attempting to pull in a better way:

In [28]:
babe_ruth_url = 'https://www.baseball-reference.com/players/r/ruthba01.shtml'
ruth_response = requests.get(babe_ruth_url)

In [29]:
ruth_page = ruth_response.text
ruth_soup = BeautifulSoup(ruth_page,'lxml')

In [30]:
header = ruth_soup.find_all('th', attrs={'class': 'poptip'})

In [31]:
print(header)

[<th aria-label="Year" class="poptip sort_default_asc show_partial_when_sorting left" data-stat="year_ID" data-tip="A Star indicates an all-star that season.&lt;br&gt;A Ring indicates the player appeared in WS for winning team." scope="col">Year</th>, <th aria-label="Player’s age at midnight of June 30th of that year" class="poptip sort_default_asc show_partial_when_sorting center" data-stat="age" data-tip="Player’s age at midnight of June 30th of that year" scope="col">Age</th>, <th aria-label="Tm" class="poptip sort_default_asc show_partial_when_sorting center" data-stat="team_ID" scope="col">Tm</th>, <th aria-label=" League AL - American League (1901-present) NL - National League (1876-present) AA - American Association (1882-1891) UA - Union Association (1884) PL - Players League (1890) FL - Federal League (1914-1915) NA - National Association (1871-1875)" class="poptip sort_default_asc center" data-stat="lg_ID" data-tip="&lt;strong&gt;League&lt;/strong&gt;&lt;br&gt;&lt;strong&gt;A

In [32]:
columns = [col.get_text() for col in header]

In [33]:
print(columns)

['Year', 'Age', 'Tm', 'Lg', 'G', 'PA', 'AB', 'R', 'H', '2B', '3B', 'HR', 'RBI', 'SB', 'CS', 'BB', 'SO', 'BA', 'OBP', 'SLG', 'OPS', 'OPS+', 'TB', 'GDP', 'HBP', 'SH', 'SF', 'IBB', 'Pos', 'Awards']


In [34]:
new_df = pd.DataFrame(columns=columns)
new_df.head()

Unnamed: 0,Year,Age,Tm,Lg,G,PA,AB,R,H,2B,...,OPS,OPS+,TB,GDP,HBP,SH,SF,IBB,Pos,Awards


In [35]:
#Find years, set as first column of new data_Frame:
seasons = ruth_soup.find_all('tr', attrs={'id':re.compile('batting_standard')})
seasons_played = []
for season in seasons:
    season_item = [str(item.get_text()) for item in season.find_all('th')]
    seasons_played.append(season_item)
print(seasons_played[0][0])

1914


Adding in stats per season:

In [36]:
stat_line = ruth_soup.find_all('tr', attrs={'id':re.compile('batting_standard.')})
career_stats = []
for season in stat_line:
    season_stat_line = [item.get_text() for item in season.find_all('td')]
    career_stats.append(season_stat_line)

In [37]:
len(seasons_played), len(career_stats)

(22, 22)

Combining Year with season stats for that year:

In [38]:
for i,year in enumerate(career_stats):
    career_stats[i].insert(0,seasons_played[i][0])

In [39]:
print(career_stats)

[['1914', '19', 'BOS', 'AL', '5', '10', '10', '1', '2', '1', '0', '0', '0', '0', '0', '0', '4', '.200', '.200', '.300', '.500', '50', '3', '', '0', '0', '', '', '/1H', ''], ['1915', '20', 'BOS', 'AL', '42', '103', '92', '16', '29', '10', '1', '4', '20', '0', '0', '9', '23', '.315', '.376', '.576', '.952', '189', '53', '', '0', '2', '', '', '1H', ''], ['1916', '21', 'BOS', 'AL', '67', '152', '136', '18', '37', '5', '3', '3', '16', '0', '', '10', '23', '.272', '.322', '.419', '.741', '122', '57', '', '0', '4', '', '', '1H', ''], ['1917', '22', 'BOS', 'AL', '52', '142', '123', '14', '40', '6', '3', '2', '14', '0', '', '12', '18', '.325', '.385', '.472', '.857', '162', '58', '', '0', '7', '', '', '1H', ''], ['1918', '23', 'BOS', 'AL', '95', '382', '317', '50', '95', '26', '11', '11', '61', '6', '', '58', '58', '.300', '.411', '.555', '.966', '192', '176', '', '2', '3', '', '', '7138/H', ''], ['1919', '24', 'BOS', 'AL', '130', '543', '432', '103', '139', '34', '12', '29', '113', '7', '', '1

In [40]:
ruth_df =  pd.DataFrame(career_stats, columns=columns)   

In [41]:
ruth_df

Unnamed: 0,Year,Age,Tm,Lg,G,PA,AB,R,H,2B,...,OPS,OPS+,TB,GDP,HBP,SH,SF,IBB,Pos,Awards
0,1914,19,BOS,AL,5,10,10,1,2,1,...,0.5,50,3,,0,0,,,/1H,
1,1915,20,BOS,AL,42,103,92,16,29,10,...,0.952,189,53,,0,2,,,1H,
2,1916,21,BOS,AL,67,152,136,18,37,5,...,0.741,122,57,,0,4,,,1H,
3,1917,22,BOS,AL,52,142,123,14,40,6,...,0.857,162,58,,0,7,,,1H,
4,1918,23,BOS,AL,95,382,317,50,95,26,...,0.966,192,176,,2,3,,,7138/H,
5,1919,24,BOS,AL,130,543,432,103,139,34,...,1.114,217,284,,6,3,,,*71/38H,
6,1920,25,NYY,AL,142,617,458,158,172,36,...,1.379,255,388,,3,5,,,*978/31H,
7,1921,26,NYY,AL,152,693,540,177,204,44,...,1.359,239,457,,4,4,,,*78/31H,
8,1922,27,NYY,AL,110,496,406,94,128,24,...,1.106,182,273,,1,4,,,*79/3,
9,1923,28,NYY,AL,152,699,522,151,205,45,...,1.309,239,399,,4,3,,,*97/83,MVP-1


Building Functions from Above:

In [40]:
def Pull_Player_Seasons(player_soup):
    '''
    Takes in a player's soup object.
    Returns a list of the season the player played in.
    '''
    #seasons = player_soup.find_all('tr', attrs={'id':re.compile('batting_standard')})
    seasons = player_soup.find_all('tr', attrs={'data-stat'})
    seasons_played = []
    for season in seasons:
        season_item = [str(item.get_text()) for item in season.find_all('th')]
        seasons_played.append(season_item)
    return seasons_played

In [48]:
Pull_Player_Seasons(ruth_soup)

[]

In [46]:
def Pull_Player_Stats(player_soup):
    '''
    Takes in a player's soup object.
    Returns a list of stat lines by season.
    '''
    stat_line = player_soup.find_all('tr', attrs={'id':re.compile('batting_standard.')})
    career_stats = []
    for season in stat_line:
        season_stat_line = [item.get_text() for item in season.find_all('td')]
        career_stats.append(season_stat_line)
    for i,year in enumerate(career_stats):
        career_stats[i].insert(0,Pull_Player_Seasons(player_soup)[i][0])
    return career_stats

In [42]:
def Player_Dataframe_Builder(player_soup_list):
    '''
    Takes in a list of BeautifulSoup Objects, returns a data frame of their Baseball-Reference stats.
    '''
    i = 0
    for player in player_soup_list:
        header = player.find_all('th', attrs={'class': 'poptip'})
        columns = [col.get_text() for col in header]
        if i == 0:
            compiled_player_df = pd.DataFrame(columns=columns)
            i += 1
        else:
            current_player_df = pd.DataFrame(Pull_Player_Stats(player), columns=columns)
            compiled_player_df = pd.concat([compiled_player_df, current_player_df])
    return compiled_player_df

In [43]:
my_list = ['Gary Sheffield', 'Magglio Ordonez', 'Yoan Moncada', 'Eloy Jimenez','Paul Konerko','Barry Bonds','Babe Ruth', 'Jim Thome', 'Ted Williams', 'Mike Trout', 'Nellie Fox', 'Sammy Sosa', 'Willie Mays']

In [44]:
my_soup_list = Player_Soup_Generator(my_list)
multiple_player_df = Player_Dataframe_Builder(my_soup_list)
multiple_player_df.head(30)

IndexError: list index out of range