In [1]:
# Import needed dependencies
import requests
import pandas as pd
import numpy as np
import scipy.stats as stats
import matplotlib.pyplot as plt
import re
from datetime import date
from bs4 import BeautifulSoup, Comment

In [2]:
# Create empty lists to hold table data to be scraped
away_box_score = []
home_box_score = []
box_score = []
pitching_line = []

# input URL and use BeautifulSoup to parse through the page
url = f'https://www.baseball-reference.com/boxes/SLN/SLN200809060.shtml'
soup = BeautifulSoup(requests.get(url).content, 'html.parser')

# Grab the table element that has away team statistics
away_table_placeholder = soup.select_one('#all_FloridaMarlinsbatting') 

# Game box score is commented out in html, so this will grab it out of the comments
for comment in away_table_placeholder.find_all(text=lambda text: isinstance(text, Comment)):
    if comment.find("<table ") > 0:
        comment_soup = BeautifulSoup(comment, 'html.parser')
        away_table = comment_soup.find("table")

# Grab data from table and put it into the list created above
for tr in away_table.select('tr:has(td)'):
    tds = [td.get_text(strip=True) for td in tr.select('td')]
    away_box_score.append(tds)

# Grab the table element that has home team statistics
home_table_placeholder = soup.select_one('#all_StLouisCardinalsbatting') 

# Game box score is commented out in html, so this will grab it out of the comments
for comment in home_table_placeholder.find_all(text=lambda text: isinstance(text, Comment)):
    if comment.find("<table ") > 0:
        comment_soup = BeautifulSoup(comment, 'html.parser')
        home_table = comment_soup.find("table")

# Grab data from table and put it into the list created above
for tr in home_table.select('tr:has(td)'):
    tds2 = [td.get_text(strip=True) for td in tr.select('td')]
    home_box_score.append(tds2)

# Grab the table element that has game box score
box_score_table = soup.select('[class*="linescore_wrap"]')

# Grab data from table and put it into the list created above
for tr in box_score_table[0].select('tr:has(td)'):
    tds3 = [td.get_text(strip=True) for td in tr.select('td')]
    box_score.append(tds3)

pitching_line.append(box_score[2][0])

In [3]:
# box_score_table
# home_table
pitching_line

['WP:\xa0Todd\xa0Wellemeyer\xa0(12-6) •\xa0LP:\xa0Scott\xa0Olsen\xa0(6-10) •\xa0SV:\xa0Ryan\xa0Franklin\xa0(15)']

In [4]:
# Create dataframe for away team statistics
away_team_df = pd.DataFrame(away_box_score)

# Create an empty list to store away team statistics header information
away_header_list = []

# Grab the table header information to use as column headers in our away team statistics dataframe
for tr in away_table.select('tr:has(th)'):
    ths = [th.get_text(strip=True) for th in tr.select('th')]
    away_header_list.append(ths)

away_header_list[0].remove("Batting")
away_team_df.columns = away_header_list[0]

    
# Create dataframe for home team statistics
home_team_df = pd.DataFrame(home_box_score)

# Create an empty list to store home team statistics header information
home_header_list = []

# Grab the table header information to use as column headers in our home team statistics dataframe
for tr in home_table.select('tr:has(th)'):
    ths2 = [th.get_text(strip=True) for th in tr.select('th')]
    home_header_list.append(ths2)

home_header_list[0].remove("Batting")
home_team_df.columns = home_header_list[0]


# Create dataframe for game box score
box_score_df = pd.DataFrame(box_score)

# Create an empty list to store game box score dataframe header information
box_score_list = []

# Grab the table header information to use as column headers in our game box score dataframe
for tr in box_score_table[0].select('tr:has(th)'):
    ths3 = [th.get_text(strip=True) for th in tr.select('th')]
    box_score_list.append(ths3)
    
box_score_list[0][1] = 'Team'
box_score_df.columns = box_score_list
    

In [5]:
box_score_df

Unnamed: 0,Unnamed: 1,Team,1,2,3,4,5,6,7,8,9,R,H,E
0,via Sports Logos.netAbout logos,Florida Marlins,0.0,0.0,1.0,2.0,0.0,0.0,0.0,0.0,0,3.0,5.0,0.0
1,via Sports Logos.netAbout logos,St. Louis Cardinals,2.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,X,5.0,6.0,0.0
2,WP: Todd Wellemeyer (12-6) • LP: Scott Olsen (...,,,,,,,,,,,,,


In [6]:
# box_score_df.columns = box_score_list
final_box_score_df = box_score_df.iloc[: , 1:]
final_box_score_df.drop(final_box_score_df.tail(1).index,inplace=True) 

home_away = ['Away','Home']
final_box_score_df.rename(index={0:'Away'},inplace=True)
final_box_score_df.rename(index={1:'Home'},inplace=True)


# test_box_score_df = box_score_df[box_score_df.columns[1:]]
final_box_score_df

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().rename(


Unnamed: 0,Team,1,2,3,4,5,6,7,8,9,R,H,E
Away,Florida Marlins,0,0,1,2,0,0,0,0,0,3,5,0
Home,St. Louis Cardinals,2,0,0,1,0,1,0,1,X,5,6,0


In [7]:
# Pull the away team player data and combine the lists into one list
away_starters = away_header_list[1:-1]
# away_bench = away_header_list[8:-1]
# away_players = [away_starters,away_bench]

# Pull the home team player data and combine the lists into one list
home_starters = home_header_list[1:-1]
# home_bench = home_header_list[8:-1]
# home_players = [home_starters,home_bench]

In [8]:
away_starters

[['Hanley RamirezSS'],
 ['Luis GonzalezRF'],
 ['Brett CarrollRF'],
 ['Jorge Cantu3B'],
 ['Mike Jacobs1B'],
 ['Dan Uggla2B'],
 ['Cody RossCF'],
 ['Josh WillinghamLF'],
 ['John BakerC'],
 ['Scott OlsenP'],
 ['Paul Lo DucaPH'],
 ['Logan KensingP'],
 ['Andrew MillerP']]

In [9]:
# Create a dataframe from away team stats
away_stats_df = pd.DataFrame(away_box_score)

# Create a dataframe from home team stats
home_stats_df = pd.DataFrame(home_box_score)

In [10]:
# Consolodate the away team list of lists into one list
away_starters_list = [item for sublist in away_starters for item in sublist]
# away_bench_list = [item for sublist in away_bench for item in sublist]
# away_roster = away_starters_list + away_bench_list

# Consolodate the home team list of lists into one list
home_starters_list = [item for sublist in home_starters for item in sublist]
# home_bench_list = [item for sublist in home_bench for item in sublist]
# home_roster = home_starters_list + home_bench_list

In [11]:
away_header_list[0]

['AB',
 'R',
 'H',
 'RBI',
 'BB',
 'SO',
 'PA',
 'BA',
 'OBP',
 'SLG',
 'OPS',
 'Pit',
 'Str',
 'WPA',
 'aLI',
 'WPA+',
 'WPA-',
 'cWPA',
 'acLI',
 'RE24',
 'PO',
 'A',
 'Details']

In [12]:
# Add Team Totals to the end of the away team list
away_starters_list.append('Team Totals')
away_stats_df.index = away_starters_list

# Add Team Totals to the end of the home team list
home_starters_list.append('Team Totals')
home_stats_df.index = home_starters_list

In [13]:
# Set the home and away dataframe column labels
column_headers = away_header_list[0]
# column_headers.remove("Batting")
away_stats_df.columns = column_headers
home_stats_df.columns = column_headers

In [14]:
away_stats_df

Unnamed: 0,AB,R,H,RBI,BB,SO,PA,BA,OBP,SLG,...,WPA,aLI,WPA+,WPA-,cWPA,acLI,RE24,PO,A,Details
Hanley RamirezSS,4,0,1,0,0,0,4,0.296,0.394,0.529,...,-0.037,1.21,0.052,-0.089,-0.00%,0.17,-0.2,1.0,1.0,
Luis GonzalezRF,4,0,0,0,0,1,4,0.264,0.341,0.416,...,-0.133,1.37,0.0,-0.133,-0.01%,0.19,-0.9,4.0,0.0,
Brett CarrollRF,0,0,0,0,0,0,0,0.059,0.111,0.176,...,,,,,,,,1.0,0.0,
Jorge Cantu3B,4,1,1,0,0,2,4,0.273,0.321,0.46,...,-0.027,1.0,0.052,-0.079,-0.00%,0.14,-0.1,0.0,1.0,2B
Mike Jacobs1B,4,0,0,0,0,0,4,0.239,0.285,0.512,...,-0.141,1.31,0.0,-0.141,-0.01%,0.18,-1.1,6.0,0.0,
Dan Uggla2B,2,1,1,2,1,0,4,0.258,0.356,0.534,...,0.337,1.19,0.353,-0.016,0.03%,0.17,2.3,1.0,3.0,"HR,HBP"
Cody RossCF,4,0,0,0,0,3,4,0.261,0.318,0.493,...,-0.136,1.44,0.0,-0.136,-0.01%,0.2,-0.9,6.0,0.0,
Josh WillinghamLF,4,1,1,1,0,1,4,0.254,0.368,0.46,...,-0.019,1.43,0.101,-0.121,-0.00%,0.2,0.2,2.0,0.0,HR
John BakerC,3,0,1,0,0,1,3,0.287,0.377,0.456,...,0.025,1.17,0.072,-0.048,0.00%,0.17,-0.0,3.0,1.0,2B
Scott OlsenP,2,0,0,0,0,0,2,0.151,0.196,0.151,...,-0.033,0.65,0.0,-0.033,-0.00%,0.09,-0.3,0.0,0.0,


In [15]:
home_stats_df

Unnamed: 0,AB,R,H,RBI,BB,SO,PA,BA,OBP,SLG,...,WPA,aLI,WPA+,WPA-,cWPA,acLI,RE24,PO,A,Details
Cesar IzturisSS,4,0,0,0,0,0,4,0.249,0.311,0.296,...,-0.088,0.9,0.0,-0.088,-0.02%,0.37,-0.8,1.0,4.0,
Troy Glaus3B,2,1,0,0,2,0,4,0.27,0.373,0.474,...,-0.007,0.63,0.035,-0.041,-0.00%,0.26,0.1,2.0,3.0,
Albert Pujols1B,3,2,1,2,1,0,4,0.36,0.468,0.646,...,0.17,1.1,0.22,-0.051,0.04%,0.46,1.7,9.0,1.0,HR
Ryan LudwickCF-RF,4,2,2,0,0,1,4,0.299,0.375,0.59,...,0.04,0.97,0.072,-0.032,0.01%,0.41,0.5,2.0,0.0,2B
Yadier MolinaC,3,0,1,1,0,0,4,0.307,0.35,0.397,...,0.121,1.09,0.157,-0.035,0.03%,0.45,0.5,9.0,0.0,SH
Felipe LopezLF,3,0,1,2,0,0,4,0.259,0.326,0.354,...,0.102,1.07,0.15,-0.048,0.03%,0.45,0.5,1.0,0.0,SF
Nick StavinohaRF,2,0,0,0,1,0,3,0.194,0.231,0.222,...,-0.033,0.88,0.007,-0.039,-0.01%,0.36,-0.3,1.0,0.0,IW
Skip SchumakerCF,0,0,0,0,1,0,1,0.305,0.365,0.417,...,0.003,0.1,0.003,0.0,0.00%,0.04,0.1,0.0,0.0,
Todd WellemeyerP,3,0,1,0,0,0,3,0.157,0.17,0.157,...,-0.002,1.01,0.046,-0.048,-0.00%,0.42,-0.2,1.0,0.0,
Josh PhelpsPH,1,0,0,0,0,1,1,0.25,0.25,0.25,...,-0.007,0.23,0.0,-0.007,-0.00%,0.1,-0.2,,,


In [16]:
final_box_score_df

Unnamed: 0,Team,1,2,3,4,5,6,7,8,9,R,H,E
Away,Florida Marlins,0,0,1,2,0,0,0,0,0,3,5,0
Home,St. Louis Cardinals,2,0,0,1,0,1,0,1,X,5,6,0


In [17]:
text = soup.get_text()
attendance_string = re.findall(r"Attendance:\s+\d*,\d*", text)
game_time_string = re.findall(r"Game Duration:\s+\d:\d*", text)
new_attendance_string = attendance_string[0].strip('([^a-z]xa)')
new_game_time_string = game_time_string[0].strip('([^a-z]xa)')
attendance_list = list(new_attendance_string)
game_time_list = list(new_game_time_string)
attendance_list.pop(11)
game_time_list.pop(13)

':'

In [18]:
game_time_string

['Game Duration: 2:24']

In [19]:
final_attendance_string = "".join(attendance_list)
final_game_time_sting = "".join(game_time_list)

In [20]:
final_attendance_string

'Attendance:42,814'

In [21]:
final_game_time_sting

'Game Duration 2:24'

In [22]:
x = final_attendance_string.split(":")
y = final_game_time_sting.split(" ")

y[0:2] = [' '.join(y[0:2])]


In [23]:
attendance_df = pd.DataFrame(x, columns = [x[0]])
attendance_df = attendance_df.drop([0])

In [24]:
game_time_df = pd.DataFrame(y, columns = [y[0]])
game_time_df = game_time_df.drop([0])

In [25]:
game_logistics_df = pd.concat([attendance_df, game_time_df], ignore_index=True, sort=False)
game_logistics_df['Game Duration'] = game_logistics_df['Game Duration'].shift(-1)
game_logistics_df = game_logistics_df.dropna()

In [26]:
divparent = soup.find('div', attrs={'class':'scorebox_meta'})
text = divparent.text
result = re.split(r'\n', text)
almost_done = result[1]
final_result = almost_done.partition("Venue: ")[2]
real_final_result = final_result.split("Game Duration:")[0]

game_logistics_df = game_logistics_df.assign(Venue=[real_final_result])
game_logistics_df



Unnamed: 0,Attendance,Game Duration,Venue
0,42814,2:24,Busch Stadium III


In [None]:
############################################################################################################
###########################    Working on Scraping one game of pitching stats    ###########################
############################################################################################################


pitching_table_placeholder = soup.select('#all_2420024094')[0]

away_pitching_box_score = []
home_pitching_box_score = []
away_pitching_header_list = []
home_pitching_header_list = []

# Game pitching box score is commented out in html, so this will grab it out of the comments
for comment in pitching_table_placeholder.find_all(text=lambda text: isinstance(text, Comment)):
    if comment.find("<table ") > 0:
        comment_soup = BeautifulSoup(comment, 'html.parser')
        pitching_table = comment_soup.find_all("table")

# Grab away pitching data from table and put it into the list created above
for tr in pitching_table[0].select('tr:has(td)'):
    tdsAP = [td.get_text(strip=True) for td in tr.select('td')]
    away_pitching_box_score.append(tdsAP)

# Grab home pitching data from table and put it into the list created above
for tr in pitching_table[1].select('tr:has(td)'):
    tdsHP = [td.get_text(strip=True) for td in tr.select('td')]
    home_pitching_box_score.append(tdsHP)

dataframe_dictionary['P_' + Away_code_list[i] + Date_list[i]] = pd.DataFrame(away_pitching_box_score)
dataframe_dictionary['P_' + Home_code_list[i] + Date_list[i]] = pd.DataFrame(home_pitching_box_score)

# Grab the table header information to use as column headers in our away pitching statistics dataframe
for tr in pitching_table[0].select('tr:has(th)'):
    thsAP = [th.get_text(strip=True) for th in tr.select('th')]
    away_pitching_header_list.append(thsAP)

# Grab the table header information to use as column headers in our home pitching statistics dataframe
for tr in pitching_table[1].select('tr:has(th)'):
    thsHP = [th.get_text(strip=True) for th in tr.select('th')]
    home_pitching_header_list.append(thsHP)

away_pitching_header_list[0].remove("Pitching")   
home_pitching_header_list[0].remove("Pitching")   
away_pitchers = away_pitching_header_list[1:-1]
home_pitchers = home_pitching_header_list[1:-1]

away_pitchers_list = [item for sublist in away_pitchers for item in sublist]
away_pitchers_list.append(f'{Away_team_list[i]} Totals')
home_pitchers_list = [item for sublist in home_pitchers for item in sublist]
home_pitchers_list.append(f'{Home_team_list[i]} Totals')
dataframe_dictionary['P_' + Away_code_list[i] + Date_list[i]].index = away_pitchers_list
dataframe_dictionary['P_' + Home_code_list[i] + Date_list[i]].index = home_pitchers_list


away_pitcher_column_headers = away_pitching_header_list[0]
home_pitcher_column_headers = home_pitching_header_list[0]

dataframe_dictionary['P_' + Away_code_list[i] + Date_list[i]].columns = away_pitcher_column_headers
dataframe_dictionary['P_' + Home_code_list[i] + Date_list[i]].columns = home_pitcher_column_headers
    

# dataframe_dictionary['P_FLO200707270'] 
# dataframe_dictionary['P_SFN200707270'] 