# Merging Code from Fangraphs and Baseball Savant

First, import needed packages.

In [117]:
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
import requests
import re
from IPython.core.display import display, HTML    # make sure Jupyter knows to display it as HTML
import time, os
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import unicodedata

Now, import the functions I need:

In [118]:
from baseball_savant_code import *
from fangraphs_wrc_code import *

Add in the URLs:

In [119]:
# Baseball Savant custom batter leaderboard query. The literal is split across
# lines purely for readability; adjacent string literals are concatenated
# into the same single URL string.
savant_url = (
    'https://baseballsavant.mlb.com/leaderboard/custom'
    '?year=2020,2019,2018,2017,2016,2015'
    '&type=batter&filter=&sort=16&sortDir=desc&min=200'
    '&selections=player_age,b_total_pa,b_game,exit_velocity_avg,launch_angle_avg,'
    'barrel_batted_rate,solidcontact_percent,hard_hit_percent,'
    'z_swing_percent,oz_swing_percent,'
    'pull_percent,straightaway_percent,opposite_percent,'
    'groundballs_percent,flyballs_percent,linedrives_percent,sprint_speed,'
    '&chart=true&x=player_age&y=player_age&r=no&chartType=beeswarm'
)
# FanGraphs season stat grid query for wRC+ (the "+" is URL-encoded as %2B).
fg_url = (
    'https://www.fangraphs.com/leaders/season-stat-grid'
    '?position=B&seasonStart=2015&seasonEnd=2020'
    '&stat=wRC%2B&pastMinPt=200&curMinPt=0&mode=normal'
)

Create dataframes:

In [121]:
savant_df = Savant_DataFrame_Builder(savant_url)

In [123]:
fg_df = wRC_DataFrame_Shifter(fg_url)

In [124]:
savant_df.head()

Unnamed: 0_level_0,Player,Year,Age,PA,G,Avg EV (MPH),Avg LA (°),Barrel%,Solid Contact %,Hard Hit %,Zone Swing %,Out of Zone Swing %,Pull %,Straight Away %,Oppo %,GB%,FB%,LD %,Sprint Speed,Player Link
Rk.,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
1,"Saltalamacchia, Jarrod",2016.0,31.0,292.0,92.0,89.2,21.4,11.6,6.8,35.4,65.7,23.9,44.2,34.0,21.8,31.3,43.5,19.7,25.6,[/savant-player/457454]
2,"Schimpf, Ryan",2016.0,28.0,330.0,89.0,90.3,29.7,16.7,8.0,41.4,64.7,23.4,41.4,33.9,24.7,20.1,42.0,21.8,28.2,[/savant-player/572114]
3,"Gallo, Joey",2017.0,24.0,532.0,145.0,93.3,23.0,22.1,8.3,52.2,72.7,28.2,50.2,29.2,20.6,29.2,37.9,22.1,27.8,[/savant-player/608336]
4,"Carpenter, Matt",2018.0,33.0,677.0,156.0,90.7,21.6,13.7,11.5,44.7,57.1,18.5,47.6,30.7,21.7,28.3,37.8,28.5,26.5,[/savant-player/572761]
5,"Bruce, Jay",2019.0,32.0,333.0,98.0,90.2,21.4,13.4,8.2,43.7,79.6,37.0,42.9,29.9,27.3,29.4,37.7,19.9,26.7,[/savant-player/457803]


In [125]:
fg_df.head()

Unnamed: 0,Name-Year,wRC+
0,Michael Lorenzen-2015,0
1,Michael Lorenzen-2016,0
2,Michael Lorenzen-2017,0
3,Michael Lorenzen-2018,0
4,Michael Lorenzen-2019,0


Excellent, both are as expected.

## Merge the DataFrames
*Note: this code is originally from Project_2_Savant notebook, moving here to clean up the workflow.*

In [126]:
fg_df.rename(columns={'Name-Year':'Player_Year_ID'}, inplace=True)

In [127]:
fg_df.head(2)

Unnamed: 0,Player_Year_ID,wRC+
0,Michael Lorenzen-2015,0
1,Michael Lorenzen-2016,0


Function to strip accent marks from names so they can all be mapped to the savant data:

In [128]:
def strip_accents(s):
    '''
    Return ``s`` with accent marks removed (e.g. "é" becomes "e").

    The string is decomposed to Unicode NFD form, which separates base
    characters from their combining marks, and every character in the
    "Mn" (nonspacing mark) category is then dropped.
    '''
    decomposed = unicodedata.normalize('NFD', s)
    kept_chars = []
    for char in decomposed:
        if unicodedata.category(char) == 'Mn':
            continue
        kept_chars.append(char)
    return ''.join(kept_chars)

In [129]:
player_list = fg_df.Player_Year_ID.to_list()

In [130]:
updated_names = [strip_accents(name).replace('?','e') for name in player_list]

In [131]:
updated_names

['Michael Lorenzen-2015',
 'Michael Lorenzen-2016',
 'Michael Lorenzen-2017',
 'Michael Lorenzen-2018',
 'Michael Lorenzen-2019',
 'Michael Lorenzen-2020',
 'Luis Campusano-2015',
 'Luis Campusano-2016',
 'Luis Campusano-2017',
 'Luis Campusano-2018',
 'Luis Campusano-2019',
 'Luis Campusano-2020',
 'Billy McKinney-2015',
 'Billy McKinney-2016',
 'Billy McKinney-2017',
 'Billy McKinney-2018',
 'Billy McKinney-2019',
 'Billy McKinney-2020',
 'Rafael Marchan-2015',
 'Rafael Marchan-2016',
 'Rafael Marchan-2017',
 'Rafael Marchan-2018',
 'Rafael Marchan-2019',
 'Rafael Marchan-2020',
 'Travis Blankenhorn-2015',
 'Travis Blankenhorn-2016',
 'Travis Blankenhorn-2017',
 'Travis Blankenhorn-2018',
 'Travis Blankenhorn-2019',
 'Travis Blankenhorn-2020',
 'Yolmer Sanchez-2015',
 'Yolmer Sanchez-2016',
 'Yolmer Sanchez-2017',
 'Yolmer Sanchez-2018',
 'Yolmer Sanchez-2019',
 'Yolmer Sanchez-2020',
 'Andrew Stevenson-2015',
 'Andrew Stevenson-2016',
 'Andrew Stevenson-2017',
 'Andrew Stevenson-201

In [132]:
fg_df.Player_Year_ID = updated_names

Back to Baseball Savant:

In [133]:
player_list = savant_df.Player.tolist()

In [134]:
# Savant lists players as "Last, First"; split on the comma so the name can be
# rebuilt as "First Last".
player_list_split = []
for raw_name in player_list:
    name_parts = raw_name.split(', ')
    name_parts[0] = name_parts[0].strip()
    player_list_split.append(name_parts)

final_player_list = []
for name_parts in player_list_split:
    final_player_list.append(name_parts[1] + ' ' + name_parts[0])

In [135]:
print(final_player_list)

['Jarrod Saltalamacchia', 'Ryan Schimpf', 'Joey Gallo', 'Matt Carpenter', 'Jay Bruce', 'Matt Carpenter', 'Joey Gallo', 'Eric Thames', 'Cavan Biggio', 'Rhys Hoskins', 'Joey Gallo', 'Greg Bird', 'Mike Trout', 'Mike Napoli', 'Kyle Seager', 'Mitch Garver', 'Eric Thames', 'Stephen Vogt', 'Jay Bruce', 'Jake Marisnick', 'Brandon Belt', 'Robinson Chirinos', 'Matt Carpenter', 'Nick Castellanos', 'Mike Trout', 'Adam Duvall', 'Chris Davis', 'Rhys Hoskins', 'Miguel Sano', 'Chris Carter', 'Adam Duvall', 'Robinson Chirinos', 'Justin Turner', 'Matt Carpenter', 'Luis Valbuena', 'Lucas Duda', 'Rhys Hoskins', 'Colby Rasmus', 'Eduardo Escobar', 'Brandon Belt', 'Chris Carter', 'Kyle Seager', 'Max Muncy', 'Austin Riley', 'Travis Shaw', 'Mike Trout', 'Miguel Sano', 'Aaron Judge', 'Freddie Freeman', 'Brandon Moss', 'Curtis Granderson', 'Colby Rasmus', 'Hunter Renfroe', 'Luis Valbuena', 'Jay Bruce', 'Joey Votto', 'Tom Murphy', 'Khris Davis', 'Christin Stewart', 'Gary Sanchez', 'Kyle Seager', 'ByungHo Park', '

In [136]:
year_list = savant_df.Year.tolist()

In [137]:
year_list = [int(year) for year in year_list]
print(year_list)

[2016, 2016, 2017, 2018, 2019, 2017, 2018, 2018, 2019, 2017, 2019, 2018, 2019, 2017, 2017, 2019, 2019, 2019, 2018, 2017, 2018, 2017, 2019, 2016, 2020, 2018, 2016, 2018, 2018, 2017, 2020, 2018, 2017, 2016, 2018, 2017, 2019, 2016, 2019, 2019, 2015, 2020, 2018, 2019, 2019, 2018, 2016, 2017, 2017, 2017, 2019, 2015, 2019, 2017, 2017, 2020, 2019, 2018, 2019, 2019, 2019, 2016, 2016, 2018, 2019, 2016, 2016, 2017, 2017, 2018, 2019, 2017, 2015, 2018, 2018, 2018, 2017, 2015, 2018, 2017, 2017, 2019, 2017, 2019, 2018, 2019, 2019, 2016, 2019, 2019, 2017, 2018, 2016, 2016, 2018, 2019, 2019, 2019, 2017, 2018, 2019, 2016, 2016, 2016, 2017, 2015, 2018, 2018, 2018, 2017, 2017, 2019, 2016, 2016, 2016, 2016, 2017, 2017, 2015, 2018, 2016, 2019, 2019, 2017, 2015, 2019, 2019, 2017, 2017, 2017, 2018, 2018, 2019, 2019, 2017, 2015, 2020, 2018, 2018, 2016, 2015, 2015, 2020, 2019, 2019, 2016, 2018, 2018, 2019, 2016, 2016, 2017, 2018, 2019, 2016, 2017, 2017, 2019, 2019, 2019, 2019, 2019, 2016, 2020, 2019, 2019, 201

In [138]:
# Build the "<First Last>-<Year>" key for every Savant row by pairing each
# name with the year at the same position.
player_year_id = [
    "-".join((name, str(year_list[idx])))
    for idx, name in enumerate(final_player_list)
]
print(player_year_id)

['Jarrod Saltalamacchia-2016', 'Ryan Schimpf-2016', 'Joey Gallo-2017', 'Matt Carpenter-2018', 'Jay Bruce-2019', 'Matt Carpenter-2017', 'Joey Gallo-2018', 'Eric Thames-2018', 'Cavan Biggio-2019', 'Rhys Hoskins-2017', 'Joey Gallo-2019', 'Greg Bird-2018', 'Mike Trout-2019', 'Mike Napoli-2017', 'Kyle Seager-2017', 'Mitch Garver-2019', 'Eric Thames-2019', 'Stephen Vogt-2019', 'Jay Bruce-2018', 'Jake Marisnick-2017', 'Brandon Belt-2018', 'Robinson Chirinos-2017', 'Matt Carpenter-2019', 'Nick Castellanos-2016', 'Mike Trout-2020', 'Adam Duvall-2018', 'Chris Davis-2016', 'Rhys Hoskins-2018', 'Miguel Sano-2018', 'Chris Carter-2017', 'Adam Duvall-2020', 'Robinson Chirinos-2018', 'Justin Turner-2017', 'Matt Carpenter-2016', 'Luis Valbuena-2018', 'Lucas Duda-2017', 'Rhys Hoskins-2019', 'Colby Rasmus-2016', 'Eduardo Escobar-2019', 'Brandon Belt-2019', 'Chris Carter-2015', 'Kyle Seager-2020', 'Max Muncy-2018', 'Austin Riley-2019', 'Travis Shaw-2019', 'Mike Trout-2018', 'Miguel Sano-2016', 'Aaron Judg

In [139]:
savant_df['Player_Year_ID'] = player_year_id

In [140]:
savant_df.sample(15)

Unnamed: 0_level_0,Player,Year,Age,PA,G,Avg EV (MPH),Avg LA (°),Barrel%,Solid Contact %,Hard Hit %,...,Out of Zone Swing %,Pull %,Straight Away %,Oppo %,GB%,FB%,LD %,Sprint Speed,Player Link,Player_Year_ID
Rk.,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1764,"Freese, David",2015.0,32.0,470.0,121.0,89.1,6.4,7.2,6.9,38.4,...,28.9,31.6,38.4,30.0,55.0,15.0,25.6,26.7,[/savant-player/501896],David Freese-2015
156,"Flores, Wilmer",2017.0,26.0,362.0,110.0,87.6,16.9,5.9,5.6,34.2,...,31.1,35.4,36.1,28.5,35.8,29.9,25.3,25.7,[/savant-player/527038],Wilmer Flores-2017
860,"Souza Jr., Steven",2018.0,29.0,272.0,72.0,88.7,16.2,7.8,6.0,39.8,...,22.2,37.3,37.3,25.3,38.0,22.9,28.9,28.1,[/savant-player/519306],Steven Souza Jr.-2018
158,"Kepler, Max",2019.0,26.0,596.0,134.0,89.7,18.2,8.9,7.5,42.1,...,27.6,50.8,30.1,19.1,35.9,29.8,21.7,27.7,[/savant-player/596146],Max Kepler-2019
796,"Pujols, Albert",2017.0,37.0,636.0,149.0,88.7,13.4,5.4,7.7,39.0,...,32.0,37.9,43.1,19.0,43.5,23.4,25.0,21.9,[/savant-player/405395],Albert Pujols-2017
178,"Frazier, Todd",2018.0,32.0,472.0,115.0,89.6,18.6,6.9,7.6,36.9,...,24.4,46.1,33.9,20.1,35.9,29.6,22.7,25.6,[/savant-player/453943],Todd Frazier-2018
1220,"Owings, Chris",2015.0,24.0,552.0,147.0,86.9,14.6,2.6,3.9,24.7,...,35.3,35.7,36.2,28.1,44.1,20.2,29.4,28.6,[/savant-player/572008],Chris Owings-2015
555,"Suzuki, Kurt",2016.0,33.0,373.0,106.0,87.3,14.5,2.6,5.0,30.5,...,28.8,39.7,35.8,24.5,40.7,25.2,23.8,25.4,[/savant-player/435559],Kurt Suzuki-2016
975,"Lucroy, Jonathan",2019.0,33.0,328.0,101.0,86.9,12.5,2.9,6.9,32.5,...,24.7,36.7,40.0,23.3,46.1,22.0,23.7,25.4,[/savant-player/518960],Jonathan Lucroy-2019
1865,"Morse, Michael",2015.0,33.0,256.0,98.0,90.3,0.4,9.8,4.6,39.2,...,30.5,35.3,34.0,30.7,59.5,12.4,28.1,24.6,[/savant-player/434604],Michael Morse-2015


Actual merging:

In [141]:
combined_df = savant_df.merge(fg_df, how='left', on='Player_Year_ID')

In [142]:
combined_df

Unnamed: 0,Player,Year,Age,PA,G,Avg EV (MPH),Avg LA (°),Barrel%,Solid Contact %,Hard Hit %,...,Pull %,Straight Away %,Oppo %,GB%,FB%,LD %,Sprint Speed,Player Link,Player_Year_ID,wRC+
0,"Saltalamacchia, Jarrod",2016.0,31.0,292.0,92.0,89.2,21.4,11.6,6.8,35.4,...,44.2,34.0,21.8,31.3,43.5,19.7,25.6,[/savant-player/457454],Jarrod Saltalamacchia-2016,70
1,"Schimpf, Ryan",2016.0,28.0,330.0,89.0,90.3,29.7,16.7,8.0,41.4,...,41.4,33.9,24.7,20.1,42.0,21.8,28.2,[/savant-player/572114],Ryan Schimpf-2016,128
2,"Gallo, Joey",2017.0,24.0,532.0,145.0,93.3,23.0,22.1,8.3,52.2,...,50.2,29.2,20.6,29.2,37.9,22.1,27.8,[/savant-player/608336],Joey Gallo-2017,119
3,"Carpenter, Matt",2018.0,33.0,677.0,156.0,90.7,21.6,13.7,11.5,44.7,...,47.6,30.7,21.7,28.3,37.8,28.5,26.5,[/savant-player/572761],Matt Carpenter-2018,140
4,"Bruce, Jay",2019.0,32.0,333.0,98.0,90.2,21.4,13.4,8.2,43.7,...,42.9,29.9,27.3,29.4,37.7,19.9,26.7,[/savant-player/457803],Jay Bruce-2019,98
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1888,"Jankowski, Travis",2016.0,25.0,383.0,131.0,86.2,0.8,1.3,1.3,26.9,...,20.2,43.3,36.6,62.2,10.1,26.5,29.5,[/savant-player/608671],Travis Jankowski-2016,81
1889,"LeMahieu, DJ",2016.0,28.0,635.0,146.0,91.7,5.9,4.9,5.6,47.5,...,20.8,42.0,37.2,52.9,10.1,33.7,27.1,[/savant-player/518934],DJ LeMahieu-2016,130
1890,"Jankowski, Travis",2018.0,27.0,387.0,117.0,85.6,3.3,0.7,1.1,19.3,...,30.1,39.9,30.1,60.1,10.1,25.4,29.0,[/savant-player/608671],Travis Jankowski-2018,90
1891,"Slater, Austin",2018.0,26.0,225.0,74.0,87.3,2.6,2.3,0.8,35.1,...,23.7,36.6,39.7,63.4,9.2,22.9,28.0,[/savant-player/596103],Austin Slater-2018,81


1893 rows as expected.

### Exploring combined DF:

In [143]:
combined_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1893 entries, 0 to 1892
Data columns (total 22 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   Player               1893 non-null   object 
 1   Year                 1893 non-null   float64
 2   Age                  1893 non-null   float64
 3   PA                   1893 non-null   float64
 4   G                    1893 non-null   float64
 5   Avg EV (MPH)         1893 non-null   float64
 6   Avg LA (°)           1893 non-null   float64
 7   Barrel%              1893 non-null   float64
 8   Solid Contact %      1893 non-null   float64
 9   Hard Hit %           1893 non-null   float64
 10  Zone Swing %         1893 non-null   float64
 11  Out of Zone Swing %  1893 non-null   float64
 12  Pull %               1893 non-null   float64
 13  Straight Away %      1893 non-null   float64
 14  Oppo %               1893 non-null   float64
 15  GB%                  1893 non-null   f

OK, still 6 null values for wRC+.  Who are they?

In [144]:
print(combined_df[combined_df['wRC+'].isnull()])

              Player    Year   Age     PA      G  Avg EV (MPH)  Avg LA (°)  \
94     Ramirez, Jose  2018.0  26.0  698.0  157.0          89.0        19.0   
165    Ramirez, Jose  2019.0  27.0  542.0  129.0          89.2        19.8   
470    Ramirez, Jose  2020.0  28.0  254.0   58.0          88.7        23.2   
651    Ramirez, Jose  2017.0  25.0  645.0  152.0          88.2        15.0   
807    Ramirez, Jose  2015.0  23.0  355.0   97.0          86.6        12.0   
1226   Ramirez, Jose  2016.0  24.0  618.0  152.0          88.4        13.2   

      Barrel%  Solid Contact %  Hard Hit %  ...  Pull %  Straight Away %  \
94        8.3              7.9        35.1  ...    47.8             33.9   
165       6.3              7.0        35.7  ...    48.1             31.2   
470      10.2              6.2        35.6  ...    47.5             33.9   
651       5.0              7.1        34.9  ...    42.8             34.9   
807       1.8              2.8        24.0  ...    40.3             35.0 

It's just Jose Ramirez, even though he does have wRC+ data on FanGraphs. Since there are only 6 rows, I will enter the values manually. Based on the output above, those are indices 94, 165, 470, 651, 807, 1226.

In [156]:
# Manually entered wRC+ values for Jose Ramirez, keyed by season, for the rows
# that came back empty from the merge.
jose_ramirez_wrc_plus = {
    2015: 72,
    2016: 119,
    2017: 146,
    2018: 146,
    2019: 104,
    2020: 163,
}

# Select the rows by player and missing value instead of hard-coded row/column
# positions: positional indices only hold for one particular scrape and would
# silently write to the wrong rows if the leaderboard order or columns change.
needs_fill = (combined_df['Player'] == 'Ramirez, Jose') & combined_df['wRC+'].isnull()

# Year is stored as a float (e.g. 2018.0), so cast to int before the lookup.
combined_df.loc[needs_fill, 'wRC+'] = (
    combined_df.loc[needs_fill, 'Year'].astype(int).map(jose_ramirez_wrc_plus)
)

In [157]:
combined_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1893 entries, 0 to 1892
Data columns (total 22 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   Player               1893 non-null   object 
 1   Year                 1893 non-null   float64
 2   Age                  1893 non-null   float64
 3   PA                   1893 non-null   float64
 4   G                    1893 non-null   float64
 5   Avg EV (MPH)         1893 non-null   float64
 6   Avg LA (°)           1893 non-null   float64
 7   Barrel%              1893 non-null   float64
 8   Solid Contact %      1893 non-null   float64
 9   Hard Hit %           1893 non-null   float64
 10  Zone Swing %         1893 non-null   float64
 11  Out of Zone Swing %  1893 non-null   float64
 12  Pull %               1893 non-null   float64
 13  Straight Away %      1893 non-null   float64
 14  Oppo %               1893 non-null   float64
 15  GB%                  1893 non-null   f

In [158]:
print(combined_df[combined_df['wRC+'].isnull()])

Empty DataFrame
Columns: [Player, Year, Age, PA, G, Avg EV (MPH), Avg LA (°), Barrel%, Solid Contact %, Hard Hit %, Zone Swing %, Out of Zone Swing %, Pull %, Straight Away %, Oppo %, GB%, FB%, LD %, Sprint Speed, Player Link, Player_Year_ID, wRC+]
Index: []

[0 rows x 22 columns]


We're all fixed!  Just need to change the wRC+ to a numerical value:

In [160]:
# Convert the whole column in one vectorised call rather than applying
# pd.to_numeric to each value individually; values that cannot be parsed
# become NaN because of errors='coerce'.
combined_df['wRC+'] = pd.to_numeric(combined_df['wRC+'], errors='coerce')

In [161]:
combined_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1893 entries, 0 to 1892
Data columns (total 22 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   Player               1893 non-null   object 
 1   Year                 1893 non-null   float64
 2   Age                  1893 non-null   float64
 3   PA                   1893 non-null   float64
 4   G                    1893 non-null   float64
 5   Avg EV (MPH)         1893 non-null   float64
 6   Avg LA (°)           1893 non-null   float64
 7   Barrel%              1893 non-null   float64
 8   Solid Contact %      1893 non-null   float64
 9   Hard Hit %           1893 non-null   float64
 10  Zone Swing %         1893 non-null   float64
 11  Out of Zone Swing %  1893 non-null   float64
 12  Pull %               1893 non-null   float64
 13  Straight Away %      1893 non-null   float64
 14  Oppo %               1893 non-null   float64
 15  GB%                  1893 non-null   f

In [163]:
combined_df.describe()

Unnamed: 0,Year,Age,PA,G,Avg EV (MPH),Avg LA (°),Barrel%,Solid Contact %,Hard Hit %,Zone Swing %,Out of Zone Swing %,Pull %,Straight Away %,Oppo %,GB%,FB%,LD %,Sprint Speed,wRC+
count,1893.0,1893.0,1893.0,1893.0,1893.0,1893.0,1893.0,1893.0,1893.0,1893.0,1893.0,1893.0,1893.0,1893.0,1893.0,1893.0,1893.0,1893.0,1893.0
mean,2017.206022,28.82673,435.196513,114.937137,88.413312,12.229688,6.797834,5.7701,35.749234,66.659641,28.202324,37.0299,37.78748,25.182831,44.503803,22.432224,25.9028,27.015742,101.157422
std,1.557844,3.816009,155.619031,32.113398,2.215767,4.386683,4.020786,2.023787,7.913946,5.884461,5.974287,5.844259,3.847019,4.225243,6.920655,5.208745,3.629189,1.454924,26.002898
min,2015.0,20.0,200.0,46.0,80.2,-1.4,0.0,0.0,7.3,47.4,10.2,16.7,25.9,13.4,20.1,8.5,14.9,21.9,0.0
25%,2016.0,26.0,284.0,89.0,87.0,9.3,3.7,4.4,31.0,62.8,24.0,33.4,35.3,22.3,39.6,18.7,23.5,26.1,84.0
50%,2017.0,28.0,432.0,119.0,88.5,12.3,6.3,5.8,36.2,66.7,28.1,37.0,37.8,25.1,44.4,22.2,25.7,27.1,101.0
75%,2019.0,32.0,570.0,144.0,89.9,15.3,9.3,7.2,41.0,70.6,32.1,40.9,40.4,27.9,49.3,26.0,28.2,28.1,118.0
max,2020.0,44.0,747.0,162.0,96.1,29.7,25.7,18.0,62.2,83.7,51.3,56.8,53.8,43.4,66.2,43.5,40.4,30.8,197.0


None of these values seem out of the ordinary or absurd.

The actual last thing I'd like to do is add in player position via Savant - I know this will take a while, so here goes:

In [164]:
# Select the two columns directly instead of building a frame from a list of
# Series and transposing it. The .copy() gives an independent frame, so later
# in-place edits do not touch (or warn about) savant_df.
player_no_duplicate_df = savant_df[['Player', 'Player Link']].copy()
player_no_duplicate_df

Unnamed: 0_level_0,Player,Player Link
Rk.,Unnamed: 1_level_1,Unnamed: 2_level_1
1,"Saltalamacchia, Jarrod",[/savant-player/457454]
2,"Schimpf, Ryan",[/savant-player/572114]
3,"Gallo, Joey",[/savant-player/608336]
4,"Carpenter, Matt",[/savant-player/572761]
5,"Bruce, Jay",[/savant-player/457803]
...,...,...
1889,"Jankowski, Travis",[/savant-player/608671]
1890,"LeMahieu, DJ",[/savant-player/518934]
1891,"Jankowski, Travis",[/savant-player/608671]
1892,"Slater, Austin",[/savant-player/596103]


Next step, remove duplicates from the player column, since their links point to the same page (removing work for the computer later)

In [165]:
player_no_duplicate_df.drop_duplicates(subset='Player', inplace=True)

In [166]:
player_no_duplicate_df

Unnamed: 0_level_0,Player,Player Link
Rk.,Unnamed: 1_level_1,Unnamed: 2_level_1
1,"Saltalamacchia, Jarrod",[/savant-player/457454]
2,"Schimpf, Ryan",[/savant-player/572114]
3,"Gallo, Joey",[/savant-player/608336]
4,"Carpenter, Matt",[/savant-player/572761]
5,"Bruce, Jay",[/savant-player/457803]
...,...,...
1885,"Venable, Will",[/savant-player/461416]
1886,"Fuld, Sam",[/savant-player/453539]
1887,"Schumaker, Skip",[/savant-player/435401]
1889,"Jankowski, Travis",[/savant-player/608671]


Mission accomplished, down to only 646 rows! Now, make a list of all the links to feed into a soup generator:

In [167]:
link_list = player_no_duplicate_df['Player Link'].to_list()

In [168]:
savant_page_list = [('https://baseballsavant.mlb.com' + str(link).strip("'[]")) for link in link_list]

In [169]:
print(savant_page_list)

['https://baseballsavant.mlb.com/savant-player/457454', 'https://baseballsavant.mlb.com/savant-player/572114', 'https://baseballsavant.mlb.com/savant-player/608336', 'https://baseballsavant.mlb.com/savant-player/572761', 'https://baseballsavant.mlb.com/savant-player/457803', 'https://baseballsavant.mlb.com/savant-player/519346', 'https://baseballsavant.mlb.com/savant-player/624415', 'https://baseballsavant.mlb.com/savant-player/656555', 'https://baseballsavant.mlb.com/savant-player/595885', 'https://baseballsavant.mlb.com/savant-player/545361', 'https://baseballsavant.mlb.com/savant-player/435063', 'https://baseballsavant.mlb.com/savant-player/572122', 'https://baseballsavant.mlb.com/savant-player/641598', 'https://baseballsavant.mlb.com/savant-player/519390', 'https://baseballsavant.mlb.com/savant-player/545350', 'https://baseballsavant.mlb.com/savant-player/474832', 'https://baseballsavant.mlb.com/savant-player/455139', 'https://baseballsavant.mlb.com/savant-player/592206', 'https://

In [171]:
#Running the len just to audit/sanity check:
len(savant_page_list)

646

Defining a Function to generate the soup objects:

In [181]:
def Soup_Generator(link_list, timeout=30):
    '''
    Download each URL and parse the page into a Beautiful Soup object.

    Parameters
    ----------
    link_list : iterable of str
        Page URLs to fetch.
    timeout : float, optional
        Seconds to wait on each request before giving up (default 30).
        Without a timeout a single stalled connection would hang the
        whole scrape indefinitely.

    Returns
    -------
    list
        One BeautifulSoup object per URL, in the same order as ``link_list``.
        An empty input yields an empty list.

    Raises
    ------
    requests.RequestException
        If a request times out, cannot connect, or returns an HTTP error status.
    '''
    urls = list(link_list)
    if not urls:
        return []

    soup_list = []
    # One Session is shared by all requests so connections can be reused
    # instead of opening a fresh one for every URL.
    with requests.Session() as session:
        for url in urls:
            response = session.get(url, timeout=timeout)
            # Fail loudly on 4xx/5xx instead of parsing an error page as if it
            # were real content.
            response.raise_for_status()
            # Parse right away so raw responses are not all held in memory.
            soup_list.append(BeautifulSoup(response.text, 'lxml'))
    return soup_list

In [None]:
link_soup = Soup_Generator(savant_page_list)

In [None]:
def Position_Puller(soup_list):
    '''
    Takes in a list of Beautiful Soup objects from Baseball Savant
    and returns each player's position.

    For every page, the first ``div`` whose style is exactly
    ``font-size: .8rem;`` is located, surrounding whitespace is removed
    from its text, and the first two characters (stripped again, so
    one-letter values such as "C" come back without padding) are kept.
    NOTE(review): this assumes the div's text begins with the position
    abbreviation - confirm against the live page markup.

    Parameters
    ----------
    soup_list : iterable
        Parsed pages; each item must provide a BeautifulSoup-style ``find``.

    Returns
    -------
    list
        One entry per page, in input order. The entry is ``None`` when the
        div is not found, so the result stays aligned with the input
        instead of the whole run failing on one bad page.
    '''
    final_position = []
    for player in soup_list:
        position = player.find('div', style='font-size: .8rem;')
        if position is None:
            # Keep a placeholder so positions still line up with players.
            final_position.append(None)
            continue
        # strip() with no argument removes any mix of newlines, spaces and tabs.
        text = str(position.text).strip()
        final_position.append(text[0:2].strip())
    return final_position

In [None]:
final_list = Position_Puller(link_soup)

In [None]:
print(final_list)