# Get WAR, Salary and Position for each player: 2000-2019

In [1]:
import re
import sqlite3
from io import StringIO

import pandas as pd
import requests
from bs4 import BeautifulSoup
from bs4 import Comment

#### Scrape Baseball Reference data to get every hitter's data for each season

In [2]:
# Download the league-wide "value batting" page for every season from 2000
# through 2019 and gather every table that can be parsed from it. The loop
# inspects the HTML comment nodes of each page and hands every comment that
# mentions 'table' to pandas for parsing.
frames = []
for year in range(2000, 2020):
    response = requests.get(
        'https://www.baseball-reference.com/leagues/MLB/{}-value-batting.shtml'.format(year),
        timeout=30,
    )
    # Fail loudly on an HTTP error instead of silently ending up with a
    # season missing from the combined data.
    response.raise_for_status()

    soup = BeautifulSoup(response.text, 'html.parser')

    comments = soup.find_all(string=lambda text: isinstance(text, Comment))

    for each in comments:
        if 'table' not in each:
            continue
        try:
            # Wrap the markup in StringIO: passing literal HTML straight to
            # read_html is deprecated in recent pandas releases.
            table = pd.read_html(StringIO(str(each)))[0]
        except ValueError:
            # read_html raises ValueError when no table can be parsed from
            # the comment; such comments are skipped.
            continue
        table['Year'] = year
        frames.append(table)

# Concatenate once at the end rather than re-copying the accumulated rows
# every time another table is found.
tables = pd.concat(frames) if frames else pd.DataFrame()
tables.head()

Unnamed: 0,Rk,Name,Age,Tm,G,PA,Rbat,Rbaser,Rdp,Rfield,...,WAR,waaWL%,162WL%,oWAR,dWAR,oRAR,Salary,Acquired,Pos Summary,Year
0,1,Jeff Abbott,27,CHW,80,242,-7,1,1,-6,...,-0.3,0.486,0.493,0.3,-0.6,2,"$255,000",Amateur Draft,87H9/D,2000
1,2,Kurt Abbott,31,NYM,79,173,-9,0,0,-2,...,-0.4,0.489,0.495,-0.2,0.0,-1,"$500,000",Free Agency,6H4/58,2000
2,3,Paul Abbott,32,SEA,2,6,0,0,0,0,...,0.1,0.554,0.501,0.1,0.0,1,"$285,000",Free Agency,1,2000
3,4,Bobby Abreu*,26,PHI,154,680,38,-2,1,15,...,6.2,0.528,0.527,4.9,0.8,52,"$2,933,333",Traded,*9/H,2000
4,5,Juan Acevedo,30,MIL,60,2,0,0,0,0,...,0.0,0.501,0.5,0.0,0.0,0,"$612,500",Traded,1,2000


#### Filter out the columns we do not need

In [3]:
# Keep the four columns used downstream, then attach the first column whose
# name contains "Pos" under the uniform name 'Position'. If no such column
# exists, no 'Position' column is added.
position_source = next((name for name in tables.columns if "Pos" in name), None)
needed_cols = tables.loc[:, ["Year", "Tm", "WAR", "Salary"]].copy()
if position_source is not None:
    needed_cols['Position'] = tables[position_source]
needed_cols.head()

Unnamed: 0,Year,Tm,WAR,Salary,Position
0,2000,CHW,-0.3,"$255,000",87H9/D
1,2000,NYM,-0.4,"$500,000",6H4/58
2,2000,SEA,0.1,"$285,000",1
3,2000,PHI,6.2,"$2,933,333",*9/H
4,2000,MIL,0.0,"$612,500",1


#### Convert Position numbers to abbreviations

In [4]:
# Lookup from the single-digit position codes found in the position summary
# to the conventional position abbreviations.
position_dict = {'1':'P','2':'C','3':'1B','4':'2B','5':'3B','6':'SS','7':'LF','8':'CF','9':'RF'}
clean_position = needed_cols.copy()
# Keep only the first digit that appears in each position summary
# (presumably the position played most often -- confirm against the site's
# legend). expand=False makes str.extract return a Series, not a one-column
# DataFrame; rows without any digit become NaN.
clean_position['Position'] = clean_position['Position'].str.extract(r'(\d)', expand=False)
# Assign the result back instead of calling replace(inplace=True) on the
# column selection: the chained in-place form does not update the parent
# DataFrame when pandas Copy-on-Write is active.
clean_position['Position'] = clean_position['Position'].replace(position_dict)
clean_position.head()

Unnamed: 0,Year,Tm,WAR,Salary,Position
0,2000,CHW,-0.3,"$255,000",CF
1,2000,NYM,-0.4,"$500,000",SS
2,2000,SEA,0.1,"$285,000",P
3,2000,PHI,6.2,"$2,933,333",RF
4,2000,MIL,0.0,"$612,500",P


#### Filter out rows with invalid Salary or WAR data as well as Pitcher data, and convert Salary to float

In [5]:
# Drop the pitcher code so that only non-pitcher abbreviations remain in the
# lookup. The default argument keeps this cell re-runnable: without it a
# second execution raises KeyError because '1' is already gone.
position_dict.pop('1', None)
# Discard rows whose Salary cell is the literal text 'Salary' (repeated
# header rows) and rows whose position is not one of the remaining
# abbreviations (pitchers, unmapped codes, missing positions).
clean_salary = clean_position[(clean_position['Salary'] != 'Salary') & (clean_position['Position'].isin(position_dict.values()))].copy()
# Strip the thousands separators and the currency sign, then convert to float.
# regex=False forces literal matching: as a regular expression '$' is the
# end-of-string anchor, and the default for `regex` differs between pandas
# versions.
clean_salary['Salary'] = (
    clean_salary['Salary']
    .str.replace(',', '', regex=False)
    .str.replace('$', '', regex=False)
    .astype('float')
)
# Keep rows with a positive salary (this also drops missing salaries, since
# NaN > 0 is False) and a non-missing WAR. The explicit copy makes the result
# an independent frame that later cells can modify safely.
clean_salary = clean_salary[(clean_salary['Salary'] > 0) & (clean_salary['WAR'].notna())].copy()
clean_salary.head()

Unnamed: 0,Year,Tm,WAR,Salary,Position
0,2000,CHW,-0.3,255000.0,CF
1,2000,NYM,-0.4,500000.0,SS
3,2000,PHI,6.2,2933333.0,RF
6,2000,NYM,2.0,220000.0,LF
8,2000,BOS,0.3,200000.0,RF


#### The WAR/Salary/Position data is now clean

# Get Team Success: 2000-2019

#### Get the standings/playoffs data from each season

In [6]:
# Read each team's season record and postseason flags from the local Lahman
# SQLite database, restricted to the same 2000-2019 window as the scraped
# player data so the later outer merge does not pick up team rows from other
# seasons.
con = sqlite3.connect("lahmansbaseballdb.sqlite")
try:
    teams = pd.read_sql_query(
        "SELECT yearID as Year, teamID as Tm, W, L, DivWin, WCWin, LgWin, WSWin "
        "FROM teams WHERE yearID BETWEEN ? AND ?",
        con,
        params=(2000, 2019),
    )
finally:
    # Close the connection even when the query raises.
    con.close()
teams.head()

Unnamed: 0,Year,Tm,W,L,DivWin,WCWin,LgWin,WSWin
0,2000,ANA,82,80,N,N,N,N
1,2000,BAL,74,88,N,N,N,N
2,2000,BOS,85,77,N,N,N,N
3,2000,CHA,95,67,Y,N,N,N
4,2000,CLE,90,72,N,N,N,N


#### Standardize Team abbreviations and remove data points where a player played for multiple teams

In [7]:
# Map alternate team codes to one common abbreviation. The same mapping is
# applied to both tables so that their 'Tm' values agree before merging.
team_dict = {'CHA':'CHW','CHN':'CHC','SLN':'STL','SFN':'SFG','SDN':'SDP','NYN':'NYM','NYA':'NYY','KCA':'KCR','FLO':'FLA','TBA':'TBR','WSN':'WAS','LAN':'LAD','TBD':'TBR'}
# Use assign() rather than replace(inplace=True) on a column selection: the
# chained in-place form does not update the parent DataFrame when pandas
# Copy-on-Write is active.
teams = teams.assign(Tm=teams['Tm'].replace(team_dict))
clean_salary = clean_salary.assign(Tm=clean_salary['Tm'].replace(team_dict))
# Outer merge on season and team: rows that have no match in the other table
# are kept, with the missing side filled with NaN.
WAR_and_Salary_by_Position = clean_salary.merge(teams, how='outer', on=["Year", "Tm"])
# Drop rows whose team code contains a digit (presumably the combined
# multi-team entries mentioned in the heading -- confirm against the scraped
# data).
WAR_and_Salary_by_Position = WAR_and_Salary_by_Position[~WAR_and_Salary_by_Position['Tm'].str.contains(r'\d')]
WAR_and_Salary_by_Position.head()

Unnamed: 0,Year,Tm,WAR,Salary,Position,W,L,DivWin,WCWin,LgWin,WSWin
0,2000,CHW,-0.3,255000.0,CF,95.0,67.0,Y,N,N,N
1,2000,CHW,-0.2,212500.0,CF,95.0,67.0,Y,N,N,N
2,2000,CHW,3.1,4900000.0,2B,95.0,67.0,Y,N,N,N
3,2000,CHW,0.5,250000.0,C,95.0,67.0,Y,N,N,N
4,2000,CHW,1.4,305000.0,1B,95.0,67.0,Y,N,N,N


#### Export data to csv

In [8]:
# Write the merged table to disk; index=False leaves the DataFrame index out
# of the file.
WAR_and_Salary_by_Position.to_csv(
    path_or_buf='WAR_and_Salary_by_Position.csv',
    index=False,
)