# Get WAR, Salary and Position for each player: 2000-2019

In [50]:
import requests
from bs4 import BeautifulSoup
from bs4 import Comment
import pandas as pd
import sqlite3
import re

#### Scrape Baseball Reference data to get every hitter's and pitcher's value data for each season

In [51]:
def bbref_scrape(stat_type, year, tables):
    """Fetch one season's Baseball Reference value page and append its tables.

    The page is parsed for HTML comment nodes; every comment containing the
    text 'table' is handed to ``pd.read_html`` and the first table found in
    it is tagged with the season and stat type before being appended.

    Args:
        stat_type: Page variant inserted into the URL; also stored in the
            ``WAR_Type`` column of every scraped row.
        year: Season inserted into the URL; also stored in the ``Year``
            column of every scraped row.
        tables: DataFrame accumulated so far (may be empty).

    Returns:
        A DataFrame made of ``tables`` followed by the newly scraped rows,
        or ``tables`` itself when no table could be parsed from the page.

    Raises:
        requests.RequestException: If the request fails, times out or the
            server answers with an HTTP error status.
    """
    # Local import so the block does not depend on the file's import cell.
    from io import StringIO

    url = 'https://www.baseball-reference.com/leagues/MLB/{}-value-{}.shtml'.format(str(year), stat_type)
    response = requests.get(url, timeout=30)
    # Fail loudly instead of silently losing a whole season when the site
    # answers with an error page (e.g. rate limiting).
    response.raise_for_status()

    soup = BeautifulSoup(response.text, 'html.parser')
    comments = soup.find_all(string=lambda text: isinstance(text, Comment))

    new_frames = []
    for comment in comments:
        if 'table' not in comment:
            continue
        try:
            # read_html expects a file-like object; passing the markup as a
            # literal string is deprecated in recent pandas releases.
            table = pd.read_html(StringIO(str(comment)))[0]
        except ValueError:
            # The comment mentions 'table' but holds no parseable <table>.
            continue
        table['Year'] = year
        table['WAR_Type'] = stat_type
        new_frames.append(table)

    if not new_frames:
        return tables
    # Leave an empty accumulator out of the concat so it cannot influence
    # the resulting columns or dtypes.
    frames = new_frames if tables.empty else [tables, *new_frames]
    return pd.concat(frames)

In [52]:
# Accumulate the batting and pitching value tables for seasons 2000-2019.
hitting_tables, pitching_tables = pd.DataFrame(), pd.DataFrame()
for year in range(2000, 2020):
    # One request per page type and season: batting first, then pitching.
    hitting_tables = bbref_scrape(stat_type='batting', year=year, tables=hitting_tables)
    pitching_tables = bbref_scrape(stat_type='pitching', year=year, tables=pitching_tables)
# Bare expression: shows the batting frame as the notebook cell output.
hitting_tables

Unnamed: 0,Rk,Name,Age,Tm,G,PA,Rbat,Rbaser,Rdp,Rfield,...,waaWL%,162WL%,oWAR,dWAR,oRAR,Salary,Acquired,Pos Summary,Year,WAR_Type
0,1,Jeff Abbott,27,CHW,80,242,-7,1,1,-6,...,.486,.493,0.3,-0.6,2,"$255,000",Amateur Draft,87H9/D,2000,batting
1,2,Kurt Abbott,31,NYM,79,173,-9,0,0,-2,...,.489,.495,-0.2,0.0,-1,"$500,000",Free Agency,6H4/58,2000,batting
2,3,Paul Abbott,32,SEA,2,6,0,0,0,0,...,.554,.501,0.1,0.0,1,"$285,000",Free Agency,1,2000,batting
3,4,Bobby Abreu*,26,PHI,154,680,38,-2,1,15,...,.528,.527,4.9,0.8,52,"$2,933,333",Traded,*9/H,2000,batting
4,5,Juan Acevedo,30,MIL,60,2,0,0,0,0,...,.501,.500,0.0,0.0,0,"$612,500",Traded,1,2000,batting
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1461,1406,Kyle Zimmer,27,KCR,1,0,0,0,0,0,...,.500,.500,0.0,0.0,0,"$555,000",Free Agency,1,2019,batting
1462,1407,Ryan Zimmerman,34,WSN,52,190,-4,1,0,3,...,.495,.498,0.0,0.0,0,"$18,000,000",Amateur Draft,3/HD,2019,batting
1463,1408,Jordan Zimmermann,33,DET,1,2,-1,0,0,0,...,.473,.500,0.0,0.0,0,"$25,000,000",Free Agency,1,2019,batting
1464,1409,Ben Zobrist#,38,CHC,47,176,-5,0,0,-2,...,.483,.495,-0.1,-0.2,0,"$12,500,000",Free Agency,49/7HD16,2019,batting


In [53]:
# Bare expression: shows the accumulated pitching frame as the cell output.
pitching_tables

Unnamed: 0,Rk,Name,Age,Tm,IP,G,GS,R,RA9,RA9opp,...,gmLI,WAAadj,WAR,RAR,waaWL%,162WL%,Salary,Acquired,Year,WAR_Type
0,1,Paul Abbott,32,SEA,179.0,35,27,89,4.47,5.40,...,1.25,-0.1,2.6,32,.524,.505,"$285,000",Free Agency,2000,pitching
1,2,Juan Acevedo,30,MIL,82.2,62,0,38,4.14,5.11,...,1.04,-0.2,0.9,11,.505,.502,"$612,500",Traded,2000,pitching
2,3,Terry Adams,27,LAD,84.1,66,0,42,4.48,5.16,...,1.38,-0.2,0.6,8,.500,.500,"$1,400,000",Traded,2000,pitching
3,4,Rick Aguilera,38,CHC,47.2,54,0,28,5.29,5.07,...,2.14,-0.3,-0.2,1,.494,.498,"$3,500,000",Traded,2000,pitching
4,5,Scott Aldred*,32,PHI,20.1,23,0,14,6.20,5.17,...,.80,0.0,-0.1,-1,.487,.498,"$600,000",Traded,2000,pitching
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
859,827,Daniel Zamora*,26,NYM,8.2,17,0,5,5.19,5.09,...,1.40,0.0,0.0,0,.498,.500,,Traded,2019,pitching
860,828,T.J. Zeuch,23,TOR,22.2,5,3,13,5.16,5.49,...,.74,0.0,0.3,4,.524,.501,,Amateur Draft,2019,pitching
861,829,Kyle Zimmer,27,KCR,18.1,15,0,22,10.80,4.91,...,.42,0.3,-0.7,-10,.425,.493,"$555,000",Free Agency,2019,pitching
862,830,Jordan Zimmermann,33,DET,112.0,23,23,89,7.15,5.07,...,,-0.1,-0.2,-1,.446,.492,"$25,000,000",Free Agency,2019,pitching


#### Filter out the columns we do not need

In [75]:
def needed_cols(tables):
    """Return the columns used downstream plus a ``Position`` column.

    Args:
        tables: DataFrame containing at least the columns ``Year``,
            ``Name``, ``Tm``, ``WAR``, ``Salary`` and ``WAR_Type``.

    Returns:
        A new DataFrame with those six columns and ``Position``.
        ``Position`` is copied from the first column of ``tables`` whose
        label contains "Pos"; when there is no such column every row gets
        the code ``'1'``. Trailing ``#`` and ``*`` markers are stripped
        from ``Name``; missing names stay missing.

    Raises:
        KeyError: If one of the six required columns is absent.
    """
    subset = tables[["Year", "Name", "Tm", "WAR", "Salary", "WAR_Type"]].copy()

    # First column whose label mentions "Pos"; non-string labels are skipped
    # so the substring test cannot raise.
    pos_col = next(
        (col for col in tables.columns if isinstance(col, str) and "Pos" in col),
        None,
    )
    subset['Position'] = '1' if pos_col is None else tables[pos_col]

    # Vectorized strip: unlike a per-element lambda it tolerates missing
    # names instead of raising AttributeError.
    subset['Name'] = subset['Name'].str.rstrip('#*')
    return subset

In [76]:
# Reduce both scraped frames to the needed columns and stack them (hitting
# rows first, then pitching rows).
# NOTE: this assignment rebinds the name `needed_cols` from the function to
# the resulting DataFrame, so the function is no longer callable afterwards;
# re-running this cell requires re-running the function definition first.
needed_cols = pd.concat([needed_cols(hitting_tables), needed_cols(pitching_tables)])
needed_cols

Unnamed: 0,Year,Name,Tm,WAR,Salary,WAR_Type,Position
0,2000,Jeff Abbott,CHW,-0.3,"$255,000",batting,87H9/D
1,2000,Kurt Abbott,NYM,-0.4,"$500,000",batting,6H4/58
2,2000,Paul Abbott,SEA,0.1,"$285,000",batting,1
3,2000,Bobby Abreu,PHI,6.2,"$2,933,333",batting,*9/H
4,2000,Juan Acevedo,MIL,0.0,"$612,500",batting,1
...,...,...,...,...,...,...,...
859,2019,Daniel Zamora,NYM,0.0,,pitching,1
860,2019,T.J. Zeuch,TOR,0.3,,pitching,1
861,2019,Kyle Zimmer,KCR,-0.7,"$555,000",pitching,1
862,2019,Jordan Zimmermann,DET,-0.2,"$25,000,000",pitching,1


#### Convert Position numbers to abbreviations

In [77]:
# Single-digit position codes -> abbreviations. '0' is not a scraped code: it
# is the stand-in written below wherever the text 'DH' appears.
position_dict = {'1':'P','2':'C','3':'1B','4':'2B','5':'3B','6':'SS','7':'LF','8':'CF','9':'RF','0':'DH'}
clean_position = needed_cols.copy()
# Strip everything that is not a digit or a letter. regex=True is spelled out
# because pandas 2.0 changed the default of str.replace to a literal match,
# which would turn this line into a no-op.
clean_position['Position'] = clean_position['Position'].str.replace(r'[^\dA-Za-z]', '', regex=True)
# Literal replacement of the remaining 'DH' text with its stand-in digit.
clean_position['Position'] = clean_position['Position'].str.replace('DH', '0', regex=False)
# Keep only the first digit of the summary; rows without any digit become NaN.
clean_position['Position'] = clean_position['Position'].str.extract(r'(\d)', expand=False)
# Assign the result back instead of calling replace(..., inplace=True) on the
# selected column, which does not update the frame under Copy-on-Write.
clean_position['Position'] = clean_position['Position'].replace(position_dict)
clean_position.head()

Unnamed: 0,Year,Name,Tm,WAR,Salary,WAR_Type,Position
0,2000,Jeff Abbott,CHW,-0.3,"$255,000",batting,CF
1,2000,Kurt Abbott,NYM,-0.4,"$500,000",batting,SS
2,2000,Paul Abbott,SEA,0.1,"$285,000",batting,P
3,2000,Bobby Abreu,PHI,6.2,"$2,933,333",batting,RF
4,2000,Juan Acevedo,MIL,0.0,"$612,500",batting,P


#### Filter out rows with invalid Salary, WAR or Position data, and convert Salary to float (pitchers' batting rows are removed later, after the merge)

In [78]:
# Keep rows whose Salary cell is not the literal text 'Salary' (presumably
# header rows repeated inside the scraped tables - TODO confirm) and whose
# Position was mapped to one of the known abbreviations.
clean_salary = clean_position[(clean_position['Salary'] != 'Salary') & (clean_position['Position'].isin(position_dict.values()))].copy()
# Remove the thousands separators and the currency sign as literal text.
# regex=False is explicit because '$' is an end-of-string anchor when the
# pattern is treated as a regular expression, and the default differs between
# pandas versions.
clean_salary['Salary'] = clean_salary['Salary'].str.replace(',', '', regex=False)
clean_salary['Salary'] = clean_salary['Salary'].str.replace('$', '', regex=False).astype('float')
# Missing salaries are NaN and fail the '> 0' comparison, so they drop out
# together with rows that have no WAR value.
clean_salary = clean_salary[(clean_salary['Salary'] > 0) & (clean_salary['WAR'].notna())]
clean_salary.head()

Unnamed: 0,Year,Name,Tm,WAR,Salary,WAR_Type,Position
0,2000,Jeff Abbott,CHW,-0.3,255000.0,batting,CF
1,2000,Kurt Abbott,NYM,-0.4,500000.0,batting,SS
2,2000,Paul Abbott,SEA,0.1,285000.0,batting,P
3,2000,Bobby Abreu,PHI,6.2,2933333.0,batting,RF
4,2000,Juan Acevedo,MIL,0.0,612500.0,batting,P


#### The WAR/Salary/Position data is now clean

# Get Team Success: 2000-2019

#### Get the standings/playoffs data from each season

In [72]:
# Load per-season team records and postseason flags from the local Lahman
# SQLite database, renaming the key columns to match the scraped data.
con = sqlite3.connect("lahmansbaseballdb.sqlite")
try:
    teams = pd.read_sql_query("SELECT yearID as Year, teamID as Tm, W, L, DivWin, WCWin, LgWin, WSWin FROM teams WHERE yearID>=2000", con)
finally:
    # Close the connection even when the query raises (e.g. missing table).
    con.close()
teams.head()

Unnamed: 0,Year,Tm,W,L,DivWin,WCWin,LgWin,WSWin
0,2000,ANA,82,80,N,N,N,N
1,2000,BAL,74,88,N,N,N,N
2,2000,BOS,85,77,N,N,N,N
3,2000,CHA,95,67,Y,N,N,N
4,2000,CLE,90,72,N,N,N,N


#### Standardize Team abbreviations and remove data points where player played for multiple teams

In [89]:
# Abbreviation mapping applied to BOTH frames so that their 'Tm' codes agree
# before merging.
team_dict = {'CHA':'CHW','CHN':'CHC','SLN':'STL','SFN':'SFG','SDN':'SDP','NYN':'NYM','NYA':'NYY','KCA':'KCR','FLO':'FLA','TBA':'TBR','WSN':'WAS','LAN':'LAD','TBD':'TBR'}
# Assign the result back rather than calling replace(..., inplace=True) on the
# selected column, which does not update the frame under Copy-on-Write.
teams['Tm'] = teams['Tm'].replace(team_dict)
clean_salary['Tm'] = clean_salary['Tm'].replace(team_dict)
# NOTE(review): how='outer' keeps rows from either side that have no match
# (their columns from the other side become NaN). If rows whose 'Tm' matches
# no team record are meant to be removed, an inner merge would be needed -
# confirm the intent before changing it.
WAR_and_Salary_by_Position = clean_salary.merge(teams, how='outer', on=["Year", "Tm"])
# Drop only the rows that are both Position 'P' and WAR_Type 'batting',
# i.e. the batting lines of pitchers.
WAR_and_Salary_by_Position = WAR_and_Salary_by_Position[(WAR_and_Salary_by_Position['Position'] != 'P') | (WAR_and_Salary_by_Position['WAR_Type'] != 'batting')]
# drop_duplicates returns a new frame; the result must be assigned back or
# the de-duplication has no effect.
WAR_and_Salary_by_Position = WAR_and_Salary_by_Position.drop_duplicates(subset=['Year','Name','Tm','Salary'])
WAR_and_Salary_by_Position.head()

Unnamed: 0,Year,Name,Tm,WAR,Salary,WAR_Type,Position,W,L,DivWin,WCWin,LgWin,WSWin
0,2000,Jeff Abbott,CHW,-0.3,255000.0,batting,CF,95.0,67.0,Y,N,N,N
2,2000,McKay Christensen,CHW,-0.2,212500.0,batting,CF,95.0,67.0,Y,N,N,N
3,2000,Ray Durham,CHW,3.1,4900000.0,batting,2B,95.0,67.0,Y,N,N,N
8,2000,Mark Johnson,CHW,0.5,250000.0,batting,C,95.0,67.0,Y,N,N,N
9,2000,Paul Konerko,CHW,1.4,305000.0,batting,1B,95.0,67.0,Y,N,N,N


#### Export data to csv

In [90]:
# Write the merged frame to a CSV file in the working directory; index=False
# leaves the DataFrame index out of the file.
WAR_and_Salary_by_Position.to_csv('WAR_and_Salary_by_Position.csv', index=False)