In [1]:
import copy
import json
import logging
import os
import time
from random import uniform

import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt
from requests import Session
from requests.exceptions import HTTPError

In [19]:
# Create a class to retry failed requests
class ResilientSession(Session):
    """
    Session that retries requests answered with a temporary server error.

    A response whose status code is in ``RETRY_STATUS_CODES`` (500, 502, 503,
    504) is retried after a linearly growing pause (10s, 20s, 30s, ...).
    Once ``MAX_RETRIES`` retries have been used, the last response is returned
    as-is so the caller can inspect it or call ``raise_for_status()`` instead
    of the notebook blocking forever on a permanently failing endpoint.
    """

    # Status codes treated as temporary and therefore worth retrying.
    RETRY_STATUS_CODES = frozenset({500, 502, 503, 504})
    # Number of retries after the first attempt before giving up.
    MAX_RETRIES = 5
    # The pause before retry #k is BACKOFF_STEP_SECONDS * k.
    BACKOFF_STEP_SECONDS = 10

    def request(self, method, url, **kwargs):
        """
        Send a request, retrying while the server reports a recoverable error.

        Arguments are forwarded unchanged to ``Session.request``.

        Returns the first response whose status code is not retryable, or the
        last retryable response once ``MAX_RETRIES`` retries are exhausted.
        Exceptions raised by the underlying request are not caught here.
        """
        counter = 0

        while True:
            counter += 1

            r = super().request(method, url, **kwargs)

            if r.status_code not in self.RETRY_STATUS_CODES:
                return r

            if counter > self.MAX_RETRIES:
                # Hand the failing response back rather than looping forever.
                logging.warning(
                    "Got recoverable error [%s] from %s %s, giving up after %s retries",
                    r.status_code, method, url, self.MAX_RETRIES)
                return r

            delay = self.BACKOFF_STEP_SECONDS * counter
            # Lazy %-style arguments: the message is only built if it is emitted.
            logging.warning(
                "Got recoverable error [%s] from %s %s, retry #%s in %ss",
                r.status_code, method, url, counter, delay)
            time.sleep(delay)

In [20]:
# Create the session object; the season lookup further down sends its request through it
s = ResilientSession()

In [8]:
# Load the hitting data that was previously downloaded from the API and saved as CSV.
# The path is relative to the notebook's working directory.
hitting_temp = pd.read_csv('hitting_data_2012_2019.csv')

In [9]:
# The CSV was written with an extra leftover column ('Unnamed: 0'); remove it
hitting_temp.drop(columns=['Unnamed: 0'], inplace=True)

In [10]:
# Keep only the 2019 season: every row whose season is not 2019 is dropped in place.
# NOTE(review): the previous comment on this cell said 2019 is meant to be held out as
# test data and removed from the analysis, which is the opposite of what this line does.
# Confirm which is intended; if 2019 should be removed, the condition must be `== 2019`.
hitting_temp.drop(hitting_temp[hitting_temp['season'] !=2019].index, inplace = True) 

In [11]:
# Players who changed teams mid-season have a row with no team; only their overall
# stats are wanted for the overall score.
# Collect the IDs found on those team-less rows.
team_is_missing = hitting_temp['team'].isna()
movedPlayers = hitting_temp.loc[team_is_missing, 'id']

In [12]:
# For every moved player, discard the rows that carry a team,
# leaving only the rolled-up (team-less) stats row
is_moved_player = hitting_temp['id'].isin(movedPlayers)
has_team = hitting_temp['team'].notnull()
per_team_rows = hitting_temp[is_moved_player & has_team].index
hitting_temp.drop(per_team_rows, inplace=True)

In [13]:
# If primary position is null, fill it with DH.
# Assign the result back to the column: calling fillna(..., inplace=True) on a column
# selection is chained in-place mutation, which pandas warns about and which leaves
# the DataFrame unchanged when Copy-on-Write is enabled.
hitting_temp['primaryPosition'] = hitting_temp['primaryPosition'].fillna('DH')

In [14]:
# Label each player by games played, using quartiles computed within their primary position
bin_labels = ['role', 'bench', 'platoon', 'everyday']


def _games_played_quartile(games):
    """Cut one position group's gamesPlayed values into four labelled quantile bins."""
    return pd.qcut(games, q=4, labels=bin_labels)


games_by_position = hitting_temp.groupby('primaryPosition')['gamesPlayed']
hitting_temp['playing_time'] = games_by_position.transform(_games_played_quartile)

In [16]:
# A few stats arrive as strings because of "filler" values in the data.
# Convert each one to a numeric dtype; anything that cannot be parsed becomes NaN.
for stat_column in (
    'avg',
    'obp',
    'slg',
    'stolenBasePercentage',
    'babip',
    'groundOutsToAirouts',
    'atBatsPerHomeRun',
):
    hitting_temp[stat_column] = pd.to_numeric(hitting_temp[stat_column], errors='coerce')

In [21]:
# Build a lookup of regular-season start dates, keyed by season id (a string).
# The API returns **ALL** seasons; only 2012 through 2019 are kept.
seasonStarts = {}
try:
    # A timeout keeps a stalled connection from hanging the notebook indefinitely.
    seasonResponse = s.get('https://statsapi.mlb.com/api/v1/seasons/all?sportId=1', timeout=30)
    seasonResponse.raise_for_status()

    # access JSON content
    jsonSeasonResponse = seasonResponse.json()

except HTTPError as http_err:
    print(f'HTTP error occurred: {http_err}')
    # Re-raise: carrying on would hit an undefined (or stale, from an earlier run)
    # jsonSeasonResponse and fail later with a misleading error.
    raise
except Exception as err:
    print(f'Other error occurred[season]: {err}')
    raise
else:
    # Only reached when the request and the JSON decoding both succeeded.
    for season in jsonSeasonResponse['seasons']:
        # Get data for 2012 through 2019
        if 2012 <= int(season['seasonId']) < 2020:
            seasonStarts[season['seasonId']] = season['regularSeasonStartDate']


In [22]:
# Attach the season's start date to every row, looked up by season
def _season_start_for(season_year):
    """Return the start date stored in seasonStarts for the given season."""
    return seasonStarts[str(season_year)]


hitting_temp['season_start'] = hitting_temp['season'].apply(_season_start_for)

In [23]:
# Age in whole years at the start of the season: days between birth date and
# season start, divided by 365.25 and rounded down
season_start_dates = pd.to_datetime(hitting_temp['season_start'])
birth_dates = pd.to_datetime(hitting_temp['birthDate'])
age_in_days = (season_start_dates - birth_dates).dt.days
hitting_temp['age_at_start'] = np.floor(age_in_days / 365.25).astype(int)

In [24]:
# Split players into four equal-sized age groups (quartiles of age at season start)
age_group_labels = ["young", "prime", "pastprime", "old"]
hitting_temp['age_group'] = pd.qcut(
    hitting_temp['age_at_start'],
    q=4,
    labels=age_group_labels,
)