In [1]:
import pandas as pd
import numpy as np
import time
from joblib import Parallel, delayed
import pybaseball
pd.options.mode.chained_assignment = None 
from date_num import date_num
#https://stackoverflow.com/questions/20625582/how-to-deal-with-settingwithcopywarning-in-pandas

In [2]:
#ho_aw = home/away, I wanted to use a different variable than the df to keep them straight
# Can be "Home" or "@"
def make_time_double(x):
    """Turn a row's colon-separated 'Time' string into a single float.

    The piece before the first colon is taken as whole units and the piece
    after it as sixtieths of a unit (e.g. "2:30" -> 2.5).
    """
    pieces = x['Time'].split(":")
    whole = float(pieces[0])
    sixtieths = float(pieces[1])
    return whole + (sixtieths / 60)



def get_szn_schedule(year, team, ho_aw = None):
    """Fetch one team's schedule for a season and derive per-game features.

    Parameters
    ----------
    year : int
        Season passed to ``pybaseball.schedule_and_record``; also written to
        the 'Year' column of the result.
    team : str
        Team abbreviation passed to ``pybaseball.schedule_and_record``.
    ho_aw : str or None, optional
        When given, keep only rows whose 'Home_Away' equals this value
        ("Home" or "@"). ``None`` (the default) keeps every game.

    Returns
    -------
    pandas.DataFrame
        The selected schedule columns minus 'Time', plus 'Run Difference'
        (absolute value of R - RA), 'Time Double' (from ``make_time_double``)
        and 'Year'.
    """
    wanted_columns = ['Date', 'Tm', 'Home_Away', 'Opp', 'R', 'RA', 'Inn',
                      'Time', 'D/N', 'Attendance', 'cLI']

    # Work on an explicit copy so the column assignments below never target a
    # view of the frame returned by pybaseball.
    data = pybaseball.schedule_and_record(year, team)[wanted_columns].copy()
    data['Run Difference'] = np.abs(data['R'] - data['RA'])

    # Identity check: None is a singleton, so compare with `is not`.
    if ho_aw is not None:
        data = data[data["Home_Away"] == ho_aw].copy()

    data['Time Double'] = data.apply(make_time_double, axis = 1)
    data = data.drop(columns = 'Time')

    data['Year'] = year
    return data

In [3]:
# The 30 team abbreviations that are passed to get_szn_schedule.
team_list = [
    "ARI", "ATL", "BAL", "BOS", "CHW",
    "CHC", "CIN", "CLE", "COL", "DET",
    "HOU", "KC", "LAA", "LAD", "MIA",
    "MIL", "MIN", "NYM", "NYY", "OAK",
    "PHI", "PIT", "SD", "SF", "SEA",
    "STL", "TBR", "TEX", "TOR", "WSN",
]

# Seasons to download: 2017 through 2021 inclusive.
year_list = list(range(2017, 2022))


def calc_total_cLI(y):
    """Return the row's 'Home cLI' plus 'Away cLI', each cast to float."""
    home_cli = float(y['Home cLI'])
    away_cli = float(y['Away cLI'])
    return home_cli + away_cli

In [4]:
# Build one merged home/away frame per season and collect them in df_list.
df_list = []
for year in year_list:
    print(f'Starting year: {year}')

    # Download each team's schedule ONCE per season (ho_aw left at its default
    # of None) and split it locally, instead of scraping every schedule twice
    # (once for "Home", once for "@").
    series_list = Parallel(n_jobs = -1)(
        delayed(get_szn_schedule)(year, team) for team in team_list)
    season_games = pd.concat(series_list, axis = 0)

    home_games = season_games[season_games['Home_Away'] == "Home"]

    # From the away side only the join keys and the away team's cLI are kept.
    away_games = season_games[season_games['Home_Away'] == "@"]\
        .drop(['Home_Away', 'Run Difference', 'Time Double', 'Year', 'D/N', 'Inn',
               'Tm', 'Attendance'], axis = 1)

    # Pair each home row with the matching away row: same date, the home
    # team is the away row's opponent, and the scores are mirrored.
    all_games = pd.merge(left = home_games,
                    right = away_games,
                    left_on = ['Date', 'Tm', 'R', 'RA'],
                    right_on = ['Date', 'Opp', 'RA', 'R']
        ).drop(['Opp_y', 'R_y', 'RA_y'], axis = 1)

    df_list.append(all_games)
 
print("Concattenating and dropping")

# Stack the per-season frames under a fresh 0..n-1 index, give the
# merge-suffixed columns home/away names, and drop the now-constant
# 'Home_Away' column.
all_games = (
    pd.concat(df_list, axis = 0, ignore_index = True)
    .rename(columns = {
        'Tm' : 'Home Team',
        'Opp_x' : 'Away Team',
        'R_x' : 'Home Score',
        'RA_x' : 'Away Score',
        'cLI_x' : 'Home cLI',
        'cLI_y' : 'Away cLI',
    })
    .drop(columns = 'Home_Away')
)


# Total cLI = home cLI + away cLI. Values that cannot be read as numbers are
# coerced to NaN, so the sum is missing for those games (this replaces a
# per-row try / bare-except inside an iterrows loop).
all_games['Total cLI'] = (
    pd.to_numeric(all_games['Home cLI'], errors = 'coerce')
    + pd.to_numeric(all_games['Away cLI'], errors = 'coerce')
)

# Three-letter month abbreviation -> zero-padded month number.
MONTH_NUMBERS = {
    'Jan': '01', 'Feb': '02', 'Mar': '03', 'Apr': '04',
    'May': '05', 'Jun': '06', 'Jul': '07', 'Aug': '08',
    'Sep': '09', 'Oct': '10', 'Nov': '11', 'Dec': '12',
}

# Schedule dates look like "<weekday>, <Mon> <day>" with optional trailing
# text; pull out the month abbreviation and day that follow the comma.
date_parts = all_games['Date'].str.extract(
    r',\s*(?P<month>[A-Za-z]{3})\s+(?P<day>\d{1,2})')

# Assemble "YYYY-MM-DD". Any missing piece propagates as NaN, which marks the
# row as unparsed below.
iso_dates = (
    all_games['Year'].astype(str)
    + '-' + date_parts['month'].map(MONTH_NUMBERS)
    + '-' + date_parts['day'].str.zfill(2)
)

# Rows that could not be parsed are printed for inspection and keep their
# original 'Date' value.
unparsed = iso_dates.isna()
for raw_date in all_games.loc[unparsed, 'Date']:
    print(raw_date)
all_games['Date'] = iso_dates.where(~unparsed, all_games['Date'])

print("Done")

Starting year: 2017
Starting year: 2018
Starting year: 2019
Starting year: 2020
Starting year: 2021
Concattenating and dropping
Done


I have successfully compiled a DataFrame with one row per game, containing: date, home team, away team, home score, away score, number of innings, day/night (D/N), attendance, cLI for both teams, run differential, and game time.

Next I want to add weather conditions (temperature and precipitation) for each game; I think that would then be enough for a regression.

I think it'd be best to write a quick .py file that takes a home team as input and returns a Meteostat point for that ballpark's location.

Next, let's write a function that takes in a Series (one game) and returns that Series with the weather data added. Since it's a function, we can parallelize it again.

In [5]:
from get_weather import park_weather

def add_weather(game):
    """Return one game's row extended with weather and day-of-year fields.

    Parameters
    ----------
    game : pandas.Series
        A single game; its 'Home Team' and 'Date' entries are read.

    Returns
    -------
    pandas.Series
        ``game`` followed by the first row returned by ``park_weather`` and a
        'day_num_of_year' entry from ``date_num``, with the 'tmin', 'tmax',
        'wdir', 'wpgt', 'pres' and 'tsun' entries dropped.

    Notes
    -----
    Assumes ``park_weather`` returns a DataFrame with at least one row that
    includes the dropped labels -- TODO confirm against get_weather.py.
    """
    weather = park_weather(game['Home Team'], game['Date'])
    day_of_year = pd.Series({
        'day_num_of_year' : date_num(game['Date'])
    })

    # Series.append was deprecated in pandas 1.4 and removed in 2.0;
    # pd.concat is the supported way to join Series end to end.
    enriched = pd.concat([game, weather.iloc[0], day_of_year])
    return enriched.drop(['tmin', 'tmax', 'wdir', 'wpgt', 'pres', 'tsun'])

In [7]:
# Run add_weather over every game on 3 workers, then rebuild a frame from the
# returned Series. Despite its name, home_games now holds every merged game
# (one row each) together with the weather fields.
series_list = Parallel(n_jobs = 3, verbose = 2)(
    delayed(add_weather)(row) for _, row in all_games.iterrows()
)
home_games = pd.DataFrame(series_list)


[Parallel(n_jobs=3)]: Using backend LokyBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done  43 tasks      | elapsed:    2.3s
[Parallel(n_jobs=3)]: Done 285 tasks      | elapsed:    8.8s
[Parallel(n_jobs=3)]: Done 691 tasks      | elapsed:   20.0s
[Parallel(n_jobs=3)]: Done 1257 tasks      | elapsed:   33.5s
[Parallel(n_jobs=3)]: Done 1987 tasks      | elapsed:   50.6s
[Parallel(n_jobs=3)]: Done 2877 tasks      | elapsed:  1.2min
[Parallel(n_jobs=3)]: Done 3931 tasks      | elapsed:  1.6min
[Parallel(n_jobs=3)]: Done 5145 tasks      | elapsed:  2.0min
[Parallel(n_jobs=3)]: Done 6523 tasks      | elapsed:  2.6min
[Parallel(n_jobs=3)]: Done 8061 tasks      | elapsed:  3.2min
[Parallel(n_jobs=3)]: Done 9763 tasks      | elapsed:  3.9min
[Parallel(n_jobs=3)]: Done 10617 out of 10617 | elapsed:  4.2min finished


In [None]:
"""
for index, game in all_games.iterrows():
    try:
        add_weather(game)
    except:
        print(game, "\n\n")
    if (index % 100) == 0:
        print("Index:", index)
"""

In [None]:
# List the columns of the frame built from add_weather's results.
home_games.columns

In [8]:
# Missing snow readings are set to 0. `.loc` is the boolean-mask setter;
# `.at` only addresses a single scalar row/column label.
home_games.loc[home_games['snow'].isna(), 'snow'] = 0

# Encode day/night as 1/0 under the name 'Day'. The guard keeps the cell
# idempotent: on a re-run 'D/N' is already gone, and comparing the encoded
# 0/1 values with 'D' would otherwise reset every game to 0.
if 'D/N' in home_games.columns:
    home_games = home_games.rename(columns = {'D/N' : 'Day'})
    home_games['Day'] = np.where((home_games['Day'] == 'D'), 1, 0)

home_games.to_csv('data_files/other_factors.csv', index = False)

In [None]:
# Display the weather-enriched game table that was written to CSV above.
home_games