In [4]:
import pandas as pd
import numpy as np
import time
from joblib import Parallel, delayed
import pybaseball
pd.options.mode.chained_assignment = None 
from date_num import date_num
#https://stackoverflow.com/questions/20625582/how-to-deal-with-settingwithcopywarning-in-pandas

In [5]:
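# ho_aw (home/away) is the filter argument of get_szn_schedule below; it is named
# differently from the 'Home_Away' column of the df to keep the two straight.
# It can be "Home" or "@" (or None to keep every game).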
def make_time_double(x):
    """Turn a row's 'Time' string into a single float.

    x is anything indexable by 'Time' (e.g. a DataFrame row). The string is
    split on ":"; the first piece counts as whole units and the second piece
    as sixtieths of a unit, so "3:30" becomes 3.5.
    """
    pieces = x['Time'].split(":")
    whole, sixtieths = float(pieces[0]), float(pieces[1])
    return whole + (sixtieths / 60)



def get_szn_schedule(year, team, ho_aw = None):
    """Fetch one team's schedule for a season, trimmed to the columns used here.

    Parameters
    ----------
    year
        Season passed to pybaseball.schedule_and_record; also stored in the
        'Year' column of the result.
    team
        Team abbreviation passed to pybaseball.schedule_and_record.
    ho_aw
        Optional filter on the 'Home_Away' column ("Home" or "@").
        None (the default) keeps every game.

    Returns
    -------
    DataFrame
        The selected schedule columns plus 'Run Difference' (absolute value of
        R - RA), 'Time Double' ('Time' converted by make_time_double; the
        original 'Time' column is dropped) and 'Year'.
    """
    keep_cols = ['Date', 'Tm', 'Home_Away', 'Opp', 'R', 'RA', 'Inn', 'Time',
                 'D/N', 'Attendance', 'cLI']

    # .copy() gives an independent frame, so the column assignments below never
    # write into a slice of the downloaded table.
    data = pybaseball.schedule_and_record(year, team)[keep_cols].copy()
    data['Run Difference'] = np.abs(data['R'] - data['RA'])

    if ho_aw is not None:
        data = data[data["Home_Away"] == ho_aw].copy()

    data['Time Double'] = data.apply(make_time_double, axis = 1)
    data = data.drop('Time', axis = 1)

    data['Year'] = year
    return data

In [6]:
# Team abbreviations to request, one schedule download per entry.
team_list = (
    "ARI ATL BAL BOS CHW CHC CIN CLE COL DET HOU KC LAA "
    "LAD MIA MIL MIN NYM NYY OAK PHI PIT SD SF SEA STL "
    "TBR TEX TOR WSN"
).split()

# Seasons to download.
year_list = list(range(2017, 2022))


def calc_total_cLI(y):
    """Return the float sum of a row's 'Home cLI' and 'Away cLI' entries."""
    home_cli = float(y['Home cLI'])
    away_cli = float(y['Away cLI'])
    return home_cli + away_cli

In [7]:
# Month abbreviation -> two-digit month number, used to rebuild the 'Date'
# strings as YYYY-MM-DD below.
MONTH_NUMBERS = {
    'Jan': '01', 'Feb': '02', 'Mar': '03', 'Apr': '04', 'May': '05', 'Jun': '06',
    'Jul': '07', 'Aug': '08', 'Sep': '09', 'Oct': '10', 'Nov': '11', 'Dec': '12',
}

df_list = []
for year in year_list:
    print(f'Starting year: {year}')

    # Every team's home games for this season, downloaded in parallel.
    series_list = Parallel(n_jobs = -1)(
        delayed(get_szn_schedule)(year, team, "Home") for team in team_list)
    home_games = pd.concat(series_list, axis = 0)

    # Every team's road games; only the columns needed to pair each road row
    # with its home row (plus the road team's cLI) are kept.
    series_list = Parallel(n_jobs = -1)(
        delayed(get_szn_schedule)(year, team, "@") for team in team_list)
    away_games = pd.concat(series_list, axis = 0).drop(
        columns = ['Home_Away', 'Run Difference', 'Time Double', 'Year', 'D/N',
                   'Inn', 'Tm', 'Attendance'])

    # A home row matches the road row with the same Date whose opponent is the
    # home team and whose runs for/against are the home row's mirrored.
    all_games = pd.merge(left = home_games,
                         right = away_games,
                         left_on = ['Date', 'Tm', 'R', 'RA'],
                         right_on = ['Date', 'Opp', 'RA', 'R']
        ).drop(columns = ['Opp_y', 'R_y', 'RA_y'])

    df_list.append(all_games)

print("Concattenating and dropping")
all_games = pd.concat(df_list, axis = 0, ignore_index = True)
all_games = all_games.rename(columns = {
    'Tm' : 'Home Team',
    'Opp_x' : 'Away Team',
    'R_x' : 'Home Score',
    'RA_x' : 'Away Score',
    'cLI_x' : 'Home cLI',
    'cLI_y' : 'Away cLI',
}).drop(columns = 'Home_Away')

# Sum of both teams' cLI. errors='coerce' turns entries that cannot be read as
# numbers into NaN, so one bad value leaves only that row's total missing.
all_games['Total cLI'] = (
    pd.to_numeric(all_games['Home cLI'], errors = 'coerce')
    + pd.to_numeric(all_games['Away cLI'], errors = 'coerce'))

# Rebuild 'Date' as 'YYYY-MM-DD' from the month abbreviation and day number that
# follow the comma in the raw string, combined with the row's 'Year'.
date_parts = all_games['Date'].str.extract(
    r',\s*(?P<month>[A-Za-z]+)\s+(?P<day>\d+)')
iso_date = (all_games['Year'].astype(str)
            + '-' + date_parts['month'].map(MONTH_NUMBERS)
            + '-' + date_parts['day'].str.zfill(2))

# Dates that could not be rebuilt are printed for inspection and left unchanged.
unparsed = iso_date.isna()
for raw_date in all_games.loc[unparsed, 'Date']:
    print(raw_date)
all_games['Date'] = iso_date.where(~unparsed, all_games['Date'])

print("Done")

Starting year: 2017
Starting year: 2018
Starting year: 2019
Starting year: 2020
Starting year: 2021
Concattenating and dropping
Done


I have successfully compiled a DataFrame with one row per game containing the date, home team, away team, home score, away score, number of innings, day/night flag (D/N), attendance, cLI for both teams (and their total), run difference, and game time.

Next I want to add the weather conditions for each game (temperature and precipitation); with those I think there will be enough features for a regression.

I think it would be best to write a small .py file that takes a home team as input and returns a meteostat point for that ballpark's location.

Next, let's write a function that takes in a Series (one game) and returns that Series with the weather data added. Since it's a function, we can parallelize it again.

In [10]:
from get_weather import park_weather

def add_weather(game):
    """Return `game` extended with weather values and a day-of-year number.

    Parameters
    ----------
    game
        Series for one game. Its 'Home Team' and 'Date' entries are passed to
        park_weather, and 'Date' is also passed to date_num.

    Returns
    -------
    Series
        The game's entries, followed by the first row of the park_weather
        result and a 'day_num_of_year' entry, with the 'tmin', 'tmax', 'wdir',
        'wpgt', 'pres' and 'tsun' entries dropped.
    """
    weather = park_weather(game['Home Team'], game['Date'])
    day_number = pd.Series({
        'day_num_of_year' : date_num(game['Date'])
    })

    # Series.append was deprecated in pandas 1.4 and removed in 2.0;
    # pd.concat joins the same pieces in the same order.
    game = pd.concat([game, weather.iloc[0], day_number])
    return game.drop(['tmin', 'tmax', 'wdir', 'wpgt', 'pres', 'tsun'])

In [30]:
# Attach weather to every game in parallel, then rebuild a DataFrame from the
# per-game Series that come back.
weather_jobs = (delayed(add_weather)(game) for _, game in all_games.iterrows())
series_list = Parallel(n_jobs = -4, verbose = 2)(weather_jobs)
home_games = pd.DataFrame(series_list)


[Parallel(n_jobs=-4)]: Using backend LokyBackend with 5 concurrent workers.
[Parallel(n_jobs=-4)]: Done  31 tasks      | elapsed:    1.8s
[Parallel(n_jobs=-4)]: Done 259 tasks      | elapsed:    5.9s
[Parallel(n_jobs=-4)]: Done 665 tasks      | elapsed:   13.5s
[Parallel(n_jobs=-4)]: Done 1231 tasks      | elapsed:   24.5s
[Parallel(n_jobs=-4)]: Done 1961 tasks      | elapsed:   38.7s
[Parallel(n_jobs=-4)]: Done 2851 tasks      | elapsed:   55.8s
[Parallel(n_jobs=-4)]: Done 3905 tasks      | elapsed:  1.4min
[Parallel(n_jobs=-4)]: Done 5119 tasks      | elapsed:  1.8min
[Parallel(n_jobs=-4)]: Done 6497 tasks      | elapsed:  2.3min
[Parallel(n_jobs=-4)]: Done 8035 tasks      | elapsed:  2.8min
[Parallel(n_jobs=-4)]: Done 9737 tasks      | elapsed:  3.3min
[Parallel(n_jobs=-4)]: Done 10617 out of 10617 | elapsed:  3.6min finished


In [None]:
# Disabled serial version of the weather step, kept inside a string literal so
# it never runs. It calls add_weather on each game one at a time, prints any
# game that raises, and prints the index every 100 rows.
"""
for index, game in all_games.iterrows():
    try:
        add_weather(game)
    except:
        print(game, "\n\n")
    if (index % 100) == 0:
        print("Index:", index)
"""

In [12]:
# Show which columns the weather-augmented frame ended up with.
home_games.columns

Index(['Date', 'Home Team', 'Away Team', 'Home Score', 'Away Score', 'Inn',
       'D/N', 'Attendance', 'Home cLI', 'Run Difference', 'Time Double',
       'Year', 'Away cLI', 'Total cLI', 'tavg', 'prcp', 'snow', 'wspd',
       'day_num_of_year'],
      dtype='object')

In [38]:
# Treat a missing snow reading as no snow. Boolean-mask assignment belongs to
# .loc; .at is meant for a single row/column label pair.
home_games.loc[home_games['snow'].isna(), 'snow'] = 0

# Encode day/night as an indicator: 1 for day games ('D'), 0 otherwise.
# Guarded on the raw 'D/N' column so re-running this cell cannot re-encode an
# already converted 'Day' column to all zeros.
if 'D/N' in home_games.columns:
    home_games = home_games.rename(columns = {'D/N' : 'Day'})
    home_games['Day'] = np.where((home_games['Day'] == 'D'), 1, 0)

home_games.to_csv('data_files/other_factors.csv', index = False)

In [37]:
# Display the final frame as a visual check of the cleaned columns.
home_games

Unnamed: 0,Date,Home Team,Away Team,Home Score,Away Score,Inn,Day,Attendance,Home cLI,Run Difference,Time Double,Year,Away cLI,Total cLI,tavg,prcp,snow,wspd,day_num_of_year
0,2017-04-02,ARI,SFG,6.0,5.0,9.0,1,49016.0,1.06,1.0,3.383333,2017,1.06,2.12,18.9,0.0,0.0,11.9,92
1,2017-04-04,ARI,SFG,4.0,8.0,9.0,0,19378.0,1.12,4.0,3.433333,2017,1.01,2.13,21.0,0.0,0.0,12.2,94
2,2017-04-05,ARI,SFG,8.0,6.0,9.0,0,14675.0,1.03,2.0,3.250000,2017,1.05,2.08,21.7,0.0,0.0,9.7,95
3,2017-04-06,ARI,SFG,9.0,3.0,9.0,0,15308.0,1.10,6.0,3.000000,2017,1.05,2.15,23.6,0.0,0.0,7.9,96
4,2017-04-07,ARI,CLE,7.0,3.0,9.0,0,22443.0,1.00,4.0,3.316667,2017,1.02,2.02,25.3,0.0,0.0,10.1,97
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10612,2021-09-18,WSN,COL,0.0,6.0,9.0,1,29315.0,.00,6.0,3.100000,2021,.00,0.0,24.4,0.0,0.0,6.1,261
10613,2021-09-19,WSN,COL,3.0,0.0,9.0,1,26303.0,.00,3.0,3.016667,2021,.00,0.0,24.9,0.0,0.0,11.2,262
10614,2021-10-01,WSN,BOS,2.0,4.0,9.0,0,32521.0,.00,2.0,3.416667,2021,3.14,3.14,17.9,0.0,0.0,6.8,274
10615,2021-10-02,WSN,BOS,3.0,5.0,9.0,1,41465.0,.00,2.0,3.883333,2021,3.93,3.93,18.9,0.0,0.0,9.0,275
