In [1]:
import pandas as pd
import numpy as np
import time
from joblib import Parallel, delayed
import pybaseball
pd.options.mode.chained_assignment = None 
#https://stackoverflow.com/questions/20625582/how-to-deal-with-settingwithcopywarning-in-pandas

In [2]:
#ho_aw = home/away, I wanted to use a different variable than the df to keep them straight
# Can be "Home" or "@"
make_time_double = lambda x: float(x['Time'].split(":")[0]) + (float(x['Time'].split(":")[1])/60)

def get_szn_schedule(year, team, ho_aw = None):
    data = pybaseball.schedule_and_record(year, team)
    data = data[['Date', 'Tm', 'Home_Away', 'Opp', 'R', 'RA', 'Inn', 'Time', 'D/N', 'Attendance', 'cLI']]
    data['Run Difference'] = np.abs(data['R'] - data['RA'])
    
    if ho_aw != None:
        data = data[data["Home_Away"] == ho_aw]

    data['Time Double'] = data.apply(make_time_double, axis = 1)
    data = data.drop('Time', axis = 1)
    
    data['Year'] = year
    return data

In [3]:
team_list = ["ARI", "ATL", "BAL", "BOS", "CHW", "CHC", "CIN", "CLE", "COL", "DET", "HOU", "KC", "LAA", 
             "LAD", "MIA", "MIL", "MIN", "NYM", "NYY", "OAK", "PHI", "PIT", "SD", "SF", "SEA", "STL", 
            "TBR", "TEX", "TOR", "WSN"]
year_list = [2017, 2018, 2019, 2020, 2021]

In [4]:
"""#Written more parallel in next loop
for year in year_list:
    print(f'Starting year: {year}')
    
    series_list = Parallel(n_jobs = -1)(
        delayed(get_szn_schedule)(year, team, "Home") for team in team_list)
    
    home_games = pd.concat(series_list, axis = 0)
    
    series_list = Parallel(n_jobs = -1)(
        delayed(get_szn_schedule)(year, team, "@") for team in team_list)
    
    away_games = pd.concat(series_list, axis = 0)\
        .drop(['Home_Away', 'Run Difference', 'Time Double', 'Year', 'D/N', 'Inn', 
               'Tm', 'Attendance'], axis = 1)
    
    
    temp = pd.merge(left = home_games, 
                    right = away_games, 
                    left_on = ['Date', 'Tm', 'R', 'RA'],
                    right_on = ['Date', 'Opp', 'RA', 'R']
        ).drop(['Opp_y', 'R_y', 'RA_y'], axis = 1)
    print(temp.columns)
    print(temp)
    
    break
"""

Starting year: 2017
Index(['Date', 'Tm', 'Home_Away', 'Opp_x', 'R_x', 'RA_x', 'Inn', 'D/N',
       'Attendance', 'cLI_x', 'Run Difference', 'Time Double', 'Year',
       'cLI_y'],
      dtype='object')
                  Date   Tm Home_Away Opp_x  R_x  RA_x  Inn D/N  Attendance  \
0        Sunday, Apr 2  ARI      Home   SFG  6.0   5.0  9.0   D     49016.0   
1       Tuesday, Apr 4  ARI      Home   SFG  4.0   8.0  9.0   N     19378.0   
2     Wednesday, Apr 5  ARI      Home   SFG  8.0   6.0  9.0   N     14675.0   
3      Thursday, Apr 6  ARI      Home   SFG  9.0   3.0  9.0   N     15308.0   
4        Friday, Apr 7  ARI      Home   CLE  7.0   3.0  9.0   N     22443.0   
...                ...  ...       ...   ...  ...   ...  ...  ..         ...   
2425    Sunday, Sep 17  WSN      Home   LAD  7.0   1.0  9.0   N     29155.0   
2426  Thursday, Sep 28  WSN      Home   PIT  5.0   4.0  9.0   N     26380.0   
2427    Friday, Sep 29  WSN      Home   PIT  6.0   1.0  9.0   N     36339.0   
2428  Sa

In [7]:
print("Starting Home")
series_list = Parallel(n_jobs = -1)(delayed(get_szn_schedule)(year, team, "Home") 
    for year, team in zip(year_list*30, team_list*5))
home_games = pd.concat(series_list, axis = 0)

print("Starting away")
series_list = Parallel(n_jobs = -1)(delayed(get_szn_schedule)(year, team, "@") 
    for year, team in zip((year_list*30), (team_list*5))) 
away_games = pd.concat(series_list, axis = 0)\
    .drop(['Home_Away', 'Run Difference', 'Time Double', 'Year', 'D/N', 'Inn', 
           'Tm', 'Attendance'], axis = 1)

print("Starting Merge")
temp = pd.merge(left = home_games, 
                right = away_games, 
                left_on = ['Date', 'Tm', 'R', 'RA'],
                right_on = ['Date', 'Opp', 'RA', 'R']
    ).drop(['Opp_y', 'R_y', 'RA_y'], axis = 1)
print(temp.columns)
print(temp)

Starting Home
Starting away
Starting Merge
Index(['Date', 'Tm', 'Home_Away', 'Opp_x', 'R_x', 'RA_x', 'Inn', 'D/N',
       'Attendance', 'cLI_x', 'Run Difference', 'Time Double', 'Year',
       'cLI_y'],
      dtype='object')
                    Date   Tm Home_Away Opp_x  R_x  RA_x  Inn D/N  Attendance  \
0          Friday, Jun 9  ARI      Home   MIL  6.0   8.0  9.0   N     25009.0   
1          Friday, Jun 9  ARI      Home   MIL  6.0   8.0  9.0   N     25009.0   
2          Friday, Jun 9  ARI      Home   MIL  6.0   8.0  9.0   N     25009.0   
3          Friday, Jun 9  ARI      Home   MIL  6.0   8.0  9.0   N     25009.0   
4          Friday, Jun 9  ARI      Home   MIL  6.0   8.0  9.0   N     25009.0   
...                  ...  ...       ...   ...  ...   ...  ...  ..         ...   
10045  Wednesday, Sep 15  WSN      Home   MIA  6.0   8.0  9.0   D     16309.0   
10046  Wednesday, Sep 15  WSN      Home   MIA  6.0   8.0  9.0   D     16309.0   
10047  Wednesday, Sep 15  WSN      Home   MIA 