In [1]:
import pandas as pd
import numpy as np
import time
from joblib import Parallel, delayed

In [2]:
ump_performance = pd.read_csv("data_files/umpire_performance.csv")
other_factors = pd.read_csv("data_files/other_factors.csv")

In [6]:
print("Ump_performance columns: ", ump_performance.columns)
print("ump_performance rows: ", len(ump_performance))
print("\n\nother_factors columns:", other_factors.columns)
print("other_factors rows: ", len(other_factors))

Ump_performance columns:  Index(['game_pk', 'game_date_x', 'game_year', 'umpire_accuracy',
       'umpire_consistency', 'home_team', 'post_home_score', 'post_away_score',
       'id', 'name'],
      dtype='object')
ump_performance rows:  10216


other_factors columns: Index(['Date', 'Home Team', 'Away Team', 'Home Score', 'Away Score', 'Inn',
       'Day', 'Attendance', 'Home cLI', 'Run Difference', 'Time Double',
       'Year', 'Away cLI', 'Total cLI', 'tavg', 'prcp', 'snow', 'wspd',
       'day_num_of_year'],
      dtype='object')
other_factors rows:  10617


In [13]:
df = pd.merge(
    left = ump_performance,
    right = other_factors, 
    left_on = ['game_date_x', 'home_team', 'post_home_score', 'post_away_score'],
    right_on = ['Date', 'Home Team', 'Home Score', 'Away Score']
)
df.columns

Index(['game_pk', 'game_date_x', 'game_year', 'umpire_accuracy',
       'umpire_consistency', 'home_team', 'post_home_score', 'post_away_score',
       'id', 'name', 'Date', 'Home Team', 'Away Team', 'Home Score',
       'Away Score', 'Inn', 'Day', 'Attendance', 'Home cLI', 'Run Difference',
       'Time Double', 'Year', 'Away cLI', 'Total cLI', 'tavg', 'prcp', 'snow',
       'wspd', 'day_num_of_year'],
      dtype='object')

In [93]:
def build_data(year_id, ump_df):
    ump_df = ump_df.reset_index()
    
    #print(year_id)
    ump_df['year'] = year_id[0]
    ump_df['id'] = year_id[1]
    
    ump_df['last_accuracy'] = pd.NA
    ump_df['last_consistency'] = pd.NA
    ump_df['days_between'] = pd.NA
    for index in range(1, len(ump_df)):
        ump_df.at[index, 'last_accuracy'] = ump_df.iloc[index-1]['umpire_accuracy']
        ump_df.at[index, 'last_consistency'] = ump_df.iloc[index-1]['umpire_consistency']
        ump_df.at[index, 'days_between'] = \
            ump_df.iloc[index]['day_num_of_year'] - ump_df.iloc[index-1]['day_num_of_year']
        
    return ump_df[1:]

In [111]:
start = time.time()
year_groups = df.groupby(by = ['game_year', 'id'])
for year, year_df in year_groups.__iter__():
    temp = build_data(year, year_df) 
    #print(temp)
time_elapsed =  time.time() - start #Takes ~1/3 of time in parallel
print(f"Time elapsed: {time_elapsed:.2f} seconds.")  
#Evidently I didn't need to make this parallel 
#but I am proud of myself nonetheless

Time elapsed: 4.68 seconds.


In [101]:
year_groups = df.groupby(by = ['game_year', 'id'])
umpire_list = Parallel(n_jobs = -1, verbose = 4)(
    delayed(build_data)(year, year_df) for year, year_df in year_groups.__iter__())


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:    0.0s
[Parallel(n_jobs=-1)]: Done 377 tasks      | elapsed:    1.1s
[Parallel(n_jobs=-1)]: Done 445 out of 460 | elapsed:    1.2s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done 460 out of 460 | elapsed:    1.2s finished


In [106]:
game_summary = pd.concat(umpire_list, axis = 0, ignore_index = True)
game_summary.columns

Index(['index', 'game_pk', 'game_date_x', 'game_year', 'umpire_accuracy',
       'umpire_consistency', 'home_team', 'post_home_score', 'post_away_score',
       'id', 'name', 'Date', 'Home Team', 'Away Team', 'Home Score',
       'Away Score', 'Inn', 'Day', 'Attendance', 'Home cLI', 'Run Difference',
       'Time Double', 'Year', 'Away cLI', 'Total cLI', 'tavg', 'prcp', 'snow',
       'wspd', 'day_num_of_year', 'year', 'last_accuracy', 'last_consistency',
       'days_between'],
      dtype='object')