Double Play Analytics

(1) Forecast the outcome of games; (2) understand the most important factors that influence the outcome of games.

- Most important factors of a win? 
    - Pitching (starter), pitching (reliever/closer), defense, runs/hitting (consistency), home runs, the park, OBP, other?
    - run consistency - can you get at least 3 runs per game x percent of the time?


In [None]:
# Bringing in packages for EDA, pre-processing, modeling, and visualizations
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import MissingIndicator, SimpleImputer
from sklearn.dummy import DummyClassifier

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn import tree
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report

from sklearn.metrics import accuracy_score, precision_score, confusion_matrix, plot_confusion_matrix, recall_score

#### Create a single dataset

In [None]:
# One game-log .txt file per season, 2010 through 2023 inclusive
file_paths = [f"data/gl{season}.txt" for season in range(2010, 2024)]

In [None]:
# Accumulator for the per-season DataFrames; filled by the loading loop in the next cell
dfs=[]

In [None]:
# Start from an empty list so that re-running this cell does not append every
# season a second time (which would silently duplicate all rows after concat).
dfs = []

for file_path in file_paths:
    with open(file_path, "r") as f:
        # Naive comma split of every line: any double quotes around a field are
        # kept as part of the value and are dealt with by later cells.
        data_split = [line.strip().split(",") for line in f]
    # Every line is treated as data, so columns get positional integer labels (0, 1, 2, ...)
    df_initial = pd.DataFrame(data_split)
    dfs.append(df_initial)

In [None]:
# Stack the per-season DataFrames into a single frame.
# ignore_index=True discards each file's own row numbers and renumbers the rows 0..n-1.
df = pd.concat(dfs, ignore_index=True)

#### Understand what the data look like

In [None]:

# Widen the text display so more characters fit on one row
pd.set_option('display.width', 1000)  # Adjust as needed

# Remove the row/column display limits (None = show everything).
# Note: with max_rows=None, displaying a full DataFrame prints every row.
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

# Summarise the DataFrame: column labels, non-null counts and dtypes
df.info()

In [None]:
df.head()

In [None]:

df.tail()

In [None]:
df.describe()

In [None]:
df.shape

In [None]:
# One boolean per column: True when the column contains at least one missing value
nan_values = df.isna().any()
nan_values

In [None]:
df.dtypes

In [None]:
df.count()

#### Preparing data for more EDA (easier to use with names)

In [None]:
# Keep only the first 77 columns (positions 0-76) and drop everything from position 77 onward.
# The dropped columns are the minimal-information columns at the END of the dataset
# and the individual player, manager, and umpire info.
# TODO: additional dropping needed
df.drop(columns=df.columns[77:179], inplace=True)

In [None]:
# Descriptive names for the 77 retained positional columns (0-76).

# Columns 0-20: game-level information
_game_fields = [
    'date', 'num_games', 'day_of_week',
    'team_visiting', 'league_visiting', 'game_num_visiting',
    'team_home', 'league_home', 'game_num_home',
    'score_visiting', 'score_home',
    'outs_in_game', 'time_of_day', 'game_completed', 'forfeit', 'protest',
    'park_id', 'attendance', 'length_min',
    'line_score_visiting', 'line_score_home',
]

# The same 28 team statistics occur twice in identical order:
# columns 21-48 for the visiting team, columns 49-76 for the home team.
_team_stat_fields = [
    # Offense
    'at_bats', 'hits', 'double', 'triple', 'home_run', 'rbi',
    'sacrifice_hit', 'sacrifine_fly', 'hit_by_pitch', 'walk', 'intent_walk',
    'strikeout', 'stolen_base', 'caught_stealing',
    'grounded_into_double_plays', 'first_catcher_interfere', 'left_on_base',
    # Pitching
    'pitchers_used', 'individual_earned_runs', 'team_earned_runs',
    'wild_pitches', 'balks',
    # Defense
    'putouts', 'assists', 'errors', 'passed_balls', 'double_def', 'triple_def',
]

# Map each positional column label to its descriptive name
new_column_names = dict(enumerate(
    _game_fields
    + [f'{stat}_visiting' for stat in _team_stat_fields]
    + [f'{stat}_home' for stat in _team_stat_fields]
))

# Replace the positional integer column labels with the descriptive names from new_column_names
df.rename(columns=new_column_names, inplace=True)

In [None]:
# Columns to convert from strings to numbers.
# Each name appears exactly once; the earlier literal listed 'game_num_home'
# and 'score_visiting' twice, which produces duplicate labels when the list is
# used to select columns.

# Game-level numeric columns.
# NOTE(review): 'num_games' is also listed among the quote-stripped text columns
# in a later cell; if its raw values still carry quotes when the conversion
# runs, errors='coerce' will turn them into NaN — confirm this is intended.
_game_level_numeric = [
    'num_games',
    'game_num_visiting',
    'game_num_home',
    'score_visiting',
    'score_home',
    'outs_in_game',
    'attendance',
    'length_min',
]

# Per-team statistics; each exists once with a '_visiting' and once with a '_home' suffix
_team_stat_names = [
    # Offense
    'at_bats', 'hits', 'double', 'triple', 'home_run', 'rbi',
    'sacrifice_hit', 'sacrifine_fly', 'hit_by_pitch', 'walk', 'intent_walk',
    'strikeout', 'stolen_base', 'caught_stealing',
    'grounded_into_double_plays', 'first_catcher_interfere', 'left_on_base',
    # Pitching
    'pitchers_used', 'individual_earned_runs', 'team_earned_runs',
    'wild_pitches', 'balks',
    # Defense
    'putouts', 'assists', 'errors', 'passed_balls', 'double_def', 'triple_def',
]

columns_to_convert = (
    _game_level_numeric
    + [f'{stat}_visiting' for stat in _team_stat_names]
    + [f'{stat}_home' for stat in _team_stat_names]
)

# Player positions are numeric, but are being left as strings

In [None]:
# Convert the listed columns to numbers; values that cannot be parsed become NaN (errors='coerce').
# The names are de-duplicated first (order preserved): selecting the same column
# twice yields duplicate labels, which pandas can refuse to assign back.
numeric_cols = list(dict.fromkeys(columns_to_convert))
df[numeric_cols] = df[numeric_cols].apply(pd.to_numeric, errors='coerce')

In [None]:
# Text columns whose values should have their surrounding double quotes removed
columns_to_convert_strings = [
    'date', 'num_games', 'day_of_week',
    'team_visiting', 'league_visiting',
    'team_home', 'league_home',
    'time_of_day', 'park_id',
    'line_score_visiting', 'line_score_home',
]

In [None]:
# Strip leading/trailing double quotes from the listed text columns.
# Done column by column with Series.map because DataFrame.applymap is deprecated
# in recent pandas releases; non-string cells (NaN, numbers) pass through unchanged.
for text_col in columns_to_convert_strings:
    df[text_col] = df[text_col].map(lambda x: x.strip('"') if isinstance(x, str) else x)

In [None]:
df.head()

In [None]:
df.describe()

In [None]:
df.count()

In [None]:
df.dtypes

##### Columns with missing data- 
I think it's likely I won't be using attendance or length_min, so I will probably drop both columns entirely.
For the other three columns I will likely drop the affected rows: the number of missing values is not notable and there is no reason to believe they are missing systematically, but I'll check to confirm (e.g., same season, same team, etc.).

Initial total rows = 32484 

Dropped column
- attendance                             31549
- length_min                             32446
- protest
- misc

Dropped rows
- year = 2020; removed this, but not so sure. If I'm not analyzing by year it feels weird to remove it.
- at_bats_visiting                       32479
- double_visiting                        32446
- triple_visiting                        32446


##### Fields with large 'missing' /NA - how to handle these missing? 
 - saving_pitcher_id
 - saving_pitcher_name
 - game_winning_rbi_id
 - game_winning_rbi_name
 


In [None]:
# Drop every row that has a missing value in any of the three listed columns.
# The rows are removed from df in place.
df.dropna(subset=["at_bats_visiting", "double_visiting", "triple_visiting"], inplace=True)

In [None]:
# Drop games that ended in a tie (found originally because one observation had a
# run differential of 0). Rows are selected by comparing the two scores instead of
# a hard-coded index label (previously 16957), so the cell keeps targeting the
# right rows if the input files change and can be re-run without a KeyError.
tie_rows = df.index[df['score_visiting'] == df['score_home']]
df.drop(index=tie_rows, inplace=True)

In [None]:
# Remove rows with no protest value, then rows whose protest code is "V" or "H".
# The codes are matched with their literal double quotes because 'protest' is
# not one of the columns that had its quotes stripped.
protest_codes = ['"V"', '"H"']
df.dropna(subset=['protest'], inplace=True)
df = df[~df['protest'].isin(protest_codes)]

In [None]:
# This drops the partial 2020 season; not sure I want to drop it. It was an odd year, 
# but unless I'm looking at year as a factor what is the impact of keeping it in?
# df = df[df['year'] != 2020]

In [None]:
# Columns not carried forward into the analysis.
# both game_completed and forfeit had zero values/100% null for this timeframe
unused_columns = [
    "attendance", "num_games", "length_min",
    "game_completed", "forfeit", "protest",
    "day_of_week", "league_visiting", "league_home",
    "park_id", "time_of_day",
]
df.drop(columns=unused_columns, inplace=True)

In [None]:
# FLO/MIA is the only team that switched three letter codes during this time period. Combining them as MIA.
# The result is assigned back to the column: calling .replace(..., inplace=True) on
# df['col'] mutates an intermediate Series and is not guaranteed to update df
# (pandas warns about this pattern, and it is a no-op under Copy-on-Write).
team_code_fixes = {"FLO": "MIA"}
for team_col in ('team_visiting', 'team_home'):
    df[team_col] = df[team_col].replace(team_code_fixes)

In [None]:
df.head()

#### Create new features

In [None]:
# Integer four-digit year feature: take the first four characters of 'date',
# parse them as numbers, then cast the result to int.
df['year'] = pd.to_numeric(df['date'].str[:4]).astype(int)


In [None]:
# TODO: revert this back to if/else; the elif is not needed once the tie game is dropped
# LOSING team for each game/row
def compare_and_get_value(row):
    """Return the team code of the side that scored fewer runs.

    `row` must provide 'score_visiting', 'score_home', 'team_visiting' and
    'team_home'. When neither score is strictly lower than the other the
    string 'tie_game' is returned.
    """
    visiting_runs, home_runs = row['score_visiting'], row['score_home']
    if visiting_runs < home_runs:
        return row['team_visiting']
    if home_runs < visiting_runs:
        return row['team_home']
    return 'tie_game'

# Create the 'losing_team' column by applying compare_and_get_value to every row (axis=1)
df['losing_team'] = df.apply(compare_and_get_value, axis=1)

# There is one game that ended in a tie, CNC and PIT Sept 29, 2016

In [None]:
# TODO: revert this back to if/else; the elif is not needed once the tie game is dropped
# WINNING team for each game/row
def compare_and_get_value(row):
    """Return the team code of the side that scored more runs.

    `row` must provide 'score_visiting', 'score_home', 'team_visiting' and
    'team_home'. When neither score is strictly higher than the other the
    string 'tie_game' is returned.

    Note: this reuses the name of the losing-team helper defined in an earlier
    cell, so from this cell on the name refers to the winning-team logic.
    """
    away_runs = row['score_visiting']
    home_runs = row['score_home']
    if away_runs > home_runs:
        return row['team_visiting']
    if home_runs > away_runs:
        return row['team_home']
    return 'tie_game'

# Create the 'winning_team' column by applying compare_and_get_value to every row (axis=1)
df['winning_team'] = df.apply(compare_and_get_value, axis=1)

# There is one game that ended in a tie, CNC and PIT Sept 29, 2016

In [None]:
df['winning_team'].value_counts()

In [None]:
def calculate_run_differential(row):
    """Return the margin of victory for one game row.

    The result is the home score minus the visiting score when 'winning_team'
    equals 'team_home', and the visiting score minus the home score in every
    other case (including rows whose 'winning_team' is neither team).
    """
    home_won = row['winning_team'] == row['team_home']
    if home_won:
        winner_runs, loser_runs = row['score_home'], row['score_visiting']
    else:
        winner_runs, loser_runs = row['score_visiting'], row['score_home']
    return winner_runs - loser_runs

# Apply the function row by row (axis=1) to create the new column 'run_differential'
df['run_differential'] = df.apply(calculate_run_differential, axis=1)

In [None]:
df['run_differential'].value_counts()

In [None]:
# Filter the DataFrame to include only rows where 'run_differential' is 0, i.e., a tie game
observation = df[df['run_differential'] == 0]

# Display the entire row(s) whose run differential is 0
observation

# There is one game that ended in a tie, CNC and PIT Sept 29, 2016

In [None]:
# Total runs per game: home score plus visiting score
df['total_runs'] = df['score_home'] + df['score_visiting']
# Frequency of each total-runs value across games
df['total_runs'].value_counts()


In [None]:
# I don't think this works/makes sense. It is only reporting one team per row still - 
# Combine 'team_home' and 'team_visiting' columns into a single column
#df['teams'] = df['team_home'].append(df['team_visiting'], ignore_index=True)

#df['teams'].value_counts()

In [None]:
# Total strikeouts per game: sum of the 'strikeout_home' and 'strikeout_visiting' columns
df['strikeouts'] = df['strikeout_home'] + df['strikeout_visiting']

# Frequency of each strikeout total across games
df['strikeouts'].value_counts()

In [None]:
df.head()

In [None]:
# Breaking up the season into three groups; do factors of success differ depending on where in the season you are
# 1 is beginning, 2 is middle, and 3 is end of season
# There are missing data for tertiles (grouped and home and visiting) - need to look at this
# This won't work for 2020!

#total_games = 162

# Calculate tertile boundaries
#tertile_boundaries = [0, total_games // 3, (total_games // 3) * 2, total_games]


# Function to assign tertile
#def assign_segment(game_num, boundaries):
#    for i in range(len(boundaries) - 1):
#        if boundaries[i] < game_num <= boundaries[i + 1]:
#            return i + 1
#    return None

# Assign tertile for home games
#df['tertile_home'] = df['game_num_home'].apply(lambda x: assign_segment(x, tertile_boundaries))

# Assign tertile for visiting games
#df['tertile_visiting'] = df['game_num_home'].apply(lambda x: assign_segment(x, tertile_boundaries))


In [None]:
#df['tertile'] = df['tertile_home'].append(df['tertile_visiting'], ignore_index=True)

In [None]:
#df['tertile'].value_counts()

In [None]:
df.tail()

#### Crosstabs, Groupings, and Value Counts

In [None]:
# Give every game row a string ID of the form '<running number>_w' (numbering starts at 1)
df['ID'] = [f"{game_no}_w" for game_no in range(1, len(df) + 1)]

# Second copy of every row, identical except that the ID suffix is '_l' instead of '_w'
duplicates = df.assign(ID=df['ID'].str.replace('_w', '_l', regex=False))

# Stack the '_w' rows on top of the '_l' rows and renumber the index
result = pd.concat([df, duplicates], ignore_index=True)


In [None]:
# Binary target derived from the ID suffix: 1 for '_w' rows, 0 for everything else.
# NOTE(review): each '_l' row is built as a copy of its '_w' row with only the ID
# changed, so the two labels currently carry identical feature values — confirm
# that per-team features get assigned before this target is used for modeling.
result['win'] = result['ID'].str.endswith('_w').astype('int64')

In [None]:
result.info()

In [None]:
# Sort by the string ID. The IDs are strings, so the order is lexicographic
# (e.g. '10_l' sorts before '2_l'), not numeric.
result_sorted = result.sort_values(by='ID')
result_sorted.head(6)

## NN - test/template

In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from torchvision import transforms
import numpy as np

In [2]:
# Seed both NumPy and PyTorch: NumPy drives the synthetic data, while PyTorch
# drives the DataLoader shuffle order and any weights initialised afterwards.
# Without the torch seed every run trains on a different batch order.
SEED = 42
np.random.seed(SEED)
torch.manual_seed(SEED)

# Generate synthetic data
X = np.random.rand(100, 10)  # 100 samples, 10 features
y = (X.sum(axis=1) > 5).astype(int)  # Binary classification: 1 when the feature sum exceeds 5

# Convert numpy arrays to PyTorch tensors (float32 inputs, int64 class labels)
X_tensor = torch.tensor(X, dtype=torch.float32)
y_tensor = torch.tensor(y, dtype=torch.long)

# Create a TensorDataset and DataLoader (mini-batches of 10, reshuffled every epoch)
dataset = TensorDataset(X_tensor, y_tensor)
dataloader = DataLoader(dataset, batch_size=10, shuffle=True)


In [3]:
class SimpleNN(nn.Module):
    """Feed-forward classifier with a single ReLU hidden layer.

    Args:
        in_features: number of input features per sample (default 10).
        hidden_units: width of the hidden layer (default 50).
        num_classes: number of output logits (default 2).

    The defaults reproduce the original fixed 10 -> 50 -> 2 architecture, so
    `SimpleNN()` behaves as before.
    """

    def __init__(self, in_features=10, hidden_units=50, num_classes=2):
        super().__init__()
        self.fc1 = nn.Linear(in_features, hidden_units)   # input -> hidden
        self.fc2 = nn.Linear(hidden_units, num_classes)   # hidden -> class logits

    def forward(self, x):
        """Return raw (unnormalised) class logits for a batch `x`."""
        x = torch.relu(self.fc1(x))
        # No softmax here: nn.CrossEntropyLoss expects raw logits
        x = self.fc2(x)
        return x

# Initialize the model
model = SimpleNN()


In [4]:
# Loss function: cross-entropy over the class logits
criterion = nn.CrossEntropyLoss()
# Optimiser: Adam over all model parameters with a learning rate of 0.001
optimizer = optim.Adam(model.parameters(), lr=1e-3)


In [5]:
num_epochs = 20

for epoch in range(num_epochs):
    model.train()
    batch_losses = []  # one scalar loss per mini-batch of this epoch
    for inputs, labels in dataloader:
        # Clear gradients left over from the previous step
        optimizer.zero_grad()

        # Forward pass and loss
        loss = criterion(model(inputs), labels)

        # Backward pass and parameter update
        loss.backward()
        optimizer.step()

        batch_losses.append(loss.item())
    # Mean mini-batch loss for the epoch
    print(f"Epoch {epoch+1}, Loss: {sum(batch_losses)/len(dataloader)}")


Epoch 1, Loss: 0.7029610812664032
Epoch 2, Loss: 0.6922645628452301
Epoch 3, Loss: 0.6868016362190247
Epoch 4, Loss: 0.6812235176563263
Epoch 5, Loss: 0.6764574229717255
Epoch 6, Loss: 0.6726328253746032
Epoch 7, Loss: 0.6686449408531189
Epoch 8, Loss: 0.6623287498950958
Epoch 9, Loss: 0.6570148348808289
Epoch 10, Loss: 0.6515888392925262
Epoch 11, Loss: 0.6465299844741821
Epoch 12, Loss: 0.6416837990283966
Epoch 13, Loss: 0.6361145555973053
Epoch 14, Loss: 0.6312559425830842
Epoch 15, Loss: 0.6251914799213409
Epoch 16, Loss: 0.6213668823242188
Epoch 17, Loss: 0.6161412835121155
Epoch 18, Loss: 0.6077877402305603
Epoch 19, Loss: 0.6008476138114929
Epoch 20, Loss: 0.5950348436832428


In [6]:
# Switch to evaluation mode and count correct predictions.
# NOTE: this iterates over the same dataloader that was used for training, so
# the figure printed below is training accuracy, not held-out accuracy.
model.eval()
correct = 0
total = 0

with torch.no_grad():  # no gradients needed for inference
    for inputs, labels in dataloader:
        # Predicted class = index of the largest logit in each row
        predicted = torch.max(model(inputs), 1).indices
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

accuracy = correct / total
print(f"Accuracy: {accuracy * 100}%")


Accuracy: 81.0%


## NN First Try

In [None]:
# Seed both NumPy and PyTorch: NumPy drives the synthetic data, while PyTorch
# drives the DataLoader shuffle order and any weights initialised afterwards.
# Without the torch seed every run trains on a different batch order.
SEED = 42
np.random.seed(SEED)
torch.manual_seed(SEED)

# Generate synthetic data
X = np.random.rand(100, 10)  # 100 samples, 10 features
y = (X.sum(axis=1) > 5).astype(int)  # Binary classification: 1 when the feature sum exceeds 5

# Convert numpy arrays to PyTorch tensors (float32 inputs, int64 class labels)
X_tensor = torch.tensor(X, dtype=torch.float32)
y_tensor = torch.tensor(y, dtype=torch.long)

# Create a TensorDataset and DataLoader (mini-batches of 10, reshuffled every epoch)
dataset = TensorDataset(X_tensor, y_tensor)
dataloader = DataLoader(dataset, batch_size=10, shuffle=True)


In [None]:
class SimpleNN(nn.Module):
    """Two-layer fully connected classifier: 10 inputs -> 50 hidden units (ReLU) -> 2 logits."""

    def __init__(self):
        super().__init__()
        self.fc1 = nn.Linear(10, 50)  # input features -> hidden layer
        self.fc2 = nn.Linear(50, 2)   # hidden layer -> class logits

    def forward(self, x):
        """Return the raw class logits for a batch `x`."""
        hidden = torch.relu(self.fc1(x))
        return self.fc2(hidden)

# Initialize the model
model = SimpleNN()

In [None]:
# Loss function: cross-entropy over the class logits
criterion = nn.CrossEntropyLoss()
# Optimiser: Adam over all model parameters with a learning rate of 0.001
optimizer = optim.Adam(model.parameters(), lr=1e-3)

In [None]:
num_epochs = 20

for epoch in range(num_epochs):
    model.train()
    batch_losses = []  # one scalar loss per mini-batch of this epoch
    for inputs, labels in dataloader:
        # Clear gradients left over from the previous step
        optimizer.zero_grad()

        # Forward pass and loss
        loss = criterion(model(inputs), labels)

        # Backward pass and parameter update
        loss.backward()
        optimizer.step()

        batch_losses.append(loss.item())
    # Mean mini-batch loss for the epoch
    print(f"Epoch {epoch+1}, Loss: {sum(batch_losses)/len(dataloader)}")

In [None]:
# Switch to evaluation mode and count correct predictions.
# NOTE: this iterates over the same dataloader that was used for training, so
# the figure printed below is training accuracy, not held-out accuracy.
model.eval()
correct = 0
total = 0

with torch.no_grad():  # no gradients needed for inference
    for inputs, labels in dataloader:
        # Predicted class = index of the largest logit in each row
        predicted = torch.max(model(inputs), 1).indices
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

accuracy = correct / total
print(f"Accuracy: {accuracy * 100}%")