In [1]:
import os
import numpy as np
import pandas as pd

# Combine Team ID's with Game ID's for Master List

- Open Team ID's File
- Open Game ID's File
- Create Master List (DataFrame) of Game ID's per Team

### Open Team ID's File to be used for the 2016, 2017, and 2018 Game ID's

In [2]:
# path to the team_ids.csv file (relative to the notebook's working directory)
team_ids_path = 'data/id_files/team_ids.csv'

# open team_ids.csv, reading only the 'id', 'name', 'abbr' columns; 'id' becomes
# the index, so 'name' and 'abbr' are the only data columns left in the frame
df_team_ids = pd.read_csv(team_ids_path, usecols=['id','name', 'abbr'], index_col='id')

In [3]:
# confirm there are 32 entries: 30 MLB teams and 2 All-Star teams (AL, NL)
df_team_ids.info()

<class 'pandas.core.frame.DataFrame'>
Index: 32 entries, dd59d49e-caee-4443-9220-f05d0d9bd1e1 to 3bbb3b39-b5cb-4fc9-bd22-522521f0f329
Data columns (total 2 columns):
name    32 non-null object
abbr    32 non-null object
dtypes: object(2)
memory usage: 768.0+ bytes


### Open Game ID's Files

In [4]:
# paths to the game_ids csv files for the 2016, 2017, and 2018 seasons
game_ids_2016_path = 'data/id_files/game_ids_2016.csv'
game_ids_2017_path = 'data/id_files/game_ids_2017.csv'
game_ids_2018_path = 'data/id_files/game_ids_2018.csv'

# open each season's game_ids csv file (indexed by game 'id') to combine with team IDs
df_game_ids_2016 = pd.read_csv(game_ids_2016_path, index_col='id')
df_game_ids_2017 = pd.read_csv(game_ids_2017_path, index_col='id')
df_game_ids_2018 = pd.read_csv(game_ids_2018_path, index_col='id')

In [5]:
# confirm there are 2,430 regular season game id's + 1 all-star game id
# (162 games * 15 team pairs) + 1 all-star game = 2,431
# 26 games have been rescheduled for various reasons
df_game_ids_2016.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2431 entries, b28c65d4-280d-4a63-851e-6d13408e0b49 to c7101abc-c787-4c26-b78b-684ad196cc58
Data columns (total 4 columns):
away_team      2431 non-null object
home_team      2431 non-null object
scheduled      2431 non-null object
rescheduled    26 non-null object
dtypes: object(4)
memory usage: 95.0+ KB


In [6]:
# confirm there are 2,430 regular season game id's + 1 all-star game id
# (162 games * 15 team pairs) + 1 all-star game = 2,431
# 39 games have been rescheduled for various reasons
df_game_ids_2017.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2431 entries, 3191adc6-10f8-49b1-bece-9ae50800f274 to 9e524d09-e82b-42e4-a3ed-6e56e090bffd
Data columns (total 4 columns):
away_team      2431 non-null object
home_team      2431 non-null object
scheduled      2431 non-null object
rescheduled    39 non-null object
dtypes: object(4)
memory usage: 95.0+ KB


In [7]:
# confirm there are 2,430 regular season game id's + 2 regular season tie-breaker
# game id's + 1 all-star game id
# (162 games * 15 team pairs) + (2 regular season tie-breaker games * 1 team pair) + 1 all-star game = 2,433
# 54 games have been rescheduled for various reasons
df_game_ids_2018.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2433 entries, 0e2e9445-819b-41f0-926f-9235eff0fb2b to c99e1fda-6b0f-4c84-b939-758ce202653f
Data columns (total 4 columns):
away_team      2433 non-null object
home_team      2433 non-null object
scheduled      2433 non-null object
rescheduled    54 non-null object
dtypes: object(4)
memory usage: 95.0+ KB


### Function to associate Game ID's to Team ID's

In [8]:
# check_id function to return dataframe of combined team and game id's
def check_id(team_frame, game_frame):
    """
    Create new DataFrame with combined Team and Game ID's.

    Every game contributes one row for its home team and one row for its
    away team, as long as that team's ID is present in the index of
    team_frame. Rows are ordered by team (in team_frame order) and, within
    a team, by game (in game_frame order).

    Keyword arguments:
    team_frame -- pd.DataFrame indexed by Team ID, with 'name' and 'abbr' columns
    game_frame -- pd.DataFrame indexed by Game ID, with 'home_team', 'away_team',
                  'scheduled' and 'rescheduled' columns

    Returns:
    pd.DataFrame with columns 'team_id', 'team_name', 'team_abbr', 'game_id',
    'at_away', 'at_home', 'scheduled', 'rescheduled'. 'at_away' / 'at_home'
    are 0/1 flags telling which side the team played on in that game.
    """
    # list of column names for new dataframe
    columns = ['team_id', 'team_name', 'team_abbr', 'game_id',
               'at_away', 'at_home', 'scheduled', 'rescheduled']

    # Single pass over the games: bucket the game-specific part of each output
    # row under the team id it belongs to. This replaces re-scanning every game
    # for every team (teams * games iterations) with teams + games iterations.
    games_by_team = {}
    game_fields = zip(game_frame.index, game_frame['home_team'], game_frame['away_team'],
                      game_frame['scheduled'], game_frame['rescheduled'])
    for game_id, home_id, away_id, scheduled, rescheduled in game_fields:
        games_by_team.setdefault(home_id, []).append([game_id, 0, 1, scheduled, rescheduled])
        # the home side takes precedence: a game whose home and away ids are
        # identical yields only the home row for that team
        if away_id != home_id:
            games_by_team.setdefault(away_id, []).append([game_id, 1, 0, scheduled, rescheduled])

    # Walk the teams in their original order and prepend the team columns to
    # each of that team's games (already stored in game order).
    rows = []
    for team_id, team_name, team_abbr in zip(team_frame.index, team_frame['name'], team_frame['abbr']):
        for game_part in games_by_team.get(team_id, ()):
            rows.append([team_id, team_name, team_abbr] + game_part)

    # create and return new DataFrame with combined team and game id's
    return pd.DataFrame(rows, columns=columns)

# test check_id function        
#result = check_id(df_team_ids, df_game_ids_2016)

In [9]:
# build one combined team/game id frame per season (2016, 2017, 2018)
df_combined_ids_2016 = check_id(df_team_ids, df_game_ids_2016)
df_combined_ids_2017 = check_id(df_team_ids, df_game_ids_2017)
df_combined_ids_2018 = check_id(df_team_ids, df_game_ids_2018)

## Save Combined Team and Game ID's to File

In [10]:
# write each season's combined frame to csv; index=False leaves the row index
# out of the file so only the eight data columns are saved
df_combined_ids_2016.to_csv('data/id_files/combined_ids_2016.csv', index=False)
df_combined_ids_2017.to_csv('data/id_files/combined_ids_2017.csv', index=False)
df_combined_ids_2018.to_csv('data/id_files/combined_ids_2018.csv', index=False)

In [11]:
# paths to the combined_ids csv files for the 2016, 2017, and 2018 seasons
path_2016 = 'data/id_files/combined_ids_2016.csv'
path_2017 = 'data/id_files/combined_ids_2017.csv'
path_2018 = 'data/id_files/combined_ids_2018.csv'

# read the saved combined_ids csv files back in so their contents can be inspected
combined_ids_2016 = pd.read_csv(path_2016)
combined_ids_2017 = pd.read_csv(path_2017)
combined_ids_2018 = pd.read_csv(path_2018)

In [12]:
# each game appears twice (once for the home team, once for the away team):
# 2 * 2,431 games = 4,862 rows and 2 * 26 = 52 rescheduled entries
combined_ids_2016.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4862 entries, 0 to 4861
Data columns (total 8 columns):
team_id        4862 non-null object
team_name      4862 non-null object
team_abbr      4862 non-null object
game_id        4862 non-null object
at_away        4862 non-null int64
at_home        4862 non-null int64
scheduled      4862 non-null object
rescheduled    52 non-null object
dtypes: int64(2), object(6)
memory usage: 304.0+ KB


In [13]:
# each game appears twice (once for the home team, once for the away team):
# 2 * 2,431 games = 4,862 rows and 2 * 39 = 78 rescheduled entries
combined_ids_2017.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4862 entries, 0 to 4861
Data columns (total 8 columns):
team_id        4862 non-null object
team_name      4862 non-null object
team_abbr      4862 non-null object
game_id        4862 non-null object
at_away        4862 non-null int64
at_home        4862 non-null int64
scheduled      4862 non-null object
rescheduled    78 non-null object
dtypes: int64(2), object(6)
memory usage: 304.0+ KB


In [14]:
# each game appears twice (once for the home team, once for the away team):
# 2 * 2,433 games = 4,866 rows and 2 * 54 = 108 rescheduled entries
combined_ids_2018.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4866 entries, 0 to 4865
Data columns (total 8 columns):
team_id        4866 non-null object
team_name      4866 non-null object
team_abbr      4866 non-null object
game_id        4866 non-null object
at_away        4866 non-null int64
at_home        4866 non-null int64
scheduled      4866 non-null object
rescheduled    108 non-null object
dtypes: int64(2), object(6)
memory usage: 304.2+ KB
