In [18]:
import os
import numpy as np
import pandas as pd
from datetime import datetime

## Examples to test Datetime comparisons

- Used to look up game IDs for specific teams on specific dates

### Open Combined ID Files

In [19]:
# paths to the combined IDs files, one per season (2016, 2017, 2018)
path_2016 = 'data/id_files/combined_ids_2016.csv'
path_2017 = 'data/id_files/combined_ids_2017.csv'
path_2018 = 'data/id_files/combined_ids_2018.csv'

# load each season's combined IDs file into its own DataFrame
combined_ids_2016 = pd.read_csv(path_2016)
combined_ids_2017 = pd.read_csv(path_2017)
combined_ids_2018 = pd.read_csv(path_2018)

### Open Team IDs File and Game IDs File to Associate with Pitchers' Games

In [20]:
# paths to the team IDs file and to the game IDs file of the season being processed
team_id_path = 'data/id_files/team_ids.csv'
# exactly one game_id_path assignment should be active; the commented-out
# lines are the alternatives for the other seasons
#game_id_path = 'data/id_files/game_ids_2016.csv'
#game_id_path = 'data/id_files/game_ids_2017.csv'
game_id_path = 'data/id_files/game_ids_2018.csv'

In [21]:
# load the team IDs file into a DataFrame
df_team_ids = pd.read_csv(team_id_path)
# trailing expression: displays the column names as the cell output
df_team_ids.columns

Index(['name', 'market', 'abbr', 'id'], dtype='object')

In [22]:
# load the game IDs file selected by game_id_path into a DataFrame
df_game_ids = pd.read_csv(game_id_path)
# trailing expression: displays the column names as the cell output
df_game_ids.columns

Index(['id', 'away_team', 'home_team', 'scheduled', 'rescheduled'], dtype='object')

### Format Dates for Pitcher Log(s) to Compare to Game ID Dates

In [23]:
def get_pitcher_dates(pitcher_frame, team_frame, game_frame, yr):
    """Match every pitcher-log row to the game ID(s) played on that date.

    Parameters
    ----------
    pitcher_frame : pandas.DataFrame
        Pitcher game log with at least the columns 'Date' (e.g. 'Apr 2',
        month abbreviation and day, no year), 'Tm' and 'Opp' (team
        abbreviations). Rows whose 'Date' is not a string are skipped.
    team_frame : pandas.DataFrame
        Team table with the columns 'abbr' and 'id'; used to translate the
        abbreviations found in the pitcher log into team IDs.
    game_frame : pandas.DataFrame
        Game table with the columns 'id', 'scheduled', 'rescheduled',
        'home_team' and 'away_team'. Dates are compared by plain equality
        with a 'YYYY-MM-DD' string, so 'scheduled' / 'rescheduled' only
        match when they hold strings in exactly that format.
    yr : int
        Season year to attach to the year-less pitcher log dates.

    Returns
    -------
    pandas.DataFrame
        One row per (pitcher log row, matching game) with the columns
        'game_id', 'date', 'team', 'team_id', 'opp', 'opp_id'. Rows appear
        in pitcher-log order, and within one log row in game_frame order.

    Raises
    ------
    KeyError
        If a 'Tm' or 'Opp' abbreviation is missing from team_frame.
    ValueError
        If a string 'Date' cannot be parsed as '<Mon> <day>' for year `yr`.
    """
    cols = ['game_id', 'date', 'team', 'team_id', 'opp', 'opp_id']

    # abbreviation -> team ID lookup (a later duplicate abbreviation wins)
    team_dict = dict(zip(team_frame['abbr'], team_frame['id']))

    # Materialise the game rows once instead of re-running iterrows() on
    # game_frame for every pitcher row.
    games = list(zip(
        game_frame['id'].tolist(),
        game_frame['scheduled'].tolist(),
        game_frame['rescheduled'].tolist(),
        game_frame['home_team'].tolist(),
        game_frame['away_team'].tolist(),
    ))

    combined_list = []
    pitcher_rows = zip(pitcher_frame['Date'], pitcher_frame['Tm'], pitcher_frame['Opp'])
    for raw_date, team_name, opp_name in pitcher_rows:
        # Non-string dates (e.g. NaN on summary/blank rows) carry no game.
        if not isinstance(raw_date, str):
            continue

        # Parse the year together with the month/day: parsing '%b %d' alone
        # uses the default year 1900, which rejects 'Feb 29' even when `yr`
        # is a leap year.
        parsed = datetime.strptime('{} {}'.format(raw_date, yr), '%b %d %Y')
        dt = parsed.strftime('%Y-%m-%d')

        team_id = team_dict[team_name]
        opp_id = team_dict[opp_name]

        for game_id, s_dt, r_dt, home_id, away_id in games:
            on_date = dt == s_dt or dt == r_dt
            team_played = team_id == home_id or team_id == away_id
            opp_played = opp_id == home_id or opp_id == away_id
            if on_date and team_played and opp_played:
                combined_list.append([game_id, dt, team_name, team_id, opp_name, opp_id])

    return pd.DataFrame(combined_list, columns=cols)

### Open Pitcher Log Files to Get Specific Game IDs

In [24]:
# Gather every non-directory entry of the pitcher log folder whose name ends
# with the season suffix; entries keep the order os.listdir() returns them in.
basepath = 'data/pitcher_logs/'
end_str = '_2018.csv'
paths = [
    candidate
    for candidate in (os.path.join(basepath, entry) for entry in os.listdir(basepath))
    if candidate.endswith(end_str) and not os.path.isdir(candidate)
]

# trailing expression: displays the collected paths as the cell output
paths

['data/pitcher_logs/jose_quintana_2018.csv',
 'data/pitcher_logs/jake_arrieta_2018.csv',
 'data/pitcher_logs/david_price_2018.csv',
 'data/pitcher_logs/justin_verlander_2018.csv',
 'data/pitcher_logs/gerrit_cole_2018.csv',
 'data/pitcher_logs/michael_fulmer_2018.csv',
 'data/pitcher_logs/max_scherzer_2018.csv',
 'data/pitcher_logs/chris_archer_2018.csv',
 'data/pitcher_logs/yu_darvish_2018.csv',
 'data/pitcher_logs/dallas_keuchel_2018.csv',
 'data/pitcher_logs/chris_sale_2018.csv',
 'data/pitcher_logs/aaron_nola_2018.csv',
 'data/pitcher_logs/zack_greinke_2018.csv',
 'data/pitcher_logs/stephen_strasburg_2018.csv',
 'data/pitcher_logs/carlos_carrasco_2018.csv',
 'data/pitcher_logs/corey_kluber_2018.csv',
 'data/pitcher_logs/jacob_degrom_2018.csv',
 'data/pitcher_logs/marcus_stroman_2018.csv',
 'data/pitcher_logs/clayton_kershaw_2018.csv',
 'data/pitcher_logs/carlos_martinez_2018.csv']

In [25]:
# Season attached to the year-less dates in the pitcher logs; it has to agree
# with the season of the log files listed in `paths`.
#yr = 2016
#yr = 2017
yr = 2018
for path in paths:
    df = pd.read_csv(path, usecols=['Date', 'Tm', 'Opp', 'Rslt', 'Inngs', 'ERA', 'Pit', 'Entered', 'Exited'])
    df_result = get_pitcher_dates(df, df_team_ids, df_game_ids, yr)

    # Build '<pitcher>_game_ids_<year>.csv' from '<pitcher>_<year>.csv'.
    # Splitting the bare file name on its LAST underscore keeps pitcher names
    # with any number of parts intact (e.g. 'hyun_jin_ryu_2018.csv') and does
    # not depend on how many directories precede the file name.
    log_name = os.path.basename(path)
    pitcher_name, year_suffix = log_name.rsplit('_', 1)
    out_file = "data/id_files/" + pitcher_name + '_game_ids_' + year_suffix
    df_result.to_csv(out_file, index=False)