# Preprocess data

In [3]:
import pandas as pd
import ipywidgets as widgets
import numpy as np
import json
from tqdm.auto import tqdm
import gc
import os

# Data directories, relative to the notebook location. Each path keeps its
# trailing '/' because later cells build file names by plain string concatenation.
parentpath='../data/'
# Raw CSV inputs read at the top of the preprocessing cell.
rawpath=parentpath+'raw_data/'
# Derived artifacts (pickles, the rivals spreadsheet) are read from / written here.
processedpath=parentpath+'processed_data/'

In [21]:
# Load-or-rebuild guard for the final feature table.
# NOTE: the trailing `and False` intentionally disables the cached branch, so the
# full rebuild in the `else` branch always runs. The next cell pickles
# intermediate frames that only exist after a rebuild, so enabling the cache
# without touching that cell would break it.
if 'player_engagement_with_info.pkl' in os.listdir(processedpath) and False:
  player_engagement_with_info=pd.read_pickle(processedpath+'player_engagement_with_info.pkl')
  display(player_engagement_with_info)
else:
  # Raw CSV inputs. Explicit assignments (rather than injecting names through
  # globals()) make it obvious which variables this cell creates.
  awards = pd.read_csv(os.path.join(rawpath, 'awards') + ".csv")
  players = pd.read_csv(os.path.join(rawpath, 'players') + ".csv")
  seasons = pd.read_csv(os.path.join(rawpath, 'seasons') + ".csv")
  teams = pd.read_csv(os.path.join(rawpath, 'teams') + ".csv")
  train = pd.read_csv(os.path.join(rawpath, 'train') + ".csv")

  # Rivalry lookup: the spreadsheet has one row per team with one column per rival
  # slot. Melt it to one row per (team, rival) pair; the slot name becomes a
  # score (3 for 'first_rival', 2 for any other slot).
  team_rivals = pd.read_excel(processedpath+'teams_rivals.xlsx')
  team_rivals = pd.melt(
    team_rivals,
    id_vars='team',
    var_name='rivalry_score',
    value_name='away_team'
    )
  team_rivals['rivalry_score'] = team_rivals.rivalry_score.apply(
    lambda x: 3 if x=='first_rival' else 2)

  # Attach numeric team ids for both sides of the pair (-> id_home / id_away)
  team_ids = teams[['id','teamName']]
  team_rivals = (team_rivals
    .merge(team_ids, left_on='team', right_on='teamName')
    .drop('teamName', axis=1)
    .merge(team_ids, left_on='away_team', right_on='teamName',
           suffixes=('_home','_away'))
    .drop('teamName', axis=1)
    )

  # Parse every column after the first one in `seasons` as a datetime.
  # Assigning whole columns (instead of writing through .iloc) guarantees the
  # columns end up with a datetime dtype; an .iloc assignment may write the
  # values in place and leave the columns as `object`.
  season_date_cols = seasons.columns[1:]
  seasons[season_date_cols] = seasons[season_date_cols].apply(pd.to_datetime)

  train['date'] = pd.to_datetime(train['date'], format="%Y%m%d")
  awards['dailyDataDate']=pd.to_datetime(awards['awardDate'])
  # Keep the awards read from awards.csv under a separate name so they stay
  # available if `awards` is reassigned later in this cell.
  awards_additional=awards[['dailyDataDate','playerId','awardId']]
  
  ## not using total awards
  # total_awards=awards_additional.groupby(['playerId','awardId']).size().reset_index().pivot_table(index='playerId',columns='awardId').reset_index().fillna(0)
  # total_awards.columns=['award_'+i for i in total_awards.columns.droplevel(0)]
  # total_awards.rename(columns={'award_':'playerId'},inplace=True)

  # Helper function to unpack json found in daily data
  def unpack_json(json_str):
      """Parse one JSON payload from the daily data into a DataFrame.

      Parameters
      ----------
      json_str : str or NaN
          JSON text describing a table, or a missing value.

      Returns
      -------
      pandas.DataFrame or float
          The parsed table, or ``np.nan`` when ``json_str`` is missing.
      """
      # Local import keeps this helper self-contained within the cell.
      from io import StringIO

      if pd.isna(json_str):
          return np.nan
      # Wrap the text in a file-like object: passing a literal JSON string
      # straight to read_json is deprecated in recent pandas releases.
      return pd.read_json(StringIO(json_str))

  #### Unnest various nested data within training (daily) data ####
  # Registry with one row per nested column of `train` (everything except 'date')
  daily_data_unnested_dfs = pd.DataFrame(data = {
    'dfName': train.drop('date', axis = 1).columns.values.tolist()
    })

  # Unnested tables, collected in the same order as the registry rows
  unnested_tables = []

  for nestedTableName in daily_data_unnested_dfs['dfName'].astype(str):
      # Keep only the dates that actually carry a payload for this table
      date_nested_table = train[['date', nestedTableName]]
      date_nested_table = (
        date_nested_table[date_nested_table[nestedTableName].notna()]
        .reset_index(drop = True)
        )

      # Parse each day's JSON and stamp it with that day's date
      daily_dfs_collection = []
      for day, payload in zip(date_nested_table['date'],
                              date_nested_table[nestedTableName]):
          daily_df = unpack_json(payload)
          daily_df['dailyDataDate'] = day
          # append() instead of `list + [item]`, which copies the list every time
          daily_dfs_collection.append(daily_df)

      # set_index + reset_index moves 'dailyDataDate' to the first column
      unnested_table = pd.concat(daily_dfs_collection,
        ignore_index = True).set_index('dailyDataDate').reset_index()

      # Creates 1 pandas df per unnested df from daily data read in, with same name
      globals()[nestedTableName] = unnested_table

      unnested_tables.append(unnested_table)

  # Assign the whole column at once. The previous per-row
  # `frame['df'][index] = value` is chained assignment, which pandas warns about
  # and which does not update the frame when Copy-on-Write is enabled.
  daily_data_unnested_dfs['df'] = unnested_tables

  # The raw nested frame is large and no longer needed once unnested
  del train
  gc.collect()

  #### Get some information on each date in daily data (using season dates of interest) ####
  dates = pd.DataFrame(data = 
    {'dailyDataDate': nextDayPlayerEngagement['dailyDataDate'].unique()})

  dates['date'] = pd.to_datetime(dates['dailyDataDate'].astype(str))

  dates['year'] = dates['date'].dt.year
  dates['month'] = dates['date'].dt.month

  # Attach the season whose id equals the calendar year of each date
  # (default inner join: dates without a matching season row are dropped)
  dates_with_info = pd.merge(
    dates,
    seasons,
    left_on = 'year',
    right_on = 'seasonId'
    )

  # True from regular-season start through postseason end, both ends included.
  # `inclusive` takes a string in current pandas; the boolean form was removed.
  dates_with_info['inSeason'] = (
    dates_with_info['date'].between(
      dates_with_info['regularSeasonStartDate'],
      dates_with_info['postSeasonEndDate'],
      inclusive = 'both'
      )
    )

  # np.select picks the label of the FIRST condition that is true, so the
  # conditions are ordered chronologically through the season.
  dates_with_info['seasonPart'] = np.select(
    [
      dates_with_info['date'] < dates_with_info['preSeasonStartDate'], 
      dates_with_info['date'] < dates_with_info['regularSeasonStartDate'],
      dates_with_info['date'] <= dates_with_info['lastDate1stHalf'],
      dates_with_info['date'] < dates_with_info['firstDate2ndHalf'],
      dates_with_info['date'] <= dates_with_info['regularSeasonEndDate'],
      dates_with_info['date'] < dates_with_info['postSeasonStartDate'],
      dates_with_info['date'] <= dates_with_info['postSeasonEndDate'],
      dates_with_info['date'] > dates_with_info['postSeasonEndDate']
    ], 
    [
      'Offseason',
      'Preseason',
      'Reg Season 1st Half',
      'All-Star Break',
      'Reg Season 2nd Half',
      'Between Reg and Postseason',
      'Postseason',
      'Offseason'
    ], 
    # A float NaN default cannot share a dtype with string labels: depending on
    # the NumPy version it becomes the literal text 'nan' or raises. None keeps
    # the result as an object array with a genuine missing value.
    default = None
    )

  # Whole days elapsed since each season milestone (negative before the milestone).
  # `.dt.days` replaces `.astype('timedelta64[D]')`, which current pandas rejects;
  # the float cast keeps the column dtype that the old conversion produced.
  # pd.to_datetime is a no-op on datetime columns and guards against milestone
  # columns that were left as `object`.
  milestone_by_feature = {
    'days_since_preseason_start': 'preSeasonStartDate',
    'days_since_preseason_end': 'preSeasonEndDate',
    'days_since_postseason_start': 'postSeasonStartDate',
    'days_since_postseason_end': 'postSeasonEndDate',
    'days_since_regular_season_start_1st': 'regularSeasonStartDate',
    'days_since_regular_season_end_2nd': 'regularSeasonEndDate',
    'days_since_regular_season_end_1st': 'lastDate1stHalf',
    'days_since_regular_season_start_2nd': 'firstDate2ndHalf',
    'days_since_allStar': 'allStarDate',
    }
  for feature_name, milestone_col in milestone_by_feature.items():
    elapsed = dates_with_info['date'] - pd.to_datetime(dates_with_info[milestone_col])
    dates_with_info[feature_name] = elapsed.dt.days.astype(float)

  #### Add some pitching stats/pieces of info to player game level stats ####

  # Rename team id/name so it is clear they come from the box score (the team
  # the player appeared for in that game), not from the roster table
  player_game_stats = playerBoxScores.copy().rename(
    columns = {'teamId': 'gameTeamId', 'teamName': 'gameTeamName'})

  # Innings pitched as a true fraction (better for aggregation): the decimal
  # digit is scaled by 10/3, so e.g. x.1 becomes x + 1/3. Missing values stay
  # missing because NaN propagates through the arithmetic.
  innings = player_game_stats['inningsPitched']
  full_innings = np.floor(innings)
  player_game_stats['inningsPitchedAsFrac'] = (
    full_innings + (innings - full_innings) * 10/3
    )

  # Tom Tango pitching game score (https://www.mlb.com/glossary/advanced-stats/game-score)
  # The "+ 2 * outs" term of the formula is intentionally left out here
  # (it was commented out in the original computation).
  game_score_weights = [
    ('strikeOutsPitching', 1),
    ('baseOnBallsPitching', -2),
    ('hitsPitching', -2),
    ('runsPitching', -3),
    ('homeRunsPitching', -6),
    ]
  game_score = 40
  for stat_name, weight in game_score_weights:
    game_score = game_score + weight * player_game_stats[stat_name]
  player_game_stats['pitchingGameScore'] = game_score

  # Individual no-hitter flag: started the game, went at least 9 innings,
  # allowed no hits (combined no-hitters by several pitchers are not flagged)
  threw_no_hitter = (
    (player_game_stats['gamesStartedPitching'] == 1) &
    (player_game_stats['inningsPitched'] >= 9) &
    (player_game_stats['hitsPitching'] == 0)
    )
  player_game_stats['noHitter'] = np.where(threw_no_hitter, 1, 0)

  # Stats that can simply be summed at the date/player level
  summable_stats = [
    'runsScored', 'homeRuns', 'strikeOuts', 'baseOnBalls', 'hits',
    'hitByPitch', 'atBats', 'caughtStealing', 'stolenBases',
    'groundIntoDoublePlay', 'groundIntoTriplePlay', 'plateAppearances',
    'totalBases', 'rbi', 'leftOnBase', 'sacBunts', 'sacFlies',
    'gamesStartedPitching', 'runsPitching', 'homeRunsPitching',
    'strikeOutsPitching', 'baseOnBallsPitching', 'hitsPitching',
    'inningsPitchedAsFrac', 'earnedRuns',
    'battersFaced','saves', 'blownSaves', 'pitchingGameScore',
    'noHitter'
    ]

  player_date_keys = ['dailyDataDate', 'playerId']

  # Aggregations that are not simple sums
  player_date_counts = (player_game_stats.
    groupby(player_date_keys, as_index = False).
    agg(
      numGames = ('gamePk', 'nunique'),
      # Should be 1 team per player per day, but adding here for 1 exception:
      # playerId 518617 (Jake Diekman) had 2 games for different teams marked
      # as played on 5/19/19, due to resumption of game after he was traded
      numTeams = ('gameTeamId', 'nunique'),
      # Should be only 1 team for almost all player-dates, taking min to simplify
      gameTeamId = ('gameTeamId', 'min')
      )
    )

  player_date_sums = (player_game_stats.
    groupby(player_date_keys, as_index = False)
    [summable_stats].
    sum()
    )

  # One row per player per date: counts first, then the summed stats
  player_date_stats_agg = pd.merge(
    player_date_counts,
    player_date_sums,
    on = player_date_keys,
    how = 'inner'
    )

  #### Turn games table into 1 row per team-game, then merge with team box scores ####
  # Filter to regular or Postseason games w/ valid scores for this part
  counted_game_types = ['R', 'F', 'D', 'L', 'W', 'C', 'P']
  has_both_scores = ~pd.isna(games['homeScore']) & ~pd.isna(games['awayScore'])
  games_for_stats = games[
    np.isin(games['gameType'], counted_game_types) & has_both_scores
    ]

  def _relabel_columns(frame, first_swap, second_swap):
      """Copy `frame`, applying two substring replacements (in order) to every column name."""
      relabelled = frame.copy()
      relabelled.columns = [
        col_value.replace(*first_swap).replace(*second_swap)
        for col_value in relabelled.columns.values]
      return relabelled

  # Home team perspective: "home" columns become "team", "away" become "opp"
  games_home_perspective = _relabel_columns(
    games_for_stats, ('home', 'team'), ('away', 'opp'))
  games_home_perspective['isHomeTeam'] = 1

  # Away team perspective: "home" columns become "opp", "away" become "team"
  games_away_perspective = _relabel_columns(
    games_for_stats, ('home', 'opp'), ('away', 'team'))
  games_away_perspective['isHomeTeam'] = 0

  # Stack both perspectives to get a df w/ 1 row per team game
  team_games = pd.concat(
    [games_home_perspective, games_away_perspective],
    ignore_index = True
    )

  # Look up the rivalry score of (team, opponent); the helper id columns used
  # for the join are dropped again afterwards
  rival_scores = team_rivals.drop(['team','away_team'],axis=1)
  team_games = team_games.merge(
    rival_scores,
    left_on=['teamId','oppId'],
    right_on=['id_home','id_away'],
    how='left'
    )
  team_games = team_games.drop(['id_home','id_away'],axis=1)

  # Pairs that are not listed in the rivals table get the baseline score 1
  team_games['rivalry_score'] = team_games['rivalry_score'].fillna(1)
  
  # Team box scores with every stat column suffixed by "Team", which helps to
  # differentiate them from individual player stats if/when joining later.
  # Key/descriptive columns keep their original names.
  unsuffixed_columns = ['dailyDataDate', 'home', 'teamId', 'gamePk',
    'gameDate', 'gameTimeUTC']

  team_game_stats = teamBoxScores.copy()
  team_game_stats.columns = [
    col_value if col_value in unsuffixed_columns else col_value + 'Team'
    for col_value in team_game_stats.columns.values
    ]

  # Fields already present in the team_games table are dropped before the merge
  team_box_for_merge = team_game_stats.drop(
    ['home', 'gameDate', 'gameTimeUTC'], axis = 1)

  # Merge games table with team game stats.
  # Doing this as 'inner' join excludes spring training games, postponed games,
  # etc. from original games table, but this may be fine for purposes here
  team_games_with_stats = team_games.merge(
    team_box_for_merge,
    on = ['dailyDataDate', 'gamePk', 'teamId'],
    how = 'inner'
    )

  # Results per date/team/game type/opponent (rivalry score is carried along
  # as a grouping key so it survives the aggregation)
  team_date_keys = ['dailyDataDate', 'teamId', 'gameType', 'oppId',
    'oppName', 'rivalry_score']

  team_date_stats_agg = (team_games_with_stats.
    groupby(team_date_keys, as_index = False).
    agg(
      numGamesTeam = ('gamePk', 'nunique'),
      winsTeam = ('teamWinner', 'sum'),
      lossesTeam = ('oppWinner', 'sum'),
      runsScoredTeam = ('teamScore', 'sum'),
      runsAllowedTeam = ('oppScore', 'sum')
      )
    )

  # Prepare standings table for merge w/ player digital engagement data.
  # Only a few standings fields are kept; each is suffixed with "Team" (and
  # 'pct' is renamed to 'winPct' first) to differentiate them from
  # player-related fields if/when joining later.
  standings_keys = ['dailyDataDate', 'teamId']
  standings_fields = ['streakCode', 'divisionRank', 'leagueRank',
    'wildCardRank', 'pct']
  team_field_names = {
    field: ('winPct' if field == 'pct' else field) + 'Team'
    for field in standings_fields
    }

  standings_selected_fields = (
    standings[standings_keys + standings_fields].
    rename(columns = team_field_names)
    )

  # Streak length: the streak code with its 'W' / 'L' letters stripped
  streak_code = standings_selected_fields['streakCodeTeam']
  streak_length = (streak_code.
    str.replace('W', '').
    str.replace('L', '').
    astype(float)
    )
  standings_selected_fields['streakLengthTeam'] = streak_length

  # Separate winning and losing streaks, keyed on the first letter of the
  # code; the field that does not apply is left missing
  standings_selected_fields['winStreakTeam'] = np.where(
    streak_code.str[0] == 'W', streak_length, np.nan)
  standings_selected_fields['lossStreakTeam'] = np.where(
    streak_code.str[0] == 'L', streak_length, np.nan)

  # Flag each standings row with whether its date falls in season ...
  standings_with_season_flag = standings_selected_fields.merge(
    dates_with_info[['dailyDataDate', 'inSeason']],
    on = ['dailyDataDate'],
    how = 'left'
    )

  # ... then keep the in-season rows only and drop the helper fields that
  # were needed just for the derived values
  standings_for_digital_engagement_merge = (standings_with_season_flag.
    query("inSeason").
    drop(['streakCodeTeam', 'streakLengthTeam', 'inSeason'], axis = 1).
    reset_index(drop = True)
    )

  #### Merge together various data frames to add date, player, roster, and team info ####
  # NOTE(review): every join below is a LEFT join onto the engagement rows. A
  # right-hand table with more than one row per join key (e.g. several
  # team-date rows for one team) duplicates engagement rows - worth verifying
  # that the final row count still equals len(nextDayPlayerEngagement).

  # Copy over player engagement df to add various pieces to it
  player_engagement_with_info = nextDayPlayerEngagement.copy()

  # Take "row mean" across targets to add (helps with studying all 4 targets at once)
  player_engagement_with_info['targetAvg'] = np.mean(
    player_engagement_with_info[['target1', 'target2', 'target3', 'target4']],
    axis = 1)

  # Merge in date information (raw season milestone dates are dropped; the
  # derived season-part / days-since fields are kept)
  player_engagement_with_info = pd.merge(
    player_engagement_with_info,
    dates_with_info.drop(['seasonId', 'seasonStartDate',
       'seasonEndDate', 'preSeasonStartDate', 'preSeasonEndDate',
       'regularSeasonStartDate', 'regularSeasonEndDate', 'lastDate1stHalf',
       'allStarDate', 'firstDate2ndHalf', 'postSeasonStartDate',
       'postSeasonEndDate'],axis=1),
    on = ['dailyDataDate'],
    how = 'left'
    )

  # Merge in some player information
  player_engagement_with_info = pd.merge(
    player_engagement_with_info,
    players[['playerId', 'playerName', 'DOB', 'mlbDebutDate', 'birthCity',
      'birthStateProvince', 'birthCountry', 'primaryPositionName']],
      on = ['playerId'],
      how = 'left'
      )

  # Age in completed years and days since MLB debut, both as floats with NaN
  # where the source date is missing. Computed from `.dt.days` because current
  # pandas rejects `.astype('timedelta64[Y]')` / `.astype('timedelta64[D]')`.
  # 365.2425 is the mean Gregorian year length in days, the same factor the
  # old year-unit conversion used.
  days_since_birth = (
    player_engagement_with_info.dailyDataDate
    - pd.to_datetime(player_engagement_with_info.DOB)
    ).dt.days
  player_engagement_with_info['age'] = np.floor(days_since_birth / 365.2425)

  player_engagement_with_info['days_since_debut'] = (
    player_engagement_with_info.dailyDataDate
    - pd.to_datetime(player_engagement_with_info.mlbDebutDate)
    ).dt.days.astype(float)

  # The raw dates are no longer needed once the derived fields exist
  player_engagement_with_info = player_engagement_with_info.drop(
    ['DOB', 'mlbDebutDate'], axis=1)

  # Merge in some player roster information by date
  player_engagement_with_info = pd.merge(
    player_engagement_with_info,
    (rosters[['dailyDataDate', 'playerId', 'status', 'teamId']].
      rename(columns = {
        'status': 'rosterStatus',
        'teamId': 'rosterTeamId'
        })
      ),
    on = ['dailyDataDate', 'playerId'],
    how = 'left'
    )
      
  # Merge in team name from player's roster team
  player_engagement_with_info = pd.merge(
    player_engagement_with_info,
    (teams[['id', 'teamName', 'locationName', 'leagueId', 'divisionId']].
      rename(columns = {
        'id': 'rosterTeamId',
        'teamName': 'rosterTeamName',
        'locationName': 'rosterTeamLocation', 
        'leagueId': 'rosterTeamLeagueId', 
        'divisionId': 'rosterDivisionId'
        })
      ),
    on = ['rosterTeamId'],
    how = 'left'
    )

  # Merge in some player game stats (previously aggregated) from that date
  player_engagement_with_info = pd.merge(
    player_engagement_with_info,
    player_date_stats_agg,
    on = ['dailyDataDate', 'playerId'],
    how = 'left'
    )

  # Merge in team name from player's game team
  player_engagement_with_info = pd.merge(
    player_engagement_with_info,
    (teams[['id', 'teamName', 'locationName', 'leagueId', 'divisionId']].
      rename(columns = {
        'id': 'gameTeamId',
        'teamName': 'gameTeamName',
        'locationName': 'gameTeamLocation', 
        'leagueId': 'gameTeamLeagueId', 
        'divisionId': 'gameDivisionId'
        })
      ),
    on = ['gameTeamId'],
    how = 'left'
    )

  # Merge in some team game stats/results (previously aggregated) from that date
  player_engagement_with_info = pd.merge(
    player_engagement_with_info,
    team_date_stats_agg.rename(columns = {'teamId': 'gameTeamId'}),
    on = ['dailyDataDate', 'gameTeamId'],
    how = 'left'
    )

  # Merge in player transactions of note on that date
  # League / division lookups for the team a player leaves and the team he joins
  from_team_info = teams[['id','leagueId','divisionId']].rename(
    columns={'id':'fromTeamId','leagueId':'fromLeagueId','divisionId':'fromDivisionId'})
  to_team_info = teams[['id','leagueId','divisionId']].rename(
    columns={'id':'toTeamId','leagueId':'toLeagueId','divisionId':'toDivisionId'})

  transactions = (
    transactions[['dailyDataDate','playerId','fromTeamId','toTeamId','typeCode']]
    .merge(from_team_info, on='fromTeamId', how='left')
    .merge(to_team_info, on='toTeamId', how='left')
    )

  # Category of the move; np.select uses the first condition that is true.
  # 'to_minor_league' is assigned whenever the destination team has no league
  # in the `teams` table.
  transactions['transaction_category']=np.select(
      [
        transactions['toLeagueId'].isna(), 
        transactions['fromLeagueId'] != transactions['toLeagueId'],
        transactions['fromDivisionId'] != transactions['toDivisionId']
      ], 
      [
        'to_minor_league',
        'change_league',
        'change_division'
      ], 
      # A float NaN default cannot share a dtype with string labels: depending
      # on the NumPy version it becomes the literal text 'nan' or raises. None
      # leaves moves that match no condition genuinely missing.
      default = None
  )

  # NOTE(review): a player with several transactions on one date contributes
  # several rows here, which duplicates his engagement row in this left join.
  player_engagement_with_info = pd.merge(
    player_engagement_with_info,
    transactions[['typeCode','dailyDataDate','playerId','transaction_category']],
    on = ['dailyDataDate', 'playerId'],
    how = 'left'
  )
  
  # Merge with awards
  # Stack the current `awards` frame on top of the awards kept earlier as
  # `awards_additional`, then order by award and date so consecutive rows of
  # one award can be compared.
  award_columns = ['dailyDataDate','playerId','awardId']
  awards = pd.concat([awards[award_columns], awards_additional])
  awards = awards.sort_values(['awardId','dailyDataDate']).reset_index(drop=True)

  # Walk through the sorted awards keeping a running streak per award:
  #   cum_awards - how many times in a row (within one awardId) the same
  #                player has received the award, counting the current row
  #   end_streak - 1 on a row where a different player takes over an award
  #                that the previous player had received more than once in a
  #                row, else 0
  awardId=''
  playerId=0
  cnt=0
  cum_awards=[]
  end_streak=[]
  for current_award, current_player in zip(awards['awardId'], awards['playerId']):
      # A new award starts with a clean slate (no previous holder, no streak)
      if current_award != awardId:
          awardId=current_award
          cnt=0
          playerId=0
      if current_player == playerId:
          cnt+=1
          end_streak.append(0)
      else:
          end_streak.append(1 if cnt > 1 else 0)
          cnt=1
          playerId=current_player
      cum_awards.append(cnt)
  awards['cum_awards']=cum_awards
  awards['end_streak']=end_streak

  # One row per player and date: number of awards plus streak summaries
  agg_awards = (awards.
    groupby(['playerId','dailyDataDate'], as_index = False).
    agg(
      num_awards = ('cum_awards', 'size'),
      max_award_streak = ('cum_awards', 'max'),
      min_award_streak = ('cum_awards', 'min'),
      mean_award_streak = ('cum_awards', 'mean'),
      end_streak = ('end_streak', 'max')
      )
    )

  player_engagement_with_info = player_engagement_with_info.merge(
    agg_awards,
    on = ['dailyDataDate', 'playerId'],
    how = 'left'
  )

  # Merge in some pieces of team standings (previously filter/processed) from that date.
  # Standings are keyed by the team the player appeared for in a game that day.
  standings_by_game_team = standings_for_digital_engagement_merge.rename(
    columns = {'teamId': 'gameTeamId'})

  player_engagement_with_info = player_engagement_with_info.merge(
    standings_by_game_team,
    on = ['dailyDataDate', 'gameTeamId'],
    how = 'left'
    )

  # Persist the finished feature table, then show it
  player_engagement_with_info.to_pickle(processedpath+'player_engagement_with_info.pkl')
  display(player_engagement_with_info)


Unnamed: 0,dailyDataDate,engagementMetricsDate,playerId,target1,target2,target3,target4,targetAvg,date,year,...,max_award_streak,min_award_streak,mean_award_streak,end_streak,divisionRankTeam,leagueRankTeam,wildCardRankTeam,winPctTeam,winStreakTeam,lossStreakTeam
0,2018-01-01,2018-01-02,628317,0.011167,4.474708,0.005168,5.735294,2.556584,2018-01-01,2018,...,,,,,,,,,,
1,2018-01-01,2018-01-02,547989,0.042993,5.593385,0.045033,2.794118,2.118882,2018-01-01,2018,...,,,,,,,,,,
2,2018-01-01,2018-01-02,519317,0.974327,56.177043,13.693746,64.166667,33.752945,2018-01-01,2018,...,,,,,,,,,,
3,2018-01-01,2018-01-02,607625,0.006700,2.675097,0.005168,1.862745,1.137428,2018-01-01,2018,...,,,,,,,,,,
4,2018-01-01,2018-01-02,592547,0.001117,0.632296,0.002953,0.931373,0.391934,2018-01-01,2018,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2508993,2021-04-30,2021-05-01,451661,0.000000,0.013314,0.000000,0.625925,0.159810,2021-04-30,2021,...,,,,,,,,,,
2508994,2021-04-30,2021-05-01,519301,0.000131,0.003329,0.000000,0.216229,0.054922,2021-04-30,2021,...,,,,,,,,,,
2508995,2021-04-30,2021-05-01,527055,0.000000,0.019971,0.000000,0.273131,0.073276,2021-04-30,2021,...,,,,,,,,,,
2508996,2021-04-30,2021-05-01,543484,0.000131,0.056586,0.000000,1.024240,0.270239,2021-04-30,2021,...,,,,,,,,,,


In [22]:
# Persist the intermediate tables built by the preprocessing cell, one pickle
# per table, each named after its variable.
intermediate_tables = {
    'agg_awards': agg_awards,
    'team_games': team_games,
    'team_game_stats': team_game_stats,
    'dates_with_info': dates_with_info,
    'player_game_stats': player_game_stats,
    'standings_for_digital_engagement_merge': standings_for_digital_engagement_merge,
    'rosters': rosters,
    'player_date_stats_agg': player_date_stats_agg,
    'team_date_stats_agg': team_date_stats_agg,
    'transactions': transactions,
    'awards': awards,
    'nextDayPlayerEngagement': nextDayPlayerEngagement,
}
for table_name, table in intermediate_tables.items():
    table.to_pickle(processedpath + table_name + '.pkl')

In [24]:
# Quick look at the `games` table (presumably the one unnested from the daily
# training data by the preprocessing cell - confirm)
games

Unnamed: 0,dailyDataDate,gamePk,gameType,season,gameDate,gameTimeUTC,resumeDate,resumedFrom,codedGameState,detailedGameState,...,homeWinner,homeScore,awayId,awayName,awayAbbrev,awayWins,awayLosses,awayWinPct,awayWinner,awayScore
0,2018-02-21,533782,E,2018,2018-02-21,2018-02-21T20:10:00Z,,,F,Final,...,True,7.0,5035,Arizona State Sun Devils,ASU,0.0,1.0,0.000,False,2.0
1,2018-02-22,534461,E,2018,2018-02-22,2018-02-22T18:05:00Z,,,F,Final,...,True,6.0,228,Florida Southern College Mocs,FSC,0.0,1.0,0.000,False,1.0
2,2018-02-22,545334,E,2018,2018-02-22,2018-02-22T18:05:00Z,,,F,Final,...,True,6.0,231,University of Tampa Spartans,UT,0.0,1.0,0.000,False,0.0
3,2018-02-22,547295,E,2018,2018-02-22,2018-02-22T03:33:00Z,,,F,Final,...,True,4.0,227,Boston College Eagles,BC,0.0,1.0,0.000,False,2.0
4,2018-02-22,533784,E,2018,2018-02-22,2018-02-22T23:05:00Z,,,F,Final,...,True,2.0,4864,Minnesota Gophers,UM,0.0,1.0,0.000,False,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7855,2021-04-30,634275,R,2021,2021-04-30,2021-04-30T23:10:00Z,,,F,Final,...,False,2.0,117,Houston Astros,HOU,14.0,12.0,0.538,True,9.0
7856,2021-04-30,634391,R,2021,2021-04-30,2021-05-01T00:10:00Z,,,F,Final,...,True,9.0,118,Kansas City Royals,KC,15.0,9.0,0.625,False,1.0
7857,2021-04-30,634305,R,2021,2021-04-30,2021-04-30T23:10:00Z,,,F,Final,...,True,8.0,112,Chicago Cubs,CHC,11.0,15.0,0.423,False,6.0
7858,2021-04-30,634280,R,2021,2021-04-30,2021-04-30T23:37:00Z,,,F,Final,...,True,13.0,144,Atlanta Braves,ATL,12.0,14.0,0.462,False,5.0


to do:
- transaction
- events
- awards
- playerTwitterFollowers
- teamTwitterFollowers