In [None]:
def process(df, categories, all_categories, test_or_train='train', convert_categorical=True, tokenizer=None):
  """Clean the per-player rows in ``df`` and flatten them to one row per play.

  The function fills missing values, normalises several free-text columns,
  derives additional features, optionally integer-encodes the columns named
  in ``categories`` and finally reshapes the data so that every ``PlayId``
  becomes a single row.

  Args:
    df: input DataFrame; it is modified in place by the missing-value fill.
    categories: column names to integer-encode when ``convert_categorical``
      is true.
    all_categories: not referenced anywhere in this function.
    test_or_train: ``'train'`` builds a new tokenizer from the data;
      ``'test'`` applies the supplied ``tokenizer`` and leaves ``'Yards'``
      out of the game-level columns.
    convert_categorical: whether to encode ``categories`` at all.
    tokenizer: ``{column: {value: code}}`` mapping, used in ``'test'`` mode.

  Returns:
    ``(df, tokenizer)`` when ``convert_categorical`` is true, else ``df``.
  """
  for col in df.columns:
    column = df[col]
    # Assign the filled column back instead of calling fillna(inplace=True)
    # on the `df[col]` temporary: that chained form is deprecated and does
    # not update `df` once pandas Copy-on-Write is active.
    if column.dtype == object or isinstance(column.dtype, pd.StringDtype):
      df[col] = column.fillna("N/A")
    else:
      df[col] = column.fillna(column.mean())
    
  def clean_abbrevs(txt):
    """Replace the alternative team abbreviations CLV, BLT, ARZ and HST.

    They become CLE, BAL, ARI and HOU respectively; every other value is
    returned unchanged.
    """
    aliases = (('CLV', 'CLE'), ('BLT', 'BAL'), ('ARZ', 'ARI'), ('HST', 'HOU'))
    for alias, canonical in aliases:
      if alias == txt:
        return canonical
    return txt


  #StadiumDict = {'out': 'outdoor', 'open':'outdoor', 'in':'indoor','closed':'indoor', 'dome':'rtr roof'}
  def clean_StadiumType(txt):
    """Collapse a free-text stadium type into 'outdoor', 'indoor', 'rtr roof' or 'N/A'.

    Matching is by case-insensitive substring and the first rule that hits
    wins, so the order of the rules below is significant. Missing values
    and texts that match no rule yield 'N/A'.
    """
    if pd.isna(txt):
      return 'N/A'
    lowered = txt.lower()
    rules = (
      ('out', 'outdoor'),
      ('open', 'outdoor'),
      ('in', 'indoor'),
      ('closed', 'indoor'),
      ('dome', 'rtr roof'),
    )
    for keyword, label in rules:
      if keyword in lowered:
        return label
    return 'N/A'


  def clean_Position(txt):
    """Merge selected position codes into a shared code.

    S and SAF become SA, HB and FB become RB, MLB becomes ILB, NT becomes
    DT, OG becomes G and OT becomes T. Any other value passes through
    unchanged.
    """
    merged = {
      'S': 'SA',
      'SAF': 'SA',
      'HB': 'RB',
      'FB': 'RB',
      'MLB': 'ILB',
      'NT': 'DT',
      'OG': 'G',
      'OT': 'T',
    }
    # None of the replacement codes is itself a key, so one lookup is enough.
    return merged.get(txt, txt)


  def clean_GameWeather(txt):
    """Collapse a free-text weather description into 'rain', 'snow', 'cloudy', 'clear' or 'N/A'.

    Matching is by case-insensitive substring; rules are tried in the order
    listed and the first hit decides the label. Missing values and texts
    that match no rule yield 'N/A'.
    """
    if pd.isna(txt):
      return 'N/A'
    lowered = txt.lower()
    rules = (
      ('rain', 'rain'),
      ('snow', 'snow'),
      ('cloudy', 'cloudy'),
      ('sun', 'clear'),
      ('clear', 'clear'),
      ('in', 'clear'),
    )
    return next((label for keyword, label in rules if keyword in lowered), 'N/A')


  # Compass abbreviations that are accepted verbatim (after lower-casing).
  directions = ['n','nne', 'ne', 'ene', 'e', 'ese', 'se', 'sse', 's', 'ssw', 'sw', 'wsw', 'w', 'wnw', 'nw', 'nnw']
  def clean_WindDirection(txt):
    """Normalise a free-text wind direction to a lower-case compass code.

    Resolution order: exact compass abbreviation, spelled-out direction
    (compound words before the single words they contain), a
    "from <code>" phrase, a hyphenated pair such as "w-nw", then "calm".
    Missing values and anything unrecognised yield 'N/A'.
    """
    if pd.isna(txt):
      return 'N/A'
    lowered = txt.lower()
    if lowered in directions:
      return lowered

    spelled_out = (
      ('southwest', 'sw'),
      ('southeast', 'se'),
      ('northwest', 'nw'),
      ('northeast', 'ne'),
      ('north', 'n'),
      ('south', 's'),
      ('west', 'w'),
      ('east', 'e'),
    )
    for word, code in spelled_out:
      if word in lowered:
        return code

    if 'from' in lowered:
      # the token after the first space is returned as-is
      return lowered.split(" ")[1]
    if '-' in lowered:
      # glue the first two hyphen-separated parts together
      pieces = lowered.split("-")
      return pieces[0] + pieces[1]
    if 'calm' in lowered:
      return 'calm'
    return 'N/A'


  def clean_Turf(txt):
    """Classify a turf description as 'Hybrid', 'Grass' or 'Turf'.

    Matching is by case-insensitive substring: names containing "dd" or
    "sis" are 'Hybrid', names containing "grass" or "natural" are 'Grass',
    everything else is 'Turf'. Missing values are returned as NaN.
    """
    if pd.isna(txt):
      return np.nan
    lowered = txt.lower()
    # The text has been lower-cased, so the markers must be lower-case too;
    # comparing against 'DD' / 'SIS' could never match.
    if 'dd' in lowered or 'sis' in lowered:
      return 'Hybrid'
    if 'grass' in lowered or 'natural' in lowered:
      return 'Grass'
    return 'Turf'

  def convert_heights(txt):
    """Convert a "feet-inches" string such as "6-2" into total inches."""
    feet, inches = (int(part) for part in txt.split("-"))
    return 12 * feet + inches

  def convert_birthday(txt):
    """Turn a "month/day/year" string into a value in years.

    The month contributes 30 days per elapsed month, the year contributes
    365 days per year before 2019, and the total is divided by 365.
    """
    month, day, year = (int(part) for part in txt.split('/'))
    total_days = (month - 1) * 30 + day + (2019 - year) * 365
    return total_days / 365


  def convert_time(txt1, txt2):
    """Return the seconds from ``txt1`` to ``txt2``, looking only at the clock part.

    Both arguments are "<date>T<hh:mm:ss>Z" strings. The date part is
    ignored; a negative difference is treated as a roll-over past midnight
    and 86400 seconds are added.
    """
    def clock_fields(stamp):
      # "<date>T<clock>Z" -> (hours, minutes, seconds) as strings
      _date, clock = stamp.split('T')
      clock, _suffix = clock.split('Z')
      hh, mm, ss = clock.split(':')
      return hh, mm, ss

    start_h, start_m, start_s = clock_fields(txt1)
    end_h, end_m, end_s = clock_fields(txt2)

    elapsed = ((float(end_h) - float(start_h)) * 3600
               + (float(end_m) - float(start_m)) * 60
               + (float(end_s) - float(start_s)))
    return 86400 + elapsed if elapsed < 0 else elapsed

  def convert_yardline(poss_team, side, yards):
    """Convert a yard line into a signed value relative to the possession team.

    Returns ``int(yards)`` when ``side`` equals ``poss_team`` and
    ``int(yards) - 50`` otherwise. When ``yards`` cannot be converted to an
    integer (None, NaN, infinity, non-numeric text) the fallback 50.0 is
    returned.
    """
    try:
      yard_value = int(yards)
    except (TypeError, ValueError, OverflowError):
      # Only conversion failures fall back; a bare `except:` would also
      # swallow KeyboardInterrupt and genuine programming errors.
      return 50.0
    if side == poss_team:
      return yard_value
    return -50 + yard_value
    
  def convert_time_on_clock(quarter, game_clock):
    """Return ``quarter`` fifteen-minute periods in seconds minus the clock reading.

    ``game_clock`` is a three-field colon-separated string; the first two
    fields are read as minutes and seconds and the third is ignored.
    """
    minutes, seconds, _ignored = game_clock.split(':')
    on_clock = int(minutes) * 60 + int(seconds)
    return int(quarter) * 15 * 60 - on_clock

  def windspeed_fix(txt):
    """Parse a wind speed from the first space-separated token of ``txt``.

    ``txt`` is stringified first, so numbers are accepted as well as text
    such as "10 mph". Returns 0.0 when the first token is not a number.
    """
    first_token = str(txt).split(' ')[0]
    try:
      return float(first_token)
    except ValueError:
      # float() on a string only fails with ValueError; catching just that
      # avoids hiding unrelated errors the way a bare `except:` does.
      return 0.0

  def direction_fix(df):
    """Mirror the coordinates and angles of ``df`` when its first row's PlayDirection is 'left'.

    X becomes 120 - X, Y becomes 53.33333 - Y, and Orientation and Dir are
    rotated by 180 degrees modulo 360. Frames with any other direction are
    returned untouched. ``df`` is modified in place and also returned.
    """
    if df.PlayDirection.iloc[0] != 'left':
      return df
    df.X = 120 - df.X
    df.Y = 53.33333 - df.Y
    for angle_col in ('Orientation', 'Dir'):
      df[angle_col] = (180 + df[angle_col]) % 360.
    return df

  # ---- Column-by-column cleaning, using the helpers defined above ----
  print("Cleaning data")
  # Team abbreviations appear in both columns, so both get the same fix-up.
  df['PossessionTeam'] = df['PossessionTeam'].apply(clean_abbrevs)
  df['FieldPosition'] = df['FieldPosition'].apply(clean_abbrevs)

  # clean turf
  # NOTE(review): clean_Turf only ever returns 'Hybrid', 'Grass', 'Turf' or
  # NaN, so of the keys in the `Turf` dict below only 'Grass' can still match;
  # .map() turns every other value into NaN. Confirm whether applying both
  # steps is intended.
  df['Turf'] = df['Turf'].apply(clean_Turf)
  Turf = {'Field Turf':'Artificial', 'A-Turf Titan':'Artificial', 'Grass':'Natural', 
          'UBU Sports Speed S5-M':'Artificial', 'Artificial':'Artificial', 
          'DD GrassMaster':'Artificial', 'Natural Grass':'Natural', 
          'UBU Speed Series-S5-M':'Artificial', 'FieldTurf':'Artificial', 
          'FieldTurf 360':'Artificial', 'Natural grass':'Natural', 'grass':'Natural', 
          'Natural':'Natural', 'Artifical':'Artificial', 'FieldTurf360':'Artificial', 
          'Naturall Grass':'Natural', 'Field turf':'Artificial', 'SISGrass':'Artificial', 
          'Twenty-Four/Seven Turf':'Artificial', 'natural grass':'Natural'} 
  df['Turf'] = df['Turf'].map(Turf)

  df['WindDirection'] = df['WindDirection'].apply(clean_WindDirection)

  # clean game weather
  # Step 1: any text containing the (case-sensitive) substring "indoor" is
  #         replaced by exactly "indoor".
  # Step 2: lower-case, fix a few misspellings and drop filler words.
  # Step 3: collapse to the coarse labels produced by clean_GameWeather.
  indoor = "indoor"
  df['GameWeather'] = df['GameWeather'].apply(lambda x: indoor if not pd.isna(x) and indoor in x else x)
  df['GameWeather'] = df['GameWeather'].apply(lambda x: x.lower().replace('coudy', 'cloudy').replace('clouidy', 'cloudy').replace('party', 'partly').replace('clear and sunny', 'sunny and clear').replace('skies', '').replace("mostly", "").strip() if not pd.isna(x) else x)
  df['GameWeather'] = df['GameWeather'].apply(clean_GameWeather)
    
  df['StadiumType'] = df['StadiumType'].apply(clean_StadiumType)
  # Rows with Season == 2017 get their Orientation rotated by 90 degrees
  # (mod 360); presumably that season uses a different angle convention —
  # TODO confirm.
  df.loc[df.Season == 2017, 'Orientation'] = np.mod(90 + df.loc[df.Season == 2017, 'Orientation'], 360)
  df["PlayerHeight"] = df["PlayerHeight"].apply(convert_heights)
  df["WindSpeed"] = df["WindSpeed"].apply(windspeed_fix)
  df['PlayerAge'] = df['PlayerBirthDate'].apply(convert_birthday)
  df['Position'] = df['Position'].apply(clean_Position)
  # TrimmedPlayId is the integer formed by the characters of PlayId from
  # position 10 onwards.
  df['TrimmedPlayId'] = df.PlayId.apply(lambda x: int(str(x)[10:]))

  def kinetic_energy(W, S):
    """Return 0.5 * W * S**2 for a weight ``W`` and a speed ``S``."""
    half_weight = 0.5 * W
    return half_weight * S * S

  # ---- Row-wise derived features ----
  # The columns are walked positionally with zip() rather than via
  # `df[col][i]`, which is a label lookup: it only works when the index is
  # exactly 0..n-1 and pays for a Series lookup on every element.
  df['KE'] = [
    kinetic_energy(weight, speed)
    for weight, speed in zip(df['PlayerWeight'], df['S'])
  ]

  df['TimeToHandoff'] = [
    convert_time(snap, handoff)
    for snap, handoff in zip(df['TimeSnap'], df['TimeHandoff'])
  ]

  df['ScrimmageLine'] = [
    convert_yardline(team, side, yards)
    for team, side, yards in zip(df['PossessionTeam'], df['FieldPosition'], df['YardLine'])
  ]

  # `5 - Quarter` periods of 15 minutes minus the clock reading.
  df['Time'] = [
    convert_time_on_clock(5 - quarter, clock)
    for quarter, clock in zip(df['Quarter'], df['GameClock'])
  ]

  # Same computation for a single 15-minute period.
  df['QuarterTime'] = [convert_time_on_clock(1, clock) for clock in df['GameClock']]

  # Mirror every play whose PlayDirection is 'left' (see direction_fix).
  df = df.groupby("PlayId", group_keys=False).apply(direction_fix)


  def add_positions(df):
    """Expand the personnel strings into one count column per position code.

    ``OffensePersonnel`` / ``DefensePersonnel`` hold text such as
    "1 RB, 1 TE, 3 WR". For every position code found, a column named
    ``Off<code>`` or ``Def<code>`` is added holding the integer count for the
    play (taken from the first row of each ``PlayId`` group), with 0 where
    the code does not occur. The two source columns are dropped.

    Args:
      df: frame with ``PlayId``, ``OffensePersonnel`` and
        ``DefensePersonnel`` columns.

    Returns:
      A new DataFrame with the count columns added.
    """
    def position_columns(personnel, prefix):
      # Join with a space: with '' the last code of one string is glued to
      # the leading digit of the next ("3 WR" + "1 RB" -> "3 WR1 RB"), which
      # removes the word boundary the pattern relies on and hides the code.
      joined = ' '.join(personnel.unique())
      codes = np.unique(re.findall(r'\b[A-Z][A-Z]+\b', joined))
      return [prefix + code for code in codes]

    off_pos = position_columns(df.OffensePersonnel, 'Off')
    def_pos = position_columns(df.DefensePersonnel, 'Def')

    # Create every count column up front (as NaN) so all plays share a schema.
    df = df.reindex(columns=df.columns.values.tolist() + off_pos + def_pos)

    def parse_df(df):
      # All rows of a play carry the same personnel text; read the first row.
      for source, prefix in (('OffensePersonnel', 'Off'), ('DefensePersonnel', 'Def')):
        tokens = df.iloc[0][source].replace(',', ' ').split()
        counts, codes = tokens[::2], tokens[1::2]
        # Store numeric counts; keeping the raw strings would mix str
        # values with the integer 0 used for absent positions below.
        df[[prefix + code for code in codes]] = [int(count) for count in counts]
      return df

    # group_keys=False matches the other groupby/apply calls in this file and
    # keeps PlayId from also becoming an index level.
    df = df.groupby('PlayId', group_keys=False).apply(parse_df)
    df.drop(columns=['OffensePersonnel', 'DefensePersonnel'], inplace=True)

    for c in off_pos + def_pos:
      df[c] = df[c].fillna(0)

    return df

  # Replace the two personnel text columns by per-position count columns.
  print("adding positions")
  df = add_positions(df)

  
  def make_vectors(points):
    """
    Build the vectors that run from the uppermost vertex (largest second
    coordinate) of a convex shape to each of its other vertices.

    The vertices are ordered by descending second coordinate first, so the
    returned array has one row fewer than ``points``.
    """
    by_height = sorted(points, key=lambda pt: -pt[1])
    ordered = np.array(by_height)
    apex = ordered[0]
    from_apex = ordered - apex
    return from_apex[1:]

  def order_vectors(vectors):
    """
    Sort 2-D vectors by the angle they make with the x axis.

    vectors: (x, y) vectors as produced by make_vectors

    The angle is arcsin(|y| / |vec|), reflected to pi - angle when x is
    negative, so it lies between 0 and pi. Returns a list in ascending
    angle order.
    """
    def angle(vec):
      # With x_hat = (1, 0) the magnitude of x_hat x vec is simply |vec_y|.
      # It is computed directly because np.cross on 2-element vectors is
      # deprecated since NumPy 2.0.
      sin_angle = abs(vec[1]) / np.linalg.norm(vec)
      theta = np.arcsin(sin_angle)
      if vec[0] < 0:
        # because arcsin only gives between -pi/2 and pi/2
        theta = np.pi - theta
      return theta

    return sorted(vectors, key=angle)

  def polygon_area(points):
    """
    Area of a convex polygon given its vertices.

    The polygon is fanned into triangles from its uppermost vertex: the
    vectors from that vertex to the others are sorted by angle and the
    areas of the triangles spanned by each pair of neighbouring vectors are
    summed. This only works because the polygons are convex.

    Returns 0 when fewer than three vertices are supplied.
    """
    vectors = make_vectors(points)
    ord_vecs = order_vectors(vectors)

    area = 0
    for first, second in zip(ord_vecs, ord_vecs[1:]):
      # Half the magnitude of the 2-D cross product is the triangle area.
      # Written out by hand because np.cross on 2-element vectors is
      # deprecated since NumPy 2.0.
      area += 0.5 * abs(first[0] * second[1] - first[1] * second[0])

    return area

  def get_dx_dy(delta_t, rusher_dir, rusher_or, speed, acceleration):
    """Return the (dx, dy) displacement after ``delta_t``.

    The speed term acts along ``rusher_dir`` and the acceleration term
    (0.5 * delta_t**2 * acceleration) along ``rusher_or``; both angles are
    passed straight to math.cos / math.sin.
    """
    travelled = delta_t * speed
    accel_term = 0.5*delta_t**2 * acceleration
    dx = travelled * math.cos(rusher_dir) + accel_term * math.cos(rusher_or)
    dy = travelled * math.sin(rusher_dir) + accel_term * math.sin(rusher_or)
    return dx, dy

  def voronoi_dist_to_rb(df, delta_t):
    """Compute Voronoi areas and distance to RB for an array of Δt offsets.

    For every offset ``dt`` each row's position is projected forward as
    X/Y + S*dt along ``Dir`` + 0.5*A*dt**2 along ``Orientation``. Within
    each ``PlayId`` group a Voronoi diagram of the projected positions
    gives ``VoronoiArea_<dt>``, and ``DistanceToRB_dt_<dt>`` is the
    Euclidean distance to the row whose ``NflId`` equals ``NflIdRusher``.

    Args:
      df: frame with PlayId, X, Y, S, A, Dir, Orientation, NflId and
        NflIdRusher columns. Temporary ``X_<dt>`` / ``Y_<dt>`` columns are
        written to it before grouping.
      delta_t: a single offset or an iterable of offsets.

    Returns:
      A DataFrame with the new feature columns and without the temporary
      projected-position columns.
    """
    try:
      iter(delta_t)
    except TypeError:
      # A bare number is not iterable; treat it as a one-element list.
      delta_t = [delta_t]

    # (90 - angle) mod 360, converted from degrees to radians.
    direction = np.mod(90. - df.Dir, 360.) * np.pi / 180.
    orientation = np.mod(90. - df.Orientation, 360.) * np.pi / 180.

    for dt in delta_t:
      dx = (dt * df.S) * np.cos(direction) + 0.5 * dt ** 2 * df.A * np.cos(orientation)
      dy = (dt * df.S) * np.sin(direction) + 0.5 * dt ** 2 * df.A * np.sin(orientation)
      df['X_' + str(dt)] = df.X + dx
      df['Y_' + str(dt)] = df.Y + dy

    def add_features(df):
      for dt in delta_t:
        # Compute Voronoi areas
        coords = ['X_' + str(dt), 'Y_' + str(dt)]
        vor = Voronoi(df[coords].values)
        # NOTE(review): scipy marks a vertex at infinity with index -1, so
        # for unbounded regions `vor.vertices[i]` picks the last vertex;
        # confirm that the resulting areas are what is wanted.
        areas = np.array([polygon_area([vor.vertices[i] for i in region])
                          if len(region) > 0 else 0. for region in vor.regions])
        df['VoronoiArea_' + str(dt)] = areas[vor.point_region]

        # Compute distance to RB
        rusher = df[df.NflId == df.NflIdRusher]
        dist = np.linalg.norm((df[coords] - rusher[coords].values).values, axis=1)
        df['DistanceToRB_dt_' + str(dt)] = dist
      return df

    df = df.groupby('PlayId', group_keys=False).apply(add_features)
    df = df.drop(columns=['X_' + str(dt) for dt in delta_t] + ['Y_' + str(dt) for dt in delta_t])
    return df

  # Offsets at which positions are projected forward; str() of each value
  # becomes part of the generated column names (e.g. 'DistanceToRB_dt_0.0').
  delta_t = [0., 0.25, 0.5, 0.75]
  print("adding vorobois")
  df = voronoi_dist_to_rb(df, delta_t)

  def centroid_dif(df):
    """Add a ``Centroid`` column holding the distance between the two teams' centroids.

    The centroid of a team is the mean X and mean Y of the rows whose
    ``Team`` is 'home' or 'away'; the Euclidean distance between the two
    centroids is written to every row of ``df``.

    Returns:
      ``df`` with the ``Centroid`` column set.
    """
    home = df[df.Team == 'home']
    away = df[df.Team == 'away']
    gap_x = home.X.mean() - away.X.mean()
    gap_y = home.Y.mean() - away.Y.mean()
    # Assigning the scalar broadcasts it to however many rows the play has,
    # instead of requiring exactly 22.
    df["Centroid"] = np.sqrt(gap_x * gap_x + gap_y * gap_y)
    return df

  # Distance between the home and away centroids, computed per play.
  print("Adding Centroid Diffs")
  df = df.groupby('PlayId', group_keys=False).apply(centroid_dif)

  # Distance to the rusher at dt = 0 divided by the row's own speed. A speed
  # of 0 produces inf (or NaN for 0/0), which is replaced at the end of the
  # function.
  df['MinTimeToTackle']= df['DistanceToRB_dt_0.0']/df.S

  # Speed split into x / y components using (90 - Dir) mod 360 in radians.
  # `orientation` is computed the same way but is not used in the rest of
  # this function.
  direction = np.mod(90. - df.Dir, 360.) * np.pi / 180.0
  orientation = np.mod(90. - df.Orientation, 360.) * np.pi / 180.0
  df['Sx'] =  (df.S) * np.cos(direction)
  df['Sy'] = (df.S) * np.sin(direction)

  def nn_to_rb(df):
    """Get nearest neighbors to running back

    ``df`` holds the rows of one team in one play. The rows are sorted by
    ``DistanceToRB_dt_0.0`` and numbered in that order in a new ``NNToRB``
    column: from 0 when the group contains the rusher (a row whose
    ``NflId`` equals ``NflIdRusher``), otherwise from 11.

    Returns:
      The sorted copy of ``df`` with the ``NNToRB`` column added.
    """
    has_rusher = bool((df.NflId == df.NflIdRusher).any())
    offset = 0 if has_rusher else 11
    df = df.sort_values('DistanceToRB_dt_0.0')
    # Number however many rows the group has rather than assuming 11.
    df['NNToRB'] = offset + np.arange(len(df))
    return df

  # Rank the rows of each (play, team) pair by their distance to the rusher.
  print("NN to Rusher")
  df = df.groupby(['PlayId', 'Team'], group_keys=False).apply(nn_to_rb)


  # ---- Integer-encode the columns listed in `categories` ----
  # Note that the message below is printed even when convert_categorical
  # is False.
  print("Converting categorical") 
  if convert_categorical:
    cat_df = df[categories]
    if test_or_train == 'train':
      # tokenize categorical variables
      # Commented-out example value for `categories`:
#       categories = ['PossessionTeam', 'Down', 'NflIdRusher', 'OffenseFormation', 
#                   'NflId', 'JerseyNumber', 'Position', 
#                   'PlayerHeight', 'PlayerWeight','PlayerCollegeName',
#                   'HomeTeamAbbr', 'VisitorTeamAbbr', 'Week', 'Stadium', 'Location', 
#                   'StadiumType', 'Turf', 'GameWeather', 'Temperature', 'Humidity', 
#                   'WindSpeed', 'WindDirection',  'Quarter'] 

      # Each column becomes a pandas categorical and is replaced by its codes.
      cat_df = cat_df.astype('category')
      cat_df = cat_df.apply(lambda x: x.cat.codes)

      def create_tokenizer(df, cat_df):
        """Build {column: {original value: code}} from the raw and encoded frames.

        The unique values of the raw column and of the encoded column are
        paired up in order of first appearance.
        """
        tokenizer = {}
        for cat in categories:
            a = df[cat].unique()
            b = cat_df[cat].unique()
            tokenizer.update({cat: {num: tok for num, tok in zip(a, b)}})
        return tokenizer
    
      print("Making tokenizer")
      tokenizer = create_tokenizer(df, cat_df)
    elif test_or_train == 'test':
        # Re-use the mapping passed in through the `tokenizer` argument.
        for cat in categories:
          cat_df[cat] = df[cat].map(tokenizer[cat])     
        
        # note: if the dictionary does not exhaustively map all
        # entries then non-matched entries are changed to NaNs
        
    # Write the encoded columns back over the originals.
    df[categories] = cat_df


  def flatten_data(df):
    """Collapse the player rows of one play into a single-row DataFrame.

    Rows are ordered by ``NNToRB`` and their values laid out end to end;
    the resulting columns are named by ``playerDataCol``.
    """
    ordered = df.sort_values(by = ['NNToRB'])
    single_row = ordered.values.flatten().reshape(1, -1)
    return pd.DataFrame(data = single_row, columns = playerDataCol)

  def game_data(df):
    """Return the first row of ``df`` as a single-row DataFrame.

    The columns of the result are named by ``gameData``.
    """
    first_row = df.iloc[0,:]
    single_row = first_row.values.flatten().reshape(1, -1)
    return pd.DataFrame(data = single_row, columns = gameData)

  """We need a better way of picking out the columns related to players vs. games 
    although I guess doing it manually isn't that bad.... """

  # Columns taken once per play (from the first row of each play).
  gameData = ['GameId', 'PlayId', 'TrimmedPlayId', 'Team', 'Season', 'YardLine',
        'Quarter', 'GameClock', 'PossessionTeam', 'Down', 'Distance',
        'FieldPosition', 'HomeScoreBeforePlay', 'VisitorScoreBeforePlay',
        'NflIdRusher', 'OffenseFormation', 'DefendersInTheBox', 'PlayDirection',
        'TimeHandoff', 'TimeSnap', 'Yards', 'HomeTeamAbbr',
        'VisitorTeamAbbr', 'Week', 'Stadium', 'Location', 'StadiumType', 'Turf',
        'GameWeather', 'Temperature', 'Humidity', 'WindSpeed', 'WindDirection',
        'ScrimmageLine', 'Time']
  # 'Yards' is left out of the game-level columns in test mode.
  if test_or_train == 'test':
        gameData.remove('Yards')

  # Columns taken for every player row of a play.
  playerData=['PlayId','X', 'Y', 'S', 'A', 'Dis', 'Orientation',
        'Dir', 'NflId', 'DisplayName', 'JerseyNumber', 'PlayerHeight', 'PlayerWeight',
        'PlayerBirthDate', 'PlayerCollegeName', 'Position', 
        'PlayerAge', 'Centroid', 'Sx', 'Sy', 'KE',
        'DistanceToRB_dt_0.0', 'DistanceToRB_dt_0.25', 'DistanceToRB_dt_0.5',
        'DistanceToRB_dt_0.75','VoronoiArea_0.0','VoronoiArea_0.25',
        'VoronoiArea_0.5','VoronoiArea_0.75',  'NNToRB']
        

  # Column names for the flattened player data: every playerData column is
  # repeated for 11 slots per side. Because the suffix is i + k, the '_O_'
  # names run from 0 to 10 while the '_D_' names run from 1 to 11.
  playerDataCol=[]
  sides = ['_O_', '_D_']
  for k,side in enumerate(sides): 
    for i in range(11): 
      for player_values in playerData: 
        title = str(player_values)+side+str(i+k)
        playerDataCol.append(title)

  dfPlayers = df[playerData]
  dfGame = df[gameData]

  # One output row per play: flattened player values plus first-row game values.
  print("Flatten data")
  dfPlayers = dfPlayers.groupby(['PlayId'], group_keys=False).apply(flatten_data)
  dfGame = dfGame.groupby(['PlayId'], group_keys=False).apply(game_data)

  df = pd.concat([dfGame, dfPlayers], axis = 1)
  # 'X_O_0' is the X value of the first '_O_' slot of the flattened row.
  df['HandoffToFD'] = df['Distance'] + (df['X_O_0']-df['ScrimmageLine'])


  # Infinite values become NaN, and every NaN is then replaced by 10.
  df = df.replace([np.inf, -np.inf], np.nan)
  df = df.replace(np.nan, 10)
  
  # The tokenizer is returned alongside the frame only when encoding was requested.
  if convert_categorical:
        return df, tokenizer
  return df