## Setting up Lag Metrics

- days since the previous game(s)
- number of games played in the last 4 and 7 days

Note: these metrics must be computed separately for each team within each season.


In [1]:
import numpy as np
import pandas as pd

pd.options.display.max_columns = None

import math

In [2]:
def haversine_np(
    lat1,
    lon1,
    lat2,
    lon2,
    radius_km=6371.0,
):
    """
    Calculate the great circle distance between two points
    on the earth (specified in decimal degrees).

    All four coordinate arguments may be scalars, NumPy arrays or pandas
    Series; they are combined element-wise, so array-likes must be
    broadcastable against each other.

    Parameters
    ----------
    lat1, lon1 : latitude / longitude of the first point(s), decimal degrees.
    lat2, lon2 : latitude / longitude of the second point(s), decimal degrees.
    radius_km : sphere radius in kilometres. Defaults to 6371.0, the same
        value the scratch section of this notebook uses.

    Returns
    -------
    Distance(s) in kilometres, with the same shape as the broadcast inputs.
    """
    lon1, lat1, lon2, lat2 = map(np.radians, [lon1, lat1, lon2, lat2])

    dlon = lon2 - lon1
    dlat = lat2 - lat1

    a = np.sin(dlat / 2.0) ** 2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon / 2.0) ** 2
    # Rounding can push `a` a hair outside [0, 1] for (near-)antipodal or
    # identical points; arcsin/sqrt would then return NaN.
    a = np.clip(a, 0.0, 1.0)

    c = 2 * np.arcsin(np.sqrt(a))
    return radius_km * c

Distance between the coordinates: 1506.49 km


In [3]:
# Load the team reference table from CSV. Its `full_name` and `abbreviation`
# columns are used as join keys in later cells.
team_info_df = pd.read_csv("../data/team_info.csv")
print(team_info_df.shape)
team_info_df.head(2)

(30, 7)


Unnamed: 0,id,full_name,abbreviation,nickname,city,state,year_founded
0,1610612737,Atlanta Hawks,ATL,Hawks,Atlanta,Georgia,1949
1,1610612738,Boston Celtics,BOS,Celtics,Boston,Massachusetts,1946


In [4]:
# Load arena coordinates and attach each team's abbreviation.
coordinate_df = pd.read_csv("../data/arena_coordinates.csv")

# The coordinates file has no team id, so build a "<City> <Team>" key that is
# expected to equal `full_name` in the team table.
coordinate_df["city_team_key"] = coordinate_df["City"] + " " + coordinate_df["Team"]

# The merge below is an inner join: any team whose synthesized key does not
# match `full_name` exactly disappears without an error and later shows up as
# NaN coordinates. Surface those teams instead of losing them silently.
unmatched_teams = sorted(
    set(team_info_df["full_name"]) - set(coordinate_df["city_team_key"])
)
if unmatched_teams:
    print(
        f"WARNING: no arena coordinates matched for {len(unmatched_teams)} "
        f"team(s): {unmatched_teams}"
    )

coordinate_df = coordinate_df.merge(
    team_info_df,
    left_on="city_team_key",
    right_on="full_name",
)

# Only the join keys and the coordinates are needed downstream.
# NOTE(review): the recorded sample output shows positive longitudes for
# arenas in the western hemisphere - confirm the sign convention of the file.
keeps = [
    "city_team_key",
    "abbreviation",
    "Latitude",
    "Longitude",
]

coordinate_df = coordinate_df[keeps].copy()
print(coordinate_df.shape)
coordinate_df.head(2)

(26, 4)


Unnamed: 0,city_team_key,abbreviation,Latitude,Longitude
0,Atlanta Hawks,ATL,33.757,84.396
1,Boston Celtics,BOS,42.366,71.062


In [5]:
# Read the regular-season matchup log, parse the game date into a real
# datetime column, and discard the per-game statistics columns that are not
# needed for the lag features (keeps the frame easier to inspect).
df = (
    pd.read_csv("../data/2013-2022-regular-season-matchups.csv")
    .assign(GAME_DATE=lambda frame: pd.to_datetime(frame["GAME_DATE"]))
    .drop(
        columns=[
            "W_PCT",
            "MIN",
            "FGM",
            "FGA",
            "FG_PCT",
            "FG3M",
            "FG3A",
            "FG3_PCT",
            "FTM",
            "FTA",
            "FT_PCT",
            "OREB",
            "DREB",
            "REB",
            "AST",
            "STL",
            "BLK",
            "TOV",
            "PF",
        ]
    )
)

print(df.shape)
df.head(3)

(23958, 13)


Unnamed: 0,Team_ID,Game_ID,GAME_DATE,MATCHUP,WL,W,L,PTS,Team,SeasonStart,opponent_abbreviation,team_abbreviation,is_home_game
0,1610612737,21300012,2013-10-30,ATL @ DAL,L,0,1,109,Atlanta Hawks,2013,DAL,ATL,0
1,1610612737,21300023,2013-11-01,ATL vs. TOR,W,1,1,102,Atlanta Hawks,2013,TOR,ATL,1
2,1610612737,21300046,2013-11-03,ATL @ LAL,L,1,2,103,Atlanta Hawks,2013,LAL,ATL,0


In [7]:
# Attach arena coordinates twice: once for the row's own team and once for
# its opponent. Both joins are left joins so every schedule row is kept; a
# team missing from `coordinate_df` simply gets NaN coordinates.
# `validate="many_to_one"` makes pandas raise if `coordinate_df` ever contains
# a duplicated abbreviation, which would otherwise duplicate schedule rows.
merged = pd.merge(
    df,
    coordinate_df,
    left_on="team_abbreviation",
    right_on="abbreviation",
    how="left",
    suffixes=("", "_team"),
    validate="many_to_one",
).merge(
    coordinate_df,
    left_on="opponent_abbreviation",
    right_on="abbreviation",
    how="left",
    # The opponent's copies of the coordinate columns get this suffix; the
    # team's own copies keep their plain names.
    suffixes=("", "_opponent"),
    validate="many_to_one",
)

# No columns are dropped here. The previous `merged.drop([...])` call had no
# `axis=1` / `columns=` argument, so pandas looked the names up in the row
# index and raised KeyError. Aimed at the columns it would also have removed
# `is_home_game` and the latitude/longitude columns that the game-coordinate
# cell below reads. Prune columns only after the distance features exist.

print(merged.shape)
merged.head(2)

(23958, 21)


Unnamed: 0,Team_ID,Game_ID,GAME_DATE,MATCHUP,WL,W,L,PTS,Team,SeasonStart,opponent_abbreviation,team_abbreviation,is_home_game,city_team_key,abbreviation,Latitude,Longitude,city_team_key_opponent,abbreviation_opponent,Latitude_opponent,Longitude_opponent
0,1610612737,21300012,2013-10-30,ATL @ DAL,L,0,1,109,Atlanta Hawks,2013,DAL,ATL,0,Atlanta Hawks,ATL,33.757,84.396,Dallas Mavericks,DAL,32.791,96.81
1,1610612737,21300023,2013-11-01,ATL vs. TOR,W,1,1,102,Atlanta Hawks,2013,TOR,ATL,1,Atlanta Hawks,ATL,33.757,84.396,Toronto Raptors,TOR,43.644,79.379


In [8]:
# List the column labels of the joined frame (schedule columns plus the team
# and opponent coordinate columns added by the two merges).
merged.columns

Index(['Team_ID', 'Game_ID', 'GAME_DATE', 'MATCHUP', 'WL', 'W', 'L', 'PTS',
       'Team', 'SeasonStart', 'opponent_abbreviation', 'team_abbreviation',
       'is_home_game', 'city_team_key', 'abbreviation', 'Latitude',
       'Longitude', 'city_team_key_opponent', 'abbreviation_opponent',
       'Latitude_opponent', 'Longitude_opponent'],
      dtype='object')

In [11]:
# Where the game was actually played: the team's own arena when
# `is_home_game` is truthy, otherwise the opponent's arena.
for target, own_column, opponent_column in (
    ("game_lat", "Latitude", "Latitude_opponent"),
    ("game_lon", "Longitude", "Longitude_opponent"),
):
    merged[target] = np.where(
        merged["is_home_game"],
        merged[own_column],
        merged[opponent_column],
    )

# TODO: travel distance between consecutive game locations via haversine_np
# (not computed yet).
merged.head(2)

Unnamed: 0,Team_ID,Game_ID,GAME_DATE,MATCHUP,WL,W,L,PTS,Team,SeasonStart,opponent_abbreviation,team_abbreviation,is_home_game,city_team_key,abbreviation,Latitude,Longitude,city_team_key_opponent,abbreviation_opponent,Latitude_opponent,Longitude_opponent,game_lat,game_lon
0,1610612737,21300012,2013-10-30,ATL @ DAL,L,0,1,109,Atlanta Hawks,2013,DAL,ATL,0,Atlanta Hawks,ATL,33.757,84.396,Dallas Mavericks,DAL,32.791,96.81,32.791,96.81
1,1610612737,21300023,2013-11-01,ATL vs. TOR,W,1,1,102,Atlanta Hawks,2013,TOR,ATL,1,Atlanta Hawks,ATL,33.757,84.396,Toronto Raptors,TOR,43.644,79.379,33.757,84.396


In [None]:
# Lag features only make sense inside a single team's season, so every
# (team, season) slice is processed on its own and the pieces are stacked.
# All teams and seasons are processed (the earlier `[:2]` slices were a
# debugging shortcut that silently truncated `processed_df`).
MAX_LAG_GAMES = 7  # how many previous games to measure the gap to
REST_WINDOW_DAYS = 4  # look-back window behind `games_in_last_4_days`
NO_PRIOR_GAME = 99999  # sentinel gap when the N-th previous game does not exist

lag_columns = [f"date_difference_{lag}" for lag in range(1, MAX_LAG_GAMES + 1)]

team_datasets = []
for (team, season), group in df.groupby(["Team", "SeasonStart"], sort=False):
    # Sort chronologically so that shift(n) really means "n games earlier".
    team_season_df = group.sort_values("GAME_DATE").copy()

    # Win/loss streaks: a streak starts whenever the result differs from the
    # previous game's; the counter is the position inside that streak.
    # NOTE(review): the counter includes the current game's own result, so it
    # would leak the outcome if used as a pre-game predictor - confirm usage.
    team_season_df["start_of_streak"] = team_season_df["WL"].ne(
        team_season_df["WL"].shift()
    )
    team_season_df["streak_id"] = team_season_df["start_of_streak"].cumsum()
    team_season_df["streak_counter"] = (
        team_season_df.groupby("streak_id").cumcount() + 1
    )

    # date_difference_N = days between this game and the N-th PREVIOUS game.
    # (The earlier version used shift(-N), which looks at future games.)
    for lag, column in enumerate(lag_columns, start=1):
        team_season_df[column] = (
            team_season_df["GAME_DATE"] - team_season_df["GAME_DATE"].shift(lag)
        ).dt.days.fillna(NO_PRIOR_GAME)

    team_season_df["days_since_previous_game"] = team_season_df["date_difference_1"]

    # A previous game counts when it was at most REST_WINDOW_DAYS days ago.
    # Seven lags are more than enough to cover a four-day window, and the
    # sentinel never satisfies the comparison.
    team_season_df["games_in_last_4_days"] = (
        team_season_df[lag_columns].le(REST_WINDOW_DAYS).sum(axis=1)
    )

    team_datasets.append(team_season_df)

processed_df = pd.concat(team_datasets)
print(processed_df.shape)
processed_df.tail(3)

In [None]:
# processed_df["sum"] = processed_df[
#     [
#         "date_difference_1",
#         "date_difference_2",
#         "date_difference_3",
#         "date_difference_4",
#         "date_difference_5",
#         "date_difference_6",
#         "date_difference_7",
#     ]
# ].sum(axis=1)
# processed_df

In [None]:
# Inspect the team-season frame left over from the final iteration of the
# loop above (this relies on the loop variable still being in scope).
team_season_df

In [None]:
def _count_prior_games(game_dates, window_days):
    """
    For every date in `game_dates`, count the other dates in the same Series
    that are strictly earlier and at most `window_days` days earlier.

    Returns an integer Series aligned to `game_dates.index`. Dates are
    compared at whole-day resolution.
    """
    days = game_dates.to_numpy().astype("datetime64[D]").astype("int64")
    order = np.argsort(days, kind="stable")
    sorted_days = days[order]
    # Games strictly before each date end at `upper`; games inside the window
    # start at `lower`. Their difference is the number of games in between.
    upper = np.searchsorted(sorted_days, sorted_days, side="left")
    lower = np.searchsorted(sorted_days, sorted_days - window_days, side="left")
    counts = np.empty(len(days), dtype="int64")
    counts[order] = upper - lower
    return pd.Series(counts, index=game_dates.index)


# Convert the GAME_DATE column to datetime
df["GAME_DATE"] = pd.to_datetime(df["GAME_DATE"])

# Schedule-density features must be computed per team and per season;
# counting over the whole frame would count every team's games in the league.
# Grouping also replaces the row-by-row scan (quadratic in the number of
# games) and no longer mixes positional `iloc` with label-based `loc`, which
# only agreed while the index was an untouched RangeIndex.
team_season_dates = df.groupby(["Team", "SeasonStart"], sort=False)["GAME_DATE"]
df["GAMES_IN_LAST_4_DAYS"] = team_season_dates.transform(
    _count_prior_games, window_days=4
)
df["GAMES_IN_LAST_7_DAYS"] = team_season_dates.transform(
    _count_prior_games, window_days=7
)

# Sort the DataFrame by GAME_DATE in ascending order
df = df.sort_values("GAME_DATE")

# Days since the same team's previous game in the same season (NaN for the
# first game of each team-season).
df["DAYS_SINCE_PREV_GAME"] = (
    df.groupby(["Team", "SeasonStart"], sort=False)["GAME_DATE"].diff().dt.days
)

# Display a preview rather than the full frame
df.head()

### Scratch

In [15]:
import math

import numpy as np

R = 6371.0  # Earth's radius in kilometers

# Sample DataFrame: four coordinate pairs in decimal degrees
data = {
    "latitude": [33.757, 42.366, 40.683, 35.225],
    "longitude": [-84.396, -71.062, -73.975, -80.839],
}
df = pd.DataFrame(data)

# Radian copies of both coordinate columns
for axis_name in ("latitude", "longitude"):
    df[f"{axis_name}_rad"] = np.radians(df[axis_name])

# Same frame moved down one row, so row i holds the previous point's values
df_shifted = df.shift(1)

# Row-to-row change in latitude / longitude (radians); first row is NaN
df["dlat"] = df["latitude_rad"].diff()
df["dlon"] = df["longitude_rad"].diff()

# Haversine formula, split into its two classic intermediate terms
df["a"] = (
    np.sin(df["dlat"] / 2) ** 2
    + np.cos(df_shifted["latitude_rad"])
    * np.cos(df["latitude_rad"])
    * np.sin(df["dlon"] / 2) ** 2
)
df["c"] = 2 * np.arctan2(np.sqrt(df["a"]), np.sqrt(1 - df["a"]))

# Distance from the previous row's point to this row's point
df["distance_km"] = R * df["c"]

# The intermediate columns (radians, deltas, a, c) are intentionally kept for
# inspection; drop them here if only the distance is wanted.

df

Unnamed: 0,latitude,longitude,latitude_rad,longitude_rad,dlat,dlon,a,c,distance_km
0,33.757,-84.396,0.589171,-1.472988,,,,,
1,42.366,-71.062,0.739426,-1.240266,0.150255,0.232722,0.013913,0.236461,1506.494029
2,40.683,-73.975,0.710052,-1.291107,-0.029374,-0.050841,0.000578,0.048075,306.283805
3,35.225,-80.839,0.614792,-1.410907,-0.09526,-0.119799,0.004487,0.134069,854.153937
