## Setting up Lag Metrics

- days since previous game(s)
-

Note that we'll need to do this per season per team


In [1]:
import numpy as np
import pandas as pd

pd.options.display.max_columns = None

In [2]:
def haversine_np(lon1, lat1, lon2, lat2, km=True):
    """
    Calculate the great circle distance between two points
    on the earth (specified in decimal degrees)

    All args must be of equal length.

    """
    lon1, lat1, lon2, lat2 = map(np.radians, [lon1, lat1, lon2, lat2])

    dlon = lon2 - lon1
    dlat = lat2 - lat1

    a = np.sin(dlat / 2.0) ** 2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon / 2.0) ** 2

    c = 2 * np.arcsin(np.sqrt(a))
    if km:
        dist = 6367 * c
    else:
        dist = 3950 * c
    return dist

In [3]:
# load team info data
team_info_df = pd.read_csv("../data/team_info_with_timezone.csv")
print(team_info_df.shape)
team_info_df#.head(2)

(30, 9)


Unnamed: 0,id,full_name,abbreviation,nickname,city,state,year_founded,timezone,time_diff_from_et
0,1610612737,Atlanta Hawks,ATL,Hawks,Atlanta,Georgia,1949,America/New_York,0
1,1610612738,Boston Celtics,BOS,Celtics,Boston,Massachusetts,1946,America/New_York,0
2,1610612739,Cleveland Cavaliers,CLE,Cavaliers,Cleveland,Ohio,1970,America/New_York,0
3,1610612740,New Orleans Pelicans,NOP,Pelicans,New Orleans,Louisiana,2002,America/Chicago,-1
4,1610612741,Chicago Bulls,CHI,Bulls,Chicago,Illinois,1966,America/Chicago,-1
5,1610612742,Dallas Mavericks,DAL,Mavericks,Dallas,Texas,1980,America/Chicago,-1
6,1610612743,Denver Nuggets,DEN,Nuggets,Denver,Colorado,1976,America/Denver,-2
7,1610612744,Golden State Warriors,GSW,Warriors,Golden State,California,1946,America/Los_Angeles,-3
8,1610612745,Houston Rockets,HOU,Rockets,Houston,Texas,1967,America/Chicago,-1
9,1610612746,Los Angeles Clippers,LAC,Clippers,Los Angeles,California,1970,America/Los_Angeles,-3


In [4]:
# coordinate_df = pd.read_csv("../data/arena_coordinates.csv")

# a = coordinate_df[
#     coordinate_df['Team'].str.contains('Blaze')
# ]["Team"].tolist()

# b = team_info_df[team_info_df['full_name'].str.contains("Blaze")]
# b= b['full_name'].tolist()

# print(a, b)

In [5]:
# load arena coordinates
coordinate_df = pd.read_csv("../data/arena_coordinates.csv")
coordinate_df["city_team_key"] = coordinate_df["City"] + " " + coordinate_df["Team"]
coordinate_df = coordinate_df.merge(
    team_info_df,
    left_on="city_team_key",
    right_on="full_name",
    how="left"
)
# assert coordinate_df.shape[0] == 30

keeps = [
    "city_team_key",
    "abbreviation",
    "Latitude",
    "Longitude",
]

coordinate_df = coordinate_df[keeps].copy()
print(coordinate_df.shape)
coordinate_df.head(2)

(30, 4)


Unnamed: 0,city_team_key,abbreviation,Latitude,Longitude
0,Atlanta Hawks,ATL,33.757,84.396
1,Boston Celtics,BOS,42.366,71.062


In [6]:
# load schedule data
df = pd.read_csv("../data/2013-2022-regular-season-matchups.csv")

# sort by teams, and game date

# convert to date type
df["GAME_DATE"] = pd.to_datetime(df["GAME_DATE"])
# df['GAME_DATE'] = pd.to_datetime(df['GAME_DATE'], format="%m-%d-%Y")

# sort by team and game date (there were issues when sorted by game id for some reason)
df = df.sort_values(by=["team_abbreviation", "GAME_DATE"])

# mark covid games
df['is_covid_bubble'] = np.where((df['GAME_DATE'] >= "2020-07-09") & (df['GAME_DATE'] <= "2020-10-11"), 1, 0)


# drop some extra columns to make things easier to see
df.drop(
    ["W_PCT", "MIN", "FGM", "FGA", "FG_PCT", "FG3M", "FG3A", "FG3_PCT", "FTM"]
    + ["FTA", "FT_PCT", "OREB", "DREB", "REB", "AST", "STL", "BLK", "TOV", "PF"],
    axis=1,
    inplace=True,
)

print(df.shape)
# regular season only
print(f"covid_games: {df['is_covid_bubble'].sum()}")
# df.info()
df.head(3)

(23958, 14)
covid_games: 176


Unnamed: 0,Team_ID,Game_ID,GAME_DATE,MATCHUP,WL,W,L,PTS,team,season_start_year,opponent_abbreviation,team_abbreviation,is_home_game,is_covid_bubble
0,1610612737,21300012,2013-10-30,ATL @ DAL,L,0,1,109,Atlanta Hawks,2013,DAL,ATL,0,0
1,1610612737,21300023,2013-11-01,ATL vs. TOR,W,1,1,102,Atlanta Hawks,2013,TOR,ATL,1,0
2,1610612737,21300046,2013-11-03,ATL @ LAL,L,1,2,103,Atlanta Hawks,2013,LAL,ATL,0,0


In [7]:
temp = df[df["is_covid_bubble"] == 1]
print(temp.shape)
temp

(176, 14)


Unnamed: 0,Team_ID,Game_ID,GAME_DATE,MATCHUP,WL,W,L,PTS,team,season_start_year,opponent_abbreviation,team_abbreviation,is_home_game,is_covid_bubble
15803,1610612751,21901233,2020-07-31,BKN vs. ORL,L,30,35,118,Brooklyn Nets,2019,ORL,BKN,1,1
15804,1610612751,21901244,2020-08-02,BKN vs. WAS,W,31,35,118,Brooklyn Nets,2019,WAS,BKN,1,1
15805,1610612751,21901256,2020-08-04,BKN @ MIL,W,32,35,119,Brooklyn Nets,2019,MIL,BKN,0,1
15806,1610612751,21901267,2020-08-05,BKN @ BOS,L,32,36,115,Brooklyn Nets,2019,BOS,BKN,0,1
15807,1610612751,21901276,2020-08-07,BKN vs. SAC,W,33,36,119,Brooklyn Nets,2019,SAC,BKN,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16742,1610612764,21901263,2020-08-05,WAS vs. PHI,L,24,44,98,Washington Wizards,2019,PHI,WAS,1,1
16743,1610612764,21901278,2020-08-07,WAS @ NOP,L,24,45,107,Washington Wizards,2019,NOP,WAS,0,1
16744,1610612764,21901285,2020-08-09,WAS @ OKC,L,24,46,103,Washington Wizards,2019,OKC,WAS,0,1
16745,1610612764,21901303,2020-08-11,WAS vs. MIL,L,24,47,113,Washington Wizards,2019,MIL,WAS,1,1


In [8]:
# join the arena coordinates
merged = (
    # get location of team of interest
    pd.merge(
        df,
        coordinate_df,
        left_on="team_abbreviation",
        right_on="abbreviation",
        how="left",
        suffixes=("", "_team"),
    )
    # get location of opponent
    .merge(
        coordinate_df,
        left_on="opponent_abbreviation",
        right_on="abbreviation",
        how="left",
        suffixes=("", "_opponent"),
    )
    # get the points scored by opponent
    .merge(
        df[["Game_ID", "team_abbreviation", "PTS"]],
        left_on=["Game_ID", "opponent_abbreviation"],
        right_on=["Game_ID", "team_abbreviation"],
        suffixes=("", "_opponent"),
    )
    # get timezone diff from ET for each team
    .merge(
        team_info_df[["abbreviation", "timezone" ,"time_diff_from_et"]],
        left_on="abbreviation",
        right_on="abbreviation"
    )
    .merge(
        team_info_df[["abbreviation", "timezone" ,"time_diff_from_et"]],
        left_on="opponent_abbreviation",
        right_on="abbreviation",
        suffixes=("", "_opponent")
    )
)

# # drop extra columns from merge
# merged = merged[
#     [
#         # "Team_ID",
#         # "Game_ID",
#         "season_start_year",
#         "city_team_key",
#         "team",
#         "abbreviation",
#         "opponent_abbreviation",
#         "GAME_DATE",
#         "MATCHUP",
#         "WL",
#         # "W",
#         # "L",
#         "PTS",
#         # "team_abbreviation",
#         "is_home_game",
#         "Latitude",
#         "Longitude",
#         # "city_team_key_opponent",
#         # "abbreviation_opponent",
#         "Latitude_opponent",
#         "Longitude_opponent",
#         ## "team_abbreviation_opponent",
#         "PTS_opponent",
#     ]
# ]

print(merged.shape)
merged.head(2)

(23958, 29)


Unnamed: 0,Team_ID,Game_ID,GAME_DATE,MATCHUP,WL,W,L,PTS,team,season_start_year,opponent_abbreviation,team_abbreviation,is_home_game,is_covid_bubble,city_team_key,abbreviation,Latitude,Longitude,city_team_key_opponent,abbreviation_opponent,Latitude_opponent,Longitude_opponent,team_abbreviation_opponent,PTS_opponent,timezone,time_diff_from_et,abbreviation_opponent.1,timezone_opponent,time_diff_from_et_opponent
0,1610612737,21300012,2013-10-30,ATL @ DAL,L,0,1,109,Atlanta Hawks,2013,DAL,ATL,0,0,Atlanta Hawks,ATL,33.757,84.396,Dallas Mavericks,DAL,32.791,96.81,DAL,118,America/New_York,0,DAL,America/Chicago,-1
1,1610612737,21300231,2013-11-29,ATL vs. DAL,W,9,8,88,Atlanta Hawks,2013,DAL,ATL,1,0,Atlanta Hawks,ATL,33.757,84.396,Dallas Mavericks,DAL,32.791,96.81,DAL,87,America/New_York,0,DAL,America/Chicago,-1


In [9]:
# merged.columns

In [10]:

# time diff
merged['game_timezone_vs_ET'] = np.where(merged['is_home_game'], merged['time_diff_from_et'], merged["time_diff_from_et_opponent"])
#TODO: get hours lost from previous games times zone and hours lost from home
# merged['team_time_diff'] = merged['']

# game coordinates
merged["game_lat"] = np.where(
    merged["is_home_game"], merged["Latitude"], merged["Latitude_opponent"]
)
merged["game_lon"] = np.where(
    merged["is_home_game"], merged["Longitude"], merged["Longitude_opponent"]
)
merged["prev_game_lon"] = merged["game_lon"].shift(1).fillna(merged["Longitude"])
merged["prev_game_lat"] = merged["game_lat"].shift(1).fillna(merged["Latitude"])


merged["distance_from_previous_game"] = haversine_np(
    merged["game_lon"],
    merged["game_lat"],
    merged["prev_game_lon"],
    merged["prev_game_lat"],
    km=False
)

# point_difference
merged['point_difference'] = merged['PTS'] - merged['PTS_opponent']

print(merged.shape)
merged.head(15)

(23958, 36)


Unnamed: 0,Team_ID,Game_ID,GAME_DATE,MATCHUP,WL,W,L,PTS,team,season_start_year,opponent_abbreviation,team_abbreviation,is_home_game,is_covid_bubble,city_team_key,abbreviation,Latitude,Longitude,city_team_key_opponent,abbreviation_opponent,Latitude_opponent,Longitude_opponent,team_abbreviation_opponent,PTS_opponent,timezone,time_diff_from_et,abbreviation_opponent.1,timezone_opponent,time_diff_from_et_opponent,game_timezone_vs_ET,game_lat,game_lon,prev_game_lon,prev_game_lat,distance_from_previous_game,point_difference
0,1610612737,21300012,2013-10-30,ATL @ DAL,L,0,1,109,Atlanta Hawks,2013,DAL,ATL,0,0,Atlanta Hawks,ATL,33.757,84.396,Dallas Mavericks,DAL,32.791,96.81,DAL,118,America/New_York,0,DAL,America/Chicago,-1,-1,32.791,96.81,84.396,33.757,718.172413,-9
1,1610612737,21300231,2013-11-29,ATL vs. DAL,W,9,8,88,Atlanta Hawks,2013,DAL,ATL,1,0,Atlanta Hawks,ATL,33.757,84.396,Dallas Mavericks,DAL,32.791,96.81,DAL,87,America/New_York,0,DAL,America/Chicago,-1,0,33.757,84.396,96.81,32.791,718.172413,1
2,1610612737,21400413,2014-12-22,ATL @ DAL,W,20,7,105,Atlanta Hawks,2014,DAL,ATL,0,0,Atlanta Hawks,ATL,33.757,84.396,Dallas Mavericks,DAL,32.791,96.81,DAL,102,America/New_York,0,DAL,America/Chicago,-1,-1,32.791,96.81,84.396,33.757,718.172413,3
3,1610612737,21400843,2015-02-25,ATL vs. DAL,W,45,12,104,Atlanta Hawks,2014,DAL,ATL,1,0,Atlanta Hawks,ATL,33.757,84.396,Dallas Mavericks,DAL,32.791,96.81,DAL,87,America/New_York,0,DAL,America/Chicago,-1,0,33.757,84.396,96.81,32.791,718.172413,17
4,1610612737,21500328,2015-12-09,ATL @ DAL,W,14,9,98,Atlanta Hawks,2015,DAL,ATL,0,0,Atlanta Hawks,ATL,33.757,84.396,Dallas Mavericks,DAL,32.791,96.81,DAL,95,America/New_York,0,DAL,America/Chicago,-1,-1,32.791,96.81,84.396,33.757,718.172413,3
5,1610612737,21500723,2016-02-01,ATL vs. DAL,W,28,22,112,Atlanta Hawks,2015,DAL,ATL,1,0,Atlanta Hawks,ATL,33.757,84.396,Dallas Mavericks,DAL,32.791,96.81,DAL,97,America/New_York,0,DAL,America/Chicago,-1,0,33.757,84.396,96.81,32.791,718.172413,15
6,1610612737,21600557,2017-01-07,ATL @ DAL,W,21,16,97,Atlanta Hawks,2016,DAL,ATL,0,0,Atlanta Hawks,ATL,33.757,84.396,Dallas Mavericks,DAL,32.791,96.81,DAL,82,America/New_York,0,DAL,America/Chicago,-1,-1,32.791,96.81,84.396,33.757,718.172413,15
7,1610612737,21600897,2017-03-01,ATL vs. DAL,W,34,26,100,Atlanta Hawks,2016,DAL,ATL,1,0,Atlanta Hawks,ATL,33.757,84.396,Dallas Mavericks,DAL,32.791,96.81,DAL,95,America/New_York,0,DAL,America/Chicago,-1,0,33.757,84.396,96.81,32.791,718.172413,5
8,1610612737,21700009,2017-10-18,ATL @ DAL,W,1,0,117,Atlanta Hawks,2017,DAL,ATL,0,0,Atlanta Hawks,ATL,33.757,84.396,Dallas Mavericks,DAL,32.791,96.81,DAL,111,America/New_York,0,DAL,America/Chicago,-1,-1,32.791,96.81,84.396,33.757,718.172413,6
9,1610612737,21700484,2017-12-23,ATL vs. DAL,W,8,25,112,Atlanta Hawks,2017,DAL,ATL,1,0,Atlanta Hawks,ATL,33.757,84.396,Dallas Mavericks,DAL,32.791,96.81,DAL,107,America/New_York,0,DAL,America/Chicago,-1,0,33.757,84.396,96.81,32.791,718.172413,5


In [11]:
merged.head(4).to_dict()

  merged.head(4).to_dict()


{'Team_ID': {0: 1610612737, 1: 1610612737, 2: 1610612737, 3: 1610612737},
 'Game_ID': {0: 21300012, 1: 21300231, 2: 21400413, 3: 21400843},
 'GAME_DATE': {0: Timestamp('2013-10-30 00:00:00'),
  1: Timestamp('2013-11-29 00:00:00'),
  2: Timestamp('2014-12-22 00:00:00'),
  3: Timestamp('2015-02-25 00:00:00')},
 'MATCHUP': {0: 'ATL @ DAL',
  1: 'ATL vs. DAL',
  2: 'ATL @ DAL',
  3: 'ATL vs. DAL'},
 'WL': {0: 'L', 1: 'W', 2: 'W', 3: 'W'},
 'W': {0: 0, 1: 9, 2: 20, 3: 45},
 'L': {0: 1, 1: 8, 2: 7, 3: 12},
 'PTS': {0: 109, 1: 88, 2: 105, 3: 104},
 'team': {0: 'Atlanta Hawks',
  1: 'Atlanta Hawks',
  2: 'Atlanta Hawks',
  3: 'Atlanta Hawks'},
 'season_start_year': {0: 2013, 1: 2013, 2: 2014, 3: 2014},
 'opponent_abbreviation': {0: 'DAL', 1: 'DAL', 2: 'DAL', 3: 'DAL'},
 'team_abbreviation': {0: 'ATL', 1: 'ATL', 2: 'ATL', 3: 'ATL'},
 'is_home_game': {0: 0, 1: 1, 2: 0, 3: 1},
 'is_covid_bubble': {0: 0, 1: 0, 2: 0, 3: 0},
 'city_team_key': {0: 'Atlanta Hawks',
  1: 'Atlanta Hawks',
  2: 'Atlanta 

In [12]:
df_merged = pd.merge(df, merged)
df_merged.shape
df_merged.head()

Unnamed: 0,Team_ID,Game_ID,GAME_DATE,MATCHUP,WL,W,L,PTS,team,season_start_year,opponent_abbreviation,team_abbreviation,is_home_game,is_covid_bubble,city_team_key,abbreviation,Latitude,Longitude,city_team_key_opponent,abbreviation_opponent,Latitude_opponent,Longitude_opponent,team_abbreviation_opponent,PTS_opponent,timezone,time_diff_from_et,abbreviation_opponent.1,timezone_opponent,time_diff_from_et_opponent,game_timezone_vs_ET,game_lat,game_lon,prev_game_lon,prev_game_lat,distance_from_previous_game,point_difference
0,1610612737,21300012,2013-10-30,ATL @ DAL,L,0,1,109,Atlanta Hawks,2013,DAL,ATL,0,0,Atlanta Hawks,ATL,33.757,84.396,Dallas Mavericks,DAL,32.791,96.81,DAL,118,America/New_York,0,DAL,America/Chicago,-1,-1,32.791,96.81,84.396,33.757,718.172413,-9
1,1610612737,21300023,2013-11-01,ATL vs. TOR,W,1,1,102,Atlanta Hawks,2013,TOR,ATL,1,0,Atlanta Hawks,ATL,33.757,84.396,Toronto Raptors,TOR,43.644,79.379,TOR,95,America/New_York,0,TOR,America/Toronto,0,0,33.757,84.396,96.81,32.791,718.172413,7
2,1610612737,21300046,2013-11-03,ATL @ LAL,L,1,2,103,Atlanta Hawks,2013,LAL,ATL,0,0,Atlanta Hawks,ATL,33.757,84.396,Los Angeles Lakers,LAL,34.043,118.267,LAL,105,America/New_York,0,LAL,America/Los_Angeles,-3,-3,34.043,118.267,79.379,43.644,2168.135335,-2
3,1610612737,21300059,2013-11-05,ATL @ SAC,W,2,2,105,Atlanta Hawks,2013,SAC,ATL,0,0,Atlanta Hawks,ATL,33.757,84.396,Sacramento Kings,SAC,38.58,121.5,SAC,100,America/New_York,0,SAC,America/Los_Angeles,-3,-3,38.58,121.5,118.267,34.043,360.614957,5
4,1610612737,21300071,2013-11-07,ATL @ DEN,L,2,3,107,Atlanta Hawks,2013,DEN,ATL,0,0,Atlanta Hawks,ATL,33.757,84.396,Denver Nuggets,DEN,39.749,105.008,DEN,109,America/New_York,0,DEN,America/Denver,-2,-2,39.749,105.008,77.021,38.898,1487.653627,-2


In [13]:
team_datasets = []
for team in df_merged["team"].unique():
    team_df = df_merged[df_merged["team"] == team].copy()
    for season in team_df["season_start_year"].unique():
        # print(f"{team}: {season}")
        team_season_df = team_df[team_df["season_start_year"] == season].copy()

        team_season_df["is_last_game_home"] = (
            team_season_df["is_home_game"].shift(1).fillna(1).astype(int)
        )
        team_season_df["is_2_games_ago_home"] = (
            team_season_df["is_home_game"].shift(2).fillna(1).astype(int)
        )
        team_season_df["is_3_games_ago_home"] = (
            team_season_df["is_home_game"].shift(3).fillna(1).astype(int)
        )
        team_season_df["is_4_games_ago_home"] = (
            team_season_df["is_home_game"].shift(4).fillna(1).astype(int)
        )
        team_season_df["is_5_games_ago_home"] = (
            team_season_df["is_home_game"].shift(5).fillna(1).astype(int)
        )
        team_season_df["is_6_games_ago_home"] = (
            team_season_df["is_home_game"].shift(6).fillna(1).astype(int)
        )
        team_season_df["is_7_games_ago_home"] = (
            team_season_df["is_home_game"].shift(7).fillna(1).astype(int)
        )
        
        # num away games in last 5
        team_season_df["num_away_last_5_games"] = team_season_df[
            [
                "is_last_game_home",
                "is_2_games_ago_home",
                "is_3_games_ago_home",
                "is_4_games_ago_home",
                "is_5_games_ago_home",

            ]
        ].apply(lambda row: ((row <= 7) & (row > 0)).sum(), axis=1)

        # streak of home vs away games
        team_season_df["start_of_streak"] = team_season_df["is_home_game"].ne(
            team_season_df["is_home_game"].shift()
        )
        team_season_df["streak_id"] = team_season_df["start_of_streak"].cumsum()
        team_season_df["streak_counter"] = (
            team_season_df.groupby("streak_id").cumcount() + 1
        )
        # count of previous consecutive away games
        team_season_df["incoming_away_game_streak"] = np.where(
            team_season_df["is_last_game_home"] == False,
            team_season_df["streak_counter"] - 1,
            0,
        )

        team_season_df["days_since_1_games_ago"] = (
            team_season_df["GAME_DATE"] - team_season_df["GAME_DATE"].shift(1)
        ).dt.days.fillna(150)
        team_season_df["days_since_2_games_ago"] = (
            team_season_df["GAME_DATE"] - team_season_df["GAME_DATE"].shift(2)
        ).dt.days.fillna(150)
        team_season_df["days_since_3_games_ago"] = (
            team_season_df["GAME_DATE"] - team_season_df["GAME_DATE"].shift(3)
        ).dt.days.fillna(150)
        team_season_df["days_since_4_games_ago"] = (
            team_season_df["GAME_DATE"] - team_season_df["GAME_DATE"].shift(4)
        ).dt.days.fillna(150)
        team_season_df["days_since_5_games_ago"] = (
            team_season_df["GAME_DATE"] - team_season_df["GAME_DATE"].shift(5)
        ).dt.days.fillna(150)
        team_season_df["days_since_6_games_ago"] = (
            team_season_df["GAME_DATE"] - team_season_df["GAME_DATE"].shift(6)
        ).dt.days.fillna(150)
        team_season_df["days_since_7_games_ago"] = (
            team_season_df["GAME_DATE"] - team_season_df["GAME_DATE"].shift(7)
        ).dt.days.fillna(150)

        team_season_df["num_games_last_4_days"] = team_season_df[
            [
                "days_since_1_games_ago",
                "days_since_2_games_ago",
                "days_since_3_games_ago",
                "days_since_4_games_ago",
                "days_since_5_games_ago",
                "days_since_6_games_ago",
                "days_since_7_games_ago",
            ]
        ].apply(lambda row: ((row <= 4) & (row > 0)).sum(), axis=1)
        team_season_df["num_games_last_7_days"] = team_season_df[
            [
                "days_since_1_games_ago",
                "days_since_2_games_ago",
                "days_since_3_games_ago",
                "days_since_4_games_ago",
                "days_since_5_games_ago",
                "days_since_6_games_ago",
                "days_since_7_games_ago",
            ]
        ].apply(lambda row: ((row <= 7) & (row > 0)).sum(), axis=1)

        # distance traveled up to this game --------------------
        # Create a list to store the cumulative distances
        cumulative_distance = 0
        cumulative_distances = []
        # Iterate through the DataFrame rows
        for index, row in team_season_df.iterrows():
            # Check if it's a home game
            if row['is_last_game_home'] == 1:
                # print("here", index)
                cumulative_distance = 0  # Reset cumulative distance
            
            # Add the current distance to cumulative_distance
            cumulative_distance += row['distance_from_previous_game']
            
            # Append the cumulative distance to the list
            cumulative_distances.append(cumulative_distance)

        # Add the cumulative_distances list as a new column to the DataFrame
        team_season_df['cumulative_distance'] = cumulative_distances
        # ------------------------------------------------------------

        # concatenate
        team_datasets.append(team_season_df)

processed_df = pd.concat(team_datasets)
print(processed_df.shape)
processed_df.head(10)

(23958, 58)


Unnamed: 0,Team_ID,Game_ID,GAME_DATE,MATCHUP,WL,W,L,PTS,team,season_start_year,opponent_abbreviation,team_abbreviation,is_home_game,is_covid_bubble,city_team_key,abbreviation,Latitude,Longitude,city_team_key_opponent,abbreviation_opponent,Latitude_opponent,Longitude_opponent,team_abbreviation_opponent,PTS_opponent,timezone,time_diff_from_et,abbreviation_opponent.1,timezone_opponent,time_diff_from_et_opponent,game_timezone_vs_ET,game_lat,game_lon,prev_game_lon,prev_game_lat,distance_from_previous_game,point_difference,is_last_game_home,is_2_games_ago_home,is_3_games_ago_home,is_4_games_ago_home,is_5_games_ago_home,is_6_games_ago_home,is_7_games_ago_home,num_away_last_5_games,start_of_streak,streak_id,streak_counter,incoming_away_game_streak,days_since_1_games_ago,days_since_2_games_ago,days_since_3_games_ago,days_since_4_games_ago,days_since_5_games_ago,days_since_6_games_ago,days_since_7_games_ago,num_games_last_4_days,num_games_last_7_days,cumulative_distance
0,1610612737,21300012,2013-10-30,ATL @ DAL,L,0,1,109,Atlanta Hawks,2013,DAL,ATL,0,0,Atlanta Hawks,ATL,33.757,84.396,Dallas Mavericks,DAL,32.791,96.81,DAL,118,America/New_York,0,DAL,America/Chicago,-1,-1,32.791,96.81,84.396,33.757,718.172413,-9,1,1,1,1,1,1,1,5,True,1,1,0,150.0,150.0,150.0,150.0,150.0,150.0,150.0,0,0,718.172413
1,1610612737,21300023,2013-11-01,ATL vs. TOR,W,1,1,102,Atlanta Hawks,2013,TOR,ATL,1,0,Atlanta Hawks,ATL,33.757,84.396,Toronto Raptors,TOR,43.644,79.379,TOR,95,America/New_York,0,TOR,America/Toronto,0,0,33.757,84.396,96.81,32.791,718.172413,7,0,1,1,1,1,1,1,4,True,2,1,0,2.0,150.0,150.0,150.0,150.0,150.0,150.0,1,1,1436.344826
2,1610612737,21300046,2013-11-03,ATL @ LAL,L,1,2,103,Atlanta Hawks,2013,LAL,ATL,0,0,Atlanta Hawks,ATL,33.757,84.396,Los Angeles Lakers,LAL,34.043,118.267,LAL,105,America/New_York,0,LAL,America/Los_Angeles,-3,-3,34.043,118.267,79.379,43.644,2168.135335,-2,1,0,1,1,1,1,1,4,True,3,1,0,2.0,4.0,150.0,150.0,150.0,150.0,150.0,2,2,2168.135335
3,1610612737,21300059,2013-11-05,ATL @ SAC,W,2,2,105,Atlanta Hawks,2013,SAC,ATL,0,0,Atlanta Hawks,ATL,33.757,84.396,Sacramento Kings,SAC,38.58,121.5,SAC,100,America/New_York,0,SAC,America/Los_Angeles,-3,-3,38.58,121.5,118.267,34.043,360.614957,5,0,1,0,1,1,1,1,3,False,3,2,1,2.0,4.0,6.0,150.0,150.0,150.0,150.0,2,3,2528.750292
4,1610612737,21300071,2013-11-07,ATL @ DEN,L,2,3,107,Atlanta Hawks,2013,DEN,ATL,0,0,Atlanta Hawks,ATL,33.757,84.396,Denver Nuggets,DEN,39.749,105.008,DEN,109,America/New_York,0,DEN,America/Denver,-2,-2,39.749,105.008,77.021,38.898,1487.653627,-2,0,0,1,0,1,1,1,2,False,3,3,2,2.0,4.0,6.0,8.0,150.0,150.0,150.0,2,3,4016.403919
5,1610612737,21300086,2013-11-09,ATL vs. ORL,W,3,3,104,Atlanta Hawks,2013,ORL,ATL,1,0,Atlanta Hawks,ATL,33.757,84.396,Orlando Magic,ORL,28.539,81.384,ORL,94,America/New_York,0,ORL,America/New_York,0,0,33.757,84.396,77.021,38.898,541.319182,10,0,0,0,1,0,1,1,1,True,4,1,0,2.0,4.0,6.0,8.0,10.0,150.0,150.0,2,3,4557.723101
6,1610612737,21300098,2013-11-11,ATL @ CHA,W,4,3,103,Atlanta Hawks,2013,CHA,ATL,0,0,Atlanta Hawks,ATL,33.757,84.396,Charlotte Hornets,CHA,35.225,80.839,CHA,94,America/New_York,0,CHA,America/New_York,0,0,35.225,80.839,77.021,38.898,328.923358,9,1,0,0,0,1,0,1,2,True,5,1,0,2.0,4.0,6.0,8.0,10.0,12.0,150.0,2,3,328.923358
7,1610612737,21300116,2013-11-13,ATL vs. NYK,L,4,4,91,Atlanta Hawks,2013,NYK,ATL,1,0,Atlanta Hawks,ATL,33.757,84.396,New York Knicks,NYK,40.751,73.993,NYK,95,America/New_York,0,NYK,America/New_York,0,0,33.757,84.396,77.021,38.898,541.319182,-4,0,1,0,0,0,1,0,1,True,6,1,0,2.0,4.0,6.0,8.0,10.0,12.0,14.0,2,3,870.24254
8,1610612737,21300127,2013-11-15,ATL vs. PHI,W,5,4,113,Atlanta Hawks,2013,PHI,ATL,1,0,Atlanta Hawks,ATL,33.757,84.396,Philadelphia 76ers,PHI,39.901,75.172,PHI,103,America/New_York,0,PHI,America/New_York,0,0,33.757,84.396,73.993,40.751,746.273513,10,1,0,1,0,0,0,1,2,False,6,2,0,2.0,4.0,6.0,8.0,10.0,12.0,14.0,2,3,746.273513
9,1610612737,21300139,2013-11-16,ATL @ NYK,W,6,4,110,Atlanta Hawks,2013,NYK,ATL,0,0,Atlanta Hawks,ATL,33.757,84.396,New York Knicks,NYK,40.751,73.993,NYK,90,America/New_York,0,NYK,America/New_York,0,0,40.751,73.993,84.396,33.757,746.273513,20,1,1,0,1,0,0,0,3,True,7,1,0,1.0,3.0,5.0,7.0,9.0,11.0,13.0,2,4,746.273513


In [14]:
new_df = pd.merge(
    processed_df,
    processed_df[
        [
            "Game_ID",
            "team_abbreviation",
            "incoming_away_game_streak",
            "days_since_1_games_ago",
            "num_games_last_4_days",
            "num_games_last_7_days",
            "cumulative_distance",
        ]
    ],
    left_on=["Game_ID", "opponent_abbreviation"],
    right_on=[
        "Game_ID",
        "team_abbreviation",
    ],
    suffixes=("", "_opponent"),
).drop("team_abbreviation_opponent", axis=1)
new_df = new_df.assign(
    days_of_rest_difference=new_df["days_since_1_games_ago"]
    - new_df["days_since_1_games_ago_opponent"],
    games_last_7_diff=new_df["num_games_last_7_days"]
    - new_df["num_games_last_7_days_opponent"],
    distance_difference = new_df['cumulative_distance'] - new_df['cumulative_distance_opponent']
)
new_df["target"] = np.where(new_df["WL"] == "L", 1, 0)
print(processed_df.shape)
print(new_df.shape)
new_df.head()

(23958, 58)
(23958, 66)


Unnamed: 0,Team_ID,Game_ID,GAME_DATE,MATCHUP,WL,W,L,PTS,team,season_start_year,opponent_abbreviation,team_abbreviation,is_home_game,is_covid_bubble,city_team_key,abbreviation,Latitude,Longitude,city_team_key_opponent,abbreviation_opponent,Latitude_opponent,Longitude_opponent,PTS_opponent,timezone,time_diff_from_et,abbreviation_opponent.1,timezone_opponent,time_diff_from_et_opponent,game_timezone_vs_ET,game_lat,game_lon,prev_game_lon,prev_game_lat,distance_from_previous_game,point_difference,is_last_game_home,is_2_games_ago_home,is_3_games_ago_home,is_4_games_ago_home,is_5_games_ago_home,is_6_games_ago_home,is_7_games_ago_home,num_away_last_5_games,start_of_streak,streak_id,streak_counter,incoming_away_game_streak,days_since_1_games_ago,days_since_2_games_ago,days_since_3_games_ago,days_since_4_games_ago,days_since_5_games_ago,days_since_6_games_ago,days_since_7_games_ago,num_games_last_4_days,num_games_last_7_days,cumulative_distance,incoming_away_game_streak_opponent,days_since_1_games_ago_opponent,num_games_last_4_days_opponent,num_games_last_7_days_opponent,cumulative_distance_opponent,days_of_rest_difference,games_last_7_diff,distance_difference,target
0,1610612737,21300012,2013-10-30,ATL @ DAL,L,0,1,109,Atlanta Hawks,2013,DAL,ATL,0,0,Atlanta Hawks,ATL,33.757,84.396,Dallas Mavericks,DAL,32.791,96.81,118,America/New_York,0,DAL,America/Chicago,-1,-1,32.791,96.81,84.396,33.757,718.172413,-9,1,1,1,1,1,1,1,5,True,1,1,0,150.0,150.0,150.0,150.0,150.0,150.0,150.0,0,0,718.172413,0,150.0,0,0,718.172413,0.0,0,0.0,1
1,1610612737,21300023,2013-11-01,ATL vs. TOR,W,1,1,102,Atlanta Hawks,2013,TOR,ATL,1,0,Atlanta Hawks,ATL,33.757,84.396,Toronto Raptors,TOR,43.644,79.379,95,America/New_York,0,TOR,America/Toronto,0,0,33.757,84.396,96.81,32.791,718.172413,7,0,1,1,1,1,1,1,4,True,2,1,0,2.0,150.0,150.0,150.0,150.0,150.0,150.0,1,1,1436.344826,0,2.0,1,1,876.035741,0.0,0,560.309084,0
2,1610612737,21300046,2013-11-03,ATL @ LAL,L,1,2,103,Atlanta Hawks,2013,LAL,ATL,0,0,Atlanta Hawks,ATL,33.757,84.396,Los Angeles Lakers,LAL,34.043,118.267,105,America/New_York,0,LAL,America/Los_Angeles,-3,-3,34.043,118.267,79.379,43.644,2168.135335,-2,1,0,1,1,1,1,1,4,True,3,1,0,2.0,4.0,150.0,150.0,150.0,150.0,150.0,2,2,2168.135335,0,2.0,2,3,1929.261873,0.0,-1,238.873462,1
3,1610612737,21300059,2013-11-05,ATL @ SAC,W,2,2,105,Atlanta Hawks,2013,SAC,ATL,0,0,Atlanta Hawks,ATL,33.757,84.396,Sacramento Kings,SAC,38.58,121.5,100,America/New_York,0,SAC,America/Los_Angeles,-3,-3,38.58,121.5,118.267,34.043,360.614957,5,0,1,0,1,1,1,1,3,False,3,2,1,2.0,4.0,6.0,150.0,150.0,150.0,150.0,2,3,2528.750292,0,3.0,2,3,2614.319194,-1.0,0,-85.568901,0
4,1610612737,21300071,2013-11-07,ATL @ DEN,L,2,3,107,Atlanta Hawks,2013,DEN,ATL,0,0,Atlanta Hawks,ATL,33.757,84.396,Denver Nuggets,DEN,39.749,105.008,109,America/New_York,0,DEN,America/Denver,-2,-2,39.749,105.008,77.021,38.898,1487.653627,-2,0,0,1,0,1,1,1,2,False,3,3,2,2.0,4.0,6.0,8.0,150.0,150.0,150.0,2,3,4016.403919,0,2.0,1,2,1207.771416,0.0,1,2808.632504,1


In [15]:
# new_df['recent_games_diff'].plot()

In [16]:
processed_df.columns

Index(['Team_ID', 'Game_ID', 'GAME_DATE', 'MATCHUP', 'WL', 'W', 'L', 'PTS',
       'team', 'season_start_year', 'opponent_abbreviation',
       'team_abbreviation', 'is_home_game', 'is_covid_bubble', 'city_team_key',
       'abbreviation', 'Latitude', 'Longitude', 'city_team_key_opponent',
       'abbreviation_opponent', 'Latitude_opponent', 'Longitude_opponent',
       'team_abbreviation_opponent', 'PTS_opponent', 'timezone',
       'time_diff_from_et', 'abbreviation_opponent', 'timezone_opponent',
       'time_diff_from_et_opponent', 'game_timezone_vs_ET', 'game_lat',
       'game_lon', 'prev_game_lon', 'prev_game_lat',
       'distance_from_previous_game', 'point_difference', 'is_last_game_home',
       'is_2_games_ago_home', 'is_3_games_ago_home', 'is_4_games_ago_home',
       'is_5_games_ago_home', 'is_6_games_ago_home', 'is_7_games_ago_home',
       'num_away_last_5_games', 'start_of_streak', 'streak_id',
       'streak_counter', 'incoming_away_game_streak', 'days_since_1_games_a

In [17]:
new_df.to_csv("../data/game_data_2.csv", index=False)

In [18]:
# processed_df.to_csv("../data/game_data_1.csv", index=False)

In [19]:
# assert False

In [20]:
# from ydata_profiling import ProfileReport
# profile = ProfileReport(new_df, title="Profiling Report")


In [21]:
# profile

In [22]:
assert False

AssertionError: 

### Scratch
