In [None]:
import json
import math
from typing import Optional

import pandas as pd
from shapely.geometry import MultiPoint

# Expected file structure:
# wisd hackathon 2023 (root folder)
# - /games (games downloaded from AWS)
# -- /gameid
# - /metadata (metadata downloaded from AWS)
# - this notebook
# All paths used below are relative to the root folder.


In [None]:
# ids of every game processed by this notebook
game_ids = [
    '0042100301', '0042100304', '0042100307',
    '0042100313', '0042100401', '0042100404',
    '0042100302', '0042100305', '0042100311',
    '0042100314', '0042100402', '0042100405',
    '0042100303', '0042100306', '0042100312',
    '0042100315', '0042100403', '0042100406',
]

# game id -> DataFrame holding only that game's "SHOT" events.
# Each frame comes from the pyball csv that get_pyball.ipynb generated.
shot_df_dict = {}
for game_id in game_ids:
    # the first csv column is used as the index
    joint_df = pd.read_csv(f'./games/{game_id}/{game_id}_pyball.csv',
                           index_col=0)
    shot_df_dict[game_id] = joint_df.loc[joint_df['eventType'].eq("SHOT")]


In [None]:
# Single-game walkthrough: the shot events from joint.csv and the tracking
# frames of game 0042100301.
df = pd.read_csv('joint.csv', index_col=0)
shot_df = df.loc[df['eventType'].eq("SHOT")]
print(len(shot_df))

tracking_df = pd.read_json(
    "games/0042100301/0042100301_tracking.jsonl", lines=True)
print(len(tracking_df))
print(len(shot_df))

# Inner-join the events and the tracking rows on wallClock. Columns that
# exist in both frames get the _event / _tracking suffix.
shots_tracking_df = shot_df.merge(
    tracking_df, how='inner', on='wallClock', suffixes=('_event', '_tracking'))


In [None]:
# convex hull area of each team's player positions

def calc_hull(row, team):
    """Return the area of the convex hull around one team's players.

    row:  a merged shot/tracking row; reads row[f'{team}Players_tracking'],
          an iterable of entries that each carry an 'xyz' coordinate.
    team: prefix of the column to read, e.g. 'home' or 'away'.
    """
    positions = [entry['xyz'] for entry in row[f'{team}Players_tracking']]
    return MultiPoint(positions).convex_hull.area


# One hull-area column per team; `team` is forwarded to calc_hull by apply.
shots_tracking_df['home_hull'] = shots_tracking_df.apply(
    calc_hull, axis=1, team='home')
shots_tracking_df['away_hull'] = shots_tracking_df.apply(
    calc_hull, axis=1, team='away')

# show both hull columns for every shot
print(shots_tracking_df[['home_hull', 'away_hull']].to_string())


In [None]:
# Build game id -> merged shots + tracking frame for every game.
# The inner join on wallClock keeps only the tracking frames that line up
# with a shot event from shot_df_dict.
shots_tracking_df_dict = {}
for game_id in game_ids:
    tracking_df = pd.read_json(f"games/{game_id}/{game_id}_tracking.jsonl",
                               lines=True)

    # columns present in both frames get the _event / _tracking suffix
    shots_tracking_df_dict[game_id] = shot_df_dict[game_id].merge(
        tracking_df, how='inner', on='wallClock', suffixes=('_event', '_tracking'))

    # optional export, used to create csvs for plotting players:
    # shots_tracking_df_dict[game_id].to_csv(f'./games/{game_id}/{game_id}_e_t.csv')


In [None]:
def distance3d(p1: list, p2: list) -> float:
    """Euclidean distance between two points, each given as [x, y, z]."""
    ax, ay, az = p1
    bx, by, bz = p2
    squared_sum = (math.pow(bx - ax, 2)
                   + math.pow(by - ay, 2)
                   + math.pow(bz - az, 2))
    return math.sqrt(squared_sum)


In [None]:
def away_or_home(row):
    """Tell which side took the shot.

    Returns 'home' when row['playerId'] is contained in
    row['homePlayers_event'], otherwise 'away' when it is contained in
    row['awayPlayers_event']. If it is in neither, prints an error line with
    the row's EVENTNUM and returns None.
    """
    shooter = row['playerId']
    if shooter in row['homePlayers_event']:
        return 'home'
    if shooter in row['awayPlayers_event']:
        return 'away'
    print(f'ERROR: {row["EVENTNUM"]}')
    return None


def get_player_coords(row):
    """Look up the shooter's 'xyz' coordinate in the tracking data.

    Reads row['shotmaker_team'] to choose the '<team>Players_tracking' entry
    list and returns the 'xyz' of the first entry whose 'playerId' equals
    row['playerId'].

    Returns None when the team is neither 'home' nor 'away' (away_or_home
    yields None for a shooter it cannot place) or when no entry matches.
    """
    team = row['shotmaker_team']
    if team not in ('home', 'away'):
        # An unresolved team would otherwise raise KeyError on
        # 'NonePlayers_tracking' and abort the whole DataFrame.apply.
        return None
    for player in row[f'{team}Players_tracking']:
        if player['playerId'] == row['playerId']:
            return player['xyz']
    return None


def get_defender_distances(row):
    """Return the distance from the shooter to each opposing player.

    Reads row['shotmaker_team'] and row['shotmaker_coord'], then measures
    distance3d from the shooter to the 'xyz' of every entry in the other
    team's '<team>Players_tracking' list.

    Returns a list of floats, one per defender, in list order. Prints an
    error line with the row's EVENTNUM and returns None when the team is
    neither 'home' nor 'away', or when the shooter's coordinate is missing.
    """
    team = row['shotmaker_team']
    shotmaker_coord = row['shotmaker_coord']

    # the defenders are the players of the team that did NOT shoot
    if team == 'away':
        defenders_key = 'homePlayers_tracking'
    elif team == 'home':
        defenders_key = 'awayPlayers_tracking'
    else:
        print(f'ERROR: {row["EVENTNUM"]}')
        return None

    # Missing coordinate: None, or a bare float (NaN) if pandas coerced the
    # column. Without this guard distance3d fails while unpacking it.
    if shotmaker_coord is None or isinstance(shotmaker_coord, float):
        print(f'ERROR: {row["EVENTNUM"]}')
        return None

    return [distance3d(shotmaker_coord, entry['xyz'])
            for entry in row[defenders_key]]


def _team_hull_area(row, team):
    # area of the convex hull around the 'xyz' points of one team's players
    coords = [p['xyz'] for p in row[f'{team}Players_tracking']]
    return MultiPoint(coords).convex_hull.area


def calc_hull(row, team=None):
    """Compute convex-hull areas of the players' positions for one row.

    row:  a merged shot/tracking row providing 'homePlayers_tracking' and
          'awayPlayers_tracking', each an iterable of entries with 'xyz'.
    team: optional, 'home' or 'away'. When given, only that team's hull
          area is returned as a number. This keeps the two-argument call
          form from the earlier cell working after this definition has
          replaced the earlier one.

    Without `team`, returns a copy of `row` with 'home_hull' and 'away_hull'
    added, suitable for DataFrame.apply(calc_hull, axis=1).
    """
    if team is not None:
        return _team_hull_area(row, team)

    # Work on a copy: pandas does not support functions that mutate the
    # object handed to them by apply.
    updated = row.copy()
    updated['home_hull'] = _team_hull_area(row, 'home')
    updated['away_hull'] = _team_hull_area(row, 'away')
    return updated


# Add the derived columns to every game's shots+tracking frame.
# The order matters:
#   1. hull areas for both teams
#   2. whether the shotmaker was on the home or the away team
#   3. the shotmaker's coordinates (needs 2)
#   4. the distance from the shotmaker to all of the defenders (needs 2 and 3)
for game_id in game_ids:
    shots_tracking_df_dict[game_id] = (
        shots_tracking_df_dict[game_id]
        .apply(calc_hull, axis=1)
        .assign(
            shotmaker_team=lambda frame: frame.apply(away_or_home, axis=1),
            shotmaker_coord=lambda frame: frame.apply(get_player_coords, axis=1),
            defense_distance=lambda frame: frame.apply(get_defender_distances, axis=1),
        )
    )


In [None]:
# Accumulator across ALL games: each list receives one per-game average.
#   made_dist / miss_dist: average distance from the shooter to the closest
#                          defensive player on makes / misses
#   diff:                  made_dist minus miss_dist
#   made_off_hull / made_def_hull: avg hull area of the offensive / defensive
#                                  team on makes
#   miss_off_hull / miss_def_hull: avg hull area of the offensive / defensive
#                                  team on misses
# The key order is relied on when the values are turned into a table row.
global_dict = {
    'made_dist': [],
    'miss_dist': [],
    'diff': [],
    'made_off_hull': [],
    'made_def_hull': [],
    'miss_off_hull': [],
    'miss_def_hull': [],
}


def shot_made(row) -> Optional[bool]:
    """Determine whether the shot in `row` went in.

    Requires row['shotmaker_team']. The shooting team's description column
    ('VISITORDESCRIPTION' for 'away', 'HOMEDESCRIPTION' for 'home') is
    searched for the markers "MISS" and "PTS".

    Returns False if the description contains "MISS", True if it contains
    "PTS" (and not "MISS"), and None when the outcome cannot be determined:
    the team is unknown (an error line with EVENTNUM is printed), the
    description is not a string, or it contains neither marker.
    """
    team = row['shotmaker_team']
    if team == "away":
        description = row["VISITORDESCRIPTION"]
    elif team == "home":
        description = row["HOMEDESCRIPTION"]
    else:
        print(f"ERROR: {row['EVENTNUM']}")
        return None

    if not isinstance(description, str):
        return None
    if "MISS" in description:
        # player missed the shot
        return False
    if "PTS" in description:
        return True
    return None


def calculate_aggregates(row, agg_dict):
    """Append this shot's numbers to the made_* or miss_* lists of `agg_dict`.

    For a made shot the closest-defender distance goes to 'made_dist', the
    shooting team's hull area to 'made_off_hull' and the other team's to
    'made_def_hull'; a missed shot fills the matching 'miss_*' lists.

    Rows are skipped entirely (nothing is appended) when shot_made cannot
    tell the outcome or when row['defense_distance'] is not a non-empty
    list. Such rows must not be counted as misses, and min() would fail on
    a missing distance list.
    """
    made = shot_made(row)
    if made is None:
        return

    distances = row['defense_distance']
    if not isinstance(distances, (list, tuple)) or len(distances) == 0:
        return

    # shot_made only returns a bool when the team is 'home' or 'away'
    offense = row['shotmaker_team']
    defense = 'home' if offense == 'away' else 'away'
    outcome = 'made' if made else 'miss'

    # distance between the shotmaker and the closest defender
    agg_dict[f'{outcome}_dist'].append(min(distances))
    agg_dict[f'{outcome}_off_hull'].append(row[f'{offense}_hull'])
    agg_dict[f'{outcome}_def_hull'].append(row[f'{defense}_hull'])


def _mean_or_nan(values):
    # average of the non-NaN numbers in `values`; NaN when there are none,
    # so a game without makes (or misses) no longer raises ZeroDivisionError
    usable = [v for v in values if not math.isnan(v)]
    return sum(usable) / len(usable) if usable else float('nan')


avgs = []  # one row per game plus a final "AVG" row; used to build avg_df

for game_id in game_ids:
    # FOR THIS GAME ONLY; 'diff' holds a placeholder that is replaced below
    game_stats = {'made_dist': [], 'miss_dist': [], 'diff': [0],
                  'made_off_hull': [], 'made_def_hull': [],
                  'miss_off_hull': [], 'miss_def_hull': []}
    shots_tracking_df_dict[game_id].apply(
        lambda row: calculate_aggregates(row, game_stats), axis=1)
    for k, v in game_stats.items():
        if k == 'diff':
            continue
        game_stats[k] = _mean_or_nan(v)  # calculate avg of this game's stats
        global_dict[k].append(game_stats[k])
    game_stats['diff'] = game_stats['made_dist'] - game_stats['miss_dist']
    global_dict['diff'].append(game_stats['diff'])
    this_avgs = list(game_stats.values())
    this_avgs.insert(0, game_id)
    avgs.append(this_avgs)

# The "AVG" row is the unweighted mean of the per-game averages. Games whose
# value is NaN are left out of that mean. After this loop global_dict holds
# numbers instead of lists.
for k, v in global_dict.items():
    global_dict[k] = _mean_or_nan(v)
global_avg = list(global_dict.values())
global_avg.insert(0, "AVG")
avgs.append(global_avg)

avg_df = pd.DataFrame(avgs, columns=['game_id', 'made_dist', 'miss_dist', 'diff', 'made_off_hull',
                      'made_def_hull',  'miss_off_hull', 'miss_def_hull'])
avg_df.to_csv('avg.csv')

print(avg_df.to_string())


In [None]:
# Used to sort plays by the greatest hull-area difference between the
# shotmaker's team and the other team.
def hull_distances(row):
    """Return the shooting team's hull area minus the other team's.

    Reads row['shotmaker_team'], row['home_hull'] and row['away_hull'].
    When the team is neither 'home' nor 'away', prints an error line with
    the row's EVENTNUM and returns None.
    """
    side = row['shotmaker_team']
    if side == 'home':
        offense_col, defense_col = 'home_hull', 'away_hull'
    elif side == 'away':
        offense_col, defense_col = 'away_hull', 'home_hull'
    else:
        print(f"ERROR: {row['EVENTNUM']}")
        return None
    return row[offense_col] - row[defense_col]

for game_id in game_ids:
    # add the per-shot hull difference, then put the largest differences first
    shots_tracking_df_dict[game_id] = (
        shots_tracking_df_dict[game_id]
        .assign(hull_diff=lambda frame: frame.apply(hull_distances, axis=1))
        .sort_values(by='hull_diff', ascending=False)
    )
