In [1]:
# SCRIPT FOR GENERATING FLAT DATA, SAVING TO CSV, fnn_data.csv
import pandas as pd

target = ['wl_home']  # The target variable

label_data = ['game_id', 'num_matchup_history', 'num_team_history', 'num_opp_history', 'team_id_home', 'team_id_away', 'game_date', 'team_name_home', 'team_name_away', 'season_type']

# Define the features
game_features = ['is_regular_season', 'is_playoffs', 'is_pre_season']

historical_features = [
    'days_ago', 'games_ago', 'is_home_team', 'is_regular_season_matchup', 'is_playoffs_matchup', 'is_pre_season_matchup', 'wl',
    'pts_for', 'fg_pct_for', 'fg3_pct_for', 'fg3m_for', 'ft_pct_for', 'ftm_for', 'reb_for', 'ast_for', 'stl_for', 'blk_for', 'tov_for',
    'pts_against', 'fg_pct_against', 'fg3_pct_against', 'fg3m_against', 'ft_pct_against', 'ftm_against', 'reb_against', 'ast_against',
    'stl_against', 'blk_against', 'tov_against'
]

files = [
    'matchup_history.csv',
    'team_game_history.csv',
    'opponent_game_history.csv'
]

prefixes = ['m_', 't_', 'o_']  # Prefixes for matchup, team, and opponent histories

# Load main game data
games = pd.read_csv('games.csv')

print("Loading data...")
matchup_file = pd.read_csv('matchup_history.csv')
matchup_file = matchup_file[matchup_file['games_ago'] <= 5] # limit to 5 most recent games, some have more. 
team_file = pd.read_csv('team_game_history.csv')
opponent_file = pd.read_csv('opponent_game_history.csv')

column_headers = label_data + game_features
# use prefixes for sequence data to add to historical data columns
for prefix in prefixes:
    column_headers += [f"{prefix}{col}" for col in historical_features]

# add target in last column
column_headers += target

print("Number of columns:", len(column_headers))
print("Column headers:", column_headers)


def get_averaged_history_from_file(game_id: int, file: pd.DataFrame, features_to_average: list) -> list:
    # Get the historical data for this game
    matchup_data = file[file['game_id'] == game_id]
    # Calculate the mean of the historical data
    averaged_data = matchup_data[features_to_average].mean().values
    return averaged_data

print("Processing data...")
# Convert each game row into a flat array
flat_records = []
# add header
flat_records.append(column_headers)
for i, row in games.iterrows():
    print(f"Processing game {i + 1}/{len(games)}")
    flat_row = [row[label] for label in label_data] + [row[feature] for feature in game_features]
    game_id = row['game_id']

    # Get the historical data for this game
    matchup_data = get_averaged_history_from_file(game_id, matchup_file, historical_features)
    team_data = get_averaged_history_from_file(game_id, team_file, historical_features)
    opponent_data = get_averaged_history_from_file(game_id, opponent_file, historical_features)

    flat_row += list(matchup_data) + list(team_data) + list(opponent_data) + [row['wl_home']]
    flat_records.append(flat_row)

print("Number of rows:", len(flat_records))
print("Number of columns:", len(flat_records[0]))
print("Sample row:", flat_records[1])

# save to csv
print("Saving data to file fnn_data.csv ...")
with open('fnn_data.csv', 'w') as file:
    for row in flat_records:
        file.write(','.join(map(str, row)) + '\n')
print("Data saved successfully!")




Loading data...
Number of columns: 101
Column headers: ['game_id', 'num_matchup_history', 'num_team_history', 'num_opp_history', 'team_id_home', 'team_id_away', 'game_date', 'team_name_home', 'team_name_away', 'season_type', 'is_regular_season', 'is_playoffs', 'is_pre_season', 'm_days_ago', 'm_games_ago', 'm_is_home_team', 'm_is_regular_season_matchup', 'm_is_playoffs_matchup', 'm_is_pre_season_matchup', 'm_wl', 'm_pts_for', 'm_fg_pct_for', 'm_fg3_pct_for', 'm_fg3m_for', 'm_ft_pct_for', 'm_ftm_for', 'm_reb_for', 'm_ast_for', 'm_stl_for', 'm_blk_for', 'm_tov_for', 'm_pts_against', 'm_fg_pct_against', 'm_fg3_pct_against', 'm_fg3m_against', 'm_ft_pct_against', 'm_ftm_against', 'm_reb_against', 'm_ast_against', 'm_stl_against', 'm_blk_against', 'm_tov_against', 't_days_ago', 't_games_ago', 't_is_home_team', 't_is_regular_season_matchup', 't_is_playoffs_matchup', 't_is_pre_season_matchup', 't_wl', 't_pts_for', 't_fg_pct_for', 't_fg3_pct_for', 't_fg3m_for', 't_ft_pct_for', 't_ftm_for', 't_re

In [1]:
# SCRIPT FOR GENERATING THE GAME OBJECTS FOR RNN. SAVES TO A PICKLE FILE, rnn_game_objects.pkl
import pandas as pd
import pickle

target = ['wl_home']  # The target variable

# Define the features
game_features = ['is_regular_season', 'is_playoffs', 'is_pre_season']

historical_features = [
    'days_ago', 'games_ago', 'is_home_team', 'is_regular_season_matchup', 'is_playoffs_matchup', 'is_pre_season_matchup', 'wl',
    'pts_for', 'fg_pct_for', 'fg3_pct_for', 'fg3m_for', 'ft_pct_for', 'ftm_for', 'reb_for', 'ast_for', 'stl_for', 'blk_for', 'tov_for',
    'pts_against', 'fg_pct_against', 'fg3_pct_against', 'fg3m_against', 'ft_pct_against', 'ftm_against', 'reb_against', 'ast_against',
    'stl_against', 'blk_against', 'tov_against'
]

files = [
    'matchup_history.csv',
    'team_game_history.csv',
    'opponent_game_history.csv'
]

# Load main game data
games = pd.read_csv('games.csv')

# get all rows that match the game_id
def process_historical_file(file, prefix):
    # Load the file
    df = pd.read_csv(file)
    # for matchup history, filter where games_ago is less than or equal to 5
    if file == 'matchup_history.csv':
        df = df[df['games_ago'] <= 5]
    
# Convert each game row into a Python object
class GameData:
    def __init__(self, game_id, game_date, is_regular_season, is_playoffs, is_pre_season, matchups, team_history, opponent_history, target):
        # label data
        self.game_id = game_id
        self.game_date = game_date
        # game categorical features
        self.is_regular_season = is_regular_season
        self.is_playoffs = is_playoffs
        self.is_pre_season = is_pre_season
        # sequential arrays
        self.matchups = matchups                    # 5 most recent matchups
        self.team_history = team_history            # 10 most recent team games
        self.opponent_history = opponent_history    # 10 most recent opponent games
        # target variable
        self.target = target

print("Loading data...")
matchup_file = pd.read_csv('matchup_history.csv')
matchup_file = matchup_file[matchup_file['games_ago'] <= 5] # limit to 5 most recent games, some have more. 
team_file = pd.read_csv('team_game_history.csv')
opponent_file = pd.read_csv('opponent_game_history.csv')

# return array of rows from file that match the game_id 
def get_history_from_file(game_id, file, features_to_keep):
    data = file[file['game_id'] == game_id] # get all rows that match the game_id
    data = data[features_to_keep] # keep only the features we want
    return data


game_objects = []
for i, row in games.iterrows():
    print(f"Processing game {i + 1}/{len(games)}")
    game_id = row['game_id']

    matchups = get_history_from_file(game_id, matchup_file, historical_features)
    team_history = get_history_from_file(game_id, team_file, historical_features)
    opponent_history = get_history_from_file(game_id, opponent_file, historical_features)

    game_data = GameData(game_id, row['game_date'], row['is_regular_season'], row['is_playoffs'], row['is_pre_season'], matchups, team_history, opponent_history, row['wl_home'])
    game_objects.append(game_data)

# print first row for verification
game : GameData = game_objects[0]
print("First row of game data:")
print("game_id:", game.game_id)
print("game_date:", game.game_date)
print("is_regular_season:", game.is_regular_season)
print("is_playoffs:", game.is_playoffs)
print("is_pre_season:", game.is_pre_season)
print("matchups:", game.matchups)
print("team_history:", game.team_history)
print("opponent_history:", game.opponent_history)

# save the game objects to a pickle file
with open('rnn_game_objects.pkl', 'wb') as f:
    pickle.dump(game_objects, f)


Loading data...
Processing game 1/28803
Processing game 2/28803
Processing game 3/28803
Processing game 4/28803
Processing game 5/28803
Processing game 6/28803
Processing game 7/28803
Processing game 8/28803
Processing game 9/28803
Processing game 10/28803
Processing game 11/28803
Processing game 12/28803
Processing game 13/28803
Processing game 14/28803
Processing game 15/28803
Processing game 16/28803
Processing game 17/28803
Processing game 18/28803
Processing game 19/28803
Processing game 20/28803
Processing game 21/28803
Processing game 22/28803
Processing game 23/28803
Processing game 24/28803
Processing game 25/28803
Processing game 26/28803
Processing game 27/28803
Processing game 28/28803
Processing game 29/28803
Processing game 30/28803
Processing game 31/28803
Processing game 32/28803
Processing game 33/28803
Processing game 34/28803
Processing game 35/28803
Processing game 36/28803
Processing game 37/28803
Processing game 38/28803
Processing game 39/28803
Processing game 40

In [None]:
# NORMALIZE THE GAME DATA
GLOBAL_FEATURE_MIN_MAX = {
    'is_regular_season' : [0, 1],
    'is_playoffs' : [0, 1],
    'is_pre_season' : [0, 1],
    'days_ago' : [1, 1825],
    'days_ago' : [1, 1825],
    'games_ago' : [1, 10],
    'is_home_team' : [0, 1],
    'is_regular_season_matchup' : [0, 1],
    'is_playoffs_matchup' : [0, 1],
    'is_pre_season_matchup' : [0, 1],
    'wl' : [0, 1],
    'pts_for' : [49, 176],
    'fg_pct_for' : [0.234, 0.693],
    'fg3_pct_for' : [0, 1],
    'fg3m_for' : [0, 29],
    'ft_pct_for' : [0.143, 1],
    'ftm_for' : [0, 52],
    'reb_for' : [0, 81],
    'ast_for' : [4, 50],
    'stl_for' : [0, 27],
    'blk_for' : [0, 23],
    'tov_for' : [0, 38],
    'pts_against' : [49, 176],
    'fg_pct_against' : [0.234, 0.693],
    'fg3_pct_against' : [0, 1],
    'fg3m_against' : [0, 29],
    'ft_pct_against' : [0.143, 1],
    'ftm_against' : [0, 52],
    'reb_against' : [0, 81],
    'ast_against' : [4, 50],
    'stl_against' : [0, 27],
    'blk_against' : [0, 23],
    'tov_against' : [0, 38]
}


# load in file from pickle
import pickle
with open('rnn_game_objects.pkl', 'rb') as f:
    game_objects = pickle.load(f)

# print first row for verification
game : GameData = game_objects[0]
print("First row of game data:")
print("game_id:", game.game_id)
print("game_date:", game.game_date)
print("is_regular_season:", game.is_regular_season)
print("is_playoffs:", game.is_playoffs)
print("is_pre_season:", game.is_pre_season)
print("matchups:", game.matchups)
# print("team_history:", game.team_history)
# print("opponent_history:", game.opponent_history)

import pandas as pd

# Normalize all rows
# (x - min) / (max - min)
def normalize_game_data(game_objects, global_feature_min_max, historical_features):
    # Iterate through each game object and normalize the numeric features
    for n, game in enumerate(game_objects):  # Use enumerate to get index and object
        print(f"Normalizing game {n + 1}/{len(game_objects)}")

        # loop thorugh matchups
        for i, row in game.matchups.iterrows():
            for key, value in row.items():
                if key in global_feature_min_max:
                    min_val = global_feature_min_max[key][0]
                    max_val = global_feature_min_max[key][1]
                    game.matchups.loc[i, key] = (value - min_val) / (max_val - min_val)
        # loop through team history
        for i, row in game.team_history.iterrows():
            for key, value in row.items():
                if key in global_feature_min_max:
                    min_val = global_feature_min_max[key][0]
                    max_val = global_feature_min_max[key][1]
                    game.team_history.loc[i, key] = (value - min_val) / (max_val - min_val)
        # loop through opponent history
        for i, row in game.opponent_history.iterrows():
            for key, value in row.items():
                if key in global_feature_min_max:
                    min_val = global_feature_min_max[key][0]
                    max_val = global_feature_min_max[key][1]
                    game.opponent_history.loc[i, key] = (value - min_val) / (max_val - min_val)
                            
    return game_objects


# Normalize the game data
game_objects = normalize_game_data(game_objects, GLOBAL_FEATURE_MIN_MAX, historical_features)

# print first row for verification
game : GameData = game_objects[0]
print("First row of game data:")
print("game_id:", game.game_id)
print("game_date:", game.game_date)
print("is_regular_season:", game.is_regular_season)
print("is_playoffs:", game.is_playoffs)
print("is_pre_season:", game.is_pre_season)
print("matchups:", game.matchups)


norm_game_objects = game_objects
with open('norm_rnn_game_objects.pkl', 'wb') as f:
    pickle.dump(norm_game_objects, f)