# Load and Preprocess Data

This notebook performs the following steps:

1. Load raw data  
2. Clean and preprocess data into input features and targets  
3. Save data to be used later

In [None]:
import os
import pickle
import random
from datetime import datetime
from typing import Optional, Union

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import polars as pl
import torch

from data_cleaning import clean_data, optimize_memory_usage, aggregate_data

# Debug switch, off by default.
# NOTE(review): no use of VERBOSE is visible in this part of the notebook — confirm it is still needed.
VERBOSE = False      # for debugging
# Current working directory at the time this cell runs; the data file paths
# in the loading cell are joined onto it, so the notebook must be started
# from the directory that contains data/.
root_dir = os.getcwd()

# set manual custom seed for reproducibility
def set_random_seed(value: int) -> torch.Generator:
    """Seed every RNG used by the notebook and request deterministic cuDNN.

    Seeds PyTorch, NumPy's legacy global RNG, and Python's ``random`` module
    with the same value, then configures the cuDNN backend flags so that
    repeated runs select the same convolution algorithms.

    Args:
        value: Seed passed to ``torch.manual_seed``, ``np.random.seed`` and
            ``random.seed``.

    Returns:
        The ``torch.Generator`` returned by ``torch.manual_seed``, so callers
        can hand it to APIs that accept an explicit generator.
    """
    g = torch.manual_seed(value)
    np.random.seed(value)
    random.seed(value)
    torch.backends.cudnn.deterministic = True
    # With benchmark mode on, cuDNN autotunes and may pick a different kernel
    # from run to run; turn it off so the deterministic flag is meaningful.
    torch.backends.cudnn.benchmark = False
    return g

%load_ext autoreload
%autoreload 2

: 

In [None]:
# Raw CSV inputs live under <root_dir>/data.
data_dir = os.path.join(root_dir, "data")
TRACKING_WEEKS = range(1, 10)   # tracking_week_1.csv ... tracking_week_9.csv

plays_fname = os.path.join(data_dir, "plays.csv")
player_play_fname = os.path.join(data_dir, "player_play.csv")
tracking_fname_list = [
    os.path.join(data_dir, f"tracking_week_{week}.csv") for week in TRACKING_WEEKS
]

# Aggregate plays.csv, player_play.csv, and the weekly tracking files into one dataframe.
df = aggregate_data(plays_fname, player_play_fname, tracking_fname_list)

# Preprocess and clean the data
df_clean = clean_data(df)

# Optimize memory usage of the cleaned dataframe
df_opt = optimize_memory_usage(df_clean)

# Preview the first rows; as the cell's last expression it is rendered with
# the notebook's rich display instead of plain-text print output.
df_opt.head()

INFO: Aggregating data from play data, tracking data, and players data into a master dataframe...
