In [1]:
import pandas as pd
import xgboost as xgb
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
import numpy as np
from sklearn.preprocessing import StandardScaler
from tqdm import tqdm  # For progress tracking

In [2]:
features = [
    #  'START_POSITION',
    "PCT_FGA_2PT",
    "PCT_AST_2PM",
    "PCT_PTS_2PT",
    "AST_PCT",
    "PCT_FG3M",
    "PCT_BLKA",
    "PCT_BLK",
    "FG3_PCT",
    "PCT_PTS",
    "PCT_FGM",
    "PCT_REB",
    "PCT_FGA",
    "E_USG_PCT",
    "REB_PCT",
    "PCT_PTS_OFF_TOV",
    "PCT_DREB",
    "OPP_OREB_PCT",
    "PCT_UAST_3PM",
    "PCT_TOV",
    "DREB_PCT",
    "PCT_FTM",
    "OPP_TOV_PCT",
    "PCT_UAST_2PM",
    "PCT_AST_3PM",
    "USG_PCT",
    "PCT_AST",
    "FG_PCT",
    "EFG_PCT",
    "TS_PCT",
    "PCT_OREB",
    "PCT_PTS_2PT_MR",
    "PCT_PF",
    "FT_PCT",
    "PCT_PTS_PAINT",
    "PCT_PTS_FT",
    "PCT_PFD",
    "PCT_FGA_3PT",
    "OPP_EFG_PCT",
    "CFG_PCT",
    "TM_TOV_PCT",
    "PCT_UAST_FGM",
    "PCT_PTS_3PT",
    "OREB_PCT",
    "PCT_PTS_FB",
    "PCT_AST_FGM",
    "UFG_PCT",
    "PCT_FG3A",
    "PCT_STL",
    "DFG_PCT",
    "OREB",
    "AST",
    "REB",
    "DFGA",
    "SAST",
    "OPP_PTS_2ND_CHANCE",
    "PFD",
    "TO",
    "FG3A",
    "STL",
    "POSS",
    "PASS",
    "UFGM",
    "FG3M",
    "PTS",
    "UFGA",
    "DRBC",
    "OPP_PTS_PAINT",
    "FTM",
    "ORBC",
    "BLKA",
    "PTS_FB",
    "CFGA",
    "PTS_PAINT",
    "TCHS",
    "CFGM",
    "PLUS_MINUS",
    "DFGM",
    "OPP_PTS_OFF_TOV",
    "PTS_OFF_TOV",
    "FGA",
    "FTA",
    "PTS_2ND_CHANCE",
    "FGM",
    "PF",
    "DREB",
    "BLK",
    "RBC",
    "OPP_PTS_FB",
    "FTAST",
    "E_OFF_RATING",
    "OFF_RATING",
    "E_NET_RATING",
    "E_DEF_RATING",
    "NET_RATING",
    "DEF_RATING",
    "E_PACE",
    "AST_RATIO",
    "DIST",
    "AST_TOV",
    "FTA_RATE",
    "OPP_FTA_RATE",
    "MIN",
    "PACE_PER40",
    "PACE",
    "PIE",
]

In [3]:
def win_loss_error_rate(test_predictions, test_labels):
    win_loss_predictions = np.where(test_predictions > 0, 1, 0)
    win_loss_truth = np.where(test_labels.to_numpy() > 0, 1, 0)
    return abs(win_loss_predictions - win_loss_truth).mean()

In [4]:
# Load the player data
player_data = pd.read_csv("data/player_data.csv")

In [5]:
# Define parameters
player_count = 5  # Number of players to consider per team
target = "PLUS_MINUS"
game_ids = list(set(player_data.GAME_ID))

# Placeholder for processed rows
data_rows = []

# Process each game
for game_id in tqdm(game_ids, desc="Processing games"):
    # Filter data for the current game
    game_data = player_data[player_data.GAME_ID == game_id]

    # Identify the two teams
    team_abbr = game_data["TEAM_ABBREVIATION"].unique()
    if len(team_abbr) != 2:
        continue  # Skip games without exactly two teams

    # Get top players based on minutes played
    team_1_data = (
        game_data[game_data["TEAM_ABBREVIATION"] == team_abbr[0]]
        .sort_values(by="MIN", ascending=False)
        .head(player_count)
    )
    team_2_data = (
        game_data[game_data["TEAM_ABBREVIATION"] == team_abbr[1]]
        .sort_values(by="MIN", ascending=False)
        .head(player_count)
    )

    # Ensure both teams have the required number of players
    if len(team_1_data) < player_count or len(team_2_data) < player_count:
        continue

    # Compute the target (plus-minus)
    team_1_plus_minus = team_1_data["PTS"].sum() - team_2_data["PTS"].sum()

    # Flatten the features for both teams
    team_1_features = team_1_data[features].fillna(0).values.flatten()
    team_2_features = team_2_data[features].fillna(0).values.flatten()

    # Generate column names dynamically
    feature_columns = [
        f"team1_player{i+1}_{feature}"
        for i in range(player_count)
        for feature in features
    ] + [
        f"team2_player{i+1}_{feature}"
        for i in range(player_count)
        for feature in features
    ]

    # Create original row
    data_rows.append(
        {
            "features": np.concatenate([team_1_features, team_2_features]),
            "plus_minus": team_1_plus_minus,
        }
    )

    # Create reordered row
    data_rows.append(
        {
            "features": np.concatenate([team_2_features, team_1_features]),
            "plus_minus": -team_1_plus_minus,
        }
    )

# Convert rows to DataFrame
data_df = pd.DataFrame(data_rows)

data_df = pd.concat(
    [
        pd.DataFrame(data_df["features"].tolist(), columns=feature_columns),
        data_df[["plus_minus"]],
    ],
    axis=1,
)

data_df.fillna(0, inplace=True)

# Scaling and splitting the data
X = data_df.drop(columns=["plus_minus"])
y = data_df["plus_minus"]

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

X_train, X_test, y_train, y_test = train_test_split(
    X_scaled, y, test_size=0.2, random_state=42
)

# Final output
final_data = pd.concat(
    [pd.DataFrame(X_scaled, columns=X.columns), y.reset_index(drop=True)], axis=1
)

Processing games: 100%|██████████| 9830/9830 [00:08<00:00, 1166.13it/s]


In [6]:
# Train an XGBoost model
model = xgb.XGBRegressor(
    booster="gbtree",
    tree_method="hist",
    learning_rate=0.01,
    n_estimators=1000,
    objective="reg:squarederror",
    eval_metric="rmse",
    early_stopping_rounds=50,
)

# Train the model
eval_set = [(X_train, y_train), (X_test, y_test)]
model = model.fit(
    X_train,
    y_train,
    eval_set=eval_set,
    verbose=True,
)

[0]	validation_0-rmse:16.02219	validation_1-rmse:16.35190
[1]	validation_0-rmse:15.93993	validation_1-rmse:16.27437
[2]	validation_0-rmse:15.85832	validation_1-rmse:16.19732
[3]	validation_0-rmse:15.77837	validation_1-rmse:16.12203
[4]	validation_0-rmse:15.69923	validation_1-rmse:16.04734
[5]	validation_0-rmse:15.62087	validation_1-rmse:15.97246
[6]	validation_0-rmse:15.54389	validation_1-rmse:15.89863
[7]	validation_0-rmse:15.46732	validation_1-rmse:15.82705
[8]	validation_0-rmse:15.39218	validation_1-rmse:15.75558
[9]	validation_0-rmse:15.31793	validation_1-rmse:15.68576
[10]	validation_0-rmse:15.24442	validation_1-rmse:15.61692
[11]	validation_0-rmse:15.17201	validation_1-rmse:15.54825
[12]	validation_0-rmse:15.10016	validation_1-rmse:15.48056
[13]	validation_0-rmse:15.02919	validation_1-rmse:15.41275
[14]	validation_0-rmse:14.95848	validation_1-rmse:15.34589
[15]	validation_0-rmse:14.88886	validation_1-rmse:15.28210
[16]	validation_0-rmse:14.82023	validation_1-rmse:15.22051
[17]	va

KeyboardInterrupt: 

In [None]:
# Print feature importances
importances = model.feature_importances_
feature_importance_df = pd.DataFrame(
    {"Feature": X.columns, "Importance": importances}
).sort_values(by="Importance", ascending=False)

print("Top 10 Most Important Features:")
print(feature_importance_df.head(10))

# Evaluate model performance
train_rmse = np.sqrt(((model.predict(X_train) - y_train) ** 2).mean())
test_rmse = np.sqrt(((model.predict(X_test) - y_test) ** 2).mean())

print(f"Train RMSE: {train_rmse}")
print(f"Test RMSE: {test_rmse}")

# Evaluate win-loss error rate
train_predictions = model.predict(X_train)
test_predictions = model.predict(X_test)
train_error_rate = win_loss_error_rate(train_predictions, y_train)
test_error_rate = win_loss_error_rate(test_predictions, y_test)

print(f"Train Win-Loss Error Rate: {train_error_rate}")
print(f"Test Win-Loss Error Rate: {test_error_rate}")

# R2 score
train_r2 = r2_score(y_train, train_predictions)
test_r2 = r2_score(y_test, test_predictions)

print(f"Train R2 Score: {train_r2}")
print(f"Test R2 Score: {test_r2}")

In [None]:
model.save_model(f"models/{player_count}_player_model.json")

In [7]:
model_5 = xgb.XGBRegressor()
model_5.load_model("models/5_player_model.json")

In [16]:
import pandas as pd
import numpy as np
import torch
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from tqdm import tqdm

# Load the player data
player_data = pd.read_csv("data/player_data.csv")

# Define parameters
player_count = 5  # Number of players per team
target = "PLUS_MINUS"
game_ids = list(set(player_data.GAME_ID))

# Placeholder for processed rows
data_rows = []

# Process each game
for game_id in tqdm(game_ids, desc="Processing games"):
    # Filter data for the current game
    game_data = player_data[player_data.GAME_ID == game_id]

    # Identify the two teams
    team_abbr = game_data["TEAM_ABBREVIATION"].unique()
    if len(team_abbr) != 2:
        continue  # Skip games without exactly two teams

    # Get top players based on minutes played
    team_1_data = (
        game_data[game_data["TEAM_ABBREVIATION"] == team_abbr[0]]
        .sort_values(by="MIN", ascending=False)
        .head(player_count)
    )
    team_2_data = (
        game_data[game_data["TEAM_ABBREVIATION"] == team_abbr[1]]
        .sort_values(by="MIN", ascending=False)
        .head(player_count)
    )

    # Ensure both teams have the required number of players
    if len(team_1_data) < player_count or len(team_2_data) < player_count:
        continue

    # Compute the target (plus-minus)
    team_1_plus_minus = team_1_data["PTS"].sum() - team_2_data["PTS"].sum()

    # Flatten the features for both teams
    team_1_features = team_1_data[features].fillna(0).values
    team_2_features = team_2_data[features].fillna(0).values

    # Append the data
    data_rows.append(
        {
            "team_1_features": team_1_features,
            "team_2_features": team_2_features,
            "plus_minus": team_1_plus_minus,
        }
    )

# Convert to DataFrame
data_df = pd.DataFrame(data_rows)

# Combine team features into sequences
data_df["features"] = data_df.apply(
    lambda row: np.concatenate(
        [row["team_1_features"], row["team_2_features"]], axis=0
    ),
    axis=1,
)

# Extract features and target
X = np.stack(data_df["features"].values)
y = data_df["plus_minus"].values

# Scale features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X.reshape(-1, X.shape[-1])).reshape(X.shape)

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled, y, test_size=0.2, random_state=42
)

# Convert to PyTorch tensors
X_train_tensor = torch.tensor(X_train, dtype=torch.float32)
X_test_tensor = torch.tensor(X_test, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train, dtype=torch.float32).unsqueeze(1)
y_test_tensor = torch.tensor(y_test, dtype=torch.float32).unsqueeze(1)

Processing games: 100%|██████████| 9830/9830 [00:07<00:00, 1329.54it/s]


In [17]:
import torch.nn as nn


class TransformerRegressor(nn.Module):
    def __init__(
        self, input_dim, num_tokens, hidden_dim=128, num_heads=4, num_layers=2
    ):
        super(TransformerRegressor, self).__init__()
        self.input_dim = input_dim
        self.num_tokens = num_tokens
        self.hidden_dim = hidden_dim

        # Input projection
        self.fc_in = nn.Linear(input_dim, hidden_dim)

        # Positional encoding
        self.positional_encoding = nn.Parameter(torch.zeros(1, num_tokens, hidden_dim))

        # Transformer encoder
        self.encoder_layer = nn.TransformerEncoderLayer(
            d_model=hidden_dim, nhead=num_heads, batch_first=True
        )
        self.transformer = nn.TransformerEncoder(
            self.encoder_layer, num_layers=num_layers
        )

        # Regression head
        self.fc_out = nn.Linear(hidden_dim * num_tokens, 1)

    def forward(self, x):
        # Project inputs to hidden dimension
        x = self.fc_in(x)

        # Add positional encoding
        x = x + self.positional_encoding

        # Transformer
        x = self.transformer(x)

        # Flatten and regression head
        x = x.view(x.size(0), -1)  # Flatten
        x = self.fc_out(x)  # Regression output
        return x

In [18]:
from torch.optim import Adam
from torch.utils.data import DataLoader, TensorDataset
from tqdm import tqdm

# DataLoader
train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
test_dataset = TensorDataset(X_test_tensor, y_test_tensor)

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=32)

# Initialize model, loss, optimizer
model = TransformerRegressor(
    input_dim=len(features),
    num_tokens=2 * player_count,
    hidden_dim=128,
    num_heads=4,
    num_layers=2,
)
criterion = nn.MSELoss()
optimizer = Adam(model.parameters(), lr=1e-3)

# Training loop
epochs = 10
for epoch in range(epochs):
    model.train()
    train_loss = 0.0
    for batch in tqdm(train_loader, desc=f"Epoch {epoch+1}/{epochs}"):
        X_batch, y_batch = batch
        optimizer.zero_grad()
        predictions = model(X_batch)
        loss = criterion(predictions, y_batch)
        loss.backward()
        optimizer.step()
        train_loss += loss.item()
    print(f"Epoch {epoch+1}, Training Loss: {train_loss / len(train_loader)}")

# Evaluate on test set
model.eval()
test_loss = 0.0
with torch.no_grad():
    for batch in test_loader:
        X_batch, y_batch = batch
        predictions = model(X_batch)
        loss = criterion(predictions, y_batch)
        test_loss += loss.item()
print(f"Test Loss: {test_loss / len(test_loader)}")

Epoch 1/10: 100%|██████████| 246/246 [00:04<00:00, 51.42it/s]


Epoch 1, Training Loss: 60.94430138425129


Epoch 2/10: 100%|██████████| 246/246 [00:04<00:00, 51.69it/s]


Epoch 2, Training Loss: 17.542549179821478


Epoch 3/10: 100%|██████████| 246/246 [00:04<00:00, 51.79it/s]


Epoch 3, Training Loss: 11.539365869227463


Epoch 4/10: 100%|██████████| 246/246 [00:05<00:00, 49.09it/s]


Epoch 4, Training Loss: 8.602222748888217


Epoch 5/10: 100%|██████████| 246/246 [00:04<00:00, 49.78it/s]


Epoch 5, Training Loss: 6.825972472749105


Epoch 6/10: 100%|██████████| 246/246 [00:04<00:00, 51.78it/s]


Epoch 6, Training Loss: 6.170281768814335


Epoch 7/10: 100%|██████████| 246/246 [00:04<00:00, 51.97it/s]


Epoch 7, Training Loss: 5.020171988301161


Epoch 8/10: 100%|██████████| 246/246 [00:04<00:00, 50.36it/s]


Epoch 8, Training Loss: 4.82576222875254


Epoch 9/10: 100%|██████████| 246/246 [00:05<00:00, 47.35it/s]


Epoch 9, Training Loss: 4.574093804126832


Epoch 10/10: 100%|██████████| 246/246 [00:04<00:00, 51.12it/s]


Epoch 10, Training Loss: 3.9353739876088087
Test Loss: 3.917992930258474


In [19]:
from sklearn.metrics import mean_squared_error

# Predictions
y_pred = model(X_test_tensor).detach().numpy()
rmse = np.sqrt(mean_squared_error(y_test_tensor.numpy(), y_pred))
print(f"Test RMSE: {rmse}")

Test RMSE: 1.9811452627182007
