In [None]:
pip install pandas openpyxl

Note: you may need to restart the kernel to use updated packages.


In [None]:
!pip install scikit-learn



In [119]:
import pandas as pd
import numpy as np

In [120]:
# load datasets
player_game_data = pd.read_csv('./player_game_data.csv')  # Use read_csv for CSV files
team_game_data = pd.read_csv('./team_game_data.csv')      # Use read_csv for CSV files
contract_data = pd.read_csv('./nba_spotrac_data.csv')

In [121]:
# clean contract data (remove $ symbols and commas from cap_hit, handle missing data)

contract_data['cap_hit'] = pd.to_numeric(
    contract_data['cap_hit'].replace({'\$': '', ',': '', '-': None}, regex=True),
    errors='coerce'
)

In [122]:
# aggregate payroll by team and year
team_payroll = contract_data.groupby(['team', 'year']).agg({'cap_hit': 'sum'}).reset_index()
team_payroll.rename(columns={'cap_hit': 'total_team_payroll'}, inplace=True)
team_payroll

Unnamed: 0,team,year,total_team_payroll
0,atlanta hawks,2011,71453928.0
1,atlanta hawks,2012,63620146.0
2,atlanta hawks,2013,55517146.0
3,atlanta hawks,2014,57245276.0
4,atlanta hawks,2015,71332945.0
...,...,...,...
413,washington wizards,2020,125335449.0
414,washington wizards,2021,125963635.0
415,washington wizards,2022,133790093.0
416,washington wizards,2023,132379717.0


In [123]:
# standardize team names by converting to lowercase and removing extra spaces
player_game_data['team_name'] = player_game_data['team_name'].str.lower().str.strip()
team_payroll['team'] = team_payroll['team'].str.lower().str.strip()

In [124]:
# filter year (can be changed later)
recent_seasons = [2023]
player_game_data = player_game_data[player_game_data['season'].isin(recent_seasons)]
team_game_data = team_game_data[team_game_data['season'].isin(recent_seasons)]

In [125]:
# player_game_data = player_game_data.sample(n=50000, random_state=42)

In [126]:
print(f"Player Game Data: {player_game_data.shape}")
print(f"Team Game Data: {team_game_data.shape}")
print(f"Contract Data: {contract_data.shape}")


Player Game Data: (43395, 59)
Team Game Data: (2460, 41)
Contract Data: (6747, 8)


In [127]:
# merge player and team game data on 'nbagameid' and 'nbateamid'
merged_game_data = pd.merge(
    player_game_data, team_game_data,
    left_on='nbateamid', right_on='offensivenbateamid',
    suffixes=('_player', '_team')
)
merged_game_data

Unnamed: 0,nbagameid_player,gamedate_player,season_player,gametype_player,nbapersonid,player_name,nbateamid,team,team_name,opposingnbateamid,...,turnovers_team,blocksagainst_team,defensivefouls_team,offensivefouls_team,shootingfoulsdrawn_team,possessions,points_team,shotattempts_team,andones_team,shotattemptpoints_team
0,22300184,2023-11-12,2023,2,1626196,Josh Richardson,1610612748,MIA,miami heat,1610612759,...,21,4,15,5,8,100,121,86,2,121
1,22300184,2023-11-12,2023,2,1626196,Josh Richardson,1610612748,MIA,miami heat,1610612759,...,7,13,21,2,11,95,103,101,2,97
2,22300184,2023-11-12,2023,2,1626196,Josh Richardson,1610612748,MIA,miami heat,1610612759,...,17,8,20,1,10,90,99,89,4,94
3,22300184,2023-11-12,2023,2,1626196,Josh Richardson,1610612748,MIA,miami heat,1610612759,...,10,3,17,0,12,95,115,93,3,113
4,22300184,2023-11-12,2023,2,1626196,Josh Richardson,1610612748,MIA,miami heat,1610612759,...,7,8,13,1,6,96,112,94,1,106
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3558385,22301187,2024-04-14,2023,2,1631121,Bryce McGowens,1610612766,CHA,charlotte hornets,1610612739,...,17,6,14,3,7,104,106,97,2,104
3558386,22301187,2024-04-14,2023,2,1631121,Bryce McGowens,1610612766,CHA,charlotte hornets,1610612739,...,7,11,17,0,7,99,100,104,3,96
3558387,22301187,2024-04-14,2023,2,1631121,Bryce McGowens,1610612766,CHA,charlotte hornets,1610612739,...,13,6,19,2,8,97,122,93,4,114
3558388,22301187,2024-04-14,2023,2,1631121,Bryce McGowens,1610612766,CHA,charlotte hornets,1610612739,...,19,7,11,4,6,97,84,91,3,82


In [128]:
# merge with contract data on 'team_name' and 'season'
chunk_size = 5000  # memory limit
merged_chunks = []

for i in range(0, len(merged_game_data), chunk_size):
    merged_chunk = pd.merge(
        merged_game_data.iloc[i:i+chunk_size], team_payroll,
        left_on=['team_name', 'season_player'],
        right_on=['team', 'year'],
        how='left'
    )
    merged_chunks.append(merged_chunk)

# concatenate all chunks into a single DataFrame
merged_data = pd.concat(merged_chunks)

In [129]:
# inspect the merged data
print(merged_data.head())

   nbagameid_player gamedate_player  season_player  gametype_player  \
0          22300184      2023-11-12           2023                2   
1          22300184      2023-11-12           2023                2   
2          22300184      2023-11-12           2023                2   
3          22300184      2023-11-12           2023                2   
4          22300184      2023-11-12           2023                2   

   nbapersonid      player_name   nbateamid team_x   team_name  \
0      1626196  Josh Richardson  1610612748    MIA  miami heat   
1      1626196  Josh Richardson  1610612748    MIA  miami heat   
2      1626196  Josh Richardson  1610612748    MIA  miami heat   
3      1626196  Josh Richardson  1610612748    MIA  miami heat   
4      1626196  Josh Richardson  1610612748    MIA  miami heat   

   opposingnbateamid  ... offensivefouls_team shootingfoulsdrawn_team  \
0         1610612759  ...                   5                       8   
1         1610612759  ...     

In [130]:
# calculate wins for each game
merged_data['win'] = (merged_data['points_team'] > merged_data['opponentteampoints']).astype(int)

# aggregate total wins and average payroll by team and season
team_performance = merged_data.groupby(['team_name', 'season_player']).agg({
    'win': 'sum',  # Total wins
    'total_team_payroll': 'mean'  # Average team payroll
}).reset_index()

# prepare features (payroll) and target (wins)
X = team_performance[['total_team_payroll']]  # Feature
y = team_performance['win']  # Target

# Linear Regression model
from sklearn.linear_model import LinearRegression

lin_reg = LinearRegression()

lin_reg.fit(X, y)

print(f"Intercept: {lin_reg.intercept_}")
print(f"Coefficient for Payroll: {lin_reg.coef_[0]}")

Intercept: 124793.05666719201
Coefficient for Payroll: -4.725401966841712e-05


In [131]:
# scale the payroll by dividing it by 1 million (in millions of dollars)
team_performance['total_team_payroll_millions'] = team_performance['total_team_payroll'] / 1_000_000

# prepare features (scaled payroll) and target (wins)
X = team_performance[['total_team_payroll_millions']]  # feature
y = team_performance['win']  # target

# re-run the model
lin_reg = LinearRegression()
lin_reg.fit(X, y)

print(f"Intercept: {lin_reg.intercept_}")
print(f"Coefficient for Payroll (in millions): {lin_reg.coef_[0]}")

Intercept: 124793.05666719201
Coefficient for Payroll (in millions): -47.254019668417136


In [132]:
# calculate whether the team won the game, 1 if points_team > opponentteampoints, 0 otherwise
merged_data['win'] = (merged_data['points_team'] > merged_data['opponentteampoints']).astype(int)

# aggregate total wins and average payroll by team and season
# sum the 'win' column for total wins and average payroll for the team across the season
team_performance = merged_data.groupby(['team_name', 'season_player']).agg({
    'win': 'sum',  # sum of wins per season
    'total_team_payroll': 'mean'  # average payroll
}).reset_index()

# create the 'total_team_payroll_millions' column
team_performance['total_team_payroll_millions'] = team_performance['total_team_payroll'] / 1_000_000

print(team_performance['win'].describe())
print(team_performance['total_team_payroll_millions'].describe())


count        30.000000
mean     117635.633333
std        2981.241601
min      112955.000000
25%      115457.000000
50%      117931.500000
75%      119462.000000
max      127692.000000
Name: win, dtype: float64
count     30.000000
mean     151.466973
std       29.388172
min       84.251419
25%      133.144965
50%      158.974573
75%      167.206227
max      206.814776
Name: total_team_payroll_millions, dtype: float64


In [133]:
# calculate the win column: 1 if team points are greater than opponent points, otherwise 0
merged_data['win'] = (merged_data['points_team'] > merged_data['opponentteampoints']).astype(int)

print(merged_data[['team_name', 'points_team', 'opponentteampoints', 'win']].head())


    team_name  points_team  opponentteampoints  win
0  miami heat          121                  64    1
1  miami heat          103                  64    1
2  miami heat           99                  64    1
3  miami heat          115                  64    1
4  miami heat          112                  64    1


In [134]:
team_performance = merged_data.groupby(['team_name', 'season_player']).agg({
    'win': 'sum',
    'total_team_payroll': 'mean'
}).reset_index()

team_performance['total_team_payroll_millions'] = team_performance['total_team_payroll'] / 1_000_000

print(team_performance['win'].describe())
print(team_performance['total_team_payroll_millions'].describe())


count        30.000000
mean     117635.633333
std        2981.241601
min      112955.000000
25%      115457.000000
50%      117931.500000
75%      119462.000000
max      127692.000000
Name: win, dtype: float64
count     30.000000
mean     151.466973
std       29.388172
min       84.251419
25%      133.144965
50%      158.974573
75%      167.206227
max      206.814776
Name: total_team_payroll_millions, dtype: float64


In [135]:
print(merged_data[['team_name', 'points_team', 'opponentteampoints', 'opposingnbateamid']].head(20))


     team_name  points_team  opponentteampoints  opposingnbateamid
0   miami heat          121                  64         1610612759
1   miami heat          103                  64         1610612759
2   miami heat           99                  64         1610612759
3   miami heat          115                  64         1610612759
4   miami heat          112                  64         1610612759
5   miami heat           97                  64         1610612759
6   miami heat          129                  64         1610612759
7   miami heat          115                  64         1610612759
8   miami heat           90                  64         1610612759
9   miami heat          118                  64         1610612759
10  miami heat          108                  64         1610612759
11  miami heat          122                  64         1610612759
12  miami heat          109                  64         1610612759
13  miami heat          118                  64         161061

In [136]:
merged_data_small = merged_data[['nbagameid_player', 'nbateamid', 'points_team', 'opposingnbateamid']]


In [137]:
# create a dictionary to map each (game ID, team ID) pair to points_team
points_dict = merged_data.set_index(['nbagameid_player', 'nbateamid'])['points_team'].to_dict()

# add a more specific check to ensure the opponent's points are correctly matched
def get_opponent_points(row):
    # lookup the points for the opposing team in the same game
    return points_dict.get((row['nbagameid_player'], row['opposingnbateamid']), None)

# apply this to each row
merged_data['opponentteampoints'] = merged_data.apply(get_opponent_points, axis=1)

# check the result
print(merged_data[['team_name', 'points_team', 'opponentteampoints']].head())


    team_name  points_team  opponentteampoints
0  miami heat          121                 118
1  miami heat          103                 118
2  miami heat           99                 118
3  miami heat          115                 118
4  miami heat          112                 118


In [138]:
print(merged_data[['nbagameid_player', 'team_name', 'points_team', 'opposingnbateamid']].head(20))


    nbagameid_player   team_name  points_team  opposingnbateamid
0           22300184  miami heat          121         1610612759
1           22300184  miami heat          103         1610612759
2           22300184  miami heat           99         1610612759
3           22300184  miami heat          115         1610612759
4           22300184  miami heat          112         1610612759
5           22300184  miami heat           97         1610612759
6           22300184  miami heat          129         1610612759
7           22300184  miami heat          115         1610612759
8           22300184  miami heat           90         1610612759
9           22300184  miami heat          118         1610612759
10          22300184  miami heat          108         1610612759
11          22300184  miami heat          122         1610612759
12          22300184  miami heat          109         1610612759
13          22300184  miami heat          118         1610612759
14          22300184  mia

In [139]:
# check for duplicate game/team combinations
duplicates = merged_data[['nbagameid_player', 'nbateamid']].duplicated().sum()
print(f"Number of duplicate game/team combinations: {duplicates}")


Number of duplicate game/team combinations: 3555930


In [140]:
# aggregate by team and game to remove duplicates (one row per team per game)
team_game_data = merged_data.groupby(['nbagameid_player', 'nbateamid']).agg({
    'points_team': 'sum',
    'opposingnbateamid': 'first'
}).reset_index()

print(team_game_data.head())


   nbagameid_player   nbateamid  points_team  opposingnbateamid
0          22300001  1610612739       157012         1610612754
1          22300001  1610612754       181980         1610612739
2          22300002  1610612749       175608         1610612752
3          22300002  1610612752       166482         1610612749
4          22300003  1610612748       153544         1610612764


In [141]:
# create a dictionary for points lookup
points_dict = team_game_data.set_index(['nbagameid_player', 'nbateamid'])['points_team'].to_dict()

# use the dictionary to lookup opponent points for each game/team row
team_game_data['opponentteampoints'] = team_game_data.apply(
    lambda row: points_dict.get((row['nbagameid_player'], row['opposingnbateamid']), None), axis=1
)

print(team_game_data[['nbagameid_player', 'nbateamid', 'points_team', 'opponentteampoints']].head())


   nbagameid_player   nbateamid  points_team  opponentteampoints
0          22300001  1610612739       157012              181980
1          22300001  1610612754       181980              157012
2          22300002  1610612749       175608              166482
3          22300002  1610612752       166482              175608
4          22300003  1610612748       153544              167886


In [142]:
# calculate wins
team_game_data['win'] = (team_game_data['points_team'] > team_game_data['opponentteampoints']).astype(int)

print(team_game_data[['nbagameid_player', 'nbateamid', 'points_team', 'opponentteampoints', 'win']].head())


   nbagameid_player   nbateamid  points_team  opponentteampoints  win
0          22300001  1610612739       157012              181980    0
1          22300001  1610612754       181980              157012    1
2          22300002  1610612749       175608              166482    1
3          22300002  1610612752       166482              175608    0
4          22300003  1610612748       153544              167886    0


In [143]:
# aggregate total wins by team
team_performance = team_game_data.groupby('nbateamid').agg({
    'win': 'sum',
    'points_team': 'mean'
}).reset_index()

print(team_performance.head())


    nbateamid  win    points_team
0  1610612737   73  174417.341463
1  1610612738   52  167958.426829
2  1610612739   24  158138.341463
3  1610612740   21  160527.073171
4  1610612741   19  160094.585366


In [144]:
# mapping dictionary
team_id_to_name = {
    1610612737: 'atlanta hawks',
    1610612738: 'boston celtics',
    1610612751: 'brooklyn nets',
    1610612766: 'charlotte hornets',
    1610612741: 'chicago bulls',
    1610612739: 'cleveland cavaliers',
    1610612742: 'dallas mavericks',
    1610612743: 'denver nuggets',
    1610612765: 'detroit pistons',
    1610612744: 'golden state warriors',
    1610612745: 'houston rockets',
    1610612754: 'indiana pacers',
    1610612746: 'la clippers',
    1610612747: 'los angeles lakers',
    1610612763: 'memphis grizzlies',
    1610612748: 'miami heat',
    1610612749: 'milwaukee bucks',
    1610612750: 'minnesota timberwolves',
    1610612740: 'new orleans pelicans',
    1610612752: 'new york knicks',
    1610612760: 'oklahoma city thunder',
    1610612753: 'orlando magic',
    1610612755: 'philadelphia 76ers',
    1610612756: 'phoenix suns',
    1610612757: 'portland trail blazers',
    1610612758: 'sacramento kings',
    1610612759: 'san antonio spurs',
    1610612761: 'toronto raptors',
    1610612762: 'utah jazz',
    1610612764: 'washington wizards'
}

# map the team IDs in team_performance to team names
team_performance['team_name'] = team_performance['nbateamid'].map(team_id_to_name)

print(team_performance[['nbateamid', 'team_name']].head())


    nbateamid             team_name
0  1610612737         atlanta hawks
1  1610612738        boston celtics
2  1610612739   cleveland cavaliers
3  1610612740  new orleans pelicans
4  1610612741         chicago bulls


In [145]:
# merge using 'team_name' after mapping team IDs
team_performance = pd.merge(
    team_performance,
    team_payroll,
    left_on='team_name',
    right_on='team',
    how='left'
)

print(team_performance.head())


    nbateamid  win    points_team      team_name           team  year  \
0  1610612737   73  174417.341463  atlanta hawks  atlanta hawks  2011   
1  1610612737   73  174417.341463  atlanta hawks  atlanta hawks  2012   
2  1610612737   73  174417.341463  atlanta hawks  atlanta hawks  2013   
3  1610612737   73  174417.341463  atlanta hawks  atlanta hawks  2014   
4  1610612737   73  174417.341463  atlanta hawks  atlanta hawks  2015   

   total_team_payroll  
0          71453928.0  
1          63620146.0  
2          55517146.0  
3          57245276.0  
4          71332945.0  


In [146]:
from sklearn.linear_model import LinearRegression

# prepare the features (payroll) and target (wins)
X = team_performance[['total_team_payroll']]
y = team_performance['win']

lin_reg = LinearRegression()
lin_reg.fit(X, y)

print(f"Intercept: {lin_reg.intercept_}")
print(f"Coefficient for Payroll: {lin_reg.coef_[0]}")


# 42.63 means that when payroll is zero the model predicts 42 wins
# which is baseline level of team performance

# -1.5627827644186546e-08 very close to zero means an increase in payroll,
# has a small impact on winning


Intercept: 42.63412304208182
Coefficient for Payroll: -1.5627827644186576e-08


In [147]:
from sklearn.preprocessing import StandardScaler

# scale the payroll data
scaler = StandardScaler()
X_scaled = scaler.fit_transform(team_performance[['total_team_payroll']])

# re-run the regression with scaled data
lin_reg = LinearRegression()
lin_reg.fit(X_scaled, y)

print(f"Intercept: {lin_reg.intercept_}")
print(f"Coefficient for Scaled Payroll: {lin_reg.coef_[0]}")


Intercept: 41.02392344497608
Coefficient for Scaled Payroll: -0.6189465637437004


In [148]:
from sklearn.linear_model import Lasso

# run Lasso regression to regularize the model
lasso_reg = Lasso(alpha=0.01) # change this to experiment
lasso_reg.fit(X_scaled, y)

print(f"Lasso Intercept: {lasso_reg.intercept_}")
print(f"Lasso Coefficient for Scaled Payroll: {lasso_reg.coef_[0]}")


# scaled payroll 41 is at average, so average payroll tend to perfrom around average
# -0.61 increasing payroll might associate with slight decrease in wins or
# overpayment for players
# or team chemistry/coaching/injuries/performance

Lasso Intercept: 41.02392344497608
Lasso Coefficient for Scaled Payroll: -0.6089465637437004


In [149]:
# calculate wins per million dollars spent (check the top teams in terms of cost-efficiency)
team_performance['wins_per_million'] = team_performance['win'] / (team_performance['total_team_payroll'] / 1_000_000)

print(team_performance[['team_name', 'win', 'total_team_payroll', 'wins_per_million']].sort_values(by='wins_per_million', ascending=False).head())

# top team in wins per million:
# teams that managed to get high win counts relative to payroll
# OKC has high payroll with efficient performance but diminishing rate

                 team_name  win  total_team_payroll  wins_per_million
236         indiana pacers   78          44413746.0          1.756213
292       sacramento kings   59          36583409.0          1.612753
330  oklahoma city thunder   76          47135212.0          1.612383
252     philadelphia 76ers   41          27287899.0          1.502497
320  oklahoma city thunder   76          56856253.0          1.336704


In [150]:
# identify the least cost-efficient teams
least_efficient_teams = team_performance[['team_name', 'win', 'total_team_payroll', 'wins_per_million']].sort_values(by='wins_per_million', ascending=True).head()
print(least_efficient_teams)


                  team_name  win  total_team_payroll  wins_per_million
291  portland trail blazers    5         165518581.0          0.030208
290  portland trail blazers    5         157605999.0          0.031725
289  portland trail blazers    5         136161044.0          0.036721
286  portland trail blazers    5         132112637.0          0.037846
285  portland trail blazers    5         128004910.0          0.039061


In [151]:
# calculate average wins per million for each team over time
team_avg_efficiency = team_performance.groupby('team_name')['wins_per_million'].mean().reset_index()
print(team_avg_efficiency.sort_values(by='wins_per_million', ascending=False))

# most cost efficient teams

                 team_name  wins_per_million
11          indiana pacers          0.902728
20   oklahoma city thunder          0.899004
0            atlanta hawks          0.878052
6         dallas mavericks          0.765678
25        sacramento kings          0.735712
28               utah jazz          0.723553
7           denver nuggets          0.679123
16         milwaukee bucks          0.668336
10         houston rockets          0.652329
22      philadelphia 76ers          0.608692
1           boston celtics          0.540059
12             la clippers          0.504813
23            phoenix suns          0.475267
13      los angeles lakers          0.469082
29      washington wizards          0.461521
14       memphis grizzlies          0.454889
21           orlando magic          0.440606
19         new york knicks          0.426251
26       san antonio spurs          0.425080
27         toronto raptors          0.367794
9    golden state warriors          0.299929
5      cle

In [152]:
# create lag features for wins and payroll (shift by 1 year)
team_performance['previous_win'] = team_performance.groupby('team_name')['win'].shift(1)
team_performance['previous_payroll'] = team_performance.groupby('team_name')['total_team_payroll'].shift(1)
team_performance['payroll_change'] = team_performance['total_team_payroll'] - team_performance['previous_payroll']

# drop rows with missing lag features
team_performance.dropna(subset=['previous_win', 'previous_payroll'], inplace=True)

# create the target variable for prediction
team_performance['future_wins_per_million'] = team_performance.groupby('team_name')['wins_per_million'].shift(-1)

# drop rows where the target is missing
team_performance.dropna(subset=['future_wins_per_million'], inplace=True)

print(team_performance[['team_name', 'win', 'previous_win', 'total_team_payroll', 'payroll_change', 'wins_per_million', 'future_wins_per_million']].head())


       team_name  win  previous_win  total_team_payroll  payroll_change  \
1  atlanta hawks   73          73.0          63620146.0      -7833782.0   
2  atlanta hawks   73          73.0          55517146.0      -8103000.0   
3  atlanta hawks   73          73.0          57245276.0       1728130.0   
4  atlanta hawks   73          73.0          71332945.0      14087669.0   
5  atlanta hawks   73          73.0          92928845.0      21595900.0   

   wins_per_million  future_wins_per_million  
1          1.147435                 1.314909  
2          1.314909                 1.275214  
3          1.275214                 1.023370  
4          1.023370                 0.785547  
5          0.785547                 1.082668  


In [153]:
from sklearn.model_selection import train_test_split

# features and target variable
features = ['previous_win', 'previous_payroll', 'payroll_change', 'wins_per_million']
target = 'future_wins_per_million'

# split the data into training and test sets
X = team_performance[features]
y = team_performance[target]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print(f"Training set size: {X_train.shape[0]}")
print(f"Test set size: {X_test.shape[0]}")


Training set size: 286
Test set size: 72


In [154]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

# train a linear regression model
reg_model = LinearRegression()
reg_model.fit(X_train, y_train)

# predict on the test set
y_pred = reg_model.predict(X_test)

# evaluate the model
mse = mean_squared_error(y_test, y_pred)
print(f"Mean Squared Error on the test set: {mse}")

print(f"Intercept: {reg_model.intercept_}")
print(f"Coefficients: {reg_model.coef_}")


# MSE 0.0072 mean model is relatively accuracte
# Coefficients:
# previous wins (0.0037): positive impact, wins in previous season slightly improve cost-efficiency
# previous payroll (-8.028e-10): minimal impact
# payroll change: minimal impact
# wins per million (0.6217): is significant, past wins per million is a strong predictor
# for future wins per million dollars spent

Mean Squared Error on the test set: 0.007208422679387856
Intercept: 0.08141137743587301
Coefficients: [ 3.66509748e-03 -8.02814319e-10 -1.24021459e-09  6.21712131e-01]


In [155]:
# use the trained model to predict future wins per million for a specific season
predicted_efficiency = reg_model.predict(X_test)

# add the predictions back to the dataset for comparison
team_performance_test = X_test.copy()
team_performance_test['predicted_wins_per_million'] = predicted_efficiency
team_performance_test['actual_wins_per_million'] = y_test

# compare actual and predicted
print(team_performance_test[['predicted_wins_per_million', 'actual_wins_per_million']].head())

# model's predicted values for future wins per million
# are somewhat close to the actual values
# it is able to capture some cost-efficiency across seasons

     predicted_wins_per_million  actual_wins_per_million
260                    0.290434                 0.274860
49                     0.218672                 0.178532
304                    0.429170                 0.351943
352                    0.821052                 0.735448
65                     0.133515                 0.143036
