In [None]:
import kagglehub

# Download the latest version of the dataset; the call returns the local
# directory that holds the files.
path = kagglehub.dataset_download(
    "eoinamoore/historical-nba-data-and-player-box-scores"
)

print(f"Path to dataset files: {path}")

In [None]:
import pandas as pd

def load_and_process_data(file_path, base_dir=None):
    """Load one CSV of the NBA dataset and apply basic cleaning.

    Steps performed:
      1. Read ``file_path`` from ``base_dir``.
      2. Print the first rows and the per-column count of missing values.
      3. Replace missing values in every float64/int64 column with that
         column's mean.
      4. Convert every object column to the pandas ``string`` dtype.
      5. Print the resulting ``DataFrame.info()`` summary.

    Parameters
    ----------
    file_path : str
        File name (or relative path) of the CSV inside ``base_dir``.
    base_dir : str or os.PathLike, optional
        Directory that contains the CSV files, e.g. the directory returned
        by ``kagglehub.dataset_download``. When omitted, the original
        hard-coded cache directory (dataset version 366 on one specific
        Windows machine) is used so existing calls keep working.

    Returns
    -------
    pandas.DataFrame
        The cleaned dataframe.

    Notes
    -----
    NOTE(review): mean imputation is applied to *every* numeric column; if
    identifier-like or 0/1 flag columns contain missing values they are
    filled with a mean as well -- confirm that this is intended.
    """
    import os  # local import so this cell does not depend on another cell

    if base_dir is None:
        # Legacy default kept for backward compatibility.
        base_dir = "C:\\Users\\nmall\\.cache\\kagglehub\\datasets\\eoinamoore\\historical-nba-data-and-player-box-scores\\versions\\366\\"

    # Load data from CSV file
    df = pd.read_csv(os.path.join(base_dir, file_path))

    # Display the first few rows of the dataframe
    print("Initial Data:")
    print(df.head())

    # Check for missing values
    missing_values = df.isnull().sum()
    print("\nMissing Values in Each Column:")
    print(missing_values)

    # Fill missing values with the mean of the column (for numerical columns).
    # Assign the result back instead of calling fillna(inplace=True) on
    # df[column]: the chained in-place form acts on a temporary object and
    # does not update df when pandas Copy-on-Write is active.
    for column in df.select_dtypes(include=['float64', 'int64']).columns:
        df[column] = df[column].fillna(df[column].mean())

    # Convert object (text) columns to the pandas string dtype
    for column in df.select_dtypes(include=['object']).columns:
        df[column] = df[column].astype('string')

    # Display the processed dataframe info.
    # df.info() prints by itself and returns None, so it is not wrapped in print().
    print("\nProcessed Data Info:")
    df.info()

    return df
# Load the three tables used below, in this order: games, player box scores,
# team box scores.
# NOTE(review): called with only a file name, load_and_process_data reads from a
# hard-coded ...\versions\366\ cache directory, not from the `path` returned by
# the download cell -- confirm both point at the same dataset version.
gamesInfo, playersStats, TeamStatistics = (
    load_and_process_data(csv_name)
    for csv_name in ('Games.csv', 'PlayerStatistics.csv', 'TeamStatistics.csv')
)

In [None]:
# TeamStatistics
# Add calendar parts, a game date and a season label to every TeamStatistics row.
# Work on a copy so the loaded TeamStatistics frame itself is left untouched.
teamAverageBySeason = TeamStatistics.copy()
# gameDateTimeEst is sliced by position, so it is expected to start with "YYYY-MM-DD".
teamAverageBySeason['month'] = teamAverageBySeason['gameDateTimeEst'].str[5:7]
teamAverageBySeason['year'] = teamAverageBySeason['gameDateTimeEst'].str[0:4]
teamAverageBySeason['day'] = teamAverageBySeason['gameDateTimeEst'].str[8:10]
# create a datetime column based on the year, month, and day columns
teamAverageBySeason['gameDate'] = pd.to_datetime(teamAverageBySeason['year'] + '-' + teamAverageBySeason['month'] + '-' + teamAverageBySeason['day'])
# Create a season column based on the year and month: games played in
# October-December belong to the season "<year>-<year + 1>", games played in
# January-September to "<year - 1>-<year>". Computed column-wise instead of
# with a row-by-row apply, which is slow on large frames.
team_calendar_year = teamAverageBySeason['year'].astype('int64')
team_game_in_fall = (teamAverageBySeason['month'] >= '10').astype(bool)
team_season_start = team_calendar_year.where(team_game_in_fall, team_calendar_year - 1)
teamAverageBySeason['season'] = team_season_start.astype(str) + '-' + (team_season_start + 1).astype(str)

In [None]:
# Remove the helper date columns (gameDate and season are kept) and group the
# remaining rows by game, season and team.
# NOTE(review): no aggregation is applied here, so after this cell
# teamAverageBySeason is bound to a GroupBy object rather than to a DataFrame
# of averages. Because the name is rebound, the previous cell must be re-run
# before this one can be executed again.
teamAverageBySeason = (
    teamAverageBySeason
    .drop(columns=['month', 'year', 'day', 'gameDateTimeEst'])
    .groupby(["gameId", "season", "teamName"])
)
# GroupBy.head(5) returns up to the first 5 rows of *each* group, not 5 rows overall.
teamAverageBySeason.head(5)

In [None]:
# Preview the first 10 rows of the loaded player box-score table.
playersStats.head(10)

In [None]:
# Report the smallest and largest gameDateTimeEst value in the player table.
# NOTE(review): if gameDateTimeEst is still text at this point, min/max compare
# lexicographically -- that matches chronological order only for ISO-like
# "YYYY-MM-DD..." values; confirm the format.
earliest_game = playersStats['gameDateTimeEst'].min()
latest_game = playersStats['gameDateTimeEst'].max()
print(f"Stats go from {earliest_game} to {latest_game}")

In [None]:
# Count how many playersStats rows fall under each gameType value;
# dropna=False keeps missing values as their own entry in the result.
playersStats['gameType'].value_counts(dropna=False)

In [None]:
# Inspect the rows whose gameType is missing: show the last 10 of them.
playersStats[playersStats['gameType'].isnull()].tail(10)

In [None]:
# Any player stats with a null gameType are regular season games so we'll adjust accordingly.
# The filled column is assigned back rather than using fillna(inplace=True) on
# playersStats['gameType']: the chained in-place form acts on a temporary
# object and leaves playersStats unchanged when pandas Copy-on-Write is active.
playersStats['gameType'] = playersStats['gameType'].fillna('Regular Season')
# Re-check the distribution; the missing-value entry should be gone now.
playersStats['gameType'].value_counts(dropna=False)

In [None]:
# Keep only the regular-season rows. .copy() makes the result an independent
# frame, so the columns added to it in later cells are not written to a slice
# of playersStats (which triggers SettingWithCopyWarning).
RegSeasonPlayerStats = playersStats[playersStats['gameType'] == 'Regular Season'].copy()
RegSeasonPlayerStats.dtypes

In [None]:
# Add calendar parts, a game date, the player's full name and a season label
# to every row in RegSeasonPlayerStats.
# gameDateTimeEst is sliced by position, so it is expected to start with "YYYY-MM-DD".
player_game_year = RegSeasonPlayerStats['gameDateTimeEst'].str[0:4]
player_game_month = RegSeasonPlayerStats['gameDateTimeEst'].str[5:7]
player_game_day = RegSeasonPlayerStats['gameDateTimeEst'].str[8:10]

# Season rule: games played in October-December belong to the season
# "<year>-<year + 1>", games played in January-September to "<year - 1>-<year>".
# Computed column-wise instead of with a row-by-row apply, which is slow on
# a table with one row per player per game.
player_calendar_year = player_game_year.astype('int64')
player_game_in_fall = (player_game_month >= '10').astype(bool)
player_season_start = player_calendar_year.where(player_game_in_fall, player_calendar_year - 1)

# .assign returns a new frame, so nothing is written into a filtered slice of
# playersStats and the cell can be re-run safely.
RegSeasonPlayerStats = RegSeasonPlayerStats.assign(
    month=player_game_month,
    year=player_game_year,
    day=player_game_day,
    # datetime column built from the year, month and day parts
    gameDate=pd.to_datetime(player_game_year + '-' + player_game_month + '-' + player_game_day),
    fullName=RegSeasonPlayerStats["firstName"] + " " + RegSeasonPlayerStats["lastName"],
    season=player_season_start.astype(str) + '-' + (player_season_start + 1).astype(str),
)
RegSeasonPlayerStats['season'].value_counts(dropna=False)

In [None]:
# Check for duplicates in RegSeasonPlayerStats: count the rows that repeat an
# earlier row in every column.
RegSeasonPlayerStats.duplicated().sum()

In [None]:
# Aggregate the data to one row per (season, player): games played, total
# points, points per game, plus average points and field-goal percentage
# split by wins and losses.
# The win/loss splits are prepared as masked helper columns (NaN outside the
# relevant games) so the built-in 'mean', which skips NaN, can be used. This
# avoids per-group Python lambdas that look values up in the outer frame.
# NOTE(review): `team` is the first playerteamName seen for the player in the
# season -- presumably a player who changed teams is attributed to one team only.
played_in_win = RegSeasonPlayerStats['win'] == 1
played_in_loss = RegSeasonPlayerStats['win'] == 0
season_player_stats = (
    RegSeasonPlayerStats
    .assign(
        points_in_wins=RegSeasonPlayerStats['points'].where(played_in_win),
        points_in_losses=RegSeasonPlayerStats['points'].where(played_in_loss),
        fg_pct_in_wins=RegSeasonPlayerStats['fieldGoalsPercentage'].where(played_in_win),
        fg_pct_in_losses=RegSeasonPlayerStats['fieldGoalsPercentage'].where(played_in_loss),
    )
    .groupby(['season', 'personId'])
    .agg(
        fullName=('fullName', 'first'),
        team=('playerteamName', 'first'),
        player_games_played=('gameId', 'nunique'),
        total_points_scored=('points', 'sum'),
        points_per_game=('points', 'mean'),
        avg_points_per_win=('points_in_wins', 'mean'),
        avg_points_per_loss=('points_in_losses', 'mean'),
        fg_percentage_per_win=('fg_pct_in_wins', 'mean'),
        fg_percentage_per_loss=('fg_pct_in_losses', 'mean'),
    )
    .reset_index()
)
season_player_stats.tail(10)

In [None]:
# Get one row per team per won game: every player row of a winning team has
# win == 1, so collapsing those rows by (game, team, season) leaves one win
# marker per game.
team_game_wins = (
    RegSeasonPlayerStats[RegSeasonPlayerStats['win'] == 1]
    .groupby(['gameId', 'playerteamName', 'season'])
    .agg(win=('win', 'first'))  # All players on winning team have win=1
    .reset_index()
)
# Then count wins per team per season.
# A team with no winning rows in a season does not appear in game_wins.
game_wins = (
    team_game_wins
    .groupby(['playerteamName', 'season'])
    .agg(total_team_wins=('win', 'sum'))
    .reset_index()
)

# Ten teams with the most wins in the 2024-2025 season.
game_wins[game_wins['season'] == '2024-2025'].sort_values(by='total_team_wins', ascending=False).head(10)

In [None]:
# Join the total wins per team per season onto season_player_stats.
# - Any total_team_wins column left by an earlier run of this cell is dropped
#   first, so re-running does not create total_team_wins_x / _y columns.
# - The key of game_wins is renamed to `team`, so the merge adds only the
#   total_team_wins column and no duplicate team-name column to drop afterwards.
# Player rows whose (team, season) has no entry in game_wins get NaN wins.
season_player_stats = (
    season_player_stats
    .drop(columns=['total_team_wins'], errors='ignore')
    .merge(
        game_wins.rename(columns={'playerteamName': 'team'}),
        on=['team', 'season'],
        how='left',
    )
)

In [None]:
# Best scorer per team per season: within each (season, team) group pick the
# row with the highest points_per_game ...
best_scorer_rows = season_player_stats.groupby(['season', 'team'])['points_per_game'].idxmax()
# ... and add ppg_gap = average points in wins minus average points in losses.
topTeamScorers = season_player_stats.loc[best_scorer_rows].assign(
    ppg_gap=lambda scorers: scorers["avg_points_per_win"] - scorers["avg_points_per_loss"]
)
topTeamScorers.tail(10)

In [None]:
import matplotlib.pyplot as plt

# Dumbbell-style chart for the 2024-2025 season: one horizontal line per team
# that connects its best scorer's average points in losses and in wins, with
# the season average marked as well. Teams are ordered by the win average.
plot_df = (
    topTeamScorers[topTeamScorers['season'] == '2024-2025']
    .copy()
    .sort_values("avg_points_per_win", ascending=True)
)

fig, ax = plt.subplots(figsize=(10, 12))

# One entry per marker series: (column, colour, legend label, alpha, zorder).
# A higher zorder is drawn on top, so the loss markers cover the others.
marker_series = [
    ("avg_points_per_loss", "red", "Loss Avg", 0.8, 4),
    ("points_per_game", "yellow", "Season Avg", 0.9, 3),
    ("avg_points_per_win", "green", "Win Avg", 0.8, 2),
]
for stat_column, marker_colour, legend_label, opacity, layer in marker_series:
    ax.scatter(
        plot_df[stat_column],
        plot_df["team"],
        color=marker_colour,
        label=legend_label,
        alpha=opacity,
        zorder=layer,
    )

# Black connecting line (loss -> win), drawn beneath all markers
ax.hlines(
    y=plot_df["team"],
    xmin=plot_df["avg_points_per_loss"],
    xmax=plot_df["avg_points_per_win"],
    color="black",
    alpha=0.6,
    linewidth=1.5,
    zorder=1,
)

ax.set_xlabel("Points Per Game")
ax.set_title("2024-2025 Best Scorer Point Averages on Each Team")
ax.legend()
ax.grid(axis="x", alpha=0.3)

fig.tight_layout()
plt.show()


In [None]:
# Scatter plot of each team's best-scorer ppg_gap against the team's total wins
# for 2024-2025, with every dot labelled by team and coloured by playoff status.
plot_df = topTeamScorers[topTeamScorers['season'] == '2024-2025'].copy()
# topTeamScorers normally already carries total_team_wins (it is merged into
# season_player_stats earlier). Merging game_wins again would then produce
# total_team_wins_x / total_team_wins_y and the plain column name used below
# would raise a KeyError, so the merge is only done when the column is missing.
if 'total_team_wins' not in plot_df.columns:
    plot_df = plot_df.merge(game_wins, left_on=['team', 'season'], right_on=['playerteamName', 'season'], how='left')
# Hard-coded list of the teams treated as playoff teams for this season.
playoff_teams = ['Nuggets', 'Heat', 'Bucks', 'Celtics', 'Cavaliers', 'Warriors', 'Lakers', 'Knicks', 'Clippers', 'Grizzlies','Magic','Pistons','Thunder','Pacers','Timberwolves','Rockets']
plot_df['is_playoff_team'] = plot_df['team'].isin(playoff_teams)
# color playoff teams differently
plot_df['color'] = plot_df['is_playoff_team'].map({True: 'red', False: 'blue'})
plt.figure(figsize=(10, 6))
# One scatter call per group so the legend can explain the two colours.
for is_playoff_team, team_group in plot_df.groupby('is_playoff_team'):
    plt.scatter(
        team_group['ppg_gap'],
        team_group['total_team_wins'],
        c=team_group['color'],
        alpha=0.7,
        label='Playoff team' if is_playoff_team else 'Non-playoff team',
    )
# Label every dot with its team name.
for gap, wins, team in zip(plot_df['ppg_gap'], plot_df['total_team_wins'], plot_df['team']):
    plt.text(gap, wins, team, fontsize=8)
plt.xlabel("PPG Gap (Avg Points in Wins - Avg Points in Losses)")
plt.ylabel("Total Team Wins")
plt.title("Correlation Between PPG Gap and Total Wins for Best Scorers")
plt.legend()
plt.grid(alpha=0.3)
plt.tight_layout()
plt.show()

In [None]:
# Build a logistic regression model that predicts, per player-season, whether
# the player's team won more than half of its games, from the player's points
# per game and average points in wins and in losses.
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler  # NOTE(review): imported but not used; the features are fed unscaled
# Games played per team per season. The target has to compare team wins with
# the *team's* number of games: comparing with half of the player's own games
# marks a rarely used player on a losing team as "winning".
team_games_played = (
    RegSeasonPlayerStats
    .groupby(['playerteamName', 'season'])['gameId']
    .nunique()
    .rename('team_games_played')
    .reset_index()
    .rename(columns={'playerteamName': 'team'})
)
# Prepare the data for modeling: keep player-seasons that have both a win and
# a loss average, then attach the team's games played.
model_df = (
    season_player_stats
    .dropna(subset=['avg_points_per_win', 'avg_points_per_loss'])
    .merge(team_games_played, on=['team', 'season'], how='left')
)
X = model_df[['points_per_game', 'avg_points_per_win', 'avg_points_per_loss']]
# Binary target: the team won more than half of its games. Rows with a missing
# win or game count compare as False.
y = model_df['total_team_wins'] > (model_df['team_games_played'] / 2)
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Train the logistic regression model
model = LogisticRegression()
model.fit(X_train, y_train)
# Evaluate the model on the held-out 20%
accuracy = model.score(X_test, y_test)
print(f"Model Accuracy: {accuracy:.2f}")