In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from IPython.display import display
import warnings

# For better visualization
%matplotlib inline
plt.style.use('seaborn-v0_8-whitegrid')
sns.set(style="whitegrid")
warnings.filterwarnings('ignore')

# Set display options
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 50)
pd.set_option('display.width', 1000)
pd.set_option('display.precision', 2)

print("Libraries imported successfully!")


In [None]:
# Read the per-game stats CSV (path is relative to the notebook's working directory)
file_path = '../nba_player_stats_2023_24_per_game.csv'
df = pd.read_csv(file_path)

# Report the table dimensions; the row count is reported as the number of players
row_count, column_count = df.shape
print(f"Dataset shape: {df.shape}")
print(f"\nNumber of players: {row_count}")
print(f"Number of features: {column_count}")

# Preview the top of the table
print("\nFirst 5 rows of the dataset:")
display(df.head())


In [None]:
# Show each column's dtype, then how many nulls each column holds
schema_reports = (
    ("Data types:", df.dtypes),
    ("\nNull values count per column:", df.isnull().sum()),
)
for heading, report in schema_reports:
    print(heading)
    display(report)


In [None]:
# Clean the raw table and add derived metrics.
# Everything is done on a copy, so `df` keeps the raw values and this cell can
# be re-run on its own without compounding changes.
df_clean = df.copy()

# Columns that must be numeric
percentage_cols = ['FG%', '3P%', '2P%', 'eFG%', 'FT%']
numeric_cols = ['Age', 'G', 'GS', 'MP', 'FG', 'FGA', '3P', '3PA', '2P', '2PA',
                'FT', 'FTA', 'ORB', 'DRB', 'TRB', 'AST', 'STL', 'BLK', 'TOV', 'PF', 'PTS']

# Coerce rank, percentage and counting columns in one pass; values that cannot
# be parsed become NaN (errors='coerce') instead of raising.
for col in ['Rk', *percentage_cols, *numeric_cols]:
    df_clean[col] = pd.to_numeric(df_clean[col], errors='coerce')

# Check for duplicates
duplicate_count = df_clean.duplicated().sum()
print(f"Number of duplicate rows: {duplicate_count}")

if duplicate_count > 0:
    # Remove fully identical rows and renumber the index
    df_clean = df_clean.drop_duplicates().reset_index(drop=True)
    print("Duplicates removed.")

# Check for missing values after conversion
print("\nMissing values after conversion:")
display(df_clean.isnull().sum())

# Missing percentages are replaced by the column mean.
# NOTE(review): a missing percentage presumably means "no attempts"; mean-filling
# gives such rows an average shooting percentage -- confirm this is intended.
df_clean[percentage_cols] = df_clean[percentage_cols].fillna(df_clean[percentage_cols].mean())

# Missing box-score stats are treated as 0 (nothing recorded). Age, G, GS and MP
# are deliberately left as NaN: a zero there would be a wrong value.
stat_cols = [col for col in numeric_cols if col not in ('Age', 'G', 'GS', 'MP')]
df_clean[stat_cols] = df_clean[stat_cols].fillna(0)

print("\nCleaned data types:")
display(df_clean.dtypes)

# --- Derived metrics -------------------------------------------------------
# Zero denominators are mapped to NaN before dividing so the derived columns
# never contain +/-inf, which would distort later means and sorts.

# True Shooting Percentage: PTS / (2 * (FGA + 0.44 * FTA))
shooting_volume = 2 * (df_clean['FGA'] + 0.44 * df_clean['FTA'])
df_clean['TS%'] = df_clean['PTS'] / shooting_volume.replace(0, np.nan)

# Assist to Turnover Ratio; undefined (NaN) for rows with zero turnovers rather
# than an artificially huge ratio from a tiny stand-in denominator.
df_clean['AST/TOV'] = df_clean['AST'] / df_clean['TOV'].replace(0, np.nan)

# Per-36-minute versions of the main box-score stats
per_minute_cols = ['PTS', 'TRB', 'AST', 'STL', 'BLK', 'TOV']
minutes_played = df_clean['MP'].replace(0, np.nan)
for col in per_minute_cols:
    df_clean[f"{col}_per36"] = df_clean[col] * 36 / minutes_played

print("\nFirst 5 rows of cleaned data with derived metrics:")
display(df_clean.head())


In [None]:
# Descriptive statistics for the headline per-game columns
summary_columns = ['PTS', 'TRB', 'AST', 'STL', 'BLK', 'TOV', 'MP', 'FG%', '3P%', 'FT%']
print("Summary statistics for key metrics:")
display(df_clean[summary_columns].describe())

# Row counts per listed position
position_counts = df_clean['Pos'].value_counts()
print("\nDistribution of player positions:")
display(position_counts)

# Bar chart of the position counts
plt.figure(figsize=(10, 6))
position_ax = sns.countplot(x='Pos', data=df_clean, palette='viridis')
position_ax.set_title('Distribution of Player Positions', fontsize=16)
position_ax.set_xlabel('Position', fontsize=14)
position_ax.set_ylabel('Count', fontsize=14)
plt.xticks(fontsize=12)
plt.yticks(fontsize=12)
plt.tight_layout()
plt.show()

# Histogram (with density curve) of player ages
plt.figure(figsize=(12, 6))
age_ax = sns.histplot(df_clean['Age'], bins=20, kde=True, color='blue')
age_ax.set_title('Distribution of Player Ages', fontsize=16)
age_ax.set_xlabel('Age', fontsize=14)
age_ax.set_ylabel('Count', fontsize=14)
plt.xticks(fontsize=12)
plt.yticks(fontsize=12)
age_ax.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()

# Histogram of minutes per game, with the mean marked by a dashed red line
mean_minutes = df_clean['MP'].mean()
plt.figure(figsize=(12, 6))
minutes_ax = sns.histplot(df_clean['MP'], bins=30, kde=True, color='green')
minutes_ax.axvline(mean_minutes, color='red', linestyle='--', label=f'Mean: {mean_minutes:.2f}')
minutes_ax.set_title('Distribution of Minutes Played per Game', fontsize=16)
minutes_ax.set_xlabel('Minutes per Game', fontsize=14)
minutes_ax.set_ylabel('Count', fontsize=14)
minutes_ax.legend()
plt.tight_layout()
plt.show()


In [None]:
# Leaderboards for players with significant playing time:
# at least `min_minutes` minutes per game over at least `min_games` games.
min_minutes = 20
min_games = 40
df_filtered = df_clean[(df_clean['MP'] >= min_minutes) & (df_clean['G'] >= min_games)].copy()

print(f"Number of players with at least {min_minutes} minutes per game and {min_games} games: {len(df_filtered)}")


def _top_n(frame, sort_col, columns, n=10):
    """Return the `n` rows of `frame` with the largest `sort_col`, keeping only `columns`."""
    return frame.sort_values(by=sort_col, ascending=False).head(n)[columns]


# Top 10 scorers
print("\nTop 10 Scorers (Points per Game):")
top_scorers = _top_n(df_filtered, 'PTS', ['Player', 'Team', 'Pos', 'PTS', 'FG%', '3P%', 'FT%'])
display(top_scorers)

# Top 10 rebounders
print("\nTop 10 Rebounders (Rebounds per Game):")
top_rebounders = _top_n(df_filtered, 'TRB', ['Player', 'Team', 'Pos', 'TRB', 'ORB', 'DRB'])
display(top_rebounders)

# Top 10 assist leaders
print("\nTop 10 Assist Leaders (Assists per Game):")
top_assisters = _top_n(df_filtered, 'AST', ['Player', 'Team', 'Pos', 'AST', 'TOV', 'AST/TOV'])
display(top_assisters)

# Top 10 steal leaders
print("\nTop 10 Steal Leaders (Steals per Game):")
top_stealers = _top_n(df_filtered, 'STL', ['Player', 'Team', 'Pos', 'STL'])
display(top_stealers)

# Top 10 block leaders
print("\nTop 10 Block Leaders (Blocks per Game):")
top_blockers = _top_n(df_filtered, 'BLK', ['Player', 'Team', 'Pos', 'BLK'])
display(top_blockers)

# Top 10 3-point shooters (minimum 100 3PA)
# The table holds per-game figures (see the CSV file name and the "per Game"
# labels above), so comparing `3PA` itself with 100 would match no one. The
# threshold is applied to season attempts instead: attempts per game x games.
min_3pa = 100
season_3pa = df_clean['3PA'] * df_clean['G']
print("\nTop 10 3-Point Shooters (minimum 100 3-point attempts):")
top_3pt = _top_n(df_clean[season_3pa >= min_3pa], '3P%', ['Player', 'Team', 'Pos', '3P%', '3P', '3PA'])
display(top_3pt)

# Top 10 efficient scorers (TS%)
print("\nTop 10 Most Efficient Scorers (True Shooting %):")
top_ts = _top_n(df_filtered, 'TS%', ['Player', 'Team', 'Pos', 'TS%', 'PTS', 'FGA', 'FTA'])
display(top_ts)


In [None]:
# Pairwise correlations between the key statistics of the filtered players
key_stats = ['PTS', 'TRB', 'AST', 'STL', 'BLK', 'TOV', 'FG%', '3P%', 'FT%', 'MP', 'Age']
correlation = df_filtered[key_stats].corr()

# Heatmap of the lower triangle only (the matrix is symmetric).
# The mask is built from cell positions, not from the correlation values, so a
# coefficient that happens to be exactly 0 in the upper triangle is hidden too.
plt.figure(figsize=(12, 10))
mask = np.triu(np.ones_like(correlation, dtype=bool))
sns.heatmap(correlation, annot=True, fmt=".2f", cmap='coolwarm', mask=mask, vmin=-1, vmax=1,
            linewidths=0.5, cbar_kws={"shrink": 0.8})
plt.title('Correlation Matrix of Key NBA Statistics', fontsize=16)
plt.xticks(fontsize=12, rotation=45)
plt.yticks(fontsize=12)
plt.tight_layout()
plt.show()


def _finish_figure(title, xlabel, ylabel):
    """Apply the shared title, axis labels, light grid and layout, then show the current figure."""
    plt.title(title, fontsize=16)
    plt.xlabel(xlabel, fontsize=14)
    plt.ylabel(ylabel, fontsize=14)
    plt.grid(True, alpha=0.3)
    plt.tight_layout()
    plt.show()


# Let's look at some interesting relationships
# Points vs. Minutes
plt.figure(figsize=(10, 6))
sns.scatterplot(x='MP', y='PTS', data=df_filtered, alpha=0.7, hue='Pos', palette='viridis')
_finish_figure('Points per Game vs. Minutes Played', 'Minutes per Game', 'Points per Game')

# Field Goal % vs. 3-Point Attempts
plt.figure(figsize=(10, 6))
sns.scatterplot(x='3PA', y='FG%', data=df_filtered, alpha=0.7, hue='Pos', palette='viridis')
_finish_figure('Field Goal % vs. 3-Point Attempts per Game', '3-Point Attempts per Game', 'Field Goal Percentage')

# Age vs. Minutes Played
plt.figure(figsize=(10, 6))
sns.scatterplot(x='Age', y='MP', data=df_filtered, alpha=0.7)
_finish_figure('Minutes Played vs. Player Age', 'Age', 'Minutes per Game')

# Same relationship with a fitted regression line to show the trend
plt.figure(figsize=(10, 6))
sns.regplot(x='Age', y='MP', data=df_filtered, scatter_kws={'alpha': 0.5}, line_kws={'color': 'red'})
_finish_figure('Minutes Played vs. Player Age (with Trend Line)', 'Age', 'Minutes per Game')


In [None]:
# Mean of each key statistic per listed position (filtered players only)
position_metric_cols = ['PTS', 'TRB', 'AST', 'STL', 'BLK', 'TOV', 'FG%', '3P%', 'FT%', 'MP']
position_stats = df_filtered.groupby('Pos')[position_metric_cols].mean().reset_index()

print("Average statistics by position:")
display(position_stats)


def _bars_by_position(stat_names, figure_title, figsize):
    """Draw one vertically stacked bar chart per statistic from `position_stats`.

    Returns the figure and its array of axes.
    """
    figure, panels = plt.subplots(len(stat_names), 1, figsize=figsize)
    figure.suptitle(figure_title, fontsize=20, y=0.95)

    for panel, stat in zip(panels, stat_names):
        sns.barplot(x='Pos', y=stat, data=position_stats, ax=panel, palette='viridis')
        panel.set_title(f'Average {stat} by Position', fontsize=16)
        panel.set_xlabel('Position', fontsize=14)
        panel.set_ylabel(stat, fontsize=14)
        panel.grid(True, alpha=0.3)

    plt.tight_layout()
    # Leave room at the top for the figure-level title
    plt.subplots_adjust(top=0.92)
    plt.show()
    return figure, panels


# Counting statistics by position
stats_to_plot = ['PTS', 'TRB', 'AST', 'STL', 'BLK']
fig, axes = _bars_by_position(stats_to_plot, 'Key Statistics by Position', (12, 20))

# Shooting percentages by position
shooting_stats = ['FG%', '3P%', 'FT%']
fig, axes = _bars_by_position(shooting_stats, 'Shooting Percentages by Position', (12, 15))

# Radar chart comparing positions across the counting statistics
categories = ['PTS', 'TRB', 'AST', 'STL', 'BLK']

# Min-max normalise each category across positions so all spokes share a 0-1 scale
position_radar = position_stats.copy()
category_floor = position_radar[categories].min()
category_span = position_radar[categories].max() - category_floor
position_radar[categories] = (position_radar[categories] - category_floor) / category_span

positions = position_radar['Pos'].tolist()
values = position_radar[categories].values

# One spoke per category; the first angle is repeated so each outline closes
spoke_angles = np.linspace(0, 2*np.pi, len(categories), endpoint=False).tolist()
angles = spoke_angles + spoke_angles[:1]

fig, ax = plt.subplots(figsize=(10, 10), subplot_kw=dict(polar=True))

# Label each spoke with its category
plt.xticks(spoke_angles, categories, fontsize=14)

# One outlined, lightly filled polygon per position
for pos, position_values in zip(positions, values):
    outline = position_values.tolist()
    outline.append(outline[0])  # Close the loop
    ax.plot(angles, outline, linewidth=2, label=pos)
    ax.fill(angles, outline, alpha=0.1)

plt.title('Normalized Statistics by Position (Radar Chart)', fontsize=18)
plt.legend(loc='upper right', bbox_to_anchor=(0.1, 0.1), fontsize=12)

plt.grid(True)
plt.tight_layout()
plt.show()


In [None]:
# Simplified, home-grown versions of PER, Win Shares/48 and Box Plus/Minus,
# plus a combined 0-100 rating. These are NOT the official formulas; they are
# only meant for ranking players inside this notebook.
# The new columns are added to `df_filtered` in place.


def _min_max_scale(series):
    """Rescale `series` linearly onto [0, 1].

    A series whose values are all identical has no spread to scale by; it maps
    to all zeros instead of producing NaN from a 0/0 division.
    """
    lowest = series.min()
    spread = series.max() - lowest
    if spread == 0:
        return pd.Series(0.0, index=series.index)
    return (series - lowest) / spread


# Simplified PER: box-score credits minus missed shots and turnovers, per minute
df_filtered['PER'] = (
    df_filtered['PTS'] +
    df_filtered['TRB'] +
    df_filtered['AST'] +
    df_filtered['STL'] * 2 +
    df_filtered['BLK'] * 2 -
    df_filtered['TOV'] -
    (df_filtered['FGA'] - df_filtered['FG']) -          # missed field goals
    (df_filtered['FTA'] - df_filtered['FT']) * 0.5      # missed free throws, half weight
) / df_filtered['MP']

# Simplified Win Shares per 48 minutes: an equally weighted scoring term and
# defence/possession term, scaled by MP / 48.
league_avg_pts_per_poss = 1.12  # NBA average points per possession approximation
# NOTE(review): this denominator is 2*FGA + 0.44*FTA, which differs from the
# TS% column's 2*(FGA + 0.44*FTA). Left unchanged -- confirm which was intended.
scoring_term = (
    df_filtered['PTS'] / (2 * df_filtered['FGA'] + 0.44 * df_filtered['FTA'])
) / league_avg_pts_per_poss * 0.5
defense_term = (
    (df_filtered['STL'] + df_filtered['BLK'] * 0.65 + df_filtered['DRB'] * 0.7 +
     df_filtered['ORB'] * 0.3) - (df_filtered['PF'] * 0.5 + df_filtered['TOV'])
) / df_filtered['MP'] * 0.5
df_filtered['WS_48'] = (scoring_term + defense_term) * df_filtered['MP'] / 48

# Simplified BPM: relative distance from assumed baselines plus weighted
# steals and blocks, minus weighted turnovers
df_filtered['BPM'] = (
    (df_filtered['PTS'] - 20) / 20 +  # Points relative to an assumed 20 PPG baseline
    (df_filtered['TRB'] - 7) / 7 +    # Rebounds relative to an assumed 7 RPG baseline
    (df_filtered['AST'] - 5) / 5 +    # Assists relative to an assumed 5 APG baseline
    df_filtered['STL'] +              # Steals, unweighted
    df_filtered['BLK'] * 0.7 -        # Blocks (with weight)
    df_filtered['TOV'] / 2            # Turnovers (with weight)
)

# Display top players by our efficiency metrics
print("Top 15 Players by PER (Player Efficiency Rating):")
top_per = df_filtered.sort_values(by='PER', ascending=False).head(15)[['Player', 'Team', 'Pos', 'PER', 'PTS', 'TRB', 'AST', 'MP']]
display(top_per)

print("\nTop 15 Players by WS_48 (Win Shares per 48 minutes):")
top_ws = df_filtered.sort_values(by='WS_48', ascending=False).head(15)[['Player', 'Team', 'Pos', 'WS_48', 'PTS', 'TRB', 'AST', 'MP']]
display(top_ws)

print("\nTop 15 Players by BPM (Box Plus/Minus):")
top_bpm = df_filtered.sort_values(by='BPM', ascending=False).head(15)[['Player', 'Team', 'Pos', 'BPM', 'PTS', 'TRB', 'AST', 'MP']]
display(top_bpm)

# Visualize top players by PER
plt.figure(figsize=(12, 8))
sns.barplot(x='Player', y='PER', data=top_per, palette='viridis')
plt.title('Top 15 Players by PER', fontsize=16)
plt.xlabel('Player', fontsize=14)
plt.ylabel('PER', fontsize=14)
plt.xticks(rotation=45, ha='right', fontsize=12)
plt.tight_layout()
plt.show()

# Create a scatter plot of PTS vs. PER to see efficiency
plt.figure(figsize=(12, 8))
sns.scatterplot(x='PTS', y='PER', data=df_filtered, alpha=0.7, hue='Pos', palette='viridis')

# Label the ten highest-PER players next to their points
top_10_per = df_filtered.sort_values(by='PER', ascending=False).head(10)
for player_name, points, per_value in zip(top_10_per['Player'], top_10_per['PTS'], top_10_per['PER']):
    plt.annotate(player_name,
                 (points, per_value),
                 xytext=(5, 5),
                 textcoords='offset points',
                 fontsize=10,
                 fontweight='bold')

plt.title('Points per Game vs. Player Efficiency Rating', fontsize=16)
plt.xlabel('Points per Game', fontsize=14)
plt.ylabel('PER', fontsize=14)
plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()

# Combined rating: mean of the three min-max-scaled metrics, on a 0-100 scale
df_filtered['RATING'] = (
    _min_max_scale(df_filtered['PER']) +
    _min_max_scale(df_filtered['WS_48']) +
    _min_max_scale(df_filtered['BPM'])
) / 3 * 100  # Scale to 0-100

print("\nTop 15 Players by Combined Rating:")
top_rating = df_filtered.sort_values(by='RATING', ascending=False).head(15)[['Player', 'Team', 'Pos', 'RATING', 'PER', 'WS_48', 'BPM', 'PTS']]
display(top_rating)

# Visualize top players by combined rating
plt.figure(figsize=(12, 8))
ax = sns.barplot(x='Player', y='RATING', data=top_rating, palette='viridis')
plt.title('Top 15 Players by Combined Rating', fontsize=16)
plt.xlabel('Player', fontsize=14)
plt.ylabel('Rating (0-100)', fontsize=14)
plt.xticks(rotation=45, ha='right', fontsize=12)
plt.ylim(0, 100)
plt.tight_layout()

# Write each bar's value just above it
for bar in ax.patches:
    ax.annotate(f'{bar.get_height():.1f}',
                (bar.get_x() + bar.get_width() / 2., bar.get_height()),
                ha='center', va='bottom', fontsize=10, fontweight='bold')

plt.show()


In [None]:
# Per-team means of the filtered players' statistics
team_metric_cols = ['PTS', 'TRB', 'AST', 'STL', 'BLK', 'TOV', 'FG%', '3P%', 'FT%', 'PER', 'TS%']
team_stats = df_filtered.groupby('Team')[team_metric_cols].mean().reset_index()

# Same table ordered from highest to lowest average points
team_stats_sorted = team_stats.sort_values(by='PTS', ascending=False)

print("Average player statistics by team (sorted by points per game):")
display(team_stats_sorted)

# How many filtered players each team contributes
team_player_counts = df_filtered['Team'].value_counts().reset_index()
team_player_counts.columns = ['Team', 'PlayerCount']

# Attach the player counts to the team averages
team_stats_merged = team_stats.merge(team_player_counts, on='Team')

# Bar chart of average points by team
plt.figure(figsize=(14, 8))
scoring_ax = sns.barplot(x='Team', y='PTS', data=team_stats_sorted, palette='viridis')
scoring_ax.set_title('Average Points per Game by Team', fontsize=16)
scoring_ax.set_xlabel('Team', fontsize=14)
scoring_ax.set_ylabel('Points per Game', fontsize=14)
plt.xticks(rotation=90, fontsize=12)
plt.tight_layout()
plt.show()

# One stacked panel per shooting percentage, teams in scoring order
fig, axes = plt.subplots(3, 1, figsize=(14, 15))
fig.suptitle('Shooting Percentages by Team', fontsize=20, y=0.95)

shooting_panels = (
    ('FG%', 'Field Goal Percentage'),
    ('3P%', '3-Point Percentage'),
    ('FT%', 'Free Throw Percentage'),
)
for panel, (stat, panel_title) in zip(axes, shooting_panels):
    sns.barplot(x='Team', y=stat, data=team_stats_sorted, ax=panel, palette='viridis')
    panel.set_title(panel_title, fontsize=16)
    panel.set_xlabel('Team', fontsize=14)
    panel.set_ylabel(stat, fontsize=14)
    panel.set_xticklabels(panel.get_xticklabels(), rotation=90, fontsize=12)

plt.tight_layout()
plt.subplots_adjust(top=0.92)
plt.show()

# Points vs. assists per team; marker size encodes the team's mean PER
plt.figure(figsize=(12, 8))
offense_ax = sns.scatterplot(x='PTS', y='AST', size='PER', hue='Team', data=team_stats_merged, sizes=(50, 500), alpha=0.7)
offense_ax.set_title('Team Offensive Performance (Points vs. Assists)', fontsize=16)
offense_ax.set_xlabel('Average Points per Game', fontsize=14)
offense_ax.set_ylabel('Average Assists per Game', fontsize=14)
offense_ax.grid(True, alpha=0.3)
offense_ax.legend(bbox_to_anchor=(1.05, 1), loc='upper left', fontsize=10)
plt.tight_layout()
plt.show()

# Best and worst team per category, stored as {category: (team, value)}
categories = ['PTS', 'TRB', 'AST', 'STL', 'BLK', 'FG%', '3P%', 'FT%', 'PER', 'TS%']


def _team_at(row_label, cat):
    """Return (team name, value of `cat`) for the `team_stats` row with index `row_label`."""
    team_row = team_stats.loc[row_label]
    return team_row['Team'], team_row[cat]


best_teams = {cat: _team_at(team_stats[cat].idxmax(), cat) for cat in categories}
worst_teams = {cat: _team_at(team_stats[cat].idxmin(), cat) for cat in categories}

print("\nBest Team in Each Category:")
for cat, (team, value) in best_teams.items():
    print(f"{cat}: {team} ({value:.3f})")

print("\nWorst Team in Each Category:")
for cat, (team, value) in worst_teams.items():
    print(f"{cat}: {team} ({value:.3f})")


In [None]:
# Create a summary markdown to display our conclusions
# Markdown renders the formatted text as rich output via display() at the end of this cell.
from IPython.display import Markdown

# Report template. Every {name} placeholder is filled by the summary_md.format(...)
# call further down this cell; format specs such as :.1f / :.3f set the number of
# decimals shown. The template is a runtime string: editing it changes the report.
summary_md = """
# NBA 2023-24 Season Analysis Summary

## Dataset Overview
- Analyzed **{total_players}** NBA players from the 2023-24 season
- Focused on **{filtered_players}** players with significant playing time (≥{min_minutes} minutes per game, ≥{min_games} games)

## Top Performers

### Scoring Leaders
1. {top_scorer} ({top_scorer_team}) - {top_scorer_pts:.1f} PPG
2. {second_scorer} ({second_scorer_team}) - {second_scorer_pts:.1f} PPG
3. {third_scorer} ({third_scorer_team}) - {third_scorer_pts:.1f} PPG

### Most Efficient Players (PER)
1. {top_per_player} ({top_per_team}) - {top_per_value:.2f}
2. {second_per_player} ({second_per_team}) - {second_per_value:.2f}
3. {third_per_player} ({third_per_team}) - {third_per_value:.2f}

### Overall Player Rating (Combined Metrics)
1. {top_rating_player} ({top_rating_team}) - {top_rating_value:.1f}
2. {second_rating_player} ({second_rating_team}) - {second_rating_value:.1f}
3. {third_rating_player} ({third_rating_team}) - {third_rating_value:.1f}

## Position Analysis
- **Centers** excel in rebounds ({center_reb:.1f} RPG) and blocks ({center_blk:.1f} BPG)
- **Point Guards** lead in assists ({pg_ast:.1f} APG)
- **Shooting Guards** have high 3-point attempt rates
- **Power Forwards** balance scoring and rebounding

## Team Performance
- Highest scoring team: **{best_pts_team}** ({best_pts_value:.1f} PPG)
- Best shooting team (FG%): **{best_fg_team}** ({best_fg_value:.3f})
- Best 3-point shooting team: **{best_3p_team}** ({best_3p_value:.3f})

## Key Insights
1. Strong correlation between minutes played and points scored (r = {pts_min_corr:.2f})
2. Player efficiency generally peaks in the age range of {peak_age_min}-{peak_age_max}
3. Balanced teams with both scoring and playmaking tend to have better overall player efficiency
4. The league continues to emphasize 3-point shooting across all positions

## Statistical Highlights
- League average points per game: {avg_pts:.1f}
- League average rebounds per game: {avg_reb:.1f}
- League average assists per game: {avg_ast:.1f}
- League average field goal percentage: {avg_fg:.3f}
- League average 3-point percentage: {avg_3p:.3f}
- League average free throw percentage: {avg_ft:.3f}
"""

# Collect the values for the summary template, then render it as Markdown.
total_players = len(df_clean)
filtered_players = len(df_filtered)


def _podium(ranked, value_col):
    """Return (player, team, value) tuples for the first three rows of `ranked`.

    `ranked` must already be sorted best-first and contain 'Player', 'Team'
    and `value_col`.
    """
    top_three = ranked.head(3)[['Player', 'Team', value_col]]
    return list(top_three.itertuples(index=False, name=None))


def _as_whole_number(value):
    """Return `value` as an int when it is integer-valued (e.g. 27.0 -> 27), else unchanged."""
    return int(value) if float(value).is_integer() else value


# Top scorers
(
    (top_scorer, top_scorer_team, top_scorer_pts),
    (second_scorer, second_scorer_team, second_scorer_pts),
    (third_scorer, third_scorer_team, third_scorer_pts),
) = _podium(top_scorers, 'PTS')

# Top PER
(
    (top_per_player, top_per_team, top_per_value),
    (second_per_player, second_per_team, second_per_value),
    (third_per_player, third_per_team, third_per_value),
) = _podium(top_per, 'PER')

# Top Rating
(
    (top_rating_player, top_rating_team, top_rating_value),
    (second_rating_player, second_rating_team, second_rating_value),
    (third_rating_player, third_rating_team, third_rating_value),
) = _podium(top_rating, 'RATING')

# Position stats (raises KeyError if 'C' or 'PG' is not among the 'Pos' values)
position_avg = df_filtered.groupby('Pos')[['TRB', 'AST', 'BLK']].mean()
center_reb = position_avg.loc['C', 'TRB']
center_blk = position_avg.loc['C', 'BLK']
pg_ast = position_avg.loc['PG', 'AST']

# Team stats
best_pts_team, best_pts_value = best_teams['PTS']
best_fg_team, best_fg_value = best_teams['FG%']
best_3p_team, best_3p_value = best_teams['3P%']

# Correlation between points and minutes
pts_min_corr = df_filtered[['PTS', 'MP']].corr().iloc[0, 1]

# Age analysis: the two ages with the highest mean PER, reported as a range.
# They come back in PER order, not age order, so the bounds are taken with
# min/max to keep the range from printing reversed (e.g. "31-24").
age_stats = df_filtered.groupby('Age')['PER'].mean()
sorted_age_stats = age_stats.sort_values(ascending=False)
peak_ages = sorted_age_stats.index[:2]
peak_age_min = _as_whole_number(peak_ages.min())
peak_age_max = _as_whole_number(peak_ages.max())

# "League" averages are means over the filtered players, not the full table
avg_pts = df_filtered['PTS'].mean()
avg_reb = df_filtered['TRB'].mean()
avg_ast = df_filtered['AST'].mean()
avg_fg = df_filtered['FG%'].mean()
avg_3p = df_filtered['3P%'].mean()
avg_ft = df_filtered['FT%'].mean()

# Format and display the summary
formatted_summary = summary_md.format(
    total_players=total_players, filtered_players=filtered_players,
    min_minutes=min_minutes, min_games=min_games,
    top_scorer=top_scorer, top_scorer_team=top_scorer_team, top_scorer_pts=top_scorer_pts,
    second_scorer=second_scorer, second_scorer_team=second_scorer_team, second_scorer_pts=second_scorer_pts,
    third_scorer=third_scorer, third_scorer_team=third_scorer_team, third_scorer_pts=third_scorer_pts,
    top_per_player=top_per_player, top_per_team=top_per_team, top_per_value=top_per_value,
    second_per_player=second_per_player, second_per_team=second_per_team, second_per_value=second_per_value,
    third_per_player=third_per_player, third_per_team=third_per_team, third_per_value=third_per_value,
    top_rating_player=top_rating_player, top_rating_team=top_rating_team, top_rating_value=top_rating_value,
    second_rating_player=second_rating_player, second_rating_team=second_rating_team, second_rating_value=second_rating_value,
    third_rating_player=third_rating_player, third_rating_team=third_rating_team, third_rating_value=third_rating_value,
    center_reb=center_reb, center_blk=center_blk, pg_ast=pg_ast,
    best_pts_team=best_pts_team, best_pts_value=best_pts_value,
    best_fg_team=best_fg_team, best_fg_value=best_fg_value,
    best_3p_team=best_3p_team, best_3p_value=best_3p_value,
    pts_min_corr=pts_min_corr,
    peak_age_min=peak_age_min, peak_age_max=peak_age_max,
    avg_pts=avg_pts, avg_reb=avg_reb, avg_ast=avg_ast,
    avg_fg=avg_fg, avg_3p=avg_3p, avg_ft=avg_ft,
)

display(Markdown(formatted_summary))
