In [None]:
import pandas as pd
import numpy as np 
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import plotly.express as px
from sklearn.manifold import TSNE 

### Playoff Stats

In [26]:
stat_df = pd.read_csv('playoff_stats.csv')
stat_df.columns

Index(['RANK', 'NAME', 'TEAM', 'POS', 'AGE', 'GP', 'MPG', 'USG%', 'TO%', 'FTA',
       'FT%', '2PA', '2P%', '3PA', '3P%', 'eFG%', 'TS%', 'PPG', 'RPG', 'APG',
       'SPG', 'BPG', 'TPG', 'P+R', 'P+A', 'P+R+A', 'VI', 'ORtg', 'DRtg'],
      dtype='object')

In [None]:
# Filter the df to contain only players that play significant minutes
stat_df = stat_df[stat_df['MPG'] > 10]

In [28]:
names = stat_df['NAME']
position = stat_df['POS']
X = stat_df.drop(['RANK', 'TEAM','NAME'], axis=1)
X


Unnamed: 0,POS,AGE,GP,MPG,USG%,TO%,FTA,FT%,2PA,2P%,...,APG,SPG,BPG,TPG,P+R,P+A,P+R+A,VI,ORtg,DRtg
0,C,30.2,6,41.4,35.7,15.8,78,0.859,94,0.489,...,5.7,1.2,1.5,4.2,43.8,38.7,49.5,12.2,117.1,108.0
1,G,27.8,13,39.8,36.4,9.3,120,0.775,256,0.488,...,7.5,0.8,0.2,2.7,35.7,39.8,43.2,9.3,114.8,114.7
2,G,33.9,4,39.1,31.4,10.0,38,0.974,33,0.424,...,5.0,1.0,0.0,2.3,34.5,36.3,39.5,8.2,127.6,115.7
3,G,25.9,10,39.9,32.3,8.9,81,0.790,187,0.508,...,6.4,1.3,1.7,2.2,37.4,36.6,43.8,11.2,118.3,106.9
4,G,23.6,6,44.6,28.1,8.6,28,0.893,83,0.530,...,6.8,0.8,0.3,2.2,35.0,36.7,41.8,9.1,120.9,113.3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
147,F,30.1,5,11.9,17.2,23.8,0,0.000,3,0.667,...,1.2,0.4,0.0,1.0,3.6,3.8,4.8,4.9,74.6,105.5
149,G,22.6,5,11.9,10.8,21.4,0,0.000,5,0.600,...,2.0,1.0,0.2,0.6,4.6,4.4,6.6,7.4,123.9,98.8
150,F,33.9,4,10.5,14.1,0.0,2,1.000,5,0.400,...,0.5,0.0,0.5,0.0,3.8,2.8,4.3,4.6,93.4,111.7
159,G,26.2,4,15.4,17.1,16.7,0,0.000,6,0.333,...,2.3,1.3,0.3,1.0,2.8,4.0,5.0,4.2,53.7,110.2


In [29]:
# preprocessing for position (categorical)
X.POS = pd.factorize(X['POS'])[0]

In [30]:
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), X.columns),
        ('cat', OneHotEncoder(), ['POS'])
    ]
)

X_processed = preprocessor.fit_transform(X)

kmeans = KMeans(n_clusters=5, random_state=42)
stat_df['cluster'] = kmeans.fit_predict(X_processed)

# Optionally, reduce dimensions for visualization
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X_processed)

# Add the PCA components to the DataFrame for plotting
stat_df['PCA1'] = X_pca[:, 0]
stat_df['PCA2'] = X_pca[:, 1]

fig = px.scatter(
    stat_df,
    x='PCA1',
    y='PCA2',
    color='cluster',
    hover_name=stat_df['NAME'],  # Assuming player names are the index, otherwise use df['player_name']
    title='NBA Players Clustering',
    labels={'PCA1': 'PCA Component 1', 'PCA2': 'PCA Component 2'},
)

fig.update_layout(
    width=800,  # Width of the plot in pixels
    height=600  # Height of the plot in pixels
)
# Show the plot
fig.show()





KMeans is known to have a memory leak on Windows with MKL, when there are less chunks than available threads. You can avoid it by setting the environment variable OMP_NUM_THREADS=1.



In [31]:
import plotly.express as px

def plot_player_highlight(df, player_name):
    """
    Generates an interactive scatter plot with a specific player highlighted.
    
    Parameters:
    df (pd.DataFrame): DataFrame containing the player data, including PCA components and cluster labels.
    player_name (str): The name of the player to highlight.
    
    Returns:
    plotly.graph_objects.Figure: The generated plot with the highlighted player.
    """
    
    # Check if the player_name exists in the DataFrame
    if not df['NAME'].str.contains(player_name).any():
        raise ValueError(f"Player '{player_name}' not found in the DataFrame index.")
    
    # Create a column to determine the size of each point (increase size for the highlighted player)
    df['point_size'] = df['NAME'].map(lambda x: 15 if x == player_name else 1)
    
    # Create a column to determine the color of the point (distinct color for the highlighted player)
    df['highlight_color'] = df['NAME'].map(lambda x: 'red' if x == player_name else 'blue')

    # Plot using Plotly
    fig = px.scatter(
        df,
        x='PCA1',
        y='PCA2',
        color='cluster',
        hover_name=df['NAME'],  # Assuming player names are the index, otherwise use df['player_name']
        size='point_size',  # Size of points, highlight player will be larger
        color_discrete_sequence=df['highlight_color'],  # Custom colors
        title=f'NBA Players Clustering (Highlight: {player_name})',
        labels={'PCA1': 'PCA Component 1', 'PCA2': 'PCA Component 2'},
    )

    fig.update_layout(
    width=800,  # Width of the plot in pixels
    height=600  # Height of the plot in pixels
    )

    # Show the plot
    fig.show()
    print(stat_df['point_size'])

# Example usage
# plot_player_highlight(df, 'LeBron James')

plot_player_highlight(stat_df, 'LeBron James')

0      1
1      1
2      1
3      1
4      1
      ..
147    1
149    1
150    1
159    1
167    1
Name: point_size, Length: 143, dtype: int64


### Regular Season Stats

In [32]:
reg_stats = pd.read_csv("reg_stats.csv")
player_count = reg_stats.Player.value_counts() 

dupes = reg_stats[reg_stats['Player'].isin(player_count[player_count > 1].index)]
dupes = dupes[dupes['Tm'] == 'TOT']

reg_stats = pd.concat([reg_stats[~reg_stats['Player'].isin(dupes.Player)], dupes], ignore_index=True)
reg_stats = reg_stats.dropna()
reg_stats.head()

Unnamed: 0,Player,Pos,Age,Tm,G,GS,MP,FG,FGA,FG%,...,FT%,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS
0,Bam Adebayo,C,26,MIA,23,23,34.1,8.0,15.7,0.506,...,0.777,2.1,8.3,10.4,4.0,1.1,1.0,2.7,2.4,22.0
1,Ochai Agbaji,SG,23,UTA,34,10,21.1,2.6,5.7,0.451,...,0.714,0.8,1.7,2.5,1.0,0.6,0.5,0.7,1.5,6.6
2,Santi Aldama,PF,23,MEM,25,5,24.0,4.2,9.5,0.443,...,0.59,1.3,4.0,5.4,1.8,0.6,0.7,1.0,1.6,11.0
3,Nickeil Alexander-Walker,SG,25,MIN,32,11,22.6,2.4,5.8,0.418,...,0.647,0.4,1.3,1.8,2.4,0.9,0.6,1.0,1.9,6.6
4,Grayson Allen,SG,28,PHO,28,28,33.2,4.5,8.8,0.506,...,0.907,0.9,3.5,4.3,2.8,1.0,0.6,1.4,2.2,13.1


In [33]:
names = reg_stats['Player']
position = reg_stats['Pos']
X = reg_stats.drop(['Player', 'Tm'], axis=1)

# preprocessing for position (categorical)
X.Pos = pd.factorize(X['Pos'])[0]
X

Unnamed: 0,Pos,Age,G,GS,MP,FG,FGA,FG%,3P,3PA,...,FT%,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS
0,0,26,23,23,34.1,8.0,15.7,0.506,0.0,0.3,...,0.777,2.1,8.3,10.4,4.0,1.1,1.0,2.7,2.4,22.0
1,1,23,34,10,21.1,2.6,5.7,0.451,1.2,3.3,...,0.714,0.8,1.7,2.5,1.0,0.6,0.5,0.7,1.5,6.6
2,2,23,25,5,24.0,4.2,9.5,0.443,1.6,4.8,...,0.590,1.3,4.0,5.4,1.8,0.6,0.7,1.0,1.6,11.0
3,1,25,32,11,22.6,2.4,5.8,0.418,1.4,3.8,...,0.647,0.4,1.3,1.8,2.4,0.9,0.6,1.0,1.9,6.6
4,1,28,28,28,33.2,4.5,8.8,0.506,2.4,5.3,...,0.907,0.9,3.5,4.3,2.8,1.0,0.6,1.4,2.2,13.1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
512,6,33,29,6,16.8,1.5,3.4,0.439,0.7,1.9,...,0.846,1.1,2.3,3.3,0.8,1.3,0.6,0.4,1.9,4.4
513,4,23,20,0,6.4,0.8,1.6,0.484,0.1,0.4,...,0.500,0.3,0.6,0.9,0.5,0.5,0.1,0.3,0.9,1.8
515,7,24,31,1,24.1,5.1,11.2,0.451,2.1,5.4,...,0.866,0.2,2.5,2.7,2.5,0.6,0.1,1.0,1.5,15.0
516,0,31,22,0,16.5,2.8,5.4,0.517,0.5,1.0,...,0.742,1.5,1.9,3.4,0.7,0.4,0.8,0.7,2.4,7.0


In [None]:
reg_preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), X.columns),
        ('cat', OneHotEncoder(), ['Pos'])
    ]
)


X_processed = reg_preprocessor.fit_transform(X)

kmeans = KMeans(n_clusters=4, random_state=42)
reg_stats['cluster'] = kmeans.fit_predict(X_processed)

# Reduce dimensions for visualization using t-SNE
tsne = TSNE(n_components=2, random_state=42) 
X_tsne = tsne.fit_transform(X_processed) 

# Add the t-SNE components to the DataFrame for plotting
reg_stats['TSNE1'] = X_tsne[:, 0]  
reg_stats['TSNE2'] = X_tsne[:, 1]

# Create a scatter plot using Plotly
fig = px.scatter(
    reg_stats,
    x='TSNE1',
    y='TSNE2',
    color='cluster',
    hover_name=reg_stats['Player'], 
    title='NBA Players Clustering (t-SNE)',
    labels={'TSNE1': 't-SNE Component 1', 'TSNE2': 't-SNE Component 2'},
)

# Update the layout for the plot dimensions
fig.update_layout(
    width=800,  # Width of the plot in pixels
    height=600  # Height of the plot in pixels
)

# Show the plot
fig.show()





KMeans is known to have a memory leak on Windows with MKL, when there are less chunks than available threads. You can avoid it by setting the environment variable OMP_NUM_THREADS=2.



In [None]:
def plot_player_highlight(df, player_name):
    """
    Generates an interactive scatter plot with a specific player highlighted.
    
    Parameters:
    df (pd.DataFrame): DataFrame containing the player data, including PCA components and cluster labels.
    player_name (str): The name of the player to highlight.
    
    Returns:
    plotly.graph_objects.Figure: The generated plot with the highlighted player.
    """
    
    # Check if the player_name exists in the DataFrame
    if not df['Player'].str.contains(player_name).any():
        raise ValueError(f"Player '{player_name}' not found in the DataFrame index.")
    
    # increase size for the highlighted player
    df['point_size'] = df['Player'].map(lambda x: 15 if x == player_name else 1)
    
    # distinct color for the highlighted player
    df['highlight_color'] = df['Player'].map(lambda x: 'red' if x == player_name else 'blue')

    # Plot using Plotly
    fig = px.scatter(
        df,
        x='TSNE1',
        y='TSNE2',
        color='cluster',
        hover_name=df['Player'], 
        size='point_size', 
        color_discrete_sequence=df['highlight_color'],  # Custom colors
        title=f'NBA Players Clustering (Highlight: {player_name})',
        labels={'TSNE1': 'TSNE Component 1', 'TSNE2': 'TSNE Component 2'},
    )

    fig.update_layout(
    width=800,  # Width of the plot in pixels
    height=600  # Height of the plot in pixels
    )

    # Show the plot
    fig.show()

# Example usage
# plot_player_highlight(df, 'LeBron James')

plot_player_highlight(reg_stats, 'Jamal Murray')

In [37]:
reg_stats.columns

Index(['Player', 'Pos', 'Age', 'Tm', 'G', 'GS', 'MP', 'FG', 'FGA', 'FG%', '3P',
       '3PA', '3P%', '2P', '2PA', '2P%', 'eFG%', 'FT', 'FTA', 'FT%', 'ORB',
       'DRB', 'TRB', 'AST', 'STL', 'BLK', 'TOV', 'PF', 'PTS', 'cluster',
       'TSNE1', 'TSNE2', 'point_size', 'highlight_color'],
      dtype='object')