# Data Exploration
The following dataset is with repsect to NBA Shots. Here we're ensuring that the data is cleaned up which allows us to run analysis on it easily

In [64]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Load data
df = pd.read_csv('NBA_2004_Shots.csv')

# Display few rows
print(df.head())

   SEASON_1 SEASON_2     TEAM_ID               TEAM_NAME  PLAYER_ID  \
0      2004  2003-04  1610612747      Los Angeles Lakers        977   
1      2004  2003-04  1610612757  Portland Trail Blazers        757   
2      2004  2003-04  1610612747      Los Angeles Lakers        977   
3      2004  2003-04  1610612757  Portland Trail Blazers        757   
4      2004  2003-04  1610612757  Portland Trail Blazers        757   

        PLAYER_NAME POSITION_GROUP POSITION   GAME_DATE   GAME_ID  ...  \
0       Kobe Bryant              G       SG  04-14-2004  20301187  ...   
1  Damon Stoudamire              G       PG  04-14-2004  20301187  ...   
2       Kobe Bryant              G       SG  04-14-2004  20301187  ...   
3  Damon Stoudamire              G       PG  04-14-2004  20301187  ...   
4  Damon Stoudamire              G       PG  04-14-2004  20301187  ...   

          BASIC_ZONE         ZONE_NAME ZONE_ABB       ZONE_RANGE LOC_X  LOC_Y  \
0  Above the Break 3  Left Side Center       LC

# Removing unecessary data points
Here we intend to remove the following datasets with the following reasons:

1. Season_2, Team_ID, Team_Name: These provide context but don't directly influence the decision to take a shot
2. Game_Date, Game_ID: The specific games and dates are less relevant unless analyzing trends or specific matchups
3. Home_Team, Away_Team: No way to match team information with abbreviated team data types
4. Quarter, mins_left, secs_left: can be condensed into time passed

# Combining columns and condensing them into relevant information
1. Condensing quarter, mins left and secs left into time passed

In [65]:
# convert the quarters, mins and secs columns into a new column called time_passed in seconds ((quarter-1)*10*60 + mins*60 + secs)
df['TIME_PASSED'] = (df['QUARTER']-1)*10*60 + df['MINS_LEFT']*60 + df['SECS_LEFT']

# print the first few rows of the dataframe
print(df.head())
df = df.drop(['SEASON_2', 'TEAM_ID', 'TEAM_NAME', 'GAME_DATE', 'GAME_ID', 'HOME_TEAM', 'AWAY_TEAM', 'EVENT_TYPE', 'QUARTER', 'MINS_LEFT', 'SECS_LEFT'], axis=1)

# print the remaining columns
print(df.columns)

   SEASON_1 SEASON_2     TEAM_ID               TEAM_NAME  PLAYER_ID  \
0      2004  2003-04  1610612747      Los Angeles Lakers        977   
1      2004  2003-04  1610612757  Portland Trail Blazers        757   
2      2004  2003-04  1610612747      Los Angeles Lakers        977   
3      2004  2003-04  1610612757  Portland Trail Blazers        757   
4      2004  2003-04  1610612757  Portland Trail Blazers        757   

        PLAYER_NAME POSITION_GROUP POSITION   GAME_DATE   GAME_ID  ...  \
0       Kobe Bryant              G       SG  04-14-2004  20301187  ...   
1  Damon Stoudamire              G       PG  04-14-2004  20301187  ...   
2       Kobe Bryant              G       SG  04-14-2004  20301187  ...   
3  Damon Stoudamire              G       PG  04-14-2004  20301187  ...   
4  Damon Stoudamire              G       PG  04-14-2004  20301187  ...   

          ZONE_NAME ZONE_ABB       ZONE_RANGE  LOC_X  LOC_Y SHOT_DISTANCE  \
0  Left Side Center       LC          24+ ft.   20.

In [66]:
# Column visualisation 
print("POSITION_GROUP: " + str(df['POSITION_GROUP'].nunique()))
print("POSITION: " + str(df['POSITION'].nunique()))
print("ACTION_TYPE: " + str(df['ACTION_TYPE'].nunique()))
print("SHOT_TYPE: " + str(df['SHOT_TYPE'].nunique()))
print("BASIC_ZONE: " + str(df['BASIC_ZONE'].nunique()))
print("ZONE_NAME: " + str(df['ZONE_NAME'].nunique()))
print("ZONE_ABB: " + str(df['ZONE_ABB'].nunique()))
print("ZONE_RANGE: " + str(df['ZONE_RANGE'].nunique()))

POSITION_GROUP: 3
POSITION: 12
ACTION_TYPE: 29
SHOT_TYPE: 2
BASIC_ZONE: 7
ZONE_NAME: 6
ZONE_ABB: 6
ZONE_RANGE: 5


In [67]:
# print columns
print(df.columns)

training_df = df.drop(['SEASON_1', 'PLAYER_NAME','ZONE_NAME', 'LOC_X','LOC_Y'], axis = 1)

Index(['SEASON_1', 'PLAYER_ID', 'PLAYER_NAME', 'POSITION_GROUP', 'POSITION',
       'SHOT_MADE', 'ACTION_TYPE', 'SHOT_TYPE', 'BASIC_ZONE', 'ZONE_NAME',
       'ZONE_ABB', 'ZONE_RANGE', 'LOC_X', 'LOC_Y', 'SHOT_DISTANCE',
       'TIME_PASSED'],
      dtype='object')


In [68]:
for column in training_df.columns:
    if column == 'PLAYER_ID' or column == 'TIME_PASSED':
        continue
    else:
        print(column + ": " + str(training_df[column].unique()))

POSITION_GROUP: ['G' 'C' 'F']
POSITION: ['SG' 'PG' 'C' 'PF' 'SF' 'SF-SG' 'C-PF' 'SG-SF' 'PG-SF' 'SG-PG' 'PF-C'
 'PF-SF']
SHOT_MADE: [ True False]
ACTION_TYPE: ['Jump Shot' 'Driving Layup Shot' 'Dunk Shot' 'Alley Oop Dunk Shot'
 'Fadeaway Jump Shot' 'Driving Finger Roll Shot' 'Layup Shot' 'Tip Shot'
 'Slam Dunk Shot' 'Running Hook Shot' 'Turnaround Jump Shot'
 'Running Layup Shot' 'Jump Bank Shot' 'Running Jump Shot'
 'Jump Hook Shot' 'Reverse Layup Shot' 'Driving Dunk Shot' 'Hook Shot'
 'Reverse Dunk Shot' 'Running Finger Roll Shot' 'Turnaround Hook Shot'
 'Alley Oop Layup shot' 'Follow Up Dunk Shot' 'Driving Hook Shot'
 'Running Dunk Shot' 'Finger Roll Shot' 'Hook Bank Shot'
 'Turnaround Finger Roll Shot' 'Running Tip Shot']
SHOT_TYPE: ['3PT Field Goal' '2PT Field Goal']
BASIC_ZONE: ['Above the Break 3' 'Restricted Area' 'Mid-Range' 'Left Corner 3'
 'In The Paint (Non-RA)' 'Right Corner 3' 'Backcourt']
ZONE_ABB: ['LC' 'C' 'L' 'R' 'RC' 'BC']
ZONE_RANGE: ['24+ ft.' 'Less Than 8 ft.' '16

In [69]:
# Drop ZONE_ABB
training_df = training_df.drop(['ZONE_ABB'], axis = 1)

# size of df
print(training_df.shape)

(189803, 10)


# Pre Process data for KMEans and DBSCAN 
These two methods are dependent on distance metrics so we need to ensure that the following categorical variables (POSITION_TYPE, ACTION_TYPE, BASIC_ZONE, ZONE_RANGE) are encoded using one-hot encoding

In [70]:
from sklearn.cluster import MiniBatchKMeans, DBSCAN
from sklearn.mixture import BayesianGaussianMixture
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import silhouette_score, davies_bouldin_score
import time


le_position_group = LabelEncoder()
training_df['POSITION_GROUP'] = le_position_group.fit_transform(training_df['POSITION_GROUP'])

le_position = LabelEncoder()
training_df['POSITION'] = le_position.fit_transform(training_df['POSITION'])

le_action_type = LabelEncoder()
training_df['ACTION_TYPE'] = le_action_type.fit_transform(training_df['ACTION_TYPE'])

le_shot_type = LabelEncoder()
training_df['SHOT_TYPE'] = le_shot_type.fit_transform(training_df['SHOT_TYPE'])

# Define features for clustering (excluding SHOT_MADE as it's a target feature)
features = ['TIME_PASSED', 'POSITION_GROUP', 'POSITION', 'ACTION_TYPE', 'SHOT_TYPE', 'SHOT_DISTANCE']
X = training_df[features]

# Normalize the data using StandardScaler
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

In [71]:
# Step 2: Apply MiniBatch KMeans for large datasets
start_time = time.time()
kmeans = MiniBatchKMeans(n_clusters=5, batch_size=10000, random_state=42)
kmeans_labels = kmeans.fit_predict(X_scaled)
print(f"MiniBatch KMeans finished in {time.time() - start_time} seconds.")

# Step 3: Apply DBSCAN clustering (might require tuning of eps and min_samples)
start_time = time.time()
dbscan = DBSCAN(eps=0.5, min_samples=5, n_jobs=-1)
dbscan_labels = dbscan.fit_predict(X_scaled)
print(f"DBSCAN finished in {time.time() - start_time} seconds.")

# Step 4: Apply Bayesian Gaussian Mixture clustering
start_time = time.time()
bayes = BayesianGaussianMixture(n_components=5, random_state=42)
bayes_labels = bayes.fit_predict(X_scaled)
print(f"Bayesian Gaussian Mixture finished in {time.time() - start_time} seconds.")

# Add cluster labels to the original DataFrame for analysis
training_df['KMeans_Cluster'] = kmeans_labels
training_df['DBSCAN_Cluster'] = dbscan_labels
training_df['Bayes_Cluster'] = bayes_labels


  super()._check_params_vs_input(X, default_n_init=3)


MiniBatch KMeans finished in 0.9905319213867188 seconds.
DBSCAN finished in 19.687666177749634 seconds.
Bayesian Gaussian Mixture finished in 3.7150752544403076 seconds.


In [72]:

# Step 5: Post-clustering analysis - How representative are the clusters of SHOT_MADE?

# KMeans Clustering: Calculate the mean success rate (SHOT_MADE) per cluster
kmeans_analysis = training_df.groupby('KMeans_Cluster')['SHOT_MADE'].mean()
print("KMeans Cluster - SHOT_MADE Success Rate:")
print(kmeans_analysis)

# DBSCAN Clustering: Calculate the mean success rate (SHOT_MADE) per cluster
dbscan_analysis = training_df.groupby('DBSCAN_Cluster')['SHOT_MADE'].mean()
print("\nDBSCAN Cluster - SHOT_MADE Success Rate:")
print(dbscan_analysis)

# Bayesian Clustering: Calculate the mean success rate (SHOT_MADE) per cluster
bayes_analysis = training_df.groupby('Bayes_Cluster')['SHOT_MADE'].mean()
print("\nBayes Cluster - SHOT_MADE Success Rate:")
print(bayes_analysis)

KMeans Cluster - SHOT_MADE Success Rate:
KMeans_Cluster
0    0.415255
1    0.375710
2    0.347139
3    0.456486
4    0.767366
Name: SHOT_MADE, dtype: float64

DBSCAN Cluster - SHOT_MADE Success Rate:
DBSCAN_Cluster
-1     0.230216
 0     0.350396
 1     0.441557
 2     0.344272
 3     0.471094
 4     0.361386
 5     0.443597
 6     0.457964
 7     0.483202
 8     0.336991
 9     0.284895
 10    0.000000
 11    0.000000
 12    0.200000
 13    0.000000
 14    0.222222
 15    0.000000
 16    0.500000
 17    0.000000
 18    0.500000
 19    0.000000
 20    0.000000
Name: SHOT_MADE, dtype: float64

Bayes Cluster - SHOT_MADE Success Rate:
Bayes_Cluster
0    0.362297
1    0.404913
2    0.620852
3    0.347167
4    0.748569
Name: SHOT_MADE, dtype: float64


In [73]:
# Step 6: Evaluate clustering quality using Silhouette Score and Davies-Bouldin Index

# For KMeans
silhouette_kmeans = silhouette_score(X_scaled, kmeans_labels)
dbi_kmeans = davies_bouldin_score(X_scaled, kmeans_labels)
print(f"\nMiniBatch KMeans Silhouette Score: {silhouette_kmeans}")
print(f"MiniBatch KMeans DBI: {dbi_kmeans}")

# For DBSCAN
if len(np.unique(dbscan_labels)) > 1:
    silhouette_dbscan = silhouette_score(X_scaled, dbscan_labels)
    dbi_dbscan = davies_bouldin_score(X_scaled, dbscan_labels)
    print(f"DBSCAN Silhouette Score: {silhouette_dbscan}")
    print(f"DBSCAN DBI: {dbi_dbscan}")
else:
    print("DBSCAN did not find more than one cluster")

# For Bayesian Gaussian Mixture
silhouette_bayes = silhouette_score(X_scaled, bayes_labels)
dbi_bayes = davies_bouldin_score(X_scaled, bayes_labels)
print(f"Bayesian Silhouette Score: {silhouette_bayes}")
print(f"Bayesian DBI: {dbi_bayes}")


MiniBatch KMeans Silhouette Score: 0.2379887269860108
MiniBatch KMeans DBI: 1.5091721553431416
DBSCAN Silhouette Score: 0.2013338570254
DBSCAN DBI: 1.8117748787177064
Bayesian Silhouette Score: 0.30583279326929125
Bayesian DBI: 1.3046953265286296


# How can we optimise the code above?

1. Batch Sizing for Mini KMeans Clustering
The size can potentially be reduced further if needed, current size is 10,000

2. DBSCAN Parameters
eps = 0.5 which might be a little too high for a dataset of our density. It might make sense to perform Principle Component Analaysis (PCA), a dimension reduction method, so that the dataset is easier to handle

3. Handling Cluster Labels for DBSCAN
Excluding noise where dbscan label is -1 would be important 

In [75]:
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.cluster import MiniBatchKMeans, DBSCAN
from sklearn.mixture import BayesianGaussianMixture
from sklearn.metrics import silhouette_score, davies_bouldin_score
import time


# Step 1: Standardize the data
X = training_df[['SHOT_DISTANCE', 'TIME_PASSED']] 
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Step 2: Apply PCA
pca = PCA(n_components=0.95)  # Retain 95% variance
X_reduced = pca.fit_transform(X_scaled)

# Step 3: Apply MiniBatch KMeans for large datasets
start_time = time.time()
kmeans = MiniBatchKMeans(n_clusters=5, batch_size=1000, random_state=42)  # Adjusted batch size
kmeans_labels = kmeans.fit_predict(X_reduced)
print(f"MiniBatch KMeans finished in {time.time() - start_time} seconds.")

# Step 4: Apply DBSCAN clustering
start_time = time.time()
dbscan = DBSCAN(eps=0.5, min_samples=5, n_jobs=-1)  
dbscan_labels = dbscan.fit_predict(X_reduced)
print(f"DBSCAN finished in {time.time() - start_time} seconds.")

# Step 5: Apply Bayesian Gaussian Mixture clustering
start_time = time.time()
bayes = BayesianGaussianMixture(n_components=5, random_state=42)
bayes_labels = bayes.fit_predict(X_reduced)
print(f"Bayesian Gaussian Mixture finished in {time.time() - start_time} seconds.")

# Add cluster labels to the original DataFrame for analysis
training_df['KMeans_Cluster'] = kmeans_labels
training_df['DBSCAN_Cluster'] = dbscan_labels
training_df['Bayes_Cluster'] = bayes_labels

# KMeans Clustering
kmeans_analysis = training_df.groupby('KMeans_Cluster')['SHOT_MADE'].mean()
print("KMeans Cluster - SHOT_MADE Success Rate:")
print(kmeans_analysis)

# DBSCAN Clustering
dbscan_analysis = training_df.groupby('DBSCAN_Cluster')['SHOT_MADE'].mean()
print("\nDBSCAN Cluster - SHOT_MADE Success Rate:")
print(dbscan_analysis)

# Bayesian Clustering
bayes_analysis = training_df.groupby('Bayes_Cluster')['SHOT_MADE'].mean()
print("\nBayes Cluster - SHOT_MADE Success Rate:")
print(bayes_analysis)

# Step 7: Evaluate clustering quality using Silhouette Score and Davies-Bouldin Index
# For KMeans
silhouette_kmeans = silhouette_score(X_reduced, kmeans_labels)
dbi_kmeans = davies_bouldin_score(X_reduced, kmeans_labels)
print(f"\nMiniBatch KMeans Silhouette Score: {silhouette_kmeans}")
print(f"MiniBatch KMeans DBI: {dbi_kmeans}")

# For DBSCAN
if len(np.unique(dbscan_labels)) > 1:
    silhouette_dbscan = silhouette_score(X_reduced, dbscan_labels)
    dbi_dbscan = davies_bouldin_score(X_reduced, dbscan_labels)
    print(f"DBSCAN Silhouette Score: {silhouette_dbscan}")
    print(f"DBSCAN DBI: {dbi_dbscan}")
else:
    print("DBSCAN did not find more than one cluster, so Silhouette and DBI scores are not applicable.")

# For Bayesian Gaussian Mixture
silhouette_bayes = silhouette_score(X_reduced, bayes_labels)
dbi_bayes = davies_bouldin_score(X_reduced, bayes_labels)
print(f"Bayesian Silhouette Score: {silhouette_bayes}")
print(f"Bayesian DBI: {dbi_bayes}")


  super()._check_params_vs_input(X, default_n_init=3)


MiniBatch KMeans finished in 0.04029488563537598 seconds.
DBSCAN finished in 231.86426496505737 seconds.




Bayesian Gaussian Mixture finished in 10.899379014968872 seconds.
KMeans Cluster - SHOT_MADE Success Rate:
KMeans_Cluster
0    0.376841
1    0.477883
2    0.366108
3    0.523372
4    0.515614
Name: SHOT_MADE, dtype: float64

DBSCAN Cluster - SHOT_MADE Success Rate:
DBSCAN_Cluster
-1    0.230769
 0    0.438659
 1    0.285714
Name: SHOT_MADE, dtype: float64

Bayes Cluster - SHOT_MADE Success Rate:
Bayes_Cluster
0    0.369043
1    0.596970
2    0.405498
3    0.381789
4    0.422607
Name: SHOT_MADE, dtype: float64

MiniBatch KMeans Silhouette Score: 0.42039755669877105
MiniBatch KMeans DBI: 0.835032030353854
DBSCAN Silhouette Score: 0.505051536020271
DBSCAN DBI: 0.7846465535065041
Bayesian Silhouette Score: 0.1754291348013303
Bayesian DBI: 1.5662926792312546
