In [1]:
import pandas as pd
import pickle
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

## 1. Load pickles of initial clustering and test dataset

In [2]:
# Load the two pickled objects from the initial clustering step; they are used
# further down via `scaler.transform(...)` and `kmeans.predict(...)`.
# Context managers make sure the file handles are closed right after loading.
# NOTE(security): pickle.load can execute arbitrary code - only load trusted files.
# NOTE(review): these are absolute, user-specific paths; consider a configurable
# project base directory so the notebook runs on other machines.
with open("/Users/marcus/Documents/Projects/Final/Final_Project_NBA_Classifier/pickle/scaler", "rb") as scaler_file:
    scaler = pickle.load(scaler_file)
with open("/Users/marcus/Documents/Projects/Final/Final_Project_NBA_Classifier/pickle/kmeans", "rb") as kmeans_file:
    kmeans = pickle.load(kmeans_file)

In [3]:
# Read the three input tables in one pass (order of paths == order of names):
#   test_position_model_1 - frame that is split into features / 'Pos' label below
#   test_clean_nba        - frame that receives the predicted cluster per row
#   clean_nba             - frame that is grouped by 'Tm' / 'Cluster_name' below
test_position_model_1, test_clean_nba, clean_nba = (
    pd.read_csv(csv_path)
    for csv_path in (
        "/Users/marcus/Documents/Projects/Final/Final_Project_NBA_Classifier/data/test_position_model_1.csv",
        "/Users/marcus/Documents/Projects/Final/Final_Project_NBA_Classifier/data/test_clean_nba.csv",
        "/Users/marcus/Documents/Projects/Final/Final_Project_NBA_Classifier/data/clean_nba_cluster.csv",
    )
)

In [16]:
# Inspect the column names of the test frame.
# NOTE(review): this cell's execution count (16) is higher than that of the cells
# after it, so it was run out of order. The 'Clusters' entry in the saved output is
# presumably the column written by the clustering cell further down; on a fresh
# top-to-bottom run it would likely not be listed here - TODO confirm.
test_position_model_1.columns

Index(['Pos', '2PA', 'FTA', 'ORB', 'DRB', 'AST', 'STL', 'BLK', 'TOV',
       '2P_3-10', '2P%_0-3', '3P%', '2P%_asst', '3P%_asst', 'Corner3%_3PA',
       'Clusters'],
      dtype='object')

## 2. Apply clustering to test-dataset (Season 17/18)

In [4]:
# X/y split: 'Pos' is the label column.
# 'Clusters' is dropped as well (when present) because this very cell writes it into
# test_position_model_1 further down; without this, re-running the cell would pass an
# extra column to the scaler. Assumes 'Clusters' is not one of the features the
# scaler was fitted on - TODO confirm.
X = test_position_model_1.drop(columns=["Pos", "Clusters"], errors="ignore")
y = test_position_model_1['Pos']

# Apply the previously loaded scaler and k-means model to the test features
X_scaled = scaler.transform(X)
clusters = kmeans.predict(X_scaled)

# Attach the predicted cluster to the test dataset (Season 17/18, per the section
# header) and to the full stats table of the same season.
# NOTE(review): the second assignment is positional, so it assumes test_clean_nba has
# the same rows in the same order as test_position_model_1 - TODO confirm.
test_position_model_1["Clusters"] = clusters
test_clean_nba['Cluster'] = clusters

# Translate numeric cluster ids into readable cluster names
cluster_names = {
    0: 'Just_Big',
    1: 'Def Driving Shot Creator',
    2: 'Spot Up Facilitator',
    3: 'Spot Up Wing',
    4: 'Facilitating/Midrange Big',
    5: 'Sharp-Shooting Big',
    6: 'Rotational',
    7: 'Midrange Shooter',
    8: 'Attacking Facilitator',
    9: 'Dirty-Work',
}
test_clean_nba['Cluster_name'] = test_clean_nba['Cluster'].map(cluster_names)

## 3. Distribution of clusters among top teams of 17/18 and 19/20

In [5]:
# Top 4 teams per season, looked up manually on Basketball Reference.
# Team abbreviations are matched against the 'Tm' column further down.
list_topteams_18 = [
    "GSW",
    "CLE",
    "BOS",
    "HOU",
]
list_topteams_19 = [
    "LAL",
    "MIA",
    "BOS",
    "DEN",
]

Top Teams 2018 (Clustering)

In [6]:
# Per team: number of players in each named cluster (rows without a cluster name
# are counted too, because of dropna=False), ordered by team / cluster name.
cluster_dist_18 = (
    test_clean_nba
    .groupby('Tm')["Cluster_name"]
    .value_counts(dropna=False)
    .sort_index(ascending=True)
)

# Turn the counts into a flat table. The rename presumably exists so that the value
# column is called "count" before the index levels become regular columns.
cluster_frame_18 = (
    cluster_dist_18
    .to_frame()
    .rename(columns={'Cluster_name': "count"})
    .reset_index()
)

# Keep only the rows of the selected top teams
top_teams_18 = cluster_frame_18.loc[cluster_frame_18['Tm'].isin(list_topteams_18)]

Top Teams 2019 (Clustering)

In [7]:
# Per team: number of players in each named cluster (rows without a cluster name
# are counted too, because of dropna=False), ordered by team / cluster name.
cluster_dist_19 = (
    clean_nba
    .groupby('Tm')["Cluster_name"]
    .value_counts(dropna=False)
    .sort_index(ascending=True)
)

# Turn the counts into a flat table. The rename presumably exists so that the value
# column is called "count" before the index levels become regular columns.
cluster_frame_19 = (
    cluster_dist_19
    .to_frame()
    .rename(columns={'Cluster_name': "count"})
    .reset_index()
)

# Keep only the rows of the selected top teams
top_teams_19 = cluster_frame_19.loc[cluster_frame_19['Tm'].isin(list_topteams_19)]

Pivot both tables and concatenate the top teams of both years

In [8]:
# Reshape each season to wide form: one row per team, one column per cluster name.
cluster_19 = top_teams_19.pivot(index='Tm', columns='Cluster_name', values='count')
cluster_18 = top_teams_18.pivot(index='Tm', columns='Cluster_name', values='count')

# Stack the list_topteams_19 rows above the list_topteams_18 rows; a cluster that
# a team does not have shows up as NaN and is replaced by 0.
# NOTE: a team present in both lists appears twice in the index, so index labels
# are not guaranteed to be unique.
top_teams_18_19 = pd.concat([cluster_19, cluster_18], axis=0).fillna(0)
top_teams_18_19

Unnamed: 0_level_0,Attacking Facilitator,Def Driving Shot Creator,Dirty-Work,Facilitating/Midrange Big,Just_Big,Midrange Shooter,Rotational,Sharp-Shooting Big,Spot Up Facilitator,Spot Up Wing
Tm,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
BOS,2.0,0.0,0.0,1.0,1.0,3.0,0.0,1.0,3.0,2.0
DEN,0.0,1.0,0.0,1.0,1.0,2.0,1.0,1.0,4.0,2.0
LAL,1.0,1.0,2.0,1.0,0.0,1.0,0.0,0.0,2.0,5.0
MIA,2.0,0.0,0.0,2.0,0.0,2.0,1.0,1.0,1.0,5.0
BOS,1.0,0.0,0.0,0.0,1.0,4.0,2.0,2.0,3.0,0.0
CLE,4.0,0.0,0.0,0.0,1.0,2.0,1.0,2.0,2.0,6.0
GSW,2.0,1.0,0.0,0.0,3.0,1.0,0.0,3.0,1.0,3.0
HOU,2.0,0.0,0.0,0.0,2.0,2.0,1.0,1.0,0.0,4.0


## 4. The Perfect Team

Take the average number of players per cluster across the top teams of both seasons (rounded to whole players) to estimate the ideal distribution of player types on a team.

In [11]:
# Average number of players per cluster over all top-team rows, rounded to whole players.
avg_per_cluster = top_teams_18_19.mean().round()

# Build the single-row "Perfect" roster directly from the averaged Series.
# The previous to_frame / reset_index / pivot(columns='index') chain only worked when
# the column axis of top_teams_18_19 had no name: if the name 'Cluster_name' survives
# the concat, reset_index() creates a 'Cluster_name' column instead of 'index' and the
# pivot raises KeyError. Setting the axis names explicitly gives the same layout
# (index 'Tm', column axis 'index', columns sorted) in both cases.
perfect_team = (
    avg_per_cluster
    .rename("Perfect")
    .to_frame()
    .T
    .rename_axis(index="Tm", columns="index")
    .sort_index(axis=1)
)
perfect_team

index,Attacking Facilitator,Def Driving Shot Creator,Dirty-Work,Facilitating/Midrange Big,Just_Big,Midrange Shooter,Rotational,Sharp-Shooting Big,Spot Up Facilitator,Spot Up Wing
Tm,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
Perfect,2.0,0.0,0.0,1.0,1.0,2.0,1.0,1.0,2.0,3.0


## 5. Comparing a test roster (DAL) to the perfect team

In [12]:
# Cluster counts of the DAL roster, taken from the same table as the top teams
is_dal = cluster_frame_19['Tm'].isin(['DAL'])
DAL_19 = cluster_frame_19[is_dal]

# Wide form: one row for DAL, one column per cluster name
DAL_cluster_19 = DAL_19.pivot(index='Tm', columns='Cluster_name', values='count')

# Put the "Perfect" row on top of the DAL row; clusters missing on either side count as 0
DAL_decision = pd.concat([perfect_team, DAL_cluster_19], axis=0).fillna(0)

In [13]:
# Display the "Perfect" row next to the DAL roster for a side-by-side comparison.
DAL_decision


Unnamed: 0_level_0,Attacking Facilitator,Def Driving Shot Creator,Dirty-Work,Facilitating/Midrange Big,Just_Big,Midrange Shooter,Rotational,Sharp-Shooting Big,Spot Up Facilitator,Spot Up Wing
Tm,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
Perfect,2.0,0.0,0.0,1.0,1.0,2.0,1.0,1.0,2.0,3.0
DAL,1.0,1.0,0.0,0.0,1.0,3.0,1.0,2.0,2.0,3.0
