In [33]:
import pandas as pd

import tensorflow as tf
from tensorflow import keras

!pip install -q -U keras-tuner
import keras_tuner as kt

from sklearn.feature_selection import RFECV 
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler,OneHotEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

from sklearn.cluster import KMeans
import plotly.express as px
import matplotlib.pyplot as plt



In [2]:
mvp = pd.read_csv("mvp_votings.csv")
nba_df = pd.read_csv("nba_info.csv")
mvp["award_share"]

0      0.658
1      0.613
2      0.414
3      0.261
4      0.120
       ...  
632    0.006
633    0.005
634    0.005
635    0.004
636    0.002
Name: award_share, Length: 637, dtype: float64

In [3]:
print(mvp.columns)
print(nba_df.columns)

Index(['Unnamed: 0', 'fga', 'fg3a', 'fta', 'per', 'ts_pct', 'usg_pct', 'bpm',
       'season', 'player', 'win_pct', 'votes_first', 'points_won',
       'points_max', 'award_share', 'g', 'mp_per_g', 'pts_per_g', 'trb_per_g',
       'ast_per_g', 'stl_per_g', 'blk_per_g', 'fg_pct', 'fg3_pct', 'ft_pct',
       'ws', 'ws_per_48'],
      dtype='object')
Index(['GAME_DATE_EST', 'GAME_ID', 'HOME_TEAM_ID', 'VISITOR_TEAM_ID', 'SEASON',
       'TEAM_ID_home', 'HOME_TEAM_WINS', 'TEAM_ID', 'CONFERENCE', 'TEAM',
       'PLAYER_ID', 'PLAYER_NAME', 'MIN', 'FGM', 'FGA', 'FG_PCT', 'FG3M',
       'FG3A', 'FG3_PCT', 'FTM', 'FTA', 'FT_PCT', 'REB', 'AST', 'STL', 'BLK',
       'TO', 'PTS'],
      dtype='object')


In [4]:
mvp_groupedII = mvp.groupby(["season", "player"]).mean()
mvp_groupedII.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 0,fga,fg3a,fta,per,ts_pct,usg_pct,bpm,win_pct,votes_first,...,pts_per_g,trb_per_g,ast_per_g,stl_per_g,blk_per_g,fg_pct,fg3_pct,ft_pct,ws,ws_per_48
season,player,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
1980-81,Adrian Dantley,11.0,20.3,0.1,9.8,24.3,0.622,28.4,4.6,0.341463,1.0,...,30.7,6.4,4.0,1.4,0.2,0.559,0.286,0.806,13.6,0.191
1980-81,Artis Gilmore,21.0,10.0,0.0,6.5,21.7,0.699,18.5,4.9,0.54878,0.0,...,17.9,10.1,2.1,0.6,2.4,0.67,0.0,0.705,12.3,0.208
1980-81,Bernard King,13.0,15.4,0.1,5.4,19.9,0.617,23.2,3.3,0.47561,0.0,...,21.9,6.8,3.5,0.9,0.4,0.588,0.333,0.703,9.1,0.15
1980-81,Bob Lanier,22.0,10.7,0.0,4.1,19.7,0.573,21.8,3.4,0.731707,0.0,...,14.3,6.2,2.7,1.1,1.2,0.525,1.0,0.751,6.8,0.185
1980-81,Bobby Jones,26.0,9.3,0.0,4.3,20.1,0.604,20.8,5.0,0.756098,0.0,...,13.5,5.4,2.8,1.2,0.9,0.539,0.0,0.813,9.2,0.217


In [5]:
nba_cleaning = nba_df.drop(columns=['HOME_TEAM_ID', 'VISITOR_TEAM_ID', 'TEAM_ID_home', 'HOME_TEAM_WINS', 'TEAM_ID', 'CONFERENCE', 'TEAM'])
nba_cleaning.head()

Unnamed: 0,GAME_DATE_EST,GAME_ID,SEASON,PLAYER_ID,PLAYER_NAME,MIN,FGM,FGA,FG_PCT,FG3M,...,FG3_PCT,FTM,FTA,FT_PCT,REB,AST,STL,BLK,TO,PTS
0,2022-12-22,22200477,2022-01-01,1629641,Romeo Langford,18.0,1.0,1.0,1.0,0.0,...,0.0,0.0,0.0,0.0,2.0,0.0,1.0,0.0,2.0,2.0
1,2022-12-22,22200477,2022-01-01,1631110,Jeremy Sochan,31.0,7.0,14.0,0.5,2.0,...,0.5,7.0,10.0,0.7,9.0,6.0,1.0,0.0,2.0,23.0
2,2022-12-22,22200477,2022-01-01,1627751,Jakob Poeltl,21.0,6.0,9.0,0.667,0.0,...,0.0,1.0,1.0,1.0,4.0,1.0,1.0,0.0,2.0,13.0
3,2022-12-22,22200477,2022-01-01,1630170,Devin Vassell,30.0,4.0,13.0,0.308,1.0,...,0.167,1.0,1.0,1.0,9.0,5.0,3.0,0.0,2.0,10.0
4,2022-12-22,22200477,2022-01-01,1630200,Tre Jones,27.0,7.0,12.0,0.583,1.0,...,0.333,4.0,4.0,1.0,2.0,3.0,0.0,0.0,2.0,19.0


In [6]:
mvp_groupedII["award_share"]

season   player           
1980-81  Adrian Dantley       0.022
         Artis Gilmore        0.006
         Bernard King         0.017
         Bob Lanier           0.006
         Bobby Jones          0.004
                              ...  
2017-18  LaMarcus Aldridge    0.006
         LeBron James         0.731
         Russell Westbrook    0.075
         Stephen Curry        0.005
         Victor Oladipo       0.002
Name: award_share, Length: 637, dtype: float64

In [7]:
mvp_cat = mvp_groupedII.dtypes[mvp_groupedII.dtypes == "object"].index.tolist()
mvp_groupedII.dtypes

Unnamed: 0     float64
fga            float64
fg3a           float64
fta            float64
per            float64
ts_pct         float64
usg_pct        float64
bpm            float64
win_pct        float64
votes_first    float64
points_won     float64
points_max     float64
award_share    float64
g              float64
mp_per_g       float64
pts_per_g      float64
trb_per_g      float64
ast_per_g      float64
stl_per_g      float64
blk_per_g      float64
fg_pct         float64
fg3_pct        float64
ft_pct         float64
ws             float64
ws_per_48      float64
dtype: object

In [8]:
def get_clusters(k, data) :
# Create a copy of the DataFrame
  data = data.copy()
# Initialize the K-Means model
  model = KMeans(n_clusters=k, random_state=0)
# Fit the model
  model.fit(data)
# Predict clusters
  predictions = model.predict(data)
# Create return DataFrame with predicted clusters
  data["class"] = model.labels_
  return data

In [9]:
two_clusters = get_clusters(6, mvp_groupedII)
two_clusters.head()



Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 0,fga,fg3a,fta,per,ts_pct,usg_pct,bpm,win_pct,votes_first,...,trb_per_g,ast_per_g,stl_per_g,blk_per_g,fg_pct,fg3_pct,ft_pct,ws,ws_per_48,class
season,player,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
1980-81,Adrian Dantley,11.0,20.3,0.1,9.8,24.3,0.622,28.4,4.6,0.341463,1.0,...,6.4,4.0,1.4,0.2,0.559,0.286,0.806,13.6,0.191,2
1980-81,Artis Gilmore,21.0,10.0,0.0,6.5,21.7,0.699,18.5,4.9,0.54878,0.0,...,10.1,2.1,0.6,2.4,0.67,0.0,0.705,12.3,0.208,2
1980-81,Bernard King,13.0,15.4,0.1,5.4,19.9,0.617,23.2,3.3,0.47561,0.0,...,6.8,3.5,0.9,0.4,0.588,0.333,0.703,9.1,0.15,2
1980-81,Bob Lanier,22.0,10.7,0.0,4.1,19.7,0.573,21.8,3.4,0.731707,0.0,...,6.2,2.7,1.1,1.2,0.525,1.0,0.751,6.8,0.185,2
1980-81,Bobby Jones,26.0,9.3,0.0,4.3,20.1,0.604,20.8,5.0,0.756098,0.0,...,5.4,2.8,1.2,0.9,0.539,0.0,0.813,9.2,0.217,2


In [10]:
# Verification of different clusters
two_clusters["class"].nunique()

6

In [11]:
!pip install hvplot


Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting hvplot
  Downloading hvplot-0.8.3-py2.py3-none-any.whl (3.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.2/3.2 MB[0m [31m14.3 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: hvplot
Successfully installed hvplot-0.8.3


In [17]:
import hvplot.pandas
!pip install -U bokeh

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting bokeh
  Downloading bokeh-3.1.0-py3-none-any.whl (8.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.3/8.3 MB[0m [31m45.6 MB/s[0m eta [36m0:00:00[0m
Collecting xyzservices>=2021.09.1
  Downloading xyzservices-2023.2.0-py3-none-any.whl (55 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m55.4/55.4 kB[0m [31m5.8 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: xyzservices, bokeh
  Attempting uninstall: bokeh
    Found existing installation: bokeh 2.4.3
    Uninstalling bokeh-2.4.3:
      Successfully uninstalled bokeh-2.4.3
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
panel 0.14.4 requires bokeh<2.5.0,>=2.4.0, but you have bokeh 3.1.0 which is incompatible.[0m[31m
[0mSuccessfully inst

In [18]:
%matplotlib inline
two_clusters.hvplot.scatter(x="fga", y="per", by="class")

In [14]:
two_clusters.dtypes

Unnamed: 0     float64
fga            float64
fg3a           float64
fta            float64
per            float64
ts_pct         float64
usg_pct        float64
bpm            float64
win_pct        float64
votes_first    float64
points_won     float64
points_max     float64
award_share    float64
g              float64
mp_per_g       float64
pts_per_g      float64
trb_per_g      float64
ast_per_g      float64
stl_per_g      float64
blk_per_g      float64
fg_pct         float64
fg3_pct        float64
ft_pct         float64
ws             float64
ws_per_48      float64
class            int32
dtype: object

## THE MODEL

In [19]:
# For demonstration purposes only

# Create a OneHotEncoder instance
enc = OneHotEncoder(sparse=False)

# Fit and transform the OneHotEncoder using the categorical variable list
encode_df = pd.DataFrame(enc.fit_transform(mvp_groupedII[mvp_cat]))

# Add the encoded variable names to the DataFrame
encode_df.columns = enc.get_feature_names_out(mvp_cat)
encode_df.head()



0
1
2
3
4


In [22]:
two_clusters.columns

Index(['Unnamed: 0', 'fga', 'fg3a', 'fta', 'per', 'ts_pct', 'usg_pct', 'bpm',
       'win_pct', 'votes_first', 'points_won', 'points_max', 'award_share',
       'g', 'mp_per_g', 'pts_per_g', 'trb_per_g', 'ast_per_g', 'stl_per_g',
       'blk_per_g', 'fg_pct', 'fg3_pct', 'ft_pct', 'ws', 'ws_per_48', 'class'],
      dtype='object')

In [29]:
research_df = two_clusters[["award_share", "fga", "per", "ws", "ws_per_48", "usg_pct","ast_per_g", "stl_per_g",
       "blk_per_g"]]
research_df

Unnamed: 0_level_0,Unnamed: 1_level_0,award_share,fga,per,ws,ws_per_48,usg_pct,ast_per_g,stl_per_g,blk_per_g
season,player,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1980-81,Adrian Dantley,0.022,20.3,24.3,13.6,0.191,28.4,4.0,1.4,0.2
1980-81,Artis Gilmore,0.006,10.0,21.7,12.3,0.208,18.5,2.1,0.6,2.4
1980-81,Bernard King,0.017,15.4,19.9,9.1,0.150,23.2,3.5,0.9,0.4
1980-81,Bob Lanier,0.006,10.7,19.7,6.8,0.185,21.8,2.7,1.1,1.2
1980-81,Bobby Jones,0.004,9.3,20.1,9.2,0.217,20.8,2.8,1.2,0.9
...,...,...,...,...,...,...,...,...,...,...
2017-18,LaMarcus Aldridge,0.006,18.0,25.0,10.9,0.209,29.1,2.0,0.6,1.2
2017-18,LeBron James,0.731,19.3,28.6,14.0,0.221,31.6,9.1,1.4,0.9
2017-18,Russell Westbrook,0.075,21.1,24.7,10.1,0.166,34.1,10.3,1.8,0.3
2017-18,Stephen Curry,0.005,16.9,28.2,9.1,0.267,31.0,6.1,1.6,0.2


In [35]:
# Remove loan status target from features data
y = research_df.award_share
X = research_df.drop(columns=["award_share"])

# Split training/test datasets
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

# Create a StandardScaler instance
scaler = StandardScaler()

# Fit the StandardScaler
X_scaler = scaler.fit(X_train)

# Scale the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [37]:
from sklearn import preprocessing
from sklearn import utils

In [42]:
enc = preprocessing.LabelEncoder()
encoded = enc.fit_transform(y_train)
print(utils.multiclass.type_of_target(research_df))

print(utils.multiclass.type_of_target(research_df.astype('int')))

print(utils.multiclass.type_of_target(research_df))


continuous-multioutput
multiclass-multioutput
continuous-multioutput


In [41]:
# Create a random forest classifier.
rf_model = RandomForestClassifier(n_estimators=128, random_state=78)

# Fitting the model
rf_model = rf_model.fit(X_train_scaled, y_train)

# Evaluate the model
y_pred = rf_model.predict(X_test_scaled)
print(f" Random forest predictive accuracy: {accuracy_score(y_test,y_pred):.3f}")

ValueError: ignored

In [43]:
# Define the model - deep neural net
number_input_features = len(X_train_scaled[0])
hidden_nodes_layer1 = 24
hidden_nodes_layer2 = 12

nn = tf.keras.models.Sequential()

# First hidden layer
nn.add(
    tf.keras.layers.Dense(units=hidden_nodes_layer1, input_dim=number_input_features, activation="relu")
)

# Second hidden layer
nn.add(tf.keras.layers.Dense(units=hidden_nodes_layer2, activation="relu"))


# Output layer
nn.add(tf.keras.layers.Dense(units=1, activation="sigmoid"))

# Compile the Sequential model together and customize metrics
nn.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])

# Train the model
fit_model = nn.fit(X_train_scaled, y_train, epochs=50)

# Evaluate the model using the test data
model_loss, model_accuracy = nn.evaluate(X_test_scaled,y_test,verbose=2)
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
5/5 - 0s - loss: 0.3053 - accuracy: 0.0000e+00 - 191ms/epoch - 38ms/step
Loss: 0.30528339743614197, Accuracy: 0.0
