In [1]:

import numpy as np
import pandas as pd
import tensorflow as tf
import matplotlib.pyplot as plt
import seaborn as sb

from sklearn.model_selection import train_test_split
from sklearn.feature_selection import f_regression, VarianceThreshold
from sklearn.metrics import mean_absolute_error, r2_score

from keras.layers import Normalization, Dense, Dropout
from keras.optimizers import Adam
from keras.models import Sequential
from keras import regularizers

from keras.callbacks import ModelCheckpoint
from keras import backend as K


## Import Data from .csv File

In [2]:

df = pd.read_csv(r'365finalproject.csv', encoding= 'unicode_escape')


In [3]:
df

Unnamed: 0,PlayerID,Player,Age,Games,Minutes,FGM,FGA,3PM,3PA,FTM,...,PF,PTS,FG%,3P%,FT%,MPG,PPG,RPG,APG,WS_25
0,duartch01,Chris Duarte,24,55,1541,268,621,94,255,90,...,95,720,0.432,0.369,0.804,28.0,13.1,4.1,2.1,0.0
1,marshna01,Naji Marshall,23,32,700,82,209,29,83,53,...,58,246,0.392,0.349,0.707,21.9,7.7,4.6,2.8,2.5
2,pritcpa01,Payton Pritchard,23,66,1268,184,418,102,248,40,...,104,510,0.440,0.411,0.889,19.2,7.7,2.4,1.8,0.6
3,windldy01,Dylan Windler,24,31,513,57,130,26,77,21,...,37,161,0.438,0.338,0.778,16.5,5.2,3.5,1.1,0.8
4,clarkbr01,Brandon Clarke,23,58,1300,296,479,23,64,85,...,100,700,0.618,0.359,0.759,22.4,12.1,5.9,1.4,6.3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1031,kunneke01,Kevin Kunnert,22,64,701,105,215,0,0,21,...,151,231,0.488,0.000,0.636,11.0,3.6,3.4,0.7,4.1
1032,mcneila01,Larry McNeill,23,54,516,106,220,0,0,99,...,76,311,0.482,0.000,0.707,9.6,5.8,2.7,0.4,4.3
1033,ratleed01,Ed Ratleff,23,81,1773,254,585,0,0,103,...,182,611,0.434,0.000,0.798,21.9,7.5,3.5,2.2,4.5
1034,wattssl01,Slick Watts,22,62,1424,198,510,0,0,100,...,207,496,0.388,0.000,0.645,23.0,8.0,2.9,5.7,3.3


In [4]:
## remove 3 pt statistics
df = df.drop(df.columns[[7, 8, 19]], axis=1)

In [5]:
df

Unnamed: 0,PlayerID,Player,Age,Games,Minutes,FGM,FGA,FTM,FTA,ORB,...,BLK,PF,PTS,FG%,FT%,MPG,PPG,RPG,APG,WS_25
0,duartch01,Chris Duarte,24,55,1541,268,621,90,112,41,...,10,95,720,0.432,0.804,28.0,13.1,4.1,2.1,0.0
1,marshna01,Naji Marshall,23,32,700,82,209,53,75,15,...,10,58,246,0.392,0.707,21.9,7.7,4.6,2.8,2.5
2,pritcpa01,Payton Pritchard,23,66,1268,184,418,40,45,33,...,9,104,510,0.440,0.889,19.2,7.7,2.4,1.8,0.6
3,windldy01,Dylan Windler,24,31,513,57,130,21,27,20,...,12,37,161,0.438,0.778,16.5,5.2,3.5,1.1,0.8
4,clarkbr01,Brandon Clarke,23,58,1300,296,479,85,112,92,...,48,100,700,0.618,0.759,22.4,12.1,5.9,1.4,6.3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1031,kunneke01,Kevin Kunnert,22,64,701,105,215,21,33,83,...,54,151,231,0.488,0.636,11.0,3.6,3.4,0.7,4.1
1032,mcneila01,Larry McNeill,23,54,516,106,220,99,140,60,...,6,76,311,0.482,0.707,9.6,5.8,2.7,0.4,4.3
1033,ratleed01,Ed Ratleff,23,81,1773,254,585,103,129,93,...,27,182,611,0.434,0.798,21.9,7.5,3.5,2.2,4.5
1034,wattssl01,Slick Watts,22,62,1424,198,510,100,155,72,...,13,207,496,0.388,0.645,23.0,8.0,2.9,5.7,3.3


In [6]:

names_ds = df[["PlayerID", "Player"]]
features_ds_raw = df.loc[:,'Age':'APG']
labels_ds = df[["WS_25"]]


In [7]:
names_ds

Unnamed: 0,PlayerID,Player
0,duartch01,Chris Duarte
1,marshna01,Naji Marshall
2,pritcpa01,Payton Pritchard
3,windldy01,Dylan Windler
4,clarkbr01,Brandon Clarke
...,...,...
1031,kunneke01,Kevin Kunnert
1032,mcneila01,Larry McNeill
1033,ratleed01,Ed Ratleff
1034,wattssl01,Slick Watts


In [8]:
features_ds_raw

Unnamed: 0,Age,Games,Minutes,FGM,FGA,FTM,FTA,ORB,TRB,AST,STL,BLK,PF,PTS,FG%,FT%,MPG,PPG,RPG,APG
0,24,55,1541,268,621,90,112,41,226,114,56,10,95,720,0.432,0.804,28.0,13.1,4.1,2.1
1,23,32,700,82,209,53,75,15,148,88,26,10,58,246,0.392,0.707,21.9,7.7,4.6,2.8
2,23,66,1268,184,418,40,45,33,158,120,37,9,104,510,0.440,0.889,19.2,7.7,2.4,1.8
3,24,31,513,57,130,21,27,20,107,33,19,12,37,161,0.438,0.778,16.5,5.2,3.5,1.1
4,23,58,1300,296,479,85,112,92,345,81,32,48,100,700,0.618,0.759,22.4,12.1,5.9,1.4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1031,22,64,701,105,215,21,33,83,217,43,10,54,151,231,0.488,0.636,11.0,3.6,3.4,0.7
1032,23,54,516,106,220,99,140,60,146,24,35,6,76,311,0.482,0.707,9.6,5.8,2.7,0.4
1033,23,81,1773,254,585,103,129,93,286,181,90,27,182,611,0.434,0.798,21.9,7.5,3.5,2.2
1034,22,62,1424,198,510,100,155,72,182,351,115,13,207,496,0.388,0.645,23.0,8.0,2.9,5.7


In [9]:
labels_ds

Unnamed: 0,WS_25
0,0.0
1,2.5
2,0.6
3,0.8
4,6.3
...,...
1031,4.1
1032,4.3
1033,4.5
1034,3.3


## Normalization of Data

In [10]:

normalization_layer = tf.keras.layers.Normalization(axis=-1)
normalization_layer.adapt(features_ds_raw)


In [11]:

features_ds_normalized = normalization_layer(features_ds_raw)


In [12]:
features_ds = np.array(features_ds_normalized)

In [13]:
features_ds

array([[ 1.587192  , -1.133756  ,  0.03314409, ...,  0.9669427 ,
         0.09333976,  0.11713215],
       [ 0.8004314 , -2.9718945 , -1.2081058 , ..., -0.21646269,
         0.3200158 ,  0.55341506],
       [ 0.8004314 , -0.2546463 , -0.36978245, ..., -0.21646269,
        -0.6773586 , -0.06984619],
       ...,
       [ 0.8004314 ,  0.94413966,  0.37555787, ..., -0.26029247,
        -0.17867143,  0.17945836],
       [ 0.01367071, -0.5743225 , -0.13953872, ..., -0.1507179 ,
        -0.4506826 ,  2.3608725 ],
       [ 0.8004314 , -0.33456534, -0.44653037, ..., -0.23837757,
         1.0000439 , -0.81775963]], dtype=float32)

## K Fold Validation with 20Folds

In [63]:
from sklearn.model_selection import KFold

In [112]:
kf = KFold(n_splits=10)

In [146]:
r2_list = []

In [147]:
best_r2 = -np.inf
best_weights = None

In [148]:
y = labels_ds

In [149]:
X = features_ds

In [150]:
for train_index, test_index in kf.split(X):
    x_train, x_test = X[train_index], X[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    
    model = Sequential()
    model.add(Dense(units=1024, activation='relu', kernel_regularizer=regularizers.l2(0.01)))
    model.add(Dropout(0.25))
    model.add(Dense(units=512, activation='relu', kernel_regularizer=regularizers.l2(0.01)))
    model.add(Dropout(0.25))
    model.add(Dense(units=256, activation='relu', kernel_regularizer=regularizers.l2(0.01)))
    model.add(Dropout(0.25))
    model.add(Dense(units=128, activation='relu', kernel_regularizer=regularizers.l2(0.01)))
    model.add(Dropout(0.25))
    
    model.add(Dense(units=1, activation='linear'))
    
    
    callback = tf.keras.callbacks.EarlyStopping (
        monitor = 'mean_absolute_error',
        patience = 5,
        restore_best_weights = True
    )

    model.compile(loss='mse', optimizer=Adam(learning_rate=0.001), metrics=['mean_absolute_error'])
    model.fit(x_train, y_train, epochs=50, callbacks = [callback], validation_data=(x_test, y_test))
    
    y_pred = model.predict(x_test)
    mae = mean_absolute_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    
    if r2 > best_r2:
        best_r2 = r2
        best_model = model
    
    mae_list.append(mae)
    r2_list.append(r2)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 1/50


Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50


Epoch 28/50
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50


Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50


Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50


In [151]:
print("done")

done


In [152]:
mean_mae = np.mean(mae_list)
print("Mean MAE:", mean_mae)
mean_r2 = np.mean(r2_list)
print("Mean R2:", mean_r2)

Mean MAE: 2.214293824821388
Mean R2: 0.22676953120263468


In [153]:
best_r2 = np.max(r2_list)
best_mae = np.min(mae_list)
print("Best MAE:", best_mae)
print("Best R2:", best_r2)

Best MAE: 1.8703169685143688
Best R2: 0.4099043608632037


In [154]:
best_model.save_weights('best_weights5.h5')

In [155]:
n_train1, n_test1, x_train1, x_test1, y_train1, y_test1 = train_test_split(names_ds, X, y, test_size=0.3)


In [156]:
model.load_weights('best_weights5.h5') ## 5th run was the best

### R2 on entire dataset

In [157]:
model.evaluate(X, y)



[8.373924255371094, 1.9169092178344727]

In [158]:
y_pred1 = model.predict(X)



In [159]:
r2_score1 = r2_score(y, y_pred1)
print(r2_score1)

0.4475711739233199


## Split Data into Training and Testing Datasets

In [None]:

##n_train, n_test, x_train, x_test, y_train, y_test = train_test_split(names_ds, features_ds, labels_ds, test_size=0.3)


In [None]:

##n_train


In [None]:
##x_train

In [None]:
##y_train

In [None]:
##n_test

In [None]:
##x_test

In [None]:
##y_test

In [None]:

NUM_FEATURES = x_train.shape[1]
print(NUM_FEATURES)


# Data Visualization 

## Histogram

In [None]:
sb.histplot(data=features_ds_raw)

# Boxplot

In [None]:
sb.boxplot(data=features_ds_raw)

## Correlation Matrix

In [None]:

C_mat = features_ds_raw.corr()
C_mat


In [None]:

fig = plt.figure(figsize = (15,15))
sb.heatmap(C_mat, vmax=.8, square=True)
plt.show()


## Filter Feature Selection Techniques

### Univariate Feature Selection

In [None]:

f_val, p_val = f_regression(features_ds, np.array(labels_ds).ravel())


In [None]:
f_val

In [None]:
p_val

In [None]:

feature_dictonary={'features':features_ds_raw.columns.tolist(), 'f_scores':f_val.tolist()}
features_f_score= pd.DataFrame(feature_dictonary).sort_values(by='f_scores', ascending=False).reset_index(drop=True)


In [None]:
features_f_score

### Variance Threshold

In [None]:

variance_filter = VarianceThreshold(threshold=0.25)
features_variance_selected = variance_filter.fit_transform(features_ds_raw)


In [None]:
pd.DataFrame(features_variance_selected)

## Principal Component Analysis

## K-Means Clustering

## Linear Regression Model

In [None]:
from sklearn.linear_model import LinearRegression

In [None]:
model = LinearRegression()

In [None]:
model.fit(x_train, y_train)

In [None]:
y_pred = model.predict(x_test)

In [None]:
mean_abs_error = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

In [None]:
print("Mean Absolute Error:", mean_abs_error)
print("R-squared:", r2)

## DNN Architecture

In [None]:
input_layer = tf.keras.layers.Input(shape=(NUM_FEATURES))

x = tf.keras.layers.Dense(units=1024, activation='relu', kernel_regularizer=regularizers.l2(0.01))(input_layer)
x = tf.keras.layers.Dropout(0.25)(x)
x = tf.keras.layers.Dense(units=512, activation='relu', kernel_regularizer=regularizers.l2(0.01))(x)
x = tf.keras.layers.Dropout(0.25)(x)
x = tf.keras.layers.Dense(units=256, activation='relu', kernel_regularizer=regularizers.l2(0.01))(x)
x = tf.keras.layers.Dropout(0.25)(x)
x = tf.keras.layers.Dense(units=128, activation='relu', kernel_regularizer=regularizers.l2(0.01))(x)
x = tf.keras.layers.Dropout(0.25)(x)

output_layer = tf.keras.layers.Dense(units=1, activation='linear')(x)

Model = tf.keras.Model(input_layer, output_layer)


In [None]:
Model.summary()

## Training Process

In [None]:

callback = tf.keras.callbacks.EarlyStopping (
    monitor = 'mean_absolute_error',
    patience = 5,
    restore_best_weights = True
)


In [None]:

Model.compile(loss='mean_absolute_error', optimizer=Adam(learning_rate=0.005), metrics=['mean_absolute_error'])


Model.load_weights("Model-02-004.h5")

In [None]:

history = Model.fit (
    x_train, 
    y_train,
    validation_data = (x_test, y_test),
    epochs = 50,
    callbacks = [callback]
)


In [None]:
Model.evaluate(x_test, y_test)

Model.save("FINAL-PROJECT-TRXX-TEXX.h5")

# Result Interpretation & Evaluation 

In [None]:

plt.plot(history.history['mean_absolute_error']) 
plt.plot(history.history['val_mean_absolute_error']) 
plt.legend(['Training Set', 'Testing Set'], loc='upper left')
plt.xlabel('Epoch')
plt.ylabel('MAE')
plt.title('Model MAE')
plt.show()


In [None]:
preds = Model.predict(x_test)

## Correlation Coefficient

In [None]:
r2 = r2_score(y_test, preds)

In [None]:
r2