# **NBA Ratings Classifier EDA**

In [39]:
import importlib

import mysklearn.mypytable as mypytable
import mysklearn.myutils as myutils
import mysklearn.myevaluation as myevaluation
import mysklearn.myclassifiers as myclassifiers


importlib.reload(myutils)
importlib.reload(mypytable)
importlib.reload(myevaluation)

<module 'mysklearn.myevaluation' from '/home/CPSC322-Final-Project/mysklearn/myevaluation.py'>

### Dataset: nba_ratings.csv (https://www.kaggle.com/datasets/willyiamyu/nba-2k-ratings-with-real-nba-stats?select=nba_rankings_2014-2020.csv)
> We will immediately remote several attributes from the dataset that will are either too specific (ex. player name, team name, season) 
or redundant (ex. offensive rebounds, defensive rebounds, 3pt attempts, FG attempts)


> The dataset that we will be using will consist of the attributes below. We will derive all subsets of data from this dataset.
* AGE (Age)
* MIN (minutes played)
* W% (W / GP)
* PTS (points per game)
* FG % (field goal percentage)
* FGM (field goals made)
* 3P% (3 pointers made)
* FT% (free throw percentage)
* REB (rebounds)
* +/- (plus/minus metric)

> These attributes will be used to classify the "ranking" (NBA 2K Ratings) of each instance (player)


In [40]:
## load data into a MyPyTable
nba_data = mypytable.MyPyTable()
nba_data.load_from_file("nba_ratings.csv")
before_len = len(nba_data.data)


# clean table by removing players with 0% 3 point percentage or FG percentage
nba_data.drop_rows_with_zero_in_col(["3P%", "FG%"])
after_len = len(nba_data.data)
print(before_len - after_len, "rows were removed during cleaning")


# get player names col as keys for the instances
#player_names = nba_data.pop_column("PLAYER") 


## TODO create a plot for distribution
# get ratings col (y) 
ratings = nba_data.pop_column("rankings")
ratings = [myutils.ratings_discretizer(x) for x in ratings]

nba_data = nba_data.get_subtable(["PLAYER", "AGE", "MIN", "W", "GP", "PTS", "FG%", "FT%", "REB"])
simple_dataset = nba_data.get_subtable(["AGE", "W", "MIN", "PTS"])

# change GP and W attributes to W% (win percentage) 
gp = nba_data.pop_column("GP")

win_index = nba_data.column_names.index("W")
nba_data.column_names[win_index] = "W%"

for i, row in enumerate(nba_data.data):
  nba_data.data[i][win_index] = round(nba_data.data[i][win_index] / gp[i], 2)



# discretize continuous data (TODO finish)
nba_data.discretize_col(["AGE"], myutils.age_discretizer)
nba_data.discretize_col(["FT%"], myutils.free_throw_discretizer)
nba_data.discretize_col(["MIN"], myutils.mins_played_discretizer)
nba_data.print_data()


print(myutils.get_frequencies(ratings))




358 rows were removed during cleaning
PLAYER                    AGE    MIN          W%    PTS    FG%  FT%       REB
------------------------  -----  ---------  ----  -----  -----  ------  -----
Aaron Gordon              young  high       0.48   14.4   43.7  bad       7.7
Aaron Holiday             young  moderate   0.64    9.5   41.4  great     2.4
Abdel Nader               young  moderate   0.67    6.3   46.8  decent    1.8
Adam Mokoka               young  low        0.27    2.9   42.9  bad       0.9
Admiral Schofield         young  low        0.27    3     38    bad       1.4
Al Horford                old    high       0.58   11.9   45    decent    6.8
Al-Farouq Aminu           old    moderate   0.39    4.3   29.1  bad       4.8
Alec Burks                young  high       0.32   15     41.8  great     4.3
Alen Smailagic            young  low        0.07    4.2   50    decent    1.9
Alex Caruso               young  moderate   0.75    5.5   41.2  decent    1.9
Alex Len                  

## Naive Bayes Classification
Does the simple dataset classify better than the dataset with all of the attributes?


Simple Dataset: (AGE, WINS, LOSSES, MINS, PTS)


In [41]:
## get train / test folds (full and simple datasets)
full_train_folds, full_test_folds = myevaluation.stratified_kfold_cross_validation(nba_data.data, ratings, n_splits=10)
simple_train_folds, simple_test_folds = myevaluation.stratified_kfold_cross_validation(simple_dataset.data, ratings, n_splits=10)

## get scores
full_acc, full_err, full_pres, full_recall, full_f1, full_matrix = \
  myutils.get_scores_from_folds(nba_data.data, ratings, full_train_folds, full_test_folds, myclassifiers.MyDummyClassifier())

simple_acc, simple_err, simple_pres, simple_recall, simple_f1, simple_matrix = \
  myutils.get_scores_from_folds(simple_dataset.data, ratings, simple_train_folds, simple_test_folds, myclassifiers.MyNaiveBayesClassifier())


In [42]:
scores_table = mypytable.MyPyTable()
scores_table.column_names = ["Dataset", "Accuracy", "Error", "Precision", "Recall", "F1 score"]
scores_table.data = []
scores_table.data.append(["Full dataset", full_acc, full_err, full_pres, full_recall, full_f1])
scores_table.data.append(["Simple dataset", simple_acc, simple_err, simple_pres, simple_recall, simple_f1])

scores_table.print_data()



Dataset           Accuracy    Error    Precision    Recall    F1 score
--------------  ----------  -------  -----------  --------  ----------
Full dataset          0.86     0.14            0         0           0
Simple dataset        0.86     0.14            0         0           0
