# **NBA Ratings Classifier EDA**

In [9]:
import importlib

import mysklearn.mypytable as mypytable
import mysklearn.myutils as myutils
import mysklearn.myevaluation as myevaluation
import mysklearn.myclassifiers as myclassifiers


importlib.reload(myutils)
importlib.reload(mypytable)
importlib.reload(myevaluation)

<module 'mysklearn.myevaluation' from '/home/CPSC322-Final-Project/mysklearn/myevaluation.py'>

### Dataset: nba_ratings.csv (https://www.kaggle.com/datasets/willyiamyu/nba-2k-ratings-with-real-nba-stats?select=nba_rankings_2014-2020.csv)
> We will immediately remote several attributes from the dataset that will are either too specific (ex. player name, team name, season) 
or redundant (ex. offensive rebounds, defensive rebounds, 3pt attempts, FG attempts)


> The dataset that we will be using will consist of the attributes below. We will derive all subsets of data from this dataset.
* AGE (Age)
* MIN (minutes played)
* W% (W / GP)
* PTS (points per game)
* FG % (field goal percentage)
* FGM (field goals made)
* 3P% (3 pointers made)
* FT% (free throw percentage)
* REB (rebounds)
* +/- (plus/minus metric)

> These attributes will be used to classify the "ranking" (NBA 2K Ratings) of each instance (player)


In [10]:
## load data into a MyPyTable
nba_data = mypytable.MyPyTable()
nba_data.load_from_file("nba_ratings.csv")
before_len = len(nba_data.data)


# clean table by removing players with 0% 3 point percentage or FG percentage

nba_data.drop_rows_with_zero_in_col(["3P%", "FG%"])
after_len = len(nba_data.data)
print(before_len - after_len, "rows were removed during cleaning")


# get player names col as keys for the instances
player_names = nba_data.pop_column("PLAYER") 



## TODO create a plot for distribution
# get ratings (y) 
ratings = nba_data.pop_column("rankings")
ratings = [myutils.ratings_discretizer(x) for x in ratings]

nba_data = nba_data.get_subtable(["AGE", "MIN", "W", "GP", "PTS", "FG%", "REB"])
simple_dataset = nba_data.get_subtable(["AGE", "W", "MIN", "PTS"])

# discretize continuous data (TODO finish)
nba_data.discretize_col(["AGE"], myutils.age_discretizer)
nba_data.print_data()



358 rows were removed during cleaning
AGE      MIN    W    GP    PTS    FG%    REB
-----  -----  ---  ----  -----  -----  -----
young   32.5   30    62   14.4   43.7    7.7
young   24.5   42    66    9.5   41.4    2.4
young   15.8   37    55    6.3   46.8    1.8
young   10.2    3    11    2.9   42.9    0.9
young   11.2    9    33    3     38      1.4
old     30.2   39    67   11.9   45      6.8
old     21.1    7    18    4.3   29.1    4.8
young   26.6   21    66   15     41.8    4.3
young    9.9    1    14    4.2   50      1.9
young   18.4   48    64    5.5   41.2    1.9
young   17.6   18    55    8     55.5    5.8
young   14.8   11    40    4.6   42.7    2.8
young    6.9   10    17    2     41.4    2.8
young   17.6    5    37    4.6   35.6    2.1
young   12.1    2    24    6.5   48.1    1.2
young    8.8   11    18    3.2   42.6    0.9
young   33     21    57   17.7   53.3   15.2
old     19.9   10    21    4.6   43.2    3.7
young   12.4    3     7    2.9   27.6    3.9
young   34.4   15

## Naive Bayes Classification
Does the simple dataset classify better than the dataset with all of the attributes?


Simple Dataset: (AGE, WINS, LOSSES, MINS, PTS)


In [11]:
## get train / test folds (full and simple datasets)
full_train_folds, full_test_folds = myevaluation.stratified_kfold_cross_validation(nba_data.data, ratings, n_splits=10)
simple_train_folds, simple_test_folds = myevaluation.stratified_kfold_cross_validation(simple_dataset.data, ratings, n_splits=10)

## get scores
full_acc, full_err, full_pres, full_recall, full_f1, full_matrix = \
  myutils.get_scores_from_folds(nba_data.data, ratings, full_train_folds, full_test_folds, myclassifiers.MyDummyClassifier())

simple_acc, simple_err, simple_pres, simple_recall, simple_f1, simple_matrix = \
  myutils.get_scores_from_folds(simple_dataset.data, ratings, simple_train_folds, simple_test_folds, myclassifiers.MyNaiveBayesClassifier())


In [12]:
scores_table = mypytable.MyPyTable()
scores_table.column_names = ["Dataset", "Accuracy", "Error", "Precision", "Recall", "F1 score"]
scores_table.data = []
scores_table.data.append(["Full dataset", full_acc, full_err, full_pres, full_recall, full_f1])
scores_table.data.append(["Simple dataset", simple_acc, simple_err, simple_pres, simple_recall, simple_f1])

scores_table.print_data()



Dataset           Accuracy    Error    Precision    Recall    F1 score
--------------  ----------  -------  -----------  --------  ----------
Full dataset          0.68     0.32            0         0           0
Simple dataset        0.02     0.98            0         0           0
