# **NBA Ratings Classifier EDA**

In [10]:
import importlib

import mysklearn.mypytable as mypytable
import mysklearn.myutils as myutils
import mysklearn.myevaluation as myevaluation
import mysklearn.myclassifiers as myclassifiers


# Re-import the project modules so that edits made on disk are picked up
# without restarting the kernel. Every mysklearn module imported above is
# reloaded, including myclassifiers, which the scoring cells below rely on.
importlib.reload(myutils)
importlib.reload(mypytable)
importlib.reload(myclassifiers)
# kept last: as the final expression, its module repr is what the cell displays
importlib.reload(myevaluation)

<module 'mysklearn.myevaluation' from '/home/CPSC322-Final-Project/mysklearn/myevaluation.py'>

### Dataset: nba_ratings.csv (https://www.kaggle.com/datasets/willyiamyu/nba-2k-ratings-with-real-nba-stats?select=nba_rankings_2014-2020.csv)
> We will immediately remove several attributes from the dataset that are either too specific (e.g. player name, team name, season) 
or redundant (e.g. offensive rebounds, defensive rebounds, 3pt attempts, FG attempts)


> The dataset that we will be using will consist of the attributes below. We will derive all subsets of data from this dataset.
* GP (Games Played)
* AGE (Age)
* W (wins)
* L (losses)
* MIN (minutes played)
* PTS (points per game)
* FG% (field goal percentage)
* FGM (field goals made)
* 3P% (3-point percentage)
* FT% (free throw percentage)
* REB (rebounds)
* +/- (plus/minus metric)

> These attributes will be used to classify the "ranking" (NBA 2K Ratings) of each instance (player)


In [11]:
## read the ratings CSV into a MyPyTable and remember its original size
nba_data = mypytable.MyPyTable()
nba_data.load_from_file("nba_ratings.csv")
before_len = len(nba_data.data)

# cleaning step: drop players whose 3-point or field-goal percentage is zero
nba_data.drop_rows_with_zero_in_col(["3P%", "FG%"])
after_len = len(nba_data.data)
print(before_len - after_len, "rows were removed during cleaning")

# PLAYER, TEAM and SEASON are too instance-specific to serve as features.
# The player names are kept on the side; the other columns are discarded.
player_names = nba_data.pop_column("PLAYER")
for dropped_col in ("TEAM", "SEASON", ""):
    # "" is the column with an empty header (presumably a saved row index -- TODO confirm)
    nba_data.pop_column(dropped_col)

## TODO create a plot for distribution
# pull the class labels (y) out of the table and discretize each one
ratings = list(map(myutils.ratings_discretizer, nba_data.pop_column("rankings")))
print(ratings)

# reduced feature set used for the "simple dataset" comparison
simple_dataset = nba_data.get_subtable(["AGE", "W", "L", "MIN", "PTS"])



358 rows were removed during cleaning
['low', 'low', 'low', 'low', 'low', 'low', 'low', 'low', 'low', 'low', 'low', 'low', 'low', 'low', 'low', 'low', 'high', 'low', 'low', 'high', 'low', 'high', 'low', 'low', 'low', 'low', 'low', 'high', 'low', 'high', 'high', 'low', 'low', 'low', 'high', 'low', 'high', 'low', 'low', 'high', 'low', 'high', 'low', 'low', 'low', 'low', 'high', 'high', 'low', 'low', 'low', 'low', 'low', 'high', 'low', 'low', 'low', 'low', 'low', 'low', 'low', 'low', 'high', 'low', 'low', 'low', 'low', 'low', 'low', 'low', 'high', 'low', 'low', 'high', 'low', 'low', 'low', 'high', 'low', 'low', 'low', 'low', 'low', 'low', 'low', 'low', 'low', 'high', 'low', 'low', 'low', 'high', 'low', 'low', 'high', 'low', 'low', 'low', 'low', 'low', 'low', 'low', 'low', 'high', 'low', 'low', 'high', 'low', 'low', 'low', 'low', 'low', 'low', 'high', 'high', 'low', 'low', 'low', 'low', 'low', 'low', 'low', 'low', 'low', 'low', 'low', 'low', 'low', 'low', 'low', 'low', 'low', 'low', 'high'

## Naive Bayes Classification
Does the simple dataset classify better than the dataset with all of the attributes?


Simple Dataset: (AGE, WINS, LOSSES, MINS, PTS)


In [22]:
## get stratified 10-fold train / test splits for the full and simple datasets
full_train_folds, full_test_folds = myevaluation.stratified_kfold_cross_validation(
    nba_data.data, ratings, n_splits=10)
simple_train_folds, simple_test_folds = myevaluation.stratified_kfold_cross_validation(
    simple_dataset.data, ratings, n_splits=10)

## get scores
# Both datasets are scored with the SAME classifier (Naive Bayes) so that the
# only thing varying between the two result rows is the attribute set. The
# full dataset was previously scored with MyDummyClassifier, which made the
# "full vs. simple" comparison in this section meaningless.
full_acc, full_err, full_pres, full_recall, full_f1, full_matrix = \
  myutils.get_scores_from_folds(nba_data.data, ratings, full_train_folds, full_test_folds, myclassifiers.MyNaiveBayesClassifier())

simple_acc, simple_err, simple_pres, simple_recall, simple_f1, simple_matrix = \
  myutils.get_scores_from_folds(simple_dataset.data, ratings, simple_train_folds, simple_test_folds, myclassifiers.MyNaiveBayesClassifier())


In [30]:
# summary table: one row of cross-validation metrics per dataset
scores_table = mypytable.MyPyTable()
scores_table.column_names = ["Dataset", "Accuracy", "Error", "Precision", "Recall", "F1 score"]
scores_table.data = [
    ["Full dataset", full_acc, full_err, full_pres, full_recall, full_f1],
    ["Simple dataset", simple_acc, simple_err, simple_pres, simple_recall, simple_f1],
]

scores_table.print_data()



Dataset           Accuracy    Error    Precision    Recall    F1 score
--------------  ----------  -------  -----------  --------  ----------
Full dataset          0.83     0.17            0         0           0
Simple dataset        0.83     0.17            0         0           0
