In [76]:
import importlib

import mysklearn.mypytable as mypytable
import mysklearn.myutils as myutils
import mysklearn.myevaluation as myevaluation
import mysklearn.myclassifiers as myclassifiers
from tabulate import tabulate

importlib.reload(myutils)
importlib.reload(mypytable)
importlib.reload(myevaluation)
importlib.reload(myclassifiers)

<module 'mysklearn.myclassifiers' from '/home/CPSC322-Final-Project/mysklearn/myclassifiers.py'>

# **NBA Ratings Classifier EDA**

In this EDA we will be looking at a datasets which contains instances of NBA players with their stats and NBA 2K (videogame) ratings


We will use the attributes of this dataset to classify instances by their card type
>In NBA 2K, each player is given a card of a two general types based directly on their rating
>* If a player has a rating of 79 and below they are given a **basic** card represented by a Gold, Silver, or Bronze theme
>* If a player has a rating 80 or above they are given a **special** card represented by a Emerald, Sapphire, Ruby, Amythest, and Diamond themes

><img src="images/goldcard.png" width="200"> <img src="images/drose.png" width="200"> <img src="images/chrisPaul.png" width="200"><br>
>* The first card above is an example of a **basic** card (79 rated "gold")<br>
>* The following two cards are both **special** cards (80 rated "emerald" and 92 rated "amythest")   

## Data Preperation
### Dataset: nba_ratings.csv (https://www.kaggle.com/datasets/willyiamyu/nba-2k-ratings-with-real-nba-stats?select=nba_rankings_2014-2020.csv)
> We will immediately remote several attributes from the dataset that will are either too specific (ex. player name, team name, season) 
or redundant (ex. offensive rebounds, defensive rebounds, 3pt attempts, FG attempts)

>Also, to create a more equal distribution of class labels, we are are only interested in the basic type cards that are over 74 rated. All players with ratings less than 75 will be removed 

> The dataset that we will be using will consist of the attributes below. We will derive all subsets of data from this dataset.
* AGE (Age)
* MIN (minutes played)
* W% (W / GP)
* PTS (points per game)
* FG % (field goal percentage)
* 3PM (3 pt attemps)
* FT% (free throw percentage)
* REB (rebounds)

> These attributes will be used to classify the "ranking" (NBA 2K Ratings) of each instance (player)


In [80]:
import numpy as np


## load data into a MyPyTable
nba_data = mypytable.MyPyTable()
nba_data.load_from_file("nba_ratings.csv")
before_len = len(nba_data.data)

# clean table by removing players with 0% 3 point percentage or FG percentage or below 73 rated

rows_to_drop = []
for i, row in enumerate(nba_data.data):
  if nba_data.data[i][-1] < 75:
    rows_to_drop.append(i)
nba_data.drop_rows(rows_to_drop)

nba_data.drop_rows_with_zero_in_col(["3P%", "FG%"])
after_len = len(nba_data.data)
print(before_len - after_len, "/", before_len, "rows were removed during cleaning")


# get player names col as keys for the instances
#player_names = nba_data.pop_column("PLAYER") 


## TODO create a plot for distribution
# get ratings col (y) 
ratings = nba_data.get_column("rankings")
ratings = [myutils.ratings_discretizer(x) for x in ratings]

nba_data = nba_data.get_subtable(["PLAYER", "AGE", "MIN", "W", "GP", "PTS", "FG%", "3PM", "FT%", "REB"])

# change GP and W attributes to W% (win percentage) 
gp = nba_data.pop_column("GP")

win_index = nba_data.column_names.index("W")
nba_data.column_names[win_index] = "W%"

for i, row in enumerate(nba_data.data):
  nba_data.data[i][win_index] = round(nba_data.data[i][win_index] / gp[i], 2)



nba_data.discretize_col(["AGE"], myutils.age_discretizer)
nba_data.discretize_col(["FT%"], myutils.free_throw_discretizer)
nba_data.discretize_col(["MIN"], myutils.mins_played_discretizer)
nba_data.discretize_col(["PTS"], myutils.pts_discretizer)
nba_data.discretize_col(["3PM"], myutils.threePTRS_made_discretizer)
nba_data.discretize_col(["REB"], myutils.reb_discretizer)
nba_data.discretize_col(["FG%"], myutils.fg_discretizer)
nba_data.discretize_col(["W%"], myutils.win_percent_discretizer)

nba_data.column_names.append("card type")
nba_data.data = [nba_data.data[i] + [ratings[i]] for i in range(len(ratings)) ] 



simple_dataset = nba_data.get_subtable(["FG%", "MIN", "PTS", "3PM"])


sample_table = []
for i in range(10):
  sample_table.append(nba_data.data[np.random.randint(0, len(nba_data.data))])

sample_table = mypytable.MyPyTable(nba_data.column_names, sample_table)
print("Sample instances of discretized data")
sample_table.print_data()


1316 / 2412 rows were removed during cleaning
Sample instances of discretized data
PLAYER           AGE    MIN       W%       PTS     FG%          3PM          FT%     REB             card type
---------------  -----  --------  -------  ------  -----------  -----------  ------  --------------  -----------
Kawhi Leonard    young  high      average  top     efficient    low volume   decent  average volume  special
Gorgui Dieng     young  moderate  average  bottom  efficient    low volume   decent  average volume  basic
Kevin Love       young  high      average  mid     inefficient  high volume  decent  high volume     special
Justise Winslow  young  high      low      bottom  inefficient  low volume   poor    average volume  basic
Marco Belinelli  old    moderate  low      mid     inefficient  low volume   great   low volume      basic
Al Jefferson     old    high      low      mid     efficient    low volume   poor    average volume  special
Larry Nance Jr.  young  moderate  low      bo

## Naive Bayes Classification
Does the simple dataset classify better than the dataset with all of the attributes?


Simple Dataset: (AGE, WINS, LOSSES, MINS, PTS)


In [78]:
## get train / test folds (full and simple datasets)
full_train_folds, full_test_folds = myevaluation.stratified_kfold_cross_validation(nba_data.data, ratings, n_splits=10)
simple_train_folds, simple_test_folds = myevaluation.stratified_kfold_cross_validation(simple_dataset.data, ratings, n_splits=10)

## get scores
full_acc, full_err, full_pres, full_recall, full_f1, full_matrix = \
  myutils.get_scores_from_folds(nba_data.data, ratings, full_train_folds, full_test_folds, myclassifiers.MyDummyClassifier())

simple_acc, simple_err, simple_pres, simple_recall, simple_f1, simple_matrix = \
  myutils.get_scores_from_folds(simple_dataset.data, ratings, simple_train_folds, simple_test_folds, myclassifiers.MyNaiveBayesClassifier())


In [79]:
scores_table = mypytable.MyPyTable()
scores_table.column_names = ["Dataset", "Accuracy", "Error", "Precision", "Recall", "F1 score"]
scores_table.data = []
scores_table.data.append(["Full dataset", full_acc, full_err, full_pres, full_recall, full_f1])
scores_table.data.append(["Simple dataset", simple_acc, simple_err, simple_pres, simple_recall, simple_f1])

scores_table.print_data()









Dataset           Accuracy    Error    Precision    Recall    F1 score
--------------  ----------  -------  -----------  --------  ----------
Full dataset          0.62     0.38            0         0           0
Simple dataset        0.38     0.62            0         0           0
