In [1]:
import importlib
from tabulate import tabulate
import os

import myutils
importlib.reload(myutils)
import myutils as myutils

import dataPrepUtils
importlib.reload(dataPrepUtils)
import dataPrepUtils as dputils

import mypytable
importlib.reload(mypytable)
from mypytable import MyPyTable 

import myclassifiers
importlib.reload(myclassifiers)
from myclassifiers import MyKNeighborsClassifier, MySimpleLinearRegressor, MyNaiveBayesClassifier, MyDecisionTreeClassifier, MyRandomForrestClassifier

import myevaluation
importlib.reload(myevaluation)
import myevaluation as myevaluation

# Prepare data for fitting

In [2]:
# Load data into a mypytable
players_table = MyPyTable()
players_table.load_from_file("cleaned-data.txt")

# create X_train and y_train
# four attributes got: career_PER, career_PTS, career_G, draft_year, win shares
# will test using win shares and PER
per_col = players_table.get_column("career_PER")
pts_col = players_table.get_column("career_PTS")
games_col = players_table.get_column("career_G")
year_col = players_table.get_column("draft_year")
ws_col = players_table.get_column("career_WS")
# convert PER, PTS, and G to categorical
per_cutoffs = dputils.compute_equal_width_cutoffs(per_col, 5)
pts_cutoffs = dputils.compute_equal_width_cutoffs(pts_col, 5)
games_cutoffs = dputils.compute_equal_width_cutoffs(games_col, 10)
ws_cutoffs = dputils.compute_equal_width_cutoffs(ws_col, 5)
categorical_per = myutils.convert_to_categorical(per_col, per_cutoffs)
categorical_pts = myutils.convert_to_categorical(pts_col, pts_cutoffs)
categorical_games = myutils.convert_to_categorical(games_col, games_cutoffs)
categorical_ws = myutils.convert_to_categorical(ws_col, ws_cutoffs)
X = []
for i in range(len(categorical_per)):
    temp = []
    # will test using win shares and PER
    #temp.append(year_col[i])
    temp.append(categorical_pts[i])
    #temp.append(categorical_per[i])
    temp.append(categorical_games[i])
    temp.append(categorical_ws[i])
    X.append(temp)
salaries = players_table.get_column("avg_salary")
salaries_cutoffs = dputils.compute_equal_width_cutoffs(salaries, 10)
y = myutils.convert_to_categorical(salaries, salaries_cutoffs)
player_names = players_table.get_column("name")
index = 271
print(X[index], y[index])
print(salaries_cutoffs)



[2, 3, 1] 3
[70000.0, 1609676.1, 3149352.2, 4689028.3, 6228704.4, 7768380.5, 9308056.6, 10847732.7, 12387408.8, 13927084.9, 15466761.0]


# kNN, Naive Bayes, Decision Tree Classifiers and Predictive Accuracies

In [3]:
# perform stratified k-fold cross validation (k=10)

X_train_folds, X_test_folds = myevaluation.stratified_kfold_cross_validation(X, y, 10)
X_test = []
X_train = []
y_train = []
y_test = []
for fold in X_test_folds:
    x_temp = []
    y_temp = []
    for i in range(len(fold)):
        x_temp.append(X[fold[i]].copy())
        y_temp.append(y[fold[i]])
    X_test.append(x_temp)
    y_test.append(y_temp)
for fold in X_train_folds:
    x_temp = []
    y_temp = []
    for i in range(len(fold)):
        x_temp.append(X[fold[i]].copy())
        y_temp.append(y[fold[i]])
    X_train.append(x_temp)
    y_train.append(y_temp)
#print(X_test)
# declare classifiers
knn_classifier = MyKNeighborsClassifier()
n_bayes_classifier = MyNaiveBayesClassifier()
d_tree_classifier = MyDecisionTreeClassifier()

# test/train with each fold
knn_accuracies = []
dtree_accuracies = []
nbayes_accuracies = []
knn_predicted_total = []
dtree_predicted_total = []
nbayes_predicted_total = []

for k in range(len(X_test)):
    # fit classifiers using X_train and y_train
    knn_classifier.fit(X_train[k], y_train[k])
    d_tree_classifier.fit(X_train[k], y_train[k])
    X_train_copy = X_train[k].copy() # n_bayes needs a copy since it   modifies X_train
    n_bayes_classifier.fit(X_train_copy, y_train[k])
    #d_tree_classifier.print_decision_rules(attribute_names=["draft year", "career pts avg", "career per", "career games"],  class_name="salary ranking")
    # make predictions
    knn_predicted = knn_classifier.predict(X_test[k])
    knn_predicted_total.append(knn_predicted)
    dtree_predicted = d_tree_classifier.predict(X_test[k])
    dtree_predicted_total.append(dtree_predicted)
    nbayes_predicted = n_bayes_classifier.predict(X_test[k])
    nbayes_predicted_total.append(nbayes_predicted)
    count1 = 0
    count2 = 0
    count3 = 0
    for i in range(len(knn_predicted)):
        if knn_predicted[i] == y_test[k][i]:
            count1 += 1
        if dtree_predicted[i] == y_test[k][i]:
            count2 += 1
        if nbayes_predicted[i] == y_test[k][i]:
            count3 += 1
    knn_accuracies.append(count1 / len(knn_predicted))
    dtree_accuracies.append(count2 / len(dtree_predicted))
    nbayes_accuracies.append(count3 / len(nbayes_predicted))
knn_accuracy = sum(knn_accuracies) / len(knn_accuracies)
dtree_accuracy = sum(dtree_accuracies) / len(dtree_accuracies)
nbayes_accuracy = sum(nbayes_accuracies) / len(nbayes_accuracies)
# flatten total predicted lists
knn_predicted_total = [item for sublist in knn_predicted_total for item in sublist]
dtree_predicted_total = [item for sublist in dtree_predicted_total for item in sublist]
nbayes_predicted_total = [item for sublist in nbayes_predicted_total for item in sublist]
# print accuracies
print("=================================")
print("Predictive Accuracy")
print("=================================")
print("Stratified 10-Fold Cross Validation")
print("k Nearest Neighbors: accuracy = " + str(knn_accuracy) + ", error rate = " + str(1 - knn_accuracy))
print("Decision Tree: accuracy = " + str(dtree_accuracy) + ", error rate = " + str(1 - dtree_accuracy))
print("Naive Bayes: accuracy = " + str(nbayes_accuracy) + ", error rate = " + str(1 - nbayes_accuracy))
print()

Predictive Accuracy
Stratified 10-Fold Cross Validation
k Nearest Neighbors: accuracy = 0.12377564979480164, error rate = 0.8762243502051984
Decision Tree: accuracy = 0.7665526675786593, error rate = 0.23344733242134075
Naive Bayes: accuracy = 0.340109439124487, error rate = 0.659890560875513



# Confusion Matrices

In [4]:
# Confusion Matrices
categories = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
# flatten y_test
y_test_total = [item for sublist in y_test for item in sublist]
knn_matrix = myevaluation.confusion_matrix(y_test_total, knn_predicted_total,  categories)
d_tree_matrix = myevaluation.confusion_matrix(y_test_total, dtree_predicted_total, categories)
n_bayes_matrix = myevaluation.confusion_matrix(y_test_total, nbayes_predicted_total, categories)

print("=================================")
print("Confusion Matrices")
print("=================================")

# create matrix header
header = ["1", "2", "3", "4", "5", "6", "7", "8", "9", "10", "total", "Recognition %"]

# knn matrix
for i in range(len(knn_matrix)): 
    total = sum(knn_matrix[i])
    knn_matrix[i].append(total)
    recognition = 0
    if total != 0: 
        recognition = ((knn_matrix[i][i]/total) * 100)
    knn_matrix[i].append(recognition)
    knn_matrix[i].insert(0, (i + 1))
print("k-Nearest Neighbors (Stratified 10-Fold Cross Validation Results)")
print(tabulate(knn_matrix, headers= header))
print()

# decision tree matrix
for i in range(len(d_tree_matrix)): 
    total = sum(d_tree_matrix[i])
    d_tree_matrix[i].append(total)
    recognition = 0
    if total != 0: 
        recognition = ((d_tree_matrix[i][i]/total) * 100)
    d_tree_matrix[i].append(recognition)
    d_tree_matrix[i].insert(0, (i + 1))
print("Decision Tree (Stratified 10-Fold Cross Validation Results)")
print(tabulate(d_tree_matrix, headers= header))
print()

# naive bayes matrix
for i in range(len(n_bayes_matrix)): 
    total = sum(n_bayes_matrix[i])
    n_bayes_matrix[i].append(total)
    recognition = 0
    if total != 0: 
        recognition = ((n_bayes_matrix[i][i]/total) * 100)
    n_bayes_matrix[i].append(recognition)
    n_bayes_matrix[i].insert(0, (i + 1))
print("Naive Bayes (Stratified 10-Fold Cross Validation Results)")
print(tabulate(n_bayes_matrix, headers= header))

Confusion Matrices
k-Nearest Neighbors (Stratified 10-Fold Cross Validation Results)
      1    2    3    4    5    6    7    8    9    10    total    Recognition %
--  ---  ---  ---  ---  ---  ---  ---  ---  ---  ----  -------  ---------------
 1    9  126   44   53   22   46   11    5    0     0      316          2.8481
 2    1   42   13   26   14   14    5    1    0     0      116         36.2069
 3    5   66   23   21    6   21    9    3    0     0      154         14.9351
 4    0   29   10   16    7   12    1    3    0     0       78         20.5128
 5    2   42   15   24    9   18    5    1    0     0      116          7.75862
 6    4    6    5    5    5    6    2    3    0     0       36         16.6667
 7    1    5    9    3    1    5    1    1    0     0       26          3.84615
 8    2    2    1    1    1    1    1    0    0     0        9          0
 9    0    0    0    0    0    0    0    0    0     0        0          0
10    1    0    1    0    0    1    2    0    0     

# Test Random Forest Classifier

## Prepare Data

In [5]:
# prepare all 10 attributes for selection
per_col = players_table.get_column("career_PER")
pts_col = players_table.get_column("career_PTS")
games_col = players_table.get_column("career_G")
ws_col = players_table.get_column("career_WS")
ast_col = players_table.get_column("career_AST")
fg_col = players_table.get_column("career_FG%")
fg3_col = players_table.get_column("career_FG3%")
ft_col = players_table.get_column("career_FT%")
trb_col = players_table.get_column("career_TRB")
efg_col = players_table.get_column("career_eFG%")
# convert attributes to categorical values
per_cutoffs = dputils.compute_equal_width_cutoffs(per_col, 5)
pts_cutoffs = dputils.compute_equal_width_cutoffs(pts_col, 5)
games_cutoffs = dputils.compute_equal_width_cutoffs(games_col, 10)
ws_cutoffs = dputils.compute_equal_width_cutoffs(ws_col, 5)
categorical_per = myutils.convert_to_categorical(per_col, per_cutoffs)
categorical_pts = myutils.convert_to_categorical(pts_col, pts_cutoffs)
categorical_games = myutils.convert_to_categorical(games_col, games_cutoffs)
categorical_ws = myutils.convert_to_categorical(ws_col, ws_cutoffs)
ast_cutoffs = dputils.compute_equal_width_cutoffs(ast_col, 5)
fg_cutoffs = dputils.compute_equal_width_cutoffs(fg_col, 10)
fg3_cutoffs = dputils.compute_equal_width_cutoffs(fg3_col, 10)
players_table.remove_rows_with_missing_values()
ft_cutoffs = dputils.compute_equal_width_cutoffs(ft_col, 5)
trb_cutoffs = dputils.compute_equal_width_cutoffs(trb_col, 5)
efg_cutoffs = dputils.compute_equal_width_cutoffs(efg_col, 10)
categorical_ast = myutils.convert_to_categorical(ast_col, ast_cutoffs)
categorical_fg = myutils.convert_to_categorical(fg_col, fg_cutoffs)
categorical_fg3 = myutils.convert_to_categorical(fg3_col, fg3_cutoffs)
categorical_ft = myutils.convert_to_categorical(ft_col, ft_cutoffs)
categorical_trb = myutils.convert_to_categorical(trb_col, trb_cutoffs)
categorical_efg = myutils.convert_to_categorical(efg_col, efg_cutoffs)

## Prepare Train Sets From Columns

In [6]:
X = []
for i in range(len(categorical_per)):
    temp = []
    temp.append(categorical_pts[i])
    temp.append(categorical_per[i])
    temp.append(categorical_games[i])
    temp.append(categorical_ws[i])
    temp.append(categorical_ast[i])
    temp.append(categorical_fg[i])
    temp.append(categorical_fg3[i])
    temp.append(categorical_ft[i])
    temp.append(categorical_trb[i])
    temp.append(categorical_efg[i])
    X.append(temp)


# Fit and Predict Using Random Forrest Classifier

In [7]:
importlib.reload(myutils)
rf_classifier = MyRandomForrestClassifier()
rf_classifier.fit(X, y)
X_test = []
y_test = []
for test in rf_classifier.test_set:
    X_test.append(test[0])
    y_test.append(test[1])
predictions = rf_classifier.predict(X_test)

#Calculate accuracy 
correct = 0 
total = len(X)
for i in range(len(predictions)):
    if predictions[i] == y_test[i]:
        correct += 1
accuracy = correct/total
error = 1 - accuracy
print("=================================")
print("Predictive Accuracy:", accuracy)
print("=================================")
print("=================================")
print("Error Rate:", error)
print("=================================")


Predictive Accuracy: 0.06658878504672897
Error Rate: 0.9334112149532711
