In [57]:
import importlib
from tabulate import tabulate
import os

import myutils
importlib.reload(myutils)
import myutils as myutils

import dataPrepUtils
importlib.reload(dataPrepUtils)
import dataPrepUtils as dputils

import mypytable
importlib.reload(mypytable)
from mypytable import MyPyTable 

import myclassifiers
importlib.reload(myclassifiers)
from myclassifiers import MyKNeighborsClassifier, MySimpleLinearRegressor, MyNaiveBayesClassifier, MyDecisionTreeClassifier

import myevaluation
importlib.reload(myevaluation)
import myevaluation as myevaluation

# Prepare data for fitting

In [58]:
# Load data into a mypytable
players_table = MyPyTable()
players_table.load_from_file("cleaned-data.txt")

# create X_train and y_train
# four attributes used: career_PER, career_PTS, career_G, draft_year
per_col = players_table.get_column("career_PER")
pts_col = players_table.get_column("career_PTS")
games_col = players_table.get_column("career_G")
year_col = players_table.get_column("draft_year")
# convert PER, PTS, and G to categorical
per_cutoffs = dputils.compute_equal_width_cutoffs(per_col, 5)
pts_cutoffs = dputils.compute_equal_width_cutoffs(pts_col, 5)
games_cutoffs = dputils.compute_equal_width_cutoffs(games_col, 10)
categorical_per = myutils.convert_to_categorical(per_col, per_cutoffs)
categorical_pts = myutils.convert_to_categorical(pts_col, pts_cutoffs)
categorical_games = myutils.convert_to_categorical(games_col, games_cutoffs)
X = []
for i in range(len(categorical_per)):
    temp = []
    temp.append(year_col[i])
    temp.append(categorical_pts[i])
    temp.append(categorical_per[i])
    temp.append(categorical_games[i])
    X.append(temp)
salaries = players_table.get_column("avg_salary")
salaries_cutoffs = dputils.compute_equal_width_cutoffs(salaries, 10)
y = myutils.convert_to_categorical(salaries, salaries_cutoffs)
player_names = players_table.get_column("name")
index = 271
print(X[index], y[index])
print(salaries_cutoffs)



[2009.0, 4, 5, 5] 4
[65000.0, 2631960.17, 5198920.33, 7765880.5, 10332840.67, 12899800.83, 15466761.0]


# kNN, Naive Bayes, Decision Tree Classifiers and Predictive Accuracies

In [59]:
# perform stratified k-fold cross validation (k=10)

X_train_folds, X_test_folds = myevaluation.stratified_kfold_cross_validation(X, y, 10)
X_test = []
X_train = []
y_train = []
y_test = []
for fold in X_test_folds:
    x_temp = []
    y_temp = []
    for i in range(len(fold)):
        x_temp.append(X[fold[i]].copy())
        y_temp.append(y[fold[i]])
    X_test.append(x_temp)
    y_test.append(y_temp)
for fold in X_train_folds:
    x_temp = []
    y_temp = []
    for i in range(len(fold)):
        x_temp.append(X[fold[i]].copy())
        y_temp.append(y[fold[i]])
    X_train.append(x_temp)
    y_train.append(y_temp)
#print(X_test)
# declare classifiers
knn_classifier = MyKNeighborsClassifier()
n_bayes_classifier = MyNaiveBayesClassifier()
d_tree_classifier = MyDecisionTreeClassifier()

# test/train with each fold
knn_accuracies = []
dtree_accuracies = []
nbayes_accuracies = []
knn_predicted_total = []
dtree_predicted_total = []
nbayes_predicted_total = []

for k in range(len(X_test)):
    # fit classifiers using X_train and y_train
    knn_classifier.fit(X_train[k], y_train[k])
    d_tree_classifier.fit(X_train[k], y_train[k])
    X_train_copy = X_train[k].copy() # n_bayes needs a copy since it   modifies X_train
    n_bayes_classifier.fit(X_train_copy, y_train[k])
    #d_tree_classifier.print_decision_rules(attribute_names=["draft year", "career pts avg", "career per", "career games"],  class_name="salary ranking")
    # make predictions
    knn_predicted = knn_classifier.predict(X_test[k])
    knn_predicted_total.append(knn_predicted)
    dtree_predicted = d_tree_classifier.predict(X_test[k])
    dtree_predicted_total.append(dtree_predicted)
    nbayes_predicted = n_bayes_classifier.predict(X_test[k])
    nbayes_predicted_total.append(nbayes_predicted)
    count1 = 0
    count2 = 0
    count3 = 0
    for i in range(len(knn_predicted)):
        if knn_predicted[i] == y_test[k][i]:
            count1 += 1
        if dtree_predicted[i] == y_test[k][i]:
            count2 += 1
        if nbayes_predicted[i] == y_test[k][i]:
            count3 += 1
    knn_accuracies.append(count1 / len(knn_predicted))
    dtree_accuracies.append(count2 / len(dtree_predicted))
    nbayes_accuracies.append(count3 / len(nbayes_predicted))
knn_accuracy = sum(knn_accuracies) / len(knn_accuracies)
dtree_accuracy = sum(dtree_accuracies) / len(dtree_accuracies)
nbayes_accuracy = sum(nbayes_accuracies) / len(nbayes_accuracies)
# flatten total predicted lists
knn_predicted_total = [item for sublist in knn_predicted_total for item in sublist]
dtree_predicted_total = [item for sublist in dtree_predicted_total for item in sublist]
nbayes_predicted_total = [item for sublist in nbayes_predicted_total for item in sublist]
# print accuracies
print("=================================")
print("Predictive Accuracy")
print("=================================")
print("Stratified 10-Fold Cross Validation")
print("k Nearest Neighbors: accuracy = " + str(knn_accuracy) + ", error rate = " + str(1 - knn_accuracy))
print("Decision Tree: accuracy = " + str(dtree_accuracy) + ", error rate = " + str(1 - dtree_accuracy))
print("Naive Bayes: accuracy = " + str(nbayes_accuracy) + ", error rate = " + str(1 - nbayes_accuracy))
print()

ValueError: '1970.0' is not in list

# Confusion Matrices

In [56]:
# Confusion Matrices
categories = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
# flatten y_test
y_test_total = [item for sublist in y_test for item in sublist]
knn_matrix = myevaluation.confusion_matrix(knn_predicted, y_test_total, categories)
d_tree_matrix = myevaluation.confusion_matrix(dtree_predicted, y_test_total, categories)
n_bayes_matrix = myevaluation.confusion_matrix(nbayes_predicted, y_test_total, categories)

print("=================================")
print("Confusion Matrices")
print("=================================")

# create matrix header
header = ["1", "2", "3", "4", "5", "6", "7", "8", "9", "10", "total", "Recognition %"]

# knn matrix
for i in range(len(knn_matrix)):
    knn_matrix[i].append(sum(knn_matrix[i]))
total = 0
for row in knn_matrix:
    total += row[10]
for i in range(len(knn_matrix)):
    knn_matrix[i].append(knn_matrix[i][10]/total*100)
    knn_matrix[i].insert(0, header[i])
print("k-Nearest Neighbors (Stratified 10-Fold Cross Validation Results)")
print(tabulate(knn_matrix, headers= header))
print()

# decision tree matrix
for i in range(len(d_tree_matrix)):
    d_tree_matrix[i].append(sum(d_tree_matrix[i]))
total = 0
for row in d_tree_matrix:
    total += row[10]
for i in range(len(d_tree_matrix)):
    d_tree_matrix[i].append(d_tree_matrix[i][10]/total*100)
    d_tree_matrix[i].insert(0, header[i])
print("Decision Tree (Stratified 10-Fold Cross Validation Results)")
print(tabulate(d_tree_matrix, headers= header))
print()

# naive bayes matrix
for i in range(len(n_bayes_matrix)):
    n_bayes_matrix[i].append(sum(n_bayes_matrix[i]))
total = 0
for row in n_bayes_matrix:
    total += row[10]
for i in range(len(n_bayes_matrix)):
    n_bayes_matrix[i].append(n_bayes_matrix[i][10]/total*100)
    n_bayes_matrix[i].insert(0, header[i])
print("Naive Bayes (Stratified 10-Fold Cross Validation Results)")
print(tabulate(n_bayes_matrix, headers= header))

Confusion Matrices
k-Nearest Neighbors (Stratified 10-Fold Cross Validation Results)
      1    2    3    4    5    6    7    8    9    10    total    Recognition %
--  ---  ---  ---  ---  ---  ---  ---  ---  ---  ----  -------  ---------------
 1    0    0    0    0    0    0    0    0    0     0        0         0
 2   15   17   16   17   17    7   14   12    1     6      122        90.3704
 3    1    2    1    1    0    1    0    1    0     0        7         5.18519
 4    1    0    0    0    1    1    2    0    0     0        5         3.7037
 5    0    0    0    0    0    1    0    0    0     0        1         0.740741
 6    0    0    0    0    0    0    0    0    0     0        0         0
 7    0    0    0    0    0    0    0    0    0     0        0         0
 8    0    0    0    0    0    0    0    0    0     0        0         0
 9    0    0    0    0    0    0    0    0    0     0        0         0
10    0    0    0    0    0    0    0    0    0     0        0         0

D