In [236]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pymysql
import sklearn
import tensorflow as tf
from sklearn.cluster import KMeans
from sqlalchemy import create_engine
from tensorflow.contrib.layers import fully_connected

In [3]:
class dataReader(object):
    """Serve consecutive row-slices (mini-batches) of a 2-D array.

    Batches are handed out in order. Once the slice that reaches the last
    row has been returned, the reader rewinds so the following call starts
    again from the first row.
    """

    def __init__(self, X, batch_size=1):
        n_rows = X.shape[0]
        self.X = X
        self.num_examples = n_rows
        self.batch_size = batch_size
        self.batch_number = 0  # index of the batch the next call will return
        self.num_batches = int(np.ceil(n_rows / batch_size))

    def next_batch(self):
        """Return the current batch and move the cursor forward (or rewind it)."""
        n_rows = self.X.shape[0]
        start = self.batch_number * self.batch_size
        stop = start + self.batch_size
        if stop < n_rows:
            self.batch_number += 1
        else:
            # Last batch of a pass: clamp to the data and wrap around.
            stop = n_rows
            self.batch_number = 0
        return self.X[start:stop, :]

# Retrieving NBA Data

In [339]:
# One row per (player, game) from `regboxscores`, restricted to rows with MIN > 0
# (presumably minutes played -- confirm against the schema).
AVG_PLAYER_STATS_QUERY = """SELECT 
    player_id, 
    game_id,
    FGM, 
    FGA, 
    3PM, 
    3PA, 
    FTM, 
    FTA, 
    OREB, 
    DREB, 
    AST, 
    TOV, 
    STL, 
    BLK, 
    PF, 
    PM
FROM regboxscores WHERE MIN > 0;"""
PLAYERS_QUERY = """SELECT code_name, id FROM players"""
GAMES_QUERY = """SELECT id, dt FROM reggames"""

# Take the connection URL from the environment so real credentials do not have
# to live in the notebook. The fallback is the original local-development URL;
# set NBA_DB_URL to point anywhere else.
NBA_DB_URL = os.environ.get('NBA_DB_URL', 'mysql+pymysql://root:beer@localhost/nba')

engine = create_engine(NBA_DB_URL)
raw_box_scores = pd.read_sql_query(AVG_PLAYER_STATS_QUERY, con=engine)
raw_players = pd.read_sql_query(PLAYERS_QUERY, con=engine)
raw_games = pd.read_sql_query(GAMES_QUERY, con=engine)

In [347]:
# Discard rows with any missing value. The .copy() gives each cleaned frame its
# own data, so the raw query results stay untouched by later edits.
box_scores, players, games = (
    raw_frame.dropna().copy()
    for raw_frame in (raw_box_scores, raw_players, raw_games)
)

In [9]:
# NOTE: this cell rebinds `box_scores`; re-run the dropna cell before re-running it.

# Attach the game date to every box-score line, then the player's code name.
# `axis=1` is passed by keyword: recent pandas releases reject a positional axis
# argument to DataFrame.drop.
box_scores = (
    box_scores
    .merge(games, left_on='game_id', right_on='id', how='inner')
    .drop(['id', 'game_id'], axis=1)
)
box_scores = (
    players
    .merge(box_scores, left_on='id', right_on='player_id', how='inner')
    .drop('id', axis=1)
)

box_scores['dt'] = pd.to_datetime(box_scores['dt'])

# Collapse each date to the year its season started in: games from October
# onward count toward that calendar year, earlier months toward the year before.
box_scores['dt'] = np.where(box_scores['dt'].dt.month >= 10,
                            box_scores['dt'].dt.year,
                            box_scores['dt'].dt.year - 1)

# Make the grouping key unique per player-season, e.g. "<code_name> (2015-2016)".
box_scores['code_name'] = (
    box_scores['code_name'].astype(str)
    + ' (' + box_scores['dt'].astype(str)
    + '-' + (box_scores['dt'] + 1).astype(str) + ')'
)
box_scores = box_scores.drop('dt', axis=1)

# Average every remaining column over the games of each player-season.
box_scores = box_scores.groupby('code_name').mean().reset_index()
box_scores['player_id'] = box_scores['player_id'].astype(int)

# Split identifiers from features by name, so the split does not silently shift
# if the column order ever changes.
id_columns = ['code_name', 'player_id']
df1 = box_scores[id_columns]                 # player names + ids
df2 = box_scores.drop(id_columns, axis=1)    # features

# Autoencoder

In [349]:
def train_model(data, n_hidden2, n_epochs, seed=None):
    """Train a three-hidden-layer autoencoder on `data` and evaluate it.

    The network is input -> 14 -> n_hidden2 -> 14 -> output, with ELU
    activations on the hidden layers, a linear output layer,
    variance-scaling weight initialisation and L2 weight regularisation.
    It is trained with Adam on the mean squared reconstruction error plus
    the regularisation terms, using sequential mini-batches of 70 rows.

    Parameters
    ----------
    data : 2-D numpy array of shape (n_samples, n_features)
        Training data; also used for the final evaluation.
    n_hidden2 : int
        Width of the middle (bottleneck) layer.
    n_epochs : int
        Number of full passes over `data`.
    seed : int, optional
        Graph-level TensorFlow seed. Left unset by default, in which case
        weight initialisation differs from run to run.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        The reconstructed outputs and the middle-layer activations, both
        computed on all of `data` after training.
    """
    n_inputs = data.shape[1]  # previously hard-coded to 14
    n_hidden1 = 14
    n_hidden3 = n_hidden1
    n_outputs = n_inputs
    learning_rate = 0.01
    l2_reg = 0.001
    batch_size = 70

    # Build each model in its own graph. In the shared default graph, the
    # REGULARIZATION_LOSSES collection and the optimiser's variable list would
    # also contain every previously built model, so later calls would report
    # (and minimise) losses polluted by earlier models' weights.
    with tf.Graph().as_default():
        if seed is not None:
            tf.set_random_seed(seed)

        X = tf.placeholder(tf.float32, shape=[None, n_inputs])
        with tf.contrib.framework.arg_scope(
                [fully_connected],
                activation_fn=tf.nn.elu,
                weights_initializer=tf.contrib.layers.variance_scaling_initializer(),
                weights_regularizer=tf.contrib.layers.l2_regularizer(l2_reg)):
            hidden1 = fully_connected(X, n_hidden1)
            hidden2 = fully_connected(hidden1, n_hidden2)
            hidden3 = fully_connected(hidden2, n_hidden3)
            outputs = fully_connected(hidden3, n_outputs, activation_fn=None)

        reconstruction_loss = tf.reduce_mean(tf.square(outputs - X))

        reg_losses = tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES)
        loss = tf.add_n([reconstruction_loss] + reg_losses)

        optimizer = tf.train.AdamOptimizer(learning_rate)
        training_op = optimizer.minimize(loss)

        init = tf.global_variables_initializer()

        data_reader = dataReader(data, batch_size)

        with tf.Session() as sess:
            init.run()
            for epoch in range(n_epochs):
                for iteration in range(data_reader.num_batches):
                    X_batch = data_reader.next_batch()
                    sess.run(training_op, feed_dict={X: X_batch})
            # A single forward pass over the full data yields all three results.
            final_loss, reconstructed, codes = sess.run(
                [loss, outputs, hidden2], feed_dict={X: data})

    print('d=' + str(n_hidden2) + ', ' + 'loss: ' + str(final_loss))
    return pd.DataFrame(reconstructed), pd.DataFrame(codes)

In [350]:
def kmeans_cluster(data, col_name, n_clusters=5, random_state=None):
    """Cluster the rows of `data` with K-Means and return the labels as a frame.

    Parameters
    ----------
    data : array-like or DataFrame of shape (n_samples, n_features)
        Points to cluster.
    col_name : str
        Name given to the single column of the returned frame.
    n_clusters : int, default 5
        Number of clusters to fit.
    random_state : int, optional
        Passed to KMeans. Supply a value for repeatable cluster assignments;
        the default leaves the initialisation random.

    Returns
    -------
    pandas.DataFrame
        One column named `col_name` with one cluster label per input row,
        indexed 0..n_samples-1.
    """
    km = KMeans(n_clusters=n_clusters, random_state=random_state).fit(data)
    # Building the frame from a mapping gives the (n_samples, 1) shape directly,
    # so no list-wrapping and transpose is needed.
    return pd.DataFrame({col_name: km.labels_})

# Train model with different dimensionalities for 2nd hidden layer

In [10]:
X_box = df2.values

# Train one autoencoder per bottleneck width, in this order, 50 epochs each.
(
    (outputs_1, hidden_1),
    (outputs_2, hidden_2),
    (outputs_5, hidden_5),
    (outputs_10, hidden_10),
    (outputs_14, hidden_14),
) = [train_model(X_box, width, 50) for width in (1, 2, 5, 10, 14)]

# Create clusterings for the results

In [1]:
# (features to cluster, name of the resulting label column), in column order.
cluster_inputs = [
    (hidden_1, 'Bucket (d=1)'),
    (hidden_2, 'Bucket (d=2)'),
    (hidden_5, 'Bucket (d=5)'),
    (hidden_10, 'Bucket (d=10)'),
    (hidden_14, 'Bucket (d=14)'),
    (df2, 'Bucket (original)'),
]
bucket_columns = [kmeans_cluster(features, label) for features, label in cluster_inputs]

# Identifier columns first, then one label column per clustering.
results = pd.concat([df1] + bucket_columns, axis=1)

# Plot results

In [2]:
# Number of rows K-Means assigned to each bucket when run on the original features.
original_bucket_sizes = (
    results[['player_id', 'Bucket (original)']]
    .groupby(['Bucket (original)'])
    .count()
)
ax = original_bucket_sizes.plot(
    kind='bar',
    title="K-Means Clustering of Original Data",
    figsize=(8, 5),
    legend=False,
    fontsize=12,
)
ax.set_xlabel("Bucket", fontsize=12)
ax.set_ylabel("Number of Players", fontsize=12)
plt.xticks(rotation=360)  # a full turn, so tick labels end up horizontal
plt.show()

In [3]:
# Number of rows K-Means assigned to each bucket when run on the d=14 encoding.
d14_bucket_sizes = (
    results[['player_id', 'Bucket (d=14)']]
    .groupby(['Bucket (d=14)'])
    .count()
)
ax = d14_bucket_sizes.plot(
    kind='bar',
    title="K-Means Clustering of d=14 Data",
    figsize=(8, 5),
    legend=False,
    fontsize=12,
)
ax.set_xlabel("Bucket", fontsize=12)
ax.set_ylabel("Number of Players", fontsize=12)
plt.xticks(rotation=360)  # a full turn, so tick labels end up horizontal
plt.show()

In [6]:
# Put the 2-D encoding (columns 0 and 1 of `hidden_2`) next to its cluster labels
# for plotting. The labels already stored in `results` are reused rather than
# fitting K-Means a second time: a fresh, unseeded fit may assign different
# buckets than the ones the metrics below are computed on.
new = pd.concat(
    [
        results.drop('Bucket (d=2)', axis=1),     # axis by keyword: positional form is rejected by recent pandas
        results['Bucket (d=2)'].rename('d=2'),
        hidden_2,
    ],
    axis=1,
)
new.head()

In [5]:
# Scatter the two bottleneck coordinates, coloured by their K-Means bucket.
plt.scatter(new[0], new[1], c=new['d=2'], alpha=0.5)

# NOTE(review): the row labels below are hard-coded; they only point at the
# named player-seasons for the dataset this was written against -- confirm
# after any change to the underlying data.
labelled_rows = [
    ('steph curry (15-16)', 1562),
    ('kobe bryant (02-03)', 1009),
    ('michael jordan (02-03)', 3698),
    ('shaq (01-02)', 5051),
    ('james young (15-16)', 7274),
    ('yao ming (10-11)', 7266),
    ('rick fox (01-02)', 2223),
    ('reggie williams (09-10)', 7123),
]
for caption, row in labelled_rows:
    plt.annotate(caption, (new[0][row], new[1][row]))
plt.show()

# Metrics

In [16]:
def calculate_rand_index(df, col_1, col_2):
    """Rand index between two clusterings stored as columns of `df`.

    Over all unordered pairs of rows, this is the fraction of pairs on which
    the two clusterings agree: both put the pair in the same cluster (n11) or
    both put it in different clusters (n00).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding one cluster label per row in each of the two columns.
    col_1, col_2 : int
        Positional (iloc) indices of the two label columns.

    Returns
    -------
    float
        (n11 + n00) / (number of row pairs), between 0 and 1.

    Raises
    ------
    ZeroDivisionError
        If `df` has fewer than two rows, so there are no pairs to compare.
    """
    def pairs_within(group_sizes):
        # Unordered pairs that can be formed inside each group, summed over groups.
        sizes = group_sizes.astype(np.int64)
        return int((sizes * (sizes - 1) // 2).sum())

    # Work on positions only, so the frame's index labels are irrelevant.
    labels_1 = df.iloc[:, col_1].values
    labels_2 = df.iloc[:, col_2].values
    uniques_1, codes_1 = np.unique(labels_1, return_inverse=True)
    uniques_2, codes_2 = np.unique(labels_2, return_inverse=True)

    # Pair counts follow from group sizes alone; no need to visit every pair.
    n_rows = len(df)
    total_pairs = n_rows * (n_rows - 1) // 2
    same_in_1 = pairs_within(np.bincount(codes_1))
    same_in_2 = pairs_within(np.bincount(codes_2))
    # A pair is together in both clusterings iff it shares a (label_1, label_2) cell.
    n11 = pairs_within(np.bincount(codes_1 * len(uniques_2) + codes_2))
    # Inclusion-exclusion: pairs that are apart in both clusterings.
    n00 = total_pairs - same_in_1 - same_in_2 + n11
    return (n11 + n00) / total_pairs

In [50]:
def calculate_jaccard_index(df, col_1, col_2):
    """Jaccard index between two clusterings stored as columns of `df`.

    Counts unordered pairs of rows that are together in both clusterings
    (n11), together only in the first (n10) or together only in the second
    (n01), and returns n11 / (n11 + n10 + n01).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding one cluster label per row in each of the two columns.
    col_1, col_2 : int
        Positional (iloc) indices of the two label columns.

    Returns
    -------
    float
        The Jaccard index, between 0 and 1.

    Raises
    ------
    ZeroDivisionError
        If no pair of rows shares a cluster in either clustering.
    """
    def pairs_within(group_sizes):
        # Unordered pairs that can be formed inside each group, summed over groups.
        sizes = group_sizes.astype(np.int64)
        return int((sizes * (sizes - 1) // 2).sum())

    # Work on positions only, so the frame's index labels are irrelevant.
    labels_1 = df.iloc[:, col_1].values
    labels_2 = df.iloc[:, col_2].values
    uniques_1, codes_1 = np.unique(labels_1, return_inverse=True)
    uniques_2, codes_2 = np.unique(labels_2, return_inverse=True)

    # Pair counts follow from group sizes alone; no need to visit every pair.
    same_in_1 = pairs_within(np.bincount(codes_1))
    same_in_2 = pairs_within(np.bincount(codes_2))
    # A pair is together in both clusterings iff it shares a (label_1, label_2) cell.
    n11 = pairs_within(np.bincount(codes_1 * len(uniques_2) + codes_2))
    n10 = same_in_1 - n11
    n01 = same_in_2 - n11
    return n11 / (n11 + n10 + n01)

In [None]:
# Positions (in `results`) of the label columns to compare against the reference clustering.
cols = list(range(2, 7))

In [None]:
# Rand index of each clustering in `cols` against column 7 of `results`
# (expected to be 'Bucket (original)' -- confirm if the column layout changes).
reference_col = 7
for col in cols:
    rand_index = calculate_rand_index(results, col, reference_col)
    print(rand_index)

In [8]:
# Jaccard index of each clustering in `cols` against column 7 of `results`
# (expected to be 'Bucket (original)' -- confirm if the column layout changes).
reference_col = 7
for col in cols:
    jaccard_index = calculate_jaccard_index(results, col, reference_col)
    print(jaccard_index)