#### Problem Statement

In this assignment, students will be using the K-nearest neighbors algorithm to predict how many points NBA players scored in the 2013-2014 season.

A look at the data

Before we dive into the algorithm, let’s take a look at our data. Each row in the data contains information on how a player performed in the 2013-2014 NBA season.


Download 'nba_2013.csv' file from this link:

https://www.dropbox.com/s/b3nv38jjo5dxcl6/nba_2013.csv?dl=0


Here are some selected columns from the data:

player - name of the player

pos - the position of the player

g - number of games the player was in

gs - number of games the player started

pts - total points the player scored

There are many more columns in the data, mostly containing information about average player game performance over the course of the season. See this site for an explanation of the rest of them. We can read our dataset in and figure out which columns are present:

import pandas

with open("nba_2013.csv", 'r') as csvfile:

nba = pandas.read_csv(csvfile)


In [1]:
import pandas
with open(r"F:\Data Science Master\Data\nba_2013.csv", 'r') as csvfile:
    nba = pandas.read_csv(csvfile)

# The names of all the columns in the data.
print(nba.columns.values)

['player' 'pos' 'age' 'bref_team_id' 'g' 'gs' 'mp' 'fg' 'fga' 'fg.' 'x3p'
 'x3pa' 'x3p.' 'x2p' 'x2pa' 'x2p.' 'efg.' 'ft' 'fta' 'ft.' 'orb' 'drb'
 'trb' 'ast' 'stl' 'blk' 'tov' 'pf' 'pts' 'season' 'season_end']


In [2]:
nba.shape

(481, 31)

#### Euclidean distance

In [3]:
import math

# Select Lebron James from our dataset
selected_player = nba[nba["player"] == "LeBron James"].iloc[0]
# Choose only the numeric columns (we'll use these to compute euclidean distance)
distance_columns = ['age', 'g', 'gs', 'mp', 'fg', 'fga', 'fg.', 'x3p', 'x3pa', 'x3p.', 'x2p', 'x2pa', 'x2p.', 'efg.', 'ft', 'fta', 'ft.', 'orb', 'drb', 'trb', 'ast', 'stl', 'blk', 'tov', 'pf', 'pts']
def euclidean_distance(row):
    inner_value = 0
    for k in distance_columns:
        inner_value += (row[k] - selected_player[k]) ** 2
        return math.sqrt(inner_value)
    
# Find the distance from each player in the dataset to lebron.
lebron_distance = nba.apply(euclidean_distance, axis=1)
lebron_distance.head()

0    6.0
1    9.0
2    2.0
3    1.0
4    4.0
dtype: float64

#### Normalizing columns

In [4]:
# Select only the numeric columns from the NBA dataset
nba_numeric = nba[distance_columns]

# Normalize all of the numeric columns
nba_normalized = (nba_numeric - nba_numeric.mean()) / nba_numeric.std()
nba_normalized.head()

Unnamed: 0,age,g,gs,mp,fg,fga,fg.,x3p,x3pa,x3p.,...,ft.,orb,drb,trb,ast,stl,blk,tov,pf,pts
0,-0.835906,0.384886,-0.862207,-0.435088,-0.738401,-0.768505,0.319884,-0.700282,-0.716608,-0.117009,...,-0.389712,0.26069,-0.129462,-0.013116,-0.64522,-0.468056,0.06141,-0.66765,0.226515,-0.734621
1,-1.550487,1.095711,-0.187863,-0.045011,-0.581271,-0.649215,0.674593,-0.778936,-0.829601,,...,-0.88295,1.387883,0.18702,0.565852,-0.530733,0.02068,1.065446,-0.01376,1.363938,-0.534801
2,0.116868,-0.010016,-0.4576,-0.308035,-0.290291,-0.405214,0.84688,-0.778936,-0.829601,,...,-0.520826,0.743773,0.28334,0.436083,-0.568895,-0.439307,0.385292,-0.524113,0.029924,-0.328603
3,0.355062,0.779789,1.599148,1.465144,1.577804,1.590172,0.228673,1.737992,1.430256,0.898007,...,0.578033,-0.38342,0.462221,0.216475,1.033919,-0.123066,-0.68352,1.18238,0.423107,1.729123
4,-0.359519,0.108454,0.149309,-0.31918,-0.331028,-0.475703,1.110379,-0.778936,-0.822068,-1.808704,...,0.709147,0.614951,0.138859,0.291341,-0.55363,-0.468056,0.709175,-0.141348,1.139262,-0.400878


#### Finding the nearest neighbor

In [5]:
from scipy.spatial import distance

# Fill in NA values in nba_normalized
nba_normalized.fillna(0, inplace=True)

nba_normalized.head()

Unnamed: 0,age,g,gs,mp,fg,fga,fg.,x3p,x3pa,x3p.,...,ft.,orb,drb,trb,ast,stl,blk,tov,pf,pts
0,-0.835906,0.384886,-0.862207,-0.435088,-0.738401,-0.768505,0.319884,-0.700282,-0.716608,-0.117009,...,-0.389712,0.26069,-0.129462,-0.013116,-0.64522,-0.468056,0.06141,-0.66765,0.226515,-0.734621
1,-1.550487,1.095711,-0.187863,-0.045011,-0.581271,-0.649215,0.674593,-0.778936,-0.829601,0.0,...,-0.88295,1.387883,0.18702,0.565852,-0.530733,0.02068,1.065446,-0.01376,1.363938,-0.534801
2,0.116868,-0.010016,-0.4576,-0.308035,-0.290291,-0.405214,0.84688,-0.778936,-0.829601,0.0,...,-0.520826,0.743773,0.28334,0.436083,-0.568895,-0.439307,0.385292,-0.524113,0.029924,-0.328603
3,0.355062,0.779789,1.599148,1.465144,1.577804,1.590172,0.228673,1.737992,1.430256,0.898007,...,0.578033,-0.38342,0.462221,0.216475,1.033919,-0.123066,-0.68352,1.18238,0.423107,1.729123
4,-0.359519,0.108454,0.149309,-0.31918,-0.331028,-0.475703,1.110379,-0.778936,-0.822068,-1.808704,...,0.709147,0.614951,0.138859,0.291341,-0.55363,-0.468056,0.709175,-0.141348,1.139262,-0.400878


In [9]:
# Find the normalized vector for lebron james.
lebron_normalized = nba_normalized[nba["player"] == "LeBron James"]

# Find the distance between lebron james and everyone else.
euclidean_distances = nba_normalized.apply(lambda row: distance.euclidean(row, lebron_normalized), axis=1)

# Create a new dataframe with distances.
distance_frame = pandas.DataFrame(data={"dist": euclidean_distances, "idx": euclidean_distances.index})
distance_frame.sort_values("dist", inplace=True)
# Find the most similar player to lebron (the lowest distance to lebron is lebron, the second smallest is the most similar non-lebron player)
second_smallest = distance_frame.iloc[1]["idx"]
print(second_smallest)
most_similar_to_lebron = nba.loc[int(second_smallest)]["player"]
print(most_similar_to_lebron)

17.0
Carmelo Anthony


#### Generating training and testing sets

In [10]:
import random
from numpy.random import permutation

# Randomly shuffle the index of nba.
random_indices = permutation(nba.index)
# Set a cutoff for how many items we want in the test set (in this case 1/3 of the items)
test_cutoff = math.floor(len(nba)/3)
# Generate the test set by taking the first 1/3 of the randomly shuffled indices.
test = nba_normalized.loc[random_indices[1:test_cutoff]]
# Generate the train set with the rest of the data.
train = nba_normalized.loc[random_indices[test_cutoff:]]

In [11]:
test.head()

Unnamed: 0,age,g,gs,mp,fg,fga,fg.,x3p,x3pa,x3p.,...,ft.,orb,drb,trb,ast,stl,blk,tov,pf,pts
370,-1.0741,-0.523389,-0.322732,-0.934387,-0.930448,-0.966417,0.492171,-0.739609,-0.731674,-0.832726,...,-0.139971,-0.319009,-0.721145,-0.622031,-0.744441,-0.583053,-0.456802,-0.843084,-0.447513,-0.938693
320,-0.597712,-0.325938,-0.862207,-0.917669,-0.697664,-0.703438,0.056385,-0.130041,-0.159177,0.543512,...,-1.388674,-0.512242,-0.652345,-0.632013,-0.752074,-0.698049,-0.424414,-0.715496,-0.587936,-0.696358
223,0.355062,-0.918292,-0.862207,-1.216357,-1.05848,-1.088417,0.421229,-0.778936,-0.822068,-1.808704,...,-1.107716,-0.657166,-0.934426,-0.881568,-0.836031,-1.04304,-0.521579,-0.986621,-1.149626,-1.038603
366,0.593256,-0.720841,0.284178,-0.279057,-0.069146,-0.071746,0.188135,0.656499,0.563978,0.694547,...,1.202385,-0.753783,-0.672985,-0.721853,-0.263597,-0.32431,-0.68352,-0.476268,-0.573893,0.032775
96,2.022417,-1.234214,-0.828489,-1.187379,-1.05848,-1.085706,0.218538,-0.778936,-0.807002,-1.808704,...,0.172205,-0.785988,-1.037626,-0.996363,-0.828398,-0.899294,-0.748297,-1.034466,-1.065372,-1.044981


#### Building model using sklearn

In [12]:
# The columns that we will be making predictions with.
x_columns = ['age', 'g', 'gs', 'mp', 'fg', 'fga', 'fg.', 'x3p', 'x3pa', 'x3p.', 'x2p', 'x2pa', 'x2p.', 'efg.', 'ft', 'fta', 'ft.', 'orb', 'drb', 'trb', 'ast', 'stl', 'blk', 'tov', 'pf']
# The column that we want to predict.
y_column = ["pts"]

from sklearn.neighbors import KNeighborsRegressor
# Create the knn model.
# Look at the five closest neighbors.
knn = KNeighborsRegressor(n_neighbors=5)
# Fit the model on the training data.
knn.fit(train[x_columns], train[y_column])
# Make point predictions on the test set using the fit model.
predictions = knn.predict(test[x_columns])

In [18]:
predictions

array([[-0.7809625 ],
       [-0.7996691 ],
       [-1.01479499],
       [-0.15003994],
       [-0.95825004],
       [ 0.31252324],
       [-1.01692074],
       [-1.01139379],
       [ 2.9824651 ],
       [-0.72144151],
       [-0.95229794],
       [-0.97865724],
       [-0.38684847],
       [-0.92211229],
       [-0.51184257],
       [ 0.23472079],
       [ 2.28266824],
       [ 0.47960718],
       [-0.42256107],
       [-0.7358966 ],
       [-1.02329799],
       [-1.09344774],
       [-1.01904649],
       [-0.86301645],
       [ 1.70616487],
       [-0.19042918],
       [-0.45274672],
       [-0.45742337],
       [-0.88002245],
       [-0.8557889 ],
       [-0.30394423],
       [-0.20828548],
       [ 0.57101443],
       [-0.93699254],
       [-0.853238  ],
       [-1.03392674],
       [ 1.35031434],
       [ 1.60072767],
       [-1.09599864],
       [ 0.27766094],
       [-0.47357907],
       [-0.86386675],
       [ 0.40648139],
       [-1.04115429],
       [-0.11942914],
       [-0

#### Computing error

In [17]:
# Get the actual values for the test set.
actual = test[y_column]

# Compute the mean squared error of our predictions.
mse = (((predictions - actual) ** 2).sum()) / len(predictions)
print("Mean Squared Error "+str(mse))

Mean Squared Error pts    0.041746
dtype: float64
