https://www.dataquest.io/blog/k-nearest-neighbors-in-python/

In [46]:
import pandas as pd
import math
import scipy
from scipy.spatial import distance
import random
from numpy.random import permutation
from sklearn.neighbors import KNeighborsRegressor

In [36]:
path = '~/Desktop/image_analysis/Statistics/'
nba = pd.read_csv(path+'nba_2013.csv')
print(nba.columns.values)

['player' 'pos' 'age' 'bref_team_id' 'g' 'gs' 'mp' 'fg' 'fga' 'fg.' 'x3p'
 'x3pa' 'x3p.' 'x2p' 'x2pa' 'x2p.' 'efg.' 'ft' 'fta' 'ft.' 'orb' 'drb'
 'trb' 'ast' 'stl' 'blk' 'tov' 'pf' 'pts' 'season' 'season_end']


In [37]:
selected_player = nba[nba['player'] == 'LeBron James'].iloc[0]

In [38]:
selected_player

player          LeBron James
pos                       PF
age                       29
bref_team_id             MIA
g                         77
gs                        77
mp                      2902
fg                       767
fga                     1353
fg.                    0.567
x3p                      116
x3pa                     306
x3p.                0.379085
x2p                      651
x2pa                    1047
x2p.                0.621777
efg.                    0.61
ft                       439
fta                      585
ft.                     0.75
orb                       81
drb                      452
trb                      533
ast                      488
stl                      121
blk                       26
tov                      270
pf                       126
pts                     2089
season             2013-2014
season_end              2013
Name: 225, dtype: object

In [39]:
# Choosing only the numeric columns (we'll use these to complete euclidean distance calculations)
distance_columns = ['age', 'g', 'gs', 'mp', 'fg', 'fga', 'fg.', 'x3p', 'x3pa', 'x3p.', 'x2p', 'x2pa', 'x2p.', 'efg.', 'ft', 'fta', 'ft.', 'orb', 'drb', 'trb', 'ast', 'stl', 'blk', 'tov', 'pf', 'pts']

def euclidean_distance(row):
    """
    A simple euclidean distance function
    """
    inner_value = 0
    for k in distance_columns:
        inner_value += (row[k]-selected_player[k])**2
    return math.sqrt(inner_value)

lebron_distance = nba.apply(euclidean_distance, axis=1)

In [40]:
# Select only the numeric columns from the NBA dataset
nba_numeric = nba[distance_columns]
# Normalize all of the numeric columns
nba_normalized = (nba_numeric - nba_numeric.mean())/nba_numeric.std()

In [41]:
nba_normalized

Unnamed: 0,age,g,gs,mp,fg,fga,fg.,x3p,x3pa,x3p.,...,ft.,orb,drb,trb,ast,stl,blk,tov,pf,pts
0,-0.835906,0.384886,-0.862207,-0.435088,-0.738401,-0.768505,0.319884,-0.700282,-0.716608,-0.117009,...,-0.389712,0.260690,-0.129462,-0.013116,-0.645220,-0.468056,0.061410,-0.667650,0.226515,-0.734621
1,-1.550487,1.095711,-0.187863,-0.045011,-0.581271,-0.649215,0.674593,-0.778936,-0.829601,,...,-0.882950,1.387883,0.187020,0.565852,-0.530733,0.020680,1.065446,-0.013760,1.363938,-0.534801
2,0.116868,-0.010016,-0.457600,-0.308035,-0.290291,-0.405214,0.846880,-0.778936,-0.829601,,...,-0.520826,0.743773,0.283340,0.436083,-0.568895,-0.439307,0.385292,-0.524113,0.029924,-0.328603
3,0.355062,0.779789,1.599148,1.465144,1.577804,1.590172,0.228673,1.737992,1.430256,0.898007,...,0.578033,-0.383420,0.462221,0.216475,1.033919,-0.123066,-0.683520,1.182380,0.423107,1.729123
4,-0.359519,0.108454,0.149309,-0.319180,-0.331028,-0.475703,1.110379,-0.778936,-0.822068,-1.808704,...,0.709147,0.614951,0.138859,0.291341,-0.553630,-0.468056,0.709175,-0.141348,1.139262,-0.400878
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
476,-1.550487,0.740298,-0.322732,0.588028,0.885271,1.039814,-0.095633,0.007604,0.586576,-0.458948,...,-0.508339,0.212382,-0.026261,0.046777,0.797313,1.113149,-0.262473,2.107395,0.633741,0.897955
477,0.355062,0.424376,-0.558752,0.638181,1.129694,1.259415,-0.014557,1.875637,1.806899,0.638211,...,0.640468,-0.431728,-0.177622,-0.262671,-0.133846,0.193175,-0.392026,0.369005,0.703952,1.333733
478,-0.359519,1.016730,1.767734,1.650152,2.264518,2.327598,0.178000,0.990779,1.369994,0.146594,...,-0.065049,1.774349,1.012624,1.284571,0.530177,3.671827,0.385292,1.485402,1.504361,1.914063
479,-1.312293,1.135201,-0.761055,0.199066,-0.121522,-0.055479,-0.105767,-0.778936,-0.822068,-1.808704,...,0.047334,1.001417,0.496621,0.670665,-0.156743,0.020680,0.547234,0.241416,0.900544,-0.056507


In [42]:
# Fill in NaN values in nba_normalized
nba_normalized.fillna(0, inplace=True)

# Find the normalized vector for lebron james
lebron_normalized = nba_normalized[nba['player'] == 'LeBron James']

# Find the distance between lebron james and everyone else
euclidean_distances = nba_normalized.apply(lambda row: distance.euclidean(row, lebron_normalized), axis=1)

# Create a new dataframe with distances:
distance_frame = pd.DataFrame(data={'dist': euclidean_distances, 'idx': euclidean_distances.index})
distance_frame.sort_values('dist', inplace=True)
# Find the most similar player to lebron (the lowest distance to lebron is lebron, the second smallest is the most similar non-lebron player)
second_smallest = distance_frame.iloc[1]["idx"]
most_similar_to_lebron = nba.loc[int(second_smallest)]['player']

In [43]:
most_similar_to_lebron

'Carmelo Anthony'

Let's start making predictions of points (pts) using KNN approach

In [44]:
# Randomly shuffle the index of nba data frame
random_indices = permutation(nba.index)

# Set a cutoff for how many items we want in the test set (in this case 1/3 of the items)
test_cutoff = math.floor(len(nba)/3)

# Generate the test set by taking the first 1/3 of the randomly shuffled indices
test = nba.loc[random_indices[1:test_cutoff]]

# Generate the training set with the rest of the data
train = nba.loc[random_indices[test_cutoff:]]

In [49]:
train[x_columns]

Unnamed: 0,age,g,gs,mp,fg,fga,fg.,x3p,x3pa,x3p.,...,fta,ft.,orb,drb,trb,ast,stl,blk,tov,pf
358,27,28,0,99,19,46,0.413,6,22,0.272727,...,2,0.000,1,9,10,13,1,0,7,5
98,33,81,0,1353,134,241,0.556,4,17,0.235294,...,93,0.710,116,177,293,104,30,28,71,187
426,22,77,41,1742,171,372,0.460,67,167,0.401198,...,73,0.712,73,174,247,73,53,12,60,144
377,23,82,82,2638,255,670,0.381,44,133,0.330827,...,283,0.802,61,281,342,704,191,11,221,218
173,29,37,6,355,27,61,0.443,0,1,0.000000,...,20,0.550,42,69,111,22,10,8,31,64
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
303,23,45,7,723,73,166,0.440,25,77,0.324675,...,36,0.806,9,43,52,43,24,9,22,88
210,24,2,0,7,0,4,0.000,0,2,0.000000,...,2,0.500,0,0,0,1,1,0,0,0
423,24,72,54,2497,496,1096,0.453,127,364,0.348901,...,407,0.850,47,163,210,454,93,8,213,185
119,26,47,3,494,64,153,0.418,18,53,0.339623,...,35,0.886,14,59,73,65,22,6,35,48


In [52]:
null_columns=train.columns[train.isnull().any()]
train[train.isnull().any(axis=1)][null_columns]

Unnamed: 0,fg.,x3p.,x2p.,efg.,ft.
137,0.5,,0.5,0.5,
39,0.436,,0.435583,0.436,0.905
353,0.517,,0.517179,0.517,0.561
2,0.52,,0.52,0.52,0.639
90,,,,,
139,0.473,,0.473333,0.473,0.554
442,0.6,,0.6,0.6,0.519
386,0.505,,0.504545,0.505,0.871
317,0.421,,0.421053,0.421,0.8
424,0.222,,0.222222,0.222,0.5


In [47]:
# The columns that we will be making predictions with:
x_columns = ['age', 'g', 'gs', 'mp', 'fg', 'fga', 'fg.', 'x3p', 'x3pa', 'x3p.', 'x2p', 'x2pa', 'x2p.', 'efg.', 'ft', 'fta', 'ft.', 'orb', 'drb', 'trb', 'ast', 'stl', 'blk', 'tov', 'pf']
# The column that we want to predict:
y_column = ['pts']

# Create the knn model
# Look at the five closest neighbors
knn = KNeighborsRegressor(n_neighbors=5)
# Fit the model on the training data
knn.fit(train[x_columns], train[y_column])
# Make point predictions on the test set using the fit model
predictions = knn.predict(test[x_columns])

ValueError: Input contains NaN, infinity or a value too large for dtype('float64').