In [1]:
import pandas
with open("nba_2013.csv", 'r') as csvfile:
    nba = pandas.read_csv(csvfile)

# The names of all the columns in the data.
print(nba.columns.values) 

['player' 'pos' 'age' 'bref_team_id' 'g' 'gs' 'mp' 'fg' 'fga' 'fg.' 'x3p'
 'x3pa' 'x3p.' 'x2p' 'x2pa' 'x2p.' 'efg.' 'ft' 'fta' 'ft.' 'orb' 'drb'
 'trb' 'ast' 'stl' 'blk' 'tov' 'pf' 'pts' 'season' 'season_end']


In [2]:
# Select Lebron James from our dataset
selected_player = nba[nba["player"] == "LeBron James"].iloc[0]
print("selected_player", selected_player)

selected_player player          LeBron James
pos                       PF
age                       29
bref_team_id             MIA
g                         77
gs                        77
mp                      2902
fg                       767
fga                     1353
fg.                    0.567
x3p                      116
x3pa                     306
x3p.                0.379085
x2p                      651
x2pa                    1047
x2p.                0.621777
efg.                    0.61
ft                       439
fta                      585
ft.                     0.75
orb                       81
drb                      452
trb                      533
ast                      488
stl                      121
blk                       26
tov                      270
pf                       126
pts                     2089
season             2013-2014
season_end              2013
Name: 225, dtype: object


In [3]:
# Choose only the numeric columns 
# (we'll use these to compute euclidean distance)
distance_columns = ['age', 'g', 'gs', 'mp', 'fg', 'fga', 'fg.', 'x3p', 'x3pa', 'x3p.', 'x2p', 'x2pa', 'x2p.', 'efg.', 'ft', 'fta', 'ft.', 'orb', 'drb', 'trb', 'ast', 'stl', 'blk', 'tov', 'pf', 'pts']

print("distance_columns", distance_columns)

distance_columns ['age', 'g', 'gs', 'mp', 'fg', 'fga', 'fg.', 'x3p', 'x3pa', 'x3p.', 'x2p', 'x2pa', 'x2p.', 'efg.', 'ft', 'fta', 'ft.', 'orb', 'drb', 'trb', 'ast', 'stl', 'blk', 'tov', 'pf', 'pts']


In [7]:
import math
def euclidean_distance(row):
    """
    A simple euclidean distance function
    """
    inner_value = 0
    for k in distance_columns:
        inner_value += (row[k] - selected_player[k]) ** 2
    return math.sqrt(inner_value)

In [8]:
# Find the distance from each player in the dataset to lebron.
lebron_distance = nba.apply(euclidean_distance, axis=1)

In [9]:
print("lebron_distance=", lebron_distance)

lebron_distance= 0      3475.792868
1              NaN
2              NaN
3      1189.554979
4      3216.773098
5              NaN
6       960.443178
7      3131.071083
8      2326.129199
9      2806.955657
10     2277.933945
11             NaN
12     2819.058890
13     2534.074598
14     1970.085795
15     3262.065464
16     2451.378405
17      485.856006
18             NaN
19     3246.515831
20     1539.172839
21             NaN
22     2969.043638
23             NaN
24     2023.603985
25             NaN
26             NaN
27             NaN
28     3754.041967
29     3835.882699
          ...     
451     716.243023
452    2996.450583
453    4135.156714
454    3023.456473
455    4138.570811
456            NaN
457    2206.524879
458    1347.758158
459    2136.309449
460            NaN
461            NaN
462    1922.713718
463    2364.771676
464    3033.755934
465    2625.998112
466    2495.296784
467    2232.354830
468            NaN
469    3525.434026
470    3574.911070
471    2873.50

In [10]:
# Normalizing columns
# Select only the numeric columns from the NBA dataset
nba_numeric = nba[distance_columns]
# Normalize all of the numeric columns
nba_normalized = (nba_numeric - nba_numeric.mean()) / nba_numeric.std()

In [11]:
# Find the distance from each player in the dataset to lebron.
lebron_distance = nba_normalized.apply(euclidean_distance, axis=1)

In [12]:
print("lebron_distance=", lebron_distance)

lebron_distance= 0      4269.690713
1              NaN
2              NaN
3      4264.046570
4      4268.914413
5              NaN
6      4261.950640
7      4268.807580
8      4267.062925
9      4267.997363
10     4266.689368
11             NaN
12     4268.017939
13     4267.611435
14     4266.086623
15     4269.130743
16     4267.187818
17     4260.328584
18             NaN
19     4269.045275
20     4264.618827
21             NaN
22     4268.447307
23             NaN
24     4266.127071
25             NaN
26             NaN
27             NaN
28     4270.295189
29     4270.431332
          ...     
451    4261.766767
452    4268.615441
453    4271.113537
454    4268.692225
455    4271.131461
456            NaN
457    4266.811468
458    4264.101745
459    4265.815859
460            NaN
461            NaN
462    4265.804838
463    4267.034025
464    4268.664718
465    4267.628059
466    4267.381259
467    4266.669488
468            NaN
469    4269.811132
470    4269.879721
471    4268.31

In [15]:
# Finding the nearest neighbor
from scipy.spatial import distance

# Fill in NA values in nba_normalized
nba_normalized.fillna(0, inplace=True)

# Find the normalized vector for lebron james.
lebron_normalized = nba_normalized[nba["player"] == "LeBron James"]

# Find the distance between lebron james and everyone else.
euclidean_distances = nba_normalized.apply(lambda row: distance.euclidean(row, lebron_normalized), axis=1)

# Create a new dataframe with distances.
distance_frame = pandas.DataFrame(data={"dist": euclidean_distances, "idx": euclidean_distances.index})
distance_frame.sort_values("dist", inplace=True)
# Find the most similar player to lebron (the lowest distance to lebron is lebron, the second smallest is the most similar non-lebron player)
second_smallest = distance_frame.iloc[1]["idx"]
most_similar_to_lebron = nba.loc[int(second_smallest)]["player"]

In [16]:
print("second_smallest=", second_smallest)

second_smallest= 17.0


In [17]:
print("most_similar_to_lebron=", most_similar_to_lebron)

most_similar_to_lebron= Carmelo Anthony


In [26]:
#-------------------------------------
# Generating training and testing sets
#--------------------------------------
import random
import numpy as np
from numpy.random import permutation

# Randomly shuffle the index of nba.
random_indices = permutation(nba.index)

# Set a cutoff for how many items we want in the test set (in this case 1/3 of the items)
test_cutoff = math.floor(len(nba)/3)

# Generate the test set by taking the first 1/3 of the randomly shuffled indices.
test = nba.loc[random_indices[1:test_cutoff]]

# Generate the train set with the rest of the data.
train = nba.loc[random_indices[test_cutoff:]]



In [20]:
#---------------------------------------
# Using sklearn for k nearest neighbors
#---------------------------------------
# Sklearn performs the normalization and 
# distance finding automatically, and lets 
# us specify how many neighbors we want to look at.
#-----------------------------------------
# The columns that we will be making predictions with.
x_columns = ['age', 'g', 'gs', 'mp', 'fg', 'fga', 'fg.', 'x3p', 'x3pa', 'x3p.', 'x2p', 'x2pa', 'x2p.', 'efg.', 'ft', 'fta', 'ft.', 'orb', 'drb', 'trb', 'ast', 'stl', 'blk', 'tov', 'pf']
# The column that we want to predict.
y_column = ["pts"]

from sklearn.neighbors import KNeighborsRegressor

# Create the knn model.
# Look at the five closest neighbors.
knn = KNeighborsRegressor(n_neighbors=5)

# Fit the model on the training data.
knn.fit(train[x_columns], train[y_column])

# Make point predictions on the test set using the fit model.
predictions = knn.predict(test[x_columns])


ValueError: Input contains NaN, infinity or a value too large for dtype('float64').