## Predicting score using K-nearest neighbors
In this assignment, I will be using the K-nearest neighbors algorithm to predict how many points NBA players scored in the 2013-2014 season.

In [1]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import model_selection
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn.neighbors import KNeighborsRegressor

In [2]:
# Widen pandas' display limits so the wide NBA tables are shown without truncation.
# 'display.height' is intentionally not set: it was deprecated in favour of
# 'display.max_rows' and has been removed from pandas, where setting it raises
# an OptionError.
pd.set_option('display.max_rows', 1000)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1500)

## Load NBA dataset

In [3]:
# Load the player statistics from a CSV expected in the notebook's working directory.
nba_data = pd.read_csv('nba_2013.csv')
# Show the first rows as a quick sanity check of the load.
nba_data.head()

Unnamed: 0,player,pos,age,bref_team_id,g,gs,mp,fg,fga,fg.,x3p,x3pa,x3p.,x2p,x2pa,x2p.,efg.,ft,fta,ft.,orb,drb,trb,ast,stl,blk,tov,pf,pts,season,season_end
0,Quincy Acy,SF,23,TOT,63,0,847,66,141,0.468,4,15,0.266667,62,126,0.492063,0.482,35,53,0.66,72,144,216,28,23,26,30,122,171,2013-2014,2013
1,Steven Adams,C,20,OKC,81,20,1197,93,185,0.503,0,0,,93,185,0.502703,0.503,79,136,0.581,142,190,332,43,40,57,71,203,265,2013-2014,2013
2,Jeff Adrien,PF,27,TOT,53,12,961,143,275,0.52,0,0,,143,275,0.52,0.52,76,119,0.639,102,204,306,38,24,36,39,108,362,2013-2014,2013
3,Arron Afflalo,SG,28,ORL,73,73,2552,464,1011,0.459,128,300,0.426667,336,711,0.472574,0.522,274,336,0.815,32,230,262,248,35,3,146,136,1330,2013-2014,2013
4,Alexis Ajinca,C,25,NOP,56,30,951,136,249,0.546,0,1,0.0,136,248,0.548387,0.546,56,67,0.836,94,183,277,40,23,46,63,187,328,2013-2014,2013


## Perform analysis of dataset

In [4]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
nba_data.describe()

Unnamed: 0,age,g,gs,mp,fg,fga,fg.,x3p,x3pa,x3p.,x2p,x2pa,x2p.,efg.,ft,fta,ft.,orb,drb,trb,ast,stl,blk,tov,pf,pts,season_end
count,481.0,481.0,481.0,481.0,481.0,481.0,479.0,481.0,481.0,414.0,481.0,481.0,478.0,479.0,481.0,481.0,461.0,481.0,481.0,481.0,481.0,481.0,481.0,481.0,481.0,481.0,481.0
mean,26.509356,53.253638,25.571726,1237.386694,192.881497,424.463617,0.436436,39.613306,110.130977,0.285111,153.268191,314.33264,0.466947,0.480752,91.205821,120.642412,0.722419,55.810811,162.817048,218.627859,112.536383,39.280665,24.10395,71.862786,105.869023,516.582121,2013.0
std,4.198265,25.322711,29.658465,897.25884,171.832793,368.850833,0.098672,50.855639,132.751732,0.157633,147.223161,294.174554,0.104448,0.099552,103.667725,131.240639,0.160166,62.101191,145.348116,200.356507,131.019557,34.78359,30.875381,62.70169,71.213627,470.422228,0.0
min,19.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2013.0
25%,23.0,32.0,0.0,388.0,47.0,110.0,0.4005,0.0,3.0,0.234355,31.0,67.0,0.434719,0.451,16.0,22.0,0.654,12.0,43.0,55.0,20.0,9.0,4.0,21.0,44.0,115.0,2013.0
50%,26.0,61.0,10.0,1141.0,146.0,332.0,0.438,16.0,48.0,0.330976,110.0,227.0,0.474475,0.488,53.0,73.0,0.751,35.0,135.0,168.0,65.0,32.0,14.0,58.0,104.0,401.0,2013.0
75%,29.0,76.0,54.0,2016.0,307.0,672.0,0.4795,68.0,193.0,0.375,230.0,459.0,0.513729,0.526,126.0,179.0,0.821,73.0,230.0,310.0,152.0,60.0,32.0,108.0,158.0,821.0,2013.0
max,39.0,83.0,82.0,3122.0,849.0,1688.0,1.0,261.0,615.0,1.0,706.0,1408.0,1.0,1.0,703.0,805.0,1.0,440.0,783.0,1114.0,721.0,191.0,219.0,295.0,273.0,2593.0,2013.0


In [5]:
# Column dtypes and non-null counts; columns with fewer non-null entries than rows contain missing values.
nba_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 481 entries, 0 to 480
Data columns (total 31 columns):
player          481 non-null object
pos             481 non-null object
age             481 non-null int64
bref_team_id    481 non-null object
g               481 non-null int64
gs              481 non-null int64
mp              481 non-null int64
fg              481 non-null int64
fga             481 non-null int64
fg.             479 non-null float64
x3p             481 non-null int64
x3pa            481 non-null int64
x3p.            414 non-null float64
x2p             481 non-null int64
x2pa            481 non-null int64
x2p.            478 non-null float64
efg.            479 non-null float64
ft              481 non-null int64
fta             481 non-null int64
ft.             461 non-null float64
orb             481 non-null int64
drb             481 non-null int64
trb             481 non-null int64
ast             481 non-null int64
stl             481 non-null int64
blk    

## Find missing values in columns and fill the missing values

In [6]:
# One boolean per column: True when the column holds at least one missing value.
nba_data.isnull().any()

player          False
pos             False
age             False
bref_team_id    False
g               False
gs              False
mp              False
fg              False
fga             False
fg.              True
x3p             False
x3pa            False
x3p.             True
x2p             False
x2pa            False
x2p.             True
efg.             True
ft              False
fta             False
ft.              True
orb             False
drb             False
trb             False
ast             False
stl             False
blk             False
tov             False
pf              False
pts             False
season          False
season_end      False
dtype: bool

In [7]:
# Fill missing values in each numeric column with that column's mean.
# numeric_only=True restricts the mean to numeric columns; without it, recent
# pandas versions raise a TypeError because the frame still contains text
# columns (player, pos, bref_team_id, season) at this point.
# The result is rebound to the same name instead of mutating in place, so
# later cells keep working and re-running this cell is harmless.
nba_data = nba_data.fillna(nba_data.mean(numeric_only=True))

## Perform One Hot Encoding on categorical fields pos, bref_team_id

In [8]:
# Replace each categorical column with one indicator column per category.
categorical_columns = ['pos', 'bref_team_id']
nba_data = pd.get_dummies(nba_data, columns=categorical_columns)

In [9]:
# Preview the frame after one-hot encoding to check the new pos_* and bref_team_id_* columns.
nba_data.head()

Unnamed: 0,player,age,g,gs,mp,fg,fga,fg.,x3p,x3pa,x3p.,x2p,x2pa,x2p.,efg.,ft,fta,ft.,orb,drb,trb,ast,stl,blk,tov,pf,pts,season,season_end,pos_C,pos_F,pos_G,pos_PF,pos_PG,pos_SF,pos_SG,bref_team_id_ATL,bref_team_id_BOS,bref_team_id_BRK,bref_team_id_CHA,bref_team_id_CHI,bref_team_id_CLE,bref_team_id_DAL,bref_team_id_DEN,bref_team_id_DET,bref_team_id_GSW,bref_team_id_HOU,bref_team_id_IND,bref_team_id_LAC,bref_team_id_LAL,bref_team_id_MEM,bref_team_id_MIA,bref_team_id_MIL,bref_team_id_MIN,bref_team_id_NOP,bref_team_id_NYK,bref_team_id_OKC,bref_team_id_ORL,bref_team_id_PHI,bref_team_id_PHO,bref_team_id_POR,bref_team_id_SAC,bref_team_id_SAS,bref_team_id_TOR,bref_team_id_TOT,bref_team_id_UTA,bref_team_id_WAS
0,Quincy Acy,23,63,0,847,66,141,0.468,4,15,0.266667,62,126,0.492063,0.482,35,53,0.66,72,144,216,28,23,26,30,122,171,2013-2014,2013,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
1,Steven Adams,20,81,20,1197,93,185,0.503,0,0,0.285111,93,185,0.502703,0.503,79,136,0.581,142,190,332,43,40,57,71,203,265,2013-2014,2013,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
2,Jeff Adrien,27,53,12,961,143,275,0.52,0,0,0.285111,143,275,0.52,0.52,76,119,0.639,102,204,306,38,24,36,39,108,362,2013-2014,2013,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
3,Arron Afflalo,28,73,73,2552,464,1011,0.459,128,300,0.426667,336,711,0.472574,0.522,274,336,0.815,32,230,262,248,35,3,146,136,1330,2013-2014,2013,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0
4,Alexis Ajinca,25,56,30,951,136,249,0.546,0,1,0.0,136,248,0.548387,0.546,56,67,0.836,94,183,277,40,23,46,63,187,328,2013-2014,2013,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0


## Prepare data for feature X and target Y
1. Get feature X from nba_data by removing the unwanted fields 'player', 'season', 'season_end', and 'pts' from nba_data
2. Get target Y by taking only the field 'pts' from nba_data

In [10]:
# The player name, the season labels and the target itself are excluded
# from the feature matrix.
non_feature_columns = ['player', 'season', 'season_end', 'pts']
X = nba_data.drop(non_feature_columns, axis=1)
# Double brackets keep the target as a single-column DataFrame.
Y = nba_data[['pts']]

In [11]:
# Preview the feature matrix.
X.head()

Unnamed: 0,age,g,gs,mp,fg,fga,fg.,x3p,x3pa,x3p.,x2p,x2pa,x2p.,efg.,ft,fta,ft.,orb,drb,trb,ast,stl,blk,tov,pf,pos_C,pos_F,pos_G,pos_PF,pos_PG,pos_SF,pos_SG,bref_team_id_ATL,bref_team_id_BOS,bref_team_id_BRK,bref_team_id_CHA,bref_team_id_CHI,bref_team_id_CLE,bref_team_id_DAL,bref_team_id_DEN,bref_team_id_DET,bref_team_id_GSW,bref_team_id_HOU,bref_team_id_IND,bref_team_id_LAC,bref_team_id_LAL,bref_team_id_MEM,bref_team_id_MIA,bref_team_id_MIL,bref_team_id_MIN,bref_team_id_NOP,bref_team_id_NYK,bref_team_id_OKC,bref_team_id_ORL,bref_team_id_PHI,bref_team_id_PHO,bref_team_id_POR,bref_team_id_SAC,bref_team_id_SAS,bref_team_id_TOR,bref_team_id_TOT,bref_team_id_UTA,bref_team_id_WAS
0,23,63,0,847,66,141,0.468,4,15,0.266667,62,126,0.492063,0.482,35,53,0.66,72,144,216,28,23,26,30,122,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
1,20,81,20,1197,93,185,0.503,0,0,0.285111,93,185,0.502703,0.503,79,136,0.581,142,190,332,43,40,57,71,203,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
2,27,53,12,961,143,275,0.52,0,0,0.285111,143,275,0.52,0.52,76,119,0.639,102,204,306,38,24,36,39,108,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
3,28,73,73,2552,464,1011,0.459,128,300,0.426667,336,711,0.472574,0.522,274,336,0.815,32,230,262,248,35,3,146,136,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0
4,25,56,30,951,136,249,0.546,0,1,0.0,136,248,0.548387,0.546,56,67,0.836,94,183,277,40,23,46,63,187,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0


In [12]:
# Preview the target column.
Y.head()

Unnamed: 0,pts
0,171
1,265
2,362
3,1330
4,328


## Divide feature and target into train and test dataset using train_test_split

In [13]:

# Hold out 30% of the rows for testing; the fixed random_state keeps the split reproducible.
# train_test_split is taken from sklearn.model_selection (imported at the top
# of the notebook): the older sklearn.cross_validation module was removed in
# scikit-learn 0.20, so importing from it fails on current versions.
X_train, X_test, y_train, y_test = model_selection.train_test_split(
    X, Y, test_size=0.3, random_state=0
)



## Apply KNeighborsRegressor on train data and then calculate accuracy (the R² score returned by `score`) on test data using different values of K

In [14]:
# Fit a KNN regressor for every K from 1 to 25 and report its score on the test split.
# For a regressor, .score() returns the coefficient of determination (R^2),
# not a classification accuracy; the printed label is kept unchanged so the
# output still matches earlier runs.
for K_value in range(1, 26):
    neighbor = KNeighborsRegressor(n_neighbors=K_value, weights='uniform', algorithm='auto')
    neighbor.fit(X_train, y_train)
    # score() predicts on X_test internally, so no separate predict() call is needed.
    print("Accuracy is ",neighbor.score(X_test,y_test) ," for K-Value:",K_value)

Accuracy is  0.9550293841767484  for K-Value: 1
Accuracy is  0.9667533821036304  for K-Value: 2
Accuracy is  0.9722822534883708  for K-Value: 3
Accuracy is  0.9726339258124398  for K-Value: 4
Accuracy is  0.9733831983576475  for K-Value: 5
Accuracy is  0.9756387408444478  for K-Value: 6
Accuracy is  0.9746027472541899  for K-Value: 7
Accuracy is  0.9770228646408327  for K-Value: 8
Accuracy is  0.9763740703273166  for K-Value: 9
Accuracy is  0.9748523596389462  for K-Value: 10
Accuracy is  0.9751299739952604  for K-Value: 11
Accuracy is  0.9745794319825458  for K-Value: 12
Accuracy is  0.9739574285892518  for K-Value: 13
Accuracy is  0.9729335547824521  for K-Value: 14
Accuracy is  0.9712182744081604  for K-Value: 15
Accuracy is  0.9700118313803022  for K-Value: 16
Accuracy is  0.9691800087657443  for K-Value: 17
Accuracy is  0.9676174344271444  for K-Value: 18
Accuracy is  0.9651109011024993  for K-Value: 19
Accuracy is  0.9637795637548277  for K-Value: 20
Accuracy is  0.96221851186386

## Conclusion: 
From the results it is evident that the accuracy (R² score) is about 97% for K values between 4 and 16. I will take K = 8,
as it gives the maximum accuracy of 97.70%.