In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [5]:
# Load the 2013-2014 NBA per-player season statistics.
# The path is handed straight to pandas: it opens and closes the file itself and
# decodes it as UTF-8 by default, whereas a bare open(..., 'r') decodes with the
# platform's locale encoding and can garble non-ASCII text on some systems.
nba = pd.read_csv("nba_2013.csv")

In [6]:
# Peek at the first five rows to sanity-check the parsed columns.
nba.head(n=5)

Unnamed: 0,player,pos,age,bref_team_id,g,gs,mp,fg,fga,fg.,...,drb,trb,ast,stl,blk,tov,pf,pts,season,season_end
0,Quincy Acy,SF,23,TOT,63,0,847,66,141,0.468,...,144,216,28,23,26,30,122,171,2013-2014,2013
1,Steven Adams,C,20,OKC,81,20,1197,93,185,0.503,...,190,332,43,40,57,71,203,265,2013-2014,2013
2,Jeff Adrien,PF,27,TOT,53,12,961,143,275,0.52,...,204,306,38,24,36,39,108,362,2013-2014,2013
3,Arron Afflalo,SG,28,ORL,73,73,2552,464,1011,0.459,...,230,262,248,35,3,146,136,1330,2013-2014,2013
4,Alexis Ajinca,C,25,NOP,56,30,951,136,249,0.546,...,183,277,40,23,46,63,187,328,2013-2014,2013


In [7]:
# Summarise each column's dtype and non-null count to spot columns with missing values.
nba.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 481 entries, 0 to 480
Data columns (total 31 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   player        481 non-null    object 
 1   pos           481 non-null    object 
 2   age           481 non-null    int64  
 3   bref_team_id  481 non-null    object 
 4   g             481 non-null    int64  
 5   gs            481 non-null    int64  
 6   mp            481 non-null    int64  
 7   fg            481 non-null    int64  
 8   fga           481 non-null    int64  
 9   fg.           479 non-null    float64
 10  x3p           481 non-null    int64  
 11  x3pa          481 non-null    int64  
 12  x3p.          414 non-null    float64
 13  x2p           481 non-null    int64  
 14  x2pa          481 non-null    int64  
 15  x2p.          478 non-null    float64
 16  efg.          479 non-null    float64
 17  ft            481 non-null    int64  
 18  fta           481 non-null    

In [8]:
# Count missing entries per column (isnull is the pandas alias of isna).
nba.isnull().sum()

player           0
pos              0
age              0
bref_team_id     0
g                0
gs               0
mp               0
fg               0
fga              0
fg.              2
x3p              0
x3pa             0
x3p.            67
x2p              0
x2pa             0
x2p.             3
efg.             2
ft               0
fta              0
ft.             20
orb              0
drb              0
trb              0
ast              0
stl              0
blk              0
tov              0
pf               0
pts              0
season           0
season_end       0
dtype: int64

In [9]:
# Impute the missing shooting-percentage values with each column's median.
#
# The filled frame is assigned back to `nba` instead of calling
# `nba[col].fillna(..., inplace=True)`: that form is chained assignment on a
# column selection, which emits a FutureWarning on pandas 2.x and leaves `nba`
# unchanged once Copy-on-Write is active (the pandas 3 default).
pct_cols = ['fg.', 'x3p.', 'x2p.', 'efg.', 'ft.']
# DataFrame.fillna with a Series fills each column with the value whose index
# label matches the column name, i.e. that column's own median.
nba[pct_cols] = nba[pct_cols].fillna(nba[pct_cols].median())

In [10]:
# Remove the identifier and season columns in a single call; they are dropped
# from `nba` itself so later cells see only the remaining stat columns and `pos`.
nba.drop(
    columns=['player', 'bref_team_id', 'season', 'season_end'],
    inplace=True,
)

In [11]:
# Frequency of each position label; note the output contains two
# single-player categories (G and F) alongside the five main positions.
nba.loc[:, 'pos'].value_counts()

SG    109
SF     99
PF     96
C      90
PG     85
G       1
F       1
Name: pos, dtype: int64

In [12]:
# One-hot encode the position label. drop_first=True omits the first category
# (no `pos_C` column appears in the output), so an all-zero row stands for it.
new_pos = pd.get_dummies(
    nba['pos'],
    drop_first=True,
    prefix='pos',
)
new_pos.head(n=5)

Unnamed: 0,pos_F,pos_G,pos_PF,pos_PG,pos_SF,pos_SG
0,0,0,0,0,1,0
1,0,0,0,0,0,0
2,0,0,1,0,0,0
3,0,0,0,0,0,1
4,0,0,0,0,0,0


In [13]:
# The raw position label is no longer needed once `new_pos` holds its dummies.
nba.drop(columns='pos', inplace=True)

In [14]:
# Append the position dummy columns to the right of the numeric stat columns.
df = pd.concat([nba, new_pos], axis='columns')

In [15]:
# Confirm the combined frame has the stat columns followed by the pos_* dummies.
df.head(n=5)

Unnamed: 0,age,g,gs,mp,fg,fga,fg.,x3p,x3pa,x3p.,...,blk,tov,pf,pts,pos_F,pos_G,pos_PF,pos_PG,pos_SF,pos_SG
0,23,63,0,847,66,141,0.468,4,15,0.266667,...,26,30,122,171,0,0,0,0,1,0
1,20,81,20,1197,93,185,0.503,0,0,0.330976,...,57,71,203,265,0,0,0,0,0,0
2,27,53,12,961,143,275,0.52,0,0,0.330976,...,36,39,108,362,0,0,1,0,0,0
3,28,73,73,2552,464,1011,0.459,128,300,0.426667,...,3,146,136,1330,0,0,0,0,0,1
4,25,56,30,951,136,249,0.546,0,1,0.0,...,46,63,187,328,0,0,0,0,0,0


In [16]:
# Target: points scored (`pts`); features: every remaining column.
# NOTE(review): columns such as fg, x2p, x3p and ft look like made-shot counts
# from which pts might be derived directly — confirm keeping them is intended.
y = df['pts']
X = df.drop(columns='pts')

In [17]:
from sklearn.preprocessing import Normalizer
norm = Normalizer()
X_norm = norm.fit_transform(X)

In [19]:
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test = train_test_split(X_norm,y,test_size=0.3,random_state=42)

In [21]:
from sklearn.neighbors import KNeighborsRegressor

# k-nearest-neighbours regressor with the library's default settings,
# trained on the training split; the fitted estimator is the cell's output.
knr_norm = KNeighborsRegressor()
knr_norm.fit(X=x_train, y=y_train)

KNeighborsRegressor()

In [22]:
# Coefficient of determination (R^2) of the fitted model on the held-out split.
knr_norm.score(X=x_test, y=y_test)

0.6820088360944019

In [23]:
# Predicted points for every row of the held-out split.
y_new_pred = knr_norm.predict(X=x_test)

In [25]:
from sklearn.metrics import mean_squared_error

# Compute the test-set mean squared error once and derive the RMSE from it;
# RMSE is in the same units as the target (points).
mse = mean_squared_error(y_true=y_test, y_pred=y_new_pred)
print(f'MSE:{mse}')
print(f'RMSE:{np.sqrt(mse)}')

MSE:63335.620689655174
RMSE:251.66569231751708
