# Exercise 10

## KNN exercise with NBA player data

## Introduction

- NBA player statistics from 2014-2015 (partial season): [data](https://github.com/justmarkham/DAT4-students/blob/master/kerry/Final/NBA_players_2015.csv), [data dictionary](https://github.com/justmarkham/DAT-project-examples/blob/master/pdf/nba_paper.pdf)
- **Goal:** Predict player position using assists, steals, blocks, turnovers, and personal fouls

## Read the data into Pandas

In [1]:
# read the data into a DataFrame
import pandas as pd
url = 'https://raw.githubusercontent.com/justmarkham/DAT4-students/master/kerry/Final/NBA_players_2015.csv'
nba = pd.read_csv(url, index_col=0)

In [2]:
# examine the columns
nba.columns

Index(['season_end', 'player', 'pos', 'age', 'bref_team_id', 'g', 'gs', 'mp',
       'fg', 'fga', 'fg_', 'x3p', 'x3pa', 'x3p_', 'x2p', 'x2pa', 'x2p_', 'ft',
       'fta', 'ft_', 'orb', 'drb', 'trb', 'ast', 'stl', 'blk', 'tov', 'pf',
       'pts', 'G', 'MP', 'PER', 'TS%', '3PAr', 'FTr', 'TRB%', 'AST%', 'STL%',
       'BLK%', 'TOV%', 'USG%', 'OWS', 'DWS', 'WS', 'WS/48', 'OBPM', 'DBPM',
       'BPM', 'VORP'],
      dtype='object')

In [3]:
# examine the positions
nba.pos.value_counts()

G    200
F    199
C     79
Name: pos, dtype: int64

TypeError: unhashable type: 'slice'

## Create X and y

Use the following features: assists, steals, blocks, turnovers, personal fouls

In [4]:
# map positions to numbers
nba['pos_num'] = nba.pos.map({'C':0, 'F':1, 'G':2})

In [5]:
# create feature matrix (X)
feature_cols = ['ast', 'stl', 'blk', 'tov', 'pf']
X = nba[feature_cols]

In [6]:
# alternative way to create X
X = nba.loc[:, 'ast':'pf']

In [7]:
# create response vector (y)
y = nba.pos_num

# Exercice 10.1

* Split the data in train and test
* Train a KNN model (K=5)
* Evaluate the accuracy

In [8]:
from sklearn.cross_validation import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=123)
print(y_train.shape, y_test.shape)
print(X_train.shape, X_test.shape)

(358,) (120,)
(358, 5) (120, 5)


In [9]:
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train, y_train)
y_pred = knn.predict(X_test)
y_pred_prob = knn.predict_proba(X_test)
y_pred_prob[:5]

array([[ 0.8,  0.2,  0. ],
       [ 0. ,  0. ,  1. ],
       [ 0.4,  0.6,  0. ],
       [ 0.2,  0.8,  0. ],
       [ 0.2,  0.6,  0.2]])

In [10]:
from sklearn.metrics import confusion_matrix
confusion_matrix(y_test, y_pred)

array([[12, 10,  3],
       [ 7, 36,  8],
       [ 0,  8, 36]])

In [11]:
#Accuracy
(y_pred == y_test).mean() 

0.69999999999999996

# Exercice 10.2 

Predict player position and calculate predicted probability of each position

Predict for a player with these statistics: 1 assist, 1 steal, 0 blocks, 1 turnover, 2 personal fouls

In [12]:
# create a list to represent a player
import numpy as np
player = np.array([1, 1, 0, 1, 2]).reshape(1, -1) 

In [13]:
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train, y_train)
probs = knn.predict_proba(player)
probs

array([[ 0. ,  0.4,  0.6]])

In [14]:
y_pred = knn.predict(player)
y_pred
#2: GUARDS

array([2])

# Exercice 10.3  

Repeat steps 10.1 and 10.2 using K=50

In [15]:
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier(n_neighbors=50)
knn.fit(X_train, y_train)
y_pred = knn.predict(X_test)
y_pred_prob = knn.predict_proba(X_test)
from sklearn.metrics import confusion_matrix
confusion_matrix(y_test, y_pred)

array([[ 4, 20,  1],
       [ 0, 43,  8],
       [ 0, 16, 28]])

In [16]:
(y_pred == y_test).mean() 

0.625

In [17]:
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier(n_neighbors=50)
knn.fit(X_train, y_train)
probs = knn.predict_proba(player)
probs

array([[ 0.06,  0.52,  0.42]])

In [18]:
y_pred = knn.predict(player)
y_pred
#1: FORWARD

array([1])

# Exercice 10.4 (3 points) 

Explore the features to decide which ones are predictive

In [46]:
from sklearn.feature_selection import SelectKBest
import numpy as np
from sklearn.cross_validation import cross_val_score
from sklearn.linear_model import LogisticRegression
logreg = LogisticRegression(C=1e9)

sel = []
for i in range(1,4):
    sel = SelectKBest(k=i)
    sel.fit(X, y)
    sel.get_support()
    X_sel = sel.transform(X)
    print(i,pd.Series(cross_val_score(logreg, X_sel, y, cv=10, scoring='accuracy')).mean())  



1 0.6088768115942029
2 0.6880434782608694
3 0.6819746376811595


In [38]:
feature =['season_end', 'age', 'g', 'gs', 'mp',
       'fg', 'fga', 'fg_', 'x3p', 'x3pa', 'x3p_', 'x2p', 'x2pa', 'x2p_', 'ft',
       'fta', 'ft_', 'orb', 'drb', 'trb', 'ast', 'stl', 'blk', 'tov', 'pf',
       'pts', 'G', 'MP', 'PER', 'TS%', '3PAr', 'FTr', 'TRB%', 'AST%', 'STL%',
       'BLK%', 'TOV%', 'USG%', 'OWS', 'DWS', 'WS', 'WS/48', 'OBPM', 'DBPM',
       'BPM', 'VORP']
X = nba[list(feature)]
X.shape

(478, 46)

In [47]:
from sklearn.linear_model import LogisticRegression
from sklearn.cross_validation import cross_val_score
logreg = LogisticRegression(C=1e9)
from sklearn.feature_selection import RFE
sel = RFE(estimator=logreg, n_features_to_select=10)
sel.fit(X, y)

RFE(estimator=LogisticRegression(C=1000000000.0, class_weight=None, dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=100,
          multi_class='ovr', n_jobs=1, penalty='l2', random_state=None,
          solver='liblinear', tol=0.0001, verbose=0, warm_start=False),
  estimator_params=None, n_features_to_select=10, step=1, verbose=0)

In [48]:
sel.get_support()

array([False, False, False, False, False, False,  True,  True,  True,
        True, False, False,  True,  True, False, False,  True, False,
       False, False, False, False, False, False, False, False, False,
       False, False,  True,  True, False, False, False, False, False,
       False, False, False, False, False,  True, False, False, False, False], dtype=bool)

In [44]:
print(np.array(X)[sel.get_support()])

[[  2.01500000e+03   2.30000000e+01   2.50000000e+01   1.00000000e+00
    1.07000000e+01   6.00000000e-01   1.30000000e+00   4.85000000e-01
    0.00000000e+00   0.00000000e+00   0.00000000e+00   6.00000000e-01
    1.30000000e+00   5.00000000e-01   2.00000000e-01   4.00000000e-01
    5.00000000e-01   1.70000000e+00   2.20000000e+00   3.90000000e+00
    6.00000000e-01   4.00000000e-01   4.00000000e-01   4.00000000e-01
    1.80000000e+00   1.50000000e+00   2.50000000e+01   2.67000000e+02
    1.29000000e+01   4.95000000e-01   3.00000000e-02   3.03000000e-01
    2.00000000e+01   9.60000000e+00   2.10000000e+00   3.10000000e+00
    2.11000000e+01   7.80000000e+00   3.00000000e-01   4.00000000e-01
    7.00000000e-01   1.27000000e-01  -2.90000000e+00   3.10000000e+00
    2.00000000e-01   2.00000000e-01]
 [  2.01500000e+03   2.60000000e+01   4.50000000e+01   1.20000000e+01
    1.59000000e+01   2.30000000e+00   4.80000000e+00   4.77000000e-01
    0.00000000e+00   0.00000000e+00   0.00000000e+00 

  if __name__ == '__main__':
