In [1]:
import numpy as np
import pandas as pd
import warnings
import codecs
import os

import aut_vinc_bballCrawler as bc

## Get 2017 data

In [2]:
pergame = bc.getSoupFromURL('http://www.basketball-reference.com/leagues/NBA_2017_per_game.html')
pg = pergame.findAll('table')
header = []
for th in pg[0].findAll('th'):
    if not th.getText() in header:
        header.append(th.getText())
rows = pg[0].findAll('tr')[1:]  # all rows but the header
rows = [r for r in rows if len(r.findAll('td')) > 0]
parsed_table = [[col.getText() for col in row.findAll('td')] for row in rows]
ptable = pd.io.parsers.TextParser(parsed_table, names=header[1:30], index_col=0).get_chunk()
# invert the turnover metric
ptable.TOV = 1/ptable.TOV
ptable.TOV[np.isinf(ptable.TOV)] = ptable.TOV[~np.isinf(ptable.TOV)].mean()
ptable.replace(np.nan, 0, inplace=True)
feature_vec = ['FG%','FT%','3P','TRB','AST','STL','BLK','TOV','PS/G']

bt = pd.read_excel('autvinc_table.xlsx')
bt.tov = (1/bt.tov)*(10**3)
bt.describe()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


Unnamed: 0,wk,season,fg.,ft.,X3p,trb,ast,stl,blk,tov,pts
count,430.0,430.0,430.0,430.0,430.0,430.0,430.0,430.0,430.0,430.0,430.0
mean,8.883721,1.697674,0.456008,0.773432,45.0,241.060465,134.090698,42.786047,27.516279,12.740421,616.323256
std,5.454663,0.700812,0.023588,0.04746,13.700084,38.487864,24.213766,9.158322,8.908416,2.39144,87.580746
min,1.0,1.0,0.3812,0.6119,14.0,132.0,73.0,19.0,7.0,8.264463,363.0
25%,4.0,1.0,0.439925,0.7433,34.0,213.0,117.0,37.0,21.0,11.111111,561.25
50%,8.0,2.0,0.4558,0.7778,44.0,238.5,133.0,43.0,27.0,12.345679,615.0
75%,14.0,2.0,0.4743,0.808475,54.0,267.75,150.0,48.0,33.0,13.888889,668.75
max,19.0,3.0,0.5207,0.882,90.0,355.0,205.0,78.0,64.0,23.809524,913.0


In [3]:
from sklearn.cross_validation import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler

In [20]:
X = np.array(bt.iloc[:,3:])
ss = StandardScaler()
ssX = ss.fit_transform(X)
labels = []
for i in range(0, X.shape[0], 2):
    if len(np.where((X[i, :] - X[i+1,:]) > 0)[0]) > 5:
        labels.append('win')
        labels.append('loss')
    else:
        labels.append('loss')
        labels.append('win')

le = LabelEncoder()
y = le.fit_transform(np.array(labels))
X_train, X_test, y_train, y_test = train_test_split(ssX, y,
                                                    test_size=0.1)

In [21]:
from sklearn.ensemble import RandomForestClassifier
feat_labels = feature_vec
forest = RandomForestClassifier(n_estimators=50, n_jobs=-1, criterion='entropy')
forest.fit(X_train, y_train)

importances = forest.feature_importances_
indices = np.argsort(importances)[::-1]
for f in range(X_train.shape[1]):
    print("%3d) %-*s %f" % (f+1, 30, feat_labels[indices[f]], importances[indices[f]]))

  1) PS/G                           0.154064
  2) FG%                            0.122052
  3) TRB                            0.117921
  4) AST                            0.115945
  5) FT%                            0.107894
  6) BLK                            0.106282
  7) 3P                             0.097999
  8) STL                            0.088940
  9) TOV                            0.088904


In [6]:
from sklearn.metrics import accuracy_score

In [22]:
y_train_pred = forest.predict(X_train)
y_test_pred = forest.predict(X_test)
forest_train = accuracy_score(y_train, y_train_pred)
forest_test = accuracy_score(y_test, y_test_pred)
print('Train/test accuracies: %.3f/%.3f' % (forest_train, forest_test))

Train/test accuracies: 1.000/0.767


## SVC and tuning

In [17]:
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.grid_search import GridSearchCV
from sklearn.svm import SVC
from sklearn.cross_validation import cross_val_score

In [32]:
pipe_svc = Pipeline([('pca', PCA()),
                     ('clf', SVC(probability=True))])

param_range = [10**-4, 10**-3, 10**-2, 10**-1, 1.0, 10.0, 10**2]
param_grid = [{'clf__C': param_range,
               'clf__kernel': ['linear']},
              {'clf__C': param_range,
               'clf__gamma': param_range,
               'clf__kernel': ['rbf']}]
gs = GridSearchCV(estimator=pipe_svc,
                 param_grid=param_grid,
                 scoring='accuracy',
                 cv=5,
                 n_jobs=-1)
gsfit = gs.fit(X_train, y_train)
print(gsfit.best_params_)
print(gsfit.best_estimator_)
print('Best score: %.3f' % gsfit.best_score_)
scores = cross_val_score(gsfit.best_estimator_, X=X_test, y=y_test, cv=10, n_jobs=-1)
print('CV accuracy %.3f +/- %.3f' % (np.mean(scores), np.std(scores)))

{'clf__kernel': 'linear', 'clf__C': 0.01}
Pipeline(steps=[('pca', PCA(copy=True, n_components=None, whiten=False)), ('clf', SVC(C=0.01, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma='auto', kernel='linear',
  max_iter=-1, probability=True, random_state=None, shrinking=True,
  tol=0.001, verbose=False))])
Best score: 0.651
CV accuracy 0.843 +/- 0.178


## Get league rosters

In [24]:
import espn_fantasy_scrape
rosters = espn_fantasy_scrape.return_rosters()

mysquad = rosters["delonte\'s donuts"]
for nom in mysquad:
    if nom == 'Otto Porter Jr.':
        mysquad.remove(nom)
        mysquad.append('Otto Porter')
    if nom == 'Patty Mills':
        mysquad.remove(nom)
        mysquad.append('Patrick Mills')

ser_plyr = []
fr_plyr = []
my_plyr_inds = []
for player in mysquad:
    if type(ptable.loc[player, feature_vec]) is pd.Series:
        ser_plyr.append(player)
        my_plyr_inds.append(ptable.index.get_loc(player))
    if type(ptable.loc[player, feature_vec]) is pd.DataFrame:
        fr_plyr.append(player)
        my_plyr_inds.append(np.where(ptable.index.get_loc(player) == True)[0][0])
mysqd_table = ptable.loc[ser_plyr, feature_vec]
for fplayer in fr_plyr:
    mysqd_table = mysqd_table.append(ptable.loc[fplayer, feature_vec].iloc[0, ])
mysqd_table = mysqd_table.reset_index()

mysqd_vec = mysqd_table.loc[:, feature_vec].mean(0)
mysqd_table

username: ········
password: ········


Unnamed: 0,Player,FG%,FT%,3P,TRB,AST,STL,BLK,TOV,PS/G
0,Kemba Walker,0.466,0.784,2.6,3.6,5.0,1.5,0.3,0.454545,22.8
1,Josh Richardson,0.404,0.667,1.8,3.1,2.0,0.9,0.5,0.769231,10.9
2,Myles Turner,0.534,0.802,0.6,7.2,1.0,0.9,2.3,0.666667,15.0
3,Joel Embiid,0.458,0.778,1.3,7.6,1.7,0.6,2.5,0.27027,18.2
4,Chris Paul,0.462,0.895,2.2,5.2,9.6,2.5,0.2,0.454545,17.9
5,Rudy Gobert,0.671,0.676,0.0,11.6,0.9,0.5,2.8,0.625,11.8
6,Patrick Beverley,0.42,0.6,1.5,5.4,4.5,1.4,0.6,0.714286,7.9
7,Giannis Antetokounmpo,0.524,0.766,0.6,8.9,5.9,2.0,2.0,0.285714,22.3
8,Kevin Durant,0.535,0.865,1.9,8.2,4.6,1.3,1.7,0.454545,25.9
9,Langston Galloway,0.397,0.75,2.0,2.4,1.5,1.1,0.0,1.25,10.0


In [33]:
ss_ptable = ss.fit_transform(ptable.loc[:, feature_vec])
print('SVC prediction: %d' % gsfit.predict(ss_ptable[my_plyr_inds, :].mean(0)[np.newaxis, :]))
l, w = gsfit.predict_proba(ss_ptable[my_plyr_inds, :].mean(0)[np.newaxis, :])[0]
print('SVC predict win: %.4f, loss: %.4f' % (w, l))

SVC prediction: 1
SVC predict win: 0.7733, loss: 0.2267


In [26]:
def roster_indices(squad, ptable):
    """Give roster and get player indices.
    
    Example input arg:
        squad: rosters["sami\'s mannschaft"]
        ptable: player table from basketball-reference.com
    """
    plyr_inds = []
    for player in squad:
        if player == 'Otto Porter Jr.':
            player = 'Otto Porter'
        if player == 'Patty Mills':
            player = 'Patrick Mills'
        if player == 'Louis Williams':
            player = 'Lou Williams'
        if player == 'TJ Warren':
            player = 'T.J. Warren'
        if type(ptable.loc[player, feature_vec]) is pd.Series:
            plyr_inds.append(ptable.index.get_loc(player))
        if type(ptable.loc[player, feature_vec]) is pd.DataFrame:
            plyr_inds.append(np.where(ptable.index.get_loc(player) == True)[0][0])

    return plyr_inds

In [27]:
list(rosters.keys())

["sami's mannschaft",
 'conquest pain',
 'no look no pass',
 'what the blood clot',
 'soccer karate',
 'accidental twitter hog',
 "conor's turpentines",
 'ian banh mahinmi',
 'paul george, john and ...',
 "delonte's donuts"]

In [34]:
winpct = {}
for ros in rosters:
    indxs = roster_indices(rosters[ros], ptable)
    l, w = gsfit.predict_proba(ss_ptable[indxs, :].mean(0)[np.newaxis, :])[0]
    winpct[ros] = w
    print('%s SVC predict win: %.4f, loss: %.4f' % (ros, w, l))

sami's mannschaft SVC predict win: 0.7200, loss: 0.2800
conquest pain SVC predict win: 0.7050, loss: 0.2950
no look no pass SVC predict win: 0.7068, loss: 0.2932
what the blood clot SVC predict win: 0.7122, loss: 0.2878
soccer karate SVC predict win: 0.7099, loss: 0.2901
accidental twitter hog SVC predict win: 0.7046, loss: 0.2954
conor's turpentines SVC predict win: 0.7411, loss: 0.2589
ian banh mahinmi SVC predict win: 0.6973, loss: 0.3027
paul george, john and ... SVC predict win: 0.6575, loss: 0.3425
delonte's donuts SVC predict win: 0.7733, loss: 0.2267


## Predicted standings

In [35]:
import operator
srtd_winpct = sorted(winpct.items(), key=operator.itemgetter(1), reverse=True)
for tm in srtd_winpct:
    print('%s SVC predict win: %.4f' % (tm[0], tm[1]))

delonte's donuts SVC predict win: 0.7733
conor's turpentines SVC predict win: 0.7411
sami's mannschaft SVC predict win: 0.7200
what the blood clot SVC predict win: 0.7122
soccer karate SVC predict win: 0.7099
no look no pass SVC predict win: 0.7068
conquest pain SVC predict win: 0.7050
accidental twitter hog SVC predict win: 0.7046
ian banh mahinmi SVC predict win: 0.6973
paul george, john and ... SVC predict win: 0.6575


## logistic regression

In [40]:
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.cross_validation import cross_val_score


pipe_lr = Pipeline([('pca', PCA()),
                   ('clf', LogisticRegression())])
pipe_lr.fit(X_train, y_train)
print('Test accuracy: %.3f' % pipe_lr.score(X_test, y_test))
scores = cross_val_score(estimator=pipe_lr, X=ssX, y=y, cv=10, n_jobs=-1)
print('CV accuracy %.3f +/- %.3f' % (np.mean(scores), np.std(scores)))

Test accuracy: 0.767
CV accuracy 0.649 +/- 0.072
