## NBA Analysis KDnuggets

In [42]:
import pandas as pd
# Path of the source CSV. NOTE(review): this is an absolute path on a local
# drive, so the notebook only runs on a machine that has the file at this location.
NBA_CSV_PATH = "F:/nba_2013.csv"
nba = pd.read_csv(NBA_CSV_PATH)

In [43]:
# Dimensions of the loaded frame as (rows, columns): 481 records, 31 variables/columns.
nba.shape

(481, 31)

In [44]:
# Preview the first row to see the column names and the kind of values they hold.
nba.head(1)

Unnamed: 0,player,pos,age,bref_team_id,g,gs,mp,fg,fga,fg.,...,drb,trb,ast,stl,blk,tov,pf,pts,season,season_end
0,Quincy Acy,SF,23,TOT,63,0,847,66,141,0.468,...,144,216,28,23,26,30,122,171,2013-2014,2013


In [45]:
# Column-wise mean and median, each wrapped in a one-column DataFrame.
# `nba` also holds text columns (player, pos, ...); numeric_only=True restricts the
# reduction to numeric columns explicitly. Without it, pandas >= 2.0 raises a
# TypeError instead of silently skipping the text columns.
Means = pd.DataFrame(nba.mean(numeric_only=True))
Medians = pd.DataFrame(nba.median(numeric_only=True))

In [46]:
# Put the two one-column summaries side by side (aligned on the column-name
# index) and give the resulting columns readable labels.
frames = [Means, Medians]
result = pd.concat(frames, axis="columns")
result.columns = ["Means", "Medians"]

In [47]:
# Show the first three rows of the mean/median summary table.
result.head(3)

Unnamed: 0,Means,Medians
age,26.509356,26
g,53.253638,61
gs,25.571726,10


## NBA Analysis Plots

In [48]:
import seaborn as sns
import matplotlib.pyplot as plt
# Pairwise scatter plots (with per-variable distributions on the diagonal)
# for the `ast`, `fg` and `trb` columns.
sns.pairplot(nba.loc[:, ["ast", "fg", "trb"]])
plt.show()

## NBA Clusters Kmeans

In [49]:
from sklearn.cluster import KMeans
# Keep only numeric columns, then drop every column that contains a missing
# value, because KMeans cannot fit on text or NaN. select_dtypes is the public
# pandas API for this (the original used the private _get_numeric_data()).
good_columns = nba.select_dtypes(include="number").dropna(axis=1)

# Five clusters; the fixed random_state makes the cluster assignment repeatable.
kmeans_model = KMeans(n_clusters=5, random_state=1)
kmeans_model.fit(good_columns)

# One integer cluster label per row of good_columns.
labels = kmeans_model.labels_

In [50]:
from sklearn.decomposition import PCA
# Project the clustering features onto their first two principal components
# so the rows can be drawn in 2-D, coloured by their k-means label.
pca_2 = PCA(n_components=2)
plot_columns = pca_2.fit_transform(good_columns)
plt.scatter(plot_columns[:, 0], plot_columns[:, 1], c=labels)
plt.show()

In [55]:
import numpy as np
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
def Kmeans(df,Target,Clusters,random_state=None):
    """Fit k-means for k = 1..Clusters and plot the inertia ("elbow") curve.

    Every column of ``df`` except ``Target`` is used as a feature. For each k
    a fresh ``KMeans`` model is fitted on those features and its ``inertia_``
    is printed and recorded.

    Parameters
    ----------
    df : pandas.DataFrame
        Input data. All non-target columns are passed to KMeans, so they must
        be numeric and free of NaN.
    Target : str
        Name of the column to leave out of the feature matrix.
    Clusters : int
        Largest number of clusters to try (inclusive); must be >= 1.
    random_state : int or None, optional
        Forwarded to every ``KMeans`` fit. The default ``None`` keeps the
        original unseeded behaviour; pass an int for repeatable inertias.

    Returns
    -------
    (data, k_means)
        ``data`` is a DataFrame with one row per k and the columns
        ``'Clusters'`` (``'cluster:<k>'``) and ``'Sum of Squares'`` (inertia).
        ``k_means`` is the last fitted model, i.e. the one with
        ``n_clusters == Clusters``.

    Raises
    ------
    ValueError
        If ``Clusters`` is smaller than 1 (no model would be fitted).
    """
    if Clusters < 1:
        raise ValueError('Clusters must be at least 1, got ' + str(Clusters))

    # Work on the raw values; features are addressed by column position.
    Matrix = df.values
    TargetIndex = df.columns.get_loc(Target)

    # Positions of every column except the target.
    FeatureIndexes = [i for i in range(Matrix.shape[1]) if i != TargetIndex]
    print('features: ' + str(FeatureIndexes))

    MatrixFeatures = Matrix[:, FeatureIndexes]

    Inertias = []
    ClusterLabels = []
    for k in range(1, Clusters + 1):
        k_means = KMeans(n_clusters=k, random_state=random_state)
        k_means.fit(MatrixFeatures)
        print(k_means.inertia_)
        Inertias.append(k_means.inertia_)
        ClusterLabels.append('cluster:' + str(k))

    print('Note: distance based models expect data to be standardized otherwise large values will dominate distance based calculations')

    print('To return actual clusters execute: Clusters = k_means.predict(MatrixFeatures)')

    # Build the summary table directly. The original merged two frames on a
    # reset index and dropped it with a positional axis argument, which
    # pandas >= 2.0 rejects.
    data = pd.DataFrame(
        {'Clusters': ClusterLabels, 'Sum of Squares': Inertias},
        columns=['Clusters', 'Sum of Squares'],
    )

    ax = data.plot('Clusters', 'Sum of Squares', legend=None)
    # Label the single curve explicitly. The original passed data[[1]], which
    # raises KeyError on current pandas because no column is labelled 1.
    ax.legend(['Sum of Squares'], loc='best')
    plt.xticks(rotation=55)
    return(data,k_means)

In [72]:
# Rebuild the clustering features from the raw frame: numeric columns only,
# minus any column that contains a missing value. select_dtypes is the public
# pandas API for this (the original used the private _get_numeric_data()).
good_columns = nba.select_dtypes(include="number").dropna(axis=1)
good_columns.head(1)

Unnamed: 0,age,g,gs,mp,fg,fga,x3p,x3pa,x2p,x2pa,...,orb,drb,trb,ast,stl,blk,tov,pf,pts,season_end
0,23,63,0,847,66,141,4,15,62,126,...,72,144,216,28,23,26,30,122,171,2013


## Normalize the data (min-max scaling to the range [0, 1])

In [73]:
# Min-max scale every column to the range [0, 1].
# A constant column (e.g. season_end) has max == min, so the plain formula would
# compute 0/0 and turn the whole column into NaN. `or 1` swaps a zero range
# for 1, which leaves such a column at 0 instead.
good_columns = good_columns.apply(
    lambda x: (x - x.min()) / ((x.max() - x.min()) or 1)
)
good_columns.head(1)

Unnamed: 0,age,g,gs,mp,fg,fga,x3p,x3pa,x2p,x2pa,...,orb,drb,trb,ast,stl,blk,tov,pf,pts,season_end
0,0.2,0.756098,0,0.271067,0.077739,0.083531,0.015326,0.02439,0.087819,0.089489,...,0.163636,0.183908,0.193896,0.038835,0.120419,0.118721,0.101695,0.446886,0.065947,


In [74]:
# Try 1 through 9 clusters on every column except `season_end`; `data` holds the
# per-k inertia table and `km` the model fitted last.
data, km = Kmeans(good_columns, Target='season_end', Clusters=9)

features: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20]
470.668029593
223.706610802
166.925742309
142.225814369
125.24477853
114.913843727
106.990157112
101.206381661
96.663632579
Note: distance based models expect data to be standardized otherwise large values will dominate distance based calculations
To return actual clusters execute: Clusters = k_means.predict(MatrixFeatures)


## Split into test and train

In [63]:
# Reproducible 80/20 split: draw 80% of the rows at random for training,
# then keep every row whose index label was not drawn as the test set.
train = nba.sample(frac=0.8, random_state=1)
test = nba.drop(index=train.index)

## Regression

In [64]:
from sklearn.linear_model import LinearRegression
# Single-feature linear regression: fit `ast` on `fg` using the training rows,
# then predict `ast` for the held-out rows. (fit() returns the estimator itself.)
lr = LinearRegression().fit(train[["fg"]], train["ast"])
predictions = lr.predict(test[["fg"]])

In [65]:
import statsmodels.formula.api as sm
# Ordinary least squares of `ast` on `fga` through the statsmodels formula API;
# summary() renders the coefficient table and fit diagnostics.
# NOTE(review): the scikit-learn regression cell uses `fg` as the predictor while
# this formula uses `fga` - confirm which one is intended.
model = sm.ols('ast ~ fga', train)
fitted = model.fit()
fitted.summary()

0,1,2,3
Dep. Variable:,ast,R-squared:,0.562
Model:,OLS,Adj. R-squared:,0.561
Method:,Least Squares,F-statistic:,492.1
Date:,"Mon, 26 Oct 2015",Prob (F-statistic):,1.03e-70
Time:,11:29:23,Log-Likelihood:,-2273.3
No. Observations:,385,AIC:,4551.0
Df Residuals:,383,BIC:,4559.0
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5
,coef,std err,t,P>|t|,[95.0% Conf. Int.]
Intercept,-2.3302,6.855,-0.340,0.734,-15.808 11.147
fga,0.2712,0.012,22.183,0.000,0.247 0.295

0,1,2,3
Omnibus:,162.497,Durbin-Watson:,1.99
Prob(Omnibus):,0.0,Jarque-Bera (JB):,849.356
Skew:,1.744,Prob(JB):,3.67e-185
Kurtosis:,9.386,Cond. No.,847.0


## Random Forest

In [67]:
from sklearn.ensemble import RandomForestRegressor
# Columns used as predictors for `ast`.
predictor_columns = ["age", "mp", "fg", "trb", "stl", "blk"]

# 100 trees, each leaf holding at least 3 training rows. The forest is seeded
# so that the predictions, and the error computed from them, are the same on
# every run (the original was unseeded, so its reported score varied).
rf = RandomForestRegressor(n_estimators=100, min_samples_leaf=3, random_state=1)
rf.fit(train[predictor_columns], train["ast"])
predictions = rf.predict(test[predictor_columns])

In [68]:
from sklearn.metrics import mean_squared_error
# Mean squared error of the latest `predictions` against the true `ast` values
# of the held-out rows.
mean_squared_error(y_true=test["ast"], y_pred=predictions)

4928.4015313151476