Create a dataset that can be visualized and therefore better understood using the preprocessing techniques I've learned 

In [1]:
import pandas as pd
import numpy as np

pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)


In [2]:
FILENAME = 'NBADATA.csv'

In [3]:
#retain relevant columns. 
data = pd.read_csv(FILENAME) 
data['3P%'] = np.divide(data['3P'].values,data['3PA'].values) 
del data['3P'],data['3PA']
data['FG%'] = np.divide(data['FG'].values,data['FGA'].values)
del data['FG'],data['FGA']
data['FT%'] = np.divide(data['FT'].values,data['FTA'].values)
del data['Unnamed: 0'],data['PLUS_MINUS'],data['TOTAL']
del data['FT'],data['FTA']
del data['OU']
#del data['Team']
#data = pd.get_dummies(data)

In [4]:
#to get rolling stats, need to consider one team at a time. 
teams = data.Team.unique()

In [15]:
#iterate over those teams, make a rolling window over n games. 
N_GAMES = 1
nba_data = pd.DataFrame([])
for team in teams:
    team_data = data.loc[data['Team'] == team]  #this contains the box score of every team game from 2013 to 2018. 
    stuff_to_turn_into_avgs = ['OR', 'DR', 'TOT', 'PF', 'ST', 'TO', 'BL', '3P%', 'FG%', 'FT%']
    #if seasons are the same, do this here, use the GAME_ID signifier. 
    for col in team_data.columns:
        if col in stuff_to_turn_into_avgs:
            #split each season up here, 
            team_data['Rolling ' + col] = team_data[col].rolling(window=N_GAMES).mean().shift(1)
            del team_data[col]
                
    nba_data =  nba_data.append(team_data)
           # df = pd.concat([road_df,home_df],axis=1)
#reorganize the dataset. 
nba_data_splits = nba_data.sort_values(by = ['GAME_ID', 'Home'], ascending=[True, True])
nba_data_splits.dropna(inplace=True)  #null values come with rolling means, drop those now. 

#delete columns no longer of use, ie team name etc. Can consider keeping team name and see if helps chances. 
del nba_data_splits['GAME_ID'],nba_data_splits['Home'],nba_data_splits['Away'],nba_data_splits['Date']
del nba_data_splits['Team']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [16]:
#nba_data_splits = pd.get_dummies(nba_data_splits)

In [17]:
#now align the box scores so its one big one for each game, home team and road teams. 

road_df = nba_data_splits.iloc[::2]
home_df = nba_data_splits.iloc[1::2]
for col in nba_data_splits.columns:
    road_df['road_' + col] = road_df[col]
    home_df['home_' + col] = home_df[col]
    
    del road_df[col],home_df[col]

home_df.reset_index(inplace=True)
road_df.reset_index(inplace=True)

#merged into a dataframe here. 
df = pd.concat([road_df,home_df],axis=1)
del df['index']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [18]:
#create the dataset here. Can consider the spread, or winner. 
#at the moment only using a single classifier, that seems sufficient. A home team loss is synonymous with a road team win. 
spread = True
winner = False

df['final_SPREAD'] = df['road_PTS'] - df['home_PTS']
del df['road_PTS'], df['home_PTS'],df['home_SPREAD']
           # if openspread + endspread <0:
            #    y.append(np.array([0,1,0]))  #home team covered
            #elif openspread + endspread >0:
            #    y.append(np.array([1,0,0]))  #road covered
           # else: 
           #     y.append(np.array([0,0,1]))  #push!
y = []

if spread: 
    for i in range(len(df)):
        if df['road_SPREAD'].values[i] + df['final_SPREAD'].values[i] < 0:
            y.append(1) #home team covers
        else: # df['road_SPREAD'].values[i] + df['final_SPREAD'].values[i] > 0:
            y.append(0) #road team covers or push
    #else:
    #    y.append(np.array([0,1]))  #push! 
    
if winner:
    for i in range(len(df)):
        if df['final_SPREAD'].values[i] < 0: #home team won. 
            y.append(1)
        else:
            y.append(0)

del df['final_SPREAD']

y_names = np.array(['road team win', 'home team win']) #for preprocessing/visualization. 

In [19]:
#scale and split the data here. 
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split

X_train,X_test, y_train,y_test = train_test_split(df.values,y,random_state = 42)

In [None]:
df

In [20]:
#this is done in a relatively un-pythonic way. Can also all be in one line! 
scaler = MinMaxScaler()

scaler.fit(X_train)

X_train = scaler.transform(X_train)

In [21]:
from sklearn.feature_selection import SelectPercentile #univariate statistics. 

select = SelectPercentile(percentile = 50)
select.fit(X_train,y_train)



X_train_selected = select.transform(X_train)
X_test_selected = select.transform(X_test)

In [22]:
#use some ML to see how well behaved the data is. 

from sklearn.neural_network import MLPClassifier

clf = MLPClassifier()
clf.fit(X_train,y_train)

print("Traditional Model: ")
print("training score = ",clf.score(X_train,y_train))

print("testing score ", clf.score(X_test,y_test))


clf.fit(X_train_selected,y_train)
print("Univariate Statistics Model: ")
print("training score = ",clf.score(X_train_selected,y_train))

print("testing score ", clf.score(X_test_selected,y_test))

from sklearn.feature_selection import SelectFromModel
from sklearn.ensemble import RandomForestClassifier
select = SelectFromModel(RandomForestClassifier(n_estimators = 100,random_state = 42),threshold = 'median')
select.fit(X_train,y_train)
X_train_selected = select.transform(X_train)
X_test_selected = select.transform(X_test)

clf.fit(X_train_selected,y_train)
print("Model Based Feature Extraction: ")
print("training score = ",clf.score(X_train_selected,y_train))

print("testing score ", clf.score(X_test_selected,y_test))



Traditional Model: 
training score =  0.523595505618
testing score  0.522671568627
Univariate Statistics Model: 
training score =  0.5291113381
testing score  0.477328431373
Model Based Feature Extraction: 
training score =  0.521348314607
testing score  0.487745098039


In [23]:
from keras.models import Sequential
from keras.layers import Dense

In [26]:
X_train.shape

(4895, 21)

Now attempting Keras version, slightly more freedom when building the NN

In [31]:
model=Sequential()
model.add(Dense(20,input_dim=np.shape(X_train)[1],activation='sigmoid'))
model.add(Dense(4,activation='relu'))
model.add(Dense(5,activation='relu'))
model.add(Dense(1,activation='sigmoid'))

model.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])
model.fit(X_train,y_train,batch_size=40,epochs=20,validation_split=.2)
scores = model.evaluate(X_test,y_test)
print(scores[1])


Train on 3916 samples, validate on 979 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
0.513480392157


In [None]:
#can visualize the principal components of the dataset here. Sometimes it tells you something, not so much here. 
from sklearn.decomposition import PCA
principal_features = 2
pca = PCA(n_components = principal_features)

pca.fit(X_train)
X_pca = pca.transform(X_train)  #turns it into a two feature dataset. 
 

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline
import mglearn

plt.figure(figsize=(8,8))
mglearn.discrete_scatter(X_pca[:,0],X_pca[:,1],y_train)
plt.legend(y_names,loc='best')
plt.gca().set_aspect('equal')
plt.title('Principal Component Analysis')  #create a plot visualizing this, notice a trend or separation? Good!

There is a little bit of separation, meaning there is something to exploit! By examining the features below, we can see which play the biggest role, and could be responsible for this...

In [None]:
#see how each feature plays into the principal components. 
plt.figure()
plt.matshow(pca.components_,cmap='viridis')
plt.yticks([0,1],['First PC','Second PC'])
plt.colorbar()
plt.xticks(range(len(np.array(df.columns))),np.array(df.columns),rotation=60,ha='left')
plt.xlabel('Feature')
plt.ylabel('Principal Components')

Based on this it appears the first PC is mainly rooted in the field goal %, given the home fg% is the most important, followed by spread it seems. 

In [None]:
X_train.shape

In [None]:
cancer = load_breast_cancer()

In [None]:
cancer.feature_names

In [None]:
y_names = np.array(['road team', 'home team'])

In [None]:
y_names