In [114]:
import pandas as pd

df = pd.read_csv("cbb.csv")

In [115]:
df.columns

Index(['TEAM', 'CONF', 'G', 'W', 'ADJOE', 'ADJDE', 'BARTHAG', 'EFG_O', 'EFG_D',
       'TOR', 'TORD', 'ORB', 'DRB', 'FTR', 'FTRD', '2P_O', '2P_D', '3P_O',
       '3P_D', 'ADJ_T', 'WAB', 'POSTSEASON', 'SEED', 'YEAR'],
      dtype='object')

In [116]:
df.head()

Unnamed: 0,TEAM,CONF,G,W,ADJOE,ADJDE,BARTHAG,EFG_O,EFG_D,TOR,...,FTRD,2P_O,2P_D,3P_O,3P_D,ADJ_T,WAB,POSTSEASON,SEED,YEAR
0,North Carolina,ACC,40,33,123.3,94.9,0.9531,52.6,48.1,15.4,...,30.4,53.9,44.6,32.7,36.2,71.7,8.6,2ND,1.0,2016
1,Wisconsin,B10,40,36,129.1,93.6,0.9758,54.8,47.7,12.4,...,22.4,54.8,44.7,36.5,37.5,59.3,11.3,2ND,1.0,2015
2,Michigan,B10,40,33,114.4,90.4,0.9375,53.9,47.7,14.0,...,30.0,54.7,46.8,35.2,33.2,65.9,6.9,2ND,3.0,2018
3,Texas Tech,B12,38,31,115.2,85.2,0.9696,53.5,43.0,17.7,...,36.6,52.8,41.9,36.5,29.7,67.5,7.0,2ND,3.0,2019
4,Gonzaga,WCC,39,37,117.8,86.3,0.9728,56.6,41.1,16.2,...,26.9,56.3,40.0,38.2,29.0,71.5,7.7,2ND,1.0,2017


### Goal: Predict post season of team based on stats

In [117]:
df.POSTSEASON.value_counts()

R64          224
R32          112
S16           56
R68           28
E8            28
F4            14
2ND            7
Champions      7
Name: POSTSEASON, dtype: int64

#### Step 1: Figure out x and y samples
<ul>
    <li>df.POSTSEASON will be predicted so that is y</li>
    <li>Combinations of different of attributes is x </li>
</ul>



In [118]:
def getXY(df, xColumns, yColumns = ["POSTSEASON"]):
    df = df.copy()
    xData = df.loc[:,xColumns]
    yData = df.loc[:,yColumns]
    return (xData,yData)

xData, yData = getXY(df, ["CONF", "ADJOE", "ADJDE"])
print("x data:\n", xData)
print("y data:\n", yData)

x data:
       CONF  ADJOE  ADJDE
0      ACC  123.3   94.9
1      B10  129.1   93.6
2      B10  114.4   90.4
3      B12  115.2   85.2
4      WCC  117.8   86.3
...    ...    ...    ...
2450   B10  111.4   87.8
2451   P12  114.4   92.2
2452   P12  104.8   88.6
2453   A10  112.0   96.2
2454  ASun  103.4   96.3

[2455 rows x 3 columns]
y data:
      POSTSEASON
0           2ND
1           2ND
2           2ND
3           2ND
4           2ND
...         ...
2450        S16
2451        S16
2452        S16
2453        S16
2454        S16

[2455 rows x 1 columns]


#### Step 2: Encoding data

* df.CONF in xData and df.POSTSEASON in yData are categorical so they must be one hot encoded
* Every distinct category in a column gets its own index in a vector of 0s

In [119]:
import numpy as np

def oneHotEncode(column):
    nCategories = len(list(set(column.to_list())))
    encoded = []
    mapping = {}
    currentInd = 0
    for value in column:
        if(value not in mapping.keys()):
            mapping[value] = currentInd
            currentInd += 1
        encoded.append([0 if i != mapping[value] else 1 for i in range(nCategories)])
    return np.array(encoded), mapping


CONFohe, CONFmapping = oneHotEncode(xData["CONF"])

print("Before one hot encoding:\n",xData["CONF"][:5])
print("After one hot encoding:\n",CONFohe)


Before one hot encoding:
 0    ACC
1    B10
2    B10
3    B12
4    WCC
Name: CONF, dtype: object
After one hot encoding:
 [[1 0 0 ... 0 0 0]
 [0 1 0 ... 0 0 0]
 [0 1 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]


Next the one hot encoded vectors must be combined with non-categorical columns (ADJOE and ADJDE)

In [120]:
xEncoded = np.concatenate([CONFohe, xData.iloc[:,[1,2]]], axis = 1)
print(xEncoded)

[[  1.    0.    0.  ...   0.  123.3  94.9]
 [  0.    1.    0.  ...   0.  129.1  93.6]
 [  0.    1.    0.  ...   0.  114.4  90.4]
 ...
 [  0.    0.    0.  ...   0.  104.8  88.6]
 [  0.    0.    0.  ...   0.  112.   96.2]
 [  0.    0.    0.  ...   0.  103.4  96.3]]


In [121]:
yEncoded, POSTSEASONmapping = oneHotEncode(yData["POSTSEASON"])
print(yEncoded)

[[1 0 0 ... 0 0 0]
 [1 0 0 ... 0 0 0]
 [1 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 1]
 [0 0 0 ... 0 0 1]
 [0 0 0 ... 0 0 1]]


#### Step 3: Train test split

* Train data used to train
* Test data used to evaluate performance on unseen data
* Test data usually ~20% of data
* Train_test_split in sklearn library

In [122]:
from sklearn.model_selection import train_test_split
xTrain, xTest, yTrain, yTest = train_test_split(xEncoded,yEncoded, test_size = 0.2)

print("Dimensions")
print("----------")
print("Training data: ", xTrain.shape, yTrain.shape)
print("Testing data: ",xTest.shape, yTest.shape)

Dimensions
----------
Training data:  (1964, 37) (1964, 9)
Testing data:  (491, 37) (491, 9)


#### Step 4: Model creation
* Vanilla neural network 
* Start simple

In [125]:
from keras.models import Sequential
from keras.layers import Dense

inputDim = 37
outputDim = 9

model = Sequential()
model.add(Dense(16, activation = 'relu', input_shape = (37,)))
model.add(Dense(outputDim, activation = 'softmax'))
model.compile(loss = "categorical_crossentropy", optimizer = "adam", metrics = ["accuracy"])
model.summary()

Model: "sequential_19"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_36 (Dense)             (None, 16)                608       
_________________________________________________________________
dense_37 (Dense)             (None, 9)                 153       
Total params: 761
Trainable params: 761
Non-trainable params: 0
_________________________________________________________________


In [126]:
model.fit(xTrain,yTrain, validation_data = (xTest,yTest), epochs = 100)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100


Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78/100
Epoch 79/100
Epoch 80/100
Epoch 81/100
Epoch 82/100
Epoch 83/100
Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100


<tensorflow.python.keras.callbacks.History at 0x25059af60a0>

~82% of time this model makes the correct prediction. Not bad.

In [102]:
predictions = model.predict(xTest)
print(predictions)

[[2.1644834e-02 1.9341450e-02 5.5187024e-02 ... 2.0466171e-01
  2.6706148e-02 9.5386982e-02]
 [4.9517467e-03 1.4550710e-02 2.9242275e-02 ... 1.6684827e-01
  2.2280121e-02 6.9428876e-02]
 [5.1646894e-06 1.2868379e-04 4.7356763e-04 ... 1.8912174e-02
  4.3660123e-03 3.9262528e-04]
 ...
 [1.5193698e-02 1.5270739e-02 4.5832142e-02 ... 2.0135576e-01
  2.9491760e-02 6.6213071e-02]
 [1.0005021e-02 1.3221782e-02 5.1140778e-02 ... 2.8214189e-01
  1.7475381e-02 1.5188107e-01]
 [3.4254765e-05 6.3782372e-04 1.4105245e-03 ... 2.9975075e-02
  8.0905156e-03 1.5485827e-03]]


In [103]:
predictedIndices = [np.argmax(sample) for sample in predictions]
print(predictedIndices)

[5, 4, 4, 4, 5, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 5, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 5, 4, 5, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 5, 4, 5, 4, 4, 4, 5, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 5, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 5, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 5, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 5, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 5, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 5, 4, 4, 5, 4, 5, 5, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 5, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 6, 4, 4, 4, 4, 4, 4, 4, 4, 5, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 6, 4, 4, 4, 4, 4, 4, 4, 5, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 

In [104]:
inversePOSTSEASONmapping = dict([(item[1], item[0]) for item in POSTSEASONmapping.items()])
inversePOSTSEASONmapping

{0: '2ND',
 1: 'Champions',
 2: 'E8',
 3: 'F4',
 4: nan,
 5: 'R32',
 6: 'R64',
 7: 'R68',
 8: 'S16'}

In [105]:
print([(inversePOSTSEASONmapping[predictedIndices[i]], inversePOSTSEASONmapping[np.argmax(yTest[i])]) for i in range(len(predictedIndices))])

[('R32', 'R32'), (nan, 'S16'), (nan, nan), (nan, nan), ('R32', 'R32'), (nan, nan), (nan, nan), (nan, nan), (nan, nan), (nan, nan), (nan, nan), (nan, nan), (nan, nan), (nan, nan), (nan, nan), (nan, nan), (nan, 'R64'), (nan, nan), (nan, nan), ('R32', 'S16'), (nan, nan), (nan, nan), (nan, 'R68'), (nan, nan), (nan, nan), (nan, nan), (nan, nan), (nan, nan), (nan, 'R32'), (nan, 'R64'), (nan, 'R32'), (nan, nan), (nan, nan), ('R32', 'R32'), (nan, nan), ('R32', 'S16'), (nan, 'R32'), (nan, 'R64'), (nan, nan), (nan, nan), (nan, nan), (nan, nan), (nan, 'R32'), (nan, 'R64'), (nan, 'R64'), (nan, nan), ('R32', 'E8'), (nan, nan), ('R32', 'F4'), (nan, nan), (nan, nan), (nan, nan), ('R32', 'R64'), (nan, nan), (nan, nan), (nan, nan), (nan, nan), (nan, nan), (nan, nan), (nan, 'R64'), (nan, nan), (nan, nan), (nan, nan), (nan, nan), (nan, nan), (nan, nan), (nan, nan), (nan, nan), (nan, nan), (nan, nan), (nan, nan), ('R32', '2ND'), (nan, nan), (nan, nan), (nan, nan), (nan, nan), (nan, nan), (nan, nan), (nan,

Often the more information that you allow the network to access, the better it performs. But if the information is redundent there won't be improvement.

In [106]:
newData = df.loc[:, ["2P_O", "2P_D", "3P_O", "3P_D"]]
newData

Unnamed: 0,2P_O,2P_D,3P_O,3P_D
0,53.9,44.6,32.7,36.2
1,54.8,44.7,36.5,37.5
2,54.7,46.8,35.2,33.2
3,52.8,41.9,36.5,29.7
4,56.3,40.0,38.2,29.0
...,...,...,...,...
2450,50.4,44.3,34.1,30.1
2451,50.6,43.4,37.1,35.8
2452,49.1,44.9,33.3,33.4
2453,49.3,50.6,37.7,30.2


In [107]:
xEncodedWithNew = np.concatenate([xEncoded, newData], axis = 1)
print(xEncodedWithNew.shape)
xEncodedWithNew

(2455, 41)


array([[ 1. ,  0. ,  0. , ..., 44.6, 32.7, 36.2],
       [ 0. ,  1. ,  0. , ..., 44.7, 36.5, 37.5],
       [ 0. ,  1. ,  0. , ..., 46.8, 35.2, 33.2],
       ...,
       [ 0. ,  0. ,  0. , ..., 44.9, 33.3, 33.4],
       [ 0. ,  0. ,  0. , ..., 50.6, 37.7, 30.2],
       [ 0. ,  0. ,  0. , ..., 46.9, 33.4, 31.3]])

In [111]:
xTrain, xTest, yTrain, yTest = train_test_split(xEncoded,yEncoded, test_size = 0.2)
from keras.models import Sequential
from keras.layers import Dense

inputDim = 41
outputDim = 9

model = Sequential()
model.add(Dense(16, activation = 'relu', input_shape = (37,)))
model.add(Dense(outputDim, activation = 'softmax'))
model.compile(loss = "categorical_crossentropy", optimizer = "adam", metrics = ["accuracy"])
model.summary()

Model: "sequential_17"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_32 (Dense)             (None, 16)                608       
_________________________________________________________________
dense_33 (Dense)             (None, 9)                 153       
Total params: 761
Trainable params: 761
Non-trainable params: 0
_________________________________________________________________


In [112]:
model.fit(xTrain,yTrain, validation_data = (xTest,yTest), epochs = 100)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100


Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78/100
Epoch 79/100
Epoch 80/100
Epoch 81/100
Epoch 82/100
Epoch 83/100
Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100


<tensorflow.python.keras.callbacks.History at 0x25054497730>

In [113]:
predictions = model.predict(xTest)
predictedIndices = [np.argmax(sample) for sample in predictions]
print([(inversePOSTSEASONmapping[predictedIndices[i]], inversePOSTSEASONmapping[np.argmax(yTest[i])]) for i in range(len(predictedIndices))])

[(nan, nan), (nan, nan), (nan, nan), (nan, nan), ('S16', 'R32'), ('R64', nan), (nan, nan), (nan, nan), (nan, nan), (nan, nan), (nan, nan), (nan, nan), ('R32', 'R64'), (nan, nan), (nan, nan), (nan, nan), (nan, nan), (nan, nan), (nan, nan), (nan, nan), (nan, nan), (nan, nan), (nan, nan), (nan, nan), ('R64', 'R64'), (nan, nan), (nan, nan), (nan, nan), (nan, nan), (nan, nan), (nan, nan), (nan, nan), (nan, nan), (nan, nan), (nan, nan), (nan, nan), (nan, nan), ('S16', 'F4'), (nan, 'R32'), (nan, nan), (nan, nan), (nan, nan), (nan, nan), (nan, nan), (nan, nan), (nan, nan), (nan, nan), (nan, nan), (nan, nan), (nan, nan), (nan, nan), (nan, nan), (nan, nan), (nan, nan), (nan, nan), (nan, nan), (nan, nan), (nan, nan), (nan, 'R64'), (nan, nan), (nan, nan), (nan, nan), (nan, nan), (nan, nan), (nan, nan), (nan, nan), (nan, nan), (nan, nan), (nan, nan), (nan, nan), (nan, nan), (nan, nan), (nan, nan), (nan, nan), (nan, nan), (nan, nan), (nan, nan), ('R64', 'R32'), (nan, nan), (nan, nan), (nan, nan), (n