In [56]:
from scipy.stats import linregress
import numpy as np
import pandas as pd
import datetime as dt
import sklearn
import sklearn.datasets
from sklearn.model_selection import train_test_split
from sklearn.datasets import make_classification
from sklearn.preprocessing import LabelEncoder, MinMaxScaler, StandardScaler
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
import tensorflow as tf

In [57]:
# Call in portion of data for basic manipulation
df = pd.read_csv('./Data/data.csv')
df.head()
# df.describe()

Unnamed: 0,Date,Rot,VH,Team,1st,2nd,Final,Open,Close,ML,2H
0,1109,711,V,FloridaIntl,30,42,72,146,150.5,10000,79
1,1109,712,H,NorthCarolina,46,42,88,30,30.5,-30000,14.5
2,1109,713,V,MurrayState,26,44,70,132,139.5,700,73
3,1109,714,H,California,34,41,75,13,13.5,-1100,6
4,1109,715,V,AlcornState,28,32,60,NL,NL,16000,NL


In [58]:
#finding outlier games/teams
count_df = df['Team'].value_counts()
count_dict = count_df.to_dict()
df['Team Frequency'] = df['Team'].map(count_dict)

In [59]:
# encode the team names for easier training/testing
le = LabelEncoder()
le.fit(df['Team'])
team_code = le.transform(df['Team'])
df['Team Code'] = team_code
df.head()

Unnamed: 0,Date,Rot,VH,Team,1st,2nd,Final,Open,Close,ML,2H,Team Frequency,Team Code
0,1109,711,V,FloridaIntl,30,42,72,146,150.5,10000,79,270,107
1,1109,712,H,NorthCarolina,46,42,88,30,30.5,-30000,14.5,382,232
2,1109,713,V,MurrayState,26,44,70,132,139.5,700,73,313,211
3,1109,714,H,California,34,41,75,13,13.5,-1100,6,344,46
4,1109,715,V,AlcornState,28,32,60,NL,NL,16000,NL,78,9


In [60]:
track_df = pd.DataFrame(columns=['Team','Code'])


track_df['Team'] = df['Team'].unique()
track_df['Code'] = df['Team Code'].unique()

team_dict = dict(zip(track_df.Team, track_df.Code))
# team_dict

In [61]:
# Transferring home/away status to numeric
def home_away_func(row):
    if row['VH'] == 'H':
        return 1
    elif row['VH'] == 'V':
        return 0
    else:
        return 0.5

In [62]:
df['Home/Away/Neutral'] = df.apply(home_away_func, axis=1)
# df.head()

In [63]:
# Reorder to put entire game data on single row
odd = df.iloc[::2]
even = df.iloc[1::2]

In [64]:
merged_df = odd.reset_index(drop=True).merge(even.reset_index(drop=True), left_index=True, right_index=True)
merged_df.head()

Unnamed: 0,Date_x,Rot_x,VH_x,Team_x,1st_x,2nd_x,Final_x,Open_x,Close_x,ML_x,...,1st_y,2nd_y,Final_y,Open_y,Close_y,ML_y,2H_y,Team Frequency_y,Team Code_y,Home/Away/Neutral_y
0,1109,711,V,FloridaIntl,30,42,72,146,150.5,10000,...,46,42,88,30,30.5,-30000,14.5,382,232,1.0
1,1109,713,V,MurrayState,26,44,70,132,139.5,700,...,34,41,75,13,13.5,-1100,6.0,344,46,1.0
2,1109,715,V,AlcornState,28,32,60,NL,NL,16000,...,56,44,100,34,39.0,-48000,15.0,361,251,1.0
3,1109,717,V,AlbanyNY,20,23,43,NL,NL,1500,...,38,37,75,24,20.0,-3000,7.0,348,327,1.0
4,1111,727,V,DetroitU,32,29,61,142,140.5,1350,...,40,55,95,18,17.0,-2300,8.5,344,46,1.0


In [65]:
cleaned_df = merged_df.rename(columns = {'Date_x':'Date','VH_x':'Loc Team X','Team_x':'Team X','1st_x':'Team X 1st Half',
                                      '2nd_x':'Team X 2nd Half','Final_x':'Team X Final','VH_y':'Loc Team Y',
                                      'Team_y':'Team Y','1st_y':'Team Y 1st Half','2nd_y':'Team Y 2nd Half',
                                     'Final_y':'Team Y Final','Open_x':'Total Open','Close_x':'Total Close','ML_x':'Team X Moneyline',
                                        '2H_x':'2nd Half Total','Open_y':'Spread Open','Close_y':'Spread Close','ML_y':'Team Y Moneyline',
                                        '2H_y':'2nd Half Spread'})
cleaned_df.head()

Unnamed: 0,Date,Rot_x,Loc Team X,Team X,Team X 1st Half,Team X 2nd Half,Team X Final,Total Open,Total Close,Team X Moneyline,...,Team Y 1st Half,Team Y 2nd Half,Team Y Final,Spread Open,Spread Close,Team Y Moneyline,2nd Half Spread,Team Frequency_y,Team Code_y,Home/Away/Neutral_y
0,1109,711,V,FloridaIntl,30,42,72,146,150.5,10000,...,46,42,88,30,30.5,-30000,14.5,382,232,1.0
1,1109,713,V,MurrayState,26,44,70,132,139.5,700,...,34,41,75,13,13.5,-1100,6.0,344,46,1.0
2,1109,715,V,AlcornState,28,32,60,NL,NL,16000,...,56,44,100,34,39.0,-48000,15.0,361,251,1.0
3,1109,717,V,AlbanyNY,20,23,43,NL,NL,1500,...,38,37,75,24,20.0,-3000,7.0,348,327,1.0
4,1111,727,V,DetroitU,32,29,61,142,140.5,1350,...,40,55,95,18,17.0,-2300,8.5,344,46,1.0


In [66]:
cleaned_df.drop(cleaned_df[cleaned_df['Total Close'] == 'NL'].index, inplace = True) 
cleaned_df.drop(cleaned_df[cleaned_df['Spread Close'] == 'NL'].index, inplace=True)
cleaned_df.drop(cleaned_df[cleaned_df['2nd Half Spread'] == 'NL'].index, inplace=True)
cleaned_df.drop(cleaned_df[cleaned_df['2nd Half Total'] == 'NL'].index, inplace=True)
cleaned_df.drop(cleaned_df[cleaned_df['Team X Moneyline'] == 'NL'].index, inplace=True)
cleaned_df.drop(cleaned_df[cleaned_df['Team Y Moneyline'] == 'NL'].index, inplace=True)

In [67]:
cleaned_df.drop(columns=['Total Open','Spread Open'])
cleaned_df.head()

Unnamed: 0,Date,Rot_x,Loc Team X,Team X,Team X 1st Half,Team X 2nd Half,Team X Final,Total Open,Total Close,Team X Moneyline,...,Team Y 1st Half,Team Y 2nd Half,Team Y Final,Spread Open,Spread Close,Team Y Moneyline,2nd Half Spread,Team Frequency_y,Team Code_y,Home/Away/Neutral_y
0,1109,711,V,FloridaIntl,30,42,72,146.0,150.5,10000,...,46,42,88,30.0,30.5,-30000,14.5,382,232,1.0
1,1109,713,V,MurrayState,26,44,70,132.0,139.5,700,...,34,41,75,13.0,13.5,-1100,6.0,344,46,1.0
4,1111,727,V,DetroitU,32,29,61,142.0,140.5,1350,...,40,55,95,18.0,17.0,-2300,8.5,344,46,1.0
7,1112,505,V,JamesMadison,25,19,44,149.0,140.5,1500,...,34,38,72,16.5,19.0,-2600,8.5,361,251,1.0
8,1112,507,V,GeorgiaState,29,24,53,137.5,135.0,450,...,32,37,69,10.0,10.5,-600,5.5,333,215,1.0


In [68]:
cleaned_df = cleaned_df.loc[(cleaned_df['Team Frequency_x'] > 10) & (cleaned_df['Team Frequency_y'] > 10)]

In [69]:
# Find winner
def win_func(row):
    if row['Team X Final'] > row['Team Y Final']:
        return row['Team X']
    else:
        return row['Team Y']

cleaned_df['Winner'] = cleaned_df.apply(win_func, axis=1)

In [70]:
# Drop meaningless columns
shortened_df = cleaned_df.drop(columns=['Rot_x','Date_y','Rot_y','Total Open','Spread Open'])

In [71]:
# Convert objects to floats
shortened_df['Total Close'] = pd.to_numeric(shortened_df['Total Close'])
shortened_df['Spread Close'] = pd.to_numeric(shortened_df['Spread Close'])
shortened_df['2nd Half Spread'] = pd.to_numeric(shortened_df['2nd Half Spread'])
shortened_df['2nd Half Total'] = pd.to_numeric(shortened_df['2nd Half Total'])
shortened_df['Team X Moneyline'] = pd.to_numeric(shortened_df['Team X Moneyline'])
shortened_df['Team Y Moneyline'] = pd.to_numeric(shortened_df['Team Y Moneyline'])

In [72]:
###### Use the df.apply to create corrected columns then drop bad column
def spread_func(row):
    if row['Total Close'] < row['Spread Close']:
        return row['Total Close']
    else:
        return (row['Spread Close'] * -1)
    
def total_func(row):
    if row['Total Close'] < row['Spread Close']:
        return row['Spread Close']
    else:
        return row['Total Close']

In [73]:
shortened_df['Corrected Spread'] = shortened_df.apply(spread_func, axis=1)
shortened_df['Corrected Total'] = shortened_df.apply(total_func, axis=1)
shortened_df.head()

Unnamed: 0,Date,Loc Team X,Team X,Team X 1st Half,Team X 2nd Half,Team X Final,Total Close,Team X Moneyline,2nd Half Total,Team Frequency_x,...,Team Y Final,Spread Close,Team Y Moneyline,2nd Half Spread,Team Frequency_y,Team Code_y,Home/Away/Neutral_y,Winner,Corrected Spread,Corrected Total
0,1109,V,FloridaIntl,30,42,72,150.5,10000,79.0,270,...,88,30.5,-30000,14.5,382,232,1.0,NorthCarolina,-30.5,150.5
1,1109,V,MurrayState,26,44,70,139.5,700,73.0,313,...,75,13.5,-1100,6.0,344,46,1.0,California,-13.5,139.5
4,1111,V,DetroitU,32,29,61,140.5,1350,73.5,326,...,95,17.0,-2300,8.5,344,46,1.0,California,-17.0,140.5
7,1112,V,JamesMadison,25,19,44,140.5,1500,72.5,312,...,72,19.0,-2600,8.5,361,251,1.0,OhioState,-19.0,140.5
8,1112,V,GeorgiaState,29,24,53,135.0,450,71.0,322,...,69,10.5,-600,5.5,333,215,1.0,NCState,-10.5,135.0


In [74]:
shortened_df['Team X Spread'] = shortened_df['Corrected Spread'] * -1
shortened_df['Team Y Spread'] = shortened_df['Corrected Spread']

In [75]:
### Create function to see who wins against the spread
### if they cover, they get a 1, if no cover or push, then 0
def team_to_cover(row):
    if (row['Team Y Final'] + row['Corrected Spread']) < row['Team X Final']:
        return 0
    elif (row['Team Y Final'] + row['Corrected Spread']) > row['Team X Final']:
        return 1
    else:
        return 0.5


In [76]:
# Team to cover key: 1 = Team Y, 0 = Team x, 0.5 = Push ATS
shortened_df['Team to Cover'] = shortened_df.apply(team_to_cover, axis=1)

In [77]:
# Check output
final_df = shortened_df
final_df.head(50)

Unnamed: 0,Date,Loc Team X,Team X,Team X 1st Half,Team X 2nd Half,Team X Final,Total Close,Team X Moneyline,2nd Half Total,Team Frequency_x,...,2nd Half Spread,Team Frequency_y,Team Code_y,Home/Away/Neutral_y,Winner,Corrected Spread,Corrected Total,Team X Spread,Team Y Spread,Team to Cover
0,1109,V,FloridaIntl,30,42,72,150.5,10000,79.0,270,...,14.5,382,232,1.0,NorthCarolina,-30.5,150.5,30.5,-30.5,0.0
1,1109,V,MurrayState,26,44,70,139.5,700,73.0,313,...,6.0,344,46,1.0,California,-13.5,139.5,13.5,-13.5,0.0
4,1111,V,DetroitU,32,29,61,140.5,1350,73.5,326,...,8.5,344,46,1.0,California,-17.0,140.5,17.0,-17.0,1.0
7,1112,V,JamesMadison,25,19,44,140.5,1500,72.5,312,...,8.5,361,251,1.0,OhioState,-19.0,140.5,19.0,-19.0,1.0
8,1112,V,GeorgiaState,29,24,53,135.0,450,71.0,322,...,5.5,333,215,1.0,NCState,-10.5,135.0,10.5,-10.5,1.0
9,1113,V,CSNorthridge,20,43,63,142.0,2000,74.5,295,...,7.0,348,269,1.0,Purdue,-22.5,142.0,22.5,-22.5,1.0
10,1113,V,William&Mary,31,35,66,136.0,2250,73.0,291,...,10.5,336,71,1.0,Connecticut,-23.5,136.0,23.5,-23.5,0.0
11,1113,V,YoungstownState,24,33,57,137.0,1150,73.0,293,...,4.5,357,402,1.0,Xavier,-16.5,137.0,16.5,-16.5,1.0
12,1113,V,MiamiOhio,42,29,71,1.5,-125,70.0,316,...,0.0,290,349,1.0,Towson,1.5,125.0,-1.5,1.5,1.0
13,1113,V,ClevelandState,25,37,62,1.0,-120,3.0,321,...,70.0,317,315,1.0,StBonaventure,1.0,134.0,-1.0,1.0,1.0


In [79]:
predict_winner_df = final_df.drop(['Date','Team X 1st Half','Team X 2nd Half','Team X Final',
                                  'Total Close','2nd Half Total','Team X','Team Y',
                                   'Team Y 1st Half','Team Y 2nd Half','Team Y Final',
                                   'Spread Close','2nd Half Spread','Winner','Corrected Spread',
                                  'Loc Team X','Loc Team Y','Team Frequency_x','Team Frequency_y',
                                  ], axis=1)
predict_winner_df.head()

Unnamed: 0,Team X Moneyline,Team Code_x,Home/Away/Neutral_x,Team Y Moneyline,Team Code_y,Home/Away/Neutral_y,Corrected Total,Team X Spread,Team Y Spread,Team to Cover
0,10000,107,0.0,-30000,232,1.0,150.5,30.5,-30.5,0.0
1,700,211,0.0,-1100,46,1.0,139.5,13.5,-13.5,0.0
4,1350,84,0.0,-2300,46,1.0,140.5,17.0,-17.0,1.0
7,1500,154,0.0,-2600,251,1.0,140.5,19.0,-19.0,1.0
8,450,119,0.0,-600,215,1.0,135.0,10.5,-10.5,1.0


In [27]:
x = predict_winner_df.drop(['Team to Cover'], axis=1)
y = predict_winner_df['Team to Cover']

print(x.shape, y.shape)

(39349, 9) (39349,)


In [28]:
y.nunique()

3

In [34]:
x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=1)

In [35]:
scalar = StandardScaler()

In [36]:
scalar.fit(x_train)

StandardScaler(copy=True, with_mean=True, with_std=True)

In [37]:
x_train_scaled = scalar.transform(x_train)

In [38]:
x_test_scaled = scalar.transform(x_test)

In [39]:
y_train_categorical = to_categorical(y_train)
y_test_categorical = to_categorical(y_test)

In [40]:
model = Sequential()
model.add(Dense(units=100, activation='relu', input_dim=9))
model.add(Dense(units=100, activation='relu'))
model.add(Dense(units=100, activation='relu'))
model.add(Dense(units=3, activation='softmax'))

Instructions for updating:
If using Keras pass *_constraint arguments to layers.


In [41]:
model.compile(optimizer='adam',
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])

In [42]:
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                (None, 100)               1000      
_________________________________________________________________
dense_1 (Dense)              (None, 100)               10100     
_________________________________________________________________
dense_2 (Dense)              (None, 100)               10100     
_________________________________________________________________
dense_3 (Dense)              (None, 3)                 303       
Total params: 21,503
Trainable params: 21,503
Non-trainable params: 0
_________________________________________________________________


In [44]:
model.fit(
    x_train_scaled,
    y_train,
    epochs=200,
    shuffle=True,
    verbose=2
)

Train on 29511 samples
Epoch 1/200
29511/29511 - 6s - loss: 0.7011 - acc: 0.4923
Epoch 2/200
29511/29511 - 6s - loss: 0.6945 - acc: 0.4977
Epoch 3/200
29511/29511 - 6s - loss: 0.6936 - acc: 0.4978
Epoch 4/200
29511/29511 - 6s - loss: 0.6932 - acc: 0.4973
Epoch 5/200
29511/29511 - 6s - loss: 0.6930 - acc: 0.4984
Epoch 6/200
29511/29511 - 6s - loss: 0.6929 - acc: 0.4991
Epoch 7/200
29511/29511 - 9s - loss: 0.6928 - acc: 0.4978
Epoch 8/200
29511/29511 - 5s - loss: 0.6925 - acc: 0.4997
Epoch 9/200
29511/29511 - 7s - loss: 0.6923 - acc: 0.5009
Epoch 10/200
29511/29511 - 5s - loss: 0.6921 - acc: 0.5022
Epoch 11/200
29511/29511 - 4s - loss: 0.6920 - acc: 0.5010
Epoch 12/200
29511/29511 - 7s - loss: 0.6919 - acc: 0.5005
Epoch 13/200
29511/29511 - 4s - loss: 0.6916 - acc: 0.5037
Epoch 14/200
29511/29511 - 12s - loss: 0.6914 - acc: 0.5054
Epoch 15/200
29511/29511 - 4s - loss: 0.6912 - acc: 0.5055
Epoch 16/200
29511/29511 - 4s - loss: 0.6913 - acc: 0.5078
Epoch 17/200
29511/29511 - 8s - loss: 0.6

Epoch 138/200
29511/29511 - 9s - loss: 0.6224 - acc: 0.6193
Epoch 139/200
29511/29511 - 11s - loss: 0.6217 - acc: 0.6210
Epoch 140/200
29511/29511 - 11s - loss: 0.6221 - acc: 0.6219
Epoch 141/200
29511/29511 - 15s - loss: 0.6208 - acc: 0.6249
Epoch 142/200
29511/29511 - 11s - loss: 0.6213 - acc: 0.6211
Epoch 143/200
29511/29511 - 20s - loss: 0.6193 - acc: 0.6223
Epoch 144/200
29511/29511 - 8s - loss: 0.6185 - acc: 0.6244
Epoch 145/200
29511/29511 - 13s - loss: 0.6191 - acc: 0.6250
Epoch 146/200
29511/29511 - 10s - loss: 0.6176 - acc: 0.6264
Epoch 147/200
29511/29511 - 10s - loss: 0.6173 - acc: 0.6232
Epoch 148/200
29511/29511 - 15s - loss: 0.6172 - acc: 0.6274
Epoch 149/200
29511/29511 - 12s - loss: 0.6152 - acc: 0.6270
Epoch 150/200
29511/29511 - 12s - loss: 0.6140 - acc: 0.6270
Epoch 151/200
29511/29511 - 10s - loss: 0.6144 - acc: 0.6264
Epoch 152/200
29511/29511 - 15s - loss: 0.6148 - acc: 0.6276
Epoch 153/200
29511/29511 - 12s - loss: 0.6133 - acc: 0.6288
Epoch 154/200
29511/29511 

<tensorflow.python.keras.callbacks.History at 0x26bdbd7e278>

In [None]:
# model_array = ['xml','teamxcode','locxcode','yml','teamycode','locycode','total','xspread','yspread']

In [80]:
model.save('second_model.h5')

In [89]:
encoded_predictions = model.predict_classes(x_test_scaled[:5])
print(encoded_predictions)

[0 0 0 0 0]


In [84]:
# print(x_train_scaled)

[[-0.08466278 -0.11541212  2.67604706 ... -1.66787674 -0.10082904
   0.10082904]
 [ 1.29104209 -1.20899688 -0.37368551 ... -0.50188496  2.0253552
  -2.0253552 ]
 [-0.07615327 -1.1321042  -0.37368551 ...  1.6435399   0.0206672
  -0.0206672 ]
 ...
 [ 0.28408285  1.21739431 -0.37368551 ... -1.2481197   1.66086648
  -1.66086648]
 [-0.0960088  -0.27774111 -0.37368551 ... -0.82836266 -0.40456965
   0.40456965]
 [-0.2690356  -0.10686849 -0.37368551 ... -0.36196595 -1.61953208
   1.61953208]]


In [87]:
kansas_tcu = [-430,155,0,315,328,1,131,-8,8]
kansas_tcu_output = model.predict_classes(kansas_tcu[0:8])
print(kansas_tcu_output)

In [None]:
kansas_tcu_output = list(team_dict.keys())[list(team_dict.values()).index(kansas_tcu)]
print(kansas_tcu_output)

In [None]:
umiami_fsu = [700,191,0,-1200,108,1,144,13.5,-13.5]
model.predict(umiami_fsu)

In [None]:
umiami_fsu_output = list(team_dict.keys())[list(team_dict.values()).index(umiami_fsu)]
print(umiami_fsu_output)

In [None]:
auburn_lsu = [175,160,0,-210,21,1,155.5,4.5,-4.5]
model.predict(auburn_lsu)

In [None]:
auburn_lsu_output = list(team_dict.keys())[list(team_dict.values()).index(auburn_lsu)]
print(auburn_lsu_output)

In [None]:
msu_mich = [120,194,0,-140,193,1,142.5,3,-3]
model.predict(msu_mich)

In [None]:
msu_mich_output = list(team_dict.keys())[list(team_dict.values()).index(msu_mich)]
print(msu_mich_output)

In [None]:
uk_tenn = [-165,159,0,145,332,1,132.5,-3.5,3.5]
model.predict(uk_tenn)

In [None]:
uk_tenn_output = list(team_dict.keys())[list(team_dict.values()).index(uk_tenn)]
print(uk_tenn_output)