In [1]:
# Import our dependencies
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler,OneHotEncoder
import pandas as pd
import tensorflow as tf

import numpy as np
from matplotlib import pyplot as plt
import math
import statistics as stat

import sqlalchemy
from sqlalchemy import create_engine
import psycopg2
from config import db_password

import datetime
from datetime import date
from datetime import time

import re

## Read Kaggle data from the cleaned mls_data_final.csv file

Import the clean and prepared `mls_away_avg.csv` (originally from Kaggle) data on major league soccer (MLS).

In [2]:
# Read .csv from FinalProject/Resources
file_dir = "mls_away_avg.csv"    

# Create a pandas DataFrame 
game_df = pd.read_csv(file_dir, low_memory=False, skipinitialspace = True)

# Show all columns
pd.set_option("display.max_columns", None)

game_df

Unnamed: 0.1,Unnamed: 0,home,away,day,year,venue,home_score,away_score,Outcome,average_away_possession_from_previous_year,average_away_total_shots_previous_year,average_away_shots_on_goal_previous_year,average_away_redcards_previous_year,average_away_corners_won_previous_year
0,0,Columbus Crew,Toronto FC,Saturday,2008,MAPFRE Stadium,2,0,Win,0.498000,7.133333,3.400000,0.200000,3.866667
1,1,Real Salt Lake,Chicago Fire FC,Saturday,2008,Rice-Eccles Stadium,1,1,Tie,0.488235,8.235294,4.176471,0.000000,3.411765
2,2,Sporting Kansas City,DC United,Saturday,2008,Community America Ballpark,2,0,Win,0.507333,7.400000,3.733333,0.266667,4.333333
3,3,New England Revolution,Houston Dynamo FC,Saturday,2008,Gillette Stadium,3,0,Win,0.517059,10.352941,4.235294,0.117647,4.411765
4,4,Colorado Rapids,LA Galaxy,Saturday,2008,Dick's Sporting Goods Park,4,0,Win,0.492667,10.066667,5.133333,0.066667,3.600000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4508,4508,Sporting Kansas City,FC Dallas,Saturday,2021,Children's Mercy Park,1,2,Loss,0.515000,11.250000,3.375000,0.000000,3.375000
4509,4509,Austin FC,Colorado Rapids,Saturday,2021,Q2 Stadium,0,1,Loss,0.460000,11.142857,3.857143,0.000000,4.285714
4510,4510,Vancouver Whitecaps,Minnesota United FC,Saturday,2021,Rio Tinto Stadium,2,2,Tie,0.478750,11.750000,4.125000,0.125000,3.625000
4511,4511,Philadelphia Union,Chicago Fire FC,Sunday,2021,Subaru Park,1,1,Tie,0.503333,11.000000,3.000000,0.333333,3.000000


In [3]:
# Get all the column names
game_df.columns

Index(['Unnamed: 0', 'home', 'away', 'day', 'year', 'venue', 'home_score',
       'away_score', 'Outcome', 'average_away_possession_from_previous_year',
       'average_away_total_shots_previous_year',
       'average_away_shots_on_goal_previous_year',
       'average_away_redcards_previous_year',
       'average_away_corners_won_previous_year'],
      dtype='object')

In [4]:
# Find the total null values in each column
game_df.isnull().sum()

Unnamed: 0                                    0
home                                          0
away                                          0
day                                           0
year                                          0
venue                                         1
home_score                                    0
away_score                                    0
Outcome                                       0
average_away_possession_from_previous_year    0
average_away_total_shots_previous_year        0
average_away_shots_on_goal_previous_year      0
average_away_redcards_previous_year           0
average_away_corners_won_previous_year        0
dtype: int64

In [5]:
game_df = game_df.dropna()
game_df.isnull().sum()

Unnamed: 0                                    0
home                                          0
away                                          0
day                                           0
year                                          0
venue                                         0
home_score                                    0
away_score                                    0
Outcome                                       0
average_away_possession_from_previous_year    0
average_away_total_shots_previous_year        0
average_away_shots_on_goal_previous_year      0
average_away_redcards_previous_year           0
average_away_corners_won_previous_year        0
dtype: int64

In [6]:
# No. of unique data in each column
game_df.nunique()

Unnamed: 0                                    4512
home                                            28
away                                            28
day                                              7
year                                            14
venue                                           59
home_score                                       8
away_score                                       8
Outcome                                          3
average_away_possession_from_previous_year     222
average_away_total_shots_previous_year         212
average_away_shots_on_goal_previous_year       164
average_away_redcards_previous_year             43
average_away_corners_won_previous_year         157
dtype: int64

In [7]:
# Check datatypes of each column
game_df.dtypes

Unnamed: 0                                      int64
home                                           object
away                                           object
day                                            object
year                                            int64
venue                                          object
home_score                                      int64
away_score                                      int64
Outcome                                        object
average_away_possession_from_previous_year    float64
average_away_total_shots_previous_year        float64
average_away_shots_on_goal_previous_year      float64
average_away_redcards_previous_year           float64
average_away_corners_won_previous_year        float64
dtype: object

It looks like there are 5 columns with **categorical** values having consistent data type as *object*.

We should **generate a list of categorical variable** names using Python's "df.dtypes" property. In that case, we can use our variable list to perform the **one-hot encoding** *once*, rather than for each individual variable.

In [8]:
# Generate our categorical variable list
game_cat = game_df.dtypes[game_df.dtypes == "object"].index.tolist()
game_cat

['home', 'away', 'day', 'venue', 'Outcome']

In [9]:
# Check the number of unique values in each column
game_df[game_cat].nunique()

home       28
away       28
day         7
venue      59
Outcome     3
dtype: int64

Before going to encode these columns using Scikit-learn's OneHotEncoder module, we need to make sure that if the categorical variables require **bucketing**. Since we want to predict outcome for each individual `home` team, we need to see the relation of all individual `home` team, `venue` and other columns, we choose not to to any binning/bucketing, and we're ready to use **OneHotEncoder**.

In [10]:
# Create a OneHotEncoder instance
enc = OneHotEncoder(sparse=False)

# Fit and transform the OneHotEncoder using the categorical variable list
encode_df = pd.DataFrame(enc.fit_transform(game_df[game_cat]))
encode_df.head(2)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100,101,102,103,104,105,106,107,108,109,110,111,112,113,114,115,116,117,118,119,120,121,122,123,124
0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [11]:
# Add the encoded variable names to the DataFrame
encode_df.columns = enc.get_feature_names(game_cat)
encode_df.head()

Unnamed: 0,home_Atlanta United FC,home_Austin FC,home_CF Montréal,home_Chicago Fire FC,home_Chivas USA,home_Colorado Rapids,home_Columbus Crew,home_DC United,home_FC Cincinnati,home_FC Dallas,home_Houston Dynamo FC,home_Inter Miami CF,home_LA Galaxy,home_LAFC,home_Minnesota United FC,home_Nashville SC,home_New England Revolution,home_New York City FC,home_New York Red Bulls,home_Orlando City SC,home_Philadelphia Union,home_Portland Timbers,home_Real Salt Lake,home_San Jose Earthquakes,home_Seattle Sounders FC,home_Sporting Kansas City,home_Toronto FC,home_Vancouver Whitecaps,away_Atlanta United FC,away_Austin FC,away_CF Montréal,away_Chicago Fire FC,away_Chivas USA,away_Colorado Rapids,away_Columbus Crew,away_DC United,away_FC Cincinnati,away_FC Dallas,away_Houston Dynamo FC,away_Inter Miami CF,away_LA Galaxy,away_LAFC,away_Minnesota United FC,away_Nashville SC,away_New England Revolution,away_New York City FC,away_New York Red Bulls,away_Orlando City SC,away_Philadelphia Union,away_Portland Timbers,away_Real Salt Lake,away_San Jose Earthquakes,away_Seattle Sounders FC,away_Sporting Kansas City,away_Toronto FC,away_Vancouver Whitecaps,day_Friday,day_Monday,day_Saturday,day_Sunday,day_Thursday,day_Tuesday,day_Wednesday,venue_AT&T Stadium,venue_Allianz Field,venue_Audi Field,venue_BBVA Stadium,venue_BC Place,venue_BMO Field,venue_Banc of California Stadium,venue_Bobby Dodd Stadium,venue_Buck Shaw Stadium,venue_Camping World Stadium,venue_CenturyLink Field,venue_Children's Mercy Park,venue_Citi Field Stadium,venue_Community America Ballpark,venue_DRV PNK Stadium,venue_Dick's Sporting Goods Park,venue_Dignity Health Sports Park,venue_ESPN Wide World of Sports Complex,venue_Earthquakes Stadium,venue_Empire Field,venue_Exploria Stadium,venue_FedExField,venue_Giants Stadium,venue_Gillette Stadium,venue_Historic Crew Stadium,venue_Inter Miami CF Stadium,venue_Levi's Stadium,venue_Lower.com Field,venue_Lumen Field,venue_MAPFRE Stadium,venue_Maryland SoccerPlex,venue_McAfee Coliseum,venue_Mercedes-Benz Stadium,venue_Navy-Marine Corps Memorial Stadium,venue_Nippert Stadium,venue_Nissan Stadium,venue_PayPal Park,venue_Pizza Hut Park,venue_Pratt & Whitney Stadium at Rentschler Field,venue_Providence Park,venue_Q2 Stadium,venue_Qwest Field,venue_RFK Stadium,venue_Red Bull Arena,venue_Rice-Eccles Stadium,venue_Rio Tinto Stadium,venue_Robertson Stadium,venue_Rogers Centere,venue_SeatGeek Stadium,venue_Soldier Field,venue_Stade Olympique,venue_Stade Saputo,venue_Stanford Stadium,venue_StubHub Center,venue_Subaru Park,venue_TCF Bank Stadium,venue_TQL Stadium,venue_Toyota Stadium,venue_Yankee Stadium,Outcome_Loss,Outcome_Tie,Outcome_Win
0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


Now that our categorical variables have been encoded, they are ready to replace our unencoded categorical variables in our dataset using *panda's* **merge** and **drop** methods.

In [12]:
# Merge OneHotEncoded features and drop the originals from the updated df
game_df = game_df.merge(encode_df, left_index=True, right_index=True).drop(game_df[game_cat],1)
game_df.head()

Unnamed: 0.1,Unnamed: 0,year,home_score,away_score,average_away_possession_from_previous_year,average_away_total_shots_previous_year,average_away_shots_on_goal_previous_year,average_away_redcards_previous_year,average_away_corners_won_previous_year,home_Atlanta United FC,home_Austin FC,home_CF Montréal,home_Chicago Fire FC,home_Chivas USA,home_Colorado Rapids,home_Columbus Crew,home_DC United,home_FC Cincinnati,home_FC Dallas,home_Houston Dynamo FC,home_Inter Miami CF,home_LA Galaxy,home_LAFC,home_Minnesota United FC,home_Nashville SC,home_New England Revolution,home_New York City FC,home_New York Red Bulls,home_Orlando City SC,home_Philadelphia Union,home_Portland Timbers,home_Real Salt Lake,home_San Jose Earthquakes,home_Seattle Sounders FC,home_Sporting Kansas City,home_Toronto FC,home_Vancouver Whitecaps,away_Atlanta United FC,away_Austin FC,away_CF Montréal,away_Chicago Fire FC,away_Chivas USA,away_Colorado Rapids,away_Columbus Crew,away_DC United,away_FC Cincinnati,away_FC Dallas,away_Houston Dynamo FC,away_Inter Miami CF,away_LA Galaxy,away_LAFC,away_Minnesota United FC,away_Nashville SC,away_New England Revolution,away_New York City FC,away_New York Red Bulls,away_Orlando City SC,away_Philadelphia Union,away_Portland Timbers,away_Real Salt Lake,away_San Jose Earthquakes,away_Seattle Sounders FC,away_Sporting Kansas City,away_Toronto FC,away_Vancouver Whitecaps,day_Friday,day_Monday,day_Saturday,day_Sunday,day_Thursday,day_Tuesday,day_Wednesday,venue_AT&T Stadium,venue_Allianz Field,venue_Audi Field,venue_BBVA Stadium,venue_BC Place,venue_BMO Field,venue_Banc of California Stadium,venue_Bobby Dodd Stadium,venue_Buck Shaw Stadium,venue_Camping World Stadium,venue_CenturyLink Field,venue_Children's Mercy Park,venue_Citi Field Stadium,venue_Community America Ballpark,venue_DRV PNK Stadium,venue_Dick's Sporting Goods Park,venue_Dignity Health Sports Park,venue_ESPN Wide World of Sports Complex,venue_Earthquakes Stadium,venue_Empire Field,venue_Exploria Stadium,venue_FedExField,venue_Giants Stadium,venue_Gillette Stadium,venue_Historic Crew Stadium,venue_Inter Miami CF Stadium,venue_Levi's Stadium,venue_Lower.com Field,venue_Lumen Field,venue_MAPFRE Stadium,venue_Maryland SoccerPlex,venue_McAfee Coliseum,venue_Mercedes-Benz Stadium,venue_Navy-Marine Corps Memorial Stadium,venue_Nippert Stadium,venue_Nissan Stadium,venue_PayPal Park,venue_Pizza Hut Park,venue_Pratt & Whitney Stadium at Rentschler Field,venue_Providence Park,venue_Q2 Stadium,venue_Qwest Field,venue_RFK Stadium,venue_Red Bull Arena,venue_Rice-Eccles Stadium,venue_Rio Tinto Stadium,venue_Robertson Stadium,venue_Rogers Centere,venue_SeatGeek Stadium,venue_Soldier Field,venue_Stade Olympique,venue_Stade Saputo,venue_Stanford Stadium,venue_StubHub Center,venue_Subaru Park,venue_TCF Bank Stadium,venue_TQL Stadium,venue_Toyota Stadium,venue_Yankee Stadium,Outcome_Loss,Outcome_Tie,Outcome_Win
0,0,2008,2,0,0.498,7.133333,3.4,0.2,3.866667,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,1,2008,1,1,0.488235,8.235294,4.176471,0.0,3.411765,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2,2,2008,2,0,0.507333,7.4,3.733333,0.266667,4.333333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,3,2008,3,0,0.517059,10.352941,4.235294,0.117647,4.411765,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,4,2008,4,0,0.492667,10.066667,5.133333,0.066667,3.6,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


We need to **split** our **training** and **testing** data *before* fitting our **StandardScaler** instance. This <u> prevents testing data from influencing the standardization </u> function.

To build our training and testing datasets, we need to separate two values:

input values (which are our *independent variables* commonly referred to as **model features or "X"**) and **target output** ( *dependent variable* commonly referred to as **target or "y"** in TensorFlow documentation).

We want to build a model that will predict whether or not a team is winning; therefore, we must separate the `Outcome_win` column from the rest of the input data. Also, the `Outcome_loss`, `Outcome_tie`columns have the same input for all the data, so we can drop this column.

In [13]:
# Split our preprocessed data into our features and target arrays
y = game_df["Outcome_Win"].values
X = game_df.drop(["Outcome_Win","Outcome_Loss", "Outcome_Tie" ], 1).values

# Split the preprocessed data into a training and testing dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=78)

Now that our training and testing data have been allocated, we're ready to build our **StandardScalerobject** and standardize the numerical features.

In [14]:
# Create a StandardScaler instance
scaler = StandardScaler()

# Fit the StandardScaler
X_scaler = scaler.fit(X_train)

# Scale the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

Our data is now **preprocessed** via one-hot encoding and standardization. 

### Define Neural Network model

For our **input layer**, we must add the **number of input features equal to the number of variables in our feature** DataFrame.

In our **hidden layers**, we'll add **three hidden layers** with only a few neurons in each layer. To create the *second hidden layer*, we'll add another **Keras Dense class** while defining our model. All of our hidden layers will use the **relu activation** function to identify nonlinear characteristics from the input values.

In the **output layer**, we'll use the `sigmoid` activation function that will help us predict the probability that a team is winning or not.

In [15]:
len(X_train[0])

131

In [16]:
# Define the model - deep neural net
number_input_features = len(X_train[0])
hidden_nodes_layer1 =  262
hidden_nodes_layer2 = 131
hidden_nodes_layer3 = 20

nn = tf.keras.models.Sequential()

# First hidden layer
nn.add(
    tf.keras.layers.Dense(units=hidden_nodes_layer1, input_dim=number_input_features, activation="relu")
)

# Second hidden layer
nn.add(tf.keras.layers.Dense(units=hidden_nodes_layer2, activation="relu"))

# Third hidden layer
nn.add(tf.keras.layers.Dense(units=hidden_nodes_layer3, activation="relu"))

# Output layer
nn.add(tf.keras.layers.Dense(units=1, activation="sigmoid"))

# Check the structure of the model
nn.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                (None, 262)               34584     
_________________________________________________________________
dense_1 (Dense)              (None, 131)               34453     
_________________________________________________________________
dense_2 (Dense)              (None, 20)                2640      
_________________________________________________________________
dense_3 (Dense)              (None, 1)                 21        
Total params: 71,698
Trainable params: 71,698
Non-trainable params: 0
_________________________________________________________________


In [17]:
# Compile the model
nn.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])
nn.compile(loss="mean_squared_error", optimizer="adam", metrics=["accuracy"])

# Train the model
fit_model = nn.fit(X_train_scaled,y_train,epochs=100)

# Evaluate the model using the test data
model_loss, model_accuracy = nn.evaluate(X_test_scaled,y_test,verbose=2)
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100


Epoch 77/100
Epoch 78/100
Epoch 79/100
Epoch 80/100
Epoch 81/100
Epoch 82/100
Epoch 83/100
Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100
36/36 - 0s - loss: 0.4306 - accuracy: 0.5142
Loss: 0.4305809438228607, Accuracy: 0.5141844153404236
