In [34]:
# Import our dependencies
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler,OneHotEncoder
import pandas as pd
import tensorflow as tf

import numpy as np
from matplotlib import pyplot as plt

import sqlalchemy
from sqlalchemy import create_engine
import psycopg2
from config import db_password


## Read data from PostgreSQL

Import the table `matches_data` from PostgreSQL database `major_league_soccer`.

In [35]:
# Build the connection URL for the local PostgreSQL server.
# SQLAlchemy 1.4+ only accepts the "postgresql" dialect name; the legacy
# "postgres://" scheme fails with NoSuchModuleError on those versions.
db_link = f"postgresql://postgres:{db_password}@127.0.0.1:5432/major_league_soccer"

# Create the database engine used by the pandas readers below
engine = sqlalchemy.create_engine(db_link)

In [36]:
# Load the whole `matches_data` table through the engine and display it
mls_matches = pd.read_sql_table(table_name="matches_data", con=engine)
mls_matches

Unnamed: 0.1,index,Unnamed: 0,id,home,away,year,attendance,venue,game_status,shootout,...,away_yellowCards,home_redCards,away_redCards,home_offsides,away_offsides,home_wonCorners,away_wonCorners,home_saves,away_saves,Outcome
0,0,1797,336076,Colorado Rapids,Columbus Crew SC,2012,14746.0,"Dick's Sporting Goods Park, Denver",FT,False,...,3,0,0,2,4,8,4,3,3,Win
1,1,1798,336077,Vancouver Whitecaps,Montreal Impact,2012,21000.0,"BC Place, Vancouver",FT,False,...,2,0,0,2,2,4,4,7,3,Win
2,2,1799,336078,DC United,Sporting Kansas City,2012,16314.0,"RFK Stadium, Washington, D.C.",FT,False,...,1,0,0,3,0,3,5,7,1,Loss
3,3,1800,336080,LA Galaxy,Real Salt Lake,2012,27000.0,"Dignity Health Sports Park, Los Angeles",FT,False,...,2,0,0,3,4,7,2,2,3,Loss
4,4,1801,336079,San Jose Earthquakes,New England Revolution,2012,10525.0,Buck Shaw Stadium,FT,False,...,1,0,0,2,3,3,3,1,0,Win
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3115,3429,5226,597845,Vancouver Whitecaps,Real Salt Lake,2021,12009.0,"Rio Tinto Stadium, Sandy, UT",FT,False,...,2,0,0,0,0,5,4,4,2,Loss
3116,3430,5227,597846,LA Galaxy,FC Dallas,2021,16453.0,"Dignity Health Sports Park, Carson",FT,False,...,3,0,0,0,3,2,2,3,2,Win
3117,3431,5228,597848,Nashville SC,Atlanta United FC,2021,22913.0,"Nissan Stadium, Nashville, Nashville, TN",FT,False,...,4,0,1,6,1,6,3,0,6,Tie
3118,3432,5229,597847,New York Red Bulls,Philadelphia Union,2021,10013.0,"Red Bull Arena, Harrison",FT,False,...,2,0,1,1,4,5,5,5,2,Tie


The *mls_matches* DataFrame is **not showing all the columns**, because by default pandas displays at most 20 columns. However, we need to see all the column headers to decide on the training/testing data. To display all columns we can do the following:

In [41]:
# Remove the column-display cap so every header is rendered
pd.options.display.max_columns = None
mls_matches.head(n=3)

Unnamed: 0.1,index,Unnamed: 0,id,home,away,year,attendance,venue,game_status,shootout,home_score,away_score,home_possessionPct,away_possessionPct,home_foulsCommitted,away_foulsCommitted,home_yellowCards,away_yellowCards,home_redCards,away_redCards,home_offsides,away_offsides,home_wonCorners,away_wonCorners,home_saves,away_saves,Outcome
0,0,1797,336076,Colorado Rapids,Columbus Crew SC,2012,14746.0,"Dick's Sporting Goods Park, Denver",FT,False,2,0,0.53,0.47,11,14,1,3,0,0,2,4,8,4,3,3,Win
1,1,1798,336077,Vancouver Whitecaps,Montreal Impact,2012,21000.0,"BC Place, Vancouver",FT,False,2,0,0.5,0.5,13,26,1,2,0,0,2,2,4,4,7,3,Win
2,2,1799,336078,DC United,Sporting Kansas City,2012,16314.0,"RFK Stadium, Washington, D.C.",FT,False,0,1,0.43,0.57,12,16,1,1,0,0,3,0,3,5,7,1,Loss


In [37]:
# Count missing values per column
mls_matches.isna().sum()

index                  0
Unnamed: 0             0
id                     0
home                   0
away                   0
year                   0
attendance             0
venue                  0
game_status            0
shootout               0
home_score             0
away_score             0
home_possessionPct     0
away_possessionPct     0
home_foulsCommitted    0
away_foulsCommitted    0
home_yellowCards       0
away_yellowCards       0
home_redCards          0
away_redCards          0
home_offsides          0
away_offsides          0
home_wonCorners        0
away_wonCorners        0
home_saves             0
away_saves             0
Outcome                0
dtype: int64

In [38]:
# List every column label of the loaded table
mls_matches.columns

Index(['index', 'Unnamed: 0', 'id', 'home', 'away', 'year', 'attendance',
       'venue', 'game_status', 'shootout', 'home_score', 'away_score',
       'home_possessionPct', 'away_possessionPct', 'home_foulsCommitted',
       'away_foulsCommitted', 'home_yellowCards', 'away_yellowCards',
       'home_redCards', 'away_redCards', 'home_offsides', 'away_offsides',
       'home_wonCorners', 'away_wonCorners', 'home_saves', 'away_saves',
       'Outcome'],
      dtype='object')

So, there are **no NULL** values in any column.

Let's check the datatypes of the columns.

In [39]:
# Show the dtype pandas assigned to each column
mls_matches.dtypes

index                    int64
Unnamed: 0               int64
id                       int64
home                    object
away                    object
year                     int64
attendance             float64
venue                   object
game_status             object
shootout                  bool
home_score               int64
away_score               int64
home_possessionPct     float64
away_possessionPct     float64
home_foulsCommitted      int64
away_foulsCommitted      int64
home_yellowCards         int64
away_yellowCards         int64
home_redCards            int64
away_redCards            int64
home_offsides            int64
away_offsides            int64
home_wonCorners          int64
away_wonCorners          int64
home_saves               int64
away_saves               int64
Outcome                 object
dtype: object

It looks like there are 5 **categorical** columns, all of which are stored with the *object* data type.

We should **generate a list of categorical variable** names using the pandas `df.dtypes` property. That way, we can use our variable list to perform the **one-hot encoding** *once*, rather than separately for each individual variable.

In [43]:
# Collect the names of all object-typed columns, in column order
mls_cat = [name for name, kind in mls_matches.dtypes.items() if kind == "object"]
mls_cat

['home', 'away', 'venue', 'game_status', 'Outcome']

In [44]:
# Cardinality of each categorical column
mls_matches.loc[:, mls_cat].nunique()

home           30
away           31
venue          63
game_status    31
Outcome         3
dtype: int64

Before encoding these columns with Scikit-learn's OneHotEncoder module, we need to check whether any of the categorical variables require **bucketing**. Since we want to predict the outcome for each individual `home` team, and we need to see how it relates to every individual `home` team, `venue` and `game_status`, we choose not to do any binning/bucketing, and we're ready to use **OneHotEncoder**.

In [45]:
# Create a OneHotEncoder instance that returns a dense array.
# scikit-learn 1.2 renamed `sparse` to `sparse_output` (the old keyword was
# removed in 1.4), so try the new name first and fall back on older releases.
try:
    enc = OneHotEncoder(sparse_output=False)
except TypeError:
    enc = OneHotEncoder(sparse=False)

# Fit and transform the OneHotEncoder using the categorical variable list.
# The source index is reused so rows stay aligned when the encoded frame is
# later joined back onto `mls_matches` by index.
encode_df = pd.DataFrame(
    enc.fit_transform(mls_matches[mls_cat]),
    index=mls_matches.index,
)
encode_df.head(2)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100,101,102,103,104,105,106,107,108,109,110,111,112,113,114,115,116,117,118,119,120,121,122,123,124,125,126,127,128,129,130,131,132,133,134,135,136,137,138,139,140,141,142,143,144,145,146,147,148,149,150,151,152,153,154,155,156,157
0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [46]:
# Add the encoded variable names to the DataFrame.
# `get_feature_names` was deprecated in scikit-learn 1.0 and removed in 1.2 in
# favour of `get_feature_names_out`; use whichever this installation provides.
if hasattr(enc, "get_feature_names_out"):
    encode_df.columns = enc.get_feature_names_out(mls_cat)
else:
    encode_df.columns = enc.get_feature_names(mls_cat)
encode_df.head()

Unnamed: 0,home_Atlanta United FC,home_Austin FC,home_Chicago Fire FC,home_Chivas USA,home_Colorado Rapids,home_Columbus Crew,home_Columbus Crew SC,home_DC United,home_FC Cincinnati,home_FC Dallas,home_Houston Dynamo,home_Houston Dynamo FC,home_Inter Miami CF,home_LA Galaxy,home_LAFC,home_Minnesota United FC,home_Montreal Impact,home_Nashville SC,home_New England Revolution,home_New York City FC,home_New York Red Bulls,home_Orlando City SC,home_Philadelphia Union,home_Portland Timbers,home_Real Salt Lake,home_San Jose Earthquakes,home_Seattle Sounders FC,home_Sporting Kansas City,home_Toronto FC,home_Vancouver Whitecaps,away_Atlanta United FC,away_Austin FC,away_CF MontrÃ©al,away_Chicago Fire FC,away_Chivas USA,away_Colorado Rapids,away_Columbus Crew,away_Columbus Crew SC,away_DC United,away_FC Cincinnati,away_FC Dallas,away_Houston Dynamo,away_Houston Dynamo FC,away_Inter Miami CF,away_LA Galaxy,away_LAFC,away_Minnesota United FC,away_Montreal Impact,away_Nashville SC,away_New England Revolution,away_New York City FC,away_New York Red Bulls,away_Orlando City SC,away_Philadelphia Union,away_Portland Timbers,away_Real Salt Lake,away_San Jose Earthquakes,away_Seattle Sounders FC,away_Sporting Kansas City,away_Toronto FC,away_Vancouver Whitecaps,venue_Allianz Field,"venue_Allianz Field, Minnesota",venue_Audi Field,"venue_Audi Field, Washington DC",venue_BBVA Stadium,"venue_BBVA Stadium, Houston, TX","venue_BC Place, Vancouver",venue_BMO Field,venue_Banc of California Stadium,"venue_Banc of California Stadium, Los Angeles",venue_Bobby Dodd Stadium,venue_Buck Shaw Stadium,"venue_Camping World Stadium, Orlando","venue_CenturyLink Field, Seattle","venue_Children's Mercy Park, Kansas City","venue_Children's Mercy Park, Kansas City, KS",venue_Citi Field Stadium,"venue_DRV PNK Stadium, Fort Lauderdale, FL","venue_Dick's Sporting Goods Park, Commerce City, CO","venue_Dick's Sporting Goods Park, Denver","venue_Dignity Health Sports Park, Carson","venue_Dignity Health Sports 
Park, Los Angeles",venue_Earthquakes Stadium,venue_Exploria Stadium,"venue_Exploria Stadium, Orlando, Florida",venue_FedExField,venue_Gillette Stadium,"venue_Historic Crew Stadium, Columbus, OH",venue_Inter Miami CF Stadium,venue_Levi's Stadium,"venue_Lower.com Field, Columbus, OH","venue_Lumen Field, Seattle","venue_MAPFRE Stadium, Columbus","venue_Maryland SoccerPlex, Germantown",venue_Mercedes-Benz Stadium,"venue_Mercedes-Benz Stadium, Atlanta, Georgia",venue_Navy-Marine Corps Memorial Stadium,venue_Nippert Stadium,"venue_Nissan Stadium, Nashville","venue_Nissan Stadium, Nashville, Nashville, TN","venue_PayPal Park, San Jose",venue_Pratt & Whitney Stadium at Rentschler Field,"venue_Providence Park, Portland","venue_Q2 Stadium, Austin","venue_RFK Stadium, Washington, D.C.",venue_Red Bull Arena,"venue_Red Bull Arena, Harrison","venue_Rio Tinto Stadium, Sandy, UT","venue_Rio Tinto Stadium, Utah",venue_Rogers Centere,venue_SeatGeek Stadium,"venue_Soldier Field, Chicago",venue_Stade Olympique,venue_Stade Saputo,venue_Stanford Stadium,"venue_Subaru Park, Chester","venue_Subaru Park, Philadelphia",venue_TCF Bank Stadium,"venue_TQL Stadium, Cincinnati, OH",venue_Toyota Field,venue_Toyota Stadium,"venue_Toyota Stadium, Frisco, TX",venue_Yankee Stadium,game_status_AET,game_status_AET\nAgg. 2â1,game_status_AET\nAgg. 3â4,game_status_AET\nAgg. 4â3,game_status_AET\nAgg. 7â5,game_status_FT,game_status_FT\nAgg. 0â1,game_status_FT\nAgg. 0â2,game_status_FT\nAgg. 0â7,game_status_FT\nAgg. 1â0,game_status_FT\nAgg. 1â1,game_status_FT\nAgg. 1â2,game_status_FT\nAgg. 1â3,game_status_FT\nAgg. 2â0,game_status_FT\nAgg. 2â1,game_status_FT\nAgg. 2â2,game_status_FT\nAgg. 2â3,game_status_FT\nAgg. 2â4,game_status_FT\nAgg. 2â5,game_status_FT\nAgg. 3â1,game_status_FT\nAgg. 3â4,game_status_FT\nAgg. 3â5,game_status_FT\nAgg. 4â1,game_status_FT\nAgg. 4â3,game_status_FT\nAgg. 5â0,game_status_FT\nAgg. 5â3,game_status_FT\nAgg. 
7â3,game_status_FT-Pens,game_status_FT-Pens\nAgg. 1â1,game_status_FT-Pens\nAgg. 4â4,game_status_Shootout\nAgg. 3â3,Outcome_Loss,Outcome_Tie,Outcome_Win
0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


Now that our categorical variables have been encoded, they are ready to replace the unencoded categorical variables in our dataset using the *pandas* **merge** and **drop** methods.

In [47]:
# Merge the one-hot encoded features (joined on the row index) and drop the
# original categorical columns.
# `columns=` replaces the positional axis argument, which pandas 2.0 no longer
# accepts, and the labels are passed as a plain list of names instead of a
# DataFrame slice.
# NOTE: this cell rebinds `mls_matches`, so it can only be run once per load;
# re-running it fails because the categorical columns are already gone.
mls_matches = (
    mls_matches
    .merge(encode_df, left_index=True, right_index=True)
    .drop(columns=mls_cat)
)
mls_matches.head()

Unnamed: 0.1,index,Unnamed: 0,id,year,attendance,shootout,home_score,away_score,home_possessionPct,away_possessionPct,home_foulsCommitted,away_foulsCommitted,home_yellowCards,away_yellowCards,home_redCards,away_redCards,home_offsides,away_offsides,home_wonCorners,away_wonCorners,home_saves,away_saves,home_Atlanta United FC,home_Austin FC,home_Chicago Fire FC,home_Chivas USA,home_Colorado Rapids,home_Columbus Crew,home_Columbus Crew SC,home_DC United,home_FC Cincinnati,home_FC Dallas,home_Houston Dynamo,home_Houston Dynamo FC,home_Inter Miami CF,home_LA Galaxy,home_LAFC,home_Minnesota United FC,home_Montreal Impact,home_Nashville SC,home_New England Revolution,home_New York City FC,home_New York Red Bulls,home_Orlando City SC,home_Philadelphia Union,home_Portland Timbers,home_Real Salt Lake,home_San Jose Earthquakes,home_Seattle Sounders FC,home_Sporting Kansas City,home_Toronto FC,home_Vancouver Whitecaps,away_Atlanta United FC,away_Austin FC,away_CF MontrÃ©al,away_Chicago Fire FC,away_Chivas USA,away_Colorado Rapids,away_Columbus Crew,away_Columbus Crew SC,away_DC United,away_FC Cincinnati,away_FC Dallas,away_Houston Dynamo,away_Houston Dynamo FC,away_Inter Miami CF,away_LA Galaxy,away_LAFC,away_Minnesota United FC,away_Montreal Impact,away_Nashville SC,away_New England Revolution,away_New York City FC,away_New York Red Bulls,away_Orlando City SC,away_Philadelphia Union,away_Portland Timbers,away_Real Salt Lake,away_San Jose Earthquakes,away_Seattle Sounders FC,away_Sporting Kansas City,away_Toronto FC,away_Vancouver Whitecaps,venue_Allianz Field,"venue_Allianz Field, Minnesota",venue_Audi Field,"venue_Audi Field, Washington DC",venue_BBVA Stadium,"venue_BBVA Stadium, Houston, TX","venue_BC Place, Vancouver",venue_BMO Field,venue_Banc of California Stadium,"venue_Banc of California Stadium, Los Angeles",venue_Bobby Dodd Stadium,venue_Buck Shaw Stadium,"venue_Camping World Stadium, Orlando","venue_CenturyLink Field, Seattle","venue_Children's Mercy Park, Kansas 
City","venue_Children's Mercy Park, Kansas City, KS",venue_Citi Field Stadium,"venue_DRV PNK Stadium, Fort Lauderdale, FL","venue_Dick's Sporting Goods Park, Commerce City, CO","venue_Dick's Sporting Goods Park, Denver","venue_Dignity Health Sports Park, Carson","venue_Dignity Health Sports Park, Los Angeles",venue_Earthquakes Stadium,venue_Exploria Stadium,"venue_Exploria Stadium, Orlando, Florida",venue_FedExField,venue_Gillette Stadium,"venue_Historic Crew Stadium, Columbus, OH",venue_Inter Miami CF Stadium,venue_Levi's Stadium,"venue_Lower.com Field, Columbus, OH","venue_Lumen Field, Seattle","venue_MAPFRE Stadium, Columbus","venue_Maryland SoccerPlex, Germantown",venue_Mercedes-Benz Stadium,"venue_Mercedes-Benz Stadium, Atlanta, Georgia",venue_Navy-Marine Corps Memorial Stadium,venue_Nippert Stadium,"venue_Nissan Stadium, Nashville","venue_Nissan Stadium, Nashville, Nashville, TN","venue_PayPal Park, San Jose",venue_Pratt & Whitney Stadium at Rentschler Field,"venue_Providence Park, Portland","venue_Q2 Stadium, Austin","venue_RFK Stadium, Washington, D.C.",venue_Red Bull Arena,"venue_Red Bull Arena, Harrison","venue_Rio Tinto Stadium, Sandy, UT","venue_Rio Tinto Stadium, Utah",venue_Rogers Centere,venue_SeatGeek Stadium,"venue_Soldier Field, Chicago",venue_Stade Olympique,venue_Stade Saputo,venue_Stanford Stadium,"venue_Subaru Park, Chester","venue_Subaru Park, Philadelphia",venue_TCF Bank Stadium,"venue_TQL Stadium, Cincinnati, OH",venue_Toyota Field,venue_Toyota Stadium,"venue_Toyota Stadium, Frisco, TX",venue_Yankee Stadium,game_status_AET,game_status_AET\nAgg. 2â1,game_status_AET\nAgg. 3â4,game_status_AET\nAgg. 4â3,game_status_AET\nAgg. 7â5,game_status_FT,game_status_FT\nAgg. 0â1,game_status_FT\nAgg. 0â2,game_status_FT\nAgg. 0â7,game_status_FT\nAgg. 1â0,game_status_FT\nAgg. 1â1,game_status_FT\nAgg. 1â2,game_status_FT\nAgg. 1â3,game_status_FT\nAgg. 2â0,game_status_FT\nAgg. 2â1,game_status_FT\nAgg. 2â2,game_status_FT\nAgg. 
2â3,game_status_FT\nAgg. 2â4,game_status_FT\nAgg. 2â5,game_status_FT\nAgg. 3â1,game_status_FT\nAgg. 3â4,game_status_FT\nAgg. 3â5,game_status_FT\nAgg. 4â1,game_status_FT\nAgg. 4â3,game_status_FT\nAgg. 5â0,game_status_FT\nAgg. 5â3,game_status_FT\nAgg. 7â3,game_status_FT-Pens,game_status_FT-Pens\nAgg. 1â1,game_status_FT-Pens\nAgg. 4â4,game_status_Shootout\nAgg. 3â3,Outcome_Loss,Outcome_Tie,Outcome_Win
0,0,1797,336076,2012,14746.0,False,2,0,0.53,0.47,11,14,1,3,0,0,2,4,8,4,3,3,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,1,1798,336077,2012,21000.0,False,2,0,0.5,0.5,13,26,1,2,0,0,2,2,4,4,7,3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,2,1799,336078,2012,16314.0,False,0,1,0.43,0.57,12,16,1,1,0,0,3,0,3,5,7,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
3,3,1800,336080,2012,27000.0,False,1,3,0.49,0.51,14,10,0,2,0,0,3,4,7,2,2,3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
4,4,1801,336079,2012,10525.0,False,1,0,0.52,0.48,17,14,2,1,0,0,2,3,3,3,1,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


We need to **split** our **training** and **testing** data *before* fitting our **StandardScaler** instance. This <u> prevents testing data from influencing the standardization </u> function.

To build our training and testing datasets, we need to separate two values:

input values (which are our *independent variables* commonly referred to as **model features or "X"**) and **target output** ( *dependent variable* commonly referred to as **target or "y"** in TensorFlow documentation).

We want to build a model that will predict whether or not the home team wins; therefore, we must separate the `Outcome_Win` column from the rest of the input data. The `Outcome_Loss` and `Outcome_Tie` columns are one-hot encodings of the same `Outcome` variable as the target, so they would reveal the answer to the model and must be dropped from the features as well.

In [49]:
# Split our preprocessed data into our features and target arrays.
# The target is the one-hot "Win" indicator; all three Outcome_* columns are
# removed from the features because together they encode the target.
# `columns=` replaces the positional axis argument, which pandas 2.0 no longer accepts.
# NOTE(review): `home_score`/`away_score` (and the `index`, `Unnamed: 0`, `id`
# identifier columns) are still part of X. The final scores appear to determine
# the outcome directly, so this looks like target leakage -- confirm whether
# they should be excluded before trusting the reported accuracy.
y = mls_matches["Outcome_Win"].values
X = mls_matches.drop(columns=["Outcome_Win", "Outcome_Loss", "Outcome_Tie"]).values

# Split the preprocessed data into a training and testing dataset
# (fixed random_state keeps the split reproducible)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=78)

Now that our training and testing data have been allocated, we're ready to build our **StandardScaler** object and standardize the numerical features.

In [50]:
# Build the scaler and learn its statistics from the training split only
scaler = StandardScaler()
X_scaler = scaler.fit(X_train)

# Apply the fitted scaler to both splits
X_train_scaled, X_test_scaled = (
    X_scaler.transform(split) for split in (X_train, X_test)
)

Our data is now **preprocessed** via one-hot encoding and standardization. 

### Define Neural Network model

For our **input layer**, we must add the **number of input features equal to the number of variables in our feature** DataFrame.

For the **hidden layers**, we'll add **three hidden layers** with 300, 100 and 10 neurons respectively. Each additional hidden layer is created by adding another **Keras Dense** layer while defining our model. All of our hidden layers will use the **relu activation** function to identify nonlinear characteristics from the input values.

In the **output layer**, we'll use the `sigmoid` activation function that will help us predict the probability that a team is winning or not.

In [51]:
# Number of feature columns in the training matrix
X_train.shape[1]

177

In [54]:
# Deep neural net: layer widths
number_input_features = len(X_train[0])
hidden_nodes_layer1 = 300
hidden_nodes_layer2 = 100
hidden_nodes_layer3 = 10

# Assemble the network in one go: three relu hidden layers followed by a
# single sigmoid output unit.
nn = tf.keras.models.Sequential(
    [
        # First hidden layer (also declares the input width)
        tf.keras.layers.Dense(
            units=hidden_nodes_layer1,
            input_dim=number_input_features,
            activation="relu",
        ),
        # Second hidden layer
        tf.keras.layers.Dense(units=hidden_nodes_layer2, activation="relu"),
        # Third hidden layer
        tf.keras.layers.Dense(units=hidden_nodes_layer3, activation="relu"),
        # Output layer
        tf.keras.layers.Dense(units=1, activation="sigmoid"),
    ]
)

# Print the layer-by-layer structure
nn.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_4 (Dense)              (None, 300)               53400     
_________________________________________________________________
dense_5 (Dense)              (None, 100)               30100     
_________________________________________________________________
dense_6 (Dense)              (None, 10)                1010      
_________________________________________________________________
dense_7 (Dense)              (None, 1)                 11        
Total params: 84,521
Trainable params: 84,521
Non-trainable params: 0
_________________________________________________________________


In [55]:
# Configure training: Adam optimizer, binary cross-entropy loss, accuracy metric
nn.compile(optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"])

# Fit on the scaled training split for 100 epochs
fit_model = nn.fit(x=X_train_scaled, y=y_train, epochs=100)

# Score the trained model on the held-out scaled test split
model_loss, model_accuracy = nn.evaluate(x=X_test_scaled, y=y_test, verbose=2)
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

In [53]:
# Recompile the existing `nn` model with a mean-squared-error loss.
# NOTE(review): the earlier note on this cell mentioned a "linear" output
# activation, but nothing here changes the model's layers, and recompiling
# presumably keeps whatever weights `nn` already has -- confirm whether a
# freshly built model with a linear output layer was intended.
nn.compile(optimizer="adam", loss="mean_squared_error", metrics=["accuracy"])

# Fit on the scaled training split for 100 epochs
fit_model = nn.fit(x=X_train_scaled, y=y_train, epochs=100)

# Score the model on the held-out scaled test split
model_loss, model_accuracy = nn.evaluate(x=X_test_scaled, y=y_test, verbose=2)
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78