## Import necessary Dependencies

In [36]:
# Import our dependencies
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler,OneHotEncoder
import pandas as pd
import tensorflow as tf

import numpy as np
from matplotlib import pyplot as plt
import math
import statistics as stat

import sqlalchemy
from sqlalchemy import create_engine
import psycopg2
from config import db_password

import datetime
from datetime import date
from datetime import time

import re

## Read Kaggle data from the cleaned mls_data_final.csv file

Import the clean `mls_data_final.csv` (originally from Kaggle) data on major league soccer (MLS).


In [39]:
# Read .csv from FinalProject/Resources
file_dir = "mls_data_final.csv"    

# Create a pandas DataFrame 
game_df = pd.read_csv(file_dir, low_memory=False, skipinitialspace = True)

# Show all columns
pd.set_option("display.max_columns", None)

game_df

Unnamed: 0,id,home,away,date,day,year,venue,home_score,away_score,Outcome,home_goal_scorers,away_goal_scorers,home_possessionPct,away_possessionPct,average_away_possession_from_previous_year,home_total_shots,home_shots_on_goal,away_total_shots,average_away_total_shots_previous_year,away_shots_on_goal,average_away_shots_on_goal_previous_year,home_foulsCommitted,away_foulsCommitted,home_yellowCards,away_yellowCards,home_redCards,away_redCards,average_away_redcards_previous_year,home_offsides,away_offsides,home_wonCorners,away_wonCorners,average_away_corners_won_previous_year,home_saves,away_saves
0,237750,Columbus Crew,Toronto FC,"Saturday, March 29",Saturday,2008,MAPFRE Stadium,2,0,Win,Adam Moffat:Alejandro Moreno,,50%,50%,0.00,9.0,6.0,11.0,,4.0,,13.0,8.0,1.0,1.0,0.0,0.0,,2.0,1.0,3.0,7.0,,8.0,5.0
1,237751,Real Salt Lake,Chicago Fire FC,"Saturday, March 29",Saturday,2008,Rice-Eccles Stadium,1,1,Tie,Bakary Soumare,Cuauhtemoc Blanco,51%,49%,0.00,14.0,7.0,6.0,,1.0,,11.0,5.0,3.0,1.0,0.0,0.0,,1.0,1.0,3.0,3.0,,2.0,6.0
2,237753,Sporting Kansas City,DC United,"Saturday, March 29",Saturday,2008,Community America Ballpark,2,0,Win,Ivan Trujillo:Claudio Lopez,,50%,50%,0.00,7.0,3.0,4.0,,3.0,,4.0,3.0,2.0,2.0,0.0,0.0,,1.0,2.0,2.0,6.0,,3.0,1.0
3,238379,New England Revolution,Houston Dynamo FC,"Saturday, March 29",Saturday,2008,Gillette Stadium,3,0,Win,Steve Ralston:Adam Cristman:Sainey Nyassi,,48%,52%,0.00,12.0,8.0,15.0,,3.0,,6.0,15.0,0.0,1.0,0.0,0.0,,2.0,1.0,4.0,5.0,,5.0,8.0
4,237754,Colorado Rapids,LA Galaxy,"Saturday, March 29",Saturday,2008,Dick's Sporting Goods Park,4,0,Win,Terry Cooke:Christian Gomez:Omar Cummings:Coli...,,52%,48%,0.00,11.0,4.0,5.0,,5.0,,9.0,9.0,0.0,2.0,1.0,1.0,,3.0,4.0,5.0,3.0,,9.0,3.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4508,597896,Sporting Kansas City,FC Dallas,"Saturday, July 31",Saturday,2021,Children's Mercy Park,1,2,Loss,Johnny Russell,Paxton Pomykal:Jesús Ferreira,58%,42%,0.49,20.0,4.0,12.0,,5.0,,12.0,14.0,1.0,4.0,0.0,0.0,,2.0,2.0,7.0,2.0,,3.0,3.0
4509,597898,Austin FC,Colorado Rapids,"Saturday, July 31",Saturday,2021,Q2 Stadium,0,1,Loss,,Andre Shinyashiki,57%,43%,0.51,11.0,2.0,16.0,,6.0,,9.0,6.0,2.0,0.0,0.0,0.0,,3.0,3.0,6.0,5.0,,5.0,2.0
4510,597899,Vancouver Whitecaps,Minnesota United FC,"Saturday, July 31",Saturday,2021,Rio Tinto Stadium,2,2,Tie,Cristián Dájome:Cristián Dájome,Ethan Finlay:Robin Lod,52%,48%,0.43,11.0,3.0,13.0,,4.0,,17.0,14.0,3.0,3.0,0.0,0.0,,2.0,3.0,5.0,3.0,,2.0,1.0
4511,597901,Philadelphia Union,Chicago Fire FC,"Sunday, August 1",Sunday,2021,Subaru Park,1,1,Tie,Kai Wagner,Przemyslaw Frankowski,56%,44%,0.52,25.0,5.0,4.0,,2.0,,14.0,8.0,2.0,2.0,0.0,1.0,,2.0,1.0,13.0,1.0,,1.0,4.0


## Data Preprocessing 

Tha source data has already been cleaned but we still need to do some **preprocessing** for our Neural Network Analysis.

In [40]:
# Get all the column names
game_df.columns

Index(['id', 'home', 'away', 'date', 'day', 'year', 'venue', 'home_score',
       'away_score', 'Outcome', 'home_goal_scorers', 'away_goal_scorers',
       'home_possessionPct', 'away_possessionPct',
       'average_away_possession_from_previous_year', 'home_total_shots',
       'home_shots_on_goal', 'away_total_shots',
       'average_away_total_shots_previous_year', 'away_shots_on_goal',
       'average_away_shots_on_goal_previous_year', 'home_foulsCommitted',
       'away_foulsCommitted', 'home_yellowCards', 'away_yellowCards',
       'home_redCards', 'away_redCards', 'average_away_redcards_previous_year',
       'home_offsides', 'away_offsides', 'home_wonCorners', 'away_wonCorners',
       'average_away_corners_won_previous_year', 'home_saves', 'away_saves'],
      dtype='object')

### Feature (X) selection:

There are a lot of columns but we chose to keep only those columns that **information can be know prior** to a game starts.

This should make our prediction realistic.

In [41]:
# Drop columns with data those are not easily known before a game starts.
game_df = game_df.drop(['id', 'date', 'home_goal_scorers', 'away_goal_scorers',
       'home_possessionPct', 'away_possessionPct',
       'home_total_shots', 'home_shots_on_goal', 'away_total_shots',
       'average_away_total_shots_previous_year', 'away_shots_on_goal',
       'average_away_shots_on_goal_previous_year', 'home_foulsCommitted',
       'away_foulsCommitted', 'home_yellowCards', 'away_yellowCards',
       'home_redCards', 'away_redCards', 'average_away_redcards_previous_year',
       'home_offsides', 'away_offsides', 'home_wonCorners', 'away_wonCorners',
       'average_away_corners_won_previous_year', 'home_saves', 'away_saves'], axis='columns')
game_df.head(2)

Unnamed: 0,home,away,day,year,venue,home_score,away_score,Outcome,average_away_possession_from_previous_year
0,Columbus Crew,Toronto FC,Saturday,2008,MAPFRE Stadium,2,0,Win,0.0
1,Real Salt Lake,Chicago Fire FC,Saturday,2008,Rice-Eccles Stadium,1,1,Tie,0.0


In [42]:
game_df.columns

Index(['home', 'away', 'day', 'year', 'venue', 'home_score', 'away_score',
       'Outcome', 'average_away_possession_from_previous_year'],
      dtype='object')

In [43]:
game_df.isnull().sum()

home                                            0
away                                            0
day                                             0
year                                            0
venue                                           1
home_score                                      0
away_score                                      0
Outcome                                         0
average_away_possession_from_previous_year    115
dtype: int64

In [17]:
game_df.dtypes

Unnamed: 0               int64
Unnamed: 0.1             int64
id                       int64
home                    object
away                    object
year                     int64
attendance             float64
venue                   object
away_possessionPct     float64
away_foulsCommitted      int64
away_yellowCards         int64
away_redCards            int64
away_offsides            int64
away_wonCorners          int64
away_saves               int64
Outcome                 object
dtype: object

In [18]:
# No. of venues before removing duplicate venue names
game_df.nunique()

Unnamed: 0             3120
Unnamed: 0.1           3120
id                     3120
home                     30
away                     31
year                     10
attendance             2452
venue                    63
away_possessionPct       57
away_foulsCommitted      29
away_yellowCards          8
away_redCards             4
away_offsides            12
away_wonCorners          18
away_saves               14
Outcome                   3
dtype: int64

### Processing Catagorical Features:

It looks like there are 4 columns with **categorical** values having consistent data type as *object*.

We should **generate a list of categorical variable** names using Python's "df.dtypes" property. In that case, we can use our variable list to perform the **one-hot encoding** *once*, rather than for each individual variable.

In [19]:
# Generate our categorical variable list
game_cat = game_df.dtypes[game_df.dtypes == "object"].index.tolist()
game_cat

['home', 'away', 'venue', 'Outcome']

In [20]:
# Check the number of unique values in each column
game_df[game_cat].nunique()

home       30
away       31
venue      63
Outcome     3
dtype: int64

In [21]:
game_df['home'].unique()

array(['Colorado Rapids', 'Vancouver Whitecaps', 'DC United', 'LA Galaxy',
       'San Jose Earthquakes', 'FC Dallas', 'Chivas USA',
       'Portland Timbers', 'Montreal Impact', 'Sporting Kansas City',
       'Real Salt Lake', 'Seattle Sounders FC', 'Philadelphia Union',
       'Toronto FC', 'Columbus Crew SC', 'New England Revolution',
       'Chicago Fire FC', 'New York Red Bulls', 'Houston Dynamo',
       'Orlando City SC', 'New York City FC', 'Atlanta United FC',
       'Minnesota United FC', 'LAFC', 'FC Cincinnati',
       'Houston Dynamo FC', 'Nashville SC', 'Inter Miami CF',
       'Columbus Crew', 'Austin FC'], dtype=object)

#### Encoding:

Before going to encode these columns using Scikit-learn's OneHotEncoder module, we need to make sure that if the categorical variables require **bucketing**. Since we want to predict outcome for each individual `home` team, we need to see the relation of all individual `home` team, `venue` and `game_status`, we choose not to to any binning/bucketing, and we're ready to use **OneHotEncoder**.

In [22]:
# Create a OneHotEncoder instance
enc = OneHotEncoder(sparse=False)

# Fit and transform the OneHotEncoder using the categorical variable list
encode_df = pd.DataFrame(enc.fit_transform(game_df[game_cat]))
encode_df.head(2)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100,101,102,103,104,105,106,107,108,109,110,111,112,113,114,115,116,117,118,119,120,121,122,123,124,125,126
0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


#### Adding meaningful titles with the encoded columns:

In [23]:
# Add the encoded variable names to the DataFrame
encode_df.columns = enc.get_feature_names(game_cat)
encode_df.head()

Unnamed: 0,home_Atlanta United FC,home_Austin FC,home_Chicago Fire FC,home_Chivas USA,home_Colorado Rapids,home_Columbus Crew,home_Columbus Crew SC,home_DC United,home_FC Cincinnati,home_FC Dallas,home_Houston Dynamo,home_Houston Dynamo FC,home_Inter Miami CF,home_LA Galaxy,home_LAFC,home_Minnesota United FC,home_Montreal Impact,home_Nashville SC,home_New England Revolution,home_New York City FC,home_New York Red Bulls,home_Orlando City SC,home_Philadelphia Union,home_Portland Timbers,home_Real Salt Lake,home_San Jose Earthquakes,home_Seattle Sounders FC,home_Sporting Kansas City,home_Toronto FC,home_Vancouver Whitecaps,away_Atlanta United FC,away_Austin FC,away_CF MontrÃ©al,away_Chicago Fire FC,away_Chivas USA,away_Colorado Rapids,away_Columbus Crew,away_Columbus Crew SC,away_DC United,away_FC Cincinnati,away_FC Dallas,away_Houston Dynamo,away_Houston Dynamo FC,away_Inter Miami CF,away_LA Galaxy,away_LAFC,away_Minnesota United FC,away_Montreal Impact,away_Nashville SC,away_New England Revolution,away_New York City FC,away_New York Red Bulls,away_Orlando City SC,away_Philadelphia Union,away_Portland Timbers,away_Real Salt Lake,away_San Jose Earthquakes,away_Seattle Sounders FC,away_Sporting Kansas City,away_Toronto FC,away_Vancouver Whitecaps,venue_Allianz Field,"venue_Allianz Field, Minnesota",venue_Audi Field,"venue_Audi Field, Washington DC",venue_BBVA Stadium,"venue_BBVA Stadium, Houston, TX","venue_BC Place, Vancouver",venue_BMO Field,venue_Banc of California Stadium,"venue_Banc of California Stadium, Los Angeles",venue_Bobby Dodd Stadium,venue_Buck Shaw Stadium,"venue_Camping World Stadium, Orlando","venue_CenturyLink Field, Seattle","venue_Children's Mercy Park, Kansas City","venue_Children's Mercy Park, Kansas City, KS",venue_Citi Field Stadium,"venue_DRV PNK Stadium, Fort Lauderdale, FL","venue_Dick's Sporting Goods Park, Commerce City, CO","venue_Dick's Sporting Goods Park, Denver","venue_Dignity Health Sports Park, Carson","venue_Dignity Health Sports Park, Los Angeles",venue_Earthquakes Stadium,venue_Exploria Stadium,"venue_Exploria Stadium, Orlando, Florida",venue_FedExField,venue_Gillette Stadium,"venue_Historic Crew Stadium, Columbus, OH",venue_Inter Miami CF Stadium,venue_Levi's Stadium,"venue_Lower.com Field, Columbus, OH","venue_Lumen Field, Seattle","venue_MAPFRE Stadium, Columbus","venue_Maryland SoccerPlex, Germantown",venue_Mercedes-Benz Stadium,"venue_Mercedes-Benz Stadium, Atlanta, Georgia",venue_Navy-Marine Corps Memorial Stadium,venue_Nippert Stadium,"venue_Nissan Stadium, Nashville","venue_Nissan Stadium, Nashville, Nashville, TN","venue_PayPal Park, San Jose",venue_Pratt & Whitney Stadium at Rentschler Field,"venue_Providence Park, Portland","venue_Q2 Stadium, Austin","venue_RFK Stadium, Washington, D.C.",venue_Red Bull Arena,"venue_Red Bull Arena, Harrison","venue_Rio Tinto Stadium, Sandy, UT","venue_Rio Tinto Stadium, Utah",venue_Rogers Centere,venue_SeatGeek Stadium,"venue_Soldier Field, Chicago",venue_Stade Olympique,venue_Stade Saputo,venue_Stanford Stadium,"venue_Subaru Park, Chester","venue_Subaru Park, Philadelphia",venue_TCF Bank Stadium,"venue_TQL Stadium, Cincinnati, OH",venue_Toyota Field,venue_Toyota Stadium,"venue_Toyota Stadium, Frisco, TX",venue_Yankee Stadium,Outcome_Loss,Outcome_Tie,Outcome_Win
0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


#### Merge with original DataFrame and drop the Categorical List

Now that our categorical variables have been encoded, they are ready to replace our unencoded categorical variables in our dataset using *panda's* **merge** and **drop** methods.

In [24]:
# Merge OneHotEncoded features and drop the originals from the updated df
game_df = game_df.merge(encode_df, left_index=True, right_index=True).drop(game_df[game_cat],1)
game_df.head()

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,id,year,attendance,away_possessionPct,away_foulsCommitted,away_yellowCards,away_redCards,away_offsides,away_wonCorners,away_saves,home_Atlanta United FC,home_Austin FC,home_Chicago Fire FC,home_Chivas USA,home_Colorado Rapids,home_Columbus Crew,home_Columbus Crew SC,home_DC United,home_FC Cincinnati,home_FC Dallas,home_Houston Dynamo,home_Houston Dynamo FC,home_Inter Miami CF,home_LA Galaxy,home_LAFC,home_Minnesota United FC,home_Montreal Impact,home_Nashville SC,home_New England Revolution,home_New York City FC,home_New York Red Bulls,home_Orlando City SC,home_Philadelphia Union,home_Portland Timbers,home_Real Salt Lake,home_San Jose Earthquakes,home_Seattle Sounders FC,home_Sporting Kansas City,home_Toronto FC,home_Vancouver Whitecaps,away_Atlanta United FC,away_Austin FC,away_CF MontrÃ©al,away_Chicago Fire FC,away_Chivas USA,away_Colorado Rapids,away_Columbus Crew,away_Columbus Crew SC,away_DC United,away_FC Cincinnati,away_FC Dallas,away_Houston Dynamo,away_Houston Dynamo FC,away_Inter Miami CF,away_LA Galaxy,away_LAFC,away_Minnesota United FC,away_Montreal Impact,away_Nashville SC,away_New England Revolution,away_New York City FC,away_New York Red Bulls,away_Orlando City SC,away_Philadelphia Union,away_Portland Timbers,away_Real Salt Lake,away_San Jose Earthquakes,away_Seattle Sounders FC,away_Sporting Kansas City,away_Toronto FC,away_Vancouver Whitecaps,venue_Allianz Field,"venue_Allianz Field, Minnesota",venue_Audi Field,"venue_Audi Field, Washington DC",venue_BBVA Stadium,"venue_BBVA Stadium, Houston, TX","venue_BC Place, Vancouver",venue_BMO Field,venue_Banc of California Stadium,"venue_Banc of California Stadium, Los Angeles",venue_Bobby Dodd Stadium,venue_Buck Shaw Stadium,"venue_Camping World Stadium, Orlando","venue_CenturyLink Field, Seattle","venue_Children's Mercy Park, Kansas City","venue_Children's Mercy Park, Kansas City, KS",venue_Citi Field Stadium,"venue_DRV PNK Stadium, Fort Lauderdale, FL","venue_Dick's Sporting Goods Park, Commerce City, CO","venue_Dick's Sporting Goods Park, Denver","venue_Dignity Health Sports Park, Carson","venue_Dignity Health Sports Park, Los Angeles",venue_Earthquakes Stadium,venue_Exploria Stadium,"venue_Exploria Stadium, Orlando, Florida",venue_FedExField,venue_Gillette Stadium,"venue_Historic Crew Stadium, Columbus, OH",venue_Inter Miami CF Stadium,venue_Levi's Stadium,"venue_Lower.com Field, Columbus, OH","venue_Lumen Field, Seattle","venue_MAPFRE Stadium, Columbus","venue_Maryland SoccerPlex, Germantown",venue_Mercedes-Benz Stadium,"venue_Mercedes-Benz Stadium, Atlanta, Georgia",venue_Navy-Marine Corps Memorial Stadium,venue_Nippert Stadium,"venue_Nissan Stadium, Nashville","venue_Nissan Stadium, Nashville, Nashville, TN","venue_PayPal Park, San Jose",venue_Pratt & Whitney Stadium at Rentschler Field,"venue_Providence Park, Portland","venue_Q2 Stadium, Austin","venue_RFK Stadium, Washington, D.C.",venue_Red Bull Arena,"venue_Red Bull Arena, Harrison","venue_Rio Tinto Stadium, Sandy, UT","venue_Rio Tinto Stadium, Utah",venue_Rogers Centere,venue_SeatGeek Stadium,"venue_Soldier Field, Chicago",venue_Stade Olympique,venue_Stade Saputo,venue_Stanford Stadium,"venue_Subaru Park, Chester","venue_Subaru Park, Philadelphia",venue_TCF Bank Stadium,"venue_TQL Stadium, Cincinnati, OH",venue_Toyota Field,venue_Toyota Stadium,"venue_Toyota Stadium, Frisco, TX",venue_Yankee Stadium,Outcome_Loss,Outcome_Tie,Outcome_Win
0,0,1797,336076,2012,14746.0,0.47,14,3,0,4,4,3,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,1,1798,336077,2012,21000.0,0.5,26,2,0,2,4,3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,2,1799,336078,2012,16314.0,0.57,16,1,0,0,5,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
3,3,1800,336080,2012,27000.0,0.51,10,2,0,4,2,3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
4,4,1801,336079,2012,10525.0,0.48,14,1,0,3,3,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


### Split the data into Training and Testing data:

We need to **split** our **training** and **testing** data *before* fitting our **StandardScaler** instance. This <u> prevents testing data from influencing the standardization </u> function.

To build our training and testing datasets, we need to separate two values:

input values (which are our *independent variables* commonly referred to as **model features or "X"**) and **target output** ( *dependent variable* commonly referred to as **target or "y"** in TensorFlow documentation).

We want to build a model that will predict whether or not a team is winning; therefore, we must separate the `Outcome_win` column from the rest of the input data. Also, the `Outcome_loss`, `Outcome_tie`columns have the same input for all the data, so we can drop this column.

In [25]:
# Split our preprocessed data into our features and target arrays
y = game_df["Outcome_Win"].values
X = game_df.drop(["Outcome_Win","Outcome_Loss", "Outcome_Tie" ], 1).values

# Split the preprocessed data into a training and testing dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=78)

### Standardize the Features:

Now that our training and testing data have been allocated, we're ready to build our **StandardScalerobject** and standardize the numerical features.

In [26]:
# Create a StandardScaler instance
scaler = StandardScaler()

# Fit the StandardScaler
X_scaler = scaler.fit(X_train)

# Scale the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

Our data is now **preprocessed** via one-hot encoding and standardization. 

## Choice of the Machine Learning Model:

Since we're working on major league soccer match data to predict the outcome whether **Win or Not**; the scope of outcome is known. Therefore, **Supervised ML Model** is a good fit for our prediction. Also, our prediction type falls under **Binary Classification**, so we chose to build a Neural Network based **Deep Learning** model for our project.

### Pros:

**Neural Network** based **Deep Learning Model** is a popular choice among maching learning model because of its 

- better **accuracy** in evaluating test data, and 
- ability to model **non-linear** relationship. 

### Cons:

- Although Neural Network model is like a **blackbox and hard to explain**.
- Needs very good quality **clean data**;
- **Prone to over-fitting** model which affects the performance in matching model with real test dataset.

### Define Neural Network model

For our **input layer**, we must add the **number of input features equal to the number of variables in our feature** DataFrame.

In our **hidden layers**, we'll add **three hidden layers** with only a few neurons in each layer. To create the *second hidden layer*, we'll add another **Keras Dense class** while defining our model. All of our hidden layers will use the **relu activation** function to identify nonlinear characteristics from the input values.

In the **output layer**, we'll use the `sigmoid` activation function that will help us predict the probability that a team is winning or not.

In [27]:
len(X_train[0])

136

In [28]:
# Define the model - deep neural net
number_input_features = len(X_train[0])
hidden_nodes_layer1 =  272
hidden_nodes_layer2 = 136
hidden_nodes_layer3 = 44

nn = tf.keras.models.Sequential()

# First hidden layer
nn.add(
    tf.keras.layers.Dense(units=hidden_nodes_layer1, input_dim=number_input_features, activation="relu")
)

# Second hidden layer
nn.add(tf.keras.layers.Dense(units=hidden_nodes_layer2, activation="relu"))

# Third hidden layer
nn.add(tf.keras.layers.Dense(units=hidden_nodes_layer3, activation="relu"))

# Output layer
nn.add(tf.keras.layers.Dense(units=1, activation="sigmoid"))

# Check the structure of the model
nn.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                (None, 272)               37264     
_________________________________________________________________
dense_1 (Dense)              (None, 136)               37128     
_________________________________________________________________
dense_2 (Dense)              (None, 44)                6028      
_________________________________________________________________
dense_3 (Dense)              (None, 1)                 45        
Total params: 80,465
Trainable params: 80,465
Non-trainable params: 0
_________________________________________________________________


### Compile and Evaluate the NN Model:

During compilation, we've tried both `binary_crossentropy` and `mean_squared_error` to measue loss in model evaluation. We found `mean_squared_error` provides a **lower loss** metric for similar percentage of accuracy.

For **optimizer** `adam` is widely used and effective for NN model.

In [29]:
# Compile the model
#nn.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])
nn.compile(loss="mean_squared_error", optimizer="adam", metrics=["accuracy"])

# Train the model
fit_model = nn.fit(X_train_scaled,y_train,epochs=100)

# Evaluate the model using the test data
model_loss, model_accuracy = nn.evaluate(X_test_scaled,y_test,verbose=2)
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

Epoch 82/100
Epoch 83/100
Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100
25/25 - 1s - loss: 0.3779 - accuracy: 0.5769
Loss: 0.3779447376728058, Accuracy: 0.5769230723381042


### Model Evaluation:

It looks like the model is fit pretty well in 100 epochs with less than **1% loss**.

While running on **test-data** the model is evaluated to predict the **Win** outcome of a game **57.6% times accurate**. We got an accuracy of 51% and 53% in our previous models. We went back to our dataset and added some more features prepared from "average grouped" data. This addition along with **optimization** of the model brough us a good increase in accuracy.

This is not a high accuracy metric, but based on our available realistic input data prior to a game starts, this prediction seems to be **fair enough**