# Feature Spec

In [1]:
# mixture of libs for web scraping, parsing and pandas
from bs4 import BeautifulSoup
import datetime as dt
import io
import json
import numpy as np
import os
import pandas as pd
import requests
import seaborn as sns
import sqlite3
from urllib.request import Request, urlopen
import warnings

from epl.query import create_and_query, create_conn, query_creator, query_db

# notebook-wide settings
# lift the cap on displayed columns so wide frames are shown in full
pd.set_option('display.max_columns', None)
# install a blanket 'ignore' filter so no warnings are shown in cell output
warnings.filterwarnings(action='ignore')

Setup:
 - Maintain 2 historical tables:
  - 1 for match results and related data
  - 1 for features, i.e. derived stats used to help with prediction, e.g. avg shots on target in the last $n$ games
 - Features table will need to sustain 2 kinds of updates:
  - More match data becomes available - need to update the features for those matches so we can use them to continuously train the model
  - New features are added, need to compute the feature for all matches where possible in the data
  
Steps:
 - Every time the match table is updated, check to see if there are any new matches, or just old matches
 - For any new matches, call the current feature list on them to compute the features:
  - If there are new features added to the list, then exclude them for now
 - Once we have computed the features for the new matches, then we append that into the features table with the key data (to join)
 - Then we check for new features: if we have new requested features that don't have columns then we compute that for the whole set of matches

__Each column will return a dict: the col_header as the key and the list of values as the value__

In [5]:
# columns that, taken together, uniquely identify a single match
key_cols = [
    'Date',
    'Country',
    'Div',
    'Season',
    'HomeTeam',
    'AwayTeam',
]
# raw match columns we want available for feature engineering
ft_cols = [
    'FTHG',
    'FTAG',
    'FTR',
    'HS',
    'AS',
    'HST',
    'AST',
]
# full column selection: identifying columns first, then the feature inputs
all_cols = [*key_cols, *ft_cols]

# where-clause spec (column -> [operator, value]); England only for now
wc = dict(Country=['=', 'england'])

In [8]:
# fetch the selected columns from the `matches` table, filtered by the where-clause
df = create_and_query(
    'matches',
    cols=all_cols,
    wc=wc,
)
# show the final 10 rows as a quick check on what came back
df.tail(n=10)

Running query: SELECT Date, Country, Div, Season, HomeTeam, AwayTeam, FTHG, FTAG, FTR, HS, [AS], HST, AST FROM matches WHERE Country = 'england'


Unnamed: 0,Date,Country,Div,Season,HomeTeam,AwayTeam,FTHG,FTAG,FTR,HS,AS,HST,AST
62124,2020-11-28,england,EC,2021,Weymouth,Chesterfield,1.0,2.0,A,,,,
62125,2020-12-01,england,EC,2021,Chesterfield,Aldershot,0.0,0.0,D,,,,
62126,2020-12-01,england,EC,2021,Kings Lynn,Bromley,1.0,4.0,A,,,,
62127,2020-12-01,england,EC,2021,Solihull,Hartlepool,2.0,0.0,H,,,,
62128,2020-12-01,england,EC,2021,Wealdstone,Torquay,1.0,2.0,A,,,,
62129,2020-12-01,england,EC,2021,Woking,Sutton,0.0,1.0,A,,,,
62130,2020-12-01,england,EC,2021,Wrexham,Altrincham,0.0,1.0,A,,,,
62131,2020-12-01,england,EC,2021,Yeovil,Eastleigh,1.0,3.0,A,,,,
62132,2020-12-02,england,EC,2021,Dag and Red,Notts County,0.0,0.0,D,,,,
62133,2020-12-02,england,EC,2021,Halifax,Barnet,5.0,2.0,H,,,,


In [9]:
# print a per-column summary (dtype and non-null count) of the queried frame
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 62134 entries, 0 to 62133
Data columns (total 13 columns):
 #   Column    Non-Null Count  Dtype         
---  ------    --------------  -----         
 0   Date      62134 non-null  datetime64[ns]
 1   Country   62134 non-null  object        
 2   Div       62134 non-null  object        
 3   Season    62134 non-null  object        
 4   HomeTeam  62134 non-null  object        
 5   AwayTeam  62134 non-null  object        
 6   FTHG      62134 non-null  float64       
 7   FTAG      62134 non-null  float64       
 8   FTR       62134 non-null  object        
 9   HS        45893 non-null  float64       
 10  AS        45896 non-null  float64       
 11  HST       45893 non-null  float64       
 12  AST       45896 non-null  float64       
dtypes: datetime64[ns](1), float64(6), object(6)
memory usage: 6.2+ MB
