In [73]:
from sklearn.linear_model import LinearRegression
from statsmodels.tsa.deterministic import DeterministicProcess
import pandas as pd
import preprocessing
import numpy as np
import pandas as pd


# time series features sorta
# trend feature for all of the numerical columns using a dummy dimeansion of 10
def dummy_feature_for_num(data):
    num_data=data.select_dtypes('number')
    dp=DeterministicProcess(
        index=data.date_dummy,
        constant=True,
        order=10,
        drop=True)
    
    for column in num_data.columns:
        if column!='time_dummy':
            y=data[column]
            x=dp.in_sample()
            lr=LinearRegression()
            lr.fit(x,y)
            feature=lr.predict(x)
            string=column+'_tnd'
            data[string]=feature
    return data



# another interesting thing is the motion of avarage variance in any attribute - seems so much rigurous and seems to contain a lot of seasonal changes
# i am gonna treat it as a separate feature and then i'll estimate this feature over time dummy using linear regression
# more like variance trend


def estimate_variance_of_features(data,features,keep_both_features=None):
    """we wil make variance feature first and then we will estimate that feature using regression 
        you can either keep both the variance feature and its estimate or just keep the esrimate as a feature ain the out"""
    var_est=pd.DataFrame()
    var=pd.DataFrame()
    for feature in features:
        var_of_feature=data[feature].rolling(
            window=380, #assuming feature could be having a football seasons season
            center=True,
            min_periods=191,
        ).var()
        string=feature+'_var'
        var[string]=var_of_feature
        dp=DeterministicProcess(
            index=data.date_dummy,
            constant=True,
            order=10,
            drop=True
        )
        x=dp.in_sample()
        y=var_of_feature
        fill=y.mean()
        y.fillna(fill,inplace=True)
        lr=LinearRegression()
        lr.fit(x,y)
        est_var_feature=lr.predict(x)
        string=feature+'_var_'+'tnd'
        var_est[string]=est_var_feature

    var_est.reset_index(drop=True)
    if keep_both_features==True:
        all_faetures=var_est.join(var.reset_index(drop=True))
    else:
        all_faetures=var_est
    return all_faetures


class RollingFeatures:
    """every team  to its current rolling values of features home and away combined"""
    home_features=['season','datetime','date_dummy','hometeam','ftr','htr','hthg','fthg','hs','hst','hc','hf','hy','hr','referee','week','dayofseason','day']
    away_features=['season','datetime','date_dummy','awayteam','ftr','htr','htag','ftag','as','ast','ac','af','ay','ar','referee','week','dayofseason','day']
    general_features=['season','datetime','date_dummy','team','ftr','htr','htg','ftg','s','st','c','f','y','r','referee','week','dayofseason','day']
    roll_features = ['htg','ftg','s','st','c','f','y','r']

    def __init__(self,data_path):
        self.data=preprocessing.process_reg(data_path)

    def f_one(self):
        home_data=self.data[self.home_features]
        away_data=self.data[self.away_features]
        home_data.columns=self.general_features
        away_data.columns=self.general_features
        identification=pd.concat([pd.Series(np.ones(home_data.shape[0])),pd.Series(np.zeros(away_data.shape[0]))],axis=0)
        identification.reset_index(drop=True,inplace=True) #identifying home and away teams
        data=pd.concat([home_data,away_data],axis=0) #concatting home aand away 
        data.reset_index(drop=True,inplace=True)
        data['identification']=identification
        data.sort_values(['team','date_dummy'],ascending=True,inplace=True)
        data.reset_index(drop=True,inplace=True)
        return data
        
    def rolling_features(self,data):
        """function calculates the rolling mean of the features and returns a dataframe with the rolling mean of the features"""
        rolling_features=pd.DataFrame()
        
        for feature in self.roll_features:
            name=feature+'_rol'
            blank_frame=pd.DataFrame()
            blank_frame[name]=data.groupby('team')[feature].rolling(window=38,center=True,min_periods=20).mean().shift(-1).fillna(method='ffill')
            blank_frame.index=blank_frame.index.get_level_values(1)
            rolling_features[name]=blank_frame[name]
        return rolling_features,


    def f_two(self,rol_data):
        new_data=self.f_one()
        new_data=new_data.join(rol_data)
        n_home_data=new_data[new_data.identification==1]
        n_away_data=new_data[new_data.identification==0]
        n_home_data.drop(['identification'],axis=1,inplace=True)
        n_away_data.drop(['identification'],axis=1,inplace=True)
        
        n_home_data.rename(columns={i:i+'(H)' for i in self.roll_features+['team']},inplace=True)
        n_home_data.rename(columns={i:i+'(H)' for i in [i+'_rol' for i in self.roll_features]},inplace=True)
        
        n_away_data.rename(columns={i:i+'(A)' for i in self.roll_features+['team']},inplace=True)
        n_away_data.rename(columns={i:i+'(A)' for i in [i+'_rol' for i in self.roll_features]},inplace=True)
        
        n_home_data.sort_values(['datetime','week','dayofseason','day','team(H)','referee'],ascending=True,inplace=True)
        n_away_data.sort_values(['datetime','week','dayofseason','day','team(A)','referee'],ascending=True,inplace=True)
        n_home_data.reset_index(drop=True,inplace=True)
        n_away_data.reset_index(drop=True,inplace=True)
        valid_features=[i for i in list(n_away_data.columns) if '(A)' in i]
        final_data=n_home_data.join(n_away_data[valid_features])
        return final_data
    

    def excecute(self):
        trans_data=self.f_one()
        rol_data=self.rolling_features(trans_data)
        final_data=self.f_two(rol_data)
        return final_data


def feature_selection(data):
    features=data.select_dtypes(include=['float64'])
    features=features.drop(['fthg','ftag'],axis=1) #select variables
    targets=data[['fthg','ftag']]
    return features,targets

In [74]:
import pandas as pd
from sklearn.preprocessing import OrdinalEncoder
import datetime as dt
from sklearn.model_selection import train_test_split
import numpy as np

def season_datetime_fix(data):
    """function creates datetime features. faeatures- a date dummy as if date is starting from begginning of years  
    match week, match day of saeason, 
    Args:
        data (dataframe): data
    """
    data[['week','dayofseason','date_dummy']]= "" 
    data.datetime=pd.to_datetime(data.datetime).dt.date
    for season in data.season.unique():
        season_data=data[data['season']==season]
        start=season_data.iloc[0].datetime
        year=start.year
        delta=start-dt.date(year,1,1)
        season_data['date_dummy']=season_data.datetime-delta
        season_data.week=season_data.date_dummy.apply(lambda x:x.isocalendar()[1])
        season_data.dayofseason=season_data.date_dummy.apply(lambda x: x.timetuple().tm_yday)
        data[data.season==season]=season_data
    data['day']=data['datetime'].apply(lambda x:x.isoweekday())
    return data


def process_reg(data_path,dt_features=None):
    "features added aand rmoved and asplit the data  perticuarly for a regression setup"

    data=pd.read_csv(data_path,encoding='windows-1254')
    data.columns=[i.lower() for i in data.columns]
    # infact i have to drop all all seaons until 2000-01 - caontaains null
    data.drop(data.loc[data['season'].str.contains('1993|1994|1995|1996|1997|1998|1999',regex=True)].index,inplace=True)
    data=season_datetime_fix(data)
    data['week']=data.week.astype(int)
    data['dayofseason']=data.dayofseason.astype(int)
    data['fthg']=data['fthg'].astype(float)
    data['ftag']=data['ftag'].astype(float)
    data.sort_values('date_dummy',inplace=True) 
    return data


def postprocess(data):
    """imutation,scaling,ordinal encoding ect.."""
    pass

In [75]:
import pandas as pd

from sklearn import model_selection
def splits(data):
    chunk1,chunk2=model_selection.train_test_split(data,test_size=0.2,random_state=42)
    return chunk1,chunk2

In [76]:
import warnings
warnings.simplefilter('ignore')


data_path=r"epl_23-f.csv"
data=RollingFeatures(data_path=data_path).excecute()
# data=preprocessing.postprocess(data)
train,test=splits(data)
train,val=splits(train)

train



Unnamed: 0,season,datetime,date_dummy,team(H),ftr,htr,htg(H),ftg(H),s(H),st(H),...,y(A),r(A),htg_rol(A),ftg_rol(A),s_rol(A),st_rol(A),c_rol(A),f_rol(A),y_rol(A),r_rol(A)
4757,2012-13,2012-12-30,2012-05-14,QPR,A,A,0.0,0.0,14.0,7.0,...,1.0,0.0,0.815789,1.894737,17.921053,10.236842,7.736842,9.789474,1.342105,0.026316
3562,2009-10,2009-12-05,2009-04-23,Man City,H,D,1.0,2.0,13.0,8.0,...,6.0,0.0,1.105263,2.552632,18.184211,9.921053,7.421053,11.131579,1.315789,0.105263
7580,2019-20,2020-07-18,2019-12-11,Norwich,A,A,0.0,0.0,6.0,2.0,...,0.0,0.0,0.315789,0.789474,9.789474,3.157895,4.026316,10.842105,1.552632,0.000000
309,2000-01,2001-04-07,2000-08-19,Leeds,H,H,1.0,2.0,15.0,7.0,...,0.0,0.0,0.421053,0.921053,10.394737,4.236842,4.526316,14.605263,1.131579,0.131579
2437,2006-07,2006-12-09,2006-04-23,Blackburn,A,A,0.0,1.0,13.0,7.0,...,4.0,0.0,0.315789,0.842105,9.236842,4.368421,4.552632,11.184211,1.605263,0.105263
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
788,2002-03,2002-08-31,2002-01-15,Man City,H,H,2.0,3.0,7.0,6.0,...,1.0,0.0,0.421053,0.921053,9.473684,4.973684,5.578947,12.710526,1.657895,0.026316
7311,2019-20,2019-10-26,2019-03-20,Brighton,H,D,1.0,3.0,8.0,4.0,...,1.0,0.0,0.473684,1.026316,11.605263,3.763158,5.052632,11.078947,1.842105,0.078947
7349,2019-20,2019-11-25,2019-04-19,Aston Villa,H,H,2.0,2.0,17.0,8.0,...,1.0,0.0,0.526316,1.157895,10.868421,3.763158,4.026316,9.605263,1.684211,0.078947
1619,2004-05,2004-10-30,2004-03-18,Arsenal,D,D,0.0,2.0,14.0,9.0,...,1.0,0.0,0.605263,1.157895,10.184211,5.210526,5.736842,15.131579,1.578947,0.052632


In [81]:
train.sort_values(['team(H)','date_dummy','team(A)'],inplace=True,ascending=True)

In [89]:
df=train[train['team(H)']=='Arsenal']
df.sort_values('date_dummy',inplace=True,ascending=True)
df[['ftg_rol(H)','ftg(H)','team(H)','team(A)','ftg(A)']]

Unnamed: 0,ftg_rol(H),ftg(H),team(H),team(A),ftg(A)
10,1.809524,2.0,Arsenal,Liverpool,0.0
49,1.640000,2.0,Arsenal,Bradford,0.0
99,1.566667,5.0,Arsenal,Charlton,1.0
119,1.656250,0.0,Arsenal,Charlton,0.0
159,1.694444,5.0,Arsenal,Aston Villa,1.0
...,...,...,...,...,...
8190,1.607143,0.0,Arsenal,Brighton,1.0
8227,1.720000,2.0,Arsenal,Wolves,1.0
8254,1.739130,2.0,Arsenal,Aston Villa,1.0
8261,1.727273,0.0,Arsenal,Liverpool,2.0


In [79]:
train.columns

Index(['season', 'datetime', 'date_dummy', 'team(H)', 'ftr', 'htr', 'htg(H)',
       'ftg(H)', 's(H)', 'st(H)', 'c(H)', 'f(H)', 'y(H)', 'r(H)', 'referee',
       'week', 'dayofseason', 'day', 'htg_rol(H)', 'ftg_rol(H)', 's_rol(H)',
       'st_rol(H)', 'c_rol(H)', 'f_rol(H)', 'y_rol(H)', 'r_rol(H)', 'team(A)',
       'htg(A)', 'ftg(A)', 's(A)', 'st(A)', 'c(A)', 'f(A)', 'y(A)', 'r(A)',
       'htg_rol(A)', 'ftg_rol(A)', 's_rol(A)', 'st_rol(A)', 'c_rol(A)',
       'f_rol(A)', 'y_rol(A)', 'r_rol(A)'],
      dtype='object')