In [1]:
import pandas as pd
from sklearn.model_selection import TimeSeriesSplit
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

Cleaning data for usage in SFS

In [2]:
soccer = pd.read_csv("soccerMatches.csv", index_col=0)

In [3]:
soccer = soccer.sort_values("date")
soccer = soccer.reset_index(drop=True)

In [4]:
del soccer["match report"]
del soccer["captain"]
del soccer["notes"]
del soccer["attendance"]

In [5]:
win_map = {'L': 0, 'D': 0, 'W': 1}
soccer["result"] = soccer["result"].replace(win_map)

In [6]:
def target(team):
    team["target"] = team["result"].shift(-1)
    return team

In [7]:
soccer = soccer.groupby("team", group_keys=False).apply(target)
soccer["target"][pd.isnull(soccer["target"])] = 2
soccer["target"] = soccer["target"].astype(int, errors="ignore")

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  soccer["target"][pd.isnull(soccer["target"])] = 2


In [8]:
soccer

Unnamed: 0,date,time,comp,round,day,venue,result,gf,ga,opponent,...,referee,sh,sot,dist,fk,pk,pkatt,season,team,target
0,2020-09-12,17:30,Premier League,Matchweek 1,Sat,Home,1,4.0,3.0,Leeds United,...,Michael Oliver,20.0,4.0,17.0,0.0,2.0,2.0,2021,Liverpool,1
1,2020-09-12,20:00,Premier League,Matchweek 1,Sat,Home,0,0.0,2.0,Newcastle Utd,...,Stuart Attwell,15.0,3.0,15.6,0.0,0.0,0.0,2021,West Ham United,0
2,2020-09-12,15:00,Premier League,Matchweek 1,Sat,Home,1,1.0,0.0,Southampton,...,Jonathan Moss,6.0,3.0,10.1,0.0,0.0,0.0,2021,Crystal Palace,1
3,2020-09-12,20:00,Premier League,Matchweek 1,Sat,Away,1,2.0,0.0,West Ham,...,Stuart Attwell,16.0,3.0,16.2,1.0,0.0,0.0,2021,Newcastle United,0
4,2020-09-12,12:30,Premier League,Matchweek 1,Sat,Away,1,3.0,0.0,Fulham,...,Chris Kavanagh,13.0,5.0,13.6,2.0,0.0,0.0,2021,Arsenal,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1384,2022-04-24,14:00,Premier League,Matchweek 34,Sun,Home,1,1.0,0.0,Wolves,...,Anthony Taylor,13.0,5.0,18.8,0.0,0.0,0.0,2022,Burnley,2
1385,2022-04-24,14:00,Premier League,Matchweek 34,Sun,Home,0,2.0,2.0,Southampton,...,Robert Jones,8.0,5.0,11.2,0.0,0.0,0.0,2022,Brighton and Hove Albion,2
1386,2022-04-24,14:00,Premier League,Matchweek 34,Sun,Away,0,2.0,2.0,Brighton,...,Robert Jones,18.0,5.0,19.4,1.0,0.0,0.0,2022,Southampton,2
1387,2022-04-25,20:00,Premier League,Matchweek 34,Mon,Home,0,0.0,0.0,Leeds United,...,Darren England,17.0,7.0,13.8,0.0,0.0,0.0,2022,Crystal Palace,2


In [9]:
nulls = pd.isnull(soccer)
nulls = nulls.sum()
nulls = nulls[nulls > 0]
valid = soccer.columns[~soccer.columns.isin(nulls.index)]

In [10]:
soccer = soccer[valid.copy()]

Creating the model for machine learning

In [11]:
rf = RandomForestClassifier(n_estimators=100)
split = TimeSeriesSplit(n_splits=3)

sfs = SequentialFeatureSelector(rf, n_features_to_select=9, direction="forward", cv=split)

In [12]:
cant_scale = ["date", "time", "comp", "round", "day", "venue", "opponent", "formation", "referee", "season", "team", "target", "result"]

In [13]:
selected = soccer.columns[~soccer.columns.isin(cant_scale)]

In [14]:
scaler = MinMaxScaler()
soccer[selected] = scaler.fit_transform(soccer[selected])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  soccer[selected] = scaler.fit_transform(soccer[selected])


In [15]:
sfs.fit(soccer[selected], soccer["target"])

In [16]:
predictors = list(selected[sfs.get_support()])

In [17]:
predictors

['gf', 'xg', 'xga', 'poss', 'sh', 'sot', 'fk', 'pk', 'pkatt']

In [18]:
def backtest(data, model, predictors, start=0):
    predictions = []

    seasons = sorted(data["season"].unique())

    for i in range(start, len(seasons)):

        season = seasons[i]

        season_data = data[data["season"] == season]
        train, test = train_test_split(season_data, test_size=0.2)
      
    model.fit(train[predictors], train["target"])

    preds = model.predict(test[predictors])
    preds = pd.Series(preds, index=test.index)
    
    combined = pd.concat([test["target"], preds], axis=1)
    combined.columns = ["actual", "prediction"]

    predictions.append(combined)
        
    return pd.concat(predictions)

In [19]:
predictions = backtest(soccer, rf, predictors)

In [20]:
predictions

Unnamed: 0,actual,prediction
999,1,1
895,0,0
1359,0,1
1061,0,0
894,0,1
...,...,...
879,0,0
1205,1,0
856,0,1
1089,0,0


Evaluating Accuracy

In [21]:
predictions = predictions[predictions["actual"] != 2]
accuracy_score(predictions["actual"], predictions["prediction"])

0.6721311475409836