In [1]:
import sys
from pathlib import Path

PROJECT_ROOT = Path("..").resolve()   # go up from /notebooks to project root
SRC_DIR = PROJECT_ROOT / "src"

if str(SRC_DIR) not in sys.path:
    sys.path.append(str(SRC_DIR))

print("Added to path:", SRC_DIR)

Added to path: C:\Users\Mani\epl-ml-predictor\src


In [2]:
from data_loading import load_raw_matches

df = load_raw_matches()
print(df.shape)
print(df.head())
print(df["SeasonFile"].value_counts())
print(df.columns)



(1140, 133)
  Div        Date   Time        HomeTeam       AwayTeam  FTHG  FTAG FTR  HTHG  \
0  E0  05/08/2022  20:00  Crystal Palace        Arsenal     0     2   A     0   
1  E0  06/08/2022  12:30          Fulham      Liverpool     2     2   D     1   
2  E0  06/08/2022  15:00     Bournemouth    Aston Villa     2     0   H     1   
3  E0  06/08/2022  15:00           Leeds         Wolves     2     1   H     1   
4  E0  06/08/2022  15:00       Newcastle  Nott'm Forest     2     0   H     0   

   HTAG  ... 1XBCH 1XBCD  1XBCA  BFECH  BFECD  BFECA  BFEC>2.5  BFEC<2.5  \
0     1  ...   NaN   NaN    NaN    NaN    NaN    NaN       NaN       NaN   
1     0  ...   NaN   NaN    NaN    NaN    NaN    NaN       NaN       NaN   
2     0  ...   NaN   NaN    NaN    NaN    NaN    NaN       NaN       NaN   
3     1  ...   NaN   NaN    NaN    NaN    NaN    NaN       NaN       NaN   
4     0  ...   NaN   NaN    NaN    NaN    NaN    NaN       NaN       NaN   

   BFECAHH  BFECAHA  
0      NaN      NaN  


In [3]:
df["FTR"].value_counts(), df["FTR"].value_counts(normalize=True)  

(FTR
 H    514
 A    364
 D    262
 Name: count, dtype: int64,
 FTR
 H    0.450877
 A    0.319298
 D    0.229825
 Name: proportion, dtype: float64)

In [4]:
target_map = {'H': 0, 'D': 1, 'A': 2}
df["Result"] = df["FTR"].map(target_map)

print(df[["FTR", "Result"]].head())

  FTR  Result
0   A       2
1   D       1
2   H       0
3   H       0
4   H       0


In [5]:
# Show all columns that look like home/draw/away odds
odds_cols = [c for c in df.columns if c.endswith("H") or c.endswith("D") or c.endswith("A")]
print(sorted(odds_cols)[:40])  # first 40 just to inspect

['1XBA', '1XBCA', '1XBCD', '1XBCH', '1XBD', '1XBH', 'AvgA', 'AvgAHA', 'AvgAHH', 'AvgCA', 'AvgCAHA', 'AvgCAHH', 'AvgCD', 'AvgCH', 'AvgD', 'AvgH', 'B365A', 'B365AHA', 'B365AHH', 'B365CA', 'B365CAHA', 'B365CAHH', 'B365CD', 'B365CH', 'B365D', 'B365H', 'BFA', 'BFCA', 'BFCD', 'BFCH', 'BFD', 'BFEA', 'BFEAHA', 'BFEAHH', 'BFECA', 'BFECAHA', 'BFECAHH', 'BFECD', 'BFECH', 'BFED']


In [6]:
feature_cols = ["B365H", "B365D", "B365A"]

model_df = df[feature_cols + ["Result"]].dropna()

print(model_df.shape)
model_df.head()


(1140, 4)


Unnamed: 0,B365H,B365D,B365A,Result
0,4.2,3.6,1.85,2
1,11.0,6.0,1.25,1
2,3.75,3.5,2.0,0
3,2.25,3.4,3.2,0
4,1.66,3.8,5.25,0


In [7]:
from sklearn.model_selection import train_test_split

X = model_df[feature_cols]
y = model_df["Result"]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

X_train.shape, X_test.shape


((912, 3), (228, 3))

In [8]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

clf = LogisticRegression(
    max_iter=500,
    solver="lbfgs"   # supports multinomial automatically
)

clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))


Accuracy: 0.5614035087719298
              precision    recall  f1-score   support

           0       0.57      0.86      0.69       103
           1       0.00      0.00      0.00        52
           2       0.54      0.53      0.54        73

    accuracy                           0.56       228
   macro avg       0.37      0.47      0.41       228
weighted avg       0.43      0.56      0.48       228



  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


In [9]:
from features import add_basic_match_features

feat_df = add_basic_match_features(df)

feat_df[[
    "HomeTeam","AwayTeam",
    "B365H","B365D","B365A",
    "HomeFav","p_home","p_draw","p_away",
    "GoalDiff","Result"
]].head()


Unnamed: 0,HomeTeam,AwayTeam,B365H,B365D,B365A,HomeFav,p_home,p_draw,p_away,GoalDiff,Result
0,Crystal Palace,Arsenal,4.2,3.6,1.85,0,0.225381,0.262944,0.511675,-2,2
1,Fulham,Liverpool,11.0,6.0,1.25,0,0.08596,0.157593,0.756447,0,1
2,Bournemouth,Aston Villa,3.75,3.5,2.0,0,0.253394,0.271493,0.475113,2,0
3,Leeds,Wolves,2.25,3.4,3.2,1,0.422853,0.279829,0.297318,1,0
4,Newcastle,Nott'm Forest,1.66,3.8,5.25,1,0.57044,0.249192,0.180368,2,0


In [10]:
feature_cols = [
    "p_home", "p_draw", "p_away",
    "HomeFav",
]

model_df2 = feat_df[feature_cols + ["Result"]].dropna()

X2 = model_df2[feature_cols]
y2 = model_df2["Result"]


In [11]:
from sklearn.model_selection import train_test_split

X2_train, X2_test, y2_train, y2_test = train_test_split(X2, y2, test_size=0.2, random_state=42, stratify=y2)


In [12]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

clf2 = LogisticRegression(max_iter=500, solver="lbfgs")
clf2.fit(X2_train, y2_train)

y2_pred = clf2.predict(X2_test)

print("Accuracy:", accuracy_score(y2_test, y2_pred))
print(classification_report(y2_test, y2_pred))


Accuracy: 0.5614035087719298
              precision    recall  f1-score   support

           0       0.58      0.82      0.68       103
           1       0.00      0.00      0.00        52
           2       0.52      0.60      0.56        73

    accuracy                           0.56       228
   macro avg       0.37      0.47      0.41       228
weighted avg       0.43      0.56      0.49       228



  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


In [13]:
from features import add_team_form_features

feat_df2 = add_team_form_features(feat_df)   # start from previous feature set

feat_df2[[
    "HomeTeam","AwayTeam",
    "Home_PTS_roll","Away_PTS_roll",
    "Home_GF_roll","Away_GF_roll",
    "Home_GA_roll","Away_GA_roll",
    "Result"
]].head(10)


Unnamed: 0,HomeTeam,AwayTeam,Home_PTS_roll,Away_PTS_roll,Home_GF_roll,Away_GF_roll,Home_GA_roll,Away_GA_roll,Result
0,Crystal Palace,Arsenal,1.4,1.4,1.0,1.2,1.0,1.0,2
1,Fulham,Liverpool,1.4,1.6,2.0,1.4,1.8,1.2,1
2,Bournemouth,Aston Villa,1.2,0.2,1.6,0.6,1.8,1.8,0
3,Leeds,Wolves,1.8,1.2,1.8,0.8,1.6,1.4,0
4,Newcastle,Nott'm Forest,2.2,1.6,2.2,1.6,0.8,1.0,0
5,Tottenham,Southampton,1.8,0.6,2.0,1.0,1.4,3.0,0
6,Everton,Chelsea,1.0,2.0,0.4,1.8,0.8,0.4,2
7,Leicester,Brentford,1.4,0.6,1.4,1.2,2.0,1.6,1
8,Man United,Brighton,0.8,1.8,1.2,1.6,1.8,2.0,2
9,West Ham,Man City,1.8,2.0,1.6,1.6,1.2,0.8,2


In [14]:
feature_cols2 = [
    "p_home","p_draw","p_away","HomeFav",
    "Home_PTS_roll","Away_PTS_roll",
    "Home_GF_roll","Away_GF_roll",
    "Home_GA_roll","Away_GA_roll",
]

model_df3 = feat_df2[feature_cols2 + ["Result"]].dropna()

X3 = model_df3[feature_cols2]
y3 = model_df3["Result"]


In [15]:
from sklearn.model_selection import train_test_split

X3_train, X3_test, y3_train, y3_test = train_test_split(
    X3, y3, test_size=0.2, random_state=42, stratify=y3
)


In [16]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

clf3 = LogisticRegression(max_iter=1000, solver="lbfgs")
clf3.fit(X3_train, y3_train)

y3_pred = clf3.predict(X3_test)

print("Accuracy:", accuracy_score(y3_test, y3_pred))
print(classification_report(y3_test, y3_pred))


Accuracy: 0.6620370370370371
              precision    recall  f1-score   support

           0       0.73      0.84      0.78        98
           1       0.38      0.16      0.23        49
           2       0.65      0.77      0.70        69

    accuracy                           0.66       216
   macro avg       0.58      0.59      0.57       216
weighted avg       0.62      0.66      0.63       216



In [17]:
from sklearn.metrics import confusion_matrix

cm = confusion_matrix(y3_test, y3_pred, labels=[0,1,2])
cm

array([[82,  6, 10],
       [22,  8, 19],
       [ 9,  7, 53]])

In [18]:
import pandas as pd

cm_df = pd.DataFrame(
    cm,
    index=["True 0 (Home win)", "True 1 (Draw)", "True 2 (Away win)"],
    columns=["Pred 0 (Home)", "Pred 1 (Draw)", "Pred 2 (Away)"]
)
cm_df

Unnamed: 0,Pred 0 (Home),Pred 1 (Draw),Pred 2 (Away)
True 0 (Home win),82,6,10
True 1 (Draw),22,8,19
True 2 (Away win),9,7,53


In [19]:
from features import add_basic_match_features, add_team_form_features, add_gap_features

feat_df = add_basic_match_features(df)
feat_df2 = add_team_form_features(feat_df)
feat_df3 = add_gap_features(feat_df2)

feat_df3[[
    "HomeTeam","AwayTeam",
    "Home_PTS_roll","Away_PTS_roll",
    "PTS_gap","GF_gap","GA_gap",
    "Form_balance_PTS","Form_balance_GF","Form_balance_GA",
    "Result"
]].head(10)

Unnamed: 0,HomeTeam,AwayTeam,Home_PTS_roll,Away_PTS_roll,PTS_gap,GF_gap,GA_gap,Form_balance_PTS,Form_balance_GF,Form_balance_GA,Result
0,Crystal Palace,Arsenal,1.4,1.4,0.0,-0.2,0.0,0.0,0.2,0.0,2
1,Fulham,Liverpool,1.4,1.6,-0.2,0.6,-0.6,0.2,0.6,0.6,1
2,Bournemouth,Aston Villa,1.2,0.2,1.0,1.0,0.0,1.0,1.0,0.0,0
3,Leeds,Wolves,1.8,1.2,0.6,1.0,-0.2,0.6,1.0,0.2,0
4,Newcastle,Nott'm Forest,2.2,1.6,0.6,0.6,0.2,0.6,0.6,0.2,0
5,Tottenham,Southampton,1.8,0.6,1.2,1.0,1.6,1.2,1.0,1.6,0
6,Everton,Chelsea,1.0,2.0,-1.0,-1.4,-0.4,1.0,1.4,0.4,2
7,Leicester,Brentford,1.4,0.6,0.8,0.2,-0.4,0.8,0.2,0.4,1
8,Man United,Brighton,0.8,1.8,-1.0,-0.4,0.2,1.0,0.4,0.2,2
9,West Ham,Man City,1.8,2.0,-0.2,0.0,-0.4,0.2,0.0,0.4,2


In [20]:
feature_cols = [
    "p_home","p_draw","p_away",
    "Home_PTS_roll","Away_PTS_roll",
    "Home_GF_roll","Away_GF_roll",
    "Home_GA_roll","Away_GA_roll",
    "PTS_gap","GF_gap","GA_gap",
    "Form_balance_PTS","Form_balance_GF","Form_balance_GA",
]

model_df = feat_df3[feature_cols + ["Result"]].dropna()

X = model_df[feature_cols]
y = model_df["Result"]

model_df.shape


(1077, 16)

In [21]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)
X_train.shape, X_test.shape


((861, 15), (216, 15))

In [22]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

clf = LogisticRegression(max_iter=1000, solver="lbfgs")
clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))


Accuracy: 0.6666666666666666
              precision    recall  f1-score   support

           0       0.72      0.83      0.77        98
           1       0.43      0.18      0.26        49
           2       0.65      0.78      0.71        69

    accuracy                           0.67       216
   macro avg       0.60      0.60      0.58       216
weighted avg       0.63      0.67      0.64       216



In [23]:
from sklearn.metrics import confusion_matrix
import pandas as pd

cm = confusion_matrix(y_test, y_pred, labels=[0,1,2])
cm_df = pd.DataFrame(
    cm,
    index=["True 0 (Home win)", "True 1 (Draw)", "True 2 (Away win)"],
    columns=["Pred 0 (Home)", "Pred 1 (Draw)", "Pred 2 (Away)"]
)

cm_df


Unnamed: 0,Pred 0 (Home),Pred 1 (Draw),Pred 2 (Away)
True 0 (Home win),81,6,11
True 1 (Draw),22,9,18
True 2 (Away win),9,6,54


In [24]:
from features import add_basic_match_features, add_team_form_features, add_gap_features, add_team_strength_features


feat_df = add_basic_match_features(df)
feat_df2 = add_team_form_features(feat_df)
feat_df3 = add_gap_features(feat_df2)
feat_df4 = add_team_strength_features(feat_df3)

feat_df4[[
    "HomeTeam", "AwayTeam",
    "Home_PTS_roll", "Away_PTS_roll",
    "Home_Season_PTS_avg", "Away_Season_PTS_avg",
    "Season_PTS_gap",
    "Result"
]].head(10)


Unnamed: 0,HomeTeam,AwayTeam,Home_PTS_roll,Away_PTS_roll,Home_Season_PTS_avg,Away_Season_PTS_avg,Season_PTS_gap,Result
0,Crystal Palace,Arsenal,1.4,1.4,0.571429,2.111111,-1.539683,2
1,Fulham,Liverpool,1.4,1.6,0.555556,1.454545,-0.89899,1
2,Bournemouth,Aston Villa,1.2,0.2,1.0,1.5,-0.5,0
3,Leeds,Wolves,1.8,1.2,1.1,1.4,-0.3,0
4,Newcastle,Nott'm Forest,2.2,1.6,1.875,1.0,0.875,0
5,Tottenham,Southampton,1.8,0.6,1.777778,0.428571,1.349206,0
6,Everton,Chelsea,1.0,2.0,1.0,1.636364,-0.636364,2
7,Leicester,Brentford,1.4,0.6,1.0,1.6,-0.6,1
8,Man United,Brighton,0.8,1.8,1.25,2.6,-1.35,2
9,West Ham,Man City,1.8,2.0,1.0,2.5,-1.5,2


In [25]:
feature_cols = [
    # odds-based
    "p_home","p_draw","p_away",

    # short-term form (rolling window)
    "Home_PTS_roll","Away_PTS_roll",
    "Home_GF_roll","Away_GF_roll",
    "Home_GA_roll","Away_GA_roll",
    "PTS_gap","GF_gap","GA_gap",
    "Form_balance_PTS","Form_balance_GF","Form_balance_GA",

    # season-long strength
    "Home_Season_PTS_avg","Away_Season_PTS_avg",
    "Home_Season_GF_avg","Away_Season_GF_avg",
    "Home_Season_GA_avg","Away_Season_GA_avg",
    "Season_PTS_gap","Season_GF_gap","Season_GA_gap",
]

model_df = feat_df4[feature_cols + ["Result"]].dropna()

X = model_df[feature_cols]
y = model_df["Result"]
model_df.shape


(1077, 25)

In [26]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

clf = LogisticRegression(max_iter=1000, solver="lbfgs")
clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))


Accuracy: 0.7083333333333334
              precision    recall  f1-score   support

           0       0.78      0.85      0.81        98
           1       0.55      0.24      0.34        49
           2       0.67      0.84      0.74        69

    accuracy                           0.71       216
   macro avg       0.66      0.64      0.63       216
weighted avg       0.69      0.71      0.68       216



In [27]:
from sklearn.metrics import confusion_matrix
import pandas as pd

cm = confusion_matrix(y_test, y_pred, labels=[0,1,2])
cm_df = pd.DataFrame(
    cm,
    index=["True 0 (Home win)", "True 1 (Draw)", "True 2 (Away win)"],
    columns=["Pred 0 (Home)", "Pred 1 (Draw)", "Pred 2 (Away)"]
)
cm_df


Unnamed: 0,Pred 0 (Home),Pred 1 (Draw),Pred 2 (Away)
True 0 (Home win),83,4,11
True 1 (Draw),19,12,18
True 2 (Away win),5,6,58


In [28]:
y_proba = clf.predict_proba(X_test)

In [29]:
from sklearn.metrics import brier_score_loss
import numpy as np

# One-vs-rest Brier score for each class
brier_home = brier_score_loss((y_test == 0).astype(int), y_proba[:, 0])
brier_draw = brier_score_loss((y_test == 1).astype(int), y_proba[:, 1])
brier_away = brier_score_loss((y_test == 2).astype(int), y_proba[:, 2])

print("Brier (Home):", brier_home)
print("Brier (Draw):", brier_draw)
print("Brier (Away):", brier_away)

# Overall macro Brier
macro_brier = np.mean([brier_home, brier_draw, brier_away])
print("Macro Brier:", macro_brier)


Brier (Home): 0.13909539187983638
Brier (Draw): 0.1606256990399098
Brier (Away): 0.12236857020862482
Macro Brier: 0.140696553709457


In [30]:
from sklearn.calibration import CalibratedClassifierCV

cal_clf = CalibratedClassifierCV(clf, method="isotonic", cv=5)
cal_clf.fit(X_train, y_train)

y_proba_cal = cal_clf.predict_proba(X_test)


In [31]:
brier_home_cal = brier_score_loss((y_test == 0).astype(int), y_proba_cal[:, 0])
brier_draw_cal = brier_score_loss((y_test == 1).astype(int), y_proba_cal[:, 1])
brier_away_cal = brier_score_loss((y_test == 2).astype(int), y_proba_cal[:, 2])

print("Calibrated Brier — Home:", brier_home_cal)
print("Calibrated Brier — Draw:", brier_draw_cal)
print("Calibrated Brier — Away:", brier_away_cal)
print("Macro Brier (Calibrated):", np.mean([brier_home_cal, brier_draw_cal, brier_away_cal]))


Calibrated Brier — Home: 0.1408225605759659
Calibrated Brier — Draw: 0.16751786715804418
Calibrated Brier — Away: 0.12278901591675774
Macro Brier (Calibrated): 0.14370981455025592


In [32]:
from features import (
    add_basic_match_features,
    add_team_form_features,
    add_gap_features,
    add_team_strength_features,
)

feat_df = add_basic_match_features(df)
feat_df2 = add_team_form_features(feat_df)
feat_df3 = add_gap_features(feat_df2)
feat_df4 = add_team_strength_features(feat_df3)



In [33]:
feat_df4[[
    "HomeTeam","AwayTeam",
    "Home_Home_PTS_roll","Away_Away_PTS_roll",
    "PTS_gap_split",
    "Result"
]].head(12)


Unnamed: 0,HomeTeam,AwayTeam,Home_Home_PTS_roll,Away_Away_PTS_roll,PTS_gap_split,Result
0,Crystal Palace,Arsenal,,,,2
1,Fulham,Liverpool,,,,1
2,Bournemouth,Aston Villa,,,,0
3,Leeds,Wolves,,,,0
4,Newcastle,Nott'm Forest,,,,0
5,Tottenham,Southampton,,,,0
6,Everton,Chelsea,,,,2
7,Leicester,Brentford,,,,1
8,Man United,Brighton,,,,2
9,West Ham,Man City,,,,2


In [34]:
feature_cols += [
    "Home_Home_PTS_roll","Away_Away_PTS_roll",
    "Home_Home_GF_roll","Away_Away_GF_roll",
    "Home_Home_GA_roll","Away_Away_GA_roll",
    "PTS_gap_split","GF_gap_split","GA_gap_split",
]


In [35]:
model_df = feat_df4[feature_cols + ["Result"]].dropna()

X = model_df[feature_cols]
y = model_df["Result"]


In [36]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

clf = LogisticRegression(max_iter=1000, solver="lbfgs")
clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))


Accuracy: 0.7560975609756098
              precision    recall  f1-score   support

           0       0.78      0.89      0.83        93
           1       0.65      0.24      0.35        46
           2       0.74      0.92      0.82        66

    accuracy                           0.76       205
   macro avg       0.72      0.69      0.67       205
weighted avg       0.74      0.76      0.72       205



In [37]:
cm = confusion_matrix(y_test, y_pred, labels=[0,1,2])
pd.DataFrame(cm,
  index=["True Home","True Draw","True Away"],
  columns=["Pred Home","Pred Draw","Pred Away"])


Unnamed: 0,Pred Home,Pred Draw,Pred Away
True Home,83,3,7
True Draw,21,11,14
True Away,2,3,61


In [42]:
from features import (
    add_basic_match_features,
    add_team_form_features,
    add_gap_features,
    add_team_strength_features,
    add_elo_features,
)

# 1. Start from raw df and sort by date
df_raw = df.sort_values("Date")

# 2. Apply feature steps in sequence
feat_df = add_basic_match_features(df_raw)
feat_df = add_team_form_features(feat_df)
feat_df = add_gap_features(feat_df)
feat_df = add_team_strength_features(feat_df)
feat_df = add_elo_features(feat_df)

# Quick sanity check: do the columns exist now?
feat_df[[
    "HomeTeam","AwayTeam",
    "p_home","Home_PTS_roll","Home_Season_PTS_avg",
    "Home_Home_PTS_roll","Away_Away_PTS_roll",
    "ELO_Home","ELO_Away","ELO_Diff",
    "Result"
]].head()



Unnamed: 0,HomeTeam,AwayTeam,p_home,Home_PTS_roll,Home_Season_PTS_avg,Home_Home_PTS_roll,Away_Away_PTS_roll,ELO_Home,ELO_Away,ELO_Diff,Result
0,Nott'm Forest,Chelsea,0.190224,,1.0,,,1500.0,1500.0,0.0,1
1,Tottenham,Aston Villa,0.570939,,0.0,,,1500.0,1500.0,0.0,2
2,Liverpool,Newcastle,0.653866,,3.0,,,1500.0,1500.0,0.0,0
3,Brentford,Arsenal,0.1453,,0.0,,,1500.0,1500.0,0.0,2
4,Wolves,Man United,0.362066,,0.0,,,1500.0,1500.0,0.0,2


In [43]:
feature_cols = [
    # odds-based features
    "p_home","p_draw","p_away",

    # short-term form (rolling)
    "Home_PTS_roll","Away_PTS_roll",
    "Home_GF_roll","Away_GF_roll",
    "Home_GA_roll","Away_GA_roll",
    "PTS_gap","GF_gap","GA_gap",
    "Form_balance_PTS","Form_balance_GF","Form_balance_GA",

    # season-long strength
    "Home_Season_PTS_avg","Away_Season_PTS_avg",
    "Home_Season_GF_avg","Away_Season_GF_avg",
    "Home_Season_GA_avg","Away_Season_GA_avg",
    "Season_PTS_gap","Season_GF_gap","Season_GA_gap",

    # venue-specific form
    "Home_Home_PTS_roll","Away_Away_PTS_roll",
    "Home_Home_GF_roll","Away_Away_GF_roll",
    "Home_Home_GA_roll","Away_Away_GA_roll",
    "PTS_gap_split","GF_gap_split","GA_gap_split",

    # NEW: ELO
    "ELO_Home","ELO_Away","ELO_Diff",
]

model_df = feat_df[feature_cols + ["Result"]].dropna()

X = model_df[feature_cols]
y = model_df["Result"]


In [44]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import pandas as pd

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

clf = LogisticRegression(max_iter=1000, solver="lbfgs")
clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

cm = confusion_matrix(y_test, y_pred, labels=[0,1,2])
pd.DataFrame(
    cm,
    index=["True Home","True Draw","True Away"],
    columns=["Pred Home","Pred Draw","Pred Away"]
)


Accuracy: 0.7746478873239436
              precision    recall  f1-score   support

           0       0.79      0.95      0.86        97
           1       0.68      0.40      0.50        48
           2       0.79      0.79      0.79        68

    accuracy                           0.77       213
   macro avg       0.75      0.71      0.72       213
weighted avg       0.76      0.77      0.76       213



STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT

Increase the number of iterations to improve the convergence (max_iter=1000).
You might also want to scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Unnamed: 0,Pred Home,Pred Draw,Pred Away
True Home,92,3,2
True Draw,17,19,12
True Away,8,6,54
