In [1]:
import sys
from pathlib import Path

# Locate the project root one level above the current working directory
# (the notebook is expected to run from /notebooks) and its src/ folder.
PROJECT_ROOT = Path("..").resolve()
SRC_DIR = PROJECT_ROOT / "src"

# Register src/ on the import path exactly once so re-running this cell
# does not append duplicate entries.
src_dir_entry = str(SRC_DIR)
if src_dir_entry not in sys.path:
    sys.path.append(src_dir_entry)

print("Added to path:", SRC_DIR)

Added to path: C:\Users\Mani\epl-ml-predictor\src


In [2]:
from data_loading import load_raw_matches

# Load the raw match table and print a quick structural overview of it.
df = load_raw_matches()

overview_items = (
    df.shape,                         # (rows, columns)
    df.head(),                        # first few rows
    df["SeasonFile"].value_counts(),  # row count per SeasonFile value
    df.columns,                       # full column index
)
for overview_item in overview_items:
    print(overview_item)



(1140, 133)
  Div        Date   Time        HomeTeam       AwayTeam  FTHG  FTAG FTR  HTHG  \
0  E0  05/08/2022  20:00  Crystal Palace        Arsenal     0     2   A     0   
1  E0  06/08/2022  12:30          Fulham      Liverpool     2     2   D     1   
2  E0  06/08/2022  15:00     Bournemouth    Aston Villa     2     0   H     1   
3  E0  06/08/2022  15:00           Leeds         Wolves     2     1   H     1   
4  E0  06/08/2022  15:00       Newcastle  Nott'm Forest     2     0   H     0   

   HTAG  ... 1XBCH 1XBCD  1XBCA  BFECH  BFECD  BFECA  BFEC>2.5  BFEC<2.5  \
0     1  ...   NaN   NaN    NaN    NaN    NaN    NaN       NaN       NaN   
1     0  ...   NaN   NaN    NaN    NaN    NaN    NaN       NaN       NaN   
2     0  ...   NaN   NaN    NaN    NaN    NaN    NaN       NaN       NaN   
3     1  ...   NaN   NaN    NaN    NaN    NaN    NaN       NaN       NaN   
4     0  ...   NaN   NaN    NaN    NaN    NaN    NaN       NaN       NaN   

   BFECAHH  BFECAHA  
0      NaN      NaN  


In [3]:
# Class balance of the full-time result column: absolute counts, then shares.
ftr_values = df["FTR"]
(ftr_values.value_counts(), ftr_values.value_counts(normalize=True))

(FTR
 H    514
 A    364
 D    262
 Name: count, dtype: int64,
 FTR
 H    0.450877
 A    0.319298
 D    0.229825
 Name: proportion, dtype: float64)

In [4]:
# Integer class labels for the target: 'H' -> 0, 'D' -> 1, 'A' -> 2.
# Any FTR value outside this mapping becomes NaN in "Result".
target_map = dict(H=0, D=1, A=2)
df["Result"] = df["FTR"].map(target_map)

# Sanity check: show the raw label next to its encoded value.
label_preview = df[["FTR", "Result"]].head()
print(label_preview)

  FTR  Result
0   A       2
1   D       1
2   H       0
3   H       0
4   H       0


In [5]:
# Gather every column whose name ends in H, D or A, i.e. the columns that
# look like home/draw/away odds, and print a sorted sample for inspection.
outcome_suffixes = ("H", "D", "A")
odds_cols = [col for col in df.columns if col.endswith(outcome_suffixes)]
print(sorted(odds_cols)[:40])  # first 40 just to inspect

['1XBA', '1XBCA', '1XBCD', '1XBCH', '1XBD', '1XBH', 'AvgA', 'AvgAHA', 'AvgAHH', 'AvgCA', 'AvgCAHA', 'AvgCAHH', 'AvgCD', 'AvgCH', 'AvgD', 'AvgH', 'B365A', 'B365AHA', 'B365AHH', 'B365CA', 'B365CAHA', 'B365CAHH', 'B365CD', 'B365CH', 'B365D', 'B365H', 'BFA', 'BFCA', 'BFCD', 'BFCH', 'BFD', 'BFEA', 'BFEAHA', 'BFEAHH', 'BFECA', 'BFECAHA', 'BFECAHH', 'BFECD', 'BFECH', 'BFED']


In [6]:
# Baseline feature set: the three B365 home/draw/away columns.
feature_cols = ["B365H", "B365D", "B365A"]

# Keep only the features plus the target and drop rows with any missing value.
baseline_cols = [*feature_cols, "Result"]
model_df = df[baseline_cols].dropna()

print(model_df.shape)
model_df.head()


(1140, 4)


Unnamed: 0,B365H,B365D,B365A,Result
0,4.2,3.6,1.85,2
1,11.0,6.0,1.25,1
2,3.75,3.5,2.0,0
3,2.25,3.4,3.2,0
4,1.66,3.8,5.25,0


In [7]:
from sklearn.model_selection import train_test_split

# Feature matrix and target vector for the baseline model.
X, y = model_df[feature_cols], model_df["Result"]

# Seeded 80/20 split that preserves the class proportions of y in both parts.
# NOTE(review): train_test_split shuffles rows, so test matches can predate
# training matches; consider a chronological split if the aim is forecasting.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, test_size=0.2, random_state=42
)

(X_train.shape, X_test.shape)


((912, 3), (228, 3))

In [8]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# Baseline classifier: logistic regression on the features in X_train,
# predicting the three-class Result label.
clf = LogisticRegression(
    max_iter=500,
    solver="lbfgs"   # supports multinomial automatically
)

clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred))
# A class that is never predicted has undefined precision. zero_division=0
# reports it as 0.0 (the same value the default setting falls back to) but
# without emitting an UndefinedMetricWarning for each affected metric.
print(classification_report(y_test, y_pred, zero_division=0))


Accuracy: 0.5614035087719298
              precision    recall  f1-score   support

           0       0.57      0.86      0.69       103
           1       0.00      0.00      0.00        52
           2       0.54      0.53      0.54        73

    accuracy                           0.56       228
   macro avg       0.37      0.47      0.41       228
weighted avg       0.43      0.56      0.48       228



  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


In [9]:
from features import add_basic_match_features

# Build the basic per-match feature columns on top of the raw table.
feat_df = add_basic_match_features(df)

# Preview teams, the B365 columns, the derived columns and the target together.
basic_preview_cols = [
    "HomeTeam", "AwayTeam",
    "B365H", "B365D", "B365A",
    "HomeFav", "p_home", "p_draw", "p_away",
    "GoalDiff", "Result",
]
feat_df[basic_preview_cols].head()


Unnamed: 0,HomeTeam,AwayTeam,B365H,B365D,B365A,HomeFav,p_home,p_draw,p_away,GoalDiff,Result
0,Crystal Palace,Arsenal,4.2,3.6,1.85,0,0.225381,0.262944,0.511675,-2,2
1,Fulham,Liverpool,11.0,6.0,1.25,0,0.08596,0.157593,0.756447,0,1
2,Bournemouth,Aston Villa,3.75,3.5,2.0,0,0.253394,0.271493,0.475113,2,0
3,Leeds,Wolves,2.25,3.4,3.2,1,0.422853,0.279829,0.297318,1,0
4,Newcastle,Nott'm Forest,1.66,3.8,5.25,1,0.57044,0.249192,0.180368,2,0


In [10]:
# Feature set for the second model: the p_home/p_draw/p_away columns plus HomeFav.
# NOTE(review): this rebinds `feature_cols`, which the baseline cell also
# defines; re-running the baseline split after this cell would pick up this
# list instead. A distinct name would make the cells order-independent.
feature_cols = ["p_home", "p_draw", "p_away", "HomeFav"]

# Features plus target, with incomplete rows removed.
model_df2 = feat_df[[*feature_cols, "Result"]].dropna()

X2, y2 = model_df2[feature_cols], model_df2["Result"]


In [11]:
from sklearn.model_selection import train_test_split

# Seeded 80/20 split of the second feature set, stratified on the target.
X2_train, X2_test, y2_train, y2_test = train_test_split(
    X2,
    y2,
    stratify=y2,
    test_size=0.2,
    random_state=42,
)


In [12]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# Second model: the same logistic-regression setup, trained on X2_train.
clf2 = LogisticRegression(max_iter=500, solver="lbfgs")
clf2.fit(X2_train, y2_train)

y2_pred = clf2.predict(X2_test)

print("Accuracy:", accuracy_score(y2_test, y2_pred))
# A class that is never predicted has undefined precision. zero_division=0
# reports it as 0.0 (the same value the default setting falls back to) but
# without emitting an UndefinedMetricWarning for each affected metric.
print(classification_report(y2_test, y2_pred, zero_division=0))


Accuracy: 0.5614035087719298
              precision    recall  f1-score   support

           0       0.58      0.82      0.68       103
           1       0.00      0.00      0.00        52
           2       0.52      0.60      0.56        73

    accuracy                           0.56       228
   macro avg       0.37      0.47      0.41       228
weighted avg       0.43      0.56      0.49       228



  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


In [13]:
from features import add_team_form_features

# Extend the previous feature table with the team-form (rolling) columns.
# NOTE(review): in the recorded output the first displayed fixtures already
# carry populated *_roll values. Confirm that add_team_form_features only uses
# matches played before each fixture; otherwise these columns leak the outcome.
feat_df2 = add_team_form_features(feat_df)   # start from previous feature set

# Preview the rolling points / goals-for / goals-against columns for both sides.
form_preview_cols = [
    "HomeTeam", "AwayTeam",
    "Home_PTS_roll", "Away_PTS_roll",
    "Home_GF_roll", "Away_GF_roll",
    "Home_GA_roll", "Away_GA_roll",
    "Result",
]
feat_df2[form_preview_cols].head(10)


Unnamed: 0,HomeTeam,AwayTeam,Home_PTS_roll,Away_PTS_roll,Home_GF_roll,Away_GF_roll,Home_GA_roll,Away_GA_roll,Result
0,Crystal Palace,Arsenal,1.4,1.4,1.0,1.2,1.0,1.0,2
1,Fulham,Liverpool,1.4,1.6,2.0,1.4,1.8,1.2,1
2,Bournemouth,Aston Villa,1.2,0.2,1.6,0.6,1.8,1.8,0
3,Leeds,Wolves,1.8,1.2,1.8,0.8,1.6,1.4,0
4,Newcastle,Nott'm Forest,2.2,1.6,2.2,1.6,0.8,1.0,0
5,Tottenham,Southampton,1.8,0.6,2.0,1.0,1.4,3.0,0
6,Everton,Chelsea,1.0,2.0,0.4,1.8,0.8,0.4,2
7,Leicester,Brentford,1.4,0.6,1.4,1.2,2.0,1.6,1
8,Man United,Brighton,0.8,1.8,1.2,1.6,1.8,2.0,2
9,West Ham,Man City,1.8,2.0,1.6,1.6,1.2,0.8,2


In [14]:
# Feature set for the third model: the four columns used by the second model
# followed by the six rolling team-form columns.
feature_cols2 = [
    "p_home", "p_draw", "p_away", "HomeFav",
    "Home_PTS_roll", "Away_PTS_roll",
    "Home_GF_roll", "Away_GF_roll",
    "Home_GA_roll", "Away_GA_roll",
]

# Features plus target; rows with any missing value are dropped.
model_df3 = feat_df2[[*feature_cols2, "Result"]].dropna()

X3, y3 = model_df3[feature_cols2], model_df3["Result"]


In [15]:
from sklearn.model_selection import train_test_split

# Seeded 80/20 split of the third feature set, stratified on the target.
X3_train, X3_test, y3_train, y3_test = train_test_split(
    X3,
    y3,
    stratify=y3,
    test_size=0.2,
    random_state=42,
)


In [16]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# Third model: logistic regression on the odds-derived plus team-form features.
# fit() returns the estimator itself, so construction and training are chained.
clf3 = LogisticRegression(solver="lbfgs", max_iter=1000).fit(X3_train, y3_train)

y3_pred = clf3.predict(X3_test)

# NOTE(review): in the recorded run this test set has 216 rows versus 228 for
# the earlier models (dropna removed rows), so the accuracies are not measured
# on the same matches. Verify the form features are leak-free before trusting
# the improvement.
print("Accuracy:", accuracy_score(y3_test, y3_pred))
print(classification_report(y3_test, y3_pred))


Accuracy: 0.6620370370370371
              precision    recall  f1-score   support

           0       0.73      0.84      0.78        98
           1       0.38      0.16      0.23        49
           2       0.65      0.77      0.70        69

    accuracy                           0.66       216
   macro avg       0.58      0.59      0.57       216
weighted avg       0.62      0.66      0.63       216

