# Football Match Outcome Prediction Model

This notebook demonstrates training a simple prediction model for football match outcomes.


In [18]:
%reload_ext autoreload
%autoreload 2

import sys

sys.path.append("src")

In [19]:
from spark_session import create_spark_session

spark = create_spark_session()

Setting Spark log level to "ERROR".


In [None]:
from data_loader import get_data_path, load_parquet_data

matches_transformed_df, load_time = load_parquet_data(spark)

Execution time for load_parquet_data: 0.14 seconds


In [None]:
matches_transformed_df.select(
    "Date", "HomeTeam", "AwayTeam", "HomeElo", "AwayElo", "EloDiff", "Form3Home", "Form3Away", "FTResult"
).show(5)

+----------+----------------+----------+-------+-------+-------+---------+---------+--------+
|      Date|        HomeTeam|  AwayTeam|HomeElo|AwayElo|EloDiff|Form3Home|Form3Away|FTResult|
+----------+----------------+----------+-------+-------+-------+---------+---------+--------+
|2023-01-21|        Coventry|   Norwich|1514.06|1557.43| -43.37|      2.0|      4.0|       A|
|2023-02-04|         Norwich|   Burnley|1568.72|1718.49|-149.77|      6.0|      9.0|       A|
|2023-02-11|         Burnley|   Preston|1718.49|1503.04| 215.45|      9.0|      3.0|       H|
|2023-02-21|         Norwich|Birmingham|1556.65|1446.99| 109.66|      4.0|      3.0|       H|
|2023-03-11|Sheffield United|     Luton| 1624.2|1560.28|  63.92|      6.0|      7.0|       A|
+----------+----------------+----------+-------+-------+-------+---------+---------+--------+
only showing top 5 rows



In [22]:
from ml import prepare_features

(matches_features_df, prep_time) = prepare_features(matches_transformed_df)

Execution time for prepare_features: 0.41 seconds


In [23]:
matches_features_df.select("Features", "Label", "FTResult").show(5, truncate=False)

+---------------------------------+-----+--------+
|Features                         |Label|FTResult|
+---------------------------------+-----+--------+
|[1514.06,1557.43,-43.37,2.0,4.0] |1.0  |A       |
|[1568.72,1718.49,-149.77,6.0,9.0]|1.0  |A       |
|[1718.49,1503.04,215.45,9.0,3.0] |0.0  |H       |
|[1556.65,1446.99,109.66,4.0,3.0] |0.0  |H       |
|[1624.2,1560.28,63.92,6.0,7.0]   |1.0  |A       |
+---------------------------------+-----+--------+
only showing top 5 rows



In [24]:
from ml import train_and_evaluate

# Train model with different test sizes to evaluate stability
test_sizes = [0.1, 0.2, 0.3, 0.4, 0.5]
results = []


for test_size in test_sizes:

    (model, predictions, accuracy), train_time = train_and_evaluate(matches_transformed_df, test_size)

    results.append({"TestSize": test_size, "Accuracy": accuracy, "TrainingTime": train_time})

    print(f"Test size: {test_size}, Accuracy: {accuracy:.4f}, Training time: {train_time:.2f}s")

Execution time for prepare_features: 0.31 seconds
Execution time for train_and_evaluate: 2.32 seconds
Test size: 0.1, Accuracy: 0.4886, Training time: 2.32s
Execution time for prepare_features: 0.22 seconds
Execution time for train_and_evaluate: 1.60 seconds
Test size: 0.2, Accuracy: 0.4977, Training time: 1.60s
Execution time for prepare_features: 0.15 seconds
Execution time for train_and_evaluate: 1.53 seconds
Test size: 0.3, Accuracy: 0.4967, Training time: 1.53s
Execution time for prepare_features: 0.19 seconds
Execution time for train_and_evaluate: 1.68 seconds
Test size: 0.4, Accuracy: 0.4971, Training time: 1.68s
Execution time for prepare_features: 0.40 seconds
Execution time for train_and_evaluate: 1.99 seconds
Test size: 0.5, Accuracy: 0.4959, Training time: 1.99s


In [25]:
prediction_counts = predictions.groupBy("FTResult", "Prediction").count().orderBy("FTResult", "Prediction")
prediction_counts.show()

+--------+----------+-----+
|FTResult|Prediction|count|
+--------+----------+-----+
|       A|       0.0|11373|
|       A|       1.0| 7732|
|       D|       0.0|13721|
|       D|       1.0| 4390|
|       H|       0.0|25870|
|       H|       1.0| 4669|
+--------+----------+-----+



In [26]:
import pyspark.sql.functions as F

league_accuracy = (
    predictions.groupBy("League")
    .agg(F.count(F.when(F.col("Prediction") == F.col("Label"), 1)).alias("Correct"), F.count("*").alias("Total"))
    .withColumn("Accuracy", F.round(F.col("Correct") / F.col("Total"), 2))
    .orderBy(F.desc("Accuracy"))
)

league_accuracy.show(20)

+------+-------+-----+--------+
|League|Correct|Total|Accuracy|
+------+-------+-----+--------+
|   SC2|     10|   14|    0.71|
|    EC|     13|   20|    0.65|
|    N1|   1853| 3369|    0.55|
|    G1|   1002| 1836|    0.55|
|    E0|   2458| 4594|    0.54|
|    P1|   1704| 3184|    0.54|
|   SP1|   2374| 4520|    0.53|
|    I1|   2370| 4459|    0.53|
|    T1|   1690| 3208|    0.53|
|   SC0|   1219| 2305|    0.53|
|   AUT|    177|  331|    0.53|
|    B1|   1492| 2880|    0.52|
|   NOR|    563| 1074|    0.52|
|    D1|   1860| 3667|    0.51|
|   SWE|    221|  430|    0.51|
|   RUS|     53|  104|    0.51|
|   ROM|     40|   80|     0.5|
|    F1|   2143| 4382|    0.49|
|   FIN|     55|  115|    0.48|
|    D2|   1615| 3469|    0.47|
+------+-------+-----+--------+
only showing top 20 rows



In [27]:
# Compare with baseline (always predict home win)
baseline = matches_transformed_df.agg(
    F.count(F.when(F.col("FTResult") == "H", 1)).alias("HomeWins"), F.count("*").alias("Total")
).withColumn("BaselineAccuracy", F.col("HomeWins") / F.col("Total"))

baseline.show()

+--------+------+-------------------+
|HomeWins| Total|   BaselineAccuracy|
+--------+------+-------------------+
|   61265|135836|0.45102182043051914|
+--------+------+-------------------+



In [28]:
# Feature importance (coefficients from logistic regression)
feature_importance = model.coefficientMatrix.toArray()
print("Feature importance (coefficients):")
print(feature_importance)

Feature importance (coefficients):
[[ 1.00557815e-03 -8.02856329e-04  2.28791562e-03  9.07090235e-03
  -1.16881855e-02]
 [-9.06419416e-04  8.54309802e-04 -2.22745243e-03 -1.16825840e-02
   3.95321738e-03]
 [-9.91587324e-05 -5.14534722e-05 -6.04631912e-05  2.61168162e-03
   7.73496817e-03]]


In [29]:
model_path = get_data_path("data/models/outcome_lr")
model.write().overwrite().save(model_path)
print(f"Model saved to {model_path}")

Model saved to data/models/outcome_lr


In [30]:
# Stop Spark session
spark.stop()