# Feature Engineering for Football Match Prediction

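This notebook produces summary outputs — a sample of the match data, descriptive statistics for the Elo and form columns, and match-outcome counts — to help decide which features to use.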


In [2]:
# Enable IPython's autoreload extension in mode 2, so imported modules are
# reloaded before each cell executes and edits to the local .py files are
# picked up without restarting the kernel.
%reload_ext autoreload
%autoreload 2

import sys

# Make the local modules under "src" importable by the cells below.
# The membership check keeps this cell idempotent: re-running it no longer
# appends a duplicate "src" entry to sys.path each time.
# NOTE(review): "src" is relative to the kernel's working directory — confirm
# the notebook is always launched from the project root.
if "src" not in sys.path:
    sys.path.append("src")

In [4]:
from spark_session import create_spark_session

# Create the Spark session shared by the cells below; it is shut down by the
# final cell's spark.stop().
spark = create_spark_session()

In [3]:
from pyspark.sql import SparkSession
from config import MATCHES_OUTPUT_DIR
from data_loader import get_data_path
from utils import time_execution


@time_execution
def load_parquet_data(spark: SparkSession):
    """Read the matches dataset stored as Parquet.

    Args:
        spark: Spark session used to perform the read.

    Returns:
        The DataFrame returned by ``spark.read.parquet`` for the location that
        ``get_data_path(MATCHES_OUTPUT_DIR)`` resolves to. The function is
        wrapped by ``time_execution``, so callers receive whatever that
        decorator returns rather than the bare DataFrame.
    """
    parquet_reader = spark.read
    matches_location = get_data_path(MATCHES_OUTPUT_DIR)
    return parquet_reader.parquet(matches_location)


# The @time_execution decorator changes what the call returns: the result is
# unpacked here as a (DataFrame, elapsed time) pair. Presumably the elapsed
# time is in seconds, matching the decorator's printed message — confirm in utils.
matches_transformed_df, load_time = load_parquet_data(spark)

Execution time for load_parquet_data: 1.18 seconds


In [4]:
# Preview the first five rows to check which columns are available
# before computing any statistics on them.
print("\nMatches Sample:")
matches_transformed_df.show(n=5)


Matches Sample:
+----------+----------------+----------+-------+--------+------+------+--------------------+-------+-------+---------+---------+---------+---------+----------+----------+-------+-------+-------+-------+-------+-------+------+-------+---------+---------+---------+-------+------+
|      Date|        HomeTeam|  AwayTeam|EloDiff|FTResult|FTHome|FTAway|                  ID|HomeElo|AwayElo|Form3Home|Form5Home|Form3Away|Form5Away|HomeTarget|AwayTarget|OddHome|OddDraw|OddAway|MaxHome|MaxDraw|MaxAway|Over25|Under25|HandiSize|HandiHome|HandiAway|Country|League|
+----------+----------------+----------+-------+--------+------+------+--------------------+-------+-------+---------+---------+---------+---------+----------+----------+-------+-------+-------+-------+-------+-------+------+-------+---------+---------+---------+-------+------+
|2023-01-21|        Coventry|   Norwich| -43.37|       A|   2.0|   4.0|2023-01-21_Covent...|1514.06|1557.43|      2.0|      5.0|      4.0|      4.

In [5]:
import pyspark.sql.functions as F

# Numeric columns to summarise. Each one contributes an "Avg<Column>" and a
# "StdDev<Column>" output column, in the order listed here.
summary_columns = ["HomeElo", "AwayElo", "EloDiff", "Form3Home", "Form3Away"]

# Build the (mean, stddev) pair for every column from a single template so the
# output names stay consistent. The hand-written version labelled the last
# column "StdForm3Away" instead of "StdDevForm3Away".
# All statistics are rounded to two decimals for display.
feature_stats = matches_transformed_df.select(
    *[
        stat_column
        for column_name in summary_columns
        for stat_column in (
            F.round(F.mean(column_name), 2).alias(f"Avg{column_name}"),
            F.round(F.stddev(column_name), 2).alias(f"StdDev{column_name}"),
        )
    ]
)

feature_stats.show()

+----------+-------------+----------+-------------+----------+-------------+------------+---------------+------------+------------+
|AvgHomeElo|StdDevHomeElo|AvgAwayElo|StdDevAwayElo|AvgEloDiff|StdDevEloDiff|AvgForm3Home|StdDevForm3Home|AvgForm3Away|StdForm3Away|
+----------+-------------+----------+-------------+----------+-------------+------------+---------------+------------+------------+
|   1539.08|       150.83|   1539.09|       150.85|     -0.02|       134.12|        3.96|           2.38|        4.24|         2.4|
+----------+-------------+----------+-------------+----------+-------------+------------+---------------+------------+------------+



In [6]:
# Count matches per full-time result (FTResult) to see how balanced the
# outcome classes are — presumably this is the prediction target; confirm.
outcome_counts = (
    matches_transformed_df
    .groupBy("FTResult")
    .count()
)
outcome_counts.show()

+--------+-----+
|FTResult|count|
+--------+-----+
|       D|36377|
|       A|38194|
|       H|61265|
+--------+-----+



In [7]:
# Shut down the Spark session now that the analysis is finished. Re-running
# any earlier cell after this requires creating a new session first.
spark.stop()