# Football Data Computational Experiments


In [2]:
%reload_ext autoreload
%autoreload 2

import sys

sys.path.append("src")

In [None]:
from config import SPARK_MASTER
from data_loader import load_parquet_data
from prepare_train_evaluate import train_and_evaluate
from spark_session import create_spark_session
from model_types import ModelConfig
from experiment_types import ExperimentConfig, ExperimentResult, ExperimentSeriesConfig


# Only a single instance is swept when Spark runs with the "local[*]" master;
# any other master sweeps instance counts 1 through 8.
if SPARK_MASTER == "local[*]":
    instance_counts = [1]
else:
    instance_counts = [1, 2, 3, 4, 5, 6, 7, 8]

# Parameter grid for the whole experiment series: every list below is one
# dimension that `generate_experiment_configs()` is later asked to expand.
config = ExperimentSeriesConfig(
    cores=[1, 2, 3, 4, 6, 8],
    executor_memory_sizes=["1g", "2g", "4g", "6g", "10g", "16g"],
    driver_memory_sizes=["1g", "2g", "4g", "6g", "10g", "16g"],
    instances=instance_counts,
    model_configs=[ModelConfig("logistic"), ModelConfig("gbt")],
    train_size=0.2,
)


def run_single_experiment(config: ExperimentConfig) -> ExperimentResult:
    """Run one experiment on a dedicated Spark session and return its metrics.

    A Spark session is created from the resource settings in ``config``
    (cores, driver/executor memory, instances), the parquet data is loaded
    and cached, and the configured model is trained and evaluated.

    The session is always stopped before this function returns or raises,
    including when data loading fails or the run is interrupted, so a failed
    experiment cannot leak a session into the next one.

    Args:
        config: Settings for this single experiment (Spark resources, model
            configuration, train/test sizes).

    Returns:
        An ``ExperimentResult`` pairing ``config`` with the metrics returned
        by ``train_and_evaluate``.

    Raises:
        Exception: Any error raised while loading data or training is
            re-raised unchanged after a failure message has been printed.
    """
    spark = create_spark_session(
        force=True,
        cores=config.cores,
        driver_memory=config.driver_memory_size,
        executor_memory=config.executor_memory_size,
        instances=config.instances,
    )

    try:
        df, _ = load_parquet_data(spark)
        df = df.cache()
        df.count()  # Force caching

        # Only the metrics are needed; the second returned value is discarded.
        metrics, _ = train_and_evaluate(
            df,
            config.model_config,
            test_size=config.test_size,
            train_size=config.train_size,
        )
        return ExperimentResult(
            config,
            metrics,
        )
    except Exception:
        print(f"Failed experiment with {config.as_pretty_string()}")
        # Bare raise keeps the original exception and traceback intact.
        raise
    finally:
        # Runs on success, failure and interruption alike.
        spark.stop()


# Expand the series configuration into individual experiments and run them
# one after another, collecting every result for the export cell.
results: list[ExperimentResult] = []
experiments: list[ExperimentConfig] = config.generate_experiment_configs()
total_experiments = len(experiments)
# The loop variable is deliberately NOT named `config`: that would overwrite
# the module-level series configuration with the last single experiment.
for index, experiment_config in enumerate(experiments):
    result = run_single_experiment(experiment_config)
    print(f"Experiment {index + 1} of {total_experiments}: {result.as_pretty_string()}")
    results.append(result)

In [5]:
import pandas as pd
from data_loader import write_experiment_results

# One table row per experiment, built from each result's pretty-dict form.
rows = [result.as_pretty_dict() for result in results]
results_df = pd.DataFrame(rows)

# Pass the assembled table to the data_loader helper for experiment results.
write_experiment_results(results_df)