In [1]:
from keras.layers import LSTM, Dense
from keras.models import Sequential

# Prediction network: two stacked LSTM layers (the first one hands its full
# sequence to the second), a ReLU hidden layer and a single linear output unit.
model = Sequential(
    [
        LSTM(units=50, return_sequences=True),
        LSTM(units=50),
        Dense(units=50, activation="relu"),
        Dense(units=1),
    ]
)

# Regression setup: Adam optimizer minimizing the mean squared error.
model.compile(loss="mean_squared_error", optimizer="adam")


2023-08-07 13:41:48.319034: I metal_plugin/src/device/metal_device.cc:1154] Metal device set to: Apple M2 Max
2023-08-07 13:41:48.319050: I metal_plugin/src/device/metal_device.cc:296] systemMemory: 32.00 GB
2023-08-07 13:41:48.319055: I metal_plugin/src/device/metal_device.cc:313] maxCacheSize: 10.67 GB
2023-08-07 13:41:48.319086: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:303] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2023-08-07 13:41:48.319110: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:269] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)


In [2]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from rich.jupyter import print


# Length of the look-back window: how many consecutive past rounds are stacked
# into one sample when predicting a player's growth for the following round.
n_steps: int = 10

# One scaler for the input columns and one for the target, both mapping to
# [0, 1]. They are kept separate so the target can be inverted on its own.
scaler_x, scaler_y = (MinMaxScaler(feature_range=(0, 1)) for _ in range(2))


def prepare_data(
    df: pd.DataFrame, fit: bool = True
) -> tuple[np.ndarray, np.ndarray]:
    """
    Prepare the data for the model: fill gaps, normalize, and slice every
    player's history into fixed-length windows.

    Args:
        df: One row per player and round. Must contain the columns "id",
            "round" and "growth". The frame passed in is NOT modified.
        fit: When True (the default, matching the previous behavior) the
            module-level scalers ``scaler_x`` and ``scaler_y`` are (re)fitted
            on ``df`` before it is transformed. Pass False to reuse the
            scalers exactly as they were fitted earlier, e.g. when preparing
            data for inference so that it is scaled like the training data.

    Returns:
        A tuple ``(X, y)``. ``X`` holds one window of ``n_steps`` consecutive
        rows per sample (all columns except "id" and "round", so past growth
        is part of the input). ``y`` holds the scaled growth of the row that
        directly follows each window. Both are empty when no player has more
        than ``n_steps`` rows.
    """

    # NaN values is often because no stat was recorded for the given player for
    # that game. That means that the player was not on the pitch and thus 0 is
    # the proper value to use.
    #
    # fillna() without inplace returns a new frame, so everything below works
    # on a private copy. Mutating the caller's frame would mean that a second
    # call on the same frame fits the scalers on already-scaled values, which
    # makes scaler_y.inverse_transform return values in the wrong unit.
    df = df.fillna(0)

    # Normalize the data to gain better results. Using separate scalers to
    # allow for independently inverting the target again. Note that "id" and
    # "round" are scaled as well; they are dropped from the windows below.
    feature_columns = df.columns.difference(["growth"])
    if fit:
        df[feature_columns] = scaler_x.fit_transform(df[feature_columns])
        df["growth"] = scaler_y.fit_transform(df[["growth"]]).ravel()
    else:
        df[feature_columns] = scaler_x.transform(df[feature_columns])
        df["growth"] = scaler_y.transform(df[["growth"]]).ravel()

    # Combine n_steps rows into a single sample and set the target to the
    # growth of the following round. E.g. combine values from round 1-10 and
    # set the target to the growth of round 11.
    features = []
    target = []
    for _, group in df.groupby("id"):
        growth = group["growth"].to_numpy()

        # Drop columns that will overfit the data. The id and round have no
        # real influence on the players performance. We only keep the id to be
        # able to group the data by player.
        rows = group.drop(columns=["id", "round"]).to_numpy()

        for end in range(n_steps, len(rows)):
            features.append(rows[end - n_steps : end])
            target.append(growth[end])

    return np.array(features), np.array(target)


In [3]:
from sklearn.model_selection import train_test_split


def train(
    df: pd.DataFrame,
    epochs: int = 10,
    batch_size: int = 32,
    test_size: float = 0.2,
    validation_split: float = 0.2,
    random_state: int | None = None,
) -> None:
    """
    Train the module-level LSTM ``model`` on the given data frame and print
    its loss on a held-out test set.

    The model is updated in place; nothing is returned.

    Args:
        df: Per-player, per-round data as expected by ``prepare_data``.
        epochs: Number of iterations over the entire training set.
        batch_size: Number of samples per gradient update.
        test_size: Fraction of the samples held out for the final evaluation.
        validation_split: Fraction of the training samples that Keras sets
            aside for validation during fitting.
        random_state: Seed for the train/test split. The default ``None``
            keeps the previous behavior of a different split on every call.
            It only controls the split, not the model's weight initialization.

    Raises:
        ValueError: If ``df`` yields no training samples, i.e. no player has
            more than ``n_steps`` rounds.
    """
    X, y = prepare_data(df)
    if len(X) == 0:
        raise ValueError(
            "No training samples could be built: every player has too few "
            "rounds to fill a single window."
        )

    # Split the data into training and test data
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )

    # Train the model
    model.fit(
        X_train,
        y_train,
        epochs=epochs,
        batch_size=batch_size,
        validation_split=validation_split,
    )

    loss = model.evaluate(X_test, y_test)
    print(f"\nModel evaluation: {loss=!r}")


In [7]:
from holdet.data import sofascore
from holdet.game import Game, BaseCandidate, Round
from functools import cache


class CandidateLTSM(BaseCandidate):
    """
    Candidate whose expected growth comes from the notebook's LSTM ``model``.

    Construction is inherited unchanged from ``BaseCandidate``.
    """

    def features(self, stat: sofascore.Statistics) -> dict[str, float | int]:
        """
        Return a dictionary with all the features that can be used to train a
        model.
        """

        return {
            # TODO: Add data from betting sites
            "opponent": (
                stat.game.away.id
                if stat.side == sofascore.Side.HOME
                else stat.game.home.id
            ),
            "side": stat.side.value,
            "substitute": int(stat.substitute),
            "assists": stat.assists,
            "expectedAssists": stat.expectedAssists,
            "expectedGoals": stat.expectedGoals,
            "goals": stat.goals,
            "goalsPrevented": stat.goalsPrevented,
            "minutesPlayed": stat.minutesPlayed,
            "onTargetScoringAttempt": stat.onTargetScoringAttempt,
            "savedShotsFromInsideTheBox": stat.savedShotsFromInsideTheBox,
            "saves": stat.saves,
            "team_goals": stat.team_goals,
            "team_goals_conceded": stat.team_goals_conceded,
            "win": int(stat.win),
            "loss": int(stat.loss),
            "draw": int(stat.draw),
            "clean_sheet": int(stat.clean_sheet),
            "decisive_goal_for_draw": int(stat.decisive_goal_for_draw),
            "decisive_goal_for_win": int(stat.decisive_goal_for_win),
        }

    def aggregate_features(self, round: Round) -> dict:
        """
        Aggregate all the features from a round into one. This is needed for
        rounds with multiple games in them. The features are summed together.

        It's not ideal for values like opponent or side, which will simply be
        added together, but that's what we can do for now. A round without any
        stats yields an empty dictionary.
        """
        round_stats: dict[str, int | float] = {}
        for stat in round.stats:
            for key, value in self.features(stat).items():
                round_stats[key] = round_stats.get(key, 0) + value
        return round_stats

    def generate_dataframe(self) -> pd.DataFrame:
        """
        Generate a dataframe for the candidate with one row per round, holding
        the round's identifying columns, its growth and its aggregated
        features.
        """
        data = []

        # Named `rnd` so the builtin round() is not shadowed.
        for rnd in self.rounds:
            row = {
                "id": self.id,
                "round": rnd.number,
                "position": rnd.position.value,
                "team": self.team_id,
                "growth": rnd.growth,
            }

            # Append all features to the row
            row.update(self.aggregate_features(rnd))

            data.append(row)

        return pd.DataFrame(data)

    @cache  # Cache the prediction so we do not have to do inference every time
    def xGrowth(self):
        """
        Predict the candidate's growth from its own round history.

        The history is scaled with the scalers as they were fitted during
        training (``transform`` only). Refitting them on one candidate would
        scale the inputs differently from the training data and would make the
        inverse transform of the prediction use the wrong growth range. The
        scalers and the model must therefore have been fitted by a training
        run before this is called.

        Returns:
            An array with one row per window of ``n_steps`` consecutive
            rounds, in the unit of the original growth values. The last row
            is the prediction for the final round in ``self.rounds``, based on
            the ``n_steps`` rounds before it. The array is empty, with shape
            (0, 1), when the candidate has too few rounds to fill one window.

        Because of ``@cache`` the result is computed once per candidate and is
        not refreshed if the model is retrained afterwards.

        NOTE(review): if the final round in ``self.rounds`` has already been
        played, the last prediction is for that round and not for an upcoming
        one - confirm what ``self.rounds`` ends with.
        """
        df = self.generate_dataframe()
        if len(df) <= n_steps:
            # Not enough history for a single window; nothing to predict.
            return np.empty((0, 1))

        # Same preprocessing as for training: missing stats count as 0 and all
        # columns except the target go through scaler_x.
        df = df.fillna(0)
        feature_columns = df.columns.difference(["growth"])
        df[feature_columns] = scaler_x.transform(df[feature_columns])
        df["growth"] = scaler_y.transform(df[["growth"]]).ravel()

        # The id and round are not model inputs, past growth is.
        rows = df.drop(columns=["id", "round"]).to_numpy()
        X = np.array(
            [rows[end - n_steps : end] for end in range(n_steps, len(rows))]
        )

        y = model.predict(X)
        return scaler_y.inverse_transform(y)

    @property
    def xValue(self) -> float:
        """
        Expected value of the candidate: the current value plus the latest
        predicted growth, which counts double for a captain.

        When no prediction is available (too little history) the expected
        growth is taken to be 0, so the current value is returned.
        """
        # Flatten first: converting a one-element array (instead of a scalar)
        # with float() is deprecated in recent NumPy versions.
        predictions = np.ravel(self.xGrowth())
        xGrowth = float(predictions[-1]) if predictions.size else 0.0
        if self.captain:
            return self.value + xGrowth * 2
        return self.value + xGrowth


# Create the game with CandidateLTSM as the candidate class, so that the
# entries of `game.candidates` provide the LSTM-based xGrowth/xValue.
# NOTE(review): assumes Game.new builds its candidates from the class it is
# given - confirm against holdet.game.
game = Game.new(CandidateLTSM)


In [8]:
# Build one frame per candidate, stack them into a single frame and train the
# model on the result.
data = [candidate.generate_dataframe() for candidate in game.candidates]
df = pd.concat(data)

train(df, epochs=10, batch_size=32)


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [9]:
from holdet.solver import lp

# Money available for the whole team: 50 million.
budget = 50_000_000

# Use the linear programming solver to pick the best starting 11 that fits
# within the budget.
solution = lp.find_optimal_team(game.candidates, budget)




2023-08-07 13:51:42.974159: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.
2023-08-07 13:51:43.034264: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.
2023-08-07 13:51:43.067061: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.




In [10]:
from holdet.formation import Formation

# Show the solved team wrapped in a Formation. `print` is the rich.jupyter
# version imported earlier in the notebook, not the builtin.
print(Formation(solution))


In [13]:
# Spot check a single player: the candidate itself, its rounds and the
# predicted growth. The annotation is only there to help the IDE.
c: CandidateLTSM = game.find_candidate("Alexander Isak")

print(c, c.rounds, c.xGrowth())
