In [29]:
import json
import numpy as np

# Load the raw player records from the local database directory.
with open('../database/players.json', 'r') as f:
  players = json.load(f)

# Keep only goalkeepers (element_type == 1) who have played enough minutes.
# `or 0` treats a missing or null 'minutes' as zero, so the comparison can
# never hit `None > 900` (a TypeError in Python 3); such players are dropped.
players = [
  p for p in players
  if (p.get('minutes') or 0) > 900 and p.get('element_type') == 1
]

with open('../database/teams.json', 'r') as f_teams:
  teams = json.load(f_teams)

# Index teams by id so later cells can resolve a team record in O(1).
teams_lookup = {team['id']: team for team in teams}

print(f"Number of goalkeepers: {len(players)}")

Number of goalkeepers: 23


In [30]:
# Prepare training set for predicting 'goals_conceded'.
# One sample is produced per (player, fixture) pair; its features are built
# from that player's fixtures in strictly earlier rounds, team strengths from
# teams_lookup, and a few columns read from the fixture row itself.

# Number of earlier fixtures required before a fixture yields a sample, and
# the number of most-recent fixtures expanded into per-gameweek lag features.
HISTORY_WINDOW = 5


def _per_90(total, minutes):
  """Scale a summed stat to a per-90-minutes rate; 0 when no minutes were played."""
  return total / minutes * 90 if minutes > 0 else 0


training_set = []

# Find the maximum number of gameweeks across all players.
# default=0 avoids a ValueError when the player list is empty.
max_gw = max((len(player.get('previous_fixtures', [])) for player in players), default=0)

print(f"max_gw: {max_gw}")

for player in players:
  # Sort by round so that history[-1] and history[-HISTORY_WINDOW:] really are
  # the most recent fixtures. sorted() is stable, so data that is already in
  # round order (and fixtures sharing a round) keep their original order.
  previous_fixtures = sorted(player.get('previous_fixtures', []), key=lambda h: h['round'])
  print(f"Starting procesing player: {player['web_name']}, num_previous_fixtures: {len(previous_fixtures)}")
  for fixture in previous_fixtures:

    gw = fixture.get('round')

    # Only fixtures from strictly earlier rounds may feed the features.
    history = [h for h in previous_fixtures if h['round'] < gw]

    if len(history) < HISTORY_WINDOW:
      print(f"Not enough data for player: {player['web_name']}, gw: {gw}")
      continue

    # If last game minutes was 0, ignore
    if history[-1]['minutes'] == 0:
      print(f"Last game minutes is 0 for player: {player['web_name']}, gw: {gw}")
      continue

    print(f"Processing player: {player['web_name']}, gw: {gw}")
    # Totals over all earlier fixtures, converted to per-90 rates below.
    total_minutes = sum(h['minutes'] for h in history)
    total_saves = sum(h['saves'] for h in history)
    total_goals_conceded = sum(h['goals_conceded'] for h in history)
    total_bonus = sum(h.get('bonus', 0) for h in history)
    saves_per_90 = _per_90(total_saves, total_minutes)
    goals_conceded_per_90 = _per_90(total_goals_conceded, total_minutes)
    bonus_per_90 = _per_90(total_bonus, total_minutes)

    sample = {}

    # Use the last HISTORY_WINDOW fixtures from history for feature extraction;
    # "gw-1" is the most recent earlier fixture, "gw-5" the oldest in the window.
    recent_history = history[-HISTORY_WINDOW:]

    for i in range(1, HISTORY_WINDOW + 1):
      sample[f'goals_conceded_gw-{i}'] = recent_history[-i]['goals_conceded']
      sample[f'saves_gw-{i}'] = recent_history[-i]['saves']

    # Difficulty of opponent's attack for current gameweek
    opponent_team_id = fixture['opponent_team']
    was_home = fixture['was_home']
    opponent_team = teams_lookup.get(opponent_team_id, {})
    # Include the strength of this player's defence
    player_team_id = player['team']
    player_team = teams_lookup.get(player_team_id, {})
    # Pair the venue-specific defence rating of the player's team with the
    # opposite-venue attack rating of the opponent; unknown teams yield NaN.
    if was_home:
      sample['team_strength'] = player_team.get('strength_defence_home', np.nan)
      sample['oppenent_strength'] = opponent_team.get('strength_attack_away', np.nan)
    else:
      sample['team_strength'] = player_team.get('strength_defence_away', np.nan)
      sample['oppenent_strength'] = opponent_team.get('strength_attack_home', np.nan)
    sample['was_home'] = was_home
    sample['value'] = fixture['value']
    sample['selected'] = fixture['selected']
    sample['transfers_balance'] = fixture['transfers_balance']
    # NOTE(review): this is read from the fixture being predicted, like the
    # targets below. If it is a post-match statistic it leaks the target into
    # the features - confirm before training on it.
    sample['expected_goals_conceded'] = fixture['expected_goals_conceded']
    sample['saves_per_90'] = saves_per_90
    sample['goals_conceded_per_90'] = goals_conceded_per_90
    sample['bonus_per_90'] = bonus_per_90
    # Target variables
    sample['goals_conceded'] = fixture['goals_conceded']
    sample['bonus'] = fixture['bonus']
    sample['saves'] = fixture['saves']
    training_set.append(sample)

max_gw: 38
Starting procesing player: Raya, num_previous_fixtures: 38
Not enough data for player: Raya, gw: 1
Not enough data for player: Raya, gw: 2
Not enough data for player: Raya, gw: 3
Not enough data for player: Raya, gw: 4
Not enough data for player: Raya, gw: 5
Processing player: Raya, gw: 6
Processing player: Raya, gw: 7
Processing player: Raya, gw: 8
Processing player: Raya, gw: 9
Processing player: Raya, gw: 10
Processing player: Raya, gw: 11
Processing player: Raya, gw: 12
Processing player: Raya, gw: 13
Processing player: Raya, gw: 14
Processing player: Raya, gw: 15
Processing player: Raya, gw: 16
Processing player: Raya, gw: 17
Processing player: Raya, gw: 18
Processing player: Raya, gw: 19
Processing player: Raya, gw: 20
Processing player: Raya, gw: 21
Processing player: Raya, gw: 22
Processing player: Raya, gw: 23
Processing player: Raya, gw: 24
Processing player: Raya, gw: 25
Processing player: Raya, gw: 26
Processing player: Raya, gw: 27
Processing player: Raya, gw: 2

In [31]:
# Persist the assembled samples as pretty-printed JSON in the working directory.
record_count = len(training_set)
print(f"Saving {record_count} records to training_set.json")
with open('training_set.json', 'w') as sink:
  # Serialise to a string first, then write it out in one call.
  sink.write(json.dumps(training_set, indent=2))

Saving 582 records to training_set.json
