In [1]:

%pip install pandas

import requests
import logging
import pandas as pd
import sys
import os
import json

# Add the parent directory to sys.path so 'Data' can be imported
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), '..')))

# Configure logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# Call the API endpoint
response = requests.get("https://fantasy.premierleague.com/api/bootstrap-static/")
data = response.json()

# Get the count of events
event_count = len(data.get("events", []))

# Log the count of events
print(f"Count of events: {event_count}")

# Pull the CSV file into memory as a pandas DataFrame
file_path = "/Users/owen/src/Personal/fpl-team-picker/Data/raw/parsed_gw_2425.csv"
xpData = pd.read_csv(file_path)

Defaulting to user installation because normal site-packages is not writeable
You should consider upgrading via the '/Applications/Xcode.app/Contents/Developer/usr/bin/python3 -m pip install --upgrade pip' command.[0m
Note: you may need to restart the kernel to use updated packages.




Count of events: 38


  xpData = pd.read_csv(file_path)


In [2]:
# Extract basic player data for players.json
# Simple extraction: player information, positions, teams, season totals

players = data.get("elements", [])
teams = data.get("teams", [])

# Filter and clean player data - keep only essential information
players_data = []
for player in players:
    # Only include outfield players and goalkeepers (element_type 1-4)
    if player['element_type'] < 5:
        # Keep essential player info for modeling
        clean_player = {
            "id": player.get("id"),
            "web_name": player.get("web_name"),
            "first_name": player.get("first_name"),
            "second_name": player.get("second_name"),
            "element_type": player.get("element_type"),  # 1=GK, 2=DEF, 3=MID, 4=FWD
            "team": player.get("team"),
            "now_cost": player.get("now_cost"),
            "selected_by_percent": player.get("selected_by_percent"),
            "form": player.get("form"),
            "points_per_game": player.get("points_per_game"),
            "total_points": player.get("total_points"),
            "minutes": player.get("minutes"),
            "goals_scored": player.get("goals_scored"),
            "assists": player.get("assists"),
            "clean_sheets": player.get("clean_sheets"),
            "goals_conceded": player.get("goals_conceded"),
            "own_goals": player.get("own_goals"),
            "penalties_saved": player.get("penalties_saved"),
            "penalties_missed": player.get("penalties_missed"),
            "yellow_cards": player.get("yellow_cards"),
            "red_cards": player.get("red_cards"),
            "saves": player.get("saves"),
            "bonus": player.get("bonus"),
            "bps": player.get("bps"),
            "expected_goals": player.get("expected_goals"),
            "expected_assists": player.get("expected_assists"),
            "expected_goal_involvements": player.get("expected_goal_involvements"),
            "expected_goals_conceded": player.get("expected_goals_conceded")
        }
        players_data.append(clean_player)

print(f"Extracted {len(players_data)} players")

# Save the extracted data to a JSON file
with open("database/players.json", "w") as file:
    json.dump(players_data, file, indent=2)

print("Players saved to database/players.json")


Extracted 663 players


FileNotFoundError: [Errno 2] No such file or directory: 'database/players.json'

In [None]:
# Extract basic team data for teams.json
# Simple extraction: team strength metrics, attacking/defensive ratings

# Filter and clean team data - keep only essential information for modeling
teams_data = []
for team in teams:
    clean_team = {
        "id": team.get("id"),
        "name": team.get("name"),
        "short_name": team.get("short_name"),
        "code": team.get("code"),
        # Strength metrics for modeling
        "strength": team.get("strength"),
        "strength_overall_home": team.get("strength_overall_home"),
        "strength_overall_away": team.get("strength_overall_away"),
        "strength_attack_home": team.get("strength_attack_home"),
        "strength_attack_away": team.get("strength_attack_away"),
        "strength_defence_home": team.get("strength_defence_home"),
        "strength_defence_away": team.get("strength_defence_away"),
        # Season performance metrics
        "played": team.get("played"),
        "win": team.get("win"),
        "draw": team.get("draw"),
        "loss": team.get("loss"),
        "points": team.get("points"),
        "position": team.get("position")
    }
    teams_data.append(clean_team)

print(f"Extracted {len(teams_data)} teams")

# Save the teams data to a JSON file
with open("database/teams.json", "w") as file:
    json.dump(teams_data, file, indent=2)

print("Teams saved to database/teams.json")

Extracted 20 teams
Teams saved to database/teams.json


In [None]:
# Extract fixtures data and create fixtures.json
import json
from datetime import datetime

def extract_fixtures_data():
    """
    Extract fixtures data from FPL fixtures API and create unified fixtures.json
    Contains both historical (finished) and upcoming fixtures
    """
    print("Fetching fixtures from FPL API...")
    
    # Call the fixtures API endpoint
    fixtures_response = requests.get("https://fantasy.premierleague.com/api/fixtures/")
    if fixtures_response.status_code != 200:
        print(f"Error fetching fixtures: {fixtures_response.status_code}")
        return []
    
    fixtures_data = fixtures_response.json()
    print(f"Retrieved {len(fixtures_data)} fixtures from API")
    
    fixtures = []
    current_season = "2025-26"  # Updated for current season
    
    for fixture in fixtures_data:
        fixture_data = {
            "id": fixture.get("id"),
            "gameweek": fixture.get("event"),
            "season": current_season,
            "team_h": fixture.get("team_h"),
            "team_a": fixture.get("team_a"), 
            "team_h_difficulty": fixture.get("team_h_difficulty"),
            "team_a_difficulty": fixture.get("team_a_difficulty"),
            "kickoff_time": fixture.get("kickoff_time"),
            "finished": fixture.get("finished", False)
        }
        
        # Add scores for finished fixtures
        if fixture_data["finished"]:
            fixture_data["team_h_score"] = fixture.get("team_h_score")
            fixture_data["team_a_score"] = fixture.get("team_a_score")
        else:
            fixture_data["team_h_score"] = None
            fixture_data["team_a_score"] = None
            
        fixtures.append(fixture_data)
    
    return fixtures

# Extract fixtures
print("Extracting fixtures data...")
fixtures_data = extract_fixtures_data()

# Save fixtures to JSON file
with open("database/fixtures.json", "w") as file:
    json.dump(fixtures_data, file, indent=2)

print(f"Fixtures saved to database/fixtures.json.")
print(f"Total fixtures: {len(fixtures_data)}")

# Show some stats
finished_fixtures = [f for f in fixtures_data if f["finished"]]
upcoming_fixtures = [f for f in fixtures_data if not f["finished"]]

print(f"Historical fixtures: {len(finished_fixtures)}")
print(f"Upcoming fixtures: {len(upcoming_fixtures)}")

# Show sample of each type
if finished_fixtures:
    print("\nSample historical fixture:")
    print(json.dumps(finished_fixtures[0], indent=2))

if upcoming_fixtures:
    print("\nSample upcoming fixture:")
    print(json.dumps(upcoming_fixtures[0], indent=2))

Extracting fixtures data...
Fetching fixtures from FPL API...
Retrieved 380 fixtures from API
Fixtures saved to database/fixtures.json.
Total fixtures: 380
Historical fixtures: 0
Upcoming fixtures: 380

Sample upcoming fixture:
{
  "id": 1,
  "gameweek": 1,
  "season": "2025-26",
  "team_h": 12,
  "team_a": 4,
  "team_h_difficulty": 3,
  "team_a_difficulty": 5,
  "kickoff_time": "2025-08-15T19:00:00Z",
  "finished": false,
  "team_h_score": null,
  "team_a_score": null
}
Retrieved 380 fixtures from API
Fixtures saved to database/fixtures.json.
Total fixtures: 380
Historical fixtures: 0
Upcoming fixtures: 380

Sample upcoming fixture:
{
  "id": 1,
  "gameweek": 1,
  "season": "2025-26",
  "team_h": 12,
  "team_a": 4,
  "team_h_difficulty": 3,
  "team_a_difficulty": 5,
  "kickoff_time": "2025-08-15T19:00:00Z",
  "finished": false,
  "team_h_score": null,
  "team_a_score": null
}


In [None]:
# Optional: Fetch historical fixtures from previous seasons
# This extends the fixtures.json with multi-season data if needed

def fetch_historical_fixtures():
    """
    Fetch historical fixtures from previous seasons
    Note: FPL API mainly provides current season data
    For full historical data, you might need external sources or stored data
    """
    historical_fixtures = []
    
    # For now, we'll work with what we have from current season
    # In the future, you can add logic here to fetch from:
    # - Stored CSV files with historical fixture data
    # - External APIs with historical data
    # - Previously saved JSON files from past seasons
    
    print("Note: Currently using fixtures from current season API data")
    print("For multi-season historical data, you'll need to:")
    print("1. Save fixtures.json at the end of each season")
    print("2. Merge saved data from multiple seasons")
    print("3. Or use external historical data sources")
    
    return historical_fixtures

# Add any additional historical data if available
additional_fixtures = fetch_historical_fixtures()

if additional_fixtures:
    # Load existing fixtures
    with open("database/fixtures.json", "r") as file:
        existing_fixtures = json.load(file)
    
    # Combine and deduplicate
    all_fixtures = existing_fixtures + additional_fixtures
    
    # Remove duplicates based on fixture id
    seen_ids = set()
    unique_fixtures = []
    for fixture in all_fixtures:
        if fixture["id"] not in seen_ids:
            unique_fixtures.append(fixture)
            seen_ids.add(fixture["id"])
    
    # Save updated fixtures
    with open("database/fixtures.json", "w") as file:
        json.dump(unique_fixtures, file, indent=2)
    
    print(f"Updated fixtures.json with {len(unique_fixtures)} total fixtures")
else:
    print("Using fixtures from current season only")

Note: Currently using fixtures from current season API data
For multi-season historical data, you'll need to:
1. Save fixtures.json at the end of each season
2. Merge saved data from multiple seasons
3. Or use external historical data sources
Using fixtures from current season only


In [None]:
# Validate and analyze fixtures data
import pandas as pd

# Load the fixtures data for analysis
with open("database/fixtures.json", "r") as file:
    fixtures_data = json.load(file)

# Convert to DataFrame for easier analysis
fixtures_df = pd.DataFrame(fixtures_data)

print("=== FIXTURES DATA ANALYSIS ===\n")

print(f"Total fixtures: {len(fixtures_df)}")
print(f"Columns: {list(fixtures_df.columns)}")

if not fixtures_df.empty:
    print(f"\nGameweeks covered: {fixtures_df['gameweek'].min()} to {fixtures_df['gameweek'].max()}")
    print(f"Seasons: {fixtures_df['season'].unique()}")
    
    # Finished vs Upcoming
    finished_count = len(fixtures_df[fixtures_df['finished'] == True])
    upcoming_count = len(fixtures_df[fixtures_df['finished'] == False])
    
    print(f"\nFinished fixtures: {finished_count}")
    print(f"Upcoming fixtures: {upcoming_count}")
    
    # Sample of finished fixtures (with scores)
    finished_fixtures = fixtures_df[fixtures_df['finished'] == True]
    if not finished_fixtures.empty:
        print(f"\nSample finished fixtures with scores:")
        sample_finished = finished_fixtures[['gameweek', 'team_h', 'team_a', 'team_h_score', 'team_a_score']].head(3)
        print(sample_finished.to_string(index=False))
    
    # Sample of upcoming fixtures  
    upcoming_fixtures = fixtures_df[fixtures_df['finished'] == False]
    if not upcoming_fixtures.empty:
        print(f"\nSample upcoming fixtures:")
        sample_upcoming = upcoming_fixtures[['gameweek', 'team_h', 'team_a', 'team_h_difficulty', 'team_a_difficulty']].head(3)
        print(sample_upcoming.to_string(index=False))
    
    # Check for any missing data
    print(f"\nData completeness check:")
    for col in fixtures_df.columns:
        null_count = fixtures_df[col].isnull().sum()
        if null_count > 0:
            print(f"  {col}: {null_count} missing values")
        else:
            print(f"  {col}: Complete")

print("\n=== FIXTURES EXTRACTION COMPLETE ===")
print("✅ fixtures.json created successfully")
print("\nNext steps:")
print("1. Use finished fixtures (finished: true) for model training")  
print("2. Use upcoming fixtures (finished: false) for predictions")
print("3. Update fixtures.json regularly as games complete")

=== FIXTURES DATA ANALYSIS ===

Total fixtures: 380
Columns: ['id', 'gameweek', 'season', 'team_h', 'team_a', 'team_h_difficulty', 'team_a_difficulty', 'kickoff_time', 'finished', 'team_h_score', 'team_a_score']

Gameweeks covered: 1 to 38
Seasons: ['2025-26']

Finished fixtures: 0
Upcoming fixtures: 380

Sample upcoming fixtures:
 gameweek  team_h  team_a  team_h_difficulty  team_a_difficulty
        1      12       4                  3                  5
        1       2      15                  3                  4
        1       6      10                  3                  3

Data completeness check:
  id: Complete
  gameweek: Complete
  season: Complete
  team_h: Complete
  team_a: Complete
  team_h_difficulty: Complete
  team_a_difficulty: Complete
  kickoff_time: Complete
  finished: Complete
  team_h_score: 380 missing values
  team_a_score: 380 missing values

=== FIXTURES EXTRACTION COMPLETE ===
✅ fixtures.json created successfully

Next steps:
1. Use finished fixtures (fi