In [1]:
from pathlib import Path

# Correct path (relative to your current CWD)
DATA = Path("./data")

print("DATA path:", DATA.resolve())
print("DATA exists:", DATA.exists())
print("Contents:", [p.name for p in DATA.iterdir()][:15])



DATA path: C:\Users\M.ANTONY ROJES\Downloads\Infosys\open-data-master\data
DATA exists: True
Contents: ['competitions.json', 'events', 'lineups', 'matches', 'three-sixty']


In [2]:
import os
print("CWD:", os.getcwd())


CWD: c:\Users\M.ANTONY ROJES\Downloads\Infosys\open-data-master


In [3]:
matches_path = DATA / "matches"
print("Matches path:", matches_path.resolve())
print("Exists:", matches_path.exists())
print("First 15 items:", [p.name for p in matches_path.iterdir()][:15])


Matches path: C:\Users\M.ANTONY ROJES\Downloads\Infosys\open-data-master\data\matches
Exists: True
First 15 items: ['11', '116', '12', '1238', '1267', '1470', '16', '2', '223', '35', '37', '43', '44', '49', '53']


In [4]:
from pathlib import Path

# Since your CWD is c:\Users\M.ANTONY ROJES\Downloads\Infosys\open-data-master
DATA = Path("./data")

print("DATA path:", DATA.resolve())
print("DATA exists:", DATA.exists())


DATA path: C:\Users\M.ANTONY ROJES\Downloads\Infosys\open-data-master\data
DATA exists: True


In [5]:
import json
import pandas as pd

comp_id, season_id = 43, 3
matches_path = DATA / "matches" / str(comp_id) / str(season_id)

print("Looking inside:", matches_path.resolve())

all_matches = []
for f in matches_path.glob("*.json"):
    with open(f, "r", encoding="utf-8") as file:
        all_matches.extend(json.load(file))

matches = pd.json_normalize(all_matches)

print("Matches loaded:", matches.shape)
print("Columns sample:", matches.columns.tolist()[:20])

# check id and date columns
id_col = [c for c in matches.columns if "match_id" in c]
date_col = [c for c in matches.columns if "match_date" in c]
print("id_col:", id_col, "date_col:", date_col)

matches.head(3)


Looking inside: C:\Users\M.ANTONY ROJES\Downloads\Infosys\open-data-master\data\matches\43\3
Matches loaded: (0, 0)
Columns sample: []
id_col: [] date_col: []


In [6]:
import pandas as pd
import json
from pathlib import Path

DATA = Path("./open-data-master/data")
comp_id, season_id = 11, 1   # ✅ change to one you have locally

matches_path = DATA / "matches" / str(comp_id) / str(season_id)
print("Looking inside:", matches_path.resolve())

all_matches = []
for file in matches_path.glob("*.json"):
    with open(file, "r", encoding="utf-8") as f:
        all_matches.extend(json.load(f))

matches = pd.json_normalize(all_matches)

print("Matches loaded:", matches.shape)
print("Columns sample:", matches.columns[:10])


Looking inside: open-data-master\data\matches\11\1
Matches loaded: (0, 0)
Columns sample: RangeIndex(start=0, stop=0, step=1)


In [7]:
import os
print(os.getcwd())

c:\Users\M.ANTONY ROJES\Downloads\Infosys\open-data-master


In [8]:
import pandas as pd
import json
from pathlib import Path

# Use absolute path
competitions_file = Path(r"C:\Users\M.ANTONY ROJES\Downloads\Infosys\open-data-master\data\competitions.json")

# Load and normalize JSON
with open(competitions_file, "r", encoding="utf-8") as f:
    competitions = json.load(f)

competitions_df = pd.json_normalize(competitions)
print(competitions_df[['competition_id', 'competition_name', 'season_name']])

    competition_id        competition_name season_name
0                9           1. Bundesliga   2023/2024
1                9           1. Bundesliga   2015/2016
2             1267  African Cup of Nations        2023
3               16        Champions League   2018/2019
4               16        Champions League   2017/2018
..             ...                     ...         ...
70              35      UEFA Europa League   1988/1989
71              53       UEFA Women's Euro        2025
72              53       UEFA Women's Euro        2022
73              72       Women's World Cup        2023
74              72       Women's World Cup        2019

[75 rows x 3 columns]


In [9]:
import pandas as pd
import json
from pathlib import Path

comp_id, season_id = 9, 281  # Bundesliga 2023/2024
matches_path = Path(r"C:\Users\M.ANTONY ROJES\Downloads\Infosys\open-data-master\data\matches") / str(comp_id) / str(season_id)

all_matches = []

# Check if path exists
if not matches_path.exists():
    print(f"Directory not found: {matches_path}")
else:
    json_files = list(matches_path.glob("*.json"))
    if not json_files:
        print(f"No JSON files found in: {matches_path}")
    else:
        for f in json_files:
            try:
                with open(f, "r", encoding="utf-8") as file:
                    data = json.load(file)
                    if isinstance(data, list):
                        all_matches.extend(data)
                    else:
                        all_matches.append(data)
            except Exception as e:
                print(f"Error reading {f.name}: {e}")

        if all_matches:
            matches = pd.json_normalize(all_matches)
            print("Matches loaded:", matches.shape)
            print(matches[['match_id', 'match_date', 'home_team.home_team_name', 'away_team.away_team_name']].head())
        else:
            print("No match data loaded. Check file contents.")

Directory not found: C:\Users\M.ANTONY ROJES\Downloads\Infosys\open-data-master\data\matches\9\281


In [10]:
import pandas as pd
import json
from pathlib import Path

competitions_file = Path(r"C:\Users\M.ANTONY ROJES\Downloads\Infosys\open-data-master\data\competitions.json")

with open(competitions_file, "r", encoding="utf-8") as f:
    competitions = json.load(f)

df = pd.json_normalize(competitions)
print(df[df['competition_name'] == 'Bundesliga'][['competition_id', 'season_name', 'season_id']])

Empty DataFrame
Columns: [competition_id, season_name, season_id]
Index: []


In [11]:
matches_root = Path(r"C:\Users\M.ANTONY ROJES\Downloads\Infosys\open-data-master\data\matches")
for f in matches_root.rglob("*.json"):
    print(f)

C:\Users\M.ANTONY ROJES\Downloads\Infosys\open-data-master\data\matches\11\1.json
C:\Users\M.ANTONY ROJES\Downloads\Infosys\open-data-master\data\matches\11\2.json
C:\Users\M.ANTONY ROJES\Downloads\Infosys\open-data-master\data\matches\11\21.json
C:\Users\M.ANTONY ROJES\Downloads\Infosys\open-data-master\data\matches\11\22.json
C:\Users\M.ANTONY ROJES\Downloads\Infosys\open-data-master\data\matches\11\23.json
C:\Users\M.ANTONY ROJES\Downloads\Infosys\open-data-master\data\matches\11\24.json
C:\Users\M.ANTONY ROJES\Downloads\Infosys\open-data-master\data\matches\11\25.json
C:\Users\M.ANTONY ROJES\Downloads\Infosys\open-data-master\data\matches\11\26.json
C:\Users\M.ANTONY ROJES\Downloads\Infosys\open-data-master\data\matches\11\27.json
C:\Users\M.ANTONY ROJES\Downloads\Infosys\open-data-master\data\matches\11\278.json
C:\Users\M.ANTONY ROJES\Downloads\Infosys\open-data-master\data\matches\11\37.json
C:\Users\M.ANTONY ROJES\Downloads\Infosys\open-data-master\data\matches\11\38.json
C:\Us

In [12]:
import json
import pandas as pd
from pathlib import Path

# Point to the exact file
match_file = Path(r"C:\Users\M.ANTONY ROJES\Downloads\Infosys\open-data-master\data\matches\9\281.json")

# Load and inspect structure
with open(match_file, "r", encoding="utf-8") as f:
    data = json.load(f)

# Check if it's a list and has content
if isinstance(data, list) and len(data) > 0:
    df = pd.json_normalize(data)
    print("Matches loaded:", df.shape)
    print("Available columns:", df.columns.tolist())

    # Try printing key columns if they exist
    expected_cols = ['match_id', 'match_date', 'home_team.home_team_name', 'away_team.away_team_name']
    available_cols = [col for col in expected_cols if col in df.columns]
    if available_cols:
        print(df[available_cols].head())
    else:
        print("Expected columns not found. Here's a sample:")
        print(df.head())
else:
    print("No match data found or unexpected format.")

Matches loaded: (34, 42)
Available columns: ['match_id', 'match_date', 'kick_off', 'home_score', 'away_score', 'match_status', 'match_status_360', 'last_updated', 'last_updated_360', 'match_week', 'competition.competition_id', 'competition.country_name', 'competition.competition_name', 'season.season_id', 'season.season_name', 'home_team.home_team_id', 'home_team.home_team_name', 'home_team.home_team_gender', 'home_team.home_team_group', 'home_team.country.id', 'home_team.country.name', 'home_team.managers', 'away_team.away_team_id', 'away_team.away_team_name', 'away_team.away_team_gender', 'away_team.away_team_group', 'away_team.country.id', 'away_team.country.name', 'away_team.managers', 'metadata.data_version', 'metadata.shot_fidelity_version', 'metadata.xy_fidelity_version', 'competition_stage.id', 'competition_stage.name', 'stadium.id', 'stadium.name', 'stadium.country.id', 'stadium.country.name', 'referee.id', 'referee.name', 'referee.country.id', 'referee.country.name']
   match

In [13]:
import pprint
pprint.pprint(data[0])

{'away_score': 0,
 'away_team': {'away_team_gender': 'male',
               'away_team_group': None,
               'away_team_id': 176,
               'away_team_name': 'Werder Bremen',
               'country': {'id': 85, 'name': 'Germany'},
               'managers': [{'country': {'id': 85, 'name': 'Germany'},
                             'dob': '1988-05-04',
                             'id': 4057,
                             'name': 'Ole Werner',
                             'nickname': None}]},
 'competition': {'competition_id': 9,
                 'competition_name': '1. Bundesliga',
                 'country_name': 'Germany'},
 'competition_stage': {'id': 1, 'name': 'Regular Season'},
 'home_score': 5,
 'home_team': {'country': {'id': 85, 'name': 'Germany'},
               'home_team_gender': 'male',
               'home_team_group': None,
               'home_team_id': 904,
               'home_team_name': 'Bayer Leverkusen',
               'managers': [{'country': {'id': 214

In [14]:
df_sample = pd.json_normalize(data, sep='.')
print(df_sample.columns.tolist())

['match_id', 'match_date', 'kick_off', 'home_score', 'away_score', 'match_status', 'match_status_360', 'last_updated', 'last_updated_360', 'match_week', 'competition.competition_id', 'competition.country_name', 'competition.competition_name', 'season.season_id', 'season.season_name', 'home_team.home_team_id', 'home_team.home_team_name', 'home_team.home_team_gender', 'home_team.home_team_group', 'home_team.country.id', 'home_team.country.name', 'home_team.managers', 'away_team.away_team_id', 'away_team.away_team_name', 'away_team.away_team_gender', 'away_team.away_team_group', 'away_team.country.id', 'away_team.country.name', 'away_team.managers', 'metadata.data_version', 'metadata.shot_fidelity_version', 'metadata.xy_fidelity_version', 'competition_stage.id', 'competition_stage.name', 'stadium.id', 'stadium.name', 'stadium.country.id', 'stadium.country.name', 'referee.id', 'referee.name', 'referee.country.id', 'referee.country.name']


In [15]:
import pandas as pd
import json
from pathlib import Path

# Bundesliga 2023/24 → comp_id=9, season_id=281
matches_path = Path(r"C:\Users\M.ANTONY ROJES\Downloads\Infosys\open-data-master\data\matches\9\281.json")

with open(matches_path, "r", encoding="utf-8") as f:
    data = json.load(f)

matches = pd.json_normalize(data, sep='.')

print("Matches loaded:", matches.shape)
print(matches[['match_id','match_date','home_team.home_team_name','away_team.away_team_name','home_score','away_score']].head())


Matches loaded: (34, 42)
   match_id  match_date home_team.home_team_name away_team.away_team_name  \
0   3895302  2024-04-14         Bayer Leverkusen            Werder Bremen   
1   3895292  2024-04-06             Union Berlin         Bayer Leverkusen   
2   3895333  2024-05-05      Eintracht Frankfurt         Bayer Leverkusen   
3   3895340  2024-05-12                   Bochum         Bayer Leverkusen   
4   3895348  2024-05-18         Bayer Leverkusen                 Augsburg   

   home_score  away_score  
0           5           0  
1           0           1  
2           1           5  
3           0           5  
4           2           1  


In [16]:
import json
import pandas as pd
from pathlib import Path

# Bundesliga 2023/24
comp_id, season_id = 9, 281
match_file = Path(r"C:\Users\M.ANTONY ROJES\Downloads\Infosys\open-data-master\data\matches") / str(comp_id) / f"{season_id}.json"

with open(match_file, "r", encoding="utf-8") as f:
    matches = json.load(f)

matches_df = pd.json_normalize(matches, sep=".")
print("Matches loaded:", matches_df.shape)
print(matches_df[['match_id', 'match_date', 'home_team.home_team_name', 'away_team.away_team_name']].head())


Matches loaded: (34, 42)
   match_id  match_date home_team.home_team_name away_team.away_team_name
0   3895302  2024-04-14         Bayer Leverkusen            Werder Bremen
1   3895292  2024-04-06             Union Berlin         Bayer Leverkusen
2   3895333  2024-05-05      Eintracht Frankfurt         Bayer Leverkusen
3   3895340  2024-05-12                   Bochum         Bayer Leverkusen
4   3895348  2024-05-18         Bayer Leverkusen                 Augsburg


In [17]:
import json
import pandas as pd
from pathlib import Path

# Bundesliga 2023/2024 (competition_id=9, season_id=281)
match_file = Path(r"C:\Users\M.ANTONY ROJES\Downloads\Infosys\open-data-master\data\matches\9\281.json")

with open(match_file, "r", encoding="utf-8") as f:
    data = json.load(f)

matches_df = pd.json_normalize(data, sep=".")
print("Matches loaded:", matches_df.shape)
print(matches_df[['match_id','match_date','home_team.home_team_name','away_team.away_team_name']].head())


Matches loaded: (34, 42)
   match_id  match_date home_team.home_team_name away_team.away_team_name
0   3895302  2024-04-14         Bayer Leverkusen            Werder Bremen
1   3895292  2024-04-06             Union Berlin         Bayer Leverkusen
2   3895333  2024-05-05      Eintracht Frankfurt         Bayer Leverkusen
3   3895340  2024-05-12                   Bochum         Bayer Leverkusen
4   3895348  2024-05-18         Bayer Leverkusen                 Augsburg


In [18]:
events_root = Path(r"C:\Users\M.ANTONY ROJES\Downloads\Infosys\open-data-master\data\events")

all_events = []
for match in matches_df['match_id'].unique():
    f = events_root / f"{match}.json"
    if f.exists():
        with open(f, "r", encoding="utf-8") as file:
            all_events.extend(json.load(file))

events_df = pd.json_normalize(all_events, sep=".")
print("Events loaded:", events_df.shape)


Events loaded: (137765, 138)


In [19]:
print(events_df.columns.tolist()[:40])


['id', 'index', 'period', 'timestamp', 'minute', 'second', 'possession', 'duration', 'type.id', 'type.name', 'possession_team.id', 'possession_team.name', 'play_pattern.id', 'play_pattern.name', 'team.id', 'team.name', 'tactics.formation', 'tactics.lineup', 'related_events', 'location', 'player.id', 'player.name', 'position.id', 'position.name', 'pass.recipient.id', 'pass.recipient.name', 'pass.length', 'pass.angle', 'pass.height.id', 'pass.height.name', 'pass.end_location', 'pass.body_part.id', 'pass.body_part.name', 'pass.type.id', 'pass.type.name', 'carry.end_location', 'under_pressure', 'pass.outcome.id', 'pass.outcome.name', 'ball_receipt.outcome.id']


In [20]:
print(events_df['type.name'].unique()[:30])


['Starting XI' 'Half Start' 'Pass' 'Ball Receipt*' 'Carry' 'Pressure'
 'Miscontrol' 'Dribble' 'Duel' 'Interception' 'Ball Recovery'
 'Dispossessed' 'Block' 'Foul Committed' 'Foul Won' 'Error' 'Shot'
 'Goal Keeper' 'Injury Stoppage' 'Referee Ball-Drop' '50/50' 'Clearance'
 'Offside' 'Dribbled Past' 'Player Off' 'Player On' 'Half End'
 'Substitution' 'Bad Behaviour' 'Tactical Shift']


In [21]:
print(events_df[['type.name', 'team.name', 'player.name']].head(15))


        type.name         team.name      player.name
0     Starting XI  Bayer Leverkusen              NaN
1     Starting XI     Werder Bremen              NaN
2      Half Start  Bayer Leverkusen              NaN
3      Half Start     Werder Bremen              NaN
4            Pass     Werder Bremen   Nick Woltemade
5   Ball Receipt*     Werder Bremen   Marvin Ducksch
6           Carry     Werder Bremen   Marvin Ducksch
7            Pass     Werder Bremen   Marvin Ducksch
8   Ball Receipt*     Werder Bremen   Christian Groß
9           Carry     Werder Bremen   Christian Groß
10           Pass     Werder Bremen   Christian Groß
11  Ball Receipt*     Werder Bremen  Julián Malatini
12          Carry     Werder Bremen  Julián Malatini
13       Pressure  Bayer Leverkusen       Amine Adli
14           Pass     Werder Bremen  Julián Malatini


In [22]:
passes_df = events_df[events_df['type.name'] == "Pass"].copy()
print("Passes:", passes_df.shape)
print(passes_df[['player.name', 'team.name', 'pass.outcome.name']].head())


Passes: (39214, 138)
        player.name      team.name pass.outcome.name
4    Nick Woltemade  Werder Bremen               NaN
7    Marvin Ducksch  Werder Bremen               NaN
10   Christian Groß  Werder Bremen               NaN
14  Julián Malatini  Werder Bremen               NaN
17  Mitchell Weiser  Werder Bremen               NaN


In [23]:
shots_df = events_df[events_df['type.name'] == "Shot"].copy()
print("Shots:", shots_df.shape)
print(shots_df[['player.name', 'team.name', 'shot.statsbomb_xg', 'shot.outcome.name']].head())


Shots: (916, 138)
                     player.name         team.name  shot.statsbomb_xg  \
435         Leonardo Bittencourt     Werder Bremen           0.056644   
479  Piero Martín Hincapié Reyna  Bayer Leverkusen           0.143381   
596              Julián Malatini     Werder Bremen           0.038188   
683                 Jonathan Tah  Bayer Leverkusen           0.052781   
847                 Granit Xhaka  Bayer Leverkusen           0.021272   

    shot.outcome.name  
435           Blocked  
479             Saved  
596           Blocked  
683           Blocked  
847           Blocked  


In [24]:
passes_df['successful'] = passes_df['pass.outcome.name'].isna()
passes_completed = (
    passes_df[passes_df['successful']]
    .groupby(['player.name','team.name'])
    .size()
    .reset_index(name='passes_completed')
)
print(passes_completed.head())


                 player.name                 team.name  passes_completed
0                Adam Hložek          Bayer Leverkusen               131
1                Adrian Beck             FC Heidenheim                28
2              Alassane Pléa  Borussia Mönchengladbach                32
3  Alejandro Grimaldo García          Bayer Leverkusen              1783
4        Aleksandar Pavlović             Bayern Munich                46


In [25]:
print("=== WEEK 3 VERIFICATION ===")

# 1. Competitions check
try:
    print("\n[Competitions]")
    print(competitions_df[['competition_id','competition_name','season_id','season_name']].head(5))
except:
    print("Competitions data not loaded.")

# 2. Matches check
try:
    print("\n[Matches]")
    print(matches_df[['match_id','match_date','home_team.home_team_name','away_team.away_team_name']].head(5))
    print("Total matches loaded:", matches_df.shape[0])
except:
    print("Matches data not loaded.")

# 3. Events check
try:
    print("\n[Events]")
    print("Shape:", events_df.shape)
    print("Unique matches in events:", events_df['match_id'].nunique())
    print("Sample event types:", events_df['type.name'].unique()[:10])
    print("Sample teams:", events_df['team.name'].unique()[:5])
except:
    print("Events data not loaded.")


=== WEEK 3 VERIFICATION ===

[Competitions]
   competition_id        competition_name  season_id season_name
0               9           1. Bundesliga        281   2023/2024
1               9           1. Bundesliga         27   2015/2016
2            1267  African Cup of Nations        107        2023
3              16        Champions League          4   2018/2019
4              16        Champions League          1   2017/2018

[Matches]
   match_id  match_date home_team.home_team_name away_team.away_team_name
0   3895302  2024-04-14         Bayer Leverkusen            Werder Bremen
1   3895292  2024-04-06             Union Berlin         Bayer Leverkusen
2   3895333  2024-05-05      Eintracht Frankfurt         Bayer Leverkusen
3   3895340  2024-05-12                   Bochum         Bayer Leverkusen
4   3895348  2024-05-18         Bayer Leverkusen                 Augsburg
Total matches loaded: 34

[Events]
Shape: (137765, 138)
Events data not loaded.


In [26]:
import json
import pandas as pd
from pathlib import Path

competitions_file = Path(r"C:\Users\M.ANTONY ROJES\Downloads\Infosys\open-data-master\data\competitions.json")
with open(competitions_file, "r", encoding="utf-8") as f:
    competitions = json.load(f)

competitions_df = pd.json_normalize(competitions)


In [27]:
events_root = Path(r"C:\Users\M.ANTONY ROJES\Downloads\Infosys\open-data-master\data\events")

all_events = []
for match in matches_df['match_id'].unique():
    f = events_root / f"{match}.json"
    if f.exists():
        with open(f, "r", encoding="utf-8") as file:
            all_events.extend(json.load(file))

events_df = pd.json_normalize(all_events, sep=".")


In [28]:
print("=== WEEK 4 VERIFICATION ===")

# 1. Passes Completed
try:
    passes_df = events_df[events_df['type.name']=="Pass"].copy()
    passes_df['successful'] = passes_df['pass.outcome.name'].isna()
    passes_completed = (
        passes_df[passes_df['successful']]
        .groupby(['player.name','team.name'], dropna=False)
        .size().reset_index(name='passes_completed')
    )
    print("\n[Passes Completed]")
    print(passes_completed.sort_values('passes_completed', ascending=False).head(5))
except Exception as e:
    print("Passes check failed:", e)


# 2. Expected Goals (xG)
try:
    shots_df = events_df[events_df['type.name']=="Shot"].copy()
    shots_df['shot.statsbomb_xg'] = shots_df['shot.statsbomb_xg'].fillna(0.0)
    xg_by_player = (
        shots_df.groupby(['player.name','team.name'], dropna=False)['shot.statsbomb_xg']
        .sum().reset_index(name='xG')
    )
    print("\n[Expected Goals (xG)]")
    print(xg_by_player.sort_values('xG', ascending=False).head(5))
except Exception as e:
    print("xG check failed:", e)


# 3. Assists
try:
    goals = shots_df[shots_df['shot.outcome.name']=="Goal"][['match_id','id','shot.key_pass_id']].dropna().copy()
    goals['shot.key_pass_id'] = goals['shot.key_pass_id'].astype(str)

    passes_df['id'] = passes_df['id'].astype(str)
    assists = goals.merge(passes_df[['id','player.name','team.name']], left_on='shot.key_pass_id', right_on='id')

    assists_by_player = (
        assists.groupby(['player.name','team.name'])
        .size().reset_index(name='assists')
    )
    print("\n[Assists]")
    print(assists_by_player.sort_values('assists', ascending=False).head(5))
except Exception as e:
    print("Assists check failed:", e)


# 4. Player Summary
try:
    from functools import reduce
    summary = reduce(
        lambda left,right: pd.merge(left, right, on=['player.name','team.name'], how='outer'),
        [passes_completed, xg_by_player, assists_by_player]
    ).fillna(0)

    print("\n[Player Summary]")
    print(summary.sort_values('passes_completed', ascending=False).head(10))
except Exception as e:
    print("Summary check failed:", e)


=== WEEK 4 VERIFICATION ===

[Passes Completed]
                     player.name         team.name  passes_completed
111                 Granit Xhaka  Bayer Leverkusen              3045
150                 Jonathan Tah  Bayer Leverkusen              2057
87   Exequiel Alejandro Palacios  Bayer Leverkusen              1867
72         Edmond Fayçal Tapsoba  Bayer Leverkusen              1783
3      Alejandro Grimaldo García  Bayer Leverkusen              1783

[Expected Goals (xG)]
              player.name         team.name         xG
170  Victor Okoh Boniface  Bayer Leverkusen  15.970215
71       Jeremie Frimpong  Bayer Leverkusen   8.719662
52          Florian Wirtz  Bayer Leverkusen   8.470831
73          Jonas Hofmann  Bayer Leverkusen   5.709160
139         Patrik Schick  Bayer Leverkusen   5.447158
Assists check failed: "['match_id'] not in index"
Summary check failed: name 'assists_by_player' is not defined


In [29]:
print([col for col in shots_df.columns if "match" in col or "id" in col])


['id', 'type.id', 'possession_team.id', 'play_pattern.id', 'team.id', 'player.id', 'position.id', 'pass.recipient.id', 'pass.height.id', 'pass.body_part.id', 'pass.type.id', 'pass.outcome.id', 'ball_receipt.outcome.id', 'dribble.outcome.id', 'duel.type.id', 'duel.outcome.id', 'interception.outcome.id', 'shot.technique.id', 'shot.body_part.id', 'shot.type.id', 'shot.outcome.id', 'goalkeeper.position.id', 'goalkeeper.type.id', 'pass.assisted_shot_id', 'shot.key_pass_id', 'goalkeeper.body_part.id', 'goalkeeper.outcome.id', 'goalkeeper.technique.id', '50_50.outcome.id', 'clearance.body_part.id', 'pass.technique.id', 'foul_committed.card.id', 'substitution.outcome.id', 'substitution.replacement.id', 'bad_behaviour.card.id', 'foul_committed.type.id']


In [30]:
# Goals scored (shots where outcome = Goal)
goals = shots_df[shots_df['shot.outcome.name'] == "Goal"][['id','shot.key_pass_id']].dropna().copy()
goals['shot.key_pass_id'] = goals['shot.key_pass_id'].astype(str)

# Prepare passes with IDs
passes_df['id'] = passes_df['id'].astype(str)

# Merge to find which passes became assists
assists = goals.merge(
    passes_df[['id','player.name','team.name']], 
    left_on='shot.key_pass_id', 
    right_on='id'
)

# Count assists per player
assists_by_player = (
    assists.groupby(['player.name','team.name'])
    .size().reset_index(name='assists')
)

print(assists_by_player.sort_values('assists', ascending=False).head(10))


                       player.name         team.name  assists
1        Alejandro Grimaldo García  Bayer Leverkusen       13
11                   Florian Wirtz  Bayer Leverkusen       10
30            Victor Okoh Boniface  Bayer Leverkusen        8
14                   Jonas Hofmann  Bayer Leverkusen        7
13                Jeremie Frimpong  Bayer Leverkusen        7
2                       Amine Adli  Bayer Leverkusen        5
9      Exequiel Alejandro Palacios  Bayer Leverkusen        4
0                      Adam Hložek  Bayer Leverkusen        2
23                    Nathan Tella  Bayer Leverkusen        2
4   Arthur Augusto de Matos Soares  Bayer Leverkusen        2


In [31]:
from functools import reduce

# Rebuild summary from the three components
summary = reduce(
    lambda left,right: pd.merge(left, right, on=['player.name','team.name'], how='outer'),
    [passes_completed, xg_by_player, assists_by_player]
).fillna(0)

# Now save everything
events_df.to_csv("events_week3.csv", index=False)
passes_completed.to_csv("passes_completed.csv", index=False)
xg_by_player.to_csv("xg_by_player.csv", index=False)
assists_by_player.to_csv("assists_by_player.csv", index=False)
summary.to_csv("player_summary_week4.csv", index=False)

print("✅ All outputs saved successfully.")


✅ All outputs saved successfully.


In [32]:
# Save Week 3 full events dataset
events_df.to_csv("events_week3.csv", index=False)

# Save Week 4 outputs
passes_completed.to_csv("passes_completed.csv", index=False)
xg_by_player.to_csv("xg_by_player.csv", index=False)
assists_by_player.to_csv("assists_by_player.csv", index=False)
summary.to_csv("player_summary_week4.csv", index=False)

print("✅ All outputs saved successfully.")


✅ All outputs saved successfully.


In [33]:
# WEEK 3 - MATCHES + EVENTS
events_root = Path(r"C:\Users\M.ANTONY ROJES\Downloads\Infosys\open-data-master\data\events")

all_events = []
for match in matches_df['match_id'].unique():
    f = events_root / f"{match}.json"
    if f.exists():
        with open(f, "r", encoding="utf-8") as file:
            all_events.extend(json.load(file))

events_df = pd.json_normalize(all_events, sep=".")
print("Events loaded:", events_df.shape)
print(events_df.columns.tolist()[:20])  # sample cols


Events loaded: (137765, 138)
['id', 'index', 'period', 'timestamp', 'minute', 'second', 'possession', 'duration', 'type.id', 'type.name', 'possession_team.id', 'possession_team.name', 'play_pattern.id', 'play_pattern.name', 'team.id', 'team.name', 'tactics.formation', 'tactics.lineup', 'related_events', 'location']


In [34]:
# WEEK 4 - PLAYER PERFORMANCE
from functools import reduce

# Passes Completed
passes_df = events_df[events_df['type.name']=="Pass"].copy()
passes_df['successful'] = passes_df['pass.outcome.name'].isna()
passes_completed = (
    passes_df[passes_df['successful']]
    .groupby(['player.name','team.name'], dropna=False)
    .size().reset_index(name='passes_completed')
)

# Expected Goals
shots_df = events_df[events_df['type.name']=="Shot"].copy()
shots_df['shot.statsbomb_xg'] = shots_df['shot.statsbomb_xg'].fillna(0.0)
xg_by_player = (
    shots_df.groupby(['player.name','team.name'], dropna=False)['shot.statsbomb_xg']
    .sum().reset_index(name='xG')
)

# Assists
goals = shots_df[shots_df['shot.outcome.name']=="Goal"][['id','shot.key_pass_id']].dropna().copy()
goals['shot.key_pass_id'] = goals['shot.key_pass_id'].astype(str)
passes_df['id'] = passes_df['id'].astype(str)
assists = goals.merge(passes_df[['id','player.name','team.name']], left_on='shot.key_pass_id', right_on='id')
assists_by_player = (
    assists.groupby(['player.name','team.name'])
    .size().reset_index(name='assists')
)

# Merge Summary
summary = reduce(
    lambda left,right: pd.merge(left, right, on=['player.name','team.name'], how='outer'),
    [passes_completed, xg_by_player, assists_by_player]
).fillna(0)

print(summary.head())


                 player.name                 team.name  passes_completed  \
0                Adam Hložek          Bayer Leverkusen             131.0   
1                Adrian Beck             FC Heidenheim              28.0   
2              Alassane Pléa  Borussia Mönchengladbach              32.0   
3  Alejandro Grimaldo García          Bayer Leverkusen            1783.0   
4        Aleksandar Pavlović             Bayern Munich              46.0   

         xG  assists  
0  3.010453      2.0  
1  0.000000      0.0  
2  0.023374      0.0  
3  5.209482     13.0  
4  0.000000      0.0  
