# Match Data Club

### 1. Import and append the raw data files

In [16]:
import os
import json
import pandas as pd
import numpy as np

In [2]:
# collect the names of the folders that sit directly under ./matches
# (the first tuple yielded by os.walk describes the top-level folder itself;
# its second element is the list of sub-directory names)
top_level = next(os.walk('./matches'))
match_ids = top_level[1]
match_ids[:5]

['3829698', '3829690', '3797481', '3797410', '3797419']

In [3]:
# combine match info to 1 dataset 'matches'
# (one dictionary per match, parsed from ./matches/<match_id>/info.json)
matches = []

for match_id in match_ids:
    info_path = os.path.join('./matches', match_id, 'info.json')

    # the `with` block closes the file as soon as it has been parsed, so
    # no file handle stays open per match; the encoding is set explicitly
    # so the result does not depend on the platform's default encoding
    with open(info_path, encoding='utf-8') as f:
        info = json.load(f)

    matches.append(info)

# preview only; 'matches' itself stays a list of dictionaries
pd.DataFrame(matches).head(2)

Unnamed: 0,match_id,match_date,kick_off,competition,season,home_team,away_team,home_score,away_score,attendance,...,match_status,match_status_360,last_updated,last_updated_360,metadata,match_week,competition_stage,stadium,referee,match_date_utc
0,3829698,2022-05-22,18:30:00,"{'competition_id': 46, 'country_name': 'Belgiu...","{'season_id': 108, 'season_name': '2021/2022'}","{'home_team_id': 977, 'home_team_name': 'Club ...","{'away_team_id': 986, 'away_team_name': 'RSC A...",1,1,,...,available,available,2022-12-01T19:17:39.296860,2022-08-04T12:00:00,"{'data_version': '1.1.0', 'shot_fidelity_versi...",40,"{'id': 69, 'name': 'Championship Round'}","{'id': 508, 'name': 'Jan Breydelstadion', 'cou...","{'id': 1584, 'name': 'Lothar D''hondt', 'count...",2022-05-22T18:30:00+02:00
1,3829690,2022-05-11,20:30:00,"{'competition_id': 46, 'country_name': 'Belgiu...","{'season_id': 108, 'season_name': '2021/2022'}","{'home_team_id': 977, 'home_team_name': 'Club ...","{'away_team_id': 6193, 'away_team_name': 'Unio...",1,0,,...,available,available,2022-12-01T19:10:31.419785,2022-08-04T12:00:00,"{'data_version': '1.1.0', 'shot_fidelity_versi...",38,"{'id': 69, 'name': 'Championship Round'}","{'id': 508, 'name': 'Jan Breydelstadion', 'cou...","{'id': 773, 'name': 'Erik Lambrechts', 'countr...",2022-05-11T20:30:00+02:00


In [17]:
# combine match stats to 1 dataset 'stats'
stats = []

for match_id in match_ids:
    stats_path = os.path.join('./matches', match_id, 'stats.json')

    # get the match stats as a list of JSON records; the `with` block
    # closes the file right after parsing, and the encoding is explicit
    # so the result does not depend on the platform's default encoding
    with open(stats_path, encoding='utf-8') as f:
        match_stats = json.load(f)

    # extend in place: `stats = stats + match_stats` would copy the whole
    # accumulated list again on every iteration
    stats.extend(match_stats)

# delete all attributes that are always null
stats = pd.DataFrame(stats).dropna(axis=1, how='all')
# swap the remaining NaN values for None
# (presumably so they are written as JSON null when 'stats' is dumped later)
stats = stats.replace({np.nan: None})
stats.head(5)

Unnamed: 0,match_id,team_id,team_name,player_id,player_name,player_match_minutes,player_match_np_xg_per_shot,player_match_np_xg,player_match_np_shots,player_match_goals,...,player_match_obv_defensive_action,player_match_obv_dribble_carry,player_match_obv_gk,player_match_deep_completions,player_match_ball_recoveries,player_match_np_psxg,player_match_penalties_faced,player_match_penalties_conceded,player_match_fhalf_ball_recoveries,player_match_360_minutes
0,3829698,977,Club Brugge,18740,Brandon Mechele,94.3,0.492484,0.492484,1,0,...,0.228259,0.011091,,0,9,0.0,0,0,2,94.3
1,3829698,986,RSC Anderlecht,23665,Lior Refaelov,21.3,,0.0,0,0,...,,-0.185701,,1,0,0.0,0,0,0,21.3
2,3829698,986,RSC Anderlecht,3588,Wesley Hoedt,94.3,,0.0,0,0,...,-0.010454,0.039687,,0,13,0.0,0,0,2,94.3
3,3829698,977,Club Brugge,12259,Denis Odoi,94.3,,0.0,0,0,...,0.021306,0.063976,,0,10,0.0,0,0,3,94.3
4,3829698,977,Club Brugge,16281,Hans Vanaken,94.3,0.103047,0.103047,1,0,...,0.01678,0.029325,,1,9,0.0,0,0,4,94.3


In [18]:
# turn the DataFrame back into a list holding one {column: value} dict per row
stats = stats.to_dict(orient='records')

### 2. Improve field naming

In [43]:
# inspect the home team record of the first match as it comes out of info.json
matches[0]['home_team']

{'home_team_id': 977,
 'home_team_name': 'Club Brugge',
 'home_team_gender': 'male',
 'home_team_youth': False,
 'home_team_group': None,
 'country': {'id': 22, 'name': 'Belgium'},
 'managers': [{'id': 2768,
   'name': 'Alfred Schreuder',
   'nickname': None,
   'dob': '1972-11-02',
   'country': {'id': 160, 'name': 'Netherlands'}}]}

In [44]:
# Drop the redundant 'home_team_' / 'away_team_' prefix from the keys of each
# team record, e.g. 'home_team_id' -> 'id'. Keys without the prefix are left
# untouched.
for m in matches:
    for prefix in ['home_team', 'away_team']:
        team = m[prefix]
        key_prefix = f'{prefix}_'

        # Select with startswith(), not a substring test: removeprefix() only
        # strips at the start of the key, so a key that merely *contains* the
        # prefix would be copied onto itself and then deleted.
        # The keys are snapshotted in a list because the dict is modified
        # while we loop.
        for key in [k for k in team if k.startswith(key_prefix)]:
            team[key.removeprefix(key_prefix)] = team.pop(key)

In [45]:
# display the home team record of the first match again to check its keys
matches[0]['home_team']

{'country': {'id': 22, 'name': 'Belgium'},
 'managers': [{'id': 2768,
   'name': 'Alfred Schreuder',
   'nickname': None,
   'dob': '1972-11-02',
   'country': {'id': 160, 'name': 'Netherlands'}}],
 'id': 977,
 'name': 'Club Brugge',
 'gender': 'male',
 'youth': False,
 'group': None}

### 3. Output 'matches' and 'stats'

In [46]:
# write 'matches' to ./output/matches.json
# create the target folder first so the cell also runs when ./output
# does not exist yet (open() would raise FileNotFoundError otherwise)
os.makedirs('./output', exist_ok=True)
with open('./output/matches.json', 'w', encoding='utf-8') as f:
    json.dump(matches, f, indent=4)

In [19]:
# write 'stats' to ./output/stats.json
# create the target folder first so the cell also runs when ./output
# does not exist yet (open() would raise FileNotFoundError otherwise)
os.makedirs('./output', exist_ok=True)
with open('./output/stats.json', 'w', encoding='utf-8') as f:
    json.dump(stats, f, indent=4)