## Data Loading

In [1]:
import json
import pandas as pd

def parse_event_data(events_file_path):
    with open(events_file_path, 'r', encoding='utf-8') as file:
        data = json.load(file)

    events = data

    # Extract the relevant data for our model
    pass_events = []
    shot_events = []

    for event in events:
        if 'type' in event and event['type']['name'] == 'Pass':
            period = event['period']['name'] if isinstance(event['period'], dict) else event['period']
            pass_events.append({
                'id': event['id'],
                'timestamp': event['timestamp'],
                'period': period,
                'duration': event['duration'],
                'possession': event['possession'],
                'team': event['team']['name'],
                'player': event['player']['name'],
                'location': event['location'],
                'pass_end_location': event['pass']['end_location'],
                'pass_height': event['pass']['height']['name'],
                'pass_length': event['pass']['length'],
                'pass_angle': event['pass']['angle'],
                'pass_type': event['pass']['type']['name'] if 'type' in event['pass'] else None,
                'pass_outcome': event['pass']['outcome']['name'] if 'outcome' in event['pass'] else None,
                'pass_body_part': event['pass']['body_part']['name'] if 'body_part' in event['pass'] else None
            })
        elif 'type' in event and event['type']['name'] == 'Shot':
            period = event['period']['name'] if isinstance(event['period'], dict) else event['period']
            shot_events.append({
                'id': event['id'],
                'timestamp': event['timestamp'],
                'period': period,
                'duration': event['duration'],
                'possession': event['possession'],
                'team': event['team']['name'],
                'player': event['player']['name'],
                'location': event['location'],
                'shot_end_location': event['shot']['end_location'],
                'shot_outcome': event['shot']['outcome']['name'] if 'outcome' in event['shot'] else None,
                'shot_type': event['shot']['type']['name'] if 'type' in event['shot'] else None,
                'shot_body_part': event['shot']['body_part']['name'] if 'body_part' in event['shot'] else None,
                'shot_technique': event['shot']['technique']['name'] if 'technique' in event['shot'] else None,
                'shot_first_time': event['shot']['first_time'] if 'first_time' in event['shot'] else None
            })

    pass_df = pd.DataFrame(pass_events)
    shot_df = pd.DataFrame(shot_events)

    return pass_df, shot_df


def merge_data(pass_df, shot_df, lineup_file_path):
    # Load the lineup data
    with open(lineup_file_path, encoding='utf-8') as file:
        lineup_data = json.load(file)

    # Create a dictionary to map player IDs to player names
    players = {}
    for team in lineup_data:
        for player in team['lineup']:
            players[player['player']['id']] = player['player']['name']

    # Replace player IDs with player names in the pass data
    pass_df['player_name'] = pass_df['player_id'].map(players)

    # Replace player IDs with player names in the shot data
    shot_df['player_name'] = shot_df['player_id'].map(players)

    # Extract additional columns from the pass data
    pass_df['pass_length'] = np.sqrt((pass_df['location'].str[0] - pass_df['pass_end_location'].str[0]) ** 2 + (pass_df['location'].str[1] - pass_df['pass_end_location'].str[1]) ** 2)
    pass_df['pass_angle'] = np.arctan2(pass_df['pass_end_location'].str[1] - pass_df['location'].str[1], pass_df['pass_end_location'].str[0] - pass_df['location'].str[0]) * 180 / np.pi
    pass_df['pass_height'] = pass_df['pass_type'].apply(lambda x: 'High' if 'High' in x else ('Low' if 'Low' in x else 'Normal'))

    # Extract additional columns from the shot data
    shot_df['shot_distance'] = np.sqrt((shot_df['location'].str[0] - shot_df['shot_end_location'].str[0]) ** 2 + (shot_df['location'].str[1] - shot_df['shot_end_location'].str[1]) ** 2)
    shot_df['shot_body_part'] = shot_df['shot'].apply(lambda x: x['body_part']['name'] if 'body_part' in x else None)
    shot_df['shot_first_time'] = shot_df['shot'].apply(lambda x: x['first_time'] if 'first_time' in x else None)
    shot_df['shot_technique'] = shot_df['shot'].apply(lambda x: x['technique']['name'] if 'technique' in x else None)

    # Merge the pass and shot data frames on the 'location' column
    merged_df = pd.merge(pass_df, shot_df, on='location', how='outer', suffixes=('_pass', '_shot'))

    return merged_df



In [2]:
from sklearn.preprocessing import StandardScaler

def preprocess_data(merged_df):
    # Convert timestamp to seconds
    merged_df['timestamp'] = pd.to_timedelta(merged_df['timestamp']).dt.total_seconds()

    # Calculate pass angle and length from pass location and end location
    pass_end_x, pass_end_y = zip(*merged_df[merged_df['type_name'] == 'Pass']['pass_end_location'])
    player_x, player_y = zip(*merged_df[merged_df['type_name'] == 'Pass']['location'])
    merged_df.loc[merged_df['type_name'] == 'Pass', 'pass_angle'] = np.degrees(np.arctan2(np.subtract(pass_end_y, player_y), np.subtract(pass_end_x, player_x)))
    merged_df.loc[merged_df['type_name'] == 'Pass', 'pass_length'] = np.sqrt((np.array(pass_end_x) - np.array(player_x)) ** 2 + (np.array(pass_end_y) - np.array(player_y)) ** 2)

    # Calculate shot distance from shot location and end location
    shot_end_x, shot_end_y = zip(*merged_df[merged_df['type_name'] == 'Shot']['shot_end_location'])
    shot_x, shot_y = zip(*merged_df[merged_df['type_name'] == 'Shot']['location'])
    merged_df.loc[merged_df['type_name'] == 'Shot', 'shot_distance'] = np.sqrt((np.array(shot_end_x) - np.array(shot_x)) ** 2 + (np.array(shot_end_y) - np.array(shot_y)) ** 2)

    # Scale the numerical features
    scaler = StandardScaler()
    merged_df[['timestamp', 'pass_length', 'pass_angle']] = scaler.fit_transform(merged_df[['timestamp', 'pass_length', 'pass_angle']])
    merged_df.loc[merged_df['type_name'] == 'Shot', 'timestamp'] = scaler.fit_transform(merged_df[merged_df['type_name'] == 'Shot'][['timestamp']])
    
    return merged_df


In [3]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

def build_model(merged_df):
    # Prepare the data for the model
    X_pass = merged_df.loc[merged_df['type_name'] == 'Pass', ['timestamp', 'location', 'pass_end_location', 'pass_length', 'pass_angle', 'pass_height', 'pass_type']]
    X_shot = merged_df.loc[merged_df['type_name'] == 'Shot', ['timestamp', 'location', 'shot_end_location', 'shot_distance', 'shot_type', 'shot_body_part', 'shot_first_time', 'shot_technique']]
    y_pass = np.zeros(X_pass.shape[0])
    y_shot = np.ones(X_shot.shape[0])
    
    # Combine the pass and shot data
    X = pd.concat([X_pass, X_shot], axis=0)
    y = np.concatenate([y_pass, y_shot])
    
    # Build the model
    model = RandomForestClassifier(n_estimators=100, random_state=42)
    model.fit(X, y)
    
    return model


In [5]:
# Define file paths
events_file_path = './statsbomb json files/g2312152/ManCity_LeicesterCity_events.json'
lineup_file_path = './statsbomb json files/g2312152/ManCity_LeicesterCity_lineups.json'

# Parse the event data
pass_df, shot_df = parse_event_data(events_file_path)

# Merge the event data with the lineup data
merged_df = merge_data(pass_df, shot_df, lineup_file_path)

# Preprocess the data
pass_df, shot_df = preprocess_data(pass_df, shot_df)

# Build the model
model = build_model(merged_df)


KeyError: 'player'