In [32]:
import os

def print_directory_contents(path):
    for item in os.listdir(path):
        if os.path.isdir(os.path.join(path, item)):
            print(f"\n- {item}/")
            print_directory_contents(os.path.join(path, item))
        else:
            print(f"  - {item}")

# Use "." to refer to the current directory
#print_directory_contents(".")


## Data Loading

In [79]:
import json
import pandas as pd

def parse_event_data(events_file_path):
    with open(events_file_path, 'r', encoding='utf-8') as file:
        data = json.load(file)

    events = data

    # Extract the relevant data for our model
    pass_events = []
    shot_events = []

    for event in events:
        if 'type' in event and event['type']['name'] == 'Pass':
            period = event['period']['name'] if isinstance(event['period'], dict) else event['period']
            pass_events.append({
                'id': event['id'],
                'timestamp': event['timestamp'],
                'period': period,
                'duration': event['duration'],
                'possession': event['possession'],
                'team': event['team']['name'],
                'player': event['player']['name'],
                'location': event['location'],
                'pass_end_location': event['pass']['end_location'],
                'pass_height': event['pass']['height']['name'],
                'pass_length': event['pass']['length'],
                'pass_angle': event['pass']['angle'],
                'pass_type': event['pass']['type']['name'] if 'type' in event['pass'] else None,
                'pass_outcome': event['pass']['outcome']['name'] if 'outcome' in event['pass'] else None,
                'pass_body_part': event['pass']['body_part']['name'] if 'body_part' in event['pass'] else None
            })
        elif 'type' in event and event['type']['name'] == 'Shot':
            period = event['period']['name'] if isinstance(event['period'], dict) else event['period']
            shot_events.append({
                'id': event['id'],
                'timestamp': event['timestamp'],
                'period': period,
                'duration': event['duration'],
                'possession': event['possession'],
                'team': event['team']['name'],
                'player': event['player']['name'],
                'location': event['location'],
                'shot_end_location': event['shot']['end_location'],
                'shot_outcome': event['shot']['outcome']['name'] if 'outcome' in event['shot'] else None,
                'shot_type': event['shot']['type']['name'] if 'type' in event['shot'] else None,
                'shot_body_part': event['shot']['body_part']['name'] if 'body_part' in event['shot'] else None,
                'shot_technique': event['shot']['technique']['name'] if 'technique' in event['shot'] else None,
                'shot_first_time': event['shot']['first_time'] if 'first_time' in event['shot'] else None
            })

    pass_df = pd.DataFrame(pass_events)
    shot_df = pd.DataFrame(shot_events)

    return pass_df, shot_df


# Define function to merge event data with lineup data
# def merge_data(pass_df, shot_df, lineup_file_path):
#     # Load lineup data from JSON file
#     with open(lineup_file_path, 'r', encoding='utf-8') as file:
#         data = json.load(file)

#     # Create lineup DataFrame
#     players = []
#     for team in data:
#         for player in team['lineup']:
#             player_dict = {
#                 'player_name': None,
#                 'position_name': None,
#             }
#             if 'player' in player:
#                 player_dict['player_name'] = player['player']['name']
#             if 'position' in player:
#                 player_dict['position_name'] = player['position']['name']
#             players.append(player_dict)
#     lineup_df = pd.DataFrame(players)

#     # Merge pass data with lineup data
#     pass_df = pd.merge(pass_df, lineup_df[['player_name', 'position_name']], left_on='player', right_on='player_name', how='left')
#     pass_df.drop('player_name', axis=1, inplace=True)

#     # Merge shot data with lineup data
#     shot_df = pd.merge(shot_df, lineup_df[['player_name', 'position_name']], left_on='player', right_on='player_name', how='left')
#     shot_df.drop('player_name', axis=1, inplace=True)

#     return pass_df, shot_df



In [80]:
def merge_data(pass_df, shot_df, lineup_file_path):
    # Load lineup data from JSON file
    with open(lineup_file_path, 'r', encoding='utf-8') as file:
        data = json.load(file)

    # Create lineup DataFrame
    players = []
    for team in data:
        for player in team['lineup']:
            player_dict = {
                'player_name': None,
                'position_name': None,
            }
            if 'player' in player:
                player_dict['player_name'] = player['player']['name']
            if 'position' in player:
                player_dict['position_name'] = player['position']['name']
            players.append(player_dict)
    lineup_df = pd.DataFrame(players)

    # Merge pass data with lineup data
    pass_df = pd.merge(pass_df, lineup_df[['player_name', 'position_name']], left_on='player', right_on='player_name', how='left')
    pass_df.drop('player_name', axis=1, inplace=True)

    # Merge shot data with lineup data
    shot_df = pd.merge(shot_df, lineup_df[['player_name', 'position_name']], left_on='player', right_on='player_name', how='left')
    shot_df.drop('player_name', axis=1, inplace=True)

    # Extract additional columns from the shot data
    shot_df['shot_distance'] = np.sqrt((shot_df['location'].str[0] - shot_df['shot_end_location'].str[0]) ** 2 + (shot_df['location'].str[1] - shot_df['shot_end_location'].str[1]) ** 2)
    shot_df['shot_body_part'] = shot_df['shot'].apply(lambda x: x['body_part']['name'] if 'body_part' in x else None)
    shot_df['shot_first_time'] = shot_df['shot'].apply(lambda x: x['first_time'] if 'first_time' in x else None)
    shot_df['shot_technique'] = shot_df['shot'].apply(lambda x: x['technique']['name'] if 'technique' in x else None)

    return pass_df, shot_df



In [81]:
import numpy as np
from sklearn.preprocessing import StandardScaler
import math

# Define function to preprocess the data
def preprocess_data(pass_df, shot_df):
    # Convert timestamp to seconds
    pass_df['timestamp'] = pd.to_timedelta(pass_df['timestamp']).dt.total_seconds()
    shot_df['timestamp'] = pd.to_timedelta(shot_df['timestamp']).dt.total_seconds()

    # Calculate pass angle from the pass end location and the player's location
    pass_end_x, pass_end_y = zip(*pass_df['pass_end_location'])
    player_x, player_y = zip(*pass_df['location'])
    pass_df['pass_angle'] = np.degrees(np.arctan2(np.subtract(pass_end_y, player_y), np.subtract(pass_end_x, player_x)))

    # Calculate pass length from the player's location and the pass end location
    pass_df['pass_length'] = np.sqrt((np.array(pass_end_x) - np.array(player_x)) ** 2 + (np.array(pass_end_y) - np.array(player_y)) ** 2)

    # Scale the numerical features
    scaler = StandardScaler()
    pass_df[['timestamp', 'pass_length', 'pass_angle']] = scaler.fit_transform(pass_df[['timestamp', 'pass_length', 'pass_angle']])
    shot_df[['timestamp']] = scaler.fit_transform(shot_df[['timestamp']])
    shot_df['shot_distance'] = np.sqrt((shot_df['location'].str[0] - shot_df['shot_end_location'].str[0]) ** 2 + (shot_df['location'].str[1] - shot_df['shot_end_location'].str[1]) ** 2)

    return pass_df, shot_df

In [83]:
# Test parse_event_data function
pass_df, shot_df = parse_event_data('./statsbomb json files/g2312152/ManCity_LeicesterCity_events.json')
print(pass_df.head())
print(shot_df.head())

                                     id     timestamp  period  duration  \
0  f86a49e6-2d11-4e89-8c54-a736720a8b0d  00:00:00.793       1  1.106021   
1  43c872ae-0e27-4a49-8462-342bef278580  00:00:02.565       1  1.307097   
2  d814f6ad-99ef-489a-9040-4e903deca237  00:00:04.474       1  1.219923   
3  b32b2820-3fe9-4435-842d-f3200df60088  00:00:06.664       1  1.480582   
4  abced20e-c44a-447d-92a0-e5028cbcfeed  00:00:09.179       1  2.694161   

   possession                 team                 player      location  \
0           2  Manchester City WFC    Khadija Monifa Shaw  [60.0, 40.0]   
1           2  Manchester City WFC           Yui Hasegawa  [48.4, 39.8]   
2           2  Manchester City WFC  Laia Aleixandri López  [35.5, 49.2]   
3           2  Manchester City WFC         Alex Greenwood  [35.3, 27.1]   
4           2  Manchester City WFC       Esme Beth Morgan   [36.2, 7.1]   

  pass_end_location  pass_height  pass_length  pass_angle pass_type  \
0      [47.9, 38.5]  Ground

In [84]:
# Test merge_data function
pass_df, shot_df = merge_data(pass_df, shot_df, './statsbomb json files/g2312152/ManCity_LeicesterCity_lineups.json')
print(pass_df.head())
print(shot_df.head())

KeyError: 'shot'

In [None]:
# Test preprocess_data function
pass_df, shot_df = preprocess_data(pass_df[['timestamp', 'location', 'pass_end_location', 'pass_height', 'pass_type']], shot_df[['timestamp', 'location', 'shot_end_location', 'shot_type']])
print(pass_df.head())
print(shot_df.head())

## Logistic Regression Model

In [74]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

def build_model(pass_df, shot_df):
    # Prepare the data for the model
    X_pass = pass_df[['timestamp', 'location', 'pass_end_location', 'pass_length', 'pass_angle', 'pass_height', 'pass_type']]
    X_shot = shot_df[['timestamp', 'location', 'shot_end_location', 'shot_distance', 'shot_type', 'shot_body_part', 'shot_first_time', 'shot_technique']]
    y_pass = np.zeros(len(pass_df))
    y_shot = np.ones(len(shot_df))
    
    # Split the data into training and testing sets
    X_pass_train, X_pass_test, y_pass_train, y_pass_test = train_test_split(X_pass, y_pass, test_size=0.2, random_state=42)
    X_shot_train, X_shot_test, y_shot_train, y_shot_test = train_test_split(X_shot, y_shot, test_size=0.2, random_state=42)

    # Combine the pass and shot data
    X_train = pd.concat([X_pass_train, X_shot_train])
    y_train = np.concatenate([y_pass_train, y_shot_train])
    X_test = pd.concat([X_pass_test, X_shot_test])
    y_test = np.concatenate([y_pass_test, y_shot_test])

    # Scale the numerical features
    scaler = StandardScaler()
    X_train[['timestamp', 'pass_length', 'pass_angle', 'shot_distance']] = scaler.fit_transform(X_train[['timestamp', 'pass_length', 'pass_angle', 'shot_distance']])
    X_test[['timestamp', 'pass_length', 'pass_angle', 'shot_distance']] = scaler.transform(X_test[['timestamp', 'pass_length', 'pass_angle', 'shot_distance']])

    # One-hot encode the categorical features
    X_train = pd.get_dummies(X_train, columns=['pass_height', 'pass_type', 'shot_type', 'shot_body_part', 'shot_first_time', 'shot_technique'])
    X_test = pd.get_dummies(X_test, columns=['pass_height', 'pass_type', 'shot_type', 'shot_body_part', 'shot_first_time', 'shot_technique'])

    # Train the model
    model = LogisticRegression(random_state=42)
    model.fit(X_train, y_train)

    # Evaluate the model
    train_score = model.score(X_train, y_train)
    test_score = model.score(X_test, y_test)

    print(f"Training score: {train_score}")
    print(f"Testing score: {test_score}")

    return model

In [78]:
model = build_model(pass_df, shot_df)

KeyError: "['shot_body_part', 'shot_first_time', 'shot_technique'] not in index"