## Statsbomb dataset

In [27]:
from statsbombpy import sb
import pandas as pd
import numpy as np

In [28]:
# Fetch the match list for StatsBomb competition_id=11, season_id=27.
# NOTE(review): presumably La Liga 2015/16, judging by the teams in the outputs below - confirm.
df = sb.matches(competition_id=11, season_id=27)



In [29]:
df.columns

Index(['match_id', 'match_date', 'kick_off', 'competition', 'season',
       'home_team', 'away_team', 'home_score', 'away_score', 'match_status',
       'match_status_360', 'last_updated', 'last_updated_360', 'match_week',
       'competition_stage', 'stadium', 'referee', 'home_managers',
       'away_managers', 'data_version', 'shot_fidelity_version',
       'xy_fidelity_version'],
      dtype='object')

In [4]:
# Distinct team names seen on the home side and on the away side of the match list.
teams1 = df['home_team'].unique()
teams2 = df['away_team'].unique()

In [5]:
# Every team that appears as either home or away side.
teams = set(teams1) | set(teams2)

In [6]:
def get_team_id_matches(team, df):
    """Return the unique ``match_id`` values of the matches involving ``team``.

    Args:
        team: Team name, compared against the ``home_team`` and ``away_team`` columns.
        df: Matches frame with ``match_id``, ``home_team`` and ``away_team`` columns.

    Returns:
        Array of unique match ids, in order of first appearance.
    """
    played_home = df['home_team'].eq(team)
    played_away = df['away_team'].eq(team)
    return df.loc[played_home | played_away, 'match_id'].unique()

In [7]:
def get_players(team, df):
    """Collect the names of all players listed in ``team``'s lineups.

    One ``sb.lineups`` call is made per match returned by
    ``get_team_id_matches(team, df)``; the ``player_name`` values of the
    team's lineup are merged across matches.

    Args:
        team: Team name, also used as the key into the ``sb.lineups`` result.
        df: Matches frame passed through to ``get_team_id_matches``.

    Returns:
        Set of player names. The set is also printed before it is returned.
    """
    roster = set()
    for match_id in get_team_id_matches(team, df):
        team_lineup = sb.lineups(match_id=match_id)[team]
        roster |= set(team_lineup['player_name'].unique())
    print(roster)
    return roster

In [9]:
# Maps 'lineup_<team>' to the set of player names returned by get_players.
lineups = {}

for team in teams:
    # The key is registered (with an empty set) before get_players runs.
    lineups.setdefault(f'lineup_{team}', set()).update(get_players(team, df=df))

{'Manuel Agudo Durán', 'Jonathan Castro Otto', 'Claudio Beauvue', 'Nemanja Radoja', "Théo Bongonda Mbul''Ofeko Batombo", 'Gustavo Daniel Cabral Cáceres', 'Rubén Blanco Veiga', 'Sergio Álvarez Conde', 'Josep Señé Escudero', 'Lévy Clément Madinda', 'Sergi Gómez Solà', 'Pedro Pablo Hernández', 'Andreu Fontàs Prat', 'Borja Iglesias Quintas', 'Hugo Mallo Novegil', 'Iago Aspas Juncal', 'Pape Cheikh Diop Gueye', 'Fabián Ariel Orellana Valenzuela', 'Diego Alende López', 'Dejan Dražić', 'Daniel Wass', 'David Goldar Gómez', 'Néstor Díaz García', 'Carles Planas Antolínez', 'Augusto Matías Fernández', 'Borja Fernández Fernández', 'John Guidetti', 'Marcelo Alfonso Díaz Rojas'}
{'Asier Illarramendi Andonegi', 'Sergio Canales Madrazo', 'Aritz Elustondo Irribaria', 'Íñigo Martínez Berridi', 'Álvaro Odriozola Arzallus', 'Rubén Pardo Gutiérrez', 'Markel Bergara Larrañaga', 'Carlos Martínez Díez', 'Armindo Tué Na Bangna', 'Esteban Félix Granero Molina', 'Jon Ansotegi Gorostola', 'Mikel González de Martín

KeyboardInterrupt: 

In [None]:
def get_events(team_ids):
    """Download and stack the event logs of the given matches.

    Args:
        team_ids: Iterable of match ids; each one is passed to ``sb.events``.

    Returns:
        A single DataFrame with the rows of every match in input order and a
        fresh 0..n-1 index. An empty DataFrame when ``team_ids`` is empty.
    """
    # Gather the per-match frames first and concatenate once: calling
    # pd.concat inside the loop re-copies the accumulated frame every time.
    frames = [sb.events(match_id=match_id) for match_id in team_ids]
    if not frames:
        # pd.concat raises on an empty list; keep the empty-frame result.
        return pd.DataFrame()
    return pd.concat(frames, axis=0, ignore_index=True)

In [None]:
def calculate_passes_ij(player1, player2, events_df):
    """Share of ``player1``'s passes whose recipient is ``player2``.

    Args:
        player1: Name matched against the ``player`` column (the passer).
        player2: Name matched against the ``pass_recipient`` column.
        events_df: Events frame with ``type``, ``player`` and
            ``pass_recipient`` columns.

    Returns:
        passes(player1 -> player2) / passes(player1), counting only rows
        whose ``type`` is 'Pass'; 0 when ``player1`` has no pass rows.
    """
    passes = events_df.loc[events_df['type'] == 'Pass']
    made_by_player1 = passes['player'] == player1
    received_by_player2 = passes['pass_recipient'] == player2

    passes_i = int(made_by_player1.sum())
    if passes_i <= 0:
        return 0

    passes_ij = int((made_by_player1 & received_by_player2).sum())
    return passes_ij / passes_i

In [None]:
def calculate_pass_matrix(team, df, lineups, team_events):
    """Build the player-to-player pass-proportion matrix of ``team``.

    Entry ``[i, j]`` is the number of 'Pass' rows from player i to player j
    divided by the total number of 'Pass' rows made by player i. The
    diagonal, and every pair without a recorded pass, stays 0.

    Args:
        team: Team name; the players are read from ``lineups[f'lineup_{team}']``.
        df: Unused; kept so existing calls keep working.
        lineups: Dict mapping ``'lineup_<team>'`` to a collection of player names.
        team_events: Events frame with ``type``, ``player`` and
            ``pass_recipient`` columns.

    Returns:
        Tuple ``(pass_matrix, player_index_map)`` where ``player_index_map``
        maps each player name to its row/column index in ``pass_matrix``.
    """
    # Freeze the iteration order once so the matrix and the map always agree.
    players = list(lineups[f'lineup_{team}'])
    num_players = len(players)
    pass_matrix = np.zeros((num_players, num_players))
    player_index_map = {player: i for i, player in enumerate(players)}

    print(f"Building pass matrix for '{team}'...")

    # Count everything in one pass over the pass rows instead of re-filtering
    # the whole events frame for each ordered pair of players.
    passes = team_events[team_events['type'] == 'Pass']
    passes_made = passes['player'].value_counts()
    # groupby drops rows with a missing passer or recipient, so passes without
    # a recipient only count towards the passer's total (the denominator).
    pair_counts = passes.groupby(['player', 'pass_recipient']).size()

    for (passer, recipient), n_passes in pair_counts.items():
        i = player_index_map.get(passer)
        j = player_index_map.get(recipient)
        if i is None or j is None or i == j:
            # Player outside this lineup, or a self-pass: entry stays 0.
            continue
        pass_matrix[i, j] = n_passes / passes_made.loc[passer]

    print("Pass matrix construction complete.")

    return pass_matrix, player_index_map

In [128]:
import warnings
# Suppress UserWarning messages raised from the statsbombpy module so they do not clutter the output
warnings.filterwarnings("ignore", category=UserWarning, module='statsbombpy')

In [129]:
# Per-team results, keyed by team name: the pass matrix and the
# player-name -> matrix-index map that goes with it.
pass_matrices = {}
player_index_map = {}

In [131]:

# For every team: find its matches, download their events, then build the
# pass matrix together with its player index map.
for team in teams:
    team_ids = get_team_id_matches(team, df)
    team_events = get_events(team_ids)
    pass_matrices[team], player_index_map[team] = calculate_pass_matrix(team, df, lineups, team_events)

Building pass matrix for 'Espanyol'...
Pass matrix construction complete.
Building pass matrix for 'Málaga'...
Pass matrix construction complete.
Building pass matrix for 'Barcelona'...
Pass matrix construction complete.
Building pass matrix for 'Getafe'...
Pass matrix construction complete.
Building pass matrix for 'Granada'...
Pass matrix construction complete.
Building pass matrix for 'RC Deportivo La Coruña'...
Pass matrix construction complete.
Building pass matrix for 'Eibar'...
Pass matrix construction complete.
Building pass matrix for 'Villarreal'...
Pass matrix construction complete.
Building pass matrix for 'Celta Vigo'...
Pass matrix construction complete.
Building pass matrix for 'Valencia'...
Pass matrix construction complete.
Building pass matrix for 'Real Betis'...
Pass matrix construction complete.
Building pass matrix for 'Sevilla'...
Pass matrix construction complete.
Building pass matrix for 'Atlético Madrid'...
Pass matrix construction complete.
Building pass matri

In [132]:
def pass_effectiveness(matrix, player_index_maps):
    """Summarise each player's pass matrix row as a single score.

    The score of a player is the sum of that player's row in the team's
    pass matrix.

    Args:
        matrix: Dict mapping team name to a square pass matrix.
        player_index_maps: Dict mapping team name to a
            ``{player_name: row_index}`` dict for that team's matrix.

    Returns:
        DataFrame with columns ``team``, ``player_name`` and
        ``pass_effectiveness``, one row per matrix row.

    Raises:
        KeyError: If a team has no index map or a row index has no player name.
    """
    records = []

    for team, pass_matrix in matrix.items():
        # Invert the name -> index map once per team instead of scanning it
        # for every row; setdefault keeps the first name seen for an index.
        index_to_name = {}
        for name, idx in player_index_maps[team].items():
            index_to_name.setdefault(idx, name)

        for i in range(len(pass_matrix)):
            records.append({
                'team': team,
                'player_name': index_to_name[i],
                'pass_effectiveness': np.sum(pass_matrix[i]),
            })

    return pd.DataFrame(records, columns=['team', 'player_name', 'pass_effectiveness'])

In [133]:
# One row per (team, player) with the row sum of that team's pass matrix.
df_pass_effectiveness = pass_effectiveness(pass_matrices, player_index_map)

In [134]:
df_pass_effectiveness

Unnamed: 0,team,player_name,pass_effectiveness
0,Espanyol,Antonio José Raillo Arenas,0.938053
1,Espanyol,Jorge Franco Alviz,0.851163
2,Espanyol,Salvador Sevilla López,0.924699
3,Espanyol,Mickaël Ciani,0.885417
4,Espanyol,Juan Rafael Fuentes Hernández,0.939828
...,...,...,...
607,Real Sociedad,Igor Zubeldia Elorza,1.000000
608,Real Sociedad,Alberto De La Bella Madureño,0.882426
609,Real Sociedad,Jonathas Cristian de Jesus,0.875000
610,Real Sociedad,Sergio Canales Madrazo,0.880626


In [135]:
# Highest pass_effectiveness first.
df_pass_effectiveness_sorted = df_pass_effectiveness.sort_values(by='pass_effectiveness', ascending=False)

In [145]:
# Keep only the rows whose score is strictly between 0 and 1.
# NOTE(review): the displayed result still shows rows printed as 1.000000 - presumably
# row sums marginally below 1.0 that round up when printed; confirm.
df_pass_effectiveness_sorted[(df_pass_effectiveness_sorted['pass_effectiveness'] < 1.0) & 
                             (df_pass_effectiveness_sorted['pass_effectiveness'] > 0.0)]


Unnamed: 0,team,player_name,pass_effectiveness
504,Real Madrid,Marcos Llorente Moreno,1.000000
253,Celta Vigo,Diego Alende López,1.000000
564,Rayo Vallecano,Luis Carlos Fariña Olivera,1.000000
223,Villarreal,Rodrigo Hernández Cascante,0.986667
86,Barcelona,Arda Turan,0.982372
...,...,...,...
350,Sevilla,Gaël Kakuta,0.533333
267,Valencia,Carlos Carbonell Gil,0.500000
43,Málaga,Hachim Mastour,0.500000
392,Atlético Madrid,Guilherme Magdalena Siqueira,0.428571


## Paper dataset

In [32]:
import pandas as pd
import openpyxl
import numpy as np

In [33]:
# Reading the workbook straight from Excel gave problems with the properties, so it was converted to CSV first.
# NOTE(review): machine-specific absolute path; the file is ';'-delimited.
df = pd.read_csv('/Users/luisalawson/Downloads/Sample Data/epl.csv', delimiter=';')

In [34]:
# Overview table: one row per column of df with its dtype.
df_prop = pd.DataFrame({
    'column_name': df.columns,
    'data_type': df.dtypes.values
})

In [35]:
# Overview table: for each column of df, the set of values it contains and the size of that set.
df_vals = pd.DataFrame({
    'column_name': df.columns,
    'column_values': [set(df[col].tolist()) for col in df.columns],
    'unique_value_count': [len(set(df[col].tolist())) for col in df.columns]
})

In [36]:
def get_team_id_matches(team, df):
    """Return the unique ``match_id`` values of the matches involving ``team``.

    Args:
        team: Team name, compared against ``home_team_name`` and ``away_team_name``.
        df: Frame with ``match_id``, ``home_team_name`` and ``away_team_name`` columns.

    Returns:
        Array of unique match ids, in order of first appearance.
    """
    is_home_side = df['home_team_name'].eq(team)
    is_away_side = df['away_team_name'].eq(team)
    involved = df.loc[is_home_side | is_away_side, 'match_id']
    return involved.unique()

In [37]:
def get_lineups(team_id, df):
    """Return the set of ``playerName`` values on rows belonging to ``team_id``.

    Args:
        team_id: Value compared against the ``team_id`` column.
        df: Event frame with ``team_id`` and ``playerName`` columns.

    Returns:
        Set of player names; empty when no row matches ``team_id``.
    """
    # Vectorised selection instead of a row-by-row iterrows() loop.
    # NOTE(review): rows with a missing playerName, if any, still contribute
    # their missing value to the set, exactly as the loop did.
    return set(df.loc[df['team_id'] == team_id, 'playerName'])

In [38]:
def build_team_info(df):
    """Index the event frame by team name.

    Args:
        df: Event frame with ``home_team_name``, ``away_team_name``,
            ``home_team_id`` and ``away_team_id`` columns, plus whatever
            ``get_team_id_matches`` and ``get_lineups`` read.

    Returns:
        Dict mapping each team name to a dict with keys:
            'team_id': id taken from the team's first home row, or from its
                first away row when it has no home row (None if neither).
            'matches': result of ``get_team_id_matches(team_name, df)``.
            'lineups': result of ``get_lineups(team_id, df)``.
    """
    team_info = {}

    # Team names are the dictionary keys.
    team_names = set(df['home_team_name']).union(df['away_team_name'])

    for team_name in team_names:
        home_ids = df.loc[df['home_team_name'] == team_name, 'home_team_id']
        away_ids = df.loc[df['away_team_name'] == team_name, 'away_team_id']

        # Prefer the id from home rows; fall back to away rows. Resolving the
        # id from scratch on every iteration means a name that matches no row
        # cannot inherit the previous team's id.
        candidate_ids = away_ids if home_ids.empty else home_ids
        team_id = candidate_ids.iloc[0] if not candidate_ids.empty else None

        team_info[team_name] = {
            'team_id': team_id,
            'matches': get_team_id_matches(team_name, df),
            'lineups': get_lineups(team_id, df)
        }

    return team_info

In [39]:
# Per-team lookup: team id, match ids and player names, keyed by team name.
team_info = build_team_info(df)

In [40]:
teams1 = df['home_team_name'].unique()
teams2 = df['away_team_name'].unique()

# Union of the home and away sides. The previous expression combined teams2
# with itself, so a team that only ever appears as home side was dropped.
teams = set(teams1) | set(teams2)

In [41]:
df['description'].value_counts()

description
Pass                            367734
Out                              48526
Ball recovery                    34529
Clearance                        26242
Aerial                           25228
Foul                             17124
Ball touch                       16773
Tackle                           14539
Take On                          11877
Interception                     11823
Deleted event                     9786
Dispossessed                      9073
Corner Awarded                    8478
Attempt Saved                     5254
Keeper pick-up                    5249
Save                              5210
Challenge                         5136
Miss                              4115
End                               2280
Player on                         2035
Player off                        2035
Offside provoked                  1736
Offside Pass                      1736
Start                             1520
Card                              1251
Start delay  

In [42]:
def game_outcome(df):
    """Aggregate the event log into one summary row per match.

    Args:
        df: Event frame with ``match_id``, ``home_team_name``,
            ``away_team_name``, ``home_team_id``, ``away_team_id``, ``date``,
            ``team_id``, ``description`` and ``outcome`` columns.

    Returns:
        DataFrame with one row per match, in order of first appearance:
            match_id, date, home_team, away_team: taken from the match's first row.
            home_score / away_score: sum of ``outcome`` over the side's 'Goal' rows.
            home_shots / away_shots: number of the side's rows whose
                description is in ``shot_categories``.
            home_possesion / away_possession: number of rows attributed to
                the side (an event count, not a time share).
            home_PSL / away_PSL: shots divided by that event count.
            home_PSL_dif / away_PSL_dif: signed difference between the two PSLs.
    """
    # NOTE(review): the original comments marked 'Goal' and 'Out' with a "(?";
    # 'Out' is the second most frequent description in the data, so it may not
    # be a shot category at all - confirm before relying on the shot counts.
    shot_categories = ['Attempt Saved', 'Miss', 'Goal', 'Out']
    match_outcomes = []

    # One grouped pass over the frame instead of re-filtering all rows for
    # every match id; sort=False keeps the order in which matches appear.
    for _, df_match in df.groupby('match_id', sort=False):
        match = df_match['match_id'].iloc[0]

        # These are expected to be identical on every row of the match.
        home_team = df_match['home_team_name'].iloc[0]
        away_team = df_match['away_team_name'].iloc[0]
        home_team_id = df_match['home_team_id'].iloc[0]
        away_team_id = df_match['away_team_id'].iloc[0]
        date = df_match['date'].iloc[0]

        is_home = df_match['team_id'] == home_team_id
        is_away = df_match['team_id'] == away_team_id
        is_goal = df_match['description'] == "Goal"
        is_shot = df_match['description'].isin(shot_categories)

        # A goal is credited to the team of the row's team_id, so own goals
        # are not separated from regular goals here.
        home_score = df_match.loc[is_home & is_goal, 'outcome'].sum()
        away_score = df_match.loc[is_away & is_goal, 'outcome'].sum()

        home_shots = (is_home & is_shot).sum()
        away_shots = (is_away & is_shot).sum()

        # "Possession" is approximated by the number of events of each side.
        home_possession = is_home.sum()
        away_possession = is_away.sum()

        home_PSL = home_shots / home_possession
        away_PSL = away_shots / away_possession

        match_outcomes.append({
            'match_id': match,
            'date': date,
            'home_team': home_team,
            'away_team': away_team,
            'home_score': home_score,
            'away_score': away_score,
            'home_shots': home_shots,
            'away_shots': away_shots,
            # NOTE(review): 'home_possesion' is misspelled (cf. 'away_possession');
            # kept as is because it is the name of an output column.
            'home_possesion': home_possession,
            'away_possession': away_possession,
            'home_PSL': home_PSL,
            'away_PSL': away_PSL,
            'home_PSL_dif': home_PSL - away_PSL,
            'away_PSL_dif': away_PSL - home_PSL
        })

    return pd.DataFrame(match_outcomes)

In [43]:
match_results_df = game_outcome(df)
# Known inaccuracy: own goals are not distinguished from regular goals, so some scores are wrong.
# Example: 19May2013, Wigan Athletic - Aston Villa is reported as 1-3 but actually ended 2-2,
# because an Aston Villa player scored an own goal in minute 45.

match_results_df

Unnamed: 0,match_id,date,home_team,away_team,home_score,away_score,home_shots,away_shots,home_possesion,away_possession,home_PSL,away_PSL,home_PSL_dif,away_PSL_dif
0,441999,18Aug2012,Arsenal,Sunderland,0,0,78,59,1104,684,0.070652,0.086257,-0.015605,0.015605
1,442000,20Aug2012,Everton,Manchester United,1,0,79,77,731,1081,0.108071,0.071230,0.036841,-0.036841
2,442001,18Aug2012,Fulham,Norwich City,5,0,65,59,918,676,0.070806,0.087278,-0.016472,0.016472
3,442002,19Aug2012,Manchester City,Southampton,3,2,78,66,1051,741,0.074215,0.089069,-0.014854,0.014854
4,442003,18Aug2012,Newcastle United,Tottenham Hotspur,2,1,65,71,795,769,0.081761,0.092328,-0.010567,0.010567
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
375,442374,19May2013,Swansea City,Fulham,0,3,71,52,1056,778,0.067235,0.066838,0.000397,-0.000397
376,442375,19May2013,Tottenham Hotspur,Sunderland,1,0,110,83,956,678,0.115063,0.122419,-0.007356,0.007356
377,442376,19May2013,West Bromwich Albion,Manchester United,6,4,74,69,659,955,0.112291,0.072251,0.040040,-0.040040
378,442377,19May2013,West Ham United,Reading,4,2,98,95,782,767,0.125320,0.123859,0.001461,-0.001461


In [44]:
# Export the per-match summary to an Excel workbook (index column omitted).
# NOTE(review): machine-specific absolute path.
excel_file_path = '/Users/luisalawson/Downloads/Sample Data/match_results.xlsx'  # Replace with your desired file path

match_results_df.to_excel(excel_file_path, index=False)

print(f"DataFrame saved to {excel_file_path}")


DataFrame saved to /Users/luisalawson/Downloads/Sample Data/match_results.xlsx


In [22]:
def shared_time(player1, player2, match_id, df = df):
    """Minutes that two players spent on the pitch together in one match.

    Time on the pitch is reconstructed from the event log:
      * a player with a 'Player on' row starts at that row's minute,
        otherwise at the minute of the match's first 'Start' row;
      * a player with a 'Player off' row ends at that row's minute,
        otherwise at the largest minute recorded for the match.
    When a player has several such rows, the last one in the frame is used.

    Args:
        player1: Player name as it appears in the ``playerName`` column.
        player2: Player name as it appears in the ``playerName`` column.
        match_id: Value of ``match_id`` selecting the match.
        df: Event frame with ``match_id``, ``playerName``, ``description``
            and ``min`` columns. NOTE: the default is bound to whatever
            ``df`` was when this cell was executed, not at call time.

    Returns:
        0 when either player has no row in the match; the overlap in minutes
        when the two intervals overlap; None when both appear but never overlap.

    Raises:
        IndexError: If both players appear but the match has no 'Start' row.
    """
    df_match = df[df['match_id'] == match_id]

    players = df_match['playerName'].unique().tolist()
    if player1 not in players or player2 not in players:
        return 0

    match_start = df_match[df_match['description'] == 'Start']['min'].iloc[0]
    match_end = df_match['min'].max()

    def _last_minute(description, player, fallback):
        # Minute of the player's last row with this description, else fallback.
        minutes = df_match.loc[
            (df_match['description'] == description) & (df_match['playerName'] == player),
            'min',
        ]
        return minutes.iloc[-1] if len(minutes) > 0 else fallback

    # Vectorised lookups replace the row-by-row scan of the whole match.
    player1_start = _last_minute('Player on', player1, match_start)
    player2_start = _last_minute('Player on', player2, match_start)
    player1_end = _last_minute('Player off', player1, match_end)
    player2_end = _last_minute('Player off', player2, match_end)

    # Intersection of the two [start, end] intervals.
    shared_start_time = max(player1_start, player2_start)
    shared_end_time = min(player1_end, player2_end)
    if shared_end_time > shared_start_time:
        return shared_end_time - shared_start_time
    return None

In [25]:
def time_shared_total(player1,player2,team_matches):
    """Total shared time of two players across several matches.

    Calls ``shared_time(player1, player2, match)`` for every entry of
    ``team_matches`` and adds up the results, skipping matches for which it
    returns None.

    Returns:
        The accumulated total; 0 when nothing was added.
    """
    per_match = (shared_time(player1, player2, match) for match in team_matches)
    return sum(minutes for minutes in per_match if minutes is not None)

In [29]:
def calculate_time_matrix(team_name, lineups, team_events):
    """Build the symmetric matrix of shared playing time for one team.

    Entry ``[i, j]`` is ``time_shared_total(player_i, player_j, team_events)``;
    the diagonal stays 0.

    Args:
        team_name: Only used in the progress messages.
        lineups: Collection of player names for the team.
        team_events: Passed unchanged to ``time_shared_total`` as its list of
            matches (the notebook passes the team's match ids here).

    Returns:
        Tuple ``(time_matrix, player_index_map)`` where ``player_index_map``
        maps each player name to its row/column index.
    """
    # Freeze the order once so the matrix and the returned map always agree.
    players = list(lineups)
    num_players = len(players)
    time_matrix = np.zeros((num_players, num_players))

    print(f"Building time matrix for '{team_name}'...")

    # The matrix is treated as symmetric (the previous version already copied
    # [j, i] into [i, j]), so compute each unordered pair exactly once and
    # mirror it. The old "!= 0" check recomputed every pair whose total was 0.
    for i, player1 in enumerate(players):
        for j in range(i + 1, num_players):
            player2 = players[j]
            if player1 != player2:
                shared_total = time_shared_total(player1, player2, team_events)
                time_matrix[i, j] = shared_total
                time_matrix[j, i] = shared_total

    print("Time matrix construction complete.")

    return time_matrix, {player: idx for idx, player in enumerate(players)}

In [30]:
# Per-team results, keyed by team name: the shared-time matrix and its player index map.
# NOTE: this rebinds player_index_map, which the StatsBomb section above also uses.
time_matrices = {}
player_index_map = {}

In [31]:
# Build the shared-time matrix of every team from its player names and match ids.
for team in teams:
   time_matrices[team], player_index_map[team] = calculate_time_matrix(team, team_info[team]['lineups'], team_info[team]['matches'])

Building time matrix for 'Norwich City'...
Time matrix construction complete.
Building time matrix for 'Swansea City'...


KeyboardInterrupt: 

In [33]:
df.columns

Index(['season_id', 'match_id', 'home_team_id', 'home_team_name',
       'away_team_id', 'away_team_name', 'id', 'event_id', 'date', 'time',
       'period_id', 'min', 'sec', 'team_id', 'player_id', 'playerName',
       'playerPosition', 'x', 'y', 'type', 'description', 'outcome'],
      dtype='object')

In [None]:
def calculate_passes_ij(player1, player2, df = df):
    """Share of ``player1``'s passes whose recipient is ``player2``.

    Counts rows with ``description == 'Pass'`` and ``playerName == player1``;
    the numerator further requires ``pass_recipient == player2``. Returns 0
    when ``player1`` has no pass rows.

    NOTE(review): the ``df.columns`` listing shown in this notebook has no
    'pass_recipient' column, so this lookup would presumably raise KeyError
    on that frame - confirm how the recipient should be derived.
    NOTE(review): this redefines the ``calculate_passes_ij`` used in the
    StatsBomb section, with a different third parameter; the default for
    ``df`` is bound when this cell is executed, not at call time.
    """
    # Estimates the probability of a pass from player 1 to player 2.
    # Relevant events: passes only.
    pass_events = df[df['description']=='Pass']
    
    # Relevant passes: made by player1 (and, for the numerator, received by player2).
    pass_events_ij = pass_events[(pass_events['playerName'] == player1) & (pass_events['pass_recipient'] == player2)]
    total_i_pass = pass_events[pass_events['playerName']==player1]

    # Row counts of the two selections.
    passes_ij = len(pass_events_ij)
    passes_i = len(total_i_pass)


    return passes_ij/passes_i if passes_i >0 else 0