In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
df = pd.read_csv('/Users/luisalawson/Downloads/Sample Data/epl.csv', delimiter=';')

In [3]:
def match_outcome(df):
    relevant_df = df[df['description'].isin(['Goal', 'Pass'])]
    matches = relevant_df['match_id'].unique()
    match_data = []

    for match in matches:

        match_df = relevant_df[relevant_df['match_id'] == match].reset_index(drop=True)
        
        home_team_id = match_df['home_team_id'].iloc[0]
        away_team_id = match_df['away_team_id'].iloc[0]
        home_team_name = match_df['home_team_name'].iloc[0]
        away_team_name = match_df['away_team_name'].iloc[0]

        home_score = 0
        away_score = 0

        for i, row in match_df.iterrows():

            team_id = row['team_id']
            
            if team_id == home_team_id and row['description'] == 'Goal' and row['outcome'] == 1:
              
                if i + 1 < len(match_df):
                    next_row = match_df.iloc[i + 1]
                    
                    if next_row['team_id'] == home_team_id:
                        away_score += 1
                    else:
                        home_score += 1
                else:
                    home_score += 1
            elif team_id == away_team_id and row['description'] == 'Goal' and row['outcome'] == 1:
                
                if i + 1 < len(match_df):
                    next_row = match_df.iloc[i + 1]
                    if next_row['team_id'] == away_team_id:
                        home_score += 1
                    else:
                        away_score += 1
                else:
                    away_score += 1

        match_stats = {
            'match_id': match,
            'home_team': home_team_name,
            'away_team': away_team_name,
            'home_score': home_score,
            'away_score': away_score
        }

        match_data.append(match_stats)

    return pd.DataFrame(match_data)

In [8]:
matches_outcome = match_outcome(df)

In [9]:
df['x_float'] = df['x'].str.replace(',', '.').astype(float)
dangerous_area = (70, 100)
df['dangerous_zone'] = df['x_float'].apply(lambda x: 1 if dangerous_area[0] < x < dangerous_area[1] else 0)

In [65]:
# cuando la funcion encuentra algun evento -> pass
skip_events = {
    'Chance missed':None, 
    'Collection End':None, 
    'Cross not claimed':None, 
    'Deleted event':None, 
    'Formation change':None, 
    'Good skill':None, 
    'Team set up':None, 
    'Start':None, 
    'Temp_Attempt':None,
    'Out':1,
    'Clearance':0,
    'Corner awarded': 1,
    'Aerial': 0,
    'Take on':1,
    'Foul':1,
    'Claim':None,
    'Penalty faced':None,
    'Player on': None,
    'Player off': None,
    'Player retired': None,
    'Save':None,
    'Shield ball opp': None,
    'Card': None,
    'Challenge':None,
    'Foul throw-in':None,
    'Offside provoked':None
}

In [66]:
stop_events = {
    'Attempt saved': None,
    'Ball recovery': None,
    'Ball touch': None,
    'Take on': 0,  # Stops play only if outcome == 0
    'Clearance': 1,  # Stops play only if outcome == 1
    'Contentious referee decision': None,
    'Corner awarded': 0,  # Stops play only if outcome == 0
    'Dispossessed': None,
    'Error': None,
    'Foul': 0,  # Stops play only if outcome == 0
    'Goal': None,
    'Interception': None,
    'Keeper pick-up': None,
    'Keeper sweeper': None,
    'Miss': None,
    'Offside pass': None,
    'Out': 0,  # Stops play only if outcome == 0
    'Post': None,
    'Punch': None,
    'Smother': None,
    'Start delay': None,
    'End delay': None,
    'Tackle': None,
    'Pass': 0  # Stops play only if outcome == 0
}

In [107]:
def group_plays(df, skip_events, stop_events):
    
    match_stats_list = []
    unique_matches = df['match_id'].unique()
    # unique_matches = [442374]

    for currentmatch in unique_matches:

        # para que sea mas rapido
        match_df = df[df['match_id'] == currentmatch]

        # me interesa para saber a quien le asigno la jugada (lo hacemos con team_id)
        home_team_id = match_df['home_team_id'].iloc[0]
        away_team_id = match_df['away_team_id'].iloc[0]

        #solo para devolver con el nombre y entender mejor (no es necesario para el analisis)
        home_team_name = match_df['home_team_name'].iloc[0]
        away_team_name = match_df['away_team_name'].iloc[0]

        #inicializamos todo en 0
        home_plays = 0
        away_plays = 0
        home_passes = 0
        away_passes = 0
        home_dangerous_play = 0
        away_dangerous_play = 0

        # antes de arrancar no esta en juego, no hay pases, no hay jugadas peligrosas y tampoco hay equipo asignado
        # variables "mas importantes"
        current_team = None
        in_play = False
        current_passes = 0
        play_in_danger_zone = []

        #arranca desde el 4 porque siempre desde esa row empiezan los eventos reales
        for i, row in match_df.iloc[4:].iterrows():
            #me quedo con el evento xq quiero ver si me sirve esa row o no 
            event = row['description']

            # si es uno de skip pasa a la siguiente fila (continue hace que pase una interacion del for)
            if event in skip_events:
                skip_condition = skip_events[event]
                if skip_condition is None or skip_condition == row['outcome']:
                    #print(f'Event type: {event}, ignored')
                    continue

            # si es un evento de los que frenan jugada es mas complicado
            # primero va a ver si esta en el diccionatrio
            if event in stop_events:
                # ahora agarra cual es la condicion (xq alufnos tienen en cuenta el outcome)
                # para que realmente sea stop
                stop_condition = stop_events[event]
                # si no tiene condicion va a entrar a frenar la jugada
                # si tiene condicion, se va a fijar que la condicion se cumpla
                if stop_condition is None or stop_condition == row['outcome']:
                    # si esta en juego...
                    #print(f'Event type: {event}, stop play')
                    if in_play:
                        # y el que estaba en juego es el home
                        if current_team == home_team_id:
                            #termina la jugada
                            home_plays += 1
                            # suma los pases acumulados a los pases del home
                            #print(f'Home passes current play: {current_passes}')
                            home_passes += current_passes
                            #se fija si hubieron jugadas peligrosas en la jugada
                            #print(f'Home passes total: {home_passes}')
                            if len(play_in_danger_zone)>3:
                                home_dangerous_play += 1
                        #idem para el away (todo lo mismo)
                        elif current_team == away_team_id:
                            away_plays += 1
                            #print(f'Away passes current play: {current_passes}')
                            away_passes += current_passes
                            #print(f'Away passes total: {away_passes}')
                            if len(play_in_danger_zone)>3:
                                away_dangerous_play += 1

                    # una vez que se hizo el update, se va a terminar la jugada
                    # se resetea todo 
                    # se pasa a la siguiente iteracion del for (continue)
                    #print(f'Play ends for team {current_team}')
                    current_team = None
                    in_play = False
                    current_passes = 0
                    play_in_danger_zone = []
                    continue

            # si no paso nada de skip o de stop empieza la jugada -> necesitamos los datos
            # primer dato= quien esta jugando -> lo vemos en team_id
            team_id = row['team_id']
            # segundo dato -> ver si la jugada es peligrosa
            dangerous_area = row['dangerous_zone']
            play_in_danger_zone.append(dangerous_area)

            # si no hay equipo asignado, asigno y le aviso que empieza una jgada (flag de in_play)
            if current_team is None:
                current_team = team_id
                in_play = True
                current_passes = 0

            #si estaba ya jugando, solo me interesa sumar pases ahora
            if current_team == team_id:
                if row['description']=='Pass' and row['outcome'] == 1:
                    current_passes += 1
            #hubo cambio de equipo
            else:
                if in_play:
                    #cierro jugada para el que estaba jugando (misma logica que cuando hay un stop event)
                    if current_team == home_team_id:
                        #termina la jugada
                        home_plays += 1
                        # suma los pases acumulados a los pases del home
                        #print(f'Home passes current play: {current_passes}')
                        home_passes += current_passes
                        #se fija si hubieron jugadas peligrosas en la jugada
                        #print(f'Home passes total: {home_passes}')
                        if len(play_in_danger_zone) >3:
                            home_dangerous_play += 1
                    #idem si el que estaba jugando era el de visitante
                    elif current_team == away_team_id:
                        away_plays += 1
                        #print(f'Away passes current play: {current_passes}')
                        away_passes += current_passes
                        #print(f'Away passes total: {away_passes}')
                        if len(play_in_danger_zone)>3:
                            away_dangerous_play += 1

                # empiezo la nueva jugada
                #print(f'Play ends for team {current_team}')
                current_team = team_id
                #print(f'New play starts for team {current_team}')
                in_play = True
                current_passes = 0
        
            

        match_stats = {
            'match_id': currentmatch,
            'home_team': home_team_name,
            'away_team': away_team_name,
            'home_team_plays': home_plays,
            'away_team_plays': away_plays,
            'home_passes_per_play': round(home_passes / home_plays, 1) if home_plays > 0 else 0,
            'away_passes_per_play': round(away_passes / away_plays, 1) if away_plays > 0 else 0,
            'home_passes': home_passes,
            'away_passes': away_passes,
            'home_dangerous_plays': home_dangerous_play,
            'away_dangerous_plays': away_dangerous_play,
            'home_RPDA': f"{(away_dangerous_play / away_plays):.1%}",
            'away_RPDA': f"{(home_dangerous_play / home_plays):.1%}",
        }

        match_stats_list.append(match_stats)

    return pd.DataFrame(match_stats_list)

In [108]:
full_information_df = group_plays(df, skip_events, stop_events)

In [109]:
full_information_df

Unnamed: 0,match_id,home_team,away_team,home_team_plays,away_team_plays,home_passes_per_play,away_passes_per_play,home_passes,away_passes,home_dangerous_plays,away_dangerous_plays,home_RPDA,away_RPDA
0,441999,Arsenal,Sunderland,152,107,4.4,2.2,663,238,66,26,24.3%,43.4%
1,442000,Everton,Manchester United,128,152,1.7,4.0,222,603,29,61,40.1%,22.7%
2,442001,Fulham,Norwich City,132,112,4.0,3.0,534,333,66,46,41.1%,50.0%
3,442002,Manchester City,Southampton,139,115,4.7,2.9,660,335,67,39,33.9%,48.2%
4,442003,Newcastle United,Tottenham Hotspur,125,129,2.9,2.6,364,332,46,42,32.6%,36.8%
...,...,...,...,...,...,...,...,...,...,...,...,...,...
375,442374,Swansea City,Fulham,155,121,4.1,2.6,633,316,76,35,28.9%,49.0%
376,442375,Tottenham Hotspur,Sunderland,179,109,2.3,1.5,416,166,57,27,24.8%,31.8%
377,442376,West Bromwich Albion,Manchester United,116,135,2.3,4.1,271,551,38,59,43.7%,32.8%
378,442377,West Ham United,Reading,136,139,2.3,2.2,308,310,47,35,25.2%,34.6%


In [110]:
matches_outcome

Unnamed: 0,match_id,home_team,away_team,home_score,away_score
0,441999,Arsenal,Sunderland,0,0
1,442000,Everton,Manchester United,1,0
2,442001,Fulham,Norwich City,5,0
3,442002,Manchester City,Southampton,3,2
4,442003,Newcastle United,Tottenham Hotspur,2,1
...,...,...,...,...,...
375,442374,Swansea City,Fulham,0,3
376,442375,Tottenham Hotspur,Sunderland,1,0
377,442376,West Bromwich Albion,Manchester United,5,5
378,442377,West Ham United,Reading,4,2


In [111]:
complete_df = pd.merge(full_information_df, matches_outcome.drop(columns=['match_id', 'home_team', 'away_team']), left_index=True, right_index=True, how='left')

In [112]:
excel_file_path = '/Users/luisalawson/Downloads/complete_df.xlsx'  

complete_df.to_excel(excel_file_path, index=False)
print(f"DataFrame saved to {excel_file_path}")

DataFrame saved to /Users/luisalawson/Downloads/complete_df.xlsx
