In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
#problema con las propiedades leyendolo directo del excel -> por eso lo pasamos a csv
df = pd.read_csv('/Users/luisalawson/Downloads/Sample Data/epl.csv', delimiter=';')

In [3]:
df_prop = pd.DataFrame({
    'column_name': df.columns,
    'data_type': df.dtypes.values
})

In [4]:
df_vals = pd.DataFrame({
    'column_name': df.columns,
    'column_values': [set(df[col].tolist()) for col in df.columns],
    'unique_value_count': [len(set(df[col].tolist())) for col in df.columns]
})

In [5]:
def get_team_id_matches(team, df):
    idvalues = df[(df['home_team_name'] == team) | (df['away_team_name'] == team)]['match_id'].unique()
    return idvalues

In [6]:
def get_lineups(team_id, df):
    df_team = df[df['team_id'] == team_id] 
    lineups = set()
    for index, row in df_team.iterrows():
        lineups.add(row['playerName'])
        
    return lineups

In [7]:
def build_team_info(df):
    team_info = {}

    # to organize the info for a better access

    # we will use the team name as keys
    team_names = set(df['home_team_name']).union(df['away_team_name'])
    team_ids = set(df['home_team_id']).union(df['away_team_id'])

    # fill our dict
    for team_name in team_names:
        
        # gather team id data
        if (df['home_team_name'] == team_name).any():
            team_ids_for_name = df[df['home_team_name'] == team_name]['home_team_id'].unique()
        elif (df['away_team_name'] == team_name).any():
            team_ids_for_name = df[df['away_team_name'] == team_name]['away_team_id'].unique()
        
        team_id = team_ids_for_name[0] if len(team_ids_for_name) > 0 else None
        
        team_info[team_name] = {
            'team_id': team_id,
            'matches': get_team_id_matches(team_name, df),
            'lineups': get_lineups(team_id, df)
        }

    return team_info

In [8]:
def calculate_plays(df, home_team, away_team):
   
    home_plays = 0
    away_plays = 0
    

    home_current = False
    away_current = False

    for index, row in df.iterrows():
        
        if index > 4:  # the match starts always on row 5
            
            team_play = row['team_id']
            situation = row['description']
            
            if situation == 'End':
                return home_plays, away_plays
            
            if team_play == home_team and not home_current:
                home_plays +=1
                home_current = True
                away_current = False
            
            elif team_play== away_team and not away_current:
                away_plays +=1
                away_current = True
                home_current = False
            
            else:
                pass

        
    return home_plays, away_plays

In [9]:
def calculate_dangerous_zone(df, home_team, away_team):

    home_team_side_1st = [0,50]
    away_team_side_1st = [50.1,100]

    home_team_side_2nd = [50.1,100]
    away_team_side_2nd = [0,50]

    home_team_dangerous_plays = 0
    away_team_dangerous_plays = 0

    home_current = False
    away_current = False


    for index, row in df.iterrows():
        
        if index > 4:  # the match starts always on row 5
            
            team_play = row['team_id']
            situation = row['description']
            period = row['period_id']
            x = row['x']
            x_value = x.replace(',', '.')
            
            if situation == 'End':
                return home_team_dangerous_plays, away_team_dangerous_plays
            
            if team_play == home_team and not home_current and period == 1 and (away_team_side_1st[0]+25)<= float(x_value) <=(away_team_side_1st[1]):
                home_team_dangerous_plays +=1
                home_current = True
                away_current = False
            elif team_play == home_team and not home_current and period == 2 and (away_team_side_2nd[0])<= float(x_value) <=(away_team_side_2nd[1] - 25):
                home_team_dangerous_plays +=1
                home_current = True
                away_current = False
            if team_play == away_team and not away_current and period == 1 and (home_team_side_1st[0])<= float(x_value) <=(home_team_side_1st[1] - 25):
                away_team_dangerous_plays +=1
                away_current = True
                home_current = False
            elif team_play == away_team and not away_current and period == 2 and (home_team_side_2nd[0]+ 25)<= float(x_value) <=(home_team_side_1st[1]):
                away_team_dangerous_plays +=1
                away_current = True
                home_current = False
            else:
                pass

        
    return home_team_dangerous_plays, away_team_dangerous_plays


In [10]:
def game_outcome(df):
    match_id_values = df['match_id'].unique()
    match_outcomes = []
    shot_categories = ['Attempt Saved', 'Miss', 'Goal', 'Out']
    defensive_events_1 = ['Tackle', 'Interception', 'Clearance', 'Blocked Pass', 'Ball Recovery'] 
    defensive_events_0 = ['Tackle', 'Failed to Block'] 
    
    for match in match_id_values:
        
        df_match = df[df['match_id'] == match]
        
        # should be the same teams on every row
        home_team = df_match['home_team_name'].iloc[0]
        away_team = df_match['away_team_name'].iloc[0]

        # useful to know which player scored (to add goal to that team) 
        home_team_id = df_match['home_team_id'].iloc[0]
        away_team_id = df_match['away_team_id'].iloc[0]

        # same as home_team
        date = df_match['date'].iloc[0]
        
        # goal calculation
        home_score = df_match[(df_match['team_id'] == home_team_id) & (df_match['description'] == "Goal")]['outcome'].sum()
        away_score = df_match[(df_match['team_id'] == away_team_id) & (df_match['description'] == "Goal")]['outcome'].sum()

        #shot calculation
        # attempt saved in description 
        # miss
        # goal (?
        # out(?
        home_shots = ((df_match['team_id'] == home_team_id) & (df_match['description'].isin(shot_categories)) & (df_match['outcome'] == 1)).sum()
        away_shots = ((df_match['team_id'] == away_team_id) & (df_match['description'].isin(shot_categories) & (df_match['outcome'] == 1))).sum()

        #possession calculation
        #home_possession = (df_match['team_id'] == home_team_id).sum()
        #away_possession = (df_match['team_id'] == away_team_id).sum()

        home_possession, away_possession = calculate_plays(df_match, home_team_id, away_team_id)

        # Defense calculation

        total_defenses_home = ((df_match['team_id'] == home_team_id) & ((df_match['description'].isin(defensive_events_0)) | (df_match['description'].isin(defensive_events_1)))).sum()
        total_defenses_away = ((df_match['team_id'] == away_team_id) & ((df_match['description'].isin(defensive_events_0)) | (df_match['description'].isin(defensive_events_1)))).sum()

        succesfull_defense_home = ((df_match['team_id'] == home_team_id) & ( (  (df_match['description'].isin(defensive_events_0)) & (df_match['outcome'] == 0) ) | (  (df_match['description'].isin(defensive_events_1)) & (df_match['outcome'] == 1) ))).sum()
        succesfull_defense_away = ((df_match['team_id'] == away_team_id) & ( (  (df_match['description'].isin(defensive_events_0)) & (df_match['outcome'] == 0) ) | (  (df_match['description'].isin(defensive_events_1)) & (df_match['outcome'] == 1) ))).sum()

        #PSL calculation
        home_PSL = home_shots/home_possession
        away_PSL = away_shots/away_possession

        #PSL diff
        home_PSL_dif = home_PSL - away_PSL
        away_PSL_dif = away_PSL - home_PSL

        # Calculo DC
        dc_home = succesfull_defense_home / total_defenses_home
        dc_away = succesfull_defense_away / total_defenses_away

        dc_dif = abs(dc_home- dc_away)

        dc_accurate = 0
    
        if(home_score > away_score and dc_home > dc_away):
            dc_accurate += 1
        elif(home_score < away_score and dc_home < dc_away):
            dc_accurate += 1
        elif(home_score == away_score and dc_dif < 0.1):
            dc_accurate += 1
        else:
            dc_accurate += 0

        #calculate RPDA

        home_dangerous_possession, away_dangerous_possession = calculate_dangerous_zone(df_match,home_team_id,away_team_id)

        home_RPDA = home_dangerous_possession
        away_RPDA = away_dangerous_possession

        # recover all match
        match_outcomes.append({
            'match_id': match,
            'date': date,
            'home_team': home_team,
            'away_team': away_team,
            'home_score': home_score,
            'away_score': away_score,
            'home_shots':home_shots,
            'away_shots':away_shots,
            'home_possesion': home_possession,
            'away_possession': away_possession,
            'home_PSL':home_PSL,
            'away_PSL': away_PSL,
            'home_PSL_dif': home_PSL_dif,
            'away_PSL_dif': away_PSL_dif,
            'dc_home': dc_home,
            'dc_away': dc_away,
            'dc_dif': dc_dif,
            'dc_accuracy': dc_accurate,
            'rpda_home':home_RPDA,
            'rpda_away':away_RPDA
        })
    
    
    outcome_df = pd.DataFrame(match_outcomes)
    
    return outcome_df  

In [11]:
match_results_df = game_outcome(df)
# not accurate --> we found og (own goals) are not distinguished from sg (successful goals)
# eg. 19May2013	Wigan Athletic- Aston Villa	--> 1- 3 was actually 2-2 for an aston villa player scored an og in the minute 45

match_results_df

Unnamed: 0,match_id,date,home_team,away_team,home_score,away_score,home_shots,away_shots,home_possesion,away_possession,home_PSL,away_PSL,home_PSL_dif,away_PSL_dif,dc_home,dc_away,dc_dif,dc_accuracy,rpda_home,rpda_away
0,441999,18Aug2012,Arsenal,Sunderland,0,0,46,36,150,149,0.306667,0.241611,0.065056,-0.065056,0.977778,0.862069,0.115709,0,47,46
1,442000,20Aug2012,Everton,Manchester United,1,0,47,46,163,162,0.288344,0.283951,0.004393,-0.004393,0.756410,0.771930,0.015520,0,45,46
2,442001,18Aug2012,Fulham,Norwich City,5,0,42,32,131,131,0.320611,0.244275,0.076336,-0.076336,0.727273,0.796296,0.069024,0,38,39
3,442002,19Aug2012,Manchester City,Southampton,3,2,52,36,127,127,0.409449,0.283465,0.125984,-0.125984,0.811321,0.760000,0.051321,1,42,43
4,442003,18Aug2012,Newcastle United,Tottenham Hotspur,2,1,42,36,165,165,0.254545,0.218182,0.036364,-0.036364,0.709677,0.682540,0.027138,1,24,25
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
375,442374,19May2013,Swansea City,Fulham,0,3,49,30,129,129,0.379845,0.232558,0.147287,-0.147287,0.979592,0.703704,0.275888,0,40,41
376,442375,19May2013,Tottenham Hotspur,Sunderland,1,0,71,47,167,166,0.425150,0.283133,0.142017,-0.142017,0.852941,0.737500,0.115441,1,52,53
377,442376,19May2013,West Bromwich Albion,Manchester United,6,4,43,46,117,117,0.367521,0.393162,-0.025641,0.025641,0.804878,0.862745,0.057867,0,35,36
378,442377,19May2013,West Ham United,Reading,4,2,63,52,153,154,0.411765,0.337662,0.074102,-0.074102,0.818182,0.872727,0.054545,0,40,41


In [12]:
excel_file_path = '/Users/luisalawson/Downloads/match_results.xlsx'  

match_results_df.to_excel(excel_file_path, index=False)

print(f"DataFrame saved to {excel_file_path}")

DataFrame saved to /Users/luisalawson/Downloads/match_results.xlsx


In [13]:
match_results_df['dc_accuracy'].sum()

np.int64(196)

In [14]:
match_results_df['dc_away'].min()

np.float64(0.5434782608695652)

In [15]:
teams1 = df['home_team_name'].unique()
teams2 = df['away_team_name'].unique()

teams = set(teams2) | set(teams2)

In [16]:
# Calculate averages
home_dc_avg = match_results_df.groupby(['home_team', 'away_team'])['dc_home'].mean().reset_index()
away_dc_avg = match_results_df.groupby(['away_team', 'home_team'])['dc_away'].mean().reset_index()

In [17]:
# Initialize dictionary to store average opponent dc
average_opponents_dc = {}

# Get unique teams
teams = match_results_df['home_team'].unique()

# Calculate average opponents' mean_dc for each team
for team in teams:
    # Find opponents for this team
    opponents = home_dc_avg[home_dc_avg['home_team'] == team]['away_team'].tolist()
    
    # Filter away_dc_avg for opponents and calculate mean dc
    opponents_dc = away_dc_avg[away_dc_avg['away_team'].isin(opponents)]['dc_away']
    
    # Store average opponent dc
    average_opponents_dc[team] = opponents_dc.mean()

In [18]:
average_opponents_dc

{'Arsenal': np.float64(0.7796827989775985),
 'Everton': np.float64(0.7815826108653073),
 'Fulham': np.float64(0.7808680679415381),
 'Manchester City': np.float64(0.7808908106335143),
 'Newcastle United': np.float64(0.7830521427995245),
 'Queens Park Rangers': np.float64(0.7795731005328362),
 'Reading': np.float64(0.7810886270361524),
 'West Bromwich Albion': np.float64(0.7808601388564708),
 'West Ham United': np.float64(0.7822008884531649),
 'Wigan Athletic': np.float64(0.7786241977578361),
 'Aston Villa': np.float64(0.7815822374605327),
 'Chelsea': np.float64(0.778915121774588),
 'Liverpool': np.float64(0.7799492337439509),
 'Manchester United': np.float64(0.7801864969322005),
 'Norwich City': np.float64(0.78038803600119),
 'Southampton': np.float64(0.7788788415606799),
 'Stoke City': np.float64(0.7791849455256461),
 'Sunderland': np.float64(0.7812029817538991),
 'Swansea City': np.float64(0.7823200790202749),
 'Tottenham Hotspur': np.float64(0.7827620798029945)}

In [None]:
for team in teams:
    
    home_matches = home_dc_avg[home_dc_avg['home_team'] == team]
    away_matches = away_dc_avg[away_dc_avg['away_team'] == team]

    plt.figure(figsize=(10, 5))

    # Plot home_dc
    plt.plot(home_matches['away_team'], home_matches['dc_home'], color='palevioletred', marker='o', label='home_dc')

    # Plot away_dc
    plt.plot(away_matches['home_team'], away_matches['dc_away'], color='darkcyan', marker='o', label='away_dc')
    
    
    plt.title(f'{team} DC Values')
    plt.xlabel('Opponent Team')
    plt.ylabel('DC Value')
    plt.legend()
    plt.xticks(range(len(home_matches)), home_matches['away_team'], rotation=90)
    plt.tight_layout()
    plt.ylim(0.5, 1.0)

    
    plt.show()  # 
    #  
    #plt.savefig(f'{team}_dc_values.png')