In [142]:
import glob
import json
import math
from pathlib import Path

import duckdb as db
import numpy as np
import pandas as pd

In [143]:
# Per-event detail columns of the raw event frame; they are removed before any
# counting or scoring happens.
item_list = [
    'pass',
    'carry',
    'ball_receipt',
    'under_pressure',
    'duel',
    'counterpress',
    'interception',
    'off_camera',
    'ball_recovery',
    'shot',
    'goalkeeper',
    'clearance',
    'block',
    'dribble',
    'foul_committed',
    'foul_won',
    'half_start',
    'miscontrol',
    'bad_behaviour',
    'substitution',
    '50_50',
    'injury_stoppage',
    'player_off',
]

# Weight given to each event type name when event counts are turned into
# probabilities. A weight of 0 removes the event type from the entropy sums.
event_weight_dict = {
    'Starting XI': 0,
    'Half Start': 0,
    'Pass': 0.05,
    'Ball Receipt*': 0.2,
    'Carry': 0.1,
    'Duel': 1,
    'Ball Recovery': 0.5,
    'Pressure': 0,
    'Miscontrol': 0.1,
    'Block': 0.5,
    'Interception': 1.5,
    'Shot': 4,
    'Goal Keeper': 0,
    'Clearance': 0,
    'Dispossessed': 1,
    'Dribble': 0,
    'Foul Committed': 0.5,
    'Foul Won': 0.8,
    'Dribbled Past': 2,
    'Bad Behaviour': 1,
    'Half End': 0,
    'Substitution': 0,
    'Tactical Shift': 0,
    'Error': 1,
    'Shield': 1,
    'Referee Ball-Drop': 0,
    'Injury Stoppage': 0,
    '50/50': 0,
    'Camera On*': 0,
    'Offside': 0,
    'Own Goal Against': 0,
    'Own Goal For': 0,
    'Player On': 0,
    'Player Off': 0,
    'Camera On': 0,
    'Camera off': 0,
}

# Everything dropped from the event frame: the detail columns above plus four
# further columns that the entropy calculation does not read.
drop_list = [*item_list, 'tactics', 'player', 'position', 'location']


In [144]:
'''
Weightings (the numbers are the values currently set in event_weight_dict)
NOTE(review): some rationales below were written for earlier weights; re-check
the wording against the current values.
Half Start: 0 (no relevance to the outcome)
Pass: 0.05 most common event
Ball Receipt*: 0.2 successful pass
Carry: 0.1 controlling the ball while moving to one spot
Duel: 1 50-50 between two players of opposite teams (the winner is the team that has possession on the next row)
Ball Recovery: 0.5 attempt to recover the ball (only occurs when a team keeps possession after a shot)
Pressure: 0 providing pressure to the player with the ball (event does not directly lead to any outcome)
Miscontrol: 0.1 player loses possession of the ball (to the opposite team; does not always lead to a change of possession)
Block: 0.5 blocked shot, credited to the blocking team
Interception: 1.5 taking another team's pass
Shot: 4 most important event
Goal Keeper: 0 action done by the goalkeeper
Clearance: 0 no control in clearances
Dispossessed: 1 ball taken away
Dribble: 0 just an attempt; a successful one shows up as Dribbled Past, an unsuccessful one as a loss of possession
Foul Committed: 0.5 to the team the foul was committed on
Foul Won: 0.8 to the team that won it; not always a win when it follows a Foul Committed
          (original note: "that's why less", written when the weight was 0.1)
Dribbled Past: 2 went past a player
Bad Behaviour: 1 card, credited to the other team
Half End: 0 no bearing
Substitution: 0
Tactical Shift: 0 no way of telling if this helps either team
Error: 1 leads to a shot by the team not committing the error
Shield: 1 indicates the shielding team is ahead in a close game
Referee Ball-Drop: 0
Injury Stoppage: 0
50/50: 0 no outcome known
Camera On*: 0
Offside: 0
Own Goal Against: 0 can't look at goals
Own Goal For: 0 can't look at goals
Player On: 0
Player Off: 0
'''
# Events credited to the acting team.
#
# The original list was missing a comma after 'Dribbled Past', so Python
# concatenated it with the next literal into the single bogus entry
# 'Dribbled PastClearance'; neither real event type was in the list.
# 'Clearance' is restored here. 'Dribbled Past' is intentionally NOT listed:
# the event is recorded for the player who was beaten, and the scoring loop
# has a dedicated 'Dribbled Past' branch keyed on possession_team. Listing it
# here would send it through the generic positive branch instead, crediting
# the beaten player's team.
positive_events = ['Pass', 'Carry', 'Ball Receipt*',
       'Interception', 'Ball Recovery', 'Shot',
       'Clearance', 'Block', 'Dribble', 'Foul Won', 'Shield']
# Events credited to the opponent of the acting team.
negative_events = ['Foul Committed', 'Miscontrol', 'Bad Behaviour','Dispossessed', 'Error']
# Events whose beneficiary is only known from the following row.
special_events = ['Duel']
# Positive events that stay with the acting team even when it is not the
# team recorded as being in possession.
self_positive_events = ['Ball Recovery', 'Carry', 'Pass', 'Ball Receipt*', 'Foul Won']

In [150]:

# Schema of the per-match summary table: one row per match, filled in by the
# processing loop and finally written to CSV.
_summary_identity = ['match_id', 'home_team', 'away_team']
_summary_flows = ['t1_t1_entropy', 't2_t2_entropy', 't1_t2_entropy', 't2_t1_entropy']
_summary_totals = ['total_entropy', 'team1_entropy_segment', 'team2_entropy_segment', 'SEI']

combine_final_entropy = pd.DataFrame(
    columns=_summary_identity + _summary_flows + _summary_totals
)
# Display the (still empty) frame to confirm the column layout.
combine_final_entropy

Unnamed: 0,match_id,home_team,away_team,t1_t1_entropy,t2_t2_entropy,t1_t2_entropy,t2_t1_entropy,total_entropy,team1_entropy_segment,team2_entropy_segment,SEI


In [151]:
# Columns of the per-match summary written to all_matches.csv.
SUMMARY_COLUMNS = ['match_id', 'home_team', 'away_team', 't1_t1_entropy',
                   't2_t2_entropy', 't1_t2_entropy',
                   't2_t1_entropy', 'total_entropy', 'team1_entropy_segment',
                   'team2_entropy_segment', 'SEI']

# Event columns reduced to their display name. They are assumed to hold
# {'id': ..., 'name': ...} dicts (the previous string-splitting code relied on
# exactly that layout) -- TODO confirm against the data source.
NAMED_COLUMNS = ['type', 'possession_team', 'play_pattern', 'team', 'player', 'position']

# Running entropy accumulators. 'tX_tY' collects entropy flowing from team X
# to team Y, i.e. events whose outcome benefits team Y.
FLOW_KEYS = ['t1_t1', 't2_t2', 't1_t2', 't2_t1']


def _name_of(value):
    """Return value['name'] for a dict cell and NaN for anything else (e.g. a missing cell)."""
    return value.get('name', np.nan) if isinstance(value, dict) else np.nan


def _entropy_term(prob):
    """Return the Shannon entropy contribution -p*log2(p) of one event; prob must be > 0."""
    return -(prob * math.log2(prob))


def _load_events(path):
    """Read one match's event JSON and replace the nested name columns by plain names.

    Reading the 'name' key directly replaces the former str(dict).split(':')
    parsing, which broke as soon as a name contained a colon.
    Raises KeyError if one of NAMED_COLUMNS is absent, as the original did.
    """
    events = pd.read_json(path)
    for column in NAMED_COLUMNS:
        events[column] = events[column].map(_name_of)
    return events


def _attach_probabilities(events):
    """Drop the detail columns and add 'indv_prob', the weighted probability of a single event.

    Per event type: value = count * weight; total_prob = value / sum(value);
    indv_prob = total_prob / count. Types missing from event_weight_dict get
    NaN, zero-weight types get 0.
    """
    slim = events.drop(columns=drop_list, errors='ignore')

    probs = slim.groupby('type')['id'].count().reset_index()
    probs['weight'] = probs['type'].map(event_weight_dict)
    probs['value'] = probs['id'] * probs['weight']
    total = probs['value'].sum()
    probs['total_prob'] = probs['value'] / total
    probs['indv_prob'] = probs['total_prob'] / probs['id']

    # A pandas left merge keeps the left frame's row order. The former DuckDB
    # LEFT JOIN gave no such guarantee, yet the duel handling below depends on
    # each event being followed by the event that really came next.
    return slim.merge(probs[['type', 'indv_prob']], on='type', how='left',
                      validate='many_to_one')


def _accumulate_entropy(match_df, team1, team2):
    """Walk the events in order and accumulate the four entropy flows.

    Routing, first match wins:
      * indv_prob == 0: ignored.
      * 'Dribbled Past': credited to the team in possession (the dribbler's team).
      * positive event by the team in possession: that team's own flow.
      * negative event: flow towards the opponent of the acting team.
      * self-positive event by the team not in possession: that team's own flow.
      * other positive event by the team not in possession: flow from the
        possessing team to the acting team.
      * 'Duel': entropy is held back and credited on the next row to whichever
        team has possession there.
      * anything else: counted as unhandled and printed.

    Returns (copy of match_df with running-total columns, final flow dict,
    number of unhandled rows).
    """
    flow = dict.fromkeys(FLOW_KEYS, 0)
    history = {key: [] for key in FLOW_KEYS}
    running_total = []
    running_n = []
    unhandled = 0
    n_count = 0
    duel_entropy = None  # entropy of a Duel waiting for the next row's possession

    rows = zip(match_df['type'], match_df['team'],
               match_df['possession_team'], match_df['indv_prob'])
    for pos, (event_type, team, possession_team, prob) in enumerate(rows):
        if duel_entropy is not None:
            # Resolve the previous row's duel: possession now tells who won it.
            if possession_team == team1:
                flow['t2_t1'] += duel_entropy
            elif possession_team == team2:
                flow['t1_t2'] += duel_entropy
            duel_entropy = None
            n_count += 1

        in_possession = team == possession_team
        target = None
        if prob == 0.0:
            pass  # zero-weight event types do not contribute
        elif event_type == 'Dribbled Past':
            # Checked before the generic branches so the routing does not
            # depend on whether 'Dribbled Past' is listed in positive_events.
            # The second test used to repeat `== team1`, which credited both
            # directions for team1 dribbles and nothing for team2 dribbles.
            if possession_team == team1:
                target = 't2_t1'
            elif possession_team == team2:
                target = 't1_t2'
        elif in_possession and team == team1 and event_type in positive_events:
            target = 't1_t1'
        elif in_possession and team == team2 and event_type in positive_events:
            target = 't2_t2'
        elif team == team1 and event_type in negative_events:
            target = 't1_t2'
        elif team == team2 and event_type in negative_events:
            target = 't2_t1'
        elif not in_possession and team == team1 and event_type in self_positive_events:
            target = 't1_t1'
        elif not in_possession and team == team2 and event_type in self_positive_events:
            target = 't2_t2'
        elif not in_possession and team == team2 and event_type in positive_events:
            target = 't1_t2'
        elif not in_possession and team == team1 and event_type in positive_events:
            target = 't2_t1'
        elif event_type == 'Duel':
            duel_entropy = _entropy_term(prob)
        else:
            unhandled += 1
            print(match_df.iloc[pos])

        if target is not None:
            flow[target] += _entropy_term(prob)

        for key in FLOW_KEYS:
            history[key].append(flow[key])
        running_total.append(flow['t1_t1'] + flow['t2_t2'] + flow['t2_t1'] + flow['t1_t2'])
        running_n.append(n_count)

    # Columns are built once from lists instead of one .loc write per cell.
    tracked = match_df.copy()
    for key in FLOW_KEYS:
        series = pd.Series(history[key], index=tracked.index, dtype='float64')
        tracked[f'{key}_entropy'] = series.where(series != 0)  # 0 -> NaN, as before
    totals = pd.Series(running_total, index=tracked.index, dtype='float64')
    tracked['total_entropy'] = totals.where(totals != 0)
    tracked['N_count'] = running_n
    return tracked, flow, unhandled


summary_rows = []
# Forward slashes work on every platform (the old 'data\events\*' contained the
# invalid escape '\e' and was split on backslashes). Sorting makes the
# processing order deterministic.
for events_path in sorted(glob.glob('data/events/*')):
    match = Path(events_path).stem
    match_df = _attach_probabilities(_load_events(events_path))

    # team1 is the team of the first 'Starting XI' row, team2 of the second.
    # NOTE(review): they are exported as home_team/away_team; confirm that the
    # first line-up in the file is always the home side.
    lineup_teams = match_df.loc[match_df['type'] == 'Starting XI', 'team'].to_list()
    if len(lineup_teams) < 2:
        raise ValueError(
            f'{events_path}: expected two Starting XI events, found {len(lineup_teams)}')
    team1, team2 = lineup_teams[0], lineup_teams[1]

    match_df, flow, count = _accumulate_entropy(match_df, team1, team2)
    print(match, ': ', count)

    # Running per-row segments, using the same formula as the match summary
    # below (cross flows are subtracted from the losing side).
    # NOTE(review): these columns previously added both cross flows, which
    # disagreed with the exported summary; confirm the summary formula is the
    # intended one.
    match_df['team1_entropy_segment'] = (
        match_df['t1_t1_entropy'] + match_df['t2_t1_entropy'] - match_df['t1_t2_entropy']
    ) / match_df['total_entropy']
    match_df['team2_entropy_segment'] = (
        match_df['t2_t2_entropy'] + match_df['t1_t2_entropy'] - match_df['t2_t1_entropy']
    ) / match_df['total_entropy']
    match_df['SEI'] = match_df['team1_entropy_segment'] - match_df['team2_entropy_segment']

    total_entropy = flow['t1_t1'] + flow['t2_t2'] + flow['t2_t1'] + flow['t1_t2']
    if total_entropy:
        s1 = (flow['t1_t1'] + flow['t2_t1'] - flow['t1_t2']) / total_entropy
        s2 = (flow['t2_t2'] + flow['t1_t2'] - flow['t2_t1']) / total_entropy
    else:
        # No scored events at all: report NaN instead of raising ZeroDivisionError.
        s1 = s2 = np.nan

    summary_rows.append({
        'match_id': match,
        'home_team': team1,
        'away_team': team2,
        't1_t1_entropy': flow['t1_t1'],
        't2_t2_entropy': flow['t2_t2'],
        't1_t2_entropy': flow['t1_t2'],
        't2_t1_entropy': flow['t2_t1'],
        'total_entropy': total_entropy,
        'team1_entropy_segment': s1,
        'team2_entropy_segment': s2,
        'SEI': s1 - s2,
    })

# Build the summary once from the collected records rather than growing a
# DataFrame one cell at a time.
combine_final_entropy = pd.DataFrame(summary_rows, columns=SUMMARY_COLUMNS)
combine_final_entropy.to_csv('all_matches.csv', index=False)

15946 :  0
15956 :  0
15973 :  0
15978 :  0
15986 :  0
15998 :  0
16010 :  0
16023 :  0
16029 :  0
16056 :  0
16073 :  0
16079 :  0
16086 :  0
16095 :  0
16109 :  0
16120 :  0
16131 :  0
16136 :  0
16149 :  0
16157 :  0
16173 :  0
16182 :  0
16190 :  0
16196 :  0
16205 :  0
16215 :  0
16231 :  0
16240 :  0
16248 :  0
16265 :  0
16275 :  0
16289 :  0
16306 :  0
16317 :  0
18235 :  0
18236 :  0
18237 :  0
18240 :  0
18241 :  0
18242 :  0
18243 :  0
18244 :  0
18245 :  0
22536 :  0
22912 :  0
2302764 :  0
265830 :  0
265835 :  0
265837 :  0
265839 :  0
265857 :  0
265866 :  0
265894 :  0
265896 :  0
265905 :  0
265918 :  0
265944 :  0
265952 :  0
265958 :  0
265963 :  0
266015 :  0
266033 :  0
266045 :  0
266056 :  0
266066 :  0
266074 :  0
266106 :  0
266117 :  0
266142 :  0
266148 :  0
266149 :  0
266160 :  0
266166 :  0
266191 :  0
266201 :  0
266230 :  0
266234 :  0
266236 :  0
266240 :  0
266254 :  0
266256 :  0
266273 :  0
266274 :  0
266280 :  0
266299 :  0
266310 :  0
266320 :  0


In [None]:
# NOTE(review): unexplained scratch values in an unexecuted cell; nothing in the
# visible cells refers to them -- confirm what they record or remove the cell.
214, 153, 118, 113