## Setting up

In [5]:
# Importing packages and data, data is from here: https://figshare.com/collections/Soccer_match_event_dataset/4415000/2?fbclid=IwAR14COLJt-dka8XMYrYZG6oo_UkWIUX4cqpk6wI573ODDy6P2N-22TrNv28

import os 
import pandas as pd
import json
import warnings
warnings.filterwarnings("ignore")

os.chdir("/FootballAnalysis_DataScience/")

with open('events_World_Cup.json','r') as f:
    s = f.read()
    data = json.loads(s)
    
    # converting from json to pandas dataframe
df = pd.DataFrame.from_dict(json_normalize(data), orient='columns')
df.head()

Unnamed: 0,eventId,eventName,eventSec,id,matchId,matchPeriod,playerId,positions,subEventId,subEventName,tags,teamId
0,8,Pass,1.656214,258612104,2057954,1H,122671,"[{'y': 50, 'x': 50}, {'y': 53, 'x': 35}]",85,Simple pass,[{'id': 1801}],16521
1,8,Pass,4.487814,258612106,2057954,1H,139393,"[{'y': 53, 'x': 35}, {'y': 19, 'x': 75}]",83,High pass,[{'id': 1801}],16521
2,1,Duel,5.937411,258612077,2057954,1H,103668,"[{'y': 81, 'x': 25}, {'y': 83, 'x': 37}]",10,Air duel,"[{'id': 703}, {'id': 1801}]",14358
3,1,Duel,6.406961,258612112,2057954,1H,122940,"[{'y': 19, 'x': 75}, {'y': 17, 'x': 63}]",10,Air duel,"[{'id': 701}, {'id': 1802}]",16521
4,8,Pass,8.562167,258612110,2057954,1H,122847,"[{'y': 17, 'x': 63}, {'y': 15, 'x': 71}]",85,Simple pass,[{'id': 1801}],16521


#### Preprocessing data

In [6]:
df_dk = df.copy()
df_dk = df_dk[df_dk.teamId == 7712]  # Only data from DK players 

# Splitting the start-/ and end coordinate columns 
df_dk['start_coordinate'] = [i[0] for i in df_dk['positions']] 
df_dk['start_coordinate'] = [list(i.values()) for i in df_dk['start_coordinate']] 
df_dk['end_coordinate'] = [i[1] for i in df_dk['positions']] 
df_dk['end_coordinate'] = [list(i.values()) for i in df_dk['end_coordinate']] 

# A column for each coordinate and scaling the coordinates so they fit the football field  
df_dk['x1'] = [i[0]/100*120 for i in df_dk["start_coordinate"]]
df_dk['y1'] = [i[1]/100*80 for i in df_dk["start_coordinate"]]
df_dk['x2'] = [i[0]/100*120 for i in df_dk["end_coordinate"]]
df_dk['y2'] = [i[1]/100*80 for i in df_dk["end_coordinate"]]

# Load data with tags (edited in excel)
tags = pd.read_csv("tags.csv", sep=";")
df_dk['accurate'] =  tags['accurate'].values
df_dk['not_accurate'] =  tags['not_accurate'].values
df_dk['goal'] =  tags['goal'].values
df_dk['missed_ball'] =  tags['missed ball'].values
df_dk['opportunity'] =  tags['opportunity'].values

df_dk.matchId.unique()

array([2057967, 2057969, 2057970, 2058005])

#### Subsetting data (choosing what area to focus on)

In [7]:
df_dk = df_dk[df_dk.eventName == "Pass"]  # Keep only data on passes

#------- CHOSING ONLY OFFENSIVE PLAY ---------_#
df_dk = df_dk[df_dk.x1 > 60] # Look only at offensive play
df_dk = df_dk[df_dk.x2 > 60] # Look only at offensive play

#------- CHOSING ONLY DEFFENSIVE PLAY ---------_#
#df_dk = df_dk[df_dk.x1 < 60] # Look only at offensive play
#df_dk = df_dk[df_dk.x2 < 60] # Look only at offensive play

# Create a subset for each match (DK's)
dk_fr = df_dk[df_dk.matchId == 2057970]  # Denmark - France (0-0)
dk_cr = df_dk[df_dk.matchId == 2058005]  # Denmark - Croatia (1-1)
dk_au = df_dk[df_dk.matchId == 2057969]  # Denmark - Australia (1-1)
dk_pe = df_dk[df_dk.matchId == 2057967]  # Peru - Denmark (0-1)


# Create a column with match name
dk_fr['match'] = "DK_FR" 
dk_cr['match'] = "DK_CR" 
dk_au['match'] = "DK_AU" 
dk_pe['match'] = "DK_PE" 


# Creating a total dataset with all matches
df_dk = pd.concat([dk_fr,dk_cr, dk_au, dk_pe])
df_dk.to_csv('df_dk.csv') # Write csv
df_dk.head()

Unnamed: 0,eventId,eventName,eventSec,id,matchId,matchPeriod,playerId,positions,subEventId,subEventName,...,x1,y1,x2,y2,accurate,not_accurate,goal,missed_ball,opportunity,match
25074,8,Pass,146.487662,260506095,2057970,1H,55968,"[{'y': 88, 'x': 78}, {'y': 93, 'x': 70}]",85,Simple pass,...,105.6,62.4,111.6,56.0,1,0,0,0,0,DK_FR
25075,8,Pass,148.048895,260506097,2057970,1H,56394,"[{'y': 93, 'x': 70}, {'y': 67, 'x': 77}]",85,Simple pass,...,111.6,56.0,80.4,61.6,1,0,0,0,0,DK_FR
25076,8,Pass,149.871205,260506099,2057970,1H,54,"[{'y': 67, 'x': 77}, {'y': 71, 'x': 75}]",85,Simple pass,...,80.4,61.6,85.2,60.0,1,0,0,0,0,DK_FR
25077,8,Pass,151.747571,260506100,2057970,1H,55990,"[{'y': 71, 'x': 75}, {'y': 65, 'x': 72}]",85,Simple pass,...,85.2,60.0,78.0,57.6,1,0,0,0,0,DK_FR
25078,8,Pass,152.334386,260506103,2057970,1H,54,"[{'y': 65, 'x': 72}, {'y': 54, 'x': 73}]",85,Simple pass,...,78.0,57.6,64.8,58.4,0,1,0,0,0,DK_FR
