# Soccer Prediction

An endeavor to predict soccer using the previous events and experiments.


In [2]:
import json

import pandas as pd
import matplotlib

from utils import apply_time_limits


## Inspect the Events Collection
First of all lets take a look at what we have in the events collection which seems to be quite informative.

In [3]:
df = pd.read_json('../data/events_World_Cup.json', orient='records')
df.head()

Unnamed: 0,eventId,eventName,eventSec,id,matchId,matchPeriod,playerId,positions,subEventId,subEventName,tags,teamId
0,8,Pass,1.656214,258612104,2057954,1H,122671,"[{'x': 50, 'y': 50}, {'x': 35, 'y': 53}]",85,Simple pass,[{'id': 1801}],16521
1,8,Pass,4.487814,258612106,2057954,1H,139393,"[{'x': 35, 'y': 53}, {'x': 75, 'y': 19}]",83,High pass,[{'id': 1801}],16521
2,1,Duel,5.937411,258612077,2057954,1H,103668,"[{'x': 25, 'y': 81}, {'x': 37, 'y': 83}]",10,Air duel,"[{'id': 703}, {'id': 1801}]",14358
3,1,Duel,6.406961,258612112,2057954,1H,122940,"[{'x': 75, 'y': 19}, {'x': 63, 'y': 17}]",10,Air duel,"[{'id': 701}, {'id': 1802}]",16521
4,8,Pass,8.562167,258612110,2057954,1H,122847,"[{'x': 63, 'y': 17}, {'x': 71, 'y': 15}]",85,Simple pass,[{'id': 1801}],16521


## Creating New Features
Gonna try adding some new features to our new data frame.

### Time
EventSec is, unfortunately, second from the beginning of each half. We need something to start counting from the beginning of the game. Which will be time.
The time feature is going to be somehow the percentage of the game. This means that, in the beginning it will be equal to zero. At the end of the first game half, it will be 0.5 and finally, 1 at the end of the second half.

In [4]:
def normalize(x, scale=0.5, shift=0):
    x['time'] = x['eventSec'] / x['eventSec'].max() * scale + shift
    return x

first_half = df[df['matchPeriod'] == '1H'].groupby('matchId').apply(normalize)
second_half = df[df['matchPeriod'] == '2H'].groupby('matchId').apply(lambda x: normalize(x, shift=0.5))

df = pd.concat([first_half, second_half])

df.tail()

Unnamed: 0,eventId,eventName,eventSec,id,matchId,matchPeriod,playerId,positions,subEventId,subEventName,tags,teamId,time
101754,8,Pass,2978.301867,263885652,2058017,2H,3476,"[{'x': 46, 'y': 20}, {'x': 64, 'y': 6}]",85,Simple pass,[{'id': 1801}],9598,0.996028
101755,7,Others on the ball,2979.084611,263885653,2058017,2H,14812,"[{'x': 64, 'y': 6}, {'x': 82, 'y': 2}]",72,Touch,[],9598,0.996159
101756,8,Pass,2983.448628,263885654,2058017,2H,14812,"[{'x': 82, 'y': 2}, {'x': 100, 'y': 100}]",80,Cross,"[{'id': 401}, {'id': 801}, {'id': 1802}]",9598,0.996886
101757,4,Goalkeeper leaving line,2985.869275,263885613,2058017,2H,25381,"[{'x': 0, 'y': 0}, {'x': 18, 'y': 98}]",40,Goalkeeper leaving line,[],4418,0.997289
101758,8,Pass,3002.148765,263885618,2058017,2H,25381,"[{'x': 14, 'y': 43}, {'x': 0, 'y': 0}]",84,Launch,[{'id': 1802}],4418,1.0


### Limit the Dataframe to the Interval

In [5]:
results = apply_time_limits(df, 0, 50)
results.head()

Unnamed: 0,eventId,eventName,eventSec,id,matchId,matchPeriod,playerId,positions,subEventId,subEventName,tags,teamId,time
0,8,Pass,1.656214,258612104,2057954,1H,122671,"[{'x': 50, 'y': 50}, {'x': 35, 'y': 53}]",85,Simple pass,[{'id': 1801}],16521,0.000294
1,8,Pass,4.487814,258612106,2057954,1H,139393,"[{'x': 35, 'y': 53}, {'x': 75, 'y': 19}]",83,High pass,[{'id': 1801}],16521,0.000796
2,1,Duel,5.937411,258612077,2057954,1H,103668,"[{'x': 25, 'y': 81}, {'x': 37, 'y': 83}]",10,Air duel,"[{'id': 703}, {'id': 1801}]",14358,0.001053
3,1,Duel,6.406961,258612112,2057954,1H,122940,"[{'x': 75, 'y': 19}, {'x': 63, 'y': 17}]",10,Air duel,"[{'id': 701}, {'id': 1802}]",16521,0.001136
4,8,Pass,8.562167,258612110,2057954,1H,122847,"[{'x': 63, 'y': 17}, {'x': 71, 'y': 15}]",85,Simple pass,[{'id': 1801}],16521,0.001519


In [6]:
event_names = sorted(set(df['eventName']))
for e in event_names:
    col_name = 'num{}'.format(e.title().replace(' ', ''))
    
    counts_df = df[df['eventName'] == e]
    counts_df = counts_df.groupby(['matchId', 'teamId'])['id'].count().reset_index(name=col_name)
    print(len(counts_df))
    results[col_name] = counts_df[col_name]


['Duel', 'Foul', 'Free Kick', 'Goalkeeper leaving line', 'Offside', 'Others on the ball', 'Pass', 'Save attempt', 'Shot']
128
128
128
91
93
128
128
124
128


In [12]:
results.head(100)

Unnamed: 0,eventId,eventName,eventSec,id,matchId,matchPeriod,playerId,positions,subEventId,subEventName,...,time,numDuel,numFoul,numFreeKick,numGoalkeeperLeavingLine,numOffside,numOthersOnTheBall,numPass,numSaveAttempt,numShot
0,8,Pass,1.656214,258612104,2057954,1H,122671,"[{'x': 50, 'y': 50}, {'x': 35, 'y': 53}]",85,Simple pass,...,0.000294,221.0,22.0,50.0,1.0,3.0,62.0,311.0,7.0,11.0
1,8,Pass,4.487814,258612106,2057954,1H,139393,"[{'x': 35, 'y': 53}, {'x': 75, 'y': 19}]",83,High pass,...,0.000796,247.0,10.0,55.0,4.0,1.0,66.0,516.0,3.0,7.0
2,1,Duel,5.937411,258612077,2057954,1H,103668,"[{'x': 25, 'y': 81}, {'x': 37, 'y': 83}]",10,Air duel,...,0.001053,219.0,6.0,38.0,1.0,1.0,95.0,579.0,4.0,11.0
3,1,Duel,6.406961,258612112,2057954,1H,122940,"[{'x': 75, 'y': 19}, {'x': 63, 'y': 17}]",10,Air duel,...,0.001136,227.0,12.0,42.0,5.0,1.0,68.0,421.0,1.0,7.0
4,8,Pass,8.562167,258612110,2057954,1H,122847,"[{'x': 63, 'y': 17}, {'x': 71, 'y': 15}]",85,Simple pass,...,0.001519,259.0,10.0,57.0,4.0,1.0,87.0,385.0,5.0,12.0
5,8,Pass,10.991292,258612113,2057954,1H,122832,"[{'x': 71, 'y': 15}, {'x': 92, 'y': 11}]",85,Simple pass,...,0.001949,231.0,10.0,51.0,3.0,2.0,73.0,420.0,3.0,12.0
6,1,Duel,14.081637,258612115,2057954,1H,122847,"[{'x': 92, 'y': 11}, {'x': 88, 'y': 11}]",11,Ground attacking duel,...,0.002497,164.0,10.0,40.0,2.0,2.0,68.0,473.0,4.0,13.0
7,8,Pass,15.039163,258612116,2057954,1H,122847,"[{'x': 88, 'y': 11}, {'x': 88, 'y': 13}]",85,Simple pass,...,0.002667,163.0,13.0,45.0,1.0,3.0,76.0,553.0,8.0,7.0
8,8,Pass,16.438878,258612117,2057954,1H,122940,"[{'x': 88, 'y': 13}, {'x': 70, 'y': 13}]",85,Simple pass,...,0.002916,183.0,19.0,38.0,1.0,1.0,75.0,327.0,1.0,3.0
9,8,Pass,17.745299,258612119,2057954,1H,122832,"[{'x': 70, 'y': 13}, {'x': 71, 'y': 28}]",85,Simple pass,...,0.003147,173.0,17.0,54.0,3.0,3.0,37.0,466.0,7.0,12.0


In [57]:
set(df['eventName'])

{'Duel',
 'Foul',
 'Free Kick',
 'Goalkeeper leaving line',
 'Offside',
 'Others on the ball',
 'Pass',
 'Save attempt',
 'Shot'}

In [71]:
print(counts_df)

        eventId  eventName     eventSec         id  matchId matchPeriod  \
13            3  Free Kick    36.815886  258612125  2057954          1H   
22            3  Free Kick    62.091400  258612087  2057954          1H   
29            3  Free Kick    93.204477  258612093  2057954          1H   
45            3  Free Kick   119.630485  258612105  2057954          1H   
51            3  Free Kick   164.159873  258612118  2057954          1H   
59            3  Free Kick   209.138650  258612171  2057954          1H   
67            3  Free Kick   230.570693  258612181  2057954          1H   
72            3  Free Kick   243.141717  258612185  2057954          1H   
78            3  Free Kick   257.054411  258612193  2057954          1H   
125           3  Free Kick   351.645264  258612268  2057954          1H   
134           3  Free Kick   370.592560  258612286  2057954          1H   
141           3  Free Kick   382.226623  258612292  2057954          1H   
157           3  Free Kic