# Soccer Prediction

An endeavor to predict soccer using the previous events and experiments.


In [2]:
import json

import pandas as pd
import matplotlib

from utils import apply_time_limits


## Inspect the Events Collection
First of all lets take a look at what we have in the events collection which seems to be quite informative.

In [4]:
df = pd.read_json('../data/events_World_Cup.json', orient='records')
df.head()

Unnamed: 0,eventId,eventName,eventSec,id,matchId,matchPeriod,playerId,positions,subEventId,subEventName,tags,teamId
0,8,Pass,1.656214,258612104,2057954,1H,122671,"[{'y': 50, 'x': 50}, {'y': 53, 'x': 35}]",85,Simple pass,[{'id': 1801}],16521
1,8,Pass,4.487814,258612106,2057954,1H,139393,"[{'y': 53, 'x': 35}, {'y': 19, 'x': 75}]",83,High pass,[{'id': 1801}],16521
2,1,Duel,5.937411,258612077,2057954,1H,103668,"[{'y': 81, 'x': 25}, {'y': 83, 'x': 37}]",10,Air duel,"[{'id': 703}, {'id': 1801}]",14358
3,1,Duel,6.406961,258612112,2057954,1H,122940,"[{'y': 19, 'x': 75}, {'y': 17, 'x': 63}]",10,Air duel,"[{'id': 701}, {'id': 1802}]",16521
4,8,Pass,8.562167,258612110,2057954,1H,122847,"[{'y': 17, 'x': 63}, {'y': 15, 'x': 71}]",85,Simple pass,[{'id': 1801}],16521


## Creating New Features
Gonna try adding some new features to our new data frame.

### Time
EventSec is, unfortunately, second from the beginning of each half. We need something to start counting from the beginning of the game. Which will be time.
The time feature is going to be somehow the percentage of the game. This means that, in the beginning it will be equal to zero. At the end of the first game half, it will be 0.5 and finally, 1 at the end of the second half.

In [11]:
def normalize(x, scale=0.5, shift=0):
    x['time'] = x['eventSec'] / x['eventSec'].max() * scale + shift
    return x

first_half = df[df['matchPeriod'] == '1H'].groupby('matchId').apply(normalize)
second_half = df[df['matchPeriod'] == '2H'].groupby('matchId').apply(lambda x: normalize(x, shift=0.5))

df = pd.concat([first_half, second_half])

df.tail()

Unnamed: 0,eventId,eventName,eventSec,id,matchId,matchPeriod,playerId,positions,subEventId,subEventName,tags,teamId,time
101754,8,Pass,2978.301867,263885652,2058017,2H,3476,"[{'y': 20, 'x': 46}, {'y': 6, 'x': 64}]",85,Simple pass,[{'id': 1801}],9598,0.996028
101755,7,Others on the ball,2979.084611,263885653,2058017,2H,14812,"[{'y': 6, 'x': 64}, {'y': 2, 'x': 82}]",72,Touch,[],9598,0.996159
101756,8,Pass,2983.448628,263885654,2058017,2H,14812,"[{'y': 2, 'x': 82}, {'y': 100, 'x': 100}]",80,Cross,"[{'id': 401}, {'id': 801}, {'id': 1802}]",9598,0.996886
101757,4,Goalkeeper leaving line,2985.869275,263885613,2058017,2H,25381,"[{'y': 0, 'x': 0}, {'y': 98, 'x': 18}]",40,Goalkeeper leaving line,[],4418,0.997289
101758,8,Pass,3002.148765,263885618,2058017,2H,25381,"[{'y': 43, 'x': 14}, {'y': 0, 'x': 0}]",84,Launch,[{'id': 1802}],4418,1.0


### Limit the Dataframe to the Interval

In [12]:
results = apply_time_limits(df, 0, 50)
results.head()

Unnamed: 0,eventId,eventName,eventSec,id,matchId,matchPeriod,playerId,positions,subEventId,subEventName,tags,teamId,time
0,8,Pass,1.656214,258612104,2057954,1H,122671,"[{'y': 50, 'x': 50}, {'y': 53, 'x': 35}]",85,Simple pass,[{'id': 1801}],16521,0.000294
1,8,Pass,4.487814,258612106,2057954,1H,139393,"[{'y': 53, 'x': 35}, {'y': 19, 'x': 75}]",83,High pass,[{'id': 1801}],16521,0.000796
2,1,Duel,5.937411,258612077,2057954,1H,103668,"[{'y': 81, 'x': 25}, {'y': 83, 'x': 37}]",10,Air duel,"[{'id': 703}, {'id': 1801}]",14358,0.001053
3,1,Duel,6.406961,258612112,2057954,1H,122940,"[{'y': 19, 'x': 75}, {'y': 17, 'x': 63}]",10,Air duel,"[{'id': 701}, {'id': 1802}]",16521,0.001136
4,8,Pass,8.562167,258612110,2057954,1H,122847,"[{'y': 17, 'x': 63}, {'y': 15, 'x': 71}]",85,Simple pass,[{'id': 1801}],16521,0.001519


In [13]:
event_names = sorted(set(df['eventName']))
for e in event_names:
    col_name = 'num{}'.format(e.title().replace(' ', ''))
    
    counts_df = df[df['eventName'] == e]
    counts_df = counts_df.groupby(['matchId', 'teamId'])['id'].count().reset_index(name=col_name)
    print(len(counts_df))
    results[col_name] = counts_df[col_name]


128
128
128
91
93
128
128
124
128


In [14]:
results.head(100)

Unnamed: 0,eventId,eventName,eventSec,id,matchId,matchPeriod,playerId,positions,subEventId,subEventName,...,time,numDuel,numFoul,numFreeKick,numGoalkeeperLeavingLine,numOffside,numOthersOnTheBall,numPass,numSaveAttempt,numShot
0,8,Pass,1.656214,258612104,2057954,1H,122671,"[{'y': 50, 'x': 50}, {'y': 53, 'x': 35}]",85,Simple pass,...,0.000294,221.0,22.0,50.0,1.0,3.0,62.0,311.0,7.0,11.0
1,8,Pass,4.487814,258612106,2057954,1H,139393,"[{'y': 53, 'x': 35}, {'y': 19, 'x': 75}]",83,High pass,...,0.000796,247.0,10.0,55.0,4.0,1.0,66.0,516.0,3.0,7.0
2,1,Duel,5.937411,258612077,2057954,1H,103668,"[{'y': 81, 'x': 25}, {'y': 83, 'x': 37}]",10,Air duel,...,0.001053,219.0,6.0,38.0,1.0,1.0,95.0,579.0,4.0,11.0
3,1,Duel,6.406961,258612112,2057954,1H,122940,"[{'y': 19, 'x': 75}, {'y': 17, 'x': 63}]",10,Air duel,...,0.001136,227.0,12.0,42.0,5.0,1.0,68.0,421.0,1.0,7.0
4,8,Pass,8.562167,258612110,2057954,1H,122847,"[{'y': 17, 'x': 63}, {'y': 15, 'x': 71}]",85,Simple pass,...,0.001519,259.0,10.0,57.0,4.0,1.0,87.0,385.0,5.0,12.0
5,8,Pass,10.991292,258612113,2057954,1H,122832,"[{'y': 15, 'x': 71}, {'y': 11, 'x': 92}]",85,Simple pass,...,0.001949,231.0,10.0,51.0,3.0,2.0,73.0,420.0,3.0,12.0
6,1,Duel,14.081637,258612115,2057954,1H,122847,"[{'y': 11, 'x': 92}, {'y': 11, 'x': 88}]",11,Ground attacking duel,...,0.002497,164.0,10.0,40.0,2.0,2.0,68.0,473.0,4.0,13.0
7,8,Pass,15.039163,258612116,2057954,1H,122847,"[{'y': 11, 'x': 88}, {'y': 13, 'x': 88}]",85,Simple pass,...,0.002667,163.0,13.0,45.0,1.0,3.0,76.0,553.0,8.0,7.0
8,8,Pass,16.438878,258612117,2057954,1H,122940,"[{'y': 13, 'x': 88}, {'y': 13, 'x': 70}]",85,Simple pass,...,0.002916,183.0,19.0,38.0,1.0,1.0,75.0,327.0,1.0,3.0
9,8,Pass,17.745299,258612119,2057954,1H,122832,"[{'y': 13, 'x': 70}, {'y': 28, 'x': 71}]",85,Simple pass,...,0.003147,173.0,17.0,54.0,3.0,3.0,37.0,466.0,7.0,12.0


In [15]:
set(df['eventName'])

{'Duel',
 'Foul',
 'Free Kick',
 'Goalkeeper leaving line',
 'Offside',
 'Others on the ball',
 'Pass',
 'Save attempt',
 'Shot'}

In [16]:
print(counts_df)

     matchId  teamId  numShot
0    2057954   14358       11
1    2057954   16521        7
2    2057955   15670       11
3    2057955   16129        7
4    2057956   14358       12
5    2057956   16129       12
6    2057957   15670       13
7    2057957   16521        7
8    2057958   14358        3
9    2057958   15670       12
10   2057959   16129        7
11   2057959   16521       18
12   2057960    1598       11
13   2057960    9905        5
14   2057961   10840        5
15   2057961   16216       13
16   2057962    9905        8
17   2057962   16216       15
18   2057963    1598       12
19   2057963   10840        5
20   2057964    9905       10
21   2057964   10840        5
22   2057965    1598       15
23   2057965   16216        6
24   2057966    4418        8
25   2057966    8493        2
26   2057967    7712       10
27   2057967   15594       14
28   2057968    4418       12
29   2057968   15594        9
..       ...     ...      ...
98   2058003    4418        5
99   20580