# Soccer Prediction

An endeavor to predict soccer scores


In [1]:
import pandas as pd
import json
import matplotlib


## Playing with Events

In [2]:
df = pd.read_json('data/events_World_Cup.json', orient='records')
df.head()

Unnamed: 0,eventId,subEventName,tags,playerId,positions,matchId,eventName,teamId,matchPeriod,eventSec,subEventId,id
0,8,Simple pass,[{'id': 1801}],122671,"[{'y': 50, 'x': 50}, {'y': 53, 'x': 35}]",2057954,Pass,16521,1H,1.656214,85,258612104
1,8,High pass,[{'id': 1801}],139393,"[{'y': 53, 'x': 35}, {'y': 19, 'x': 75}]",2057954,Pass,16521,1H,4.487814,83,258612106
2,1,Air duel,"[{'id': 703}, {'id': 1801}]",103668,"[{'y': 81, 'x': 25}, {'y': 83, 'x': 37}]",2057954,Duel,14358,1H,5.937411,10,258612077
3,1,Air duel,"[{'id': 701}, {'id': 1802}]",122940,"[{'y': 19, 'x': 75}, {'y': 17, 'x': 63}]",2057954,Duel,16521,1H,6.406961,10,258612112
4,8,Simple pass,[{'id': 1801}],122847,"[{'y': 17, 'x': 63}, {'y': 15, 'x': 71}]",2057954,Pass,16521,1H,8.562167,85,258612110


In [3]:
df.describe()

Unnamed: 0,eventId,playerId,matchId,teamId,eventSec,id
count,101759.0,101759.0,101759.0,101759.0,101759.0,101759.0
mean,5.749772,86281.211922,2057986.0,10375.224589,1388.637801,261140100.0
std,3.115089,100669.835706,18.66148,5170.481351,850.980508,4062610.0
min,1.0,12.0,2057954.0,1598.0,0.122553,258612100.0
25%,1.0,10131.0,2057970.0,6380.0,643.469486,259394700.0
50%,8.0,51217.0,2057986.0,9905.0,1356.739817,260305200.0
75%,8.0,116166.0,2058003.0,15594.0,2115.778717,261094900.0
max,10.0,552555.0,2058017.0,19314.0,3258.366837,280217500.0


In [4]:
result = df.groupby('eventName')['id'].count().reset_index(name='count')
result['count'] = result['count'] / result['count'].max() * 100
result['count'] = result['count'] / result['count'].sum()

In [5]:
result.plot.bar(x='eventName', y='count')

<matplotlib.axes._subplots.AxesSubplot at 0x7f306468a4d0>

In [6]:
df.groupby('matchId').apply(lambda x: x.sort_values(by='eventSec'))

Unnamed: 0_level_0,Unnamed: 1_level_0,eventId,subEventName,tags,playerId,positions,matchId,eventName,teamId,matchPeriod,eventSec,subEventId,id
matchId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
2057954,0,8,Simple pass,[{'id': 1801}],122671,"[{'y': 50, 'x': 50}, {'y': 53, 'x': 35}]",2057954,Pass,16521,1H,1.656214,85,258612104
2057954,788,8,Simple pass,[{'id': 1801}],101707,"[{'y': 50, 'x': 52}, {'y': 37, 'x': 34}]",2057954,Pass,14358,2H,3.346688,85,258613014
2057954,1,8,High pass,[{'id': 1801}],139393,"[{'y': 53, 'x': 35}, {'y': 19, 'x': 75}]",2057954,Pass,16521,1H,4.487814,83,258612106
2057954,2,1,Air duel,"[{'id': 703}, {'id': 1801}]",103668,"[{'y': 81, 'x': 25}, {'y': 83, 'x': 37}]",2057954,Duel,14358,1H,5.937411,10,258612077
2057954,3,1,Air duel,"[{'id': 701}, {'id': 1802}]",122940,"[{'y': 19, 'x': 75}, {'y': 17, 'x': 63}]",2057954,Duel,16521,1H,6.406961,10,258612112
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2058017,101754,8,Simple pass,[{'id': 1801}],3476,"[{'y': 20, 'x': 46}, {'y': 6, 'x': 64}]",2058017,Pass,9598,2H,2978.301867,85,263885652
2058017,101755,7,Touch,[],14812,"[{'y': 6, 'x': 64}, {'y': 2, 'x': 82}]",2058017,Others on the ball,9598,2H,2979.084611,72,263885653
2058017,101756,8,Cross,"[{'id': 401}, {'id': 801}, {'id': 1802}]",14812,"[{'y': 2, 'x': 82}, {'y': 100, 'x': 100}]",2058017,Pass,9598,2H,2983.448628,80,263885654
2058017,101757,4,Goalkeeper leaving line,[],25381,"[{'y': 0, 'x': 0}, {'y': 98, 'x': 18}]",2058017,Goalkeeper leaving line,4418,2H,2985.869275,40,263885613


## Creating New Features
Gonna try adding some new features to our new data frame.

In [7]:
results = df.groupby(['matchId', 'teamId'])['id'].count().reset_index(name='numEvents')
results.head()

Unnamed: 0,matchId,teamId,numEvents
0,2057954,14358,680
1,2057954,16521,909
2,2057955,15670,953
3,2057955,16129,786
4,2057956,14358,812


In [8]:
event_names = sorted(set(df['eventName']))
for e in event_names:
    col_name = 'num{}'.format(e.title().replace(' ', ''))
    counts_df = df[df['eventName'] == e]
    counts_df = counts_df.groupby(['matchId', 'teamId'])['id'].count().reset_index(name=col_name)
    results[col_name] = counts_df[col_name]


In [9]:
results.head()

Unnamed: 0,matchId,teamId,numEvents,numDuel,numFoul,numFreeKick,numGoalkeeperLeavingLine,numOffside,numOthersOnTheBall,numPass,numSaveAttempt,numShot
0,2057954,14358,680,221,22,50,1.0,3.0,62,311,7.0,11
1,2057954,16521,909,247,10,55,4.0,1.0,66,516,3.0,7
2,2057955,15670,953,219,6,38,1.0,1.0,95,579,4.0,11
3,2057955,16129,786,227,12,42,5.0,1.0,68,421,1.0,7
4,2057956,14358,812,259,10,57,4.0,1.0,87,385,5.0,12


In [10]:
set(df['eventName'])

{'Duel',
 'Foul',
 'Free Kick',
 'Goalkeeper leaving line',
 'Offside',
 'Others on the ball',
 'Pass',
 'Save attempt',
 'Shot'}

In [11]:
print(counts_df)

     matchId  teamId  numShot
0    2057954   14358       11
1    2057954   16521        7
2    2057955   15670       11
3    2057955   16129        7
4    2057956   14358       12
..       ...     ...      ...
123  2058015    9598       22
124  2058016    2413       14
125  2058016    5629       11
126  2058017    4418        7
127  2058017    9598       14

[128 rows x 3 columns]
