In [1]:
import pymongo

from tqdm import tqdm

In [2]:
# Open a MongoDB client (no host/port given, so pymongo's defaults apply)
# and keep a handle to the `soccerdb` database used by every later cell.
client = pymongo.MongoClient()
db = client.soccerdb

In [4]:
# Display a single document from the `events` collection to inspect its fields.
db.events.find_one()

{'_id': ObjectId('5d9dced91c801fea8e22e72e'),
 'eventId': 8,
 'eventName': 'Pass',
 'eventSec': 1.255989999999997,
 'id': 88178642,
 'matchId': 1694390,
 'matchPeriod': '1H',
 'playerId': 26010,
 'positions': [{'x': 50, 'y': 48}, {'x': 47, 'y': 50}],
 'subEventId': 85,
 'subEventName': 'Simple pass',
 'tags': [{'id': 1801}],
 'teamId': 4418}

In [17]:
def count_some_event(event, matchId=None, teamId=None, playerId=None):
    """Count the documents in ``db.events`` that have the given event name.

    Args:
        event: Value matched against the ``eventName`` field.
        matchId: If not None, additionally require this ``matchId``.
        teamId: If not None, additionally require this ``teamId``.
        playerId: If not None, additionally require this ``playerId``.

    Returns:
        int: Number of event documents matching every supplied condition.
    """
    cond = {'eventName': event}
    optional_filters = {
        'matchId': matchId,
        'teamId': teamId,
        'playerId': playerId,
    }
    # Only constrain on the identifiers the caller actually supplied.
    cond.update({field: value
                 for field, value in optional_filters.items()
                 if value is not None})

    # count_documents replaces the deprecated Cursor.count().
    return db.events.count_documents(cond)

### Get a set of all (matchId, teamId) tuples
We need the set of all matches and the corresponding teams to work with.

In [3]:
# Distinct (matchId, teamId) pairs, taken from first-half 'Pass' events;
# only the two id fields are projected from each event document.
set_matches2teams = {
    (event_doc['matchId'], event_doc['teamId'])
    for event_doc in db.events.find(
        {'eventName': 'Pass', 'matchPeriod': '1H'},
        {'_id': 0, 'matchId': 1, 'teamId': 1})
}

### Get the list of all events
Find 3 different matchIds and collect the set of event names that occur in them. Using several matches decreases the probability of missing an event type because of its absence in a single random game.

In [52]:
# Take (up to) three *distinct* match ids. Limiting a plain find() to three
# documents would return the first three events, which can all belong to the
# same match.
some_matchids = db.events.distinct('matchId')[:3]

# Collect every event name seen in those matches, rather than in a single
# hard-coded match, so an event type missing from one game is still found.
set_event_names = set(db.events.distinct(
    'eventName', {'matchId': {'$in': some_matchids}}))

In [57]:
# For every (match, team) pair, count each event type and store the counts in
# `db.results`, one document per pair.
for matchId, teamId in tqdm(set_matches2teams):
    # The upsert filter must hold only the identifying fields. If it also held
    # the counts, a re-run with different counts would not match the existing
    # document and would insert a duplicate.
    base = {
        'matchId': matchId,
        'teamId': teamId
    }
    counts = dict(base)  # separate dict; `base` must not pick up the counts
    for event in set_event_names:
        # e.g. 'Save attempt' -> 'numSaveAttempt'
        field = 'num{}'.format(event.title().replace(' ', ''))
        counts[field] = count_some_event(event, matchId, teamId)
    # update_one + $set replaces the deprecated Collection.update and leaves
    # other fields already stored on the document untouched.
    db.results.update_one(base, {'$set': counts}, upsert=True)

  # Remove the CWD from sys.path while we load stuff.
  # Remove the CWD from sys.path while we load stuff.
100%|██████████| 102/102 [00:56<00:00,  1.81it/s]


### Are they playing at their home?
If true, the team is playing at its home ground. This might be correlated with winning because of weather conditions, the time zone, fatigue, or the emotional state of the players.

In [8]:
# Flag each (match, team) result document with whether the team's side in that
# match is 'home'.
for matchId, teamId in tqdm(set_matches2teams):
    team_key = str(teamId)
    side_field = 'teamsData.{}.side'.format(team_key)
    # Project only this team's side out of the match document.
    match_doc = db.matches.find_one({'wyId': matchId}, {'_id': 0, side_field: 1})
    isHome = match_doc['teamsData'][team_key]['side'] == 'home'
    db.results.update_one(
        {'matchId': matchId, 'teamId': teamId},
        {'$set': {'isHome': isHome}})

100%|██████████| 102/102 [00:00<00:00, 507.56it/s]


### Set of All Event Tags
Each event is described using a set of tags which could be quite fruitful for the purpose of prediction. However, we have to first recognize these tags.

Unfortunately, for now, we don't know what these tags represent. There should be a mapping from tag ids to their meanings, but apparently none is available.

In [8]:
# Gather every distinct tag id that appears in any event's `tags` list.
all_tags = {
    tag['id']
    for event_doc in db.events.find({}, {'tags': 1, '_id': 0})
    for tag in event_doc['tags']
}
print(all_tags)

{901, 1801, 1802, 401, 402, 403, 1301, 1302, 801, 1701, 1702, 1703, 301, 302, 1201, 1202, 1203, 1204, 2101, 1205, 1207, 1208, 1206, 1210, 1209, 1212, 701, 702, 1215, 1216, 1601, 703, 1214, 1220, 1217, 1213, 1219, 1222, 201, 1221, 1223, 1101, 1211, 1102, 2001, 601, 602, 101, 102, 1001, 1901, 1218, 501, 502, 503, 504, 1401}


### Percentage of Accurate Passes
Passing accuracy can be an indication of how well a team is performing during the game. This can later be examined over moving windows.