In [35]:
import pymongo

from tqdm import tqdm

In [36]:
client = pymongo.MongoClient()
db = client.soccerdb

In [37]:
db.events.find_one()

{'_id': ObjectId('5da3961a04f673bbf271d22b'),
 'eventId': 8,
 'eventName': 'Pass',
 'eventSec': 1.255989999999997,
 'id': 88178642,
 'matchId': 1694390,
 'matchPeriod': '1H',
 'playerId': 26010,
 'positions': [{'x': 50, 'y': 48}, {'x': 47, 'y': 50}],
 'subEventId': 85,
 'subEventName': 'Simple pass',
 'tags': [{'id': 1801}],
 'teamId': 4418}

In [38]:
def count_some_event(event, matchId=None, teamId=None, playerId=None, extra_cond={}):
    cond = {'eventName': event}
    cond.update(extra_cond)
    
    if matchId is not None:
        cond.update({'matchId': matchId})
    if teamId is not None:
        cond.update({'teamId': teamId})
    if playerId is not None:
        cond.update('playerId')
        
        
    return db.events.count_documents(cond)

### Get a set of all (matchId, teamId) tuples
We need the set of all matches and the corresponding teams to work with.

In [39]:
set_matches2teams = set(map(lambda dic: (dic['matchId'], dic['teamId']), db.events.find(
    {'eventName': 'Pass', 'matchPeriod': '1H'}, 
    {'_id': 0, 'matchId': 1, 'teamId': 1})))

### Get the list of all events
Find 3 different matchIds to find the set of events inside them. This helps decreasing the probability of not seeing an event due to it's absence in a random game.

In [40]:
some_matchids = [x['matchId'] for x in db.events.find({}, {'_id': 0, 'matchId': 1}, limit=3)]
set_event_names = set([x['eventName'] for x in db.events.find({
    'matchId': {'$in': some_matchids}}, {'_id': 0, 'eventName': 1})])

In [41]:
for matchId, teamId in set_matches2teams:
    base = {
        'matchId': matchId,
        'teamId': teamId
    }
    for event in set_event_names:
        db.results.update_one(base, {
            '$set': {
                'num{}'.format(
                    event.title().replace(' ', '')
                ): count_some_event(event, matchId, teamId, extra_cond={'matchPeriod': '1H'})
            }
        }, upsert=True)

### Are they playing at their home?
If true, the team is playing at their own home. This might be correlated to the winning because of the weather condition, time zone, being exhausted or emotional state of the players.

In [8]:
for matchId, teamId in tqdm(set_matches2teams):
    isHome = db.matches.find_one({'wyId': matchId},
                                 {'_id': 0, 'teamsData.{}.side'.format(str(teamId)): 1}
                                )['teamsData'][str(teamId)]['side']
    isHome = (isHome == 'home')
    db.results.update_one({'matchId': matchId, 'teamId': teamId}, {'$set': {'isHome': isHome}})

100%|██████████| 102/102 [00:00<00:00, 507.56it/s]


### Set of All Event Tags
Each event is described using a set of tags which could be quite fruitful for the purpose of prediction. However, we have to first recognize these tags.

Unfortunately, for now, we don't know what these tags represent. There should be a mapping, which apparently, there isn't.

In [42]:
all_tags = []
for doc in db.events.find({}, {'tags': 1, '_id': 0}):
    tags = [x['id'] for x in doc['tags']]
    all_tags.extend(tags)
all_tags = set(all_tags)
print(all_tags)

{901, 1801, 1802, 401, 402, 403, 1301, 1302, 801, 1701, 1702, 1703, 301, 302, 1201, 1202, 1203, 1204, 2101, 1205, 1207, 1208, 1206, 1210, 1209, 1212, 701, 702, 1215, 1216, 1601, 703, 1214, 1220, 1217, 1213, 1219, 1222, 201, 1221, 1223, 1101, 1211, 1102, 2001, 601, 602, 101, 102, 1001, 1901, 1218, 501, 502, 503, 504, 1401}


### Percentage of Accurate Passes
Being accurate when passing can be an indication of how good a team is performing during the game. This can later on be examined in moving windows.

In [43]:
for match_id, team_id in tqdm(list(set_matches2teams)):
    num_pass = count_some_event('Pass', match_id, team_id, extra_cond={'matchPeriod': '1H'})
    num_accurate_pass = count_some_event('Pass', match_id, team_id, extra_cond={
        'matchPeriod': '1H',
        'tags.id': 1801
    })
    rate = num_accurate_pass / num_pass
    db.results.update_one({'matchId': match_id, 'teamId': team_id}, {
        '$set': {'numAccuratePass': num_accurate_pass, 'rateAccuratePass': rate}
    })


  0%|          | 0/102 [00:00<?, ?it/s][A
  1%|          | 1/102 [00:00<00:16,  5.98it/s][A
  2%|▏         | 2/102 [00:00<00:16,  5.94it/s][A
  3%|▎         | 3/102 [00:00<00:16,  5.96it/s][A
  4%|▍         | 4/102 [00:00<00:16,  5.89it/s][A
  5%|▍         | 5/102 [00:00<00:16,  5.95it/s][A
  6%|▌         | 6/102 [00:01<00:16,  5.98it/s][A
  7%|▋         | 7/102 [00:01<00:15,  5.96it/s][A
  8%|▊         | 8/102 [00:01<00:15,  5.96it/s][A
  9%|▉         | 9/102 [00:01<00:15,  5.97it/s][A
 10%|▉         | 10/102 [00:01<00:15,  5.98it/s][A
 11%|█         | 11/102 [00:01<00:15,  5.90it/s][A
 12%|█▏        | 12/102 [00:02<00:15,  5.99it/s][A
 13%|█▎        | 13/102 [00:02<00:14,  5.97it/s][A
 14%|█▎        | 14/102 [00:02<00:14,  6.00it/s][A
 15%|█▍        | 15/102 [00:02<00:14,  6.03it/s][A
 16%|█▌        | 16/102 [00:02<00:14,  6.06it/s][A
 17%|█▋        | 17/102 [00:02<00:13,  6.10it/s][A
 18%|█▊        | 18/102 [00:03<00:13,  6.09it/s][A
 19%|█▊        | 19/102 [00:0

### Add the Class Variable(s)
For now, class variable could be the number of goals or weather the team has won the game or not. Let's begin with the latter and expand it to former later on.

In [57]:
for match_id, team_id in set_matches2teams:
    print(match_id, team_id)
    results = db.matches.find_one({
        'wyId': matchId,
    }, {
        'winner': 1,
         'teamsData.{}.score'.format(str(team_id)): 1,
         'teamsData.{}.scoreET'.format(str(team_id)): 1,
         'teamsData.{}.scoreP'.format(str(team_id)): 1,
        '_id': 0
    })
    print(results)
    final_score = sum(results['teamsData'][str(team_id)].values())
    print(final_score)
    break

1694430 3148
{'winner': 8274, 'teamsData': {}}


KeyError: '3148'