# Предсказание рейтингов игроков спортивного "Что? Где? Когда?"

**Background**: в спортивном “Что? Где? Когда?” соревнующиеся команды отвечают на одни и те же вопросы. После минуты обсуждения команды записывают и сдают свои ответы на карточках; побеждает тот, кто ответил на большее число вопросов. Турнир обычно состоит из нескольких десятков вопросов (обычно 36 или 45, иногда 60, больше редко). Часто бывают синхронные турниры, когда на одни и те же вопросы отвечают команды на сотнях игровых площадок по всему миру, т.е. в одном турнире могут играть сотни, а то и тысячи команд. Соответственно, нам нужно: <br> <br>
- построить рейтинг-лист, который способен нетривиально предсказывать результаты будущих турниров;<br> <br>
- при этом, поскольку ЧГК — это хобби, и контрактов тут никаких нет, игроки постоянно переходят из команды в команду, сильный игрок может на один турнир сесть поиграть за другую команду и т.д.; поэтому единицей рейтинг-листа должна быть не команда, а отдельный игрок;<br> <br>
- а что сильно упрощает задачу и переводит её в область домашних заданий на EM-алгоритм — это характер данных: начиная с какого-то момента, в базу результатов начали вносить все повопросные результаты команд, т.е. в данных будут записи вида “какая команда на какой вопрос правильно ответила”.

In [1]:
# imports
import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix
import pickle
from collections import Counter

from sklearn.linear_model import LogisticRegression, LinearRegression

import pdb

### 1. Processing data 

In [2]:
# Load the raw tournament metadata and the per-tournament team results.
# NOTE: pickle.load can execute arbitrary code; only load files from a
# trusted source.
with open('tournaments.pkl', 'rb') as tournaments_file:
    tournaments = pickle.load(tournaments_file)
with open('results.pkl', 'rb') as results_file:
    results = pickle.load(results_file)

In [3]:
def get_tournaments_id(year):
    """Return ids of tournaments of the given year that have usable results.

    A tournament is kept when its ``dateStart`` begins with ``year`` (a
    four-character string), its id is present in ``results`` with a
    non-empty entry, and the first team record of that entry has a
    ``'mask'`` key.
    """
    selected_ids = []
    for tournament in tournaments.values():
        if tournament['dateStart'][:4] != year:
            continue
        tournament_id = tournament['id']
        if tournament_id not in results:
            continue
        team_records = results[tournament_id]
        if team_records == []:
            continue
        # Only the first team record is inspected for the 'mask' key.
        if 'mask' in team_records[0]:
            selected_ids.append(tournament_id)
    return selected_ids

def clean_results(used_ids):
    """Return a new dict with the ``results`` entries for ``used_ids`` only.

    Raises ``KeyError`` if an id is absent from ``results``.
    """
    return {tournament_id: results[tournament_id] for tournament_id in used_ids}

In [4]:
# Tournaments starting in 2019 form the train set, those starting in 2020
# the test set.
train_tournaments_id = get_tournaments_id('2019')
test_tournaments_id = get_tournaments_id('2020')
# Concatenation of both id lists (train ids first).
total_id = train_tournaments_id + test_tournaments_id

In [5]:
# Rebind `results` to only the entries of the selected train/test tournaments.
results = clean_results(total_id)
# Drop the tournament metadata (presumably to free memory; it is not
# referenced again in the code that follows).
del tournaments

In [6]:
def get_active_players_kernel(teams_list: list, players_counter, id_to_name):
    """Accumulate player appearances and names for one tournament.

    Args:
        teams_list: team records of a single tournament. Each record is a
            dict with a ``'teamMembers'`` list and, optionally, a ``'mask'``
            entry; teams whose mask is missing or ``None`` are skipped.
        players_counter: ``Counter`` keyed by player id (as ``str``); it is
            updated in place with one count per team the player appears in.
        id_to_name: dict mapping player id (as ``str``) to
            ``"<name> <surname>"``; updated in place.

    Returns:
        The same ``(players_counter, id_to_name)`` objects that were passed in.
    """
    for team in teams_list:
        # A team without per-question results carries no usable information.
        if team.get('mask') is None:
            continue

        member_ids = []
        for member in team['teamMembers']:
            player = member['player']
            player_id = str(player['id'])
            member_ids.append(player_id)
            id_to_name[player_id] = player['name'] + ' ' + player['surname']

        players_counter.update(member_ids)
    return players_counter, id_to_name

def get_active_players(tournaments_ids):
    """Count player appearances across the given tournaments.

    Only the tournaments listed in ``tournaments_ids`` are scanned, so that
    passing the train ids does not let other tournaments influence which
    players are considered active.

    Args:
        tournaments_ids: iterable of tournament ids present in ``results``.

    Returns:
        ``(players_counter, id_to_name)``: a ``Counter`` of appearances keyed
        by player id (as ``str``) and a dict mapping those ids to full names.

    Raises:
        KeyError: if an id is absent from ``results``.
    """
    players_counter = Counter()
    id_to_name = {}
    for tourn_id in tournaments_ids:
        players_counter, id_to_name = get_active_players_kernel(
            results[tourn_id], players_counter, id_to_name
        )
    return players_counter, id_to_name

# Number of most frequently appearing players to keep; make_table also uses
# it for the row width (n + 1).
n = 1000
active_players, id_to_name = get_active_players(train_tournaments_id)
# most_common(n) yields (player_id, count) pairs; as a dict they are keyed by
# player id (a string) in descending order of count.
top_n_active_players = dict(active_players.most_common(n))

def player_id_to_index():
    """Build two-way mappings between player ids and consecutive indices.

    Players are numbered from 0 in the iteration order of
    ``top_n_active_players``; ids are converted to ``int``.

    Returns:
        ``(id_to_index, index_to_id)`` dictionaries.
    """
    forward = {}
    backward = {}
    for position, raw_id in enumerate(top_n_active_players):
        numeric_id = int(raw_id)
        forward[numeric_id] = position
        backward[position] = numeric_id
    return forward, backward

# Lookup tables between integer player ids and their column index.
id_to_index, index_to_id = player_id_to_index()

In [7]:
def parse_tournament(teams_list: list, cur_quest_cnt: int):
    """Parse the per-question results of one tournament.

    Question ids are global: the k-th question of this tournament gets the id
    ``cur_quest_cnt + k``.

    Args:
        teams_list: team records of the tournament. Teams whose ``'mask'`` is
            missing or ``None`` are skipped.
        cur_quest_cnt: number of question ids already used by previously
            parsed tournaments (the offset for this tournament's ids).

    Returns:
        A tuple ``(quest_player_dict, numb_questions, complexity_dict)``:

        * ``quest_player_dict`` maps a question id to the list of player ids
          (as ``str``, restricted to ``top_n_active_players``) from every team
          that answered it correctly; a player appears once per such team.
        * ``numb_questions`` is the length of the longest mask seen, so the
          caller can advance the offset without id collisions even when
          teams report masks of different lengths.
        * ``complexity_dict`` maps a question id to the number of teams that
          answered it divided by ``len(teams_list)`` (teams without a mask
          are still counted in the denominator). Questions nobody answered
          are absent.
    """
    quest_player_dict = {}
    answers_count = Counter()
    numb_questions = 0

    for team in teams_list:
        mask = team.get('mask')
        if mask is None:
            continue

        # Track the longest mask rather than the last one seen.
        numb_questions = max(numb_questions, len(mask))

        member_ids = []
        for member in team['teamMembers']:
            player_id = str(member['player']['id'])
            if player_id in top_n_active_players:
                member_ids.append(player_id)

        for offset, flag in enumerate(mask):
            if flag != '1':
                continue
            quest_id = cur_quest_cnt + offset
            answers_count[quest_id] += 1
            # Each question owns its list; nothing is shared between keys.
            quest_player_dict.setdefault(quest_id, []).extend(member_ids)

    team_count = len(teams_list)
    complexity_dict = {
        quest_id: answered / team_count
        for quest_id, answered in answers_count.items()
    }

    return quest_player_dict, numb_questions, complexity_dict

In [8]:
def make_table(tournaments_ids, max_len=10000):
    """Build the row table for the given tournaments.

    Every row corresponds to one (correctly answered question, player) pair:
    it has ``n + 1`` entries, with a 1 in the column of the player, zeros in
    the other player columns, and the question's answer share (as computed
    by ``parse_tournament``) in the last column.

    Args:
        tournaments_ids: ids of the tournaments to include; only these are
            read from ``results``.
        max_len: maximum number of rows to produce; ``None`` disables the
            limit. Defaults to 10000.

    Returns:
        A list of rows (lists of length ``n + 1``).

    Raises:
        KeyError: if an id is absent from ``results``.
    """
    quest_count = 0
    quest_players = {}
    question_complexity = {}
    for tourn_id in tournaments_ids:
        quest_player_dict, numb_questions, cur_complexity = parse_tournament(
            results[tourn_id], quest_count
        )
        question_complexity.update(cur_complexity)
        quest_players.update(quest_player_dict)
        # Shift the offset so the next tournament gets fresh question ids.
        quest_count += numb_questions

    result = []
    for quest_id, players_list in quest_players.items():
        # 2.0 marks a question with no recorded answer share.
        complexity = question_complexity.get(quest_id, 2.0)
        for player_id in players_list:
            if max_len is not None and len(result) >= max_len:
                return result
            new_row = [0] * (n + 1)
            new_row[id_to_index[int(player_id)]] = 1
            new_row[-1] = complexity
            result.append(new_row)
    return result

In [9]:
# Convert the row tables (lists of n + 1 numbers per row) into NumPy arrays.
train_data = np.array(make_table(train_tournaments_id))
test_data = np.array(make_table(test_tournaments_id))

### 2. Baseline Model for Players Ratings

In [10]:
# Target vector of length n + 1: all ones except the first entry (0.99).
# NOTE(review): the 0.99 presumably keeps the target from being constant — confirm.
y = np.ones(n + 1)
y[0] = 0.99
base_model = LinearRegression()
# NOTE(review): train_data is transposed, so its n + 1 columns are passed as
# samples and its rows as features; coef_ then has one entry per row of
# train_data rather than per player — confirm this is intended.
base_model.fit(train_data.T, y)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [11]:
# Take the first n fitted coefficients and key them by player id.
model_ratings = base_model.coef_[:n]
ratings_dict = {index_to_id[idx]: model_ratings[idx] for idx in range(n)}

# Player ids ordered by ascending coefficient value.
pred_ratings = sorted(ratings_dict, key=ratings_dict.get)

### 3. Model Accuracy Metric

Due to the nature of ChGK, even one strong player can strongly affect the team's results. To get a more accurate prediction, we can take the top-3 rated players of each team and compute their average rating. Based on those averages we will predict the tournament results and compare them to the actual results.