# Предсказание рейтингов игроков спортивного "Что? Где? Когда?"

**Background**: в спортивном “Что? Где? Когда?” соревнующиеся команды отвечают на одни и те же вопросы. После минуты обсуждения команды записывают и сдают свои ответы на карточках; побеждает тот, кто ответил на большее число вопросов. Турнир обычно состоит из нескольких десятков вопросов (обычно 36 или 45, иногда 60, больше редко). Часто бывают синхронные турниры, когда на одни и те же вопросы отвечают команды на сотнях игровых площадок по всему миру, т.е. в одном турнире могут играть сотни, а то и тысячи команд. Соответственно, нам нужно: <br> <br>
- построить рейтинг-лист, который способен нетривиально предсказывать результаты будущих турниров;<br> <br>
- при этом, поскольку ЧГК — это хобби, и контрактов тут никаких нет, игроки постоянно переходят из команды в команду, сильный игрок может на один турнир сесть поиграть за другую команду и т.д.; поэтому единицей рейтинг-листа должна быть не команда, а отдельный игрок;<br> <br>
- а что сильно упрощает задачу и переводит её в область домашних заданий на EM-алгоритм — это характер данных: начиная с какого-то момента, в базу результатов начали вносить все повопросные результаты команд, т.е. в данных будут записи вида “какая команда на какой вопрос правильно ответила”.

In [164]:
# imports
import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix, lil_matrix
import scipy
import pickle
from collections import Counter

from sklearn.linear_model import LogisticRegression, LinearRegression
from liblinear.liblinearutil import *

import pdb

### 1. Processing data 

In [11]:
tournaments = pickle.load(open('tournaments.pkl', 'rb'))
results = pickle.load(open('results.pkl', 'rb'))

In [12]:
# get id of tournaments for specific year
def get_tournaments_id(year):
    cur_tournaments = [v for k,v in tournaments.items() if v['dateStart'][:4] == year]
    tournaments_with_results = [v for idx, v in enumerate(cur_tournaments) if v['id'] in results and results[v['id']] != []]
    tournaments_with_results_mask = [v for idx, v in enumerate(tournaments_with_results) if 'mask' in results[v['id']][0]]
    return [v['id'] for v in tournaments_with_results_mask]

# keep only useful ids in results
def clean_results(used_ids):
    useful_results = {}
    for id in used_ids:
        useful_results[id] = results[id]
    return useful_results

In [13]:
train_tournaments_id = get_tournaments_id('2019')
test_tournaments_id = get_tournaments_id('2020')
total_id = train_tournaments_id + test_tournaments_id

In [14]:
results = clean_results(total_id)
del tournaments

In [15]:
def get_active_players_kernel(teams_list: list, players_counter, id_to_name):
    for team in teams_list:
        if team['mask'] != None:
            team_members = team['teamMembers']
            member_ids = []

            # collect information on team members
            for member in team_members:
                member_ids.append(str(member['player']['id']))
                id_to_name[str(member['player']['id'])] = member['player']['name'] + ' ' + member['player']['surname']

            # update present players set
            players_counter.update(member_ids)
    return players_counter, id_to_name

def get_active_players(tournaments_ids):
    players_counter = Counter()
    id_to_name = {}
    for tourn_id, tourn_data in results.items():
        players_counter, id_to_name = get_active_players_kernel(tourn_data, players_counter, id_to_name)
    return players_counter, id_to_name

n = 1000
active_players, id_to_name = get_active_players(train_tournaments_id)
top_n_active_players = dict(active_players.most_common(n))

def player_id_to_index():
    id_to_index = {}
    index_to_id = {}
    i = 0
    for player_id in top_n_active_players:
        id_to_index[int(player_id)] = i
        index_to_id[i] = int(player_id)
        i = i + 1
    return id_to_index, index_to_id

id_to_index, index_to_id = player_id_to_index()

In [182]:
def parse_tournament(teams_list: list, cur_quest_cnt: int):
    """parse tournament information"""
    id_complex_target_list = []
    answers_count = Counter()
    
    # estimate question complexity 
    for team in teams_list:
        if team['mask'] != None:
            numb_questions = len(team['mask'])
            for i in range(cur_quest_cnt, numb_questions + cur_quest_cnt):
                if team['mask'][i - cur_quest_cnt] == '1':
                    temp = []
                    temp.append(i)
                    answers_count.update(temp)       
    
    complexity_dict = {}
    for i in range(cur_quest_cnt, numb_questions + cur_quest_cnt):
        if answers_count[i] / len(teams_list) != 0:
            complexity_dict[i] = 1 / (answers_count[i] / len(teams_list))
        else:
            complexity_dict[i] = 1
        
    # iterate over teams and collect information
    for team in teams_list:
        if team['mask'] != None:
            numb_questions = len(team['mask'])
            team_members = team['teamMembers']
            # collect team members
            for member in team_members:
                member_id = member['player']['id']
                for i in range(cur_quest_cnt, numb_questions + cur_quest_cnt):
                    if team['mask'][i - cur_quest_cnt] == '1' or team['mask'][i - cur_quest_cnt] == '0':
                        id_complex_target_list.append([member_id, complexity_dict[i], int(team['mask'][i - cur_quest_cnt])])
                        
    return id_complex_target_list, numb_questions

In [217]:
def make_table(tournaments_ids):
    quest_count = 0
    id_complex_target_list = []
    #data = []
    target_rating = []
    target_complexity = []
    max_len = 100000
    data = lil_matrix((max_len, n), dtype=np.float32)
    row_cnt = 0
    
    for tourn_id, tourn_data in results.items():
        tourn_info, numb_questions = parse_tournament(tourn_data, quest_count)
        quest_count = quest_count + numb_questions
        for info in tourn_info:
            if info[0] in id_to_index:
                if len(target_rating) == max_len:
                    break 
                
                data[row_cnt, id_to_index[info[0]]] = 1
                #data[row_cnt, -1] = info[1]
                target_rating.append(info[2])
                target_complexity.append(info[1])
                row_cnt = row_cnt + 1
        if len(target_rating) == max_len:
            break
    return data.tocsr(), np.array(target_rating).astype(int), np.array(target_complexity).astype(int)

In [218]:
train_x, rating_y, complexity_y = make_table(train_tournaments_id)
#test_x, test_y = make_table(test_tournaments_id)

### 2. Baseline Model for Players Ratings

Below I will train two models. First models will be targeted towards predicting the rating of the player. I will use linear regression. Each coefficient of a linear regression will correspond to respective players rating. Higher the coefficient, higher will the players rating be. <br>

Second model will be targeted at predicting the complexity of the question/tournament. It will be done by setting the complexity of the problem as a target and fitting linear regression. Each coefficient will correspond to whether the player is good at answering complex questions. Also, this coefficients will be indirectly corresponding to players rating, because if the player is good at answering complex problems higher his rating should be.

In [223]:
rating_model = train(rating_y, train_x)

In [224]:
ratings_dict = {}
model_ratings = rating_model.coef_[0:n]

for i in range(n):
    ratings_dict[index_to_id[i]] = model_ratings[i]
    
pred_ratings = list({k: v for k, v in sorted(ratings_dict.items(), key=lambda item: item[1])}.keys())

AttributeError: 'model' object has no attribute 'coef_'

In [222]:
test = train_x.toarray()
for i in range(n):
    print(model.get_decfun_coef(i))

0.0
0.25300973049474973
0.4301861350230023
0.35852753815719407
0.3387298563913128
0.06964037010524782
0.47702327054209537
0.30825599574705165
0.31094888229784245
0.3873219600485883
0.40929426182899076
0.33401486576387895
0.12097224311076649
0.12773809030492955
0.05036068359715584
0.48355590841974627
0.5064583996168432
0.05054546558458339
0.02610452988580214
0.4836441777065401
0.11134664189236576
0.43288351273643766
0.3403662298384058
0.4311221661527438
0.32610378844424703
-0.09282278628642238
0.3411336498473425
0.4342878209586518
0.0414782900720394
0.01295219106186718
0.4534064051179322
-0.09412678219852055
0.44209856007129045
0.41747493699959126
0.19716904859839612
0.09320699992910508
0.08377242318497213
0.1703941779770224
-0.03682091585101732
0.23310236861606248
0.24568163759882655
0.31083536781918675
0.14585429259786342
0.4695639212730611
0.17038700574500687
0.3617616593946752
0.08331682893085457
0.2511051914998381
-0.07050797676563414
0.21668347491068496
0.29600293193305494
-0.0692

In [210]:
train_x[1].toarray()

array([[1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
        1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
        1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
        1., 1., 1., 1., 1., 1., 0., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
        1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 0., 1., 1., 1., 1.,
        1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
        1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 0., 0.,
        1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
        1., 1., 1., 1., 1., 0., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
        0., 1., 1., 1., 1., 1., 1., 0., 1., 1., 1., 1., 1., 0., 1., 1.,
        1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
        1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
        1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 0., 1.,
        1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 

### 3. Model Accuracy Metric

Due to nature of the game ChGK, even one strong player in the team can strongly affect the team's results. To get more accurate prediction we can take top-3 rated players of the team and computer their average. Based on those averages we will predict the tournament results and compare them to actual results. 