In [6]:
# Fetch chgk.zip and unpack it into the working directory.
! wget -q https://www.dropbox.com/s/s4qj0fpsn378m2i/chgk.zip
! unzip chgk.zip

Archive:  chgk.zip
  inflating: players.pkl             
  inflating: results.pkl             
  inflating: tournaments.pkl         


In [7]:
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
import json
import pickle
import numpy as np
import scipy as sp
import scipy.stats as st
import scipy.integrate as integrate
#from scipy.stats import multivariate_normal
#from sklearn import linear_model
#from sklearn.linear_model import LinearRegression
#from sklearn.exceptions import ConvergenceWarning
import statsmodels.api as sm

import pandas as pd
import gc
import datetime
#from sklearn.preprocessing import MinMaxScaler

# Seaborn theme and palette, plus a few plotting constants.
sns.set_style("whitegrid")
sns.set_palette("colorblind")
palette = sns.color_palette()
figsize = (12,6)
legend_fontsize = 16

from matplotlib import rc
rc('font',**{'family':'sans-serif'})
rc('text', usetex=True)
# NOTE(review): both calls below set the same `text.latex` preamble key, so the
# second one presumably replaces the first rather than adding to it — confirm.
rc('text.latex',preamble=r'\usepackage[utf8]{inputenc}')
rc('text.latex',preamble=r'\usepackage[russian]{babel}')
rc('figure', **{'dpi': 200})
# NOTE(review): this writes matplotlib's default values back over rcParams, which
# looks like it discards the rc(...) settings made above — confirm it is intended.
mpl.rcParams.update(mpl.rcParamsDefault)

In [8]:
from scipy.stats import kendalltau, spearmanr
from tqdm import tqdm
import warnings
warnings.filterwarnings('ignore')

import itertools

# One seed shared by the `random` module and NumPy's global generator.
# NOTE(review): torch is imported in a later cell and is not seeded here.
SEED = 1488
import random
random.seed(SEED)
np.random.seed(SEED)

from tqdm import tqdm
# Registers tqdm's `progress_*` methods on pandas objects.
tqdm.pandas()

import gc

In [9]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder
from scipy.stats import kendalltau, spearmanr
import torch
import torch.nn as nn
import torch.optim as optim

# Task 1

**Прочитайте и проанализируйте данные, выберите турниры, в которых есть данные о составах команд и повопросных результатах**

In [10]:
# Tournaments are split by the calendar year of their `dateStart`:
# TRAIN_YEAR goes to the training set, TEST_YEAR to the test set.
TRAIN_YEAR = 2019
TEST_YEAR = 2020

In [52]:
# Load the three raw pickles extracted from the archive.
# NOTE: pickle.load can execute arbitrary code stored in the file; only run this
# on archives from a trusted source.
# Context managers make sure every file handle is closed after loading.
with open('results.pkl', 'rb') as results_file:
    results = pickle.load(results_file)
with open('players.pkl', 'rb') as players_file:
    players = pickle.load(players_file)
with open('tournaments.pkl', 'rb') as tournaments_file:
    tournaments = pickle.load(tournaments_file)

In [53]:
# One row per key of `players`; the fields of each value become columns.
players_df = pd.DataFrame.from_dict(players, orient="index")

In [54]:
# One row per key of `tournaments`; `dateStart` is parsed to datetimes so that
# the year can be read off it later.
tournaments_df = pd.DataFrame.from_dict(tournaments, orient="index")
tournaments_df["dateStart"] = pd.to_datetime(tournaments_df.dateStart)

In [55]:
# Flatten `results` into one record per (tournament, team, player), keeping only
# tournaments that start in TRAIN_YEAR or TEST_YEAR and teams that have a mask.
results_list = []
years_of_interest = (TRAIN_YEAR, TEST_YEAR)
for tournament_id, teams in tqdm(results.items()):
    start_date = tournaments_df[tournaments_df.id == tournament_id].dateStart.item()
    tournament_year = start_date.year
    if tournament_year not in years_of_interest:
        continue
    for team in teams:
        if team.get("mask") is None:
            # No per-question results for this team.
            continue
        for player in team["teamMembers"]:
            results_list.append({
                "tournament_id": tournament_id,
                "tournament_year": tournament_year,
                "team_id": team["team"]["id"],
                "mask": team["mask"],
                "questions_total": team["questionsTotal"],
                "num_questions": len(team["mask"]),
                "position": team["position"],
                "team_size": len(team["teamMembers"]),
                "player_flag": player["flag"],
                "player_used_rating": player["usedRating"],
                "player_rating": player["rating"],
                "player_id": player["player"]["id"],
            })

100%|██████████| 5528/5528 [00:04<00:00, 1370.68it/s]


In [56]:
# One row per player result; the dict keys of each record become the columns.
results_df = pd.DataFrame.from_dict(results_list, orient="columns")

**Взять в тренировочный набор турниры с dateStart из 2019 года;   
В тестовый — турниры с dateStart из 2020 года.**

In [57]:
# Expand every player result into (player_id, question_id, answer_mark) triples.
# A question id is tournament_id * 1000 + position of the question in the mask.
baseline_data_train = []
baseline_data_test = []
for _row_label, row in tqdm(results_df.iterrows(), total=results_df.shape[0]):
    if row.tournament_year == TEST_YEAR:
        destination = baseline_data_test
    else:
        destination = baseline_data_train
    answer_marks = row["mask"]
    first_question_id = row.tournament_id * 1000
    destination.extend(
        (row.player_id, first_question_id + offset, mark)
        for offset, mark in enumerate(answer_marks)
    )

100%|██████████| 564624/564624 [01:09<00:00, 8069.31it/s]


In [58]:
# Turn both lists of triples into DataFrames with the same three columns.
baseline_columns = ["player_id", "question_id", "is_correct"]
baseline_data_train_df, baseline_data_test_df = (
    pd.DataFrame.from_records(records, columns=baseline_columns)
    for records in (baseline_data_train, baseline_data_test)
)

In [59]:
# Remove, in place, the rows whose answer mark is 'X' or '?' (neither 0 nor 1).
for frame in (baseline_data_train_df, baseline_data_test_df):
    unusable_rows = frame.index[frame.is_correct.isin(['X', '?'])]
    frame.drop(unusable_rows, inplace=True)

In [60]:
# The remaining marks are converted to integers.
for frame in (baseline_data_train_df, baseline_data_test_df):
    frame["is_correct"] = frame["is_correct"].astype(int)

In [61]:
# Save baseline data
# Cache both baseline frames as CSV (without the index column).
baseline_data_train_df.to_csv("baseline_data_train_df.csv", index=False)
baseline_data_test_df.to_csv("baseline_data_test_df.csv", index=False)

# Task 2

**Постройте baseline-модель на основе линейной или логистической регрессии, которая будет обучать рейтинг-лист игроков**

Построим логистическую регрессию, интерпретируя выход модели как вероятность события "игрок $p_i$ ответит верно на вопрос $q_j$":  

$$logreg(p_i, q_j, weights) = P(true\_answer|player=p_i, question=q_j)$$
Тогда по формуле полной вероятности можем найти абсолютную вероятность игрока ответить правильно:
$$P(true\_answer|player=p_i) = \sum_{j=1}^{1000} P(true\_answer|player=p_i, question=q_j) * P(question=q_j) = \sum_{j=1}^{1000} P(true\_answer|player=p_i, question=q_j) * \frac{1}{1000}$$
и на основе этой вероятности построить рейтинг игроков

In [62]:
# Load baseline data
# Reload the cached baseline frames written by the save cell.
baseline_data_train_df = pd.read_csv("baseline_data_train_df.csv")
baseline_data_test_df = pd.read_csv("baseline_data_test_df.csv")

In [63]:
# Features are the two ids; the encoder one-hot encodes both columns.
X_train = baseline_data_train_df[["player_id", "question_id"]]
# Double brackets keep the target as a one-column DataFrame.
y_train = baseline_data_train_df[["is_correct"]]
one_hot_enc = OneHotEncoder()
X_train_encoded = one_hot_enc.fit_transform(X_train)

In [64]:
# Baseline: logistic regression over the one-hot player and question indicators.
# NOTE(review): solver settings are left at library defaults and warnings are
# silenced earlier in the notebook, so a convergence warning would not be visible.
lr = LogisticRegression(random_state=SEED, n_jobs=-1)
lr.fit(X_train_encoded, y_train);

In [65]:
# Save baseline model
# Persist the fitted encoder and model. The context managers guarantee the
# files are flushed and closed as soon as each dump finishes.
with open("one_hot_enc.pkl", "wb") as encoder_file:
    pickle.dump(one_hot_enc, encoder_file)
with open("lr.pkl", "wb") as model_file:
    pickle.dump(lr, model_file)

In [None]:
# Load baseline model
# Restore the encoder and model written by the save cell.
# NOTE: pickle.load can execute arbitrary code; only load files you created.
with open("one_hot_enc.pkl", "rb") as encoder_file:
    one_hot_enc = pickle.load(encoder_file)
with open("lr.pkl", "rb") as model_file:
    lr = pickle.load(model_file)

In [66]:
# Draw a fixed-size sample of distinct training questions (no replacement);
# every player is scored against this same sample.
num_questions_to_sample = 1000
sampled_question_ids = np.random.choice(baseline_data_train_df.question_id.unique(), size=num_questions_to_sample, replace=False)

In [67]:
# Distinct players that appear in the training data.
player_ids = baseline_data_train_df.player_id.unique()

In [68]:
# Cross product of players and sampled questions: each player id is repeated
# once per sampled question, and the question sample is tiled once per player.
X_eval = pd.DataFrame({"player_id": np.repeat(player_ids, num_questions_to_sample), 
                       "question_id": np.tile(sampled_question_ids, player_ids.shape[0])})

In [69]:
# Probability of the positive class for every (player, sampled question) pair.
# Only the two id columns are passed to the encoder, so the cell can be re-run
# after `is_correct_prob` (or any other column) has been added to X_eval.
X_eval["is_correct_prob"] = lr.predict_proba(
    one_hot_enc.transform(X_eval[["player_id", "question_id"]])
)[:, 1]

In [70]:
# Each sampled question gets the same weight 1 / num_questions_to_sample.
X_eval["is_correct_prob_x_question_prob"] = X_eval.is_correct_prob / num_questions_to_sample

In [71]:
# Summing the weighted probabilities per player gives the player's average
# predicted probability over the sampled questions.
player_score = X_eval.groupby("player_id")["is_correct_prob_x_question_prob"].sum()

In [72]:
# Rating list: players ordered from highest to lowest score, with the row
# index renumbered from 0 after sorting.
players_rating = (
    player_score
    .reset_index()
    .rename(columns={"is_correct_prob_x_question_prob": "is_correct_prob"})
    .sort_values(by="is_correct_prob", ascending=False)
    .reset_index(drop=True)
)

In [73]:
# Attach a "<name> <surname>" label to every rated player.
full_names = players_df.name + " " + players_df.surname
name_by_player_id = dict(zip(players_df.id.values, full_names.values))
players_rating['name'] = players_rating['player_id'].map(name_by_player_id)

Рейтинг Сергея Игоревича Николенко: 19-е место (индекс 18 в таблице ниже, score: 0.78)

In [74]:
# Look one player up by the "<name> <surname>" label built above.
players_rating[players_rating.name == "Сергей Николенко"]

Unnamed: 0,player_id,is_correct_prob,name
18,22799,0.782032,Сергей Николенко


Топ 10 игроков по "мнению" baseline модели:

In [75]:
# First ten rows of the rating (sorted by score, descending).
players_rating.head(10)

Unnamed: 0,player_id,is_correct_prob,name
0,27403,0.839018,Максим Руссо
1,28751,0.829054,Иван Семушин
2,4270,0.827459,Александра Брутер
3,27822,0.813458,Михаил Савченков
4,30152,0.809231,Артём Сорожкин
5,18036,0.804991,Михаил Левандовский
6,30270,0.803202,Сергей Спешков
7,87637,0.801508,Антон Саксонов
8,20691,0.801436,Станислав Мереминский
9,26089,0.79724,Ирина Прокофьева


In [None]:
# Cache the rating list as CSV (without the index column).
players_rating.to_csv("players_rating.csv", index=False)

# Task 3

**Предложите способ предсказать результаты нового турнира с известными составами, но неизвестными вопросами, в виде ранжирования команд;**

Будем считать, что команда ответила правильно, если хотя бы один участник ответил правильно. Тогда, в предположении независимости ответов игроков, абсолютная вероятность команды ответить правильно:
$$P(true\_answer|team=t_k=[p_1, \dots, p_N]) = 1 - \prod_{i=1}^N (1 - P(true\_answer|player=p_i))$$
Будем ранжировать команды по этой вероятности

In [None]:
# Reload the cached rating list.
players_rating = pd.read_csv("players_rating.csv")

In [77]:
# One record per (tournament, team) with the final position and the roster,
# routed to the train or test list by the tournament's start year.
score_team_data_train = []
score_team_data_test = []
# TRAIN_YEAR is listed last so that it takes precedence if both constants coincide.
team_records_by_year = {TEST_YEAR: score_team_data_test, TRAIN_YEAR: score_team_data_train}
for tournament_id, teams in tqdm(results.items()):
    tournament_year = tournaments_df[tournaments_df.id == tournament_id].dateStart.item().year
    if tournament_year not in team_records_by_year:
        continue
    score_team_data = team_records_by_year[tournament_year]
    for team in teams:
        if team.get("position") is None:
            # Without a final position there is nothing to rank against.
            continue
        roster = [member["player"]["id"] for member in team["teamMembers"]]
        score_team_data.append({
            "tournament_id": tournament_id,
            "tournament_year": tournament_year,
            "team_id": team["team"]["id"],
            "true_team_position": team["position"],
            "team_players": roster,
        })

100%|██████████| 5528/5528 [00:03<00:00, 1727.99it/s]


In [78]:
# One DataFrame row per team record.
score_team_data_train_df = pd.DataFrame.from_records(score_team_data_train)
score_team_data_test_df = pd.DataFrame.from_records(score_team_data_test)

In [79]:
# Save data for baseline ranking team model
# Cache the team frames as CSV.
# NOTE(review): `team_players` holds Python lists, which CSV presumably stores as
# text — they would need parsing if these files are ever read back.
score_team_data_train_df.to_csv("score_team_data_train_df.csv", index=False)
score_team_data_test_df.to_csv("score_team_data_test_df.csv", index=False)

In [80]:
def get_is_correct_prob(player_ids):
    """Probability that at least one of the given players answers correctly.

    Looks the players up in the global `players_rating` frame and returns
    1 - prod(1 - p_i) over the rows found. Ids that are absent from the
    rating contribute nothing to the product.
    """
    is_rated_member = players_rating.player_id.isin(player_ids)
    miss_probs = 1 - players_rating.loc[is_rated_member, "is_correct_prob"].values
    return 1 - miss_probs.prod()

In [81]:
# Team score = probability that at least one rated member answers correctly.
score_team_data_train_df["is_correct_prob"] = score_team_data_train_df.team_players.apply(get_is_correct_prob)

In [82]:
# Same team score for the test-year tournaments.
score_team_data_test_df["is_correct_prob"] = score_team_data_test_df.team_players.apply(get_is_correct_prob)

In [None]:
# Save data for baseline ranking team model with scores
# Cache the team frames together with their scores.
score_team_data_train_df.to_csv("score_team_data_train_df_with_scores.csv", index=False)
score_team_data_test_df.to_csv("score_team_data_test_df_with_scores.csv", index=False)

In [83]:
# Predicted place inside each tournament: the highest team score gets rank 1,
# and equal scores share a rank (dense ranking).
score_team_data_test_df["pred_team_position"] = (
    score_team_data_test_df
    .groupby("tournament_id")["is_correct_prob"]
    .rank(method="dense", ascending=False)
)

**В качестве метрики качества на тестовом наборе давайте считать ранговые корреляции Спирмена и Кендалла  
(Для самопроверки: у меня средняя корреляция Спирмена на тестовом множестве 2020 года во всех моделях, включая baselines, получалась порядка 0.7-0.8, а корреляция Кендалла — порядка 0.5-0.6)**

In [84]:
def _mean_tournament_correlation(correlation_fn):
    """Average, over test tournaments, of the correlation between true and predicted positions."""
    per_tournament = score_team_data_test_df.groupby('tournament_id').apply(
        lambda group: correlation_fn(group['true_team_position'], group['pred_team_position']).correlation
    )
    return per_tournament.mean()


spearman_correlation = _mean_tournament_correlation(spearmanr)
kendall_correlation = _mean_tournament_correlation(kendalltau)

In [85]:
# Report both averaged rank correlations with three decimals.
print(f"Spearman correlation: {spearman_correlation:.3f}\nKendall correlation: {kendall_correlation:.3f}")

Spearman correlation: 0.761
Kendall correlation: 0.604


# Task 4

**Предложите способ учитывать то, что на вопрос отвечают сразу несколько игроков**  
Пусть $z_{ij}$ - скрытая переменная: правильный/неправильный ответ игрока $i$ на вопрос $j$.  
Тогда
$$P(z_{ij} - true\_answer) = 
P(z_{ij} - true\_answer|t_{kj} - true\_answer) * P(t_{kj} - true\_answer) 
+ P(z_{ij} - true\_answer|t_{kj} - false\_answer) * P(t_{kj} - false\_answer)$$
где $t_{kj}$ - ответ команды $k$ (участником которой является игрок $i$) на вопрос $j$.  
В предположении, что команда отвечает верно тогда и только тогда, когда хотя бы один участник команды отвечает верно, получим:
$$P(z_{ij} - true\_answer|t_{kj} - false\_answer) = 0$$
$$P(z_{ij} - true\_answer) = P(z_{ij} - true\_answer|t_{kj} - true\_answer) * P(t_{kj} - true\_answer)$$
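$$P(z_{ij} - true\_answer|t_{kj} - true\_answer) = 
\frac{P(t_{kj} - true\_answer|z_{ij} - true\_answer) * P(z_{ij} - true\_answer)}{P(t_{kj} - true\_answer)}$$
Так как при правильном ответе игрока команда заведомо отвечает правильно, $P(t_{kj} - true\_answer|z_{ij} - true\_answer) = 1$, и в числителе остаётся только $P(z_{ij} - true\_answer)$.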

**Разработайте EM-схему для обучения этой модели, реализуйте её в коде**  
M-шаг:  
Находим $P(z_{ij} - true\_answer) = logreg(p_i, q_j, weights)$  
E-шаг:  
Находим $P(z_{ij} - true\_answer|t_{kj} - true\_answer) = 
\frac{logreg(p_i, q_j, weights)}{1 - \prod_{s \in t_k} (1 - logreg(p_s, q_j, weights))}$  

In [12]:
# Build one record per (tournament, team, player) for the EM model.
# Train-year records also carry the team's answer mask and are kept only when
# the mask length matches the tournament's declared number of questions;
# test-year records are kept as they are.
em_train_data = []
em_test_data = []
em_train_year, em_test_year = str(TRAIN_YEAR), str(TEST_YEAR)
for tournament, teams in results.items():
    if not teams:
        continue
    # The year is the first four characters of the `dateStart` string; it is
    # the same for every team of the tournament, so it is read once per tournament.
    tournament_info = tournaments[tournament]
    tournament_year = tournament_info["dateStart"][:4]
    if tournament_year not in (em_train_year, em_test_year):
        continue
    for team in teams:
        if not team.get("mask", None):
            # No per-question results for this team.
            continue
        for member in team['teamMembers']:
            player = member["player"]
            add = {
                "tournament_id": tournament,
                "tournament_name": tournament_info.get("name"),
                "team_id": team["team"]["id"],
                "team_name": team["team"]["name"],
                "player_id": player["id"],
                "player_name": player["surname"] + " " + player["name"] + " " + player["patronymic"],
                "position": team.get("position", None),
            }
            if tournament_year == em_train_year:
                add["number_of_questions"] = sum(tournament_info.get("questionQty").values())
                add["questions_mask"] = team["mask"]
                if add["number_of_questions"] == len(add["questions_mask"]):
                    em_train_data.append(add)
            elif tournament_year == em_test_year:
                em_test_data.append(add)

In [23]:
em_test_data_df = pd.DataFrame(em_test_data)
# The list of dicts is no longer needed once the DataFrame exists.
del em_test_data

# Save test data for em model
em_test_data_df.to_csv("em_test_data_df.csv", index=False)

In [None]:
# Load test data for em model
# Reload the cached EM test frame.
em_test_data_df = pd.read_csv("em_test_data_df.csv")

In [13]:
# Explode every train record into one row per answered question.
# Only marks "0" and "1" are kept; a question id is "<tournament_id>_<index in mask>".
# The loop index has a real name: it is used in the id, and `_` would also
# shadow IPython's last-output variable.
em_train_data_by_question = []
for item in em_train_data:
    for question_index, result in enumerate(item["questions_mask"]):
        if result not in ("0", "1"):
            continue
        em_train_data_by_question.append({
            "tournament_id": item["tournament_id"],
            "team_id": item["team_id"],
            "player_id": item["player_id"],
            "question_id": str(item["tournament_id"]) + "_" + str(question_index),
            "team_is_correct": int(result),
        })

In [14]:
# Free the large raw structures before building the per-question frame.
# NOTE: after this cell `results` no longer exists, so earlier cells that read
# it cannot be re-run without loading the pickle again.
del results
del em_train_data
gc.collect()

# Compact dtypes for the id and target columns; `question_id` is not listed
# in the cast.
em_train_data_by_question_df = pd.DataFrame(em_train_data_by_question).astype({
    "tournament_id": np.int32,
    "team_id": np.int32,
    "player_id": np.int32,
    "team_is_correct": np.uint8})

del em_train_data_by_question
gc.collect()

0

In [15]:
# Save train data for em model
# Cache the per-question training frame as CSV (without the index column).
em_train_data_by_question_df.to_csv("em_train_data_by_question_df.csv", index=False)

In [None]:
# Load train data for em model
# Reload the cached per-question training frame.
em_train_data_by_question_df = pd.read_csv("em_train_data_by_question_df.csv")

In [16]:
# Define label encoder for pytorch embeddings usage
# Map player and question ids to consecutive integer codes so they can be
# used as embedding indices in the torch model.
player_le = LabelEncoder()
question_le = LabelEncoder()

em_train_data_by_question_df["player_id_encoded"] = player_le.fit_transform(
    em_train_data_by_question_df.player_id
)
em_train_data_by_question_df["question_id_encoded"] = question_le.fit_transform(
    em_train_data_by_question_df.question_id
)

In [19]:
class LogReg(nn.Module):
    """Logistic model with one scalar weight per player and per question.

    The prediction for a (player, question) pair is
    sigmoid(player_weight + question_weight + bias).
    """

    def __init__(self, num_players, num_questions):
        """Create the two one-dimensional embedding tables and a zero bias.

        Args:
            num_players: number of distinct player indices.
            num_questions: number of distinct question indices.
        """
        super().__init__()
        self.players_weights = nn.Embedding(num_players, 1)
        self.questions_weights = nn.Embedding(num_questions, 1)
        self.bias = nn.Parameter(torch.zeros(1), requires_grad=True)
        self.sigmoid = nn.Sigmoid()

    def reset_parameters(self):
        """Re-initialise both embedding tables and set the bias back to zero.

        The bias is zeroed in place rather than replaced by a new Parameter,
        so optimizers or other references created before the reset keep
        pointing at the live parameter.
        """
        self.players_weights.reset_parameters()
        self.questions_weights.reset_parameters()
        nn.init.zeros_(self.bias)

    def forward(self, player_inds, question_inds):
        """Return probabilities of shape (N, 1) for N index pairs.

        Args:
            player_inds: integer tensor of player indices, shape (N,).
            question_inds: integer tensor of question indices, shape (N,).
        """
        return self.sigmoid(self.players_weights(player_inds)
                            + self.questions_weights(question_inds)
                            + self.bias)

    
def train(model, X_, y, lr=1, n_epochs=100):
    """Fit `model` with full-batch Adam steps on binary cross-entropy.

    Args:
        model: module called as model(player_inds, question_inds).
        X_: 2-D index tensor; column 0 is passed as player indices and
            column 1 as question indices.
        y: targets, reshaped to a column to match the model output.
        lr: Adam learning rate.
        n_epochs: number of optimisation steps, each on the whole data.

    Returns:
        The same `model` object after training.
    """
    loss_fn = nn.BCELoss()
    adam = optim.Adam(model.parameters(), lr)
    player_inds, question_inds = X_[:, 0], X_[:, 1]
    expected = y.reshape(-1, 1)
    for _ in tqdm(range(n_epochs)):
        adam.zero_grad()
        predicted = model(player_inds, question_inds)
        loss_fn(predicted, expected).backward()
        adam.step()
    return model

In [17]:
# Define train tensors
# Two-column index tensor (encoded player, encoded question) and float
# targets taken from the team-level outcome.
X = torch.LongTensor(em_train_data_by_question_df[["player_id_encoded", "question_id_encoded"]].values)
targets = torch.FloatTensor(em_train_data_by_question_df["team_is_correct"].values)

In [20]:
# Initial fit on the team-level targets; the embedding tables are sized by
# the number of classes seen by each label encoder.
model = LogReg(player_le.classes_.shape[0], question_le.classes_.shape[0])
model = train(model, X, targets, n_epochs=10)

100%|██████████| 10/10 [00:22<00:00,  2.29s/it]


In [21]:
# Distinct player ids present in the EM training frame.
PLAYERS_IDS = em_train_data_by_question_df.player_id.unique()

def calculate_correlations(data, player_le, question_le, parameters):
    """Rank-correlate predicted team strength with final positions, per tournament.

    Every player gets the probability sigmoid(player_weight + mean question
    weight + bias); players the encoder has not seen use the mean player
    weight. A team's strength is 1 - prod(1 - p) over its rows.

    Args:
        data: DataFrame with `tournament_id`, `team_id`, `player_id` and
            `position` columns, one row per (team, player). The columns
            `player_weights` and `player_probability` are written onto it.
        player_le: fitted label encoder; `player_le.classes_[k]` is the player
            id whose weight is `parameters[0][k]`.
        question_le: not used; kept so existing calls keep working.
        parameters: `[player_weights, question_weights, bias]` as 1-D arrays.

    Returns:
        (spearman, kendall): mean over tournaments of the negated correlation
        between position and team strength. The sign is flipped because a
        stronger team is expected to have a smaller position number.
    """
    player_params, question_params, bias = parameters[0], parameters[1], parameters[2]
    mean_player_weight = player_params.sum() / player_params.shape[0]
    mean_question_weight = question_params.sum() / question_params.shape[0]

    # The encoder's classes are the ids in code order, so zipping them with the
    # weight vector gives the id -> weight lookup without any global state.
    weight_by_player = dict(zip(player_le.classes_, player_params))
    # Assign the filled column back explicitly: an in-place fillna on a column
    # selection does not reach the frame under pandas Copy-on-Write.
    data["player_weights"] = (
        data["player_id"].map(weight_by_player).astype(np.float64).fillna(mean_player_weight)
    )
    logits = data["player_weights"] + mean_question_weight + bias.item()
    data["player_probability"] = 1 / (1 + np.exp(-logits))

    by_team = data.groupby(["tournament_id", "team_id"])
    team_probability = by_team["player_probability"].apply(lambda p: 1 - np.prod(1 - p))
    team_position = by_team["position"].first()
    group_data = pd.concat([team_probability, team_position], axis=1)
    group_data = group_data.sort_values(["tournament_id", "player_probability"], ascending=[True, True])

    by_tournament = group_data.groupby("tournament_id")
    spearman = by_tournament.apply(
        lambda x: - spearmanr(x["position"], x["player_probability"]).correlation
    ).mean()
    kendall = by_tournament.apply(
        lambda x: - kendalltau(x["position"], x["player_probability"]).correlation
    ).mean()
    return spearman, kendall

In [24]:
# Pull the three parameter groups out of the model as flat NumPy arrays and
# score the model on the test-year tournaments.
weights = [
    parameter.data.numpy().flatten()
    for parameter in (model.players_weights.weight, model.questions_weights.weight, model.bias)
]
correlations = calculate_correlations(em_test_data_df, player_le, question_le, weights)
spearman_val, kendall_val = correlations
print(f"spearman {spearman_val}, kendall {kendall_val}")

spearman 0.766053259524078, kendall 0.6090105740076377


In [25]:
def e_step(data, predicts):
    """E-step: turn model probabilities into per-player soft targets.

    For rows whose team answered incorrectly the target is 0. For rows whose
    team answered correctly the target is p_i / (1 - prod_s (1 - p_s)), where
    the product runs over the rows of the same (team_id, question_id) group.

    Args:
        data: DataFrame with `team_id`, `question_id` and `team_is_correct`
            columns; the column `player_is_correct` is written onto it.
        predicts: model probabilities, one per row of `data`.

    Returns:
        The same `data` object with `player_is_correct` filled in.
    """
    data["player_is_correct"] = predicts
    data.loc[data["team_is_correct"] == 0, "player_is_correct"] = 0

    team_correct = data["team_is_correct"] == 1
    answered = data.loc[team_correct, ["team_id", "question_id", "player_is_correct"]]
    # Probability that nobody in the team knew the answer, computed with the
    # built-in grouped product instead of a Python callback per group.
    nobody_knew = (
        (1 - answered["player_is_correct"])
        .groupby([answered["team_id"], answered["question_id"]])
        .transform("prod")
    )
    data.loc[team_correct, "player_is_correct"] = answered["player_is_correct"] / (1 - nobody_knew)
    return data


def m_step(model, X, y, lr=0.001, n_epochs=10):
    """M-step: re-initialise `model` and refit it on the current soft targets.

    Args:
        model: model exposing `reset_parameters()`.
        X: index tensor passed through to `train`.
        y: soft targets passed through to `train`.
        lr: learning rate passed through to `train`.
        n_epochs: number of optimisation steps passed through to `train`.

    Returns:
        The same `model` object after refitting.
    """
    model.reset_parameters()
    train(model, X, y, lr=lr, n_epochs=n_epochs)
    return model

In [47]:
# EM driver: alternate the E-step (soft per-player targets) and the M-step
# (refit of the logistic model), scoring on the test tournaments each round.
nrwos = 17_739_658  # NOTE(review): presumably the number of training rows to use — confirm
n_epochs = 100      # optimisation steps per M-step
n_em_iterations = 6

# Seed torch so the embedding initialisations in this cell are repeatable.
torch.manual_seed(SEED)


def _predict_player_probs(model, X):
    """Model probabilities for every row of X as a flat NumPy array (no autograd graph is built)."""
    with torch.no_grad():
        return model(X[:, 0], X[:, 1]).numpy().flatten()


def _model_weights(model):
    """Player weights, question weights and bias of `model` as flat NumPy arrays."""
    return [model.players_weights.weight.data.numpy().flatten(),
            model.questions_weights.weight.data.numpy().flatten(),
            model.bias.data.numpy().flatten()]


model = LogReg(player_le.classes_.shape[0], question_le.classes_.shape[0])
# The first E-step uses the predictions of the freshly initialised model.
predicts = _predict_player_probs(model, X)
for i in range(n_em_iterations):
    em_train_data_by_question_df_with_probs = e_step(
        em_train_data_by_question_df.iloc[:nrwos].copy(), predicts[:nrwos]
    )
    targets = torch.FloatTensor(em_train_data_by_question_df_with_probs["player_is_correct"].values).view(-1, 1)
    model = m_step(model, X[:nrwos], targets, lr=1, n_epochs=n_epochs)
    predicts = _predict_player_probs(model, X)
    weights = _model_weights(model)
    spearman_val, kendall_val = calculate_correlations(em_test_data_df, player_le, question_le, weights)
    print(f"|Epoch {i}| Spearman correlation: {spearman_val:.3f}, Kendall correlation: {kendall_val:.3f}")

100%|██████████| 1418391/1418391 [03:28<00:00, 6789.98it/s]
100%|██████████| 100/100 [03:31<00:00,  2.11s/it]


|Epoch 0| Spearman correlation: 0.722, Kendall correlation: 0.559


100%|██████████| 1418391/1418391 [03:24<00:00, 6939.11it/s] 
100%|██████████| 100/100 [03:26<00:00,  2.06s/it]


|Epoch 1| Spearman correlation: 0.772, Kendall correlation: 0.610


100%|██████████| 1418391/1418391 [03:23<00:00, 6966.67it/s] 
100%|██████████| 100/100 [03:23<00:00,  2.03s/it]


|Epoch 2| Spearman correlation: 0.782, Kendall correlation: 0.622


100%|██████████| 1418391/1418391 [03:23<00:00, 6957.19it/s] 
100%|██████████| 100/100 [03:31<00:00,  2.12s/it]


|Epoch 3| Spearman correlation: 0.787, Kendall correlation: 0.628


100%|██████████| 1418391/1418391 [03:22<00:00, 7006.99it/s] 
100%|██████████| 100/100 [03:24<00:00,  2.04s/it]


|Epoch 4| Spearman correlation: 0.789, Kendall correlation: 0.630


100%|██████████| 1418391/1418391 [03:22<00:00, 6997.26it/s] 
100%|██████████| 100/100 [03:32<00:00,  2.13s/it]


|Epoch 5| Spearman correlation: 0.790, Kendall correlation: 0.632


# Task 5

**Постройте “рейтинг-лист” турниров по сложности вопросов.**

In [49]:
# Learned weight of every training question, keyed by its question id.
QUESTIONS_IDS = em_train_data_by_question_df.question_id.unique()
encoded_question_ids = question_le.transform(QUESTIONS_IDS)
question_weights = dict(zip(QUESTIONS_IDS, weights[1][encoded_question_ids]))


def _mean_question_weight(question_ids):
    """Average learned weight over the given question ids."""
    return np.mean([question_weights[q] for q in question_ids])


# Tournament names ordered by the mean weight of their questions, ascending.
mean_weight_by_tournament = (
    em_train_data_by_question_df_with_probs
    .groupby("tournament_id")["question_id"]
    .apply(_mean_question_weight)
)
ordered_tournament_ids = mean_weight_by_tournament.sort_values().reset_index()["tournament_id"]
tournament_weights = ordered_tournament_ids.apply(lambda x: tournaments[x]["name"])

**на чемпионате мира в целом должны быть сложные вопросы**

In [50]:
# First ten names: the tournaments with the lowest mean question weight.
tournament_weights.head(10)

0    Чемпионат Санкт-Петербурга. Первая лига
1                     Чемпионат Таджикистана
2          (А) Славянка без раздаток. 4 этап
3     Зеркало мемориала памяти Михаила Басса
4                                Угрюмый Ёрш
5                         Воображаемый музей
6                 Синхрон высшей лиги Москвы
7            Чемпионат Мира. Этап 2 Группа С
8                            Поволжская лига
9               Первенство правого полушария
Name: tournament_id, dtype: object

**..., а на турнирах для школьников — простые**

In [51]:
# Last ten names: the tournaments with the highest mean question weight.
tournament_weights.tail(10)

665           Школьный Синхрон-lite. Выпуск 3.3
666    (а)Синхрон-lite. Лига старта. Эпизод VII
667           Школьный Синхрон-lite. Выпуск 3.1
668                               Школьная лига
669                      Школьная лига. II тур.
670           Школьный Синхрон-lite. Выпуск 2.5
671      (а)Синхрон-lite. Лига старта. Эпизод V
672                       Школьная лига. I тур.
673                     Школьная лига. III тур.
674                            One ring - async
Name: tournament_id, dtype: object