In [1]:
import os
import gc
import pickle
from datetime import datetime

import pandas as pd
import numpy as np
from tqdm import tqdm

In [2]:
DATA_PATH = 'data'
PLAYERS_DATA = os.path.join(DATA_PATH, 'players.pkl')
RESULTS_DATA = os.path.join(DATA_PATH, 'results.pkl')
TOURNAMENT_DATA = os.path.join(DATA_PATH, 'tournaments.pkl')

PLAYERS_CSV = os.path.join(DATA_PATH, 'players.csv')
RESULTS_CSV = os.path.join(DATA_PATH, 'results.csv')
TOURNAMENT_CSV = os.path.join(DATA_PATH, 'tournaments.csv')

TRAIN_DATA_CSV = os.path.join(DATA_PATH, 'train_data.csv')
TEST_DATA_CSV = os.path.join(DATA_PATH, 'test_data.csv')
FINAL_TRAIN_DATA_CSV = os.path.join(DATA_PATH, 'final_train_data.csv')
FINAL_TEST_DATA_CSV = os.path.join(DATA_PATH, 'final_test_data.csv')
TRAIN_TOURNAMENTS = os.path.join(DATA_PATH, 'train_tournaments.pkl')
TEST_TOURNAMENTS = os.path.join(DATA_PATH, 'test_tournaments.pkl')

MERGED_TOURNAMENTS_AND_RESULT = os.path.join(DATA_PATH, 'merged_tournaments_and_results.cvs')

## 1. Чтение и фильтрайия данных

In [3]:
if not os.path.exists(PLAYERS_CSV):
    print(f'Reading {PLAYERS_DATA}')
    with open(PLAYERS_DATA, 'rb') as fin:
        players_df = pickle.load(fin)
    players_df = pd.DataFrame(players_df).T
    players_df.to_csv(PLAYERS_CSV, index=False)
else:
    print(f'Reading {PLAYERS_CSV}')
    players_df = pd.read_csv(PLAYERS_CSV)

print(f'players shape: {players_df.shape}')
players_df.head()

Reading data/players.csv
players shape: (204063, 4)


Unnamed: 0,id,name,patronymic,surname
0,1,Алексей,,Абабилов
1,10,Игорь,,Абалов
2,11,Наталья,Юрьевна,Абалымова
3,12,Артур,Евгеньевич,Абальян
4,13,Эрик,Евгеньевич,Абальян


In [4]:
if not os.path.exists(TOURNAMENT_CSV):
    print(f'Reading {TOURNAMENT_DATA}')

    with open(TOURNAMENT_DATA, 'rb') as fin:
        tournament_dict = pickle.load(fin)

    tournament_df = {
        'id': [],
        'name': [],
        'date_start': [],
        'date_end': [],
        'type_id': [],
        'type_name': [],
        'season': [],
        # 'orgcommittee_id': [],
        # 'orgcommittee_name': [],
        # 'orgcommittee_patronymic': [],
        # 'orgcommittee_surname': [],
        'synch_data': [],
        'question_qty': []
    }

    for t_id in tournament_dict:
        tournament_df['id'].append(t_id)
        tournament_df['name'].append(tournament_dict[t_id]['name'])
        tournament_df['date_start'].append(tournament_dict[t_id]['dateStart'])
        tournament_df['date_end'].append(tournament_dict[t_id]['dateEnd'])
        tournament_df['type_id'].append(tournament_dict[t_id]['type']['id'])
        tournament_df['type_name'].append(tournament_dict[t_id]['type']['name'])
        tournament_df['season'].append(tournament_dict[t_id]['season'])
        tournament_df['synch_data'].append(tournament_dict[t_id]['synchData'])
        tournament_df['question_qty'].append(tournament_dict[t_id]['questionQty'])

    tournament_df = pd.DataFrame(tournament_df)
    tournament_df.to_csv(TOURNAMENT_CSV, index=False)
else:
    print(f'Reading {TOURNAMENT_CSV}')

    tournament_df = pd.read_csv(TOURNAMENT_CSV)

tournament_df.drop('synch_data', axis=1, inplace=True)
tournament_df.date_start = tournament_df.date_start.apply(lambda x: x.split('T')[0]).apply(lambda x: datetime.strptime(x, '%Y-%m-%d'))
tournament_df.date_end = tournament_df.date_end.apply(lambda x: x.split('T')[0]).apply(lambda x: datetime.strptime(x, '%Y-%m-%d'))
date_mask = tournament_df.date_start >= datetime(2019, 1, 1)
tournament_df = tournament_df.loc[date_mask, :]

print(f'tournament shape: {tournament_df.shape}')
tournament_df.head().T

Reading data/tournaments.csv
tournament shape: (1109, 8)


Unnamed: 0,3788,3921,4100,4115,4116
id,4628,4772,4957,4973,4974
name,Семь сорок,Синхрон северных стран. Зимний выпуск,Синхрон Биркиркары,Балтийский Берег. 3 игра,Балтийский Берег. 4 игра
date_start,2020-12-30 00:00:00,2019-01-05 00:00:00,2020-02-21 00:00:00,2019-01-25 00:00:00,2019-03-01 00:00:00
date_end,2020-12-30 00:00:00,2019-01-09 00:00:00,2020-02-27 00:00:00,2019-01-29 00:00:00,2019-03-05 00:00:00
type_id,3,3,3,3,3
type_name,Синхрон,Синхрон,Синхрон,Синхрон,Синхрон
season,,/seasons/52,/seasons/53,/seasons/52,/seasons/52
question_qty,"{'1': 12, '2': 12, '3': 12}","{'1': 12, '2': 12, '3': 12}","{'1': 13, '2': 13, '3': 13}","{'1': 12, '2': 12, '3': 12}","{'1': 12, '2': 12, '3': 12}"


In [5]:
if not os.path.exists(RESULTS_CSV):
    print(f'Reading {RESULTS_DATA}')

    with open(RESULTS_DATA, 'rb') as fin:
        results_dict = pickle.load(fin)

    results_df = {
        'id': [],
        'team_id': [],
        'team_name': [],
        'mask': [],
        'current_name': [],
        'questions_total': [],
        'synch_request': [],
        'position': [],
        'flags': [],
        'member_flag': [],
        'member_used_rating': [],
        'member_rating': [],
        'member_id': [],
        'member_name': []
    }

    for r_id in results_dict:
        for game in results_dict[r_id]:
            for member in game['teamMembers']:
                if 'mask' not in game.keys() or game['mask'] == None:
                    continue
                try:
                    results_df['id'].append(r_id)
                    results_df['team_id'].append(game['team']['id'])
                    results_df['team_name'].append(game['team']['name'])
                    results_df['mask'].append(game['mask'])
                    results_df['current_name'].append(game['current']['name'])
                    results_df['questions_total'].append(game['questionsTotal'])
                    results_df['synch_request'].append(game['synchRequest'])
                    results_df['position'].append(game['position'])
                    results_df['flags'].append(tuple(game['flags']))
                    results_df['member_flag'].append(member['flag'])
                    results_df['member_used_rating'].append(member['usedRating'])
                    results_df['member_rating'].append(member['usedRating'])
                    results_df['member_id'].append(member['player']['id'])
                    results_df['member_name'].append(
                        (member['player']['name'], member['player']['patronymic'], member['player']['surname'])
                    )
                except Exception as ex:
                    print(r_id)
                    print(game)
                    raise ex

    results_df = pd.DataFrame(results_df)
    results_df.to_csv(RESULTS_CSV, index=False)
else:
    print(f'Reading {RESULTS_CSV}')

    results_df = pd.read_csv(RESULTS_CSV)

if 'flags' in results_df.columns:
    results_df.drop(['flags'], axis=1, inplace=True)
if 'synch_request' in results_df.columns:
    results_df.drop(['synch_request'], axis=1, inplace=True)
results_df['mask'] = results_df['mask'].apply(
    lambda x: x.replace('X', '0').replace('?', '0')
)

print(f'results shape: {results_df.shape}')
results_df.head().T

Reading data/results.csv
results shape: (2331587, 12)


Unnamed: 0,0,1,2,3,4
id,22,22,22,22,22
team_id,1,1,1,1,1
team_name,Неспроста,Неспроста,Неспроста,Неспроста,Неспроста
mask,0111011101101110001101110011111111110011111100...,0111011101101110001101110011111111110011111100...,0111011101101110001101110011111111110011111100...,0111011101101110001101110011111111110011111100...,0111011101101110001101110011111111110011111100...
current_name,КП - Неспроста,КП - Неспроста,КП - Неспроста,КП - Неспроста,КП - Неспроста
questions_total,67,67,67,67,67
position,1.0,1.0,1.0,1.0,1.0
member_flag,,,,,
member_used_rating,0,0,0,0,0
member_rating,0,0,0,0,0


In [6]:
tournaments_and_results_df = pd.merge(results_df, tournament_df, on='id', how='inner')
tournaments_and_results_df.head().T

Unnamed: 0,0,1,2,3,4
id,4772,4772,4772,4772,4772
team_id,45556,45556,45556,45556,45556
team_name,Рабочее название,Рабочее название,Рабочее название,Рабочее название,Рабочее название
mask,111111111011111110111111111100010010,111111111011111110111111111100010010,111111111011111110111111111100010010,111111111011111110111111111100010010,111111111011111110111111111100010010
current_name,Рабочее название,Рабочее название,Рабочее название,Рабочее название,Рабочее название
questions_total,28,28,28,28,28
position,1.0,1.0,1.0,1.0,1.0
member_flag,Б,Б,Б,К,Б
member_used_rating,13507,10988,8534,6401,4252
member_rating,13507,10988,8534,6401,4252


In [7]:
tournaments_and_results_df['correct_ratio'] = tournaments_and_results_df['mask'].apply(
    lambda x: sum([int(i) for i in x]) / len(x)
)
tmp_df = tournaments_and_results_df.groupby('id')['mask'].agg(lambda x: len(set([len(itm) for itm in x])))
ids_with_different_mask_length_lst = tmp_df[tmp_df != 1].index

for idx in ids_with_different_mask_length_lst:
    tournaments_and_results_df.drop(tournaments_and_results_df[tournaments_and_results_df['id'] == idx].index, axis=0, inplace=True)

tournaments_and_results_df['mask'] = tournaments_and_results_df['mask'].apply(
    lambda x: np.array([int(i) for i in x])
)

quest_level_per_tour = tournaments_and_results_df.groupby('id')['mask', 'member_id'].agg({
    'mask': lambda x: np.sum(x, axis=0),
    'member_id': len
})
quest_level_per_tour['question_levels'] = (1 - quest_level_per_tour['mask'] / quest_level_per_tour['member_id'])
tournaments_and_results_df['question_levels'] = tournaments_and_results_df['id'].apply(
    lambda x: quest_level_per_tour.loc[x, 'question_levels']
)
tournaments_and_results_df['team_count'] = tournaments_and_results_df['id'].apply(
    lambda x: quest_level_per_tour.loc[x, 'member_id']
)
tournaments_and_results_df['position'] = tournaments_and_results_df['position'] / tournaments_and_results_df['team_count']

tournaments_and_results_df.to_csv(MERGED_TOURNAMENTS_AND_RESULT, index=False)
print(tournaments_and_results_df.shape)
tournaments_and_results_df.head().T

  quest_level_per_tour = tournaments_and_results_df.groupby('id')['mask', 'member_id'].agg({


(511351, 22)


Unnamed: 0,0,1,2,3,4
id,4772,4772,4772,4772,4772
team_id,45556,45556,45556,45556,45556
team_name,Рабочее название,Рабочее название,Рабочее название,Рабочее название,Рабочее название
mask,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, ..."
current_name,Рабочее название,Рабочее название,Рабочее название,Рабочее название,Рабочее название
questions_total,28,28,28,28,28
position,0.000829,0.000829,0.000829,0.000829,0.000829
member_flag,Б,Б,Б,К,Б
member_used_rating,13507,10988,8534,6401,4252
member_rating,13507,10988,8534,6401,4252


Далее необходимо разбить данные на train и test части, после чего построить датасеты с повопросными результатами. При построении таблиц столкнулся с проблемой, что из-за большого размера данных при генерации таблиц происходило переполнение оперативной памяти, поэтому принял решение вынести вспомогательные функции для генерации таблиц в `utils.py`. Для генерации таблиц необходимо хапустить этот файл. Далее в ноуьбуке происходит чтение уже сгенерированных файлов с таблицами

In [3]:
train_data_df = pd.read_csv(TRAIN_DATA_CSV)
print(f'Train shape: {train_data_df.shape}')
train_data_df.head()

Train shape: (16247615, 10)


Unnamed: 0,player_id,tourn_question_id,team_id,correct_ratio,question_comp,team_position,team_count,pi,theta,answer
0,6212,4772_0,4772_45556,0.706069,0.116883,0.004329,231,0.166667,0.777778,1
1,6212,4772_1,4772_45556,0.706069,0.220779,0.004329,231,0.166667,0.777778,1
2,6212,4772_2,4772_45556,0.706069,0.554113,0.004329,231,0.166667,0.777778,1
3,6212,4772_3,4772_45556,0.706069,0.480519,0.004329,231,0.166667,0.777778,1
4,6212,4772_4,4772_45556,0.706069,0.121212,0.004329,231,0.166667,0.777778,1


In [4]:
test_data_df = pd.read_csv(TEST_DATA_CSV)
print(f'Test shape: {test_data_df.shape}')
test_data_df.head()

Test shape: (4172362, 10)


Unnamed: 0,player_id,tourn_question_id,team_id,correct_ratio,question_comp,team_position,team_count,pi,theta,answer
0,30152,4957_0,4957_49804,0.743437,0.717391,0.01087,92,0.166667,0.666667,1
1,30152,4957_1,4957_49804,0.743437,0.880435,0.01087,92,0.166667,0.666667,1
2,30152,4957_2,4957_49804,0.743437,0.141304,0.01087,92,0.166667,0.666667,1
3,30152,4957_3,4957_49804,0.743437,0.423913,0.01087,92,0.166667,0.666667,1
4,30152,4957_4,4957_49804,0.743437,0.73913,0.01087,92,0.166667,0.666667,1


In [5]:
train_data_df.nunique()

player_id            56937
tourn_question_id    30932
team_id              78405
correct_ratio        13239
question_comp         6917
team_position         7219
team_count             252
pi                      13
theta                  458
answer                   2
dtype: int64

In [6]:
test_data_df.nunique()

player_id            28566
tourn_question_id     7479
team_id              21121
correct_ratio         5586
question_comp         2500
team_position         2530
team_count             117
pi                      11
theta                  332
answer                   2
dtype: int64

## Baseline

На сгенерированных данных обучаем логистическую регрессию в качестве baseline

In [7]:
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.metrics import roc_auc_score

In [8]:
train_duplicates = train_data_df[['correct_ratio', 'question_comp', 'team_position', 'team_count']].duplicated()
train_data_df.drop(train_data_df.index[train_duplicates], axis=0, inplace=True)
X_train = train_data_df[['correct_ratio', 'question_comp', 'team_position', 'team_count']].values
y_train = train_data_df['answer'].values

test_duplicates = test_data_df[['correct_ratio', 'question_comp', 'team_position', 'team_count']].duplicated()
test_data_df.drop(test_data_df.index[test_duplicates], axis=0, inplace=True)
X_test = test_data_df[['correct_ratio', 'question_comp', 'team_position', 'team_count']].values
y_test = test_data_df['answer'].values

print(f'X_train shape: {X_train.shape}')
print(f'X_test shape: {X_test.shape}')

X_train shape: (11860000, 4)
X_test shape: (2573106, 4)


In [9]:
logreg_model = make_pipeline(
    StandardScaler(),
    LogisticRegression()
)
logreg_model.fit(X_train, y_train)
y_pred = logreg_model.predict(X_test)
y_pred_proba = logreg_model.predict_proba(X_test)
test_score = roc_auc_score(y_test, y_pred)
print(f'Roc-auc on test: {test_score}')

Roc-auc on test: 0.7836122415249057


Видно, что ROC-AUC получился не слишком высоким. Далее попробуем построить модель на основе EM-алгоритма

In [10]:
test_data_df['pred_proba'] = [itm[1] for itm in y_pred_proba]
test_data_df['pred'] = y_pred

In [11]:
del [[X_train, X_test, y_train, y_test, train_data_df, test_data_df, y_pred, y_pred_proba]]
gc.collect()

53

Перед запуском следующей секции важно удалить лишние таблицы и очистить память

## Переход к подсчету силы команды

In [9]:
from utils import get_corr_score


with open(TRAIN_TOURNAMENTS, 'rb') as fin:
    train_tournaments = pickle.load(fin)
with open(TEST_TOURNAMENTS, 'rb') as fin:
    test_tournaments = pickle.load(fin)

In [14]:
spearman_lst, kendall_lst = get_corr_score(test_tournaments, test_data_df)
gc.collect()
print(f'Spearman score: {np.mean(spearman_lst)}')
print(f'Kendall score: {np.mean(kendall_lst)}')

Spearman and Kendall corr ...


100%|██████████| 674/674 [00:01<00:00, 484.58it/s]


Spearman score: 0.6088671799586309
Kendall score: 0.4685550292716687


## EM-algo

Ниже приведен блок с реализацией EM-алгоритма. Код построен на основе кода с лекций

1. E-шаг: 
    $$p(Z_{nk}|X, \theta) = \frac{\pi_{k}p(x_{n}|\theta_{k})}{\sum\limits_{l}\pi_{l}p(x_{n}|\theta_{l})}$$
    $$p(x_{n}|\theta_{k}) = \theta_{k}^{x_{n}}(1 - \theta_{k})^{1 - x_{n}} = ( \theta_{k}x_{n} + (1 - \theta_{k})(1 - x_{n})) \sim Be(x_{n}|\theta_{k})$$ 
2. M-шаг:
    $$\theta_{k} = \frac{\sum\limits_{n}\mathbb{E}[Z_{nk}]x_{n}}{\sum\limits_{n}\mathbb{E}[Z_{nk}]}$$
    $$\pi_{k} = \frac{1}{n}\sum\limits_{n}\mathbb{E}[Z_{nk}]$$

In [7]:
from utils import em_step, get_corr_score


train_data_df = pd.read_csv(TRAIN_DATA_CSV)
print(f'Train shape: {train_data_df.shape}')

test_data_df = pd.read_csv(TEST_DATA_CSV)
print(f'Test shape: {test_data_df.shape}')

Train shape: (16247615, 10)
Test shape: (4172362, 10)


In [32]:
n_iterations = 10

for iter in tqdm(range(n_iterations)):
    train_data_df = train_data_df.groupby(["team_id"]).apply(em_step)
    player_power_agg = dict(train_data_df.groupby(["player_id"])["theta"].mean())
    train_data_df["theta"] = train_data_df["player_id"].apply(lambda x: player_power_agg[x])

train_data_df['pred_proba'] = train_data_df['theta']

100%|██████████| 5/5 [04:32<00:00, 54.52s/it]


Опыты показали, что после 10 запусков получается более-менее правдоподобный результат. Также видно, что корреляции также улучшились

In [38]:
spearman_lst, kendall_lst = get_corr_score(train_tournaments, train_data_df)
gc.collect()
print(f'Spearman score: {np.mean(spearman_lst)}')
print(f'Kendall score: {np.mean(kendall_lst)}')

Spearman and Kendall corr ...


100%|██████████| 674/674 [00:02<00:00, 328.40it/s]


Spearman score: 0.7961119232121043
Kendall score: 0.6324844414881746


## Построение рейтинга

Для построения рейтинг-системы будем использовать расчитанные показатели сложности вопросов. Т.е. рейтинг турнира будем считать как среднее по сложности вопросов на турнире, что кажется разумным, т.к. на турнирах высокого рейтинга должны быть сложные вопросы 

In [34]:
question_raiting = pd.DataFrame(train_data_df.groupby(["tourn_question_id"])["theta"].mean()).reset_index()
question_raiting["tourn_id"] = question_raiting["tourn_question_id"].apply(lambda x: x.split("_")[0])
question_raiting = pd.DataFrame(question_raiting.groupby(["tourn_id"])["theta"].mean())
question_raiting = question_raiting.sort_values(by="theta", ascending=False)

Ниже видно, что на вершине рейтинга `Чемпионат Мира. Финал. Группа А`, а на последнем месте `Чемпионат Таджикистана`, что (при всем уважении к чемпионату Таджикистана) кажется достаточно разумным распределением

In [36]:
train_tournaments[int(question_raiting.index[0])]

{'id': 5948,
 'name': 'Чемпионат Мира. Финал. Группа А',
 'dateStart': '2019-09-08T15:30:00+03:00',
 'dateEnd': '2019-09-08T17:30:00+03:00',
 'type': {'id': 2, 'name': 'Обычный'},
 'season': '/seasons/53',
 'orgcommittee': [{'id': 27247,
   'name': 'Александр',
   'patronymic': 'Аврамович',
   'surname': 'Рубин'},
  {'id': 46968,
   'name': 'Александр',
   'patronymic': 'Давидович',
   'surname': 'Двоскин'},
  {'id': 25882,
   'name': 'Максим',
   'patronymic': 'Оскарович',
   'surname': 'Поташев'},
  {'id': 144,
   'name': 'Сергей',
   'patronymic': 'Леонидович',
   'surname': 'Абрамов'},
  {'id': 11084,
   'name': 'Павел',
   'patronymic': 'Олегович',
   'surname': 'Забавский'},
  {'id': 37142,
   'name': 'Азизбек',
   'patronymic': 'Эльбек-угли',
   'surname': 'Юсуфов'},
  {'id': 31038,
   'name': 'Владимир',
   'patronymic': 'Владимирович',
   'surname': 'Сушков'}],
 'synchData': None,
 'questionQty': {'1': 15, '2': 15}}

In [37]:
train_tournaments[int(question_raiting.index[-1])]

{'id': 5717,
 'name': 'Чемпионат Таджикистана',
 'dateStart': '2019-06-23T14:00:00+03:00',
 'dateEnd': '2019-06-23T18:00:00+03:00',
 'type': {'id': 2, 'name': 'Обычный'},
 'season': '/seasons/52',
 'orgcommittee': [{'id': 37142,
   'name': 'Азизбек',
   'patronymic': 'Эльбек-угли',
   'surname': 'Юсуфов'},
  {'id': 98217,
   'name': 'Манучехр',
   'patronymic': 'Абдумаджидович',
   'surname': 'Салохудинов'}],
 'synchData': None,
 'questionQty': {'1': 15, '2': 15, '3': 15}}