In [1]:
import pandas as pd
import numpy as np
import pickle

from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import OneHotEncoder
from scipy.stats import kendalltau, spearmanr

#### 1
Прочитайте и проанализируйте данные, выберите турниры, в которых есть данные о составах команд и повопросных результатах (поле mask в results.pkl).  
Для унификации предлагаю:
- взять в тренировочный набор турниры с dateStart из 2019 года; 
- в тестовый — турниры с dateStart из 2020 года.

In [2]:
tournaments = pickle.load(open('tournaments.pkl', 'rb'))
players = pickle.load(open('players.pkl', 'rb'))
results = pickle.load(open('results.pkl', 'rb'))

In [3]:
df_tournaments = pd.DataFrame(tournaments.values()).set_index("id")
df_players = pd.DataFrame(players.values()).set_index("id")

df_tournaments["year"] = df_tournaments["dateStart"].apply(lambda x: int(x[:4]))

In [4]:
train_tournaments = df_tournaments[df_tournaments["year"] == 2019]
test_tournaments = df_tournaments[df_tournaments["year"] == 2020]

In [5]:
train_tournaments.head()

Unnamed: 0_level_0,name,dateStart,dateEnd,type,season,orgcommittee,synchData,questionQty,year
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
4772,Синхрон северных стран. Зимний выпуск,2019-01-05T19:00:00+03:00,2019-01-09T19:00:00+03:00,"{'id': 3, 'name': 'Синхрон'}",/seasons/52,"[{'id': 28379, 'name': 'Константин', 'patronym...",{'dateRequestsAllowedTo': '2019-01-09T23:59:59...,"{'1': 12, '2': 12, '3': 12}",2019
4973,Балтийский Берег. 3 игра,2019-01-25T19:05:00+03:00,2019-01-29T19:00:00+03:00,"{'id': 3, 'name': 'Синхрон'}",/seasons/52,"[{'id': 23030, 'name': 'Марина', 'patronymic':...",{'dateRequestsAllowedTo': '2019-01-28T23:59:59...,"{'1': 12, '2': 12, '3': 12}",2019
4974,Балтийский Берег. 4 игра,2019-03-01T19:05:00+03:00,2019-03-05T19:00:00+03:00,"{'id': 3, 'name': 'Синхрон'}",/seasons/52,"[{'id': 23030, 'name': 'Марина', 'patronymic':...",{'dateRequestsAllowedTo': '2019-03-04T23:59:59...,"{'1': 12, '2': 12, '3': 12}",2019
4975,Балтийский Берег. 5 игра,2019-04-05T19:05:00+03:00,2019-04-09T19:00:00+03:00,"{'id': 3, 'name': 'Синхрон'}",/seasons/52,"[{'id': 23030, 'name': 'Марина', 'patronymic':...",{'dateRequestsAllowedTo': '2019-04-08T23:59:59...,"{'1': 12, '2': 12, '3': 12}",2019
4986,ОВСЧ. 6 этап,2019-02-15T20:00:00+03:00,2019-02-19T20:00:00+03:00,"{'id': 3, 'name': 'Синхрон'}",/seasons/52,"[{'id': 59140, 'name': 'Борис', 'patronymic': ...",{'dateRequestsAllowedTo': '2019-02-19T23:59:59...,"{'1': 12, '2': 12, '3': 12}",2019


In [6]:
train_tournaments.shape, test_tournaments.shape

((687, 9), (418, 9))

In [84]:
def create_dataset(indxs):
    # index processing
    indx_list = []
    for ind in indxs:
        try:
            if results[ind][0]['team'] and results[ind][0]['mask']:
                indx_list.append(ind)
        except:
            continue
    # create dataset
    result = []
    for ind in indx_list:
        for team in results[ind]:
            if team.get('mask') is not None:
                mask = str(team['mask']).replace('0','X').replace('0', '?')
                team_id = team['team']['id']
                players = team['teamMembers']
                position = team['position']
                for player in players:  
                    player_id = player['player']['id']
                    for q_mum, answer in enumerate(mask): 
                        result.append([ind, team_id, player_id, q_mum, answer, position])
    df = pd.DataFrame(result, columns=['tournament_id', 'team_id', 'player_id', 'question', 'answer', 'position'])
    return df

In [85]:
%%time
train_df = create_dataset(train_tournaments.index)
test_df = create_dataset(test_tournaments.index)

Wall time: 54.1 s


#### 2
Постройте baseline-модель на основе линейной или логистической регрессии, которая будет обучать рейтинг-лист игроков.

In [109]:
%%time
encoder = OneHotEncoder(handle_unknown='ignore')
cat_features = ['question', 'player_id']
X = encoder.fit_transform(train_df[cat_features])
y = train_df['answer']
base_model = LogisticRegression(solver='liblinear', random_state=42)
base_model.fit(X, y)

Wall time: 10min 1s


In [119]:
preds = base_model.coef_[0]
players_ids = np.unique(train_df['player_id'])
players_names = {players[pl]['id']: players[pl]['surname'] + ' ' + players[pl]['name'] for pl in players_ids}
rating = pd.DataFrame({'player_id': players_ids, 'player_name': players_names.values()})
rating['rating'] = preds[:len(players_ids)]
rating.sort_values(by='rating', ascending=False).reset_index(drop=True).head()

Unnamed: 0,player_id,player_name,rating
0,2796,Безбородкин Александр,2.900505
1,2594,Басецкий Олег,2.900505
2,2659,Крюкова Екатерина,2.900505
3,41135,Лозинский Богдан,2.896408
4,2475,Барчук Михаил,2.685836
