In [130]:
from google.colab import drive
import pandas as pd
import numpy as np
import random
from sklearn.preprocessing import LabelEncoder
!pip install catboost
from catboost import CatBoostClassifier

import warnings
warnings.filterwarnings("ignore")

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [131]:
# Mount Google Drive into the Colab filesystem and make the project folder the
# working directory; the CSV files read later use paths relative to it.
drive.mount('/content/drive/')
%cd /content/drive/MyDrive/csgo

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).
/content/drive/MyDrive/csgo


In [132]:
def set_seed(seed=42):
    """Seed Python's ``random`` module and NumPy's global RNG with ``seed``."""
    for seeder in (random.seed, np.random.seed):
        seeder(seed)


# Seed once with the default value when the cell runs.
set_seed()

In [133]:
# Identifier columns: the two team ids, then five player ids for each team.
team_id_cols = ['team1_id', 'team2_id']
players_id_cols = [f'p{player}_id_t{team}' for team in (1, 2) for player in range(1, 6)]
# Categorical feature list handed to the model: team ids first, then player ids.
cat_features = [*team_id_cols, *players_id_cols]

# Column-name fragments used by the merged frame: a per-player prefix and a per-team suffix.
_PLAYER_PREFIXES = ('p1_', 'p2_', 'p3_', 'p4_', 'p5_')
_TEAM_SUFFIXES = ('_t1', '_t2')
# Per-player stats whose square is added as an extra feature (order fixes column order).
_SQUARED_STATS = (
    'headshots',
    'damage_per_round',
    'team_win_percent_after_first_kill',
    'first_kill_in_won_rounds',
    'rounds_with_kills',
)


def _add_team_means(data, member_cols, n_players=5):
    """Add an ``avg_<stem>`` column for every group of columns sharing a stem.

    The stem of a column is its name without the first three characters (the
    ``pN_`` player prefix). Columns with the same stem are summed in the order
    they appear in ``member_cols`` and divided by ``n_players``. ``data`` is
    modified in place; new columns appear in order of first stem occurrence.
    """
    groups = {}
    for col in member_cols:
        groups.setdefault(col[3:], []).append(col)
    for stem, members in groups.items():
        # Sequential addition (not DataFrame.sum) so that NaN propagates and
        # floating-point results match a plain running sum.
        total = 0
        for member in members:
            total = total + data[member]
        data['avg_' + stem] = total / n_players


def preprocess_data(match, players_path='players_feats.csv'):
    """Join matches with per-team player statistics and derive model features.

    Parameters
    ----------
    match : pandas.DataFrame
        Match table containing at least ``team1_id``, ``team2_id`` and ``map_id``.
    players_path : str, optional
        CSV with per-(team, map) player statistics, keyed by ``team_id`` and
        ``map_id``. Defaults to ``'players_feats.csv'``.

    Returns
    -------
    pandas.DataFrame
        Team id columns, then the engineered feature columns, then the player id
        columns. Rows are those produced by the inner merges, re-indexed from 0.
    """
    players = pd.read_csv(players_path)
    team_keys = ['team_id', 'map_id']
    data = pd.merge(match, players, left_on=['team1_id', 'map_id'], right_on=team_keys)
    data = pd.merge(data, players, left_on=['team2_id', 'map_id'], right_on=team_keys,
                    suffixes=_TEAM_SUFFIXES)

    # Take BOTH id blocks from the merged frame. The merges return a freshly
    # indexed frame, so ids sliced from the raw `match` would be re-aligned by
    # index label in the final concat and could end up on the wrong rows (or add
    # NaN rows) whenever a merge drops/duplicates rows or `match` has a
    # non-default index.
    team_id_data = data[team_id_cols]
    players_id_data = data[players_id_cols]

    # Note: kill_death duplicates kills_ratio, so it is removed as well.
    drop_colnames = {'id', 'team1_id', 'team2_id', 'map_name_x', 'map_name_y', 'map_id',
                     'team_id', 'maps_played', 'rounds_played', 'kill_death'}
    # A column is dropped if its full name, or its name without the 3-char
    # prefix and 3-char suffix, is listed above.
    drop_labels = [c for c in data.columns if c in drop_colnames or c[3:-3] in drop_colnames]
    data = data.drop(columns=drop_labels)

    # NOTE(review): the encoder is fitted on each call, so the integer code of a
    # map depends on which maps occur in this particular frame — confirm train
    # and test contain the same set of maps.
    data['map_name'] = LabelEncoder().fit_transform(data['map_name'])

    # Feature group 1: squares of selected player stats plus two hand-crafted ratios/products.
    for suffix in _TEAM_SUFFIXES:
        for prefix in _PLAYER_PREFIXES:
            def stat(name, prefix=prefix, suffix=suffix):
                return data[prefix + name + suffix]

            # NOTE(review): becomes inf/NaN when assists_per_round is 0; left unchanged
            # so existing feature values are preserved.
            data[prefix + 'kills_assists_ratio' + suffix] = (
                stat('kills_per_round') / stat('assists_per_round'))
            data[prefix + 'kd&opening_kill_effective' + suffix] = (
                stat('opening_kill_ratio') * stat('kd_ratio'))
            for name in _SQUARED_STATS:
                data[prefix + name + '_deg2' + suffix] = stat(name) * stat(name)

    # Feature group 2: per-team means of the columns ending in 't1' ...
    _add_team_means(data, [c for c in data.columns if c[-2:] == 't1'])
    # ... and of the columns ending in 't2' (skipping any 'avg*' columns).
    _add_team_means(data, [c for c in data.columns if c[-2:] == 't2' and c[:3] != 'avg'])

    # Feature group 3: ratio of each team-1 mean to the matching team-2 mean.
    avg_t1_cols = [c for c in data.columns if c[:3] == 'avg' and c[-2:] == 't1']
    for col in avg_t1_cols:
        data['ratio_' + col[4:-3]] = data[col] / data[col[:-3] + '_t2']

    return pd.concat([team_id_data, data, players_id_data], axis=1)

# Read the training matches and turn them into the engineered feature frame.
match = pd.read_csv('train.csv')
data = preprocess_data(match)

In [134]:
# Separate the label from the features; `pop` removes the column from `data` in place.
target = data.pop('who_win')
data

Unnamed: 0,team1_id,team2_id,p1_total_kills_t1,p1_headshots_t1,p1_total_deaths_t1,p1_kd_ratio_t1,p1_damage_per_round_t1,p1_grenade_damage_per_round_t1,p1_kills_per_round_t1,p1_assists_per_round_t1,...,p1_id_t1,p2_id_t1,p3_id_t1,p4_id_t1,p5_id_t1,p1_id_t2,p2_id_t2,p3_id_t2,p4_id_t2,p5_id_t2
0,6665,7718,258,36.0,293,0.88,71.1,6.3,0.57,0.20,...,4954,7412,9078,13300,15165,8371,9254,13093,19164,19509
1,4411,10577,178,39.3,208,0.86,64.1,6.5,0.56,0.14,...,8611,9278,9766,13666,14218,8488,11199,12810,15821,19069
2,11251,9455,494,52.8,397,1.24,94.0,5.8,0.83,0.16,...,7938,8574,8575,15835,19187,557,2023,8564,8568,12822
3,4608,7532,474,29.1,304,1.56,86.9,2.4,0.89,0.09,...,7998,8918,12731,16947,18987,5794,5796,7266,14932,19899
4,8637,6667,217,55.8,248,0.88,64.8,4.6,0.56,0.16,...,2898,7499,15370,20110,20304,429,4679,8183,10394,18053
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
708,6667,4773,320,47.8,396,0.81,71.2,11.4,0.57,0.15,...,429,8183,10394,11816,18053,6593,9482,16817,18141,18743
709,9215,5995,203,50.2,131,1.55,82.1,3.4,0.82,0.08,...,11154,14390,14394,17861,20987,284,3741,3972,9816,19230
710,4869,9565,447,44.3,538,0.83,65.2,5.7,0.55,0.13,...,922,11777,16080,16848,18221,7322,7398,9032,11893,14176
711,10426,4991,499,43.1,472,1.06,73.8,2.7,0.69,0.11,...,973,9574,14684,15428,15940,7528,8184,11271,13666,18462


In [135]:
# Classifier configuration: AUC is the evaluation metric, progress is logged every
# 50 iterations, and the overfitting detector stops after 100 iterations without improvement.
cbc = CatBoostClassifier(
    task_type='GPU',
    eval_metric='AUC',
    metric_period=50,
    random_state=42,
    cat_features=cat_features,
    od_type="Iter",
    od_wait=100,
)

# Search space (a single point per hyper-parameter).
param_grid = dict(
    iterations=[1000],
    learning_rate=[7e-2],
    depth=[4],
    l2_leaf_reg=[5],
    random_strength=[1000],
    border_count=[254],
    bootstrap_type=['Poisson'],
)

# best_cols: the columns judged to have the strongest influence on the model's output —
# every team-vs-team ratio feature, the categorical id columns, and the encoded map.
best_cols = [
    *(col for col in data.columns if col.startswith('ratio')),
    *cat_features,
    'map_name',
]
res = cbc.grid_search(
    param_grid,
    data[best_cols],
    target,
    refit=True,
    cv=5,
    partition_random_seed=42,
)

0:	test: 0.6358995	best: 0.6358995 (0)	total: 44.8ms	remaining: 44.8s
50:	test: 0.7162215	best: 0.7162215 (50)	total: 1.83s	remaining: 34.1s
100:	test: 0.7295758	best: 0.7319325 (97)	total: 3.4s	remaining: 30.3s
150:	test: 0.7299686	best: 0.7319325 (97)	total: 4.86s	remaining: 27.3s
200:	test: 0.7291830	best: 0.7358602 (195)	total: 6.26s	remaining: 24.9s
250:	test: 0.7407699	best: 0.7460723 (239)	total: 9.43s	remaining: 28.1s
300:	test: 0.7285939	best: 0.7460723 (239)	total: 13.1s	remaining: 30.3s
bestTest = 0.7460722923
bestIteration = 239
Metric AUC is not calculated on train by default. To calculate this metric on train, add hints=skip_train~false to metric parameters.
0:	loss: 0.7460723	best: 0.7460723 (0)	total: 16.5s	remaining: 0us
Estimating final quality...




Training on fold [0/5]
0:	test: 0.5869270	best: 0.5869270 (0)	total: 48.2ms	remaining: 48.1s
50:	test: 0.6243201	best: 0.6427739 (19)	total: 1.43s	remaining: 26.6s
100:	test: 0.6247087	best: 0.6427739 (19)	total: 2.77s	remaining: 24.7s
bestTest = 0.6427738667
bestIteration = 19
Training on fold [1/5]
0:	test: 0.6309524	best: 0.6309524 (0)	total: 28ms	remaining: 28s
50:	test: 0.6354782	best: 0.6685360 (12)	total: 1.42s	remaining: 26.4s
100:	test: 0.6375443	best: 0.6685360 (12)	total: 2.77s	remaining: 24.6s
bestTest = 0.6685360074
bestIteration = 12
Training on fold [2/5]
0:	test: 0.6698302	best: 0.6698302 (0)	total: 27.7ms	remaining: 27.6s
50:	test: 0.7794206	best: 0.7814186 (43)	total: 1.42s	remaining: 26.4s
100:	test: 0.7788212	best: 0.7870130 (74)	total: 2.77s	remaining: 24.6s
150:	test: 0.7756244	best: 0.7870130 (74)	total: 4.13s	remaining: 23.2s
bestTest = 0.7870129943
bestIteration = 74
Training on fold [3/5]
0:	test: 0.5783217	best: 0.5783217 (0)	total: 27.7ms	remaining: 27.7s
50

In [136]:
# Print the parameter set reported by the grid search, then display the final
# entry of the cross-validation 'test-AUC-mean' series as the cell output.
print(res['params'])
res['cv_results']['test-AUC-mean'][-1]

{'border_count': 254, 'random_strength': 1000, 'depth': 4, 'l2_leaf_reg': 5, 'iterations': 1000, 'learning_rate': 0.07, 'bootstrap_type': 'Poisson'}


0.6990440011024475

In [137]:
# Load the test matches, keep their 'index' identifiers, and build the feature frame.
match = pd.read_csv('test.csv')
tmp = match['index']
data = preprocess_data(match)

# Class probabilities for the selected columns, keyed by the match identifier.
# NOTE(review): `tmp` is attached by row label, which assumes preprocess_data returns
# exactly one row per input match, in the original order — confirm.
pred = (
    pd.DataFrame(cbc.predict_proba(data[best_cols]))
    .assign(**{'index': tmp})
    .set_index('index')
)
# Write only the second probability column (position 1) to the submission file.
pred.iloc[:, 1].to_csv('result.csv')