In [86]:
DEATH_COLUMNS = ["match_id", "last_death", "mean_deaths_diff"]
PLAYER_TIME_JOIN_COLUMNS = ["match_id", "times"]
JOIN_COLUMNS = ["match_id", "time"]
PLAYER_TIME_COLUMNS = PLAYER_TIME_JOIN_COLUMNS + ["gold_mean_diff", "lh_mean_diff"]
DEATHS_JOIN_COLUMNS = ["match_id", "last_death"]


In [88]:
import pandas as pd
import numpy as np

# load ids for training and testing
train_match_ids = pd.read_csv("../datasets/model_input/train_match_ids.csv")
test_match_ids = pd.read_csv("../datasets/model_input/test_match_ids.csv")

# training and testing labels
y_train = pd.read_csv("../datasets/model_input/y_train.csv")
y_test = pd.read_csv("../datasets/model_input/y_test.csv")

objectives_df = pd.read_csv("../datasets/processed/objectives.csv")
abilities_df = pd.read_csv("../datasets/processed/ability_upgrades.csv")
deaths_df = (pd.read_csv("../datasets/processed/deaths.csv")[DEATH_COLUMNS]
 .groupby(["match_id", "last_death"])
 .last()
 .reset_index()
)
player_time_df = pd.read_csv("../datasets/processed/player_time.csv")

In [90]:

def transform_to_model_ready_dataset(player_time_df, deaths_df):
   unpivoted_df = (player_time_df[PLAYER_TIME_COLUMNS]
   .merge(
      right=deaths_df,
      left_on=PLAYER_TIME_JOIN_COLUMNS,
      right_on=DEATHS_JOIN_COLUMNS,
      how="left",
      validate='m:1',
   )
   .drop(columns="last_death")
   # front-fill missing values as repetitions
   .fillna(method="ffill")
   #  in the begining the mean difference in deaths would be 0 till fights start
   .fillna(value=0)
   # join with objectives 
   #  .merge(
   #    right=objectives_df,
   #    left_on=PLAYER_TIME_JOIN_COLUMNS,
   #    right_on=JOIN_COLUMNS,
   #    how="left",
   #  )
   # the same procedure for objectives
   #  .fillna(method="ffill")
   #  .fillna(value=0)
   #  .drop(columns=["time"])
   #  .merge(
   #     right=abilities_df,
   #     left_on=PLAYER_TIME_COLUMNS,
   #     right_on=JOIN_COLUMNS,
   #     how="left"
   #  )
   )
   return unpivoted_df

unpivoted_df = transform_to_model_ready_dataset(player_time_df, deaths_df)
unpivoted_df.match_id.nunique()

50000

### pivot the table

In [91]:
X_train = (unpivoted_df
 .assign(times=lambda df: df["times"].astype(str))
 .pivot(index="match_id", columns="times")
 .reset_index()
)
# df.set_index("match_id")
X_train.columns = X_train.columns.map('_'.join).str.strip('_')
X_train

Unnamed: 0,match_id,gold_mean_diff_0,gold_mean_diff_1,gold_mean_diff_10,gold_mean_diff_100,gold_mean_diff_101,gold_mean_diff_102,gold_mean_diff_103,gold_mean_diff_104,gold_mean_diff_105,...,mean_deaths_diff_90,mean_deaths_diff_91,mean_deaths_diff_92,mean_deaths_diff_93,mean_deaths_diff_94,mean_deaths_diff_95,mean_deaths_diff_96,mean_deaths_diff_97,mean_deaths_diff_98,mean_deaths_diff_99
0,0,0.0,-51.4,-85.4,,,,,,,...,,,,,,,,,,
1,1,0.0,-96.0,-316.2,,,,,,,...,,,,,,,,,,
2,2,0.0,54.6,-241.2,,,,,,,...,,,,,,,,,,
3,3,0.0,-97.4,-40.2,,,,,,,...,,,,,,,,,,
4,4,0.0,66.6,828.6,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
49995,49995,0.0,18.8,-42.4,,,,,,,...,,,,,,,,,,
49996,49996,0.0,-23.8,269.6,,,,,,,...,,,,,,,,,,
49997,49997,0.0,23.8,698.6,,,,,,,...,,,,,,,,,,
49998,49998,0.0,-41.0,404.0,,,,,,,...,,,,,,,,,,


### Training a logistic regression

In [119]:
from functools import reduce
from operator import add

def get_features_at(all_features, at: int) -> list[str]:
    columns_at_minute = all_features[X_train.columns.str.endswith(f"_{at}")]
    return columns_at_minute

def get_features_for_window(all_features, at, window=5):
    window = list(range(at - 6, at))
    window_features = np.concatenate([get_features_at(all_features, at=i) for i in window])
    return window_features


current_features = get_features_for_window(X_train.columns, at=30, window=5)
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression()

X_train_filled = X_train.fillna(X_train.mean())
# X_train[list(current_features)].isna().sum(axis=0)



In [116]:
y_train = pd.read_csv("../datasets/raw/match.csv")["radiant_win"]

### TODO:
- reproduce it for each minute with logistic regression on gold and lh
- split for train and test (evaluation) sets
- add a model selection dataset
- include more features
- include more models (first on the entire set with minute as predictor and then as a window)
    - Random Forest
    - Boosting Trees

In [120]:
lr.fit(X=X_train_filled[list(current_features)], y=y_train)
lr.score(X_train_filled[list(current_features)], y=y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


0.80254