In [92]:
import pandas as pd
import numpy as np
from itables import show
from datetime import datetime
from sklearn.model_selection import train_test_split

### Load match results

In [101]:
results_df = pd.read_csv("../datasets/raw/match.csv").assign(
    start_time=lambda df: pd.to_datetime(df["start_time"].apply(datetime.utcfromtimestamp))
)
player_time_df = pd.read_csv("../datasets/raw/player_time.csv")

### Aggregations on player_time

In [102]:
variables = [
    "xp",
    "lh", 
    "gold",
]

def aggr(df: pd.DataFrame, variable: str, agg: callable):
    columns_team_1 = [f"{variable}_t_{i}" for i in range(5)]
    columns_team_2 = [f"{variable}_t_{128 + i}" for i in range(5)]
    return agg(df[columns_team_1]) - agg(df[columns_team_2])


aggregations = [
    ("q1", lambda df: df.quantile(0.25, axis=1)),
    ("q3", lambda df: df.quantile(0.75, axis=1)),
    ("median", lambda df: df.median(axis=1)),
    ("mean", lambda df: df.mean(axis=1)),
]

df = player_time_df.copy()
columns_to_drop = df.columns[df.columns.str.contains("_t_")]

player_time_aggregated_df = (df
 .assign(**{
    f"{var}_{agg}_diff": aggr(df, var, fn) for var in variables for agg, fn in aggregations
 })
 .drop(columns=columns_to_drop)
 .assign(times=lambda df: (df["times"] / 60).astype(int))
)

# write the aggregations to a csv file in processed
player_time_aggregated_df.to_csv("../datasets/processed/player_time.csv", index=False)
show(player_time_aggregated_df)

match_id,times,xp_q1_diff,xp_q3_diff,xp_median_diff,xp_mean_diff,lh_q1_diff,lh_q3_diff,lh_median_diff,lh_mean_diff,gold_q1_diff,gold_q3_diff,gold_median_diff,gold_mean_diff
Loading... (need help?),,,,,,,,,,,,,


In [103]:
ability_upgrades_df = pd.read_csv("../datasets/raw/ability_upgrades.csv").assign(
    time=lambda df: np.floor(df["time"] / 60).astype(int),
)    
# pivot table into a more feature-like format
abilities_df = (ability_upgrades_df
 .assign(player_slot=lambda df: df["player_slot"].astype(str))
 .pivot_table(
    index=["match_id", "time"],
    columns="player_slot",
    values=["ability", "level"],
    aggfunc="last",
 )
)
# join column names and reset index
abilities_df.columns = abilities_df.columns.map('_'.join)
abilities_df = abilities_df.reset_index()

In [104]:
filled_abilities_df = (abilities_df
 .groupby("match_id")
 # front fill with the same value but of course w.r.t. time
 .apply(lambda df: df.set_index("time").fillna(method="ffill"))
 .drop(columns="match_id")
 .reset_index()
 # fill nans with 0 as a first value for starting 
 .fillna(0)
 # turn everything to numeric
 .apply(pd.to_numeric, downcast="integer")
)

# visualize
show(filled_abilities_df)

# turn abilities to categoricals
ability_columns = filled_abilities_df.columns[filled_abilities_df.columns.str.startswith("ability")]
filled_abilities_df[ability_columns] = filled_abilities_df[ability_columns].apply(pd.Categorical)

# write it to a csv
filled_abilities_df.to_csv("../datasets/processed/ability_upgrades.csv", index=False)

match_id,time,ability_0,ability_1,ability_128,ability_129,ability_130,ability_131,ability_132,ability_2,ability_3,ability_4,level_0,level_1,level_128,level_129,level_130,level_131,level_132,level_2,level_3,level_4
Loading... (need help?),,,,,,,,,,,,,,,,,,,,,


### NEXT: Attach deaths and objectives
<!-- pd.read_csv("../datasets/raw/") -->

In [105]:
def to_minute(time_series):
    return np.floor(time_series / 60).astype(int)
teamfights_df = pd.read_csv("../datasets/raw/teamfights.csv").assign(
    start=lambda df: to_minute(df["start"]),
    end=lambda df: to_minute(df["end"]),
)

In [106]:
teamfights_players_df = pd.read_csv("../datasets/raw/teamfights_players.csv")

aggregated_deaths_df = (teamfights_players_df
 .groupby(["match_id", "player_slot"])["deaths"]
 .agg(list)
 .unstack(level=1)
 .explode([0, 1, 2, 3, 4, 128, 129, 130, 131, 132])
 .add_prefix('deaths_')
 .reset_index()
 .rename_axis(None, axis=1)
)

In [107]:
aggregated_deaths_df.index.name = None
deaths_team_1_columns = [f"deaths_{i}" for i in range(5)]
deaths_team_2_columns = [f"deaths_{i}" for i in range(128, 133)]
deaths_columns = deaths_team_1_columns + deaths_team_2_columns

# get mean in deaths
team_1_mean_deaths = aggregated_deaths_df[deaths_team_1_columns].mean(axis=1)
team_2_mean_deaths = aggregated_deaths_df[deaths_team_2_columns].mean(axis=1)

aggregated_deaths_df = aggregated_deaths_df.assign(
    mean_deaths_diff=team_1_mean_deaths - team_2_mean_deaths,
    sum_deaths=lambda df: df[deaths_columns].sum(axis=1),
)

joined_deaths_df = pd.concat(
    objs=[
        aggregated_deaths_df.drop(columns=["match_id"]),
        teamfights_df.drop(columns=["deaths"])
    ],
    axis=1,
).assign(
    start=lambda df: to_minute(df["start"]),
    end=lambda df: to_minute(df["end"]),
    last_death=lambda df: to_minute(df["last_death"]),
)

# write it to a csv
joined_deaths_df.to_csv("../datasets/processed/deaths.csv", index=False)

In [11]:
# Plan for what to do next
# [x] generate total difference in deaths to use as feature for predicting final result
# [x] attach to the other dataframe
# [x] save it
# [ ] when you use it, ffill it so that it can be joined
# [ ] add cumulative deaths
# [x] repeat the process for objectives
# clean and split the code into separate modules for data processing
# [x] split the dataset for train and test 
# generate models
#       1. logistic regression on gold, xp, lh by each minute
#       2. Markov Chain
#       3. add more features to the logistic regression, but remove colinear
#       4. train xgboost, random forest, cnn, rnn
#       6. Compare old models to new one
# add feature importance with shap values
# include death prediction, neural networks, etc. read papers. 

In [113]:
objectives_df = pd.read_csv("../datasets/raw/objectives.csv").assign(
    time=lambda df: to_minute(df["time"]),
)

Сбиваме информацията за това кой върху кого е изпълнил определен ефект (разрушаване на вражеска крепост,
първа атака, тн) от променливите за играч 1, играч 2, стойност на щетата, в една променлива
която е стойността на щетата ако е от отбор А към отбор 2, същата стойност умножена по -1 ако е обратно,
и 0 ако е неутрално действие.

In [114]:
# if player1 < player2
np.array(range(10)) % 9
team_mapping = {i: 2 for i in range(5)} | {i: 1 for i in range(5, 10)}

(objectives_df
 .assign(
    player1=lambda df: df["player1"].replace(team_mapping),
    player2=lambda df: df["player2"].replace(team_mapping),
 )
 .assign(affect=lambda df: df["player1"] - df["player2"])
 .assign(affect=lambda df: df["affect"].replace({3: 1, 2: -1}))
 .assign(value=lambda df: df["value"] * df["affect"])
 .drop(columns=["key", "player1", "player2", "slot", "team", "affect"])
 .to_csv("../datasets/processed/objectives.csv", index=False)
)

### Split the dataset for training and final evaluation

In [100]:
# is_train_set = results_df.start_time.dt.day_of_year <= 320


results_df = results_df.sort_values(by="start_time")

train_match_ids, test_match_ids, y_train, y_test = train_test_split(
    results_df[["match_id"]],
    results_df[["match_id", "radiant_win"]],
    shuffle=False,
    test_size=1000,
)

train_match_ids.to_csv("../datasets/model_input/train_match_ids.csv", index=False)
test_match_ids.to_csv("../datasets/model_input/test_match_ids.csv", index=False)
y_train.to_csv("../datasets/model_input/y_train.csv", index=False)
y_test.to_csv("../datasets/model_input/y_test.csv", index=False)

# train_df = results_df[["match_id", "radiant_win"]][is_train_set]
# test_df = results_df[["match_id", "radiant_win"]][~is_train_set]

# train_df.to_csv("../datasets/model_input/train.csv")
# test_df.to_csv("../datasets/model_input/test.csv")

### How to turn it in a better format for when we split by windows and train

In [None]:
# pivot longer for a splitting and training
df = (player_time_aggregated_df
 .assign(times=lambda df: df["times"].astype(str))
 .pivot(index="match_id", columns="times")
 .reset_index()
)
# df.set_index("match_id")
# df
df.columns = df.columns.map('_'.join).str.strip('_')
df



Unnamed: 0,match_id,lh_mean_diff_0,lh_mean_diff_1,lh_mean_diff_10,lh_mean_diff_100,lh_mean_diff_101,lh_mean_diff_102,lh_mean_diff_103,lh_mean_diff_104,lh_mean_diff_105,...,gold_mean_diff_90,gold_mean_diff_91,gold_mean_diff_92,gold_mean_diff_93,gold_mean_diff_94,gold_mean_diff_95,gold_mean_diff_96,gold_mean_diff_97,gold_mean_diff_98,gold_mean_diff_99
0,0,0.0,-1.6,-7.8,,,,,,,...,,,,,,,,,,
1,1,0.0,-0.4,2.4,,,,,,,...,,,,,,,,,,
2,2,0.0,0.4,1.0,,,,,,,...,,,,,,,,,,
3,3,0.0,-0.2,-4.2,,,,,,,...,,,,,,,,,,
4,4,0.0,0.8,16.2,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
49995,49995,0.0,0.4,1.6,,,,,,,...,,,,,,,,,,
49996,49996,0.0,-0.6,2.4,,,,,,,...,,,,,,,,,,
49997,49997,0.0,0.6,4.8,,,,,,,...,,,,,,,,,,
49998,49998,0.0,0.0,6.4,,,,,,,...,,,,,,,,,,
