In [4]:
import pandas as pd
import numpy as np
from itables import show

### Load match results

In [5]:
results_df = pd.read_csv("../datasets/raw/match.csv")[["match_id", "radiant_win"]]

account_id,total_wins,total_matches,trueskill_mu,trueskill_sigma
Loading... (need help?),,,,


In [14]:
player_time_df = pd.read_csv("../datasets/raw/player_time.csv")

### Aggregations on player_time

In [89]:
variables = [
    "xp",
    "lh", 
    "gold",
]

def aggr(df: pd.DataFrame, variable: str, agg: callable):
    columns_team_1 = [f"{variable}_t_{i}" for i in range(5)]
    columns_team_2 = [f"{variable}_t_{128 + i}" for i in range(5)]
    return agg(df[columns_team_1]) - agg(df[columns_team_2])


aggregations = [
    ("q1", lambda df: df.quantile(0.25, axis=1)),
    ("q3", lambda df: df.quantile(0.75, axis=1)),
    ("median", lambda df: df.median(axis=1)),
    ("mean", lambda df: df.mean(axis=1)),
]

df = player_time_df.copy()
columns_to_drop = df.columns[df.columns.str.contains("_t_")]

player_time_aggregated_df = (df
 .assign(**{
    f"{var}_{agg}_diff": aggr(df, var, fn) for var in variables for agg, fn in aggregations
 })
 .drop(columns=columns_to_drop)
 .assign(times=lambda df: (df["times"] / 60).astype(int))
)

# write the aggregations to a csv file in processed
player_time_aggregated_df.to_csv("../datasets/processed/player_time.csv")
show(player_time_aggregated_df)

match_id,times,xp_q1_diff,xp_q3_diff,xp_median_diff,xp_mean_diff,lh_q1_diff,lh_q3_diff,lh_median_diff,lh_mean_diff,gold_q1_diff,gold_q3_diff,gold_median_diff,gold_mean_diff
Loading... (need help?),,,,,,,,,,,,,


In [27]:
ability_upgrades_df = pd.read_csv("../datasets/raw/ability_upgrades.csv").assign(
    time=lambda df: np.floor(df["time"] / 60).astype(int),
)    

In [80]:
# pivot table into a more feature-like format
abilities_df = (ability_upgrades_df
 .assign(player_slot=lambda df: df["player_slot"].astype(str))
 .pivot_table(
    index=["match_id", "time"],
    columns="player_slot",
    values=["ability", "level"],
    aggfunc="last",
 )
)
# join column names and reset index
abilities_df.columns = abilities_df.columns.map('_'.join)
abilities_df = abilities_df.reset_index()

In [83]:
 
filled_abilities_df = (abilities_df
 .groupby("match_id")
 # front fill with the same value but of course w.r.t. time
 .apply(lambda df: df.set_index("time").fillna(method="ffill"))
 .drop(columns="match_id")
 .reset_index()
 # fill nans with 0 as a first value for starting 
 .fillna(0)
 # turn everything to numeric
 .apply(pd.to_numeric, downcast="integer")
)

# visualize
show(filled_abilities_df)

# turn abilities to categoricals
ability_columns = filled_abilities_df.columns[filled_abilities_df.columns.str.startswith("ability")]
filled_abilities_df[ability_columns] = filled_abilities_df[ability_columns].apply(pd.Categorical)

# write it to a csv
filled_abilities_df.to_csv("../datasets/processed/ability_upgrades.csv")

### NEXT: Attach deaths and objectives
<!-- pd.read_csv("../datasets/raw/") -->

In [102]:
def to_minute(time_series):
    return np.floor(time_series / 60).astype(int)
teamfights_df = pd.read_csv("../datasets/raw/teamfights.csv").assign(
    start=lambda df: to_minute(df["start"]),
    end=lambda df: to_minute(df["end"]),
)
teamfights_df

Unnamed: 0,match_id,start,end,last_death,deaths
0,0,3,4,237,3
1,0,7,7,460,3
2,0,15,15,921,3
3,0,21,22,1313,3
4,0,26,27,1651,5
...,...,...,...,...,...
539042,49999,21,22,1308,3
539043,49999,23,24,1429,3
539044,49999,32,33,1969,4
539045,49999,38,39,2364,7


In [113]:
teamfights_players_df = pd.read_csv("../datasets/raw/teamfights_players.csv")

In [124]:

# teamfights_players_df[["match_id", "player_slot", "deaths"]]
teamfights_players_df["index"] = list(range(1, 11)) * teamfights_df.shape[0]

True

In [109]:
(teamfights_players_df[["match_id", "player_slot", "deaths", "damage"]]
 .pivot(
    index=["match_id"],
    columns="player_slot",
    values=["damage", "deaths"],
 ))

ValueError: Index contains duplicate entries, cannot reshape

### How to turn it in a better format for when we split by windows and train

In [None]:
# pivot longer for a splitting and training
df = (player_time_aggregated_df
 .assign(times=lambda df: df["times"].astype(str))
 .pivot(index="match_id", columns="times")
 .reset_index()
)
# df.set_index("match_id")
# df
df.columns = df.columns.map('_'.join).str.strip('_')
df



Unnamed: 0,match_id,lh_mean_diff_0,lh_mean_diff_1,lh_mean_diff_10,lh_mean_diff_100,lh_mean_diff_101,lh_mean_diff_102,lh_mean_diff_103,lh_mean_diff_104,lh_mean_diff_105,...,gold_mean_diff_90,gold_mean_diff_91,gold_mean_diff_92,gold_mean_diff_93,gold_mean_diff_94,gold_mean_diff_95,gold_mean_diff_96,gold_mean_diff_97,gold_mean_diff_98,gold_mean_diff_99
0,0,0.0,-1.6,-7.8,,,,,,,...,,,,,,,,,,
1,1,0.0,-0.4,2.4,,,,,,,...,,,,,,,,,,
2,2,0.0,0.4,1.0,,,,,,,...,,,,,,,,,,
3,3,0.0,-0.2,-4.2,,,,,,,...,,,,,,,,,,
4,4,0.0,0.8,16.2,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
49995,49995,0.0,0.4,1.6,,,,,,,...,,,,,,,,,,
49996,49996,0.0,-0.6,2.4,,,,,,,...,,,,,,,,,,
49997,49997,0.0,0.6,4.8,,,,,,,...,,,,,,,,,,
49998,49998,0.0,0.0,6.4,,,,,,,...,,,,,,,,,,
