# INITIALIZE

In [None]:
import pandas as pd
import numpy as np
import os
import sys

%load_ext lab_black
pd.set_option("display.max_rows", 60)

In [None]:
IS_LOCAL = True
SEED = 14
GOOGLE_DRIVE_PATH = "./drive/MyDrive/kaggle/Dota 2 prediction/"

# Local runs read the folders next to the notebook; otherwise the same
# relative folders are looked up under the Google Drive path.
_PATH_PREFIX = "" if IS_LOCAL else GOOGLE_DRIVE_PATH
PATH_TO_DATA = _PATH_PREFIX + "./data/"
PATH_TO_ADD_DATA = _PATH_PREFIX + "./add_data/"

# LOAD DATA

In [None]:
import os

# Prefer the faster ujson parser when it is installed; the standard json
# module offers the same load/loads calls used in this notebook.
try:
    import ujson as json
except ModuleNotFoundError:
    import json

    print("Please install ujson to read JSON objects faster")

# read_matches() calls notebook.tqdm(iterable, total=...), so the fallback
# must expose an object with that same attribute and call signature.
try:
    from tqdm import notebook
except ModuleNotFoundError:

    class _NotebookFallback:
        """Stand-in for tqdm.notebook whose tqdm() returns the iterable unchanged."""

        @staticmethod
        def tqdm(iterable, *args, **kwargs):
            return iterable

    notebook = _NotebookFallback()
    # Old fallback name, kept so code that referenced it keeps working.
    tqdm_notebook = notebook.tqdm
    print("Please install tqdm to track progress with Python loops")


def read_matches(matches_file):
    """Lazily yield one parsed JSON document per line of ``matches_file``.

    The progress-bar total is looked up by file name for the two file names
    listed below; any other file name gets ``total=None``.
    """
    known_sizes = {
        "test_matches.jsonl": 10000,
        "train_matches.jsonl": 39675,
    }
    expected_lines = known_sizes.get(os.path.basename(matches_file))

    with open(matches_file) as lines:
        progress = notebook.tqdm(lines, total=expected_lines)
        for raw_match in progress:
            yield json.loads(raw_match)

In [None]:
# Role names looked up in each hero's "roles" entry by load_heroes();
# every role becomes one lower-cased 0/1 column.
ROLES = {
    0: "Durable",
    1: "Carry",
    2: "Disabler",
    3: "Support",
    4: "Pusher",
    5: "Nuker",
    6: "Escape",
    7: "Initiator",
    8: "Jungler",
}

# Primary attribute codes; load_heroes() only iterates the keys to build 0/1 flags.
PRIMARY_ATTR = {"agi": 0, "int": 1, "str": 2}

# Attack type codes; load_heroes() only iterates the keys (lower-cased as column names).
ATTACK_TYPE = {"Melee": 0, "Ranged": 1}


def load_heroes():
    """Read ``heroes.json`` and return one flat feature dict per hero.

    Each dict holds the hero ``id``, its ``name`` with the first 14
    characters removed (presumably a common prefix -- verify against the
    file), and 0/1 flags for every primary attribute, attack type and role.
    """
    with open(PATH_TO_ADD_DATA + "heroes.json") as f:
        heroes_data = json.load(f)

    heroes = []
    for raw in heroes_data.values():
        hero = {"id": raw["id"], "name": raw["name"][14:]}
        hero.update({attr: int(attr in raw["primary_attr"]) for attr in PRIMARY_ATTR})
        hero.update(
            {kind.lower(): int(kind in raw["attack_type"]) for kind in ATTACK_TYPE}
        )
        hero.update(
            {role.lower(): int(role in raw["roles"]) for role in ROLES.values()}
        )
        heroes.append(hero)
    return heroes


heroes = load_heroes()

In [None]:
SIDES = ["radiant", "dire"]
# Barracks keys as matched against CHAT_MESSAGE_BARRACKS_KILL objectives,
# grouped by the side that owns them and by lane.
BARRACKS = {
    "radiant": {
        "all": ["1", "2", "4", "8", "16", "32"],
        "top": ["1", "2"],
        "middle": ["4", "8"],
        "bottom": ["16", "32"],
    },
    "dire": {
        "all": ["64", "128", "256", "512", "1024", "2048"],
        "top": ["64", "128"],
        "middle": ["256", "512"],
        "bottom": ["1024", "2048"],
    },
}


def process_objectives(objectives):
    """Summarise barracks kills of one match into flat per-side features.

    Parameters
    ----------
    objectives : iterable of dict (falsy entries are skipped)
        Only entries whose ``"type"`` is ``"CHAT_MESSAGE_BARRACKS_KILL"`` are
        used; their ``"key"`` is matched against ``BARRACKS``.

    Returns
    -------
    dict
        ``{r,d}_destroyed_barracks`` (count of enemy barracks destroyed by
        that side), ``{r,d}_destroyed_{top,middle,bottom}`` (1 when both
        enemy barracks of the lane are gone) and ``{r,d}_supercreeps``
        (1 when all three lanes are gone).

    An unknown key is reported with a printed message and otherwise ignored.
    """
    # destroyed_barracks[side] lists the enemy barracks that `side` destroyed.
    destroyed_barracks = {"radiant": [], "dire": []}
    for objective in objectives:
        if not objective or objective.get("type") != "CHAT_MESSAGE_BARRACKS_KILL":
            continue
        key = objective["key"]
        if key in BARRACKS["radiant"]["all"]:
            destroyed_barracks["dire"].append(key)
        elif key in BARRACKS["dire"]["all"]:
            destroyed_barracks["radiant"].append(key)
        else:
            # The original referenced an undefined name here and crashed
            # instead of reporting the unexpected key.
            print('Error: undefined key "%s"in counting of barracks' % key)

    extended_destroyed_barracks = {
        "r_destroyed_barracks": len(destroyed_barracks["radiant"]),
        "d_destroyed_barracks": len(destroyed_barracks["dire"]),
    }
    for side, opposite_side in (("radiant", "dire"), ("dire", "radiant")):
        side_dict = {}
        for position in ["top", "middle", "bottom"]:
            # A lane counts as destroyed only when every barrack of the
            # opposite side in that lane was destroyed by `side`.
            side_dict[f"{side[0]}_destroyed_{position}"] = (
                np.all(
                    [
                        barrack in destroyed_barracks[side]
                        for barrack in BARRACKS[opposite_side][position]
                    ]
                )
                * 1
            )
        side_dict[f"{side[0]}_supercreeps"] = np.all(list(side_dict.values())) * 1
        extended_destroyed_barracks.update(side_dict)
    return extended_destroyed_barracks

In [None]:
def process_teamfights(teamfights):
    """Aggregate teamfight statistics of one match into a flat feature dict.

    For every statistic in ``gold_delta``, ``xp_delta``, ``damage`` and
    ``buybacks`` the values are summed over all teamfights per player slot
    (keys ``r1_<stat>`` .. ``r5_<stat>``, ``d1_<stat>`` .. ``d5_<stat>``) and
    per team (``r_pt_<stat>``, ``d_pt_<stat>``). ``was_teamfight`` is 1 when
    at least one teamfight was given, else 0.

    Assumes every teamfight has a ``"players"`` list of 10 entries, slots
    0-4 being radiant and 5-9 dire. With no teamfights every feature is 0.
    The same keys are produced, in the same order, in both cases.
    """
    feats_to_process = ["gold_delta", "xp_delta", "damage", "buybacks"]

    if teamfights:
        # Element-wise sum over teamfights: one total per player slot.
        totals = {
            col: np.sum(
                [[pl[col] for pl in teamfight["players"]] for teamfight in teamfights],
                axis=0,
            )
            for col in feats_to_process
        }
    else:
        totals = {col: [0] * 10 for col in feats_to_process}

    to_row = {}
    for side_prefix, idxs in (("r", range(5)), ("d", range(5, 10))):
        for col in feats_to_process:
            per_player = {
                f"{side_prefix}{idx % 5 + 1}_{col}": totals[col][idx] for idx in idxs
            }
            # The team total is added after the five player values so it is
            # not part of its own sum.
            per_player[f"{side_prefix}_pt_{col}"] = sum(per_player.values())
            to_row.update(per_player)

    to_row["was_teamfight"] = 1 if teamfights else 0
    return to_row

In [None]:
# Per-player fields that process_players() copies verbatim into the feature row.
PLAYER_COLS = [
    "pred_vict",
    "gold",
    "lh",
    "xp",
    "x",
    "y",
    "max_health",
    "max_mana",
    "kills",
    "deaths",
    "assists",
    "denies",
    "nearby_creep_death_count",
    "roshans_killed",
    "hero_id",
    "account_id_hash",
    "obs_placed",
    "sen_placed",
    "creeps_stacked",
    "camps_stacked",
    "rune_pickups",
    "firstblood_claimed",
    "teamfight_participation",
    "towers_killed",
    "stuns",
]
# Time-series / log field names; not referenced by the code visible in this notebook section.
TIMES_COLS = [
    "times",
    "gold_t",
    "lh_t",
    "dn_t",
    "xp_t",
    "obs_log",
    "sen_log",
    "obs_left_log",
    "sen_left_log",
    "buyback_log",
]
# Dict-valued per-player fields; process_players() stores the sum of each dict's values.
COUNT_ = [
    "purchase",
    "actions",
    "gold_reasons",
    "xp_reasons",
    "item_uses",
    "hero_hits",
    "kill_streaks",
    "multi_kills",
    "ability_uses",
    "healing",
]

# Items that get a 0/1 "<prefix>item_<name>" flag per player in process_players().
POWERFUL_ITEMS = ["rapier", "aegis", "moon_shard", "gem", "cheese"]

# List-valued fields for which process_players() keeps only the difference
# between the last two entries (0 when fewer than two entries exist).
TIME_COLS = ["gold_t", "lh_t", "xp_t"]

In [None]:
def process_players(players):
    """Flatten the per-player records of one match into a single dict.

    Players at positions 0-4 get the prefixes ``r1_`` .. ``r5_``; later
    positions get ``d1_``, ``d2_``, ... For each player the dict receives the
    raw ``PLAYER_COLS`` values, the summed ``COUNT_`` dicts, the last-step
    change of every ``TIME_COLS`` series and a 0/1 flag per ``POWERFUL_ITEMS``
    entry found in the inventory.
    """
    players_info = {}
    for position, player in enumerate(players):
        if position < 5:
            prefix = f"r{position + 1}_"  # radiant
        else:
            prefix = f"d{position - 4}_"  # dire

        players_info.update({prefix + col: player[col] for col in PLAYER_COLS})
        players_info.update(
            {prefix + col: sum(player[col].values()) for col in COUNT_}
        )

        # Inventory ids with their first five characters removed, so they can
        # be compared with the bare names in POWERFUL_ITEMS.
        owned_items = [entry["id"][5:] for entry in player["hero_inventory"]]

        for col in TIME_COLS:
            series = player[col]
            players_info[prefix + col] = (
                series[-1] - series[-2] if len(series) > 1 else 0
            )

        for item in POWERFUL_ITEMS:
            players_info[prefix + "item_" + item] = int(item in owned_items)

    return players_info

In [None]:
def load_data_jsonl(name, start=0, end=-1, save=False, infunc=None):
    """Build the hand-crafted feature table from ``<name>_matches.jsonl``.

    Parameters
    ----------
    name : str
        Data set name; ``"train"`` additionally adds the ``target_*`` columns.
    start : int
        Number of leading matches to skip. The original loop never skipped
        anything, so ``start`` had no effect; the default 0 behaves as before.
    end : int
        Stop before the match at this 0-based position, i.e. matches
        ``[start, end)`` are processed. A negative value (the default) means
        "until the end of the file".
    save : bool
        When true, write the table to ``<PATH_TO_DATA>/<name>_hands_features.csv``.
    infunc
        Unused; kept so existing calls keep working.

    Returns
    -------
    (pandas.DataFrame, dict or None)
        The feature table and the last processed raw match (``None`` when no
        match was processed).
    """
    from itertools import islice

    stop = end if end is not None and end >= 0 else None
    matches = read_matches(os.path.join(PATH_TO_DATA, "%s_matches.jsonl" % name))

    rows = []
    match = None
    for match in islice(matches, max(start, 0), stop):
        # GENERAL INFO ABOUT MATCH
        row = {
            "match_id_hash": match["match_id_hash"],
            "game_time": match["game_time"],
            "game_mode_id": match["game_mode"],
            "lobby_type_id": match["lobby_type"],
        }

        # OBJECTIVES
        row.update(process_objectives(match["objectives"]))

        # TEAMFIGHTS
        row.update(process_teamfights(match["teamfights"]))

        # PLAYERS
        row.update(process_players(match["players"]))

        # TARGETS
        if name == "train":
            row.update({"target_" + k: v for k, v in match["targets"].items()})

        rows.append(row)

    df = pd.DataFrame(rows)
    if save:
        # Written next to the input data so the later read from PATH_TO_DATA
        # finds it in non-local runs too.
        out_path = os.path.join(PATH_TO_DATA, f"{name}_hands_features.csv")
        try:
            # The default integer index is written on purpose: the loading
            # cell drops the resulting "Unnamed: 0" column.
            df.to_csv(out_path, index=True)
        except OSError as exc:
            # Saving stays best-effort, but the reason is no longer hidden.
            print("Error: saving failed! (%s)" % exc)
    return df, match

In [None]:
# Each call returns a (feature DataFrame, last raw match) tuple and, because
# of save=True, also writes the feature CSV.
test_ = load_data_jsonl(name="test", save=True)
train_ = load_data_jsonl(name="train", save=True)

# CREATE DF

In [None]:
def intel_delete_column_rd(df, cols, output=False, get_pcols=False):
    """Drop columns described by generalized feature patterns.

    Pattern prefixes:
      * ``FEP_<x>`` -> the ten per-player columns ``r1_<x>``, ``d1_<x>``, ...
      * ``FET_<x>`` -> the two per-team columns ``r_<x>`` and ``d_<x>``
      * ``FEM_<x>`` -> the single column ``<x>``
    Patterns with any other prefix are ignored.

    With ``get_pcols=True`` only the expanded column names are returned.
    Otherwise a copy of ``df`` without the expanded columns that exist is
    returned; with ``output=True`` every missing column is reported.
    """
    if type(cols) is str:
        cols = [cols]

    pcols = []
    for pattern in cols:
        marker, base = pattern[:4], pattern[4:]
        if marker == "FEP_":
            for n in "12345":
                for s in ("r", "d"):
                    pcols.append(f"{s}{n}_{base}")
        elif marker == "FET_":
            pcols.extend(f"{s}_{base}" for s in ("r", "d"))
        elif marker == "FEM_":
            pcols.append(base)

    if get_pcols:
        return pcols

    present = []
    for name in pcols:
        if name in df.columns:
            present.append(name)
        elif output:
            print('there is no "%s" column!' % name)

    return df.drop(present, axis=1)

In [None]:
# Both feature files are read the same way: indexed by match id, with the
# leftover "Unnamed: 0" column removed.
df_train, df_test = (
    pd.read_csv(os.path.join(PATH_TO_DATA, csv_name), index_col="match_id_hash").drop(
        "Unnamed: 0", axis=1
    )
    for csv_name in ("train_hands_features.csv", "test_hands_features.csv")
)

In [None]:
# Columns whose second character is a letter are neither per-player
# ("r1_...") nor per-team ("r_...") ones.
info_cols = [col for col in df_train.columns if col[1].isalpha()]
match_info = [col for col in info_cols if not col.startswith("target")]
target_info = [col for col in info_cols if col.startswith("target")]

# Everything else, in alphabetical order.
players_info = sorted(set(df_train.columns) - set(info_cols))


def _is_player_col(col, side):
    """True for names such as 'r3_gold': side letter followed by a digit."""
    return col[0] == side and col[1].isnumeric()


radiant_info = [col for col in players_info if _is_player_col(col, "r")]
dire_info = [col for col in players_info if _is_player_col(col, "d")]

radiant_team_info = [col for col in players_info if col[:2] == "r_"]
dire_team_info = [col for col in players_info if col[:2] == "d_"]

# Columns not captured by any group above.
other_info = set(df_train.columns).difference(
    info_cols,
    match_info,
    target_info,
    radiant_info,
    dire_info,
    radiant_team_info,
    dire_team_info,
)

In [None]:
# Mirrored training set: the dire columns are selected first and then
# relabelled with the radiant names (and vice versa), so the two teams swap.
# Assumes the sorted radiant and dire column lists line up one-to-one.
# The selection is copied explicitly so the writes below modify an independent
# frame; the original's earlier full copy of df_train was discarded unused.
uv = df_train[
    pd.Index(
        match_info
        + dire_info
        + dire_team_info
        + radiant_info
        + radiant_team_info
        + target_info
    )
].copy()
uv.columns = pd.Index(
    match_info
    + radiant_info
    + radiant_team_info
    + dire_info
    + dire_team_info
    + target_info
)
# Coordinate columns are the short names ending in x or y (e.g. "r1_x").
xs_ys = [col for col in uv.columns if col[-1] in ("x", "y") and len(col) <= 4]
# Reflect the positions to match the swapped sides.
uv.loc[:, xs_ys] = 256 - uv.loc[:, xs_ys]
# Swapped teams mean the opposite winner.
uv["target_radiant_win"] = ~uv["target_radiant_win"]

In [None]:
# Mirrored rows get ids whose first seven characters are replaced by
# "mirror_", plus a flag column marking them.
mirrored_ids = [f"mirror_{id_[7:]}" for id_ in uv.index]
uv.index = pd.Index(mirrored_ids)

uv["is_mirror"] = 1

In [None]:
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0;
# rows are stacked in the same order: train, test, then the mirrored rows.
tdf = pd.concat([df_train, df_test])
tdf["is_mirror"] = 0

df = pd.concat([tdf, uv])

In [None]:
# Order rows by the target column.
df = df.sort_values("target_radiant_win")

In [None]:
# Replace the lobby type id by two 0/1 flags (id 7 -> ranked, id 0 -> public).
lobby_type = df["lobby_type_id"]
df.insert(2, "is_ranked_lobby", (lobby_type == 7).astype(np.uint8))
df.insert(2, "is_public_lobby", (lobby_type == 0).astype(np.uint8))
df = df.drop(columns="lobby_type_id")

In [None]:
def get_feats(
    cols,
    prefixes="",
    seps="_",
    suffixes="",
    feats_type="fep",
    sides=["r", "d"],
    together=True,
):
    """Generate feature column names for both sides.

    Names follow ``{prefix}{side}{player}{sep}{col}{suffix}``. With
    ``feats_type="fep"`` (for each player) the player part runs over 1..5;
    any other value (e.g. ``"fet"``, for each team) leaves it empty. Every
    argument except ``feats_type`` and ``together`` may be a single value or
    a list.

    Returns a numpy array of names: flat when ``together`` is true, otherwise
    one row per side.
    """

    def as_list(value):
        return value if type(value) is list else [value]

    cols = as_list(cols)
    prefixes = as_list(prefixes)
    seps = as_list(seps)
    suffixes = as_list(suffixes)
    sides = as_list(sides)

    if feats_type == "fep":
        players_num = range(1, 6)
    else:
        players_num = [""]

    per_side = []
    for side in sides:
        names = []
        for col in cols:
            for prefix in prefixes:
                for suffix in suffixes:
                    for sep in seps:
                        for player_num in players_num:
                            names.append(
                                f"{prefix}{side}{player_num}{sep}{col}{suffix}"
                            )
        per_side.append(names)

    prepared_feats = np.array(per_side)
    return prepared_feats.flatten() if together else prepared_feats

In [None]:
# Game mode id 23 becomes a 0/1 flag; the raw id is dropped afterwards.
is_turbo = (df["game_mode_id"] == 23).astype(np.uint8).values
df.insert(1, "is_turbo_mode", is_turbo)
df = df.drop(columns="game_mode_id")

# Store the per-player pred_vict columns as small unsigned integers.
pred_vict_cols = get_feats("pred_vict")
df[pred_vict_cols] = df[pred_vict_cols].astype(np.uint8)

In [None]:
# Attach the static hero attributes (from load_heroes) to every player slot.
pheroes = pd.DataFrame(heroes)
pheroes.set_index("id", inplace=True)
for s in ["r", "d"]:
    for n in range(1, 6):
        prefix = f"{s}{n}_"
        # Look up the hero of this slot; the left join keeps the row order of
        # df. .iloc[:, 1:] drops the first column of the merge result
        # (presumably the hero id key -- verify), and the remaining columns
        # are renamed to "<prefix>hero_<attribute>".
        df_heroes_info = (
            df[[f"{prefix}hero_id"]]
            .merge(pheroes, left_on=f"{prefix}hero_id", right_on="id", how="left")
            .iloc[:, 1:]
            .add_prefix(f"{prefix}hero_")
        )
        for col in df_heroes_info.columns:
            # .values bypasses index alignment, so the merge output is
            # assigned to df row by row in positional order.
            df.insert(0, col, df_heroes_info.loc[:, col].values)
            # Columns with exactly two distinct values are stored as uint8.
            if df[col].nunique() == 2:
                df[col] = df[col].astype(np.uint8)

In [None]:
# "xy" is x + y per player. Per the original author's note, a large sum means
# the hero is near the dire throne and a small one near the radiant throne.
xy_sum = df[get_feats("x")].values + df[get_feats("y")].values
df[get_feats("xy")] = xy_sum

In [None]:
# One account_id_hash value shows up very often across the player columns;
# flag its occurrences in case that is informative.
# .mode() yields the most frequent value(s) of each of the ten columns;
# .mode(axis=1)[0].values[0] then takes the most common value within the
# first row of that result.
# NOTE(review): this is the most common of the per-column modes, which is not
# necessarily the globally most frequent hash -- confirm this is intended.
most_frequent_acc_id_hash = (
    df[get_feats("account_id_hash")].mode().mode(axis=1)[0].values[0]
)

# 0/1 flag per player: does the player carry that hash?
df[get_feats("is_mf_hash")] = (
    df[get_feats("account_id_hash")] == most_frequent_acc_id_hash
) * 1
# The raw hashes are not used as features afterwards.
df.drop(get_feats("account_id_hash"), axis=1, inplace=True)

In [None]:
# KDA per player: (kills + assists) / (deaths + 1); the +1 avoids dividing by zero.
kills_and_assists = df[get_feats("kills")].values + df[get_feats("assists")].values
deaths_plus_one = df[get_feats("deaths")].values + 1
df[get_feats("kda")] = kills_and_assists / deaths_plus_one

In [None]:
# Create rate features gold_pm, xp_pm and kda_pm: each per-player value
# divided by game_time.
# NOTE(review): "per minute" only holds if game_time is in minutes -- confirm the unit.
# A game_time of 0 is replaced by 99999 to avoid division by zero (the rate
# then becomes close to 0). np.tile repeats game_time 30 times (3 features
# x 10 players) so the divisor has one column per value column.
df[get_feats(["gold", "xp", "kda"], suffixes="_pm")] = df[
    get_feats(["gold", "xp", "kda"])
].values / (np.tile(df["game_time"].replace({0: 99999}).values, (30, 1)).T)

In [None]:
%%time
# Aggregate every per-player feature per team by sum, mean, max and min.
# e.g.: 'r1_gold', 'r2_gold', 'r3_gold', 'r4_gold', 'r5_gold' -> 'r_pt_gold' (sum), 'r_mean_gold', 'r_max_gold', 'r_min_gold'
feats_to_group = [
    "hero_agi",
    "hero_disabler",
    "firstblood_claimed",
    "healing",
    "xy",
    "hero_nuker",
    "gold",
    "lh",
    "hero_jungler",
    "hero_str",
    "hero_carry",
    "rune_pickups",
    "hero_ranged",
    "roshans_killed",
    "hero_escape",
    "kda",
    "obs_placed",
    "xp",
    "hero_pusher",
    "damage",
    "towers_killed",
    "is_mf_hash",
    "denies",
    "hero_initiator",
    "assists",
    "hero_support",
    "hero_melee",
    "kills",
    "hero_durable",
    "deaths",
    "item_aegis",
    "hero_hits",
    "multi_kills",
    "hero_int",
    "sen_placed",
    "pred_vict",
    "camps_stacked",
    "creeps_stacked",
    "max_health",
    "max_mana",
    "stuns",
    "teamfight_participation",
    "gold_t",
    "lh_t",
    "xp_t",
]

# Choose one of three equivalent implementations of the team aggregation.
GROUPED_FEATS = {
    "Fastest but hard explainable": 1,
    "Middle in speed and explainability": 2,
    "Slowest but clear": 3,
}
THIS_TIME_WAY = GROUPED_FEATS["Fastest but hard explainable"]

# Helpers shared by the two array-based variants (1 and 2).
if THIS_TIME_WAY < 3:
    # Returns an (n_rows, 4) array with the row-wise sum, mean, max and min,
    # in the same order as GROUPS below.
    sum_mean_max_min = lambda feat: np.array(
        [feat.sum(axis=1), feat.mean(axis=1), feat.max(axis=1), feat.min(axis=1)]
    ).T

    GROUPS = ["_pt_", "_mean_", "_max_", "_min_"]
    N_GROUPS = len(GROUPS)
    N_PLAYERS = 5

if THIS_TIME_WAY == GROUPED_FEATS['Fastest but hard explainable']:
    # All names are generated once. get_feats() iterates the features in the
    # outer loop, so feature number k owns the slice [k*N_GROUPS, (k+1)*N_GROUPS)
    # of the team names and [k*N_PLAYERS, (k+1)*N_PLAYERS) of the player names.
    feats_to_group_fet_radiant, feats_to_group_fet_dire = get_feats(
        feats_to_group, seps=GROUPS, feats_type="fet", together=False
    )
    feats_to_group_fep_radiant, feats_to_group_fep_dire = get_feats(
        feats_to_group, together=False
    )
    for feat_id, feat_name in enumerate(feats_to_group):
        # Radiant: four aggregate columns from the five player columns.
        df.loc[
            :,
            feats_to_group_fet_radiant[feat_id * N_GROUPS : feat_id * N_GROUPS + N_GROUPS],
        ] = sum_mean_max_min(
            df.loc[
                :,
                feats_to_group_fep_radiant[
                    feat_id * N_PLAYERS : feat_id * N_PLAYERS + N_PLAYERS
                ],
            ]
        )
        # Dire: same computation on the dire columns.
        df.loc[
            :, feats_to_group_fet_dire[feat_id * N_GROUPS : feat_id * N_GROUPS + N_GROUPS]
        ] = sum_mean_max_min(
            df.loc[
                :,
                feats_to_group_fep_dire[
                    feat_id * N_PLAYERS : feat_id * N_PLAYERS + N_PLAYERS
                ],
            ]
        )
elif THIS_TIME_WAY == GROUPED_FEATS['Middle in speed and explainability']:
    # Same aggregation, but the names are generated per feature, so no slice
    # arithmetic is needed.
    for feat in feats_to_group:

        fet_radiant_feats, fet_dire_feats = get_feats(
            feat, seps=GROUPS, feats_type="fet", together=False
        )
        fep_radiant_feats, fep_dire_feats = get_feats(feat, together=False)

        df.loc[:, fet_radiant_feats] = sum_mean_max_min(df.loc[:, fep_radiant_feats])
        df.loc[:, fet_dire_feats] = sum_mean_max_min(df.loc[:, fep_dire_feats])

elif THIS_TIME_WAY == GROUPED_FEATS["Slowest but clear"]:
    # Plain pandas version: one explicit statement per aggregate and side.
    for col in feats_to_group:
        radiant_summary = [f"r{n}_{col}" for n in range(1, 6)]
        dire_summary = [f"d{n}_{col}" for n in range(1, 6)]

        df["r_pt_" + col] = df[radiant_summary].sum(axis=1)
        df["d_pt_" + col] = df[dire_summary].sum(axis=1)

        df["r_mean_" + col] = df[radiant_summary].mean(axis=1)
        df["d_mean_" + col] = df[dire_summary].mean(axis=1)

        df["r_max_" + col] = df[radiant_summary].max(axis=1)
        df["d_max_" + col] = df[dire_summary].max(axis=1)

        df["r_min_" + col] = df[radiant_summary].min(axis=1)
        df["d_min_" + col] = df[dire_summary].min(axis=1)
else:
    # THIS_TIME_WAY does not match any known variant.
    print("Choose wisely your dear, stalker", file=sys.stderr)

In [None]:
import math

map_center = (127.0, 127.0)


def rotate(origin, point, angle):
    """Rotate ``point`` counterclockwise around ``origin``.

    ``origin`` and ``point`` are ``(x, y)`` pairs and ``angle`` is given in
    radians. Returns the rotated ``(x, y)`` pair.
    """
    ox, oy = origin
    px, py = point
    dx = px - ox
    dy = py - oy
    cos_a = math.cos(angle)
    sin_a = math.sin(angle)

    qx = ox + cos_a * dx - sin_a * dy
    qy = oy + sin_a * dx + cos_a * dy
    return qx, qy


# Rotated coordinates of every player around the map centre.
# The original took component [0] of rotate() for both the *_x_rot and the
# *_y_rot columns, so every *_y_rot column was a copy of *_x_rot. Here
# *_y_rot gets the rotated y component. rotate() only uses arithmetic, so it
# is applied to whole columns instead of once per row.
ROTATION_ANGLE = 0.785398  # radians, roughly pi / 4
for axis_idx, axis in enumerate(["x", "y"]):
    for s in ["r", "d"]:
        for n in range(1, 6):
            player = f"{s}{n}"
            df[f"{player}_{axis}_rot"] = rotate(
                map_center,
                (df[f"{player}_x"], df[f"{player}_y"]),
                ROTATION_ANGLE,
            )[axis_idx]

In [None]:
from pathlib import Path
import re
import os
import numpy as np
import pandas as pd

import ast
from sklearn.metrics import roc_auc_score
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import (
    train_test_split,
    GridSearchCV,
    cross_val_score,
    ShuffleSplit,
)

from sklearn.feature_extraction.text import CountVectorizer

import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline
matplotlib.style.use("ggplot")
plt.rc("axes", titlesize=18)
plt.rc("axes", labelsize=18)
plt.rc("xtick", labelsize=16)
plt.rc("ytick", labelsize=16)
sns.set_style("whitegrid")

import eli5
from IPython.display import display_html

In [None]:
def load_items():
    """Load the per-player item lists and add mirrored training rows.

    Reads ``train_items.csv`` and ``test_items.csv`` from ``PATH_TO_ADD_DATA``
    (indexed by ``match_id_hash``), parses every cell from its string form
    with ``ast.literal_eval`` and appends a mirrored copy of the training rows
    in which the dire and radiant columns are swapped. Mirrored rows get ids
    whose first seven characters are replaced by ``"mirror_"``.

    Assumes the item columns are ordered ``r1_items`` .. ``r5_items``,
    ``d1_items`` .. ``d5_items`` -- the mirrored frame reuses the original
    column labels by position.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        The training frame (original + mirrored rows) and the test frame.

    The original additionally read ``train_targets.csv`` and built mirrored
    targets, but never returned or used them; that dead work is removed.
    """
    train_df = pd.read_csv(
        os.path.join(PATH_TO_ADD_DATA, "train_items.csv"), index_col="match_id_hash"
    ).drop(columns="Unnamed: 0")
    test_df = pd.read_csv(
        os.path.join(PATH_TO_ADD_DATA, "test_items.csv"), index_col="match_id_hash"
    ).drop(columns="Unnamed: 0")

    radiant_items, dire_items = [
        [f"{side}{i}_items" for i in range(1, 6)] for side in ["r", "d"]
    ]

    # Cells are stored as text; turn them back into Python objects.
    for dataframe in (train_df, test_df):
        for col in dataframe.columns:
            dataframe[col] = dataframe[col].apply(ast.literal_eval)

    # Swap the sides: select dire first, then relabel with the original order.
    mirror_train_df = train_df[dire_items + radiant_items].copy()
    mirror_train_df.columns = train_df.columns
    mirror_train_df.index = ["mirror_" + idx[7:] for idx in mirror_train_df.index]

    # pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
    train_df = pd.concat([train_df, mirror_train_df])
    return train_df, test_df

In [None]:
def item_transform():
    """Build bag-of-items difference features (radiant minus dire).

    The item lists of the five players of a side are joined into one text per
    match, ``"recipe_"`` is stripped and ``river_painter<digit>`` variants
    are merged. A ``CountVectorizer`` fitted on the radiant training texts
    counts the items of each side; the feature is the count difference
    ``radiant - dire``. Consumable items are dropped.

    Returns
    -------
    pandas.DataFrame
        Training rows (including mirrored ones from ``load_items``) followed
        by test rows, indexed like the frames returned by ``load_items``.
    """
    train_df, test_df = load_items()
    radiant_items, dire_items = [
        [f"{side}{i}_items" for i in range(1, 6)] for side in ["r", "d"]
    ]

    # One space-separated string of item names per match and side.
    ltostr = lambda row: " ".join([" ".join(i) for i in row])
    r_temp, d_temp, r_temp_test, d_temp_test = [
        df_[side_items].apply(ltostr, axis=1)
        for df_ in [train_df, test_df]
        for side_items in [radiant_items, dire_items]
    ]

    rx = r"{0}[0-9]".format("river_painter")
    r_items, d_items, r_items_test, d_items_test = [
        [re.sub(rx, "river_painter", x.replace("recipe_", "")) for x in side]
        for side in [r_temp, d_temp, r_temp_test, d_temp_test]
    ]

    # The vocabulary comes from the radiant training texts only.
    vectorizer = CountVectorizer().fit(r_items)
    r, d, r_test, d_test = [
        vectorizer.transform(side_items).toarray()
        for side_items in [r_items, d_items, r_items_test, d_items_test]
    ]

    # get_feature_names() was removed in scikit-learn 1.2; prefer the newer
    # accessor and fall back to the old one on older versions.
    if hasattr(vectorizer, "get_feature_names_out"):
        feature_names = list(vectorizer.get_feature_names_out())
    else:
        feature_names = list(vectorizer.get_feature_names())

    count_vect_df = pd.DataFrame(r - d, columns=feature_names)
    count_vect_df_test = pd.DataFrame(r_test - d_test, columns=feature_names)

    consumables = [
        "tango",
        "tpscroll",
        "bottle",
        "flask",
        "enchanted_mango",
        "courier",
        "clarity",
        "faerie_fire",
        "ward_observer",
        "ward_sentry",
        "river_painter",
    ]
    count_vect_df = count_vect_df.drop(columns=consumables)
    count_vect_df_test = count_vect_df_test.drop(columns=consumables)

    count_vect_df.index = train_df.index
    count_vect_df_test.index = test_df.index
    # pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
    return pd.concat([count_vect_df, count_vect_df_test])

In [None]:
# Add the item-count difference features; rows are matched on the index.
item_count_diffs = item_transform()
df = df.join(item_count_diffs)

In [None]:
X_heroes_train = pd.read_csv(
    os.path.join(PATH_TO_ADD_DATA, "bag_of_heroes_and_win_prob_train.csv")
)
X_heroes_test = pd.read_csv(
    os.path.join(PATH_TO_ADD_DATA, "bag_of_heroes_and_win_prob_test.csv")
)

# The files carry no match ids; the rows are assumed to be in the same order
# as df_train / df_test.
X_heroes_train.index = df_train.index
X_heroes_test.index = df_test.index

X_heroes_train_flip = X_heroes_train.copy()


def train_boh_flip(X_heroes_train_flip, X_heroes_train):
    """Negate the columns ``f0`` .. ``f114`` for the mirrored matches.

    Values are mapped 1 -> -1, -1 -> 1 and 0 -> 0; any other value becomes
    NaN. ``X_heroes_train_flip`` is modified in place and returned.
    """
    players = [f"f{i}" for i in range(0, 115)]
    for player in players:
        X_heroes_train_flip[player] = X_heroes_train[player].map({1: -1, -1: 1, 0: 0})
    return X_heroes_train_flip


X_heroes_train_flip = train_boh_flip(X_heroes_train_flip, X_heroes_train)


def rd_hero_win(X_heroes_train_flip, X_heroes_train):
    """Swap the radiant/dire win probabilities for the mirrored matches.

    ``win_prob`` is recomputed as ``r_win_prob - d_win_prob`` after the swap.
    ``X_heroes_train_flip`` is modified in place and returned.
    """
    X_heroes_train_flip["r_win_prob"] = X_heroes_train["d_win_prob"]
    X_heroes_train_flip["d_win_prob"] = X_heroes_train["r_win_prob"]
    X_heroes_train_flip["win_prob"] = (
        X_heroes_train_flip["r_win_prob"] - X_heroes_train_flip["d_win_prob"]
    )
    return X_heroes_train_flip


X_heroes_train_flip = rd_hero_win(X_heroes_train_flip, X_heroes_train)
# Same id scheme as the other mirrored frames: first seven characters replaced.
X_heroes_train_flip.index = ["mirror_" + idx[7:] for idx in X_heroes_train.index]
X_heroes_train_flip.to_csv(
    os.path.join(PATH_TO_ADD_DATA, "bag_of_heroes_and_win_prob_train_flip.csv"),
    index=False,
)

# pd.concat replaces DataFrame.append, which was removed in pandas 2.0; the
# row order (train, mirrored train, test) is unchanged.
df = df.join(pd.concat([X_heroes_train, X_heroes_train_flip, X_heroes_test]))

# SAVE DF

In [None]:
# Drop the per-player hero names and ids and the next-Roshan target before
# saving; output=True reports any of these columns that are already missing.
df = intel_delete_column_rd(
    df, ["FEP_hero_name", "FEM_target_next_roshan_team", "FEP_hero_id"], output=True
)
# NOTE: `index` expects a bool; the string passed here is merely truthy, so
# the index is written but it is not what names the index column in the file.
# The loading cell accounts for this by renaming "Unnamed: 0" to "match_id_hash".
df.to_csv(os.path.join(PATH_TO_DATA, "df_final.csv"), index="match_id_hash")

# LOAD DF

In [None]:
%%time
# The target is read as text so that missing targets stay NaN.
df = pd.read_csv(
    os.path.join(PATH_TO_DATA, "df_final.csv"), dtype={"target_radiant_win": object}
)
# The saved index column has no header; give it its name back and restore it.
df = df.rename(columns={"Unnamed: 0": "match_id_hash"})
df = df.set_index("match_id_hash")

In [None]:
target_cols = [col for col in df.columns if col.startswith("target")]

# Rows with a known outcome form the training set, the rest the test set.
has_target = ~df["target_radiant_win"].isnull()
train = df[has_target].drop(target_cols, axis=1)
test = df[~has_target].drop(target_cols, axis=1)

# The target was read as text; convert "True"/"False" to 1/0.
train_targets = (
    df.loc[has_target, "target_radiant_win"]
    .replace({"True": 1, "False": 0})
    .astype(np.uint8)
)

# Untouched copy of the training features.
train_def = train.copy()

# FEATURE SELECTION

In [None]:
import pandas as pd
import numpy as np

# featuretools for automated feature engineering
import featuretools as ft


# matplotlib and seaborn for visualizations
import matplotlib.pyplot as plt

plt.rcParams["font.size"] = 22
import seaborn as sns

# Suppress warnings from pandas
import warnings

warnings.filterwarnings("ignore")

# modeling
import lightgbm as lgb

# utilities
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import LabelEncoder

# memory management
import gc

import lightgbm as lgb
from sklearn.model_selection import train_test_split

In [None]:
def to_generalize_features(features, threshold=-1):
    """Collapse concrete column names into generalized ``FEP_/FET_/FEM_`` patterns.

    * ``<side><digit>_<x>`` (first character not ``"f"``) -> ``FEP_<x>``
    * ``<side>_<x>`` -> ``FET_<x>``
    * anything else -> ``FEM_<name>``

    ``FEP_`` patterns are kept once each, ordered by how often they occurred,
    and only when they occurred more than ``threshold`` times. They are
    followed by the ``FET_``/``FEM_`` patterns in input order.
    """
    per_player, others = [], []
    for name in features:
        if name[1].isnumeric() and name[0] != "f":
            per_player.append("FEP_" + name[3:])
        elif name[1] == "_":
            others.append("FET_" + name[2:])
        else:
            others.append("FEM_" + name)

    counts = pd.Series(per_player).value_counts()
    frequent = counts[counts > threshold].index
    return list(frequent) + others

## Remove Collinear Variables

In [None]:
# Columns correlated above this value are candidates for removal
threshold = 0.9

# Absolute pairwise correlations, computed on the first 200 training rows only
corr_matrix = train.iloc[:200].corr().abs()
corr_matrix.head(2)

In [None]:
# Keep only the strict upper triangle (k=1 excludes the diagonal) so every
# pair of columns is looked at once; everything else becomes NaN.
# The builtin bool replaces the np.bool alias, which was removed in NumPy 1.24.
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
upper.head(2)

In [None]:
# A column is dropped when it correlates above the threshold with any column
# that comes before it (NaN entries never exceed the threshold).
exceeds_threshold = (upper > threshold).any()
to_drop = upper.columns[exceeds_threshold].tolist()

print("There are %d columns to remove." % (len(to_drop)))

In [None]:
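# to_drop: hard-coded list of columns to remove; it replaces the `to_drop` computed from the correlation matrix in the previous cell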
to_drop = [
    "d5_hero_melee",
    "d4_hero_melee",
    "d3_hero_melee",
    "d2_hero_melee",
    "d1_hero_melee",
    "r5_hero_melee",
    "r4_hero_melee",
    "r3_hero_melee",
    "r2_hero_melee",
    "r1_hero_melee",
    "is_ranked_lobby",
    "r_pt_xp_delta",
    "r2_xp_delta",
    "r3_xp_delta",
    "r4_xp_delta",
    "r5_xp_delta",
    "r_pt_damage",
    "d_pt_xp_delta",
    "d2_xp_delta",
    "d3_xp_delta",
    "d4_xp_delta",
    "d5_xp_delta",
    "d_pt_damage",
    "r1_xp",
    "r1_nearby_creep_death_count",
    "r1_camps_stacked",
    "r1_gold_reasons",
    "r1_xp_reasons",
    "r2_xp",
    "r2_nearby_creep_death_count",
    "r2_camps_stacked",
    "r2_gold_reasons",
    "r2_xp_reasons",
    "r3_xp",
    "r3_nearby_creep_death_count",
    "r3_camps_stacked",
    "r3_gold_reasons",
    "r3_xp_reasons",
    "r4_xp",
    "r4_nearby_creep_death_count",
    "r4_camps_stacked",
    "r4_gold_reasons",
    "r4_xp_reasons",
    "r5_xp",
    "r5_nearby_creep_death_count",
    "r5_camps_stacked",
    "r5_gold_reasons",
    "r5_xp_reasons",
    "d1_xp",
    "d1_nearby_creep_death_count",
    "d1_camps_stacked",
    "d1_gold_reasons",
    "d1_xp_reasons",
    "d2_xp",
    "d2_nearby_creep_death_count",
    "d2_camps_stacked",
    "d2_gold_reasons",
    "d2_xp_reasons",
    "d3_xp",
    "d3_nearby_creep_death_count",
    "d3_camps_stacked",
    "d3_gold_reasons",
    "d3_xp_reasons",
    "d4_xp",
    "d4_nearby_creep_death_count",
    "d4_camps_stacked",
    "d4_gold_reasons",
    "d4_xp_reasons",
    "d5_xp",
    "d5_nearby_creep_death_count",
    "d5_camps_stacked",
    "d5_gold_reasons",
    "d5_xp_reasons",
    "r1_xp_pm",
    "r2_gold_pm",
    "r2_xp_pm",
    "r3_gold_pm",
    "r3_xp_pm",
    "r4_gold_pm",
    "r4_xp_pm",
    "r5_gold_pm",
    "r5_xp_pm",
    "d1_xp_pm",
    "d2_gold_pm",
    "d2_xp_pm",
    "d3_gold_pm",
    "d3_xp_pm",
    "d4_gold_pm",
    "d4_xp_pm",
    "d5_gold_pm",
    "d5_xp_pm",
    "r_mean_hero_agi",
    "d_mean_hero_agi",
    "r_mean_hero_disabler",
    "d_mean_hero_disabler",
    "r_mean_firstblood_claimed",
    "d_mean_firstblood_claimed",
    "r_max_firstblood_claimed",
    "d_max_firstblood_claimed",
    "r_mean_healing",
    "d_mean_healing",
    "r_max_healing",
    "d_max_healing",
    "r_mean_xy",
    "d_mean_xy",
    "r_mean_hero_nuker",
    "d_mean_hero_nuker",
    "r_pt_gold",
    "d_pt_gold",
    "r_mean_gold",
    "d_mean_gold",
    "r_max_gold",
    "d_max_gold",
    "r_min_gold",
    "d_min_gold",
    "r_pt_lh",
    "d_pt_lh",
    "r_mean_lh",
    "d_mean_lh",
    "r_max_lh",
    "d_max_lh",
    "r_mean_hero_jungler",
    "d_mean_hero_jungler",
    "r_max_hero_jungler",
    "d_max_hero_jungler",
    "r_mean_hero_str",
    "d_mean_hero_str",
    "r_mean_hero_carry",
    "d_mean_hero_carry",
    "r_mean_rune_pickups",
    "d_mean_rune_pickups",
    "r_max_rune_pickups",
    "d_max_rune_pickups",
    "r_mean_hero_ranged",
    "d_mean_hero_ranged",
    "r_mean_roshans_killed",
    "d_mean_roshans_killed",
    "r_max_roshans_killed",
    "d_max_roshans_killed",
    "r_mean_hero_escape",
    "d_mean_hero_escape",
    "r_mean_kda",
    "d_mean_kda",
    "r_max_kda",
    "d_max_kda",
    "r_mean_obs_placed",
    "d_mean_obs_placed",
    "r_max_obs_placed",
    "d_max_obs_placed",
    "r_pt_xp",
    "d_pt_xp",
    "r_mean_xp",
    "d_mean_xp",
    "r_max_xp",
    "d_max_xp",
    "r_min_xp",
    "d_min_xp",
    "r_mean_hero_pusher",
    "d_mean_hero_pusher",
    "r_mean_damage",
    "d_mean_damage",
    "r_max_damage",
    "d_max_damage",
    "r_mean_towers_killed",
    "d_mean_towers_killed",
    "r_max_towers_killed",
    "d_max_towers_killed",
    "r_mean_is_mf_hash",
    "d_mean_is_mf_hash",
    "r_mean_denies",
    "d_mean_denies",
    "r_max_denies",
    "d_max_denies",
    "r_mean_hero_initiator",
    "d_mean_hero_initiator",
    "r_mean_assists",
    "d_mean_assists",
    "r_max_assists",
    "d_max_assists",
    "r_min_assists",
    "d_min_assists",
    "r_mean_hero_support",
    "d_mean_hero_support",
    "r_pt_hero_melee",
    "d_pt_hero_melee",
    "r_mean_hero_melee",
    "d_mean_hero_melee",
    "r_max_hero_melee",
    "d_max_hero_melee",
    "r_min_hero_melee",
    "d_min_hero_melee",
    "r_pt_kills",
    "d_pt_kills",
    "r_mean_kills",
    "d_mean_kills",
    "r_max_kills",
    "d_max_kills",
    "r_mean_hero_durable",
    "d_mean_hero_durable",
    "r_pt_deaths",
    "d_pt_deaths",
    "r_mean_deaths",
    "d_mean_deaths",
    "r_max_deaths",
    "d_max_deaths",
    "r_mean_item_aegis",
    "d_mean_item_aegis",
    "r_max_item_aegis",
    "d_max_item_aegis",
    "r_mean_hero_hits",
    "d_mean_hero_hits",
    "r_max_hero_hits",
    "d_max_hero_hits",
    "r_mean_multi_kills",
    "d_mean_multi_kills",
    "r_max_multi_kills",
    "d_max_multi_kills",
    "r_mean_hero_int",
    "d_mean_hero_int",
    "r_mean_sen_placed",
    "d_mean_sen_placed",
    "r_max_sen_placed",
    "d_max_sen_placed",
    "r_mean_pred_vict",
    "d_mean_pred_vict",
    "r_mean_camps_stacked",
    "d_mean_camps_stacked",
    "r_max_camps_stacked",
    "d_max_camps_stacked",
    "r_pt_creeps_stacked",
    "d_pt_creeps_stacked",
    "r_mean_creeps_stacked",
    "d_mean_creeps_stacked",
    "r_max_creeps_stacked",
    "d_max_creeps_stacked",
    "r_min_creeps_stacked",
    "d_min_creeps_stacked",
    "r_pt_max_health",
    "d_pt_max_health",
    "r_mean_max_health",
    "d_mean_max_health",
    "r_max_max_health",
    "d_max_max_health",
    "r_min_max_health",
    "d_min_max_health",
    "r_pt_max_mana",
    "d_pt_max_mana",
    "r_mean_max_mana",
    "d_mean_max_mana",
    "r_max_max_mana",
    "d_max_max_mana",
    "r_mean_stuns",
    "d_mean_stuns",
    "r_max_stuns",
    "d_max_stuns",
    "r_mean_teamfight_participation",
    "d_mean_teamfight_participation",
    "r_mean_gold_t",
    "d_mean_gold_t",
    "r_max_gold_t",
    "d_max_gold_t",
    "r_mean_lh_t",
    "d_mean_lh_t",
    "r_mean_xp_t",
    "d_mean_xp_t",
    "r1_y_rot",
    "r2_y_rot",
    "r3_y_rot",
    "r4_y_rot",
    "r5_y_rot",
    "d1_y_rot",
    "d2_y_rot",
    "d3_y_rot",
    "d4_y_rot",
    "d5_y_rot",
]

In [None]:
# TODO: maybe better to delete all of this
# (Scratch check, left disabled: it would print the maximum correlation of
# columns covered by the generalized patterns but missing from to_drop.
# `bad_feats` is not defined in the visible part of the notebook.)
#
# bf = bad_feats(to_drop, [])
# to_delete_cols = intel_delete_column_rd(train, bf, get_pcols=True)
# for col in list(set(to_delete_cols) - set(to_drop)):
#     print(col, upper[col].max())
#####

# Remove the selected columns from both sets; drop() raises a KeyError if a
# listed column is missing.
train = train.drop(columns=to_drop)
test = test.drop(columns=to_drop)

print("Training shape: ", train.shape)
print("Testing shape: ", test.shape)

## Feature Selection through Feature Importances

In [None]:
def plot_feature_importances(df, threshold=0.9, doplot=True):
    """Normalise feature importances and optionally plot them.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``feature`` column and an ``importance`` column.
    threshold : float, default 0.9
        Cumulative-importance level reported after the plots are drawn.
    doplot : bool, default True
        When True, draw a bar chart of the 15 most important features and a
        cumulative-importance curve, and print how many features are needed
        to exceed ``threshold``. When False, matplotlib is not touched at all.

    Returns
    -------
    pandas.DataFrame
        ``df`` sorted by descending importance with a fresh integer index
        (the previous index is kept in an ``index`` column), extended with
        ``importance_normalized`` and ``cumulative_importance`` columns.
    """
    # Sort features according to importance
    df = df.sort_values("importance", ascending=False).reset_index()

    # Normalize the feature importances to add up to one
    df["importance_normalized"] = df["importance"] / df["importance"].sum()
    df["cumulative_importance"] = np.cumsum(df["importance_normalized"])

    if doplot:
        # Only change global matplotlib state when something is actually drawn.
        plt.rcParams["font.size"] = 18

        top = df.head(15)
        # Reverse the positions so the most important feature ends up on top.
        bar_positions = list(reversed(list(top.index)))

        # Horizontal bar chart of the leading feature importances
        fig, ax = plt.subplots(figsize=(10, 6))
        ax.barh(
            bar_positions,
            top["importance_normalized"],
            align="center",
            edgecolor="k",
        )
        ax.set_yticks(bar_positions)
        ax.set_yticklabels(top["feature"])
        ax.set_xlabel("Normalized Importance")
        ax.set_title("Feature Importances")
        plt.show()

        # Cumulative importance plot
        plt.figure(figsize=(8, 6))
        plt.plot(list(range(len(df))), df["cumulative_importance"], "r-")
        plt.xlabel("Number of Features")
        plt.ylabel("Cumulative Importance")
        plt.title("Cumulative Feature Importance")
        plt.show()

        # Positions where the cumulative importance exceeds the threshold.
        # This is empty when the threshold is never exceeded (e.g. threshold
        # >= 1), a case in which taking a minimum would raise.
        above = np.flatnonzero(df["cumulative_importance"].values > threshold)
        if above.size:
            print(
                "%d features required for %0.2f of cumulative importance"
                % (above[0] + 1, threshold)
            )
        else:
            print(
                "Cumulative importance never exceeds %0.2f with %d features"
                % (threshold, len(df))
            )

    return df

In [None]:
def identify_zero_importance_features(train, train_labels, iterations=2):
    """Average LightGBM feature importances over several train/valid splits.

    Parameters
    ----------
    train : pandas.DataFrame
        Feature matrix; its columns name the features.
    train_labels : array-like
        Binary targets aligned row-by-row with ``train``.
    iterations : int, default 2
        Number of independent 75/25 splits to fit and average over.

    Returns
    -------
    (list, pandas.DataFrame)
        The names of features whose averaged importance is exactly 0.0, and a
        frame with ``feature`` / ``importance`` columns sorted by descending
        importance.
    """
    # Accumulator for the averaged feature importances
    feature_importances = np.zeros(train.shape[1])

    # Create the model with several hyperparameters
    model = lgb.LGBMClassifier(
        objective="binary", boosting_type="goss", n_estimators=10000
    )

    # Fit the model on several different splits so the averaged importances
    # do not hinge on one particular train/validation partition.
    for i in range(iterations):

        # A different (but reproducible) seed per iteration: with a constant
        # seed every iteration would see the identical split and the average
        # would equal a single fit.
        train_features, valid_features, train_y, valid_y = train_test_split(
            train, train_labels, test_size=0.25, random_state=SEED + i
        )

        # Train using early stopping
        # NOTE(review): `early_stopping_rounds` / `verbose` as fit kwargs are
        # only accepted by LightGBM < 4 - confirm the pinned version.
        model.fit(
            train_features,
            train_y,
            early_stopping_rounds=100,
            eval_set=[(valid_features, valid_y)],
            eval_metric="auc",
            verbose=200,
        )

        # Record the feature importances
        feature_importances += model.feature_importances_ / iterations

    feature_importances = pd.DataFrame(
        {"feature": list(train.columns), "importance": feature_importances}
    ).sort_values("importance", ascending=False)

    # Find the features with zero importance
    zero_features = list(
        feature_importances[feature_importances["importance"] == 0.0]["feature"]
    )
    print("\nThere are %d features with 0.0 importance" % len(zero_features))

    return zero_features, feature_importances

In [None]:
def get_cols_under_treshold(
    df,
    threshold,
    targets,
    get_over_threshold=False,
    iterations=5,
    feature_importances_=None,
):
    """Keep the columns whose cumulative importance stays below ``threshold``.

    Parameters
    ----------
    df : pandas.DataFrame
        Feature matrix to filter.
    threshold : float
        Cumulative normalised importance cut-off; a feature is kept while the
        running total is strictly below this value.
    targets : array-like
        Labels, only used when the importances have to be computed here.
    get_over_threshold : bool, default False
        When True, also return the frame of the remaining columns.
    iterations : int, default 5
        Passed to ``identify_zero_importance_features`` when it is called.
    feature_importances_ : pandas.DataFrame, optional
        Precomputed ``feature`` / ``importance`` frame. When None it is
        computed from ``df`` and ``targets``.

    Returns
    -------
    pandas.DataFrame or (pandas.DataFrame, pandas.DataFrame)
        The selected columns, plus the rejected ones if requested.
    """
    # Decide on the *argument*, not on a like-named notebook global.
    if feature_importances_ is None:
        _, feature_importances_ = identify_zero_importance_features(
            df, targets, iterations
        )

    norm_feature_importances = plot_feature_importances(
        feature_importances_, threshold=threshold, doplot=False
    )
    cols_under_cum_imp_threshold = norm_feature_importances.loc[
        norm_feature_importances["cumulative_importance"] < threshold, "feature"
    ].values

    if get_over_threshold:
        # Use an ordered list rather than a set: set indexers are unordered
        # and are rejected by recent pandas versions.
        selected = set(cols_under_cum_imp_threshold)
        cols_over_threshold = [col for col in df.columns if col not in selected]
        return df[cols_under_cum_imp_threshold], df[cols_over_threshold]
    return df[cols_under_cum_imp_threshold]

# FINAL DATASET

In [None]:
from sklearn.model_selection import cross_validate
import lightgbm as lgb
from lightgbm import LGBMClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score


# Sweep over (drop hand-picked columns?) x (cumulative-importance threshold)
# and record the cross-validated ROC AUC of a LightGBM model for each setting.
np.random.seed(SEED)
permutation = np.random.permutation(train_def.shape[0])
TEST_STATUS = True

# Quick smoke-test settings vs. the full (slow) run.
if TEST_STATUS:
    N_ESTIMATORS = 100
    N_CV = 2
    RESTRICTION = 200
    N_ITERATIONS = 2
else:
    N_ESTIMATORS = 10000
    N_CV = 5
    RESTRICTION = 10 ** 5
    N_ITERATIONS = 5

summaries = []
feats_importances_ = None
lgb_classifier = lgb.LGBMClassifier(
    objective="binary", boosting_type="goss", n_estimators=N_ESTIMATORS
)

# Targets shuffled and truncated exactly like the feature frames below.
# Positional `.iloc` keeps rows aligned regardless of the index labels.
targets_subset = train_targets.iloc[permutation][:RESTRICTION]

for drop_ in [True, False]:

    if drop_:
        dropped_train = train_def.drop(columns=to_drop).iloc[permutation][:RESTRICTION]
    else:
        dropped_train = train_def.copy().iloc[permutation][:RESTRICTION]

    # Importances must come from the frame being evaluated, in the same row
    # order as the targets. Using a leftover `train` variable here would pair
    # unshuffled (or already column-filtered) features with shuffled labels.
    _, feats_importances_ = identify_zero_importance_features(
        dropped_train,
        targets_subset,
        iterations=N_ITERATIONS,
    )

    # 999 can never be reached by a cumulative share, i.e. "keep everything".
    for threshold in [0.95, 0.97, 0.98, 0.99, 999]:
        summary = {"threshold": threshold, "drop": drop_}

        selected_train = get_cols_under_treshold(
            dropped_train,
            threshold,
            targets_subset,
            iterations=N_ITERATIONS,
            feature_importances_=feats_importances_,
        )

        summary["cv_score"] = cross_validate(
            estimator=lgb_classifier,
            X=selected_train,
            y=targets_subset,
            cv=N_CV,
            scoring="roc_auc",
            n_jobs=-1,
        )
        print(summary["threshold"], summary["drop"])
        summaries.append(summary)
        print(summary)

    # Reset so that nothing downstream picks up importances from this sweep.
    feats_importances_ = None

In [None]:
# Hard-coded cross-validation results, one entry per (threshold, drop) pair.
# Each entry mirrors the dict built in the sweep cell: "cv_score" holds the
# per-fold "fit_time", "score_time" and "test_score" arrays.
# NOTE: this assignment replaces whatever `summaries` the sweep cell produced.
# NOTE(review): presumably pasted from an earlier full run (five folds per
# entry, matching N_CV = 5) - confirm before relying on these numbers.
summaries = [
    {
        "threshold": 0.95,
        "drop": False,
        "cv_score": {
            "fit_time": np.array(
                [
                    1551.17084408,
                    1549.20896697,
                    1553.70101213,
                    1547.86408401,
                    1551.00747991,
                ]
            ),
            "score_time": np.array(
                [87.98920989, 90.25121522, 87.72942209, 88.29967999, 89.2156179]
            ),
            "test_score": np.array(
                [0.82418862, 0.82313957, 0.83282496, 0.82868008, 0.82804857]
            ),
        },
    },
    {
        "threshold": 0.97,
        "drop": False,
        "cv_score": {
            "fit_time": np.array(
                [
                    1801.17132711,
                    1801.79713631,
                    1807.40070891,
                    1802.5189991,
                    1802.36746407,
                ]
            ),
            "score_time": np.array(
                [95.15213203, 94.58149195, 94.04338193, 94.53755307, 94.73621297]
            ),
            "test_score": np.array(
                [0.82593396, 0.82426528, 0.83298792, 0.82841072, 0.82996017]
            ),
        },
    },
    {
        "threshold": 0.98,
        "drop": False,
        "cv_score": {
            "fit_time": np.array(
                [
                    1983.7437439,
                    1983.00015306,
                    1982.55795097,
                    1987.76440716,
                    1981.96202707,
                ]
            ),
            "score_time": np.array(
                [101.19300508, 100.53555989, 99.56558895, 98.53505683, 99.54745865]
            ),
            "test_score": np.array(
                [0.82565301, 0.82328516, 0.83246367, 0.83179779, 0.8302153]
            ),
        },
    },
    {
        "threshold": 999,
        "drop": False,
        "cv_score": {
            "fit_time": np.array(
                [
                    2613.45792913,
                    2624.33366704,
                    2615.64451814,
                    2615.51148796,
                    2616.90589714,
                ]
            ),
            "score_time": np.array(
                [93.09312987, 90.40313911, 94.90813398, 93.90510297, 93.01988196]
            ),
            "test_score": np.array(
                [0.82622764, 0.8279041, 0.83426052, 0.82976092, 0.82882073]
            ),
        },
    },
    {
        "threshold": 0.98,
        "drop": True,
        "cv_score": {
            "fit_time": np.array(
                [
                    2098.179322,
                    2095.39246678,
                    2059.01631832,
                    2039.11520362,
                    1362.83123827,
                ]
            ),
            "score_time": np.array(
                [31.95075321, 31.42989707, 25.74079227, 25.20989347, 18.99360013]
            ),
            "test_score": np.array(
                [0.82021793, 0.82154949, 0.83004849, 0.82684412, 0.82608877]
            ),
        },
    },
    {
        "threshold": 999,
        "drop": True,
        "cv_score": {
            "fit_time": np.array(
                [
                    1776.4761889,
                    1776.09254622,
                    1780.76119709,
                    1772.02593589,
                    1777.50545907,
                ]
            ),
            "score_time": np.array(
                [94.13745904, 94.74571395, 91.6106019, 92.95641589, 93.26678681]
            ),
            "test_score": np.array(
                [0.82331943, 0.82002151, 0.82986343, 0.82779059, 0.82518124]
            ),
        },
    },
    {
        "threshold": 0.99,
        "drop": True,
        "cv_score": {
            "fit_time": np.array(
                [
                    2135.04629016,
                    2137.68749356,
                    2106.97155023,
                    2139.73715734,
                    1419.81394053,
                ]
            ),
            "score_time": np.array(
                [33.91747642, 31.37089729, 24.38686776, 23.97090578, 18.97910929]
            ),
            "test_score": np.array(
                [0.81959551, 0.81977512, 0.82911504, 0.82423267, 0.82619395]
            ),
        },
    },
    {
        "threshold": 0.95,
        "drop": True,
        "cv_score": {
            "fit_time": np.array(
                [
                    1284.54605222,
                    1285.85682988,
                    1288.0978291,
                    1286.75951481,
                    1288.83159399,
                ]
            ),
            "score_time": np.array(
                [89.94498301, 90.11361003, 89.30421901, 89.89981794, 89.86208701]
            ),
            "test_score": np.array(
                [0.81918739, 0.81988874, 0.82735385, 0.82314564, 0.82279818]
            ),
        },
    },
    {
        "threshold": 0.99,
        "drop": False,
        "cv_score": {
            "fit_time": np.array(
                [
                    2894.65649366,
                    2894.18757915,
                    2896.68515563,
                    2891.68123913,
                    1958.80399776,
                ]
            ),
            "score_time": np.array(
                [60.33294344, 59.83159113, 58.85685635, 58.79200721, 26.69429708]
            ),
            "test_score": np.array(
                [0.826693, 0.8251328, 0.83216345, 0.83043508, 0.83017442]
            ),
        },
    },
    {
        "threshold": 0.97,
        "drop": True,
        "cv_score": {
            "fit_time": np.array(
                [
                    1970.23303366,
                    1972.56270075,
                    1937.96423578,
                    1965.52252698,
                    1296.1558075,
                ]
            ),
            "score_time": np.array(
                [30.78945732, 35.06625056, 22.92393661, 23.19641376, 18.31756473]
            ),
            "test_score": np.array(
                [0.81853699, 0.82126441, 0.82815407, 0.82634223, 0.82464998]
            ),
        },
    },
]

In [None]:
# Per-experiment fold scores (one array per entry in `summaries`).
test_scores = [entry["cv_score"]["test_score"] for entry in summaries]

# One row per experiment with the fold-score statistics, best mean first
# (ties broken by the larger standard deviation, as both keys sort descending).
df_summaries = pd.DataFrame(
    dict(
        threshold=[entry["threshold"] for entry in summaries],
        drop=[entry["drop"] for entry in summaries],
        mean=np.mean(test_scores, axis=1),
        std=np.std(test_scores, axis=1),
        min=np.min(test_scores, axis=1),
        max=np.max(test_scores, axis=1),
    )
).sort_values(by=["mean", "std"], ascending=False)
df_summaries

In [None]:
# When True, use a manually chosen setting instead of the best-scoring one.
COMFORTABLE_FOR_WORK_DF = True

if COMFORTABLE_FOR_WORK_DF:
    f_drop = True
    f_threshold = 0.98
else:
    # Pick the setting with the highest mean CV score. The column built in
    # `df_summaries` is called "mean" (there is no "mean_score" column).
    f_drop, f_threshold = df_summaries.sort_values(by="mean", ascending=False).iloc[
        0
    ][["drop", "threshold"]]

In [None]:
# Start from the full default training frame, optionally without the
# hand-picked `to_drop` columns.
train = train_def.drop(columns=to_drop) if f_drop else train_def.copy()

# Keep only the features under the chosen cumulative-importance threshold
# and give the test frame the same columns in the same order.
train = get_cols_under_treshold(
    train, f_threshold, train_targets, iterations=N_ITERATIONS
)
test = test[train.columns]

In [None]:
# Persist the final dataset. `index` is a boolean switch; the header of the
# index column is set through `index_label`, which guarantees the
# "match_id_hash" column that the later `read_csv(..., index_col=...)` needs.
train.to_csv(
    os.path.join(PATH_TO_DATA, "final_train.csv"),
    index=True,
    index_label="match_id_hash",
)
test.to_csv(
    os.path.join(PATH_TO_DATA, "final_test.csv"),
    index=True,
    index_label="match_id_hash",
)
train_targets.to_csv(
    os.path.join(PATH_TO_DATA, "final_train_targets.csv"),
    index=True,
    index_label="match_id_hash",
)

# TRAIN MODEL & GET FINAL PREDICTIONS

In [None]:
# Model name -> predicted probabilities for the test set.
final_predictions = dict()

## Prepare data for training

In [None]:
def _read_final_csv(filename):
    """Read one persisted final CSV from PATH_TO_DATA, indexed by match_id_hash."""
    return pd.read_csv(os.path.join(PATH_TO_DATA, filename), index_col="match_id_hash")


train = _read_final_csv("final_train.csv")
test = _read_final_csv("final_test.csv")
train_targets = _read_final_csv("final_train_targets.csv")

In [None]:
# Reproducible row shuffle of the final training frame.
np.random.seed(SEED)
permutation = np.random.permutation(len(train))

# TEST_RIDE switches between a quick dry run and the full training budget:
# (row cap, boosting iterations, hyperopt evaluations).
TEST_RIDE = False
RESTRICTIONS, N_ITERATIONS, MAX_EVALS = (
    (2 * 10 ** 3, 10, 2) if TEST_RIDE else (10 ** 5, 10 ** 3, 50)
)
# The early-stopping patience is the same in both modes.
EARLY_STOP = 200

In [None]:
from sklearn.model_selection import train_test_split

# 80/20 hold-out split of the shuffled, row-capped training data. Features
# and labels are reordered by the same permutation so rows stay aligned.
X_train, X_val, y_train, y_val = train_test_split(
    train.iloc[permutation].iloc[:RESTRICTIONS],
    train_targets["target_radiant_win"].iloc[permutation].iloc[:RESTRICTIONS].values,
    random_state=SEED,
    test_size=0.20,
)

In [None]:
# Treat every column with at most three distinct values as categorical and
# collect its name, cardinality and distinct values.
# NOTE(review): low-cardinality float columns are picked up too - confirm the
# downstream model accepts them as categorical.
cats = []
idxs = []
for col in train.columns:
    n_unique = train[col].nunique()  # computed once per column
    if n_unique <= 3:
        idxs.append(train.columns.get_loc(col))
        cats.append({"name": col, "count": n_unique, "values": train[col].unique()})

# Naming the columns explicitly keeps the frame well-formed (and sortable)
# even when no column qualifies and `cats` is empty.
cats_df = pd.DataFrame(
    cats, index=idxs, columns=["name", "count", "values"]
).sort_values(by=["count", "name"])
categorical_features = cats_df["name"].values
categorical_features_indices = cats_df["name"].index

## CatBoost

In [None]:
from catboost import CatBoostClassifier, Pool, cv
from sklearn.metrics import accuracy_score

In [None]:
# Shuffled, row-capped features and 1-D labels used for CatBoost CV.
X = train.iloc[permutation].iloc[:RESTRICTIONS]
y = train_targets["target_radiant_win"].iloc[permutation].iloc[:RESTRICTIONS].values

# Aliases for the hold-out split created earlier.
X_validation = X_val
y_validation = y_val

In [None]:
import hyperopt

def hyperopt_objective(params):
    """Loss for hyperopt: one minus the best mean CV test AUC of CatBoost.

    ``params`` must provide ``l2_leaf_reg`` (truncated to int),
    ``learning_rate`` and ``bagging_temperature``. The sampled parameters are
    printed before the cross-validation starts.
    """
    candidate = CatBoostClassifier(
        loss_function='Logloss',
        eval_metric='AUC',
        iterations=N_ITERATIONS,
        random_seed=SEED,
        verbose=False,
        bagging_temperature=params['bagging_temperature'],
        learning_rate=params['learning_rate'],
        l2_leaf_reg=int(params['l2_leaf_reg']),
    )
    print(params)

    # The classifier is only used as a container for the parameter dict.
    pool = Pool(X, y, cat_features=categorical_features_indices)
    cv_data = cv(pool, candidate.get_params())
    best_auc = np.max(cv_data['test-AUC-mean'])

    # hyperopt minimises, so turn the score into a loss
    return 1 - best_auc



# Search space for the CatBoost hyper-parameters.
params_space = {
    'l2_leaf_reg': hyperopt.hp.qloguniform('l2_leaf_reg', 0, 2, 1),
    'learning_rate': hyperopt.hp.uniform('learning_rate', 1e-3, 5e-1),
    'bagging_temperature': hyperopt.hp.uniform('bagging_temperature', 0, 5e-3),
}

trials = hyperopt.Trials()

# Use the evaluation budget from the run configuration (MAX_EVALS) instead of
# a hard-coded single evaluation, which made the search a one-off random draw.
# NOTE(review): newer hyperopt releases expect a numpy Generator for `rstate`
# - confirm against the installed version.
best = hyperopt.fmin(
    hyperopt_objective,
    space=params_space,
    algo=hyperopt.tpe.suggest,
    max_evals=MAX_EVALS,
    trials=trials,
    rstate=np.random.RandomState(SEED)
)

print(best)

In [None]:
# Final CatBoost model: the tuned parameters from hyperopt plus the fixed
# settings and an iteration-based overfitting detector.
cbfinal = CatBoostClassifier(**best,
                            iterations=N_ITERATIONS,
                            eval_metric='AUC',
                            random_seed=SEED,
                            verbose=False,
                            loss_function='Logloss',
                            od_wait=EARLY_STOP,
                            od_type='Iter',
                           )

# Cross-validate with the parameters of the model defined above (`cbfinal`).
cv_data = cv(Pool(X, y, cat_features=categorical_features_indices), cbfinal.get_params())
# The reported number is the best mean test AUC, not an accuracy.
print('Precise validation AUC score: {}'.format(np.max(cv_data['test-AUC-mean'])))

In [None]:
from sklearn.metrics import roc_auc_score

# Fit on the full training set with the same categorical columns that were
# declared for the cross-validation pools, so the final model matches the
# configuration that was tuned.
cbfinal.fit(train, train_targets, cat_features=list(categorical_features_indices))

## LGBMClassifier

### Simple LGBMClassifier

In [None]:
from sklearn.model_selection import cross_val_score
from lightgbm import LGBMClassifier

# Baseline LightGBM binary classifier using GOSS boosting; the tree budget
# comes from N_ESTIMATORS.
lgb_classifier = lgb.LGBMClassifier(
    n_estimators=N_ESTIMATORS,
    boosting_type="goss",
    objective="binary",
)

In [None]:
# Fit the baseline classifier on the training part of the hold-out split.
lgb_classifier.fit(X_train, y_train)

In [None]:
from sklearn.metrics import roc_auc_score

# ROC AUC needs the true labels first and a continuous score second, so use
# the predicted probability of the positive class rather than hard labels.
predicted = lgb_classifier.predict_proba(X_train)[:, 1]
print(roc_auc_score(y_train, predicted))

predicted = lgb_classifier.predict_proba(X_val)[:, 1]
print(roc_auc_score(y_val, predicted))

### Learning rate shrinkage

In [None]:
def _decayed_learning_rate(current_iter, base_learning_rate, decay, floor=1e-3):
    """Exponentially decayed learning rate, never lower than ``floor``."""
    lr = base_learning_rate * np.power(decay, current_iter)
    return lr if lr > floor else floor


def learning_rate_010_decay_power_099(current_iter):
    """Schedule: start at 0.1, multiply by 0.99 per iteration, floor 1e-3."""
    return _decayed_learning_rate(current_iter, 0.1, 0.99)


def learning_rate_010_decay_power_0995(current_iter):
    """Schedule: start at 0.1, multiply by 0.995 per iteration, floor 1e-3."""
    return _decayed_learning_rate(current_iter, 0.1, 0.995)


def learning_rate_005_decay_power_099(current_iter):
    """Schedule: start at 0.05, multiply by 0.99 per iteration, floor 1e-3."""
    return _decayed_learning_rate(current_iter, 0.05, 0.99)

In [None]:
# Keyword arguments forwarded to `fit`: the hold-out split (X_val, y_val) is
# monitored with AUC and training stops after 30 rounds without improvement,
# so the number of trees does not have to be tuned separately.
fit_params = dict(
    early_stopping_rounds=30,
    eval_metric="auc",
    eval_set=[(X_val, y_val)],
    eval_names=["valid"],
    #'callbacks': [lgb.reset_parameter(learning_rate=learning_rate_010_decay_power_099)],
    verbose=100,
    categorical_feature="auto",
)

In [None]:
# We use random search, which is more flexible and more efficient than a grid search
from scipy.stats import randint as sp_randint
from scipy.stats import uniform as sp_uniform

# Sampling distributions (scipy) and discrete candidate lists for the
# randomized LightGBM hyper-parameter search.
param_test = dict(
    num_leaves=sp_randint(6, 50),
    min_child_samples=sp_randint(100, 500),
    min_child_weight=[1e-5, 1e-3, 1e-2, 1e-1, 1, 1e1, 1e2, 1e3, 1e4],
    subsample=sp_uniform(loc=0.2, scale=0.8),
    colsample_bytree=sp_uniform(loc=0.4, scale=0.6),
    reg_alpha=[0, 1e-1, 1, 2, 5, 7, 10, 50, 100],
    reg_lambda=[0, 1e-1, 1, 5, 10, 20, 50, 100],
)

In [None]:
# This parameter defines the number of HP points to be tested
import lightgbm as lgb
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV

# Number of hyper-parameter combinations the randomized search will sample.
n_HP_points_to_test = 100

# n_estimators is only an upper bound (5000); the number of trees actually
# built depends on early stopping.
clf = lgb.LGBMClassifier(
    n_estimators=5000,
    max_depth=-1,
    metric="None",
    silent=True,
    n_jobs=-1,
    random_state=SEED,
)

# 3-fold randomized search scored by ROC AUC, refitting the best candidate.
gs = RandomizedSearchCV(
    clf,
    param_test,
    n_iter=n_HP_points_to_test,
    scoring="roc_auc",
    cv=3,
    refit=True,
    verbose=True,
    random_state=SEED,
    return_train_score=True,
)

In [None]:
# Run the randomized search; `fit_params` is forwarded to every LightGBM fit.
gs.fit(X_train, y_train, **fit_params)
print(
    'Best score reached: {} with params: {} '.format(
        gs.best_score_, gs.best_params_
    )
)

In [None]:
# Best hyper-parameters found by the randomized search. Later cells pass
# `opt_parameters` to `set_params`, so the name has to be defined here.
opt_parameters = dict(gs.best_params_)

# Values recorded from an earlier run, kept for reference; uncomment to reuse
# them instead of the search result:
# opt_parameters = {
#     "colsample_bytree": 0.5645098230355559,
#     "min_child_samples": 250,
#     "min_child_weight": 100.0,
#     "num_leaves": 12,
#     "reg_alpha": 0,
#     "reg_lambda": 100,
#     "subsample": 0.8265673984661308,
# }

In [None]:
# Fresh classifier carrying the same settings as `clf` ...
_sw_base_params = clf.get_params()
clf_sw = lgb.LGBMClassifier(**_sw_base_params)
# ... overridden with the tuned hyper-parameters.
clf_sw.set_params(**opt_parameters)

In [None]:
# 5-fold grid search over the positive-class weight, scored by ROC AUC.
gs_sample_weight = GridSearchCV(
    clf_sw,
    {"scale_pos_weight": [1, 2, 6, 12]},
    scoring="roc_auc",
    cv=5,
    refit=True,
    verbose=True,
    return_train_score=True,
)

In [None]:
# Run the class-weight grid search and report the winning setting.
gs_sample_weight.fit(X_train, y_train)
_sw_report = "Best score reached: {} with params: {} "
print(_sw_report.format(gs_sample_weight.best_score_, gs_sample_weight.best_params_))

In [None]:
# Show the (up to) five best grid points in ascending order of mean
# validation score, together with the score spread and the train score.
sw_results = gs_sample_weight.cv_results_
print("Valid+-Std     Train  :   Parameters")
for i in np.argsort(sw_results["mean_test_score"])[-5:]:
    row = (
        sw_results["params"][i],
        sw_results["mean_test_score"][i],
        sw_results["mean_train_score"][i],
        sw_results["std_test_score"][i],
    )
    print("{1:.3f}+-{3:.3f}     {2:.3f}   :  {0}".format(*row))

### Final model

In [None]:
# Final classifier: same base settings as `clf` ...
_final_base_params = clf.get_params()
clf_final = lgb.LGBMClassifier(**_final_base_params)
# ... with the tuned hyper-parameters applied on top.
clf_final.set_params(**opt_parameters)

In [None]:
# Train the final model on the whole training set with a decaying learning
# rate. The labels are passed as the 1-D `target_radiant_win` array (as in
# the hold-out split) rather than as a one-column DataFrame.
# NOTE(review): `fit_params` monitors (X_val, y_val), which was split off
# `train`, so early stopping here is evaluated on rows the model is also
# trained on - confirm this is intended.
clf_final.fit(
    train,
    train_targets["target_radiant_win"].values,
    **fit_params,
    callbacks=[lgb.reset_parameter(learning_rate=learning_rate_010_decay_power_0995)]
)

In [None]:
# Second column of predict_proba, kept as the test-set prediction
# (used later as the submitted win probability).
lgmb_predictions = clf_final.predict_proba(test)[:, 1]

In [None]:
# Register the LightGBM predictions under their model name.
final_predictions["LGBMClassifier"] = lgmb_predictions

# TO SUBMIT

In [None]:
# Read the submission template from the configured data directory so the
# cell also works when the data does not live under ./data (IS_LOCAL False).
ss = pd.read_csv(
    os.path.join(PATH_TO_DATA, "sample_submission.csv"), index_col="match_id_hash"
)

In [None]:
# Item assignment always writes a DataFrame column; attribute assignment
# would only set a plain Python attribute if the column were missing.
ss["radiant_win_prob"] = final_predictions["LGBMClassifier"]

In [None]:
import datetime

# Write a timestamped copy of the filled-in template to the working directory.
df_submission = ss.copy()
_submission_stamp = datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
submission_filename = "submission_{}.csv".format(_submission_stamp)
df_submission.to_csv(submission_filename)
print("Submission saved to {}".format(submission_filename))